Diffstat (limited to 'src/intel/vulkan')
-rw-r--r--  src/intel/vulkan/TODO | 13
-rw-r--r--  src/intel/vulkan/anv_acceleration_structure.c | 250
-rw-r--r--  src/intel/vulkan/anv_allocator.c | 1280
-rw-r--r--  src/intel/vulkan/anv_android.c | 435
-rw-r--r--  src/intel/vulkan/anv_android.h | 27
-rw-r--r--  src/intel/vulkan/anv_android_stubs.c | 28
-rw-r--r--  src/intel/vulkan/anv_astc_emu.c | 516
-rw-r--r--  src/intel/vulkan/anv_batch_chain.c | 1955
-rw-r--r--  src/intel/vulkan/anv_blorp.c | 1974
-rw-r--r--  src/intel/vulkan/anv_bo_sync.c | 240
-rw-r--r--  src/intel/vulkan/anv_cmd_buffer.c | 2005
-rw-r--r--  src/intel/vulkan/anv_descriptor_set.c | 2325
-rw-r--r--  src/intel/vulkan/anv_device.c | 6700
-rw-r--r--  src/intel/vulkan/anv_formats.c | 1484
-rw-r--r--  src/intel/vulkan/anv_gem.c | 744
-rw-r--r--  src/intel/vulkan/anv_gem_stubs.c | 299
-rw-r--r--  src/intel/vulkan/anv_genX.h | 301
-rw-r--r--  src/intel/vulkan/anv_image.c | 2980
-rw-r--r--  src/intel/vulkan/anv_internal_kernels.c | 369
-rw-r--r--  src/intel/vulkan/anv_internal_kernels.h | 131
-rw-r--r--  src/intel/vulkan/anv_kmd_backend.c | 42
-rw-r--r--  src/intel/vulkan/anv_kmd_backend.h | 136
-rw-r--r--  src/intel/vulkan/anv_measure.c | 212
-rw-r--r--  src/intel/vulkan/anv_measure.h | 6
-rw-r--r--  src/intel/vulkan/anv_mesh_perprim_wa.c | 533
-rw-r--r--  src/intel/vulkan/anv_nir.h | 93
-rw-r--r--  src/intel/vulkan/anv_nir_add_base_work_group_id.c | 72
-rw-r--r--  src/intel/vulkan/anv_nir_apply_pipeline_layout.c | 2077
-rw-r--r--  src/intel/vulkan/anv_nir_compute_push_layout.c | 179
-rw-r--r--  src/intel/vulkan/anv_nir_lower_load_patch_vertices_in.c | 66
-rw-r--r--  src/intel/vulkan/anv_nir_lower_multiview.c | 208
-rw-r--r--  src/intel/vulkan/anv_nir_lower_resource_intel.c | 170
-rw-r--r--  src/intel/vulkan/anv_nir_lower_ubo_loads.c | 53
-rw-r--r--  src/intel/vulkan/anv_nir_lower_ycbcr_textures.c | 367
-rw-r--r--  src/intel/vulkan/anv_nir_push_descriptor_analysis.c | 261
-rw-r--r--  src/intel/vulkan/anv_pass.c | 490
-rw-r--r--  src/intel/vulkan/anv_perf.c | 81
-rw-r--r--  src/intel/vulkan/anv_pipeline.c | 4256
-rw-r--r--  src/intel/vulkan/anv_pipeline_cache.c | 1009
-rw-r--r--  src/intel/vulkan/anv_private.h | 5516
-rw-r--r--  src/intel/vulkan/anv_queue.c | 2678
-rw-r--r--  src/intel/vulkan/anv_rmv.c | 864
-rw-r--r--  src/intel/vulkan/anv_rmv.h | 118
-rw-r--r--  src/intel/vulkan/anv_sparse.c | 1293
-rw-r--r--  src/intel/vulkan/anv_util.c | 174
-rw-r--r--  src/intel/vulkan/anv_utrace.c | 684
-rw-r--r--  src/intel/vulkan/anv_va.c | 195
-rw-r--r--  src/intel/vulkan/anv_video.c | 435
-rw-r--r--  src/intel/vulkan/anv_wsi.c | 359
-rw-r--r--  src/intel/vulkan/anv_wsi_display.c | 338
-rw-r--r--  src/intel/vulkan/anv_wsi_x11.c | 96
-rw-r--r--  src/intel/vulkan/genX_acceleration_structure.c | 1287
-rw-r--r--  src/intel/vulkan/genX_blorp_exec.c | 361
-rw-r--r--  src/intel/vulkan/genX_cmd_buffer.c | 8901
-rw-r--r--  src/intel/vulkan/genX_cmd_compute.c | 1168
-rw-r--r--  src/intel/vulkan/genX_cmd_draw.c | 2330
-rw-r--r--  src/intel/vulkan/genX_cmd_draw_generated_flush.h | 79
-rw-r--r--  src/intel/vulkan/genX_cmd_draw_generated_indirect.h | 656
-rw-r--r--  src/intel/vulkan/genX_cmd_draw_helpers.h | 153
-rw-r--r--  src/intel/vulkan/genX_cmd_video.c | 1195
-rw-r--r--  src/intel/vulkan/genX_gfx_state.c | 2385
-rw-r--r--  src/intel/vulkan/genX_gpu_memcpy.c | 353
-rw-r--r--  src/intel/vulkan/genX_init_state.c | 1446
-rw-r--r--  src/intel/vulkan/genX_internal_kernels.c | 111
-rw-r--r--  src/intel/vulkan/genX_pipeline.c | 3144
-rw-r--r--  src/intel/vulkan/genX_query.c | 959
-rw-r--r--  src/intel/vulkan/genX_simple_shader.c | 704
-rw-r--r--  src/intel/vulkan/genX_state.c | 894
-rw-r--r--  src/intel/vulkan/gfx7_cmd_buffer.c | 476
-rw-r--r--  src/intel/vulkan/gfx8_cmd_buffer.c | 844
-rw-r--r--  src/intel/vulkan/grl/.gitignore | 1
-rw-r--r--  src/intel/vulkan/grl/genX_grl.h | 54
-rw-r--r--  src/intel/vulkan/grl/genX_grl_dispatch.c | 113
-rw-r--r--  src/intel/vulkan/grl/genX_grl_uuid.cpp | 40
-rw-r--r--  src/intel/vulkan/grl/gpu/AABB.h | 450
-rw-r--r--  src/intel/vulkan/grl/gpu/api_interface.h | 840
-rw-r--r--  src/intel/vulkan/grl/gpu/atomic_update.cl | 1112
-rw-r--r--  src/intel/vulkan/grl/gpu/atomic_update.grl | 198
-rw-r--r--  src/intel/vulkan/grl/gpu/binned_sah_shared.h | 265
-rw-r--r--  src/intel/vulkan/grl/gpu/build_leaf.grl | 206
-rw-r--r--  src/intel/vulkan/grl/gpu/build_primref.grl | 229
-rw-r--r--  src/intel/vulkan/grl/gpu/build_refit.grl | 324
-rw-r--r--  src/intel/vulkan/grl/gpu/bvh_build_BFS.cl | 4823
-rw-r--r--  src/intel/vulkan/grl/gpu/bvh_build_DFS.cl | 2025
-rw-r--r--  src/intel/vulkan/grl/gpu/bvh_build_leaf.cl | 357
-rw-r--r--  src/intel/vulkan/grl/gpu/bvh_build_presplit.cl | 556
-rw-r--r--  src/intel/vulkan/grl/gpu/bvh_build_primref.cl | 674
-rw-r--r--  src/intel/vulkan/grl/gpu/bvh_build_primref.h | 246
-rw-r--r--  src/intel/vulkan/grl/gpu/bvh_build_refit.cl | 491
-rw-r--r--  src/intel/vulkan/grl/gpu/bvh_build_refit.h | 546
-rw-r--r--  src/intel/vulkan/grl/gpu/bvh_build_sah_experimental.cl | 1917
-rw-r--r--  src/intel/vulkan/grl/gpu/bvh_build_treelet_refit.h | 1507
-rw-r--r--  src/intel/vulkan/grl/gpu/bvh_copy.cl | 763
-rw-r--r--  src/intel/vulkan/grl/gpu/bvh_debug.cl | 208
-rw-r--r--  src/intel/vulkan/grl/gpu/bvh_debug.grl | 107
-rw-r--r--  src/intel/vulkan/grl/gpu/bvh_postbuild_info.cl | 97
-rw-r--r--  src/intel/vulkan/grl/gpu/bvh_rebraid.cl | 1683
-rw-r--r--  src/intel/vulkan/grl/gpu/common.h | 429
-rw-r--r--  src/intel/vulkan/grl/gpu/copy.grl | 129
-rw-r--r--  src/intel/vulkan/grl/gpu/d3d12.h | 525
-rw-r--r--  src/intel/vulkan/grl/gpu/grl_api_interface_verify.cl | 59
-rw-r--r--  src/intel/vulkan/grl/gpu/grl_api_interface_verify.grl | 27
-rw-r--r--  src/intel/vulkan/grl/gpu/input_dump.cl | 723
-rw-r--r--  src/intel/vulkan/grl/gpu/input_dump.grl | 252
-rw-r--r--  src/intel/vulkan/grl/gpu/instance.h | 183
-rw-r--r--  src/intel/vulkan/grl/gpu/intrinsics.h | 581
-rw-r--r--  src/intel/vulkan/grl/gpu/libs/libraries.grl | 13
-rw-r--r--  src/intel/vulkan/grl/gpu/libs/lsc_intrinsics.cl | 1033
-rw-r--r--  src/intel/vulkan/grl/gpu/libs/lsc_intrinsics.h | 207
-rw-r--r--  src/intel/vulkan/grl/gpu/libs/lsc_intrinsics_fallback.cl | 898
-rw-r--r--  src/intel/vulkan/grl/gpu/mem_utils.h | 161
-rw-r--r--  src/intel/vulkan/grl/gpu/misc.cl | 367
-rw-r--r--  src/intel/vulkan/grl/gpu/misc.grl | 278
-rw-r--r--  src/intel/vulkan/grl/gpu/misc_legacy.cl | 386
-rw-r--r--  src/intel/vulkan/grl/gpu/misc_shared.h | 196
-rw-r--r--  src/intel/vulkan/grl/gpu/morton/morton_common.h | 245
-rw-r--r--  src/intel/vulkan/grl/gpu/morton/phase0.cl | 400
-rw-r--r--  src/intel/vulkan/grl/gpu/morton/phase1.cl | 785
-rw-r--r--  src/intel/vulkan/grl/gpu/morton/phase2.cl | 314
-rw-r--r--  src/intel/vulkan/grl/gpu/morton/post_sort.cl | 521
-rw-r--r--  src/intel/vulkan/grl/gpu/morton/pre_sort.cl | 117
-rw-r--r--  src/intel/vulkan/grl/gpu/morton_builder.grl | 335
-rw-r--r--  src/intel/vulkan/grl/gpu/morton_msb_radix_bitonic_sort.cl | 9
-rw-r--r--  src/intel/vulkan/grl/gpu/morton_msb_radix_bitonic_sort.h | 924
-rw-r--r--  src/intel/vulkan/grl/gpu/morton_msb_radix_bitonic_sort_shared.h | 135
-rw-r--r--  src/intel/vulkan/grl/gpu/morton_radix_sort.cl | 9
-rw-r--r--  src/intel/vulkan/grl/gpu/morton_radix_sort.h | 855
-rw-r--r--  src/intel/vulkan/grl/gpu/msb_radix_bitonic_sort.grl | 297
-rw-r--r--  src/intel/vulkan/grl/gpu/new_sah_builder.grl | 665
-rw-r--r--  src/intel/vulkan/grl/gpu/postbuild_info.grl | 49
-rw-r--r--  src/intel/vulkan/grl/gpu/presplit.grl | 62
-rw-r--r--  src/intel/vulkan/grl/gpu/qbvh6.h | 933
-rw-r--r--  src/intel/vulkan/grl/gpu/quad.h | 127
-rw-r--r--  src/intel/vulkan/grl/gpu/radix_sort.grl | 163
-rw-r--r--  src/intel/vulkan/grl/gpu/rebraid.grl | 167
-rw-r--r--  src/intel/vulkan/grl/gpu/shared.h | 182
-rw-r--r--  src/intel/vulkan/grl/gpu/structs.grl | 38
-rw-r--r--  src/intel/vulkan/grl/gpu/traversal_shader.cl | 277
-rw-r--r--  src/intel/vulkan/grl/gpu/traversal_shader.grl | 244
-rw-r--r--  src/intel/vulkan/grl/grl_cl_kernel_gen.py | 226
-rw-r--r--  src/intel/vulkan/grl/grl_metakernel_gen.py | 933
-rw-r--r--  src/intel/vulkan/grl/grl_parser.py | 586
-rw-r--r--  src/intel/vulkan/grl/grl_structs.h | 479
-rw-r--r--  src/intel/vulkan/grl/include/AABB3f.h | 459
-rw-r--r--  src/intel/vulkan/grl/include/GRLGen12.h | 691
-rw-r--r--  src/intel/vulkan/grl/include/GRLIntTypes.h | 152
-rw-r--r--  src/intel/vulkan/grl/include/GRLOCLCompatibility.h | 210
-rw-r--r--  src/intel/vulkan/grl/include/GRLRTASCommon.h | 142
-rw-r--r--  src/intel/vulkan/grl/include/GRLStructs.h | 60
-rw-r--r--  src/intel/vulkan/grl/include/GRLUtilities.h | 32
-rw-r--r--  src/intel/vulkan/grl/include/affinespace.h | 192
-rw-r--r--  src/intel/vulkan/grl/meson.build | 203
-rw-r--r--  src/intel/vulkan/i915/anv_batch_chain.c | 1107
-rw-r--r--  src/intel/vulkan/i915/anv_batch_chain.h | 62
-rw-r--r--  src/intel/vulkan/i915/anv_device.c | 400
-rw-r--r--  src/intel/vulkan/i915/anv_device.h | 47
-rw-r--r--  src/intel/vulkan/i915/anv_gem.c | 139
-rw-r--r--  src/intel/vulkan/i915/anv_gem.h | 43
-rw-r--r--  src/intel/vulkan/i915/anv_kmd_backend.c | 306
-rw-r--r--  src/intel/vulkan/i915/anv_queue.c | 126
-rw-r--r--  src/intel/vulkan/i915/anv_queue.h | 35
-rw-r--r--  src/intel/vulkan/layers/anv_android_layer.c (renamed from src/intel/vulkan/anv_wsi_wayland.c) | 43
-rw-r--r--  src/intel/vulkan/layers/anv_doom64.c | 137
-rw-r--r--  src/intel/vulkan/layers/anv_hitman3.c | 41
-rw-r--r--  src/intel/vulkan/layers/anv_rmv_layer.c | 122
-rw-r--r--  src/intel/vulkan/meson.build | 243
-rw-r--r--  src/intel/vulkan/tests/anv_tests.cpp | 25
-rw-r--r--  src/intel/vulkan/tests/block_pool_grow_first.c | 26
-rw-r--r--  src/intel/vulkan/tests/block_pool_max_size.c | 73
-rw-r--r--  src/intel/vulkan/tests/block_pool_no_free.c | 45
-rw-r--r--  src/intel/vulkan/tests/state_pool.c | 37
-rw-r--r--  src/intel/vulkan/tests/state_pool_free_list_only.c | 39
-rw-r--r--  src/intel/vulkan/tests/state_pool_max_size.c | 131
-rw-r--r--  src/intel/vulkan/tests/state_pool_no_free.c | 28
-rw-r--r--  src/intel/vulkan/tests/state_pool_padding.c | 29
-rw-r--r--  src/intel/vulkan/tests/state_pool_test_helper.h | 53
-rw-r--r--  src/intel/vulkan/tests/test_common.h | 31
-rw-r--r--  src/intel/vulkan/xe/anv_batch_chain.c | 409
-rw-r--r--  src/intel/vulkan/xe/anv_batch_chain.h | 65
-rw-r--r--  src/intel/vulkan/xe/anv_device.c | 199
-rw-r--r--  src/intel/vulkan/xe/anv_device.h | 40
-rw-r--r--  src/intel/vulkan/xe/anv_kmd_backend.c | 355
-rw-r--r--  src/intel/vulkan/xe/anv_queue.c | 164
-rw-r--r--  src/intel/vulkan/xe/anv_queue.h | 35
184 files changed, 88058 insertions, 29637 deletions
diff --git a/src/intel/vulkan/TODO b/src/intel/vulkan/TODO
deleted file mode 100644
index 4c41e251888..00000000000
--- a/src/intel/vulkan/TODO
+++ /dev/null
@@ -1,13 +0,0 @@
-Intel Vulkan ToDo
-=================
-
-Missing Features:
- - Investigate CTS failures on HSW
- - Sparse memory
-
-Performance:
- - Multi-{sampled/gfx8,LOD} HiZ
- - MSAA fast clears
- - Pushing pieces of UBOs?
- - Enable guardband clipping
- - Use soft-pin to avoid relocations
diff --git a/src/intel/vulkan/anv_acceleration_structure.c b/src/intel/vulkan/anv_acceleration_structure.c
deleted file mode 100644
index 1d0ccc0b410..00000000000
--- a/src/intel/vulkan/anv_acceleration_structure.c
+++ /dev/null
@@ -1,250 +0,0 @@
-/*
- * Copyright © 2020 Intel Corporation
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- */
-
-#include "anv_private.h"
-
-void
-anv_GetAccelerationStructureBuildSizesKHR(
- VkDevice device,
- VkAccelerationStructureBuildTypeKHR buildType,
- const VkAccelerationStructureBuildGeometryInfoKHR* pBuildInfo,
- const uint32_t* pMaxPrimitiveCounts,
- VkAccelerationStructureBuildSizesInfoKHR* pSizeInfo)
-{
- assert(pSizeInfo->sType ==
- VK_STRUCTURE_TYPE_ACCELERATION_STRUCTURE_BUILD_SIZES_INFO_KHR);
-
- uint64_t max_prim_count = 0;
- for (uint32_t i = 0; i < pBuildInfo->geometryCount; i++)
- max_prim_count += pMaxPrimitiveCounts[i];
-
- pSizeInfo->accelerationStructureSize = 0; /* TODO */
-
- uint64_t cpu_build_scratch_size = 0; /* TODO */
- uint64_t cpu_update_scratch_size = cpu_build_scratch_size;
-
- uint64_t gpu_build_scratch_size = 0; /* TODO */
- uint64_t gpu_update_scratch_size = gpu_build_scratch_size;
-
- switch (buildType) {
- case VK_ACCELERATION_STRUCTURE_BUILD_TYPE_HOST_KHR:
- pSizeInfo->buildScratchSize = cpu_build_scratch_size;
- pSizeInfo->updateScratchSize = cpu_update_scratch_size;
- break;
-
- case VK_ACCELERATION_STRUCTURE_BUILD_TYPE_DEVICE_KHR:
- pSizeInfo->buildScratchSize = gpu_build_scratch_size;
- pSizeInfo->updateScratchSize = gpu_update_scratch_size;
- break;
-
- case VK_ACCELERATION_STRUCTURE_BUILD_TYPE_HOST_OR_DEVICE_KHR:
- pSizeInfo->buildScratchSize = MAX2(cpu_build_scratch_size,
- gpu_build_scratch_size);
- pSizeInfo->updateScratchSize = MAX2(cpu_update_scratch_size,
- gpu_update_scratch_size);
- break;
-
- default:
- unreachable("Invalid acceleration structure build type");
- }
-}
-
-VkResult
-anv_CreateAccelerationStructureKHR(
- VkDevice _device,
- const VkAccelerationStructureCreateInfoKHR* pCreateInfo,
- const VkAllocationCallbacks* pAllocator,
- VkAccelerationStructureKHR* pAccelerationStructure)
-{
- ANV_FROM_HANDLE(anv_device, device, _device);
- ANV_FROM_HANDLE(anv_buffer, buffer, pCreateInfo->buffer);
- struct anv_acceleration_structure *accel;
-
- accel = vk_zalloc2(&device->vk.alloc, pAllocator, sizeof(*accel), 8,
- VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
- if (accel == NULL)
- return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
-
- vk_object_base_init(&device->vk, &accel->base,
- VK_OBJECT_TYPE_ACCELERATION_STRUCTURE_KHR);
-
- accel->size = pCreateInfo->size;
- accel->address = anv_address_add(buffer->address, pCreateInfo->offset);
-
- *pAccelerationStructure = anv_acceleration_structure_to_handle(accel);
-
- return VK_SUCCESS;
-}
-
-void
-anv_DestroyAccelerationStructureKHR(
- VkDevice _device,
- VkAccelerationStructureKHR accelerationStructure,
- const VkAllocationCallbacks* pAllocator)
-{
- ANV_FROM_HANDLE(anv_device, device, _device);
- ANV_FROM_HANDLE(anv_acceleration_structure, accel, accelerationStructure);
-
- if (!accel)
- return;
-
- vk_object_base_finish(&accel->base);
- vk_free2(&device->vk.alloc, pAllocator, accel);
-}
-
-VkDeviceAddress
-anv_GetAccelerationStructureDeviceAddressKHR(
- VkDevice device,
- const VkAccelerationStructureDeviceAddressInfoKHR* pInfo)
-{
- ANV_FROM_HANDLE(anv_acceleration_structure, accel,
- pInfo->accelerationStructure);
-
- assert(!anv_address_is_null(accel->address));
- assert(accel->address.bo->flags & EXEC_OBJECT_PINNED);
-
- return anv_address_physical(accel->address);
-}
-
-void
-anv_GetDeviceAccelerationStructureCompatibilityKHR(
- VkDevice device,
- const VkAccelerationStructureVersionInfoKHR* pVersionInfo,
- VkAccelerationStructureCompatibilityKHR* pCompatibility)
-{
- unreachable("Unimplemented");
-}
-
-VkResult
-anv_BuildAccelerationStructuresKHR(
- VkDevice device,
- VkDeferredOperationKHR deferredOperation,
- uint32_t infoCount,
- const VkAccelerationStructureBuildGeometryInfoKHR* pInfos,
- const VkAccelerationStructureBuildRangeInfoKHR* const* ppBuildRangeInfos)
-{
- unreachable("Unimplemented");
- return vk_error(VK_ERROR_FEATURE_NOT_PRESENT);
-}
-
-VkResult
-anv_CopyAccelerationStructureKHR(
- VkDevice device,
- VkDeferredOperationKHR deferredOperation,
- const VkCopyAccelerationStructureInfoKHR* pInfo)
-{
- unreachable("Unimplemented");
- return vk_error(VK_ERROR_FEATURE_NOT_PRESENT);
-}
-
-VkResult
-anv_CopyAccelerationStructureToMemoryKHR(
- VkDevice device,
- VkDeferredOperationKHR deferredOperation,
- const VkCopyAccelerationStructureToMemoryInfoKHR* pInfo)
-{
- unreachable("Unimplemented");
- return vk_error(VK_ERROR_FEATURE_NOT_PRESENT);
-}
-
-VkResult
-anv_CopyMemoryToAccelerationStructureKHR(
- VkDevice device,
- VkDeferredOperationKHR deferredOperation,
- const VkCopyMemoryToAccelerationStructureInfoKHR* pInfo)
-{
- unreachable("Unimplemented");
- return vk_error(VK_ERROR_FEATURE_NOT_PRESENT);
-}
-
-VkResult
-anv_WriteAccelerationStructuresPropertiesKHR(
- VkDevice device,
- uint32_t accelerationStructureCount,
- const VkAccelerationStructureKHR* pAccelerationStructures,
- VkQueryType queryType,
- size_t dataSize,
- void* pData,
- size_t stride)
-{
- unreachable("Unimplemented");
- return vk_error(VK_ERROR_FEATURE_NOT_PRESENT);
-}
-
-void
-anv_CmdBuildAccelerationStructuresKHR(
- VkCommandBuffer commandBuffer,
- uint32_t infoCount,
- const VkAccelerationStructureBuildGeometryInfoKHR* pInfos,
- const VkAccelerationStructureBuildRangeInfoKHR* const* ppBuildRangeInfos)
-{
- unreachable("Unimplemented");
-}
-
-void
-anv_CmdBuildAccelerationStructuresIndirectKHR(
- VkCommandBuffer commandBuffer,
- uint32_t infoCount,
- const VkAccelerationStructureBuildGeometryInfoKHR* pInfos,
- const VkDeviceAddress* pIndirectDeviceAddresses,
- const uint32_t* pIndirectStrides,
- const uint32_t* const* ppMaxPrimitiveCounts)
-{
- unreachable("Unimplemented");
-}
-
-void
-anv_CmdCopyAccelerationStructureKHR(
- VkCommandBuffer commandBuffer,
- const VkCopyAccelerationStructureInfoKHR* pInfo)
-{
- unreachable("Unimplemented");
-}
-
-void
-anv_CmdCopyAccelerationStructureToMemoryKHR(
- VkCommandBuffer commandBuffer,
- const VkCopyAccelerationStructureToMemoryInfoKHR* pInfo)
-{
- unreachable("Unimplemented");
-}
-
-void
-anv_CmdCopyMemoryToAccelerationStructureKHR(
- VkCommandBuffer commandBuffer,
- const VkCopyMemoryToAccelerationStructureInfoKHR* pInfo)
-{
- unreachable("Unimplemented");
-}
-
-void
-anv_CmdWriteAccelerationStructuresPropertiesKHR(
- VkCommandBuffer commandBuffer,
- uint32_t accelerationStructureCount,
- const VkAccelerationStructureKHR* pAccelerationStructures,
- VkQueryType queryType,
- VkQueryPool queryPool,
- uint32_t firstQuery)
-{
- unreachable("Unimplemented");
-}
diff --git a/src/intel/vulkan/anv_allocator.c b/src/intel/vulkan/anv_allocator.c
index 63eb855b85e..22af7f0ed1b 100644
--- a/src/intel/vulkan/anv_allocator.c
+++ b/src/intel/vulkan/anv_allocator.c
@@ -31,6 +31,7 @@
#include "common/intel_aux_map.h"
#include "util/anon_file.h"
+#include "util/futex.h"
#ifdef HAVE_VALGRIND
#define VG_NOACCESS_READ(__ptr) ({ \
@@ -78,7 +79,7 @@
* our allocation fast-path, there isn't really a way to munmap the old mmap,
* so we just keep it around until garbage collection time. While the block
* allocator is lockless for normal operations, we block other threads trying
- * to allocate while we're growing the map. It sholdn't happen often, and
+ * to allocate while we're growing the map. It shouldn't happen often, and
* growing is fast anyway.
*
* At the next level we can use various sub-allocators. The state pool is a
@@ -112,24 +113,6 @@
#define PAGE_SIZE 4096
#endif
-struct anv_mmap_cleanup {
- void *map;
- size_t size;
-};
-
-static inline uint32_t
-ilog2_round_up(uint32_t value)
-{
- assert(value != 0);
- return 32 - __builtin_clz(value - 1);
-}
-
-static inline uint32_t
-round_to_power_of_two(uint32_t value)
-{
- return 1 << ilog2_round_up(value);
-}
-
struct anv_state_table_cleanup {
void *map;
size_t size;
@@ -155,15 +138,12 @@ anv_state_table_init(struct anv_state_table *table,
* userptr and send a chunk of it off to the GPU.
*/
table->fd = os_create_anonymous_file(BLOCK_POOL_MEMFD_SIZE, "state table");
- if (table->fd == -1) {
- result = vk_error(VK_ERROR_INITIALIZATION_FAILED);
- goto fail_fd;
- }
+ if (table->fd == -1)
+ return vk_error(device, VK_ERROR_INITIALIZATION_FAILED);
- if (!u_vector_init(&table->cleanups,
- round_to_power_of_two(sizeof(struct anv_state_table_cleanup)),
- 128)) {
- result = vk_error(VK_ERROR_INITIALIZATION_FAILED);
+ if (!u_vector_init(&table->cleanups, 8,
+ sizeof(struct anv_state_table_cleanup))) {
+ result = vk_error(device, VK_ERROR_INITIALIZATION_FAILED);
goto fail_fd;
}
@@ -197,11 +177,11 @@ anv_state_table_expand_range(struct anv_state_table *table, uint32_t size)
/* Make sure that we don't go outside the bounds of the memfd */
if (size > BLOCK_POOL_MEMFD_SIZE)
- return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+ return vk_error(table->device, VK_ERROR_OUT_OF_HOST_MEMORY);
cleanup = u_vector_add(&table->cleanups);
if (!cleanup)
- return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+ return vk_error(table->device, VK_ERROR_OUT_OF_HOST_MEMORY);
*cleanup = ANV_STATE_TABLE_CLEANUP_INIT;
@@ -214,8 +194,8 @@ anv_state_table_expand_range(struct anv_state_table *table, uint32_t size)
map = mmap(NULL, size, PROT_READ | PROT_WRITE,
MAP_SHARED | MAP_POPULATE, table->fd, 0);
if (map == MAP_FAILED) {
- return vk_errorf(table->device, &table->device->vk.base,
- VK_ERROR_OUT_OF_HOST_MEMORY, "mmap failed: %m");
+ return vk_errorf(table->device, VK_ERROR_OUT_OF_HOST_MEMORY,
+ "mmap failed: %m");
}
cleanup->map = map;
@@ -232,8 +212,8 @@ anv_state_table_grow(struct anv_state_table *table)
{
VkResult result = VK_SUCCESS;
- uint32_t used = align_u32(table->state.next * ANV_STATE_ENTRY_SIZE,
- PAGE_SIZE);
+ uint32_t used = align(table->state.next * ANV_STATE_ENTRY_SIZE,
+ PAGE_SIZE);
uint32_t old_size = table->size;
/* The block pool is always initialized to a nonzero size and this function
@@ -312,7 +292,7 @@ anv_state_table_add(struct anv_state_table *table, uint32_t *idx,
old.u64 = __sync_lock_test_and_set(&table->state.u64, new.u64);
if (old.next != state.next)
- futex_wake(&table->state.end, INT_MAX);
+ futex_wake(&table->state.end, INT32_MAX);
} else {
futex_wait(&table->state.end, state.end, NULL);
continue;
@@ -364,62 +344,46 @@ anv_free_list_pop(union anv_free_list *list,
}
static VkResult
-anv_block_pool_expand_range(struct anv_block_pool *pool,
- uint32_t center_bo_offset, uint32_t size);
+anv_block_pool_expand_range(struct anv_block_pool *pool, uint32_t size);
VkResult
anv_block_pool_init(struct anv_block_pool *pool,
struct anv_device *device,
const char *name,
uint64_t start_address,
- uint32_t initial_size)
+ uint32_t initial_size,
+ uint32_t max_size)
{
VkResult result;
+ /* Make sure VMA addresses are aligned for the block pool */
+ assert(anv_is_aligned(start_address, device->info->mem_alignment));
+ assert(anv_is_aligned(initial_size, device->info->mem_alignment));
+ assert(max_size > 0);
+ assert(max_size > initial_size);
+
pool->name = name;
pool->device = device;
- pool->use_softpin = device->physical->use_softpin;
pool->nbos = 0;
pool->size = 0;
- pool->center_bo_offset = 0;
pool->start_address = intel_canonical_address(start_address);
- pool->map = NULL;
+ pool->max_size = max_size;
- if (pool->use_softpin) {
- pool->bo = NULL;
- pool->fd = -1;
- } else {
- /* Just make it 2GB up-front. The Linux kernel won't actually back it
- * with pages until we either map and fault on one of them or we use
- * userptr and send a chunk of it off to the GPU.
- */
- pool->fd = os_create_anonymous_file(BLOCK_POOL_MEMFD_SIZE, "block pool");
- if (pool->fd == -1)
- return vk_error(VK_ERROR_INITIALIZATION_FAILED);
-
- pool->wrapper_bo = (struct anv_bo) {
- .refcount = 1,
- .offset = -1,
- .is_wrapper = true,
- };
- pool->bo = &pool->wrapper_bo;
- }
-
- if (!u_vector_init(&pool->mmap_cleanups,
- round_to_power_of_two(sizeof(struct anv_mmap_cleanup)),
- 128)) {
- result = vk_error(VK_ERROR_INITIALIZATION_FAILED);
- goto fail_fd;
- }
+ pool->bo = NULL;
pool->state.next = 0;
pool->state.end = 0;
- pool->back_state.next = 0;
- pool->back_state.end = 0;
- result = anv_block_pool_expand_range(pool, 0, initial_size);
+ pool->bo_alloc_flags =
+ ANV_BO_ALLOC_FIXED_ADDRESS |
+ ANV_BO_ALLOC_MAPPED |
+ ANV_BO_ALLOC_HOST_CACHED_COHERENT |
+ ANV_BO_ALLOC_CAPTURE |
+ ANV_BO_ALLOC_INTERNAL;
+
+ result = anv_block_pool_expand_range(pool, initial_size);
if (result != VK_SUCCESS)
- goto fail_mmap_cleanups;
+ return result;
/* Make the entire pool available in the front of the pool. If back
* allocation needs to use this space, the "ends" will be re-arranged.
@@ -427,47 +391,22 @@ anv_block_pool_init(struct anv_block_pool *pool,
pool->state.end = pool->size;
return VK_SUCCESS;
-
- fail_mmap_cleanups:
- u_vector_finish(&pool->mmap_cleanups);
- fail_fd:
- if (pool->fd >= 0)
- close(pool->fd);
-
- return result;
}
void
anv_block_pool_finish(struct anv_block_pool *pool)
{
anv_block_pool_foreach_bo(bo, pool) {
- if (bo->map)
- anv_gem_munmap(pool->device, bo->map, bo->size);
- anv_gem_close(pool->device, bo->gem_handle);
+ assert(bo->refcount == 1);
+ anv_device_release_bo(pool->device, bo);
}
-
- struct anv_mmap_cleanup *cleanup;
- u_vector_foreach(cleanup, &pool->mmap_cleanups)
- munmap(cleanup->map, cleanup->size);
- u_vector_finish(&pool->mmap_cleanups);
-
- if (pool->fd >= 0)
- close(pool->fd);
}
static VkResult
-anv_block_pool_expand_range(struct anv_block_pool *pool,
- uint32_t center_bo_offset, uint32_t size)
+anv_block_pool_expand_range(struct anv_block_pool *pool, uint32_t size)
{
/* Assert that we only ever grow the pool */
- assert(center_bo_offset >= pool->back_state.end);
- assert(size - center_bo_offset >= pool->state.end);
-
- /* Assert that we don't go outside the bounds of the memfd */
- assert(center_bo_offset <= BLOCK_POOL_MEMFD_CENTER);
- assert(pool->use_softpin ||
- size - center_bo_offset <=
- BLOCK_POOL_MEMFD_SIZE - BLOCK_POOL_MEMFD_CENTER);
+ assert(size >= pool->state.end);
/* For state pool BOs we have to be a bit careful about where we place them
* in the GTT. There are two documented workarounds for state base address
@@ -495,73 +434,22 @@ anv_block_pool_expand_range(struct anv_block_pool *pool,
* hard work for us. When using softpin, we're in control and the fixed
* addresses we choose are fine for base addresses.
*/
- enum anv_bo_alloc_flags bo_alloc_flags = ANV_BO_ALLOC_CAPTURE;
- if (!pool->use_softpin)
- bo_alloc_flags |= ANV_BO_ALLOC_32BIT_ADDRESS;
-
- if (pool->use_softpin) {
- uint32_t new_bo_size = size - pool->size;
- struct anv_bo *new_bo;
- assert(center_bo_offset == 0);
- VkResult result = anv_device_alloc_bo(pool->device,
- pool->name,
- new_bo_size,
- bo_alloc_flags |
- ANV_BO_ALLOC_LOCAL_MEM |
- ANV_BO_ALLOC_FIXED_ADDRESS |
- ANV_BO_ALLOC_MAPPED |
- ANV_BO_ALLOC_SNOOPED,
- pool->start_address + pool->size,
- &new_bo);
- if (result != VK_SUCCESS)
- return result;
-
- pool->bos[pool->nbos++] = new_bo;
-
- /* This pointer will always point to the first BO in the list */
- pool->bo = pool->bos[0];
- } else {
- /* Just leak the old map until we destroy the pool. We can't munmap it
- * without races or imposing locking on the block allocate fast path. On
- * the whole the leaked maps adds up to less than the size of the
- * current map. MAP_POPULATE seems like the right thing to do, but we
- * should try to get some numbers.
- */
- void *map = mmap(NULL, size, PROT_READ | PROT_WRITE,
- MAP_SHARED | MAP_POPULATE, pool->fd,
- BLOCK_POOL_MEMFD_CENTER - center_bo_offset);
- if (map == MAP_FAILED)
- return vk_errorf(pool->device, &pool->device->vk.base,
- VK_ERROR_MEMORY_MAP_FAILED, "mmap failed: %m");
-
- struct anv_bo *new_bo;
- VkResult result = anv_device_import_bo_from_host_ptr(pool->device,
- map, size,
- bo_alloc_flags,
- 0 /* client_address */,
- &new_bo);
- if (result != VK_SUCCESS) {
- munmap(map, size);
- return result;
- }
- struct anv_mmap_cleanup *cleanup = u_vector_add(&pool->mmap_cleanups);
- if (!cleanup) {
- munmap(map, size);
- anv_device_release_bo(pool->device, new_bo);
- return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
- }
- cleanup->map = map;
- cleanup->size = size;
+ uint32_t new_bo_size = size - pool->size;
+ struct anv_bo *new_bo = NULL;
+ VkResult result = anv_device_alloc_bo(pool->device,
+ pool->name,
+ new_bo_size,
+ pool->bo_alloc_flags,
+ intel_48b_address(pool->start_address + pool->size),
+ &new_bo);
+ if (result != VK_SUCCESS)
+ return result;
- /* Now that we mapped the new memory, we can write the new
- * center_bo_offset back into pool and update pool->map. */
- pool->center_bo_offset = center_bo_offset;
- pool->map = map + center_bo_offset;
+ pool->bos[pool->nbos++] = new_bo;
- pool->bos[pool->nbos++] = new_bo;
- pool->wrapper_bo.map = new_bo;
- }
+ /* This pointer will always point to the first BO in the list */
+ pool->bo = pool->bos[0];
assert(pool->nbos < ANV_MAX_BLOCK_POOL_BOS);
pool->size = size;
@@ -578,24 +466,20 @@ anv_block_pool_expand_range(struct anv_block_pool *pool,
void*
anv_block_pool_map(struct anv_block_pool *pool, int32_t offset, uint32_t size)
{
- if (pool->use_softpin) {
- struct anv_bo *bo = NULL;
- int32_t bo_offset = 0;
- anv_block_pool_foreach_bo(iter_bo, pool) {
- if (offset < bo_offset + iter_bo->size) {
- bo = iter_bo;
- break;
- }
- bo_offset += iter_bo->size;
+ struct anv_bo *bo = NULL;
+ int32_t bo_offset = 0;
+ anv_block_pool_foreach_bo(iter_bo, pool) {
+ if (offset < bo_offset + iter_bo->size) {
+ bo = iter_bo;
+ break;
}
- assert(bo != NULL);
- assert(offset >= bo_offset);
- assert((offset - bo_offset) + size <= bo->size);
-
- return bo->map + (offset - bo_offset);
- } else {
- return pool->map + offset;
+ bo_offset += iter_bo->size;
}
+ assert(bo != NULL);
+ assert(offset >= bo_offset);
+ assert((offset - bo_offset) + size <= bo->size);
+
+ return bo->map + (offset - bo_offset);
}
/** Grows and re-centers the block pool.
@@ -612,14 +496,10 @@ anv_block_pool_map(struct anv_block_pool *pool, int32_t offset, uint32_t size)
* allocated for each end as we have used. This way the pool doesn't
* grow too far in one direction or the other.
*
- * 4) If the _alloc_back() has never been called, then the back portion of
- * the pool retains a size of zero. (This makes it easier for users of
- * the block pool that only want a one-sided pool.)
- *
- * 5) We have enough space allocated for at least one more block in
+ * 4) We have enough space allocated for at least one more block in
* whichever side `state` points to.
*
- * 6) The center of the pool is always aligned to both the block_size of
+ * 5) The center of the pool is always aligned to both the block_size of
* the pool and a 4K CPU page.
*/
static uint32_t
@@ -630,10 +510,10 @@ anv_block_pool_grow(struct anv_block_pool *pool, struct anv_block_state *state,
pthread_mutex_lock(&pool->device->mutex);
- assert(state == &pool->state || state == &pool->back_state);
+ assert(state == &pool->state);
/* Gather a little usage information on the pool. Since we may have
- * threadsd waiting in queue to get some storage while we resize, it's
+ * threads waiting in queue to get some storage while we resize, it's
* actually possible that total_used will be larger than old_size. In
* particular, block_pool_alloc() increments state->next prior to
* calling block_pool_grow, so this ensures that we get enough space for
@@ -642,11 +522,7 @@ anv_block_pool_grow(struct anv_block_pool *pool, struct anv_block_state *state,
* We align to a page size because it makes it easier to do our
* calculations later in such a way that we state page-aigned.
*/
- uint32_t back_used = align_u32(pool->back_state.next, PAGE_SIZE);
- uint32_t front_used = align_u32(pool->state.next, PAGE_SIZE);
- uint32_t total_used = front_used + back_used;
-
- assert(state == &pool->state || back_used > 0);
+ uint32_t total_used = align(pool->state.next, PAGE_SIZE);
uint32_t old_size = pool->size;
@@ -655,97 +531,49 @@ anv_block_pool_grow(struct anv_block_pool *pool, struct anv_block_state *state,
*/
assert(old_size > 0);
- const uint32_t old_back = pool->center_bo_offset;
- const uint32_t old_front = old_size - pool->center_bo_offset;
-
- /* The back_used and front_used may actually be smaller than the actual
- * requirement because they are based on the next pointers which are
- * updated prior to calling this function.
+ /* total_used may actually be smaller than the actual requirement because
+ * they are based on the next pointers which are updated prior to calling
+ * this function.
*/
- uint32_t back_required = MAX2(back_used, old_back);
- uint32_t front_required = MAX2(front_used, old_front);
-
- if (pool->use_softpin) {
- /* With softpin, the pool is made up of a bunch of buffers with separate
- * maps. Make sure we have enough contiguous space that we can get a
- * properly contiguous map for the next chunk.
- */
- assert(old_back == 0);
- front_required = MAX2(front_required, old_front + contiguous_size);
- }
-
- if (back_used * 2 <= back_required && front_used * 2 <= front_required) {
- /* If we're in this case then this isn't the firsta allocation and we
- * already have enough space on both sides to hold double what we
- * have allocated. There's nothing for us to do.
- */
- goto done;
- }
-
- uint32_t size = old_size * 2;
- while (size < back_required + front_required)
- size *= 2;
-
- assert(size > pool->size);
+ uint32_t required = MAX2(total_used, old_size);
- /* We compute a new center_bo_offset such that, when we double the size
- * of the pool, we maintain the ratio of how much is used by each side.
- * This way things should remain more-or-less balanced.
+ /* With softpin, the pool is made up of a bunch of buffers with separate
+ * maps. Make sure we have enough contiguous space that we can get a
+ * properly contiguous map for the next chunk.
*/
- uint32_t center_bo_offset;
- if (back_used == 0) {
- /* If we're in this case then we have never called alloc_back(). In
- * this case, we want keep the offset at 0 to make things as simple
- * as possible for users that don't care about back allocations.
- */
- center_bo_offset = 0;
- } else {
- /* Try to "center" the allocation based on how much is currently in
- * use on each side of the center line.
- */
- center_bo_offset = ((uint64_t)size * back_used) / total_used;
-
- /* Align down to a multiple of the page size */
- center_bo_offset &= ~(PAGE_SIZE - 1);
+ required = MAX2(required, old_size + contiguous_size);
- assert(center_bo_offset >= back_used);
+ if (required > pool->max_size) {
+ result = VK_ERROR_OUT_OF_DEVICE_MEMORY;
+ } else if (total_used * 2 > required) {
+ uint32_t size = old_size * 2;
+ while (size < required)
+ size *= 2;
- /* Make sure we don't shrink the back end of the pool */
- if (center_bo_offset < back_required)
- center_bo_offset = back_required;
+ size = MIN2(size, pool->max_size);
+ assert(size > pool->size);
- /* Make sure that we don't shrink the front end of the pool */
- if (size - center_bo_offset < front_required)
- center_bo_offset = size - front_required;
+ result = anv_block_pool_expand_range(pool, size);
}
- assert(center_bo_offset % PAGE_SIZE == 0);
-
- result = anv_block_pool_expand_range(pool, center_bo_offset, size);
-
-done:
pthread_mutex_unlock(&pool->device->mutex);
- if (result == VK_SUCCESS) {
- /* Return the appropriate new size. This function never actually
- * updates state->next. Instead, we let the caller do that because it
- * needs to do so in order to maintain its concurrency model.
- */
- if (state == &pool->state) {
- return pool->size - pool->center_bo_offset;
- } else {
- assert(pool->center_bo_offset > 0);
- return pool->center_bo_offset;
- }
- } else {
+ if (result != VK_SUCCESS)
return 0;
- }
+
+ /* Return the appropriate new size. This function never actually
+ * updates state->next. Instead, we let the caller do that because it
+ * needs to do so in order to maintain its concurrency model.
+ */
+ return pool->size;
}
-static uint32_t
+static VkResult
anv_block_pool_alloc_new(struct anv_block_pool *pool,
struct anv_block_state *pool_state,
- uint32_t block_size, uint32_t *padding)
+ uint32_t block_size,
+ int64_t *offset,
+ uint32_t *padding)
{
struct anv_block_state state, old, new;
@@ -755,10 +583,13 @@ anv_block_pool_alloc_new(struct anv_block_pool *pool,
while (1) {
state.u64 = __sync_fetch_and_add(&pool_state->u64, block_size);
- if (state.next + block_size <= state.end) {
- return state.next;
+ if (state.next + block_size > pool->max_size) {
+ return VK_ERROR_OUT_OF_DEVICE_MEMORY;
+ } else if (state.next + block_size <= state.end) {
+ *offset = state.next;
+ return VK_SUCCESS;
} else if (state.next <= state.end) {
- if (pool->use_softpin && state.next < state.end) {
+ if (state.next < state.end) {
/* We need to grow the block pool, but still have some leftover
* space that can't be used by that particular allocation. So we
* add that as a "padding", and return it.
@@ -782,12 +613,17 @@ anv_block_pool_alloc_new(struct anv_block_pool *pool,
new.next = state.next + block_size;
do {
new.end = anv_block_pool_grow(pool, pool_state, block_size);
+ if (pool->size > 0 && new.end == 0) {
+ futex_wake(&pool_state->end, INT32_MAX);
+ return VK_ERROR_OUT_OF_DEVICE_MEMORY;
+ }
} while (new.end < new.next);
old.u64 = __sync_lock_test_and_set(&pool_state->u64, new.u64);
if (old.next != state.next)
- futex_wake(&pool_state->end, INT_MAX);
- return state.next;
+ futex_wake(&pool_state->end, INT32_MAX);
+ *offset = state.next;
+ return VK_SUCCESS;
} else {
futex_wait(&pool_state->end, state.end, NULL);
continue;
@@ -795,60 +631,31 @@ anv_block_pool_alloc_new(struct anv_block_pool *pool,
}
}
-int32_t
+VkResult
anv_block_pool_alloc(struct anv_block_pool *pool,
- uint32_t block_size, uint32_t *padding)
-{
- uint32_t offset;
-
- offset = anv_block_pool_alloc_new(pool, &pool->state, block_size, padding);
-
- return offset;
-}
-
-/* Allocates a block out of the back of the block pool.
- *
- * This will allocated a block earlier than the "start" of the block pool.
- * The offsets returned from this function will be negative but will still
- * be correct relative to the block pool's map pointer.
- *
- * If you ever use anv_block_pool_alloc_back, then you will have to do
- * gymnastics with the block pool's BO when doing relocations.
- */
-int32_t
-anv_block_pool_alloc_back(struct anv_block_pool *pool,
- uint32_t block_size)
+ uint32_t block_size,
+ int64_t *offset, uint32_t *padding)
{
- int32_t offset = anv_block_pool_alloc_new(pool, &pool->back_state,
- block_size, NULL);
-
- /* The offset we get out of anv_block_pool_alloc_new() is actually the
- * number of bytes downwards from the middle to the end of the block.
- * We need to turn it into a (negative) offset from the middle to the
- * start of the block.
- */
- assert(offset >= 0);
- return -(offset + block_size);
+ return anv_block_pool_alloc_new(pool, &pool->state, block_size, offset, padding);
}
VkResult
anv_state_pool_init(struct anv_state_pool *pool,
struct anv_device *device,
- const char *name,
- uint64_t base_address,
- int32_t start_offset,
- uint32_t block_size)
+ const struct anv_state_pool_params *params)
{
- /* We don't want to ever see signed overflow */
- assert(start_offset < INT32_MAX - (int32_t)BLOCK_POOL_MEMFD_SIZE);
-
- VkResult result = anv_block_pool_init(&pool->block_pool, device, name,
- base_address + start_offset,
- block_size * 16);
+ uint32_t initial_size = MAX2(params->block_size * 16,
+ device->info->mem_alignment);
+
+ VkResult result = anv_block_pool_init(&pool->block_pool, device,
+ params->name,
+ params->base_address + params->start_offset,
+ initial_size,
+ params->max_size);
if (result != VK_SUCCESS)
return result;
- pool->start_offset = start_offset;
+ pool->start_offset = params->start_offset;
result = anv_state_table_init(&pool->table, device, 64);
if (result != VK_SUCCESS) {
@@ -856,9 +663,8 @@ anv_state_pool_init(struct anv_state_pool *pool,
return result;
}
- assert(util_is_power_of_two_or_zero(block_size));
- pool->block_size = block_size;
- pool->back_alloc_free_list = ANV_FREE_LIST_EMPTY;
+ assert(util_is_power_of_two_or_zero(params->block_size));
+ pool->block_size = params->block_size;
for (unsigned i = 0; i < ANV_STATE_BUCKETS; i++) {
pool->buckets[i].free_list = ANV_FREE_LIST_EMPTY;
pool->buckets[i].block.next = 0;
@@ -877,15 +683,15 @@ anv_state_pool_finish(struct anv_state_pool *pool)
anv_block_pool_finish(&pool->block_pool);
}
-static uint32_t
+static VkResult
anv_fixed_size_state_pool_alloc_new(struct anv_fixed_size_state_pool *pool,
struct anv_block_pool *block_pool,
uint32_t state_size,
uint32_t block_size,
+ int64_t *offset,
uint32_t *padding)
{
struct anv_block_state block, old, new;
- uint32_t offset;
/* We don't always use anv_block_pool_alloc(), which would set *padding to
* zero for us. So if we have a pointer to padding, we must zero it out
@@ -898,21 +704,25 @@ anv_fixed_size_state_pool_alloc_new(struct anv_fixed_size_state_pool *pool,
* Instead, we just grab whole (potentially large) blocks.
*/
if (state_size >= block_size)
- return anv_block_pool_alloc(block_pool, state_size, padding);
+ return anv_block_pool_alloc(block_pool, state_size, offset, padding);
restart:
block.u64 = __sync_fetch_and_add(&pool->block.u64, state_size);
if (block.next < block.end) {
- return block.next;
+ *offset = block.next;
+ return VK_SUCCESS;
} else if (block.next == block.end) {
- offset = anv_block_pool_alloc(block_pool, block_size, padding);
- new.next = offset + state_size;
- new.end = offset + block_size;
+ VkResult result = anv_block_pool_alloc(block_pool, block_size,
+ offset, padding);
+ if (result != VK_SUCCESS)
+ return result;
+ new.next = *offset + state_size;
+ new.end = *offset + block_size;
old.u64 = __sync_lock_test_and_set(&pool->block.u64, new.u64);
if (old.next != block.next)
- futex_wake(&pool->block.end, INT_MAX);
- return offset;
+ futex_wake(&pool->block.end, INT32_MAX);
+ return result;
} else {
futex_wait(&pool->block.end, block.end, NULL);
goto restart;
@@ -922,7 +732,7 @@ anv_fixed_size_state_pool_alloc_new(struct anv_fixed_size_state_pool *pool,
static uint32_t
anv_state_pool_get_bucket(uint32_t size)
{
- unsigned size_log2 = ilog2_round_up(size);
+ unsigned size_log2 = util_logbase2_ceil(size);
assert(size_log2 <= ANV_MAX_STATE_SIZE_LOG2);
if (size_log2 < ANV_MIN_STATE_SIZE_LOG2)
size_log2 = ANV_MIN_STATE_SIZE_LOG2;
@@ -992,7 +802,7 @@ anv_state_pool_return_chunk(struct anv_state_pool *pool,
if (nblocks > 0) {
/* First return divisor aligned and sized chunks. We start returning
- * larger blocks from the end fo the chunk, since they should already be
+ * larger blocks from the end of the chunk, since they should already be
* aligned to divisor. Also anv_state_pool_return_blocks() only accepts
* aligned chunks.
*/
@@ -1031,7 +841,7 @@ anv_state_pool_alloc_no_vg(struct anv_state_pool *pool,
struct anv_state *state;
uint32_t alloc_size = anv_state_pool_get_bucket_size(bucket);
- int32_t offset;
+ int64_t offset;
/* Try free list first. */
state = anv_free_list_pop(&pool->buckets[bucket].free_list,
@@ -1091,14 +901,19 @@ anv_state_pool_alloc_no_vg(struct anv_state_pool *pool,
}
uint32_t padding;
- offset = anv_fixed_size_state_pool_alloc_new(&pool->buckets[bucket],
- &pool->block_pool,
- alloc_size,
- pool->block_size,
- &padding);
- /* Everytime we allocate a new state, add it to the state pool */
- uint32_t idx;
- UNUSED VkResult result = anv_state_table_add(&pool->table, &idx, 1);
+ VkResult result =
+ anv_fixed_size_state_pool_alloc_new(&pool->buckets[bucket],
+ &pool->block_pool,
+ alloc_size,
+ pool->block_size,
+ &offset,
+ &padding);
+ if (result != VK_SUCCESS)
+ return ANV_STATE_NULL;
+
+ /* Every time we allocate a new state, add it to the state pool */
+ uint32_t idx = 0;
+ result = anv_state_table_add(&pool->table, &idx, 1);
assert(result == VK_SUCCESS);
state = anv_state_table_get(&pool->table, idx);
@@ -1126,52 +941,16 @@ anv_state_pool_alloc(struct anv_state_pool *pool, uint32_t size, uint32_t align)
return state;
}
-struct anv_state
-anv_state_pool_alloc_back(struct anv_state_pool *pool)
-{
- struct anv_state *state;
- uint32_t alloc_size = pool->block_size;
-
- /* This function is only used with pools where start_offset == 0 */
- assert(pool->start_offset == 0);
-
- state = anv_free_list_pop(&pool->back_alloc_free_list, &pool->table);
- if (state) {
- assert(state->offset < pool->start_offset);
- goto done;
- }
-
- int32_t offset;
- offset = anv_block_pool_alloc_back(&pool->block_pool,
- pool->block_size);
- uint32_t idx;
- UNUSED VkResult result = anv_state_table_add(&pool->table, &idx, 1);
- assert(result == VK_SUCCESS);
-
- state = anv_state_table_get(&pool->table, idx);
- state->offset = pool->start_offset + offset;
- state->alloc_size = alloc_size;
- state->map = anv_block_pool_map(&pool->block_pool, offset, alloc_size);
-
-done:
- VG(VALGRIND_MEMPOOL_ALLOC(pool, state->map, state->alloc_size));
- return *state;
-}
-
static void
anv_state_pool_free_no_vg(struct anv_state_pool *pool, struct anv_state state)
{
assert(util_is_power_of_two_or_zero(state.alloc_size));
unsigned bucket = anv_state_pool_get_bucket(state.alloc_size);
- if (state.offset < pool->start_offset) {
- assert(state.alloc_size == pool->block_size);
- anv_free_list_push(&pool->back_alloc_free_list,
- &pool->table, state.idx, 1);
- } else {
- anv_free_list_push(&pool->buckets[bucket].free_list,
- &pool->table, state.idx, 1);
- }
+ assert(state.offset >= pool->start_offset);
+
+ anv_free_list_push(&pool->buckets[bucket].free_list,
+ &pool->table, state.idx, 1);
}
void
@@ -1216,6 +995,7 @@ anv_state_stream_init(struct anv_state_stream *stream,
*/
stream->next = block_size;
+ stream->total_size = 0;
util_dynarray_init(&stream->all_blocks, NULL);
VG(VALGRIND_CREATE_MEMPOOL(stream, 0, false));
@@ -1243,14 +1023,17 @@ anv_state_stream_alloc(struct anv_state_stream *stream,
assert(alignment <= PAGE_SIZE);
- uint32_t offset = align_u32(stream->next, alignment);
+ uint32_t offset = align(stream->next, alignment);
if (offset + size > stream->block.alloc_size) {
uint32_t block_size = stream->block_size;
if (block_size < size)
- block_size = round_to_power_of_two(size);
+ block_size = util_next_power_of_two(size);
stream->block = anv_state_pool_alloc_no_vg(stream->state_pool,
block_size, PAGE_SIZE);
+ if (stream->block.alloc_size == 0)
+ return ANV_STATE_NULL;
+
util_dynarray_append(&stream->all_blocks,
struct anv_state, stream->block);
VG(VALGRIND_MAKE_MEM_NOACCESS(stream->block.map, block_size));
@@ -1258,6 +1041,7 @@ anv_state_stream_alloc(struct anv_state_stream *stream,
/* Reset back to the start */
stream->next = offset = 0;
assert(offset + size <= stream->block.alloc_size);
+ stream->total_size += block_size;
}
const bool new_block = stream->next == 0;
@@ -1323,12 +1107,108 @@ anv_state_reserved_pool_free(struct anv_state_reserved_pool *pool,
anv_free_list_push(&pool->reserved_blocks, &pool->pool->table, state.idx, 1);
}
+VkResult
+anv_state_reserved_array_pool_init(struct anv_state_reserved_array_pool *pool,
+ struct anv_state_pool *parent,
+ uint32_t count, uint32_t size, uint32_t alignment)
+{
+ pool->pool = parent;
+ pool->count = count;
+ pool->size = size;
+ pool->stride = align(size, alignment);
+ pool->states = vk_zalloc(&pool->pool->block_pool.device->vk.alloc,
+ sizeof(BITSET_WORD) * BITSET_WORDS(pool->count), 8,
+ VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
+ if (pool->states == NULL)
+ return VK_ERROR_OUT_OF_HOST_MEMORY;
+
+ BITSET_SET_RANGE(pool->states, 0, pool->count - 1);
+ simple_mtx_init(&pool->mutex, mtx_plain);
+
+ pool->state = anv_state_pool_alloc(pool->pool, pool->stride * count, alignment);
+
+ return VK_SUCCESS;
+}
+
+void
+anv_state_reserved_array_pool_finish(struct anv_state_reserved_array_pool *pool)
+{
+ anv_state_pool_free(pool->pool, pool->state);
+ vk_free(&pool->pool->block_pool.device->vk.alloc, pool->states);
+ simple_mtx_destroy(&pool->mutex);
+}
+
+struct anv_state
+anv_state_reserved_array_pool_alloc(struct anv_state_reserved_array_pool *pool,
+ bool alloc_back)
+{
+ simple_mtx_lock(&pool->mutex);
+ int idx = alloc_back ?
+ __bitset_last_bit(pool->states, BITSET_WORDS(pool->count)) :
+ __bitset_ffs(pool->states, BITSET_WORDS(pool->count));
+ if (idx != 0)
+ BITSET_CLEAR(pool->states, idx - 1);
+ simple_mtx_unlock(&pool->mutex);
+
+ if (idx == 0)
+ return ANV_STATE_NULL;
+
+ idx--;
+
+ struct anv_state state = pool->state;
+ state.offset += idx * pool->stride;
+ state.map += idx * pool->stride;
+ state.alloc_size = pool->size;
+
+ return state;
+}
+
+struct anv_state
+anv_state_reserved_array_pool_alloc_index(struct anv_state_reserved_array_pool *pool,
+ uint32_t idx)
+{
+ simple_mtx_lock(&pool->mutex);
+ bool already_allocated = !BITSET_TEST(pool->states, idx);
+ if (!already_allocated)
+ BITSET_CLEAR(pool->states, idx);
+ simple_mtx_unlock(&pool->mutex);
+
+ if (already_allocated)
+ return ANV_STATE_NULL;
+
+ struct anv_state state = pool->state;
+ state.offset += idx * pool->stride;
+ state.map += idx * pool->stride;
+ state.alloc_size = pool->size;
+
+ return state;
+}
+
+uint32_t
+anv_state_reserved_array_pool_state_index(struct anv_state_reserved_array_pool *pool,
+ struct anv_state state)
+{
+ return (state.offset - pool->state.offset) / pool->stride;
+}
+
+void
+anv_state_reserved_array_pool_free(struct anv_state_reserved_array_pool *pool,
+ struct anv_state state)
+{
+ unsigned idx = (state.offset - pool->state.offset) / pool->stride;
+ simple_mtx_lock(&pool->mutex);
+ BITSET_SET(pool->states, idx);
+ simple_mtx_unlock(&pool->mutex);
+ }
+
void
anv_bo_pool_init(struct anv_bo_pool *pool, struct anv_device *device,
- const char *name)
+ const char *name, enum anv_bo_alloc_flags alloc_flags)
{
pool->name = name;
pool->device = device;
+ pool->bo_alloc_flags = alloc_flags;
+
for (unsigned i = 0; i < ARRAY_SIZE(pool->free_list); i++) {
util_sparse_array_free_list_init(&pool->free_list[i],
&device->bo_cache.bo_map, 0,
@@ -1361,7 +1241,7 @@ VkResult
anv_bo_pool_alloc(struct anv_bo_pool *pool, uint32_t size,
struct anv_bo **bo_out)
{
- const unsigned size_log2 = size < 4096 ? 12 : ilog2_round_up(size);
+ const unsigned size_log2 = size < 4096 ? 12 : util_logbase2_ceil(size);
const unsigned pow2_size = 1 << size_log2;
const unsigned bucket = size_log2 - 12;
assert(bucket < ARRAY_SIZE(pool->free_list));
@@ -1377,10 +1257,7 @@ anv_bo_pool_alloc(struct anv_bo_pool *pool, uint32_t size,
VkResult result = anv_device_alloc_bo(pool->device,
pool->name,
pow2_size,
- ANV_BO_ALLOC_LOCAL_MEM |
- ANV_BO_ALLOC_MAPPED |
- ANV_BO_ALLOC_SNOOPED |
- ANV_BO_ALLOC_CAPTURE,
+ pool->bo_alloc_flags,
0 /* explicit_address */,
&bo);
if (result != VK_SUCCESS)
@@ -1401,7 +1278,7 @@ anv_bo_pool_free(struct anv_bo_pool *pool, struct anv_bo *bo)
VG(VALGRIND_MEMPOOL_FREE(pool, bo->map));
assert(util_is_power_of_two_or_zero(bo->size));
- const unsigned size_log2 = ilog2_round_up(bo->size);
+ const unsigned size_log2 = util_logbase2_ceil(bo->size);
const unsigned bucket = size_log2 - 12;
assert(bucket < ARRAY_SIZE(pool->free_list));
@@ -1431,7 +1308,7 @@ anv_scratch_pool_finish(struct anv_device *device, struct anv_scratch_pool *pool
for (unsigned i = 0; i < 16; i++) {
if (pool->surf_states[i].map != NULL) {
- anv_state_pool_free(&device->surface_state_pool,
+ anv_state_pool_free(&device->scratch_surface_state_pool,
pool->surf_states[i]);
}
}
@@ -1449,7 +1326,7 @@ anv_scratch_pool_alloc(struct anv_device *device, struct anv_scratch_pool *pool,
assert(stage < ARRAY_SIZE(pool->bos));
- const struct intel_device_info *devinfo = &device->info;
+ const struct intel_device_info *devinfo = device->info;
/* On GFX version 12.5, scratch access changed to a surface-based model.
* Instead of each shader type having its own layout based on IDs passed
@@ -1484,9 +1361,11 @@ anv_scratch_pool_alloc(struct anv_device *device, struct anv_scratch_pool *pool,
*
* so nothing will ever touch the top page.
*/
+ const enum anv_bo_alloc_flags alloc_flags =
+ ANV_BO_ALLOC_INTERNAL |
+ (devinfo->verx10 < 125 ? ANV_BO_ALLOC_32BIT_ADDRESS : 0);
VkResult result = anv_device_alloc_bo(device, "scratch", size,
- ANV_BO_ALLOC_32BIT_ADDRESS |
- ANV_BO_ALLOC_LOCAL_MEM,
+ alloc_flags,
0 /* explicit_address */,
&bo);
if (result != VK_SUCCESS)
@@ -1507,6 +1386,8 @@ anv_scratch_pool_get_surf(struct anv_device *device,
struct anv_scratch_pool *pool,
unsigned per_thread_scratch)
{
+ assert(device->info->verx10 >= 125);
+
if (per_thread_scratch == 0)
return 0;
@@ -1523,7 +1404,7 @@ anv_scratch_pool_get_surf(struct anv_device *device,
struct anv_address addr = { .bo = bo };
struct anv_state state =
- anv_state_pool_alloc(&device->surface_state_pool,
+ anv_state_pool_alloc(&device->scratch_surface_state_pool,
device->isl_dev.ss.size, 64);
isl_buffer_fill_state(&device->isl_dev, state.map,
@@ -1538,7 +1419,7 @@ anv_scratch_pool_get_surf(struct anv_device *device,
uint32_t current = p_atomic_cmpxchg(&pool->surfs[scratch_size_log2],
0, state.offset);
if (current) {
- anv_state_pool_free(&device->surface_state_pool, state);
+ anv_state_pool_free(&device->scratch_surface_state_pool, state);
return current;
} else {
pool->surf_states[scratch_size_log2] = state;
@@ -1547,13 +1428,13 @@ anv_scratch_pool_get_surf(struct anv_device *device,
}
VkResult
-anv_bo_cache_init(struct anv_bo_cache *cache)
+anv_bo_cache_init(struct anv_bo_cache *cache, struct anv_device *device)
{
util_sparse_array_init(&cache->bo_map, sizeof(struct anv_bo), 1024);
if (pthread_mutex_init(&cache->mutex, NULL)) {
util_sparse_array_finish(&cache->bo_map);
- return vk_errorf(NULL, NULL, VK_ERROR_OUT_OF_HOST_MEMORY,
+ return vk_errorf(device, VK_ERROR_OUT_OF_HOST_MEMORY,
"pthread_mutex_init failed: %m");
}
@@ -1567,50 +1448,113 @@ anv_bo_cache_finish(struct anv_bo_cache *cache)
pthread_mutex_destroy(&cache->mutex);
}
-#define ANV_BO_CACHE_SUPPORTED_FLAGS \
- (EXEC_OBJECT_WRITE | \
- EXEC_OBJECT_ASYNC | \
- EXEC_OBJECT_SUPPORTS_48B_ADDRESS | \
- EXEC_OBJECT_PINNED | \
- EXEC_OBJECT_CAPTURE)
+static void
+anv_bo_unmap_close(struct anv_device *device, struct anv_bo *bo)
+{
+ if (bo->map && !bo->from_host_ptr)
+ anv_device_unmap_bo(device, bo, bo->map, bo->size, false /* replace */);
-static uint32_t
-anv_bo_alloc_flags_to_bo_flags(struct anv_device *device,
- enum anv_bo_alloc_flags alloc_flags)
+ assert(bo->gem_handle != 0);
+ device->kmd_backend->gem_close(device, bo);
+}
+
+static void
+anv_bo_vma_free(struct anv_device *device, struct anv_bo *bo)
{
- struct anv_physical_device *pdevice = device->physical;
+ if (bo->offset != 0 && !(bo->alloc_flags & ANV_BO_ALLOC_FIXED_ADDRESS)) {
+ assert(bo->vma_heap != NULL);
+ anv_vma_free(device, bo->vma_heap, bo->offset, bo->size);
+ }
+ bo->vma_heap = NULL;
+}
- uint64_t bo_flags = 0;
- if (!(alloc_flags & ANV_BO_ALLOC_32BIT_ADDRESS) &&
- pdevice->supports_48bit_addresses)
- bo_flags |= EXEC_OBJECT_SUPPORTS_48B_ADDRESS;
+static void
+anv_bo_finish(struct anv_device *device, struct anv_bo *bo)
+{
+ /* Not releasing vma in case unbind fails */
+ if (device->kmd_backend->vm_unbind_bo(device, bo) == VK_SUCCESS)
+ anv_bo_vma_free(device, bo);
- if ((alloc_flags & ANV_BO_ALLOC_CAPTURE) && pdevice->has_exec_capture)
- bo_flags |= EXEC_OBJECT_CAPTURE;
+ anv_bo_unmap_close(device, bo);
+}
- if (alloc_flags & ANV_BO_ALLOC_IMPLICIT_WRITE) {
- assert(alloc_flags & ANV_BO_ALLOC_IMPLICIT_SYNC);
- bo_flags |= EXEC_OBJECT_WRITE;
- }
+static VkResult
+anv_bo_vma_alloc_or_close(struct anv_device *device,
+ struct anv_bo *bo,
+ enum anv_bo_alloc_flags alloc_flags,
+ uint64_t explicit_address)
+{
+ assert(bo->vma_heap == NULL);
+ assert(explicit_address == intel_48b_address(explicit_address));
- if (!(alloc_flags & ANV_BO_ALLOC_IMPLICIT_SYNC) && pdevice->has_exec_async)
- bo_flags |= EXEC_OBJECT_ASYNC;
+ uint32_t align = device->physical->info.mem_alignment;
- if (pdevice->use_softpin)
- bo_flags |= EXEC_OBJECT_PINNED;
+ /* If it's big enough to store a tiled resource, we need 64K alignment */
+ if (bo->size >= 64 * 1024)
+ align = MAX2(64 * 1024, align);
- return bo_flags;
+ /* If we're using the AUX map, make sure we follow the required
+ * alignment.
+ */
+ if (alloc_flags & ANV_BO_ALLOC_AUX_TT_ALIGNED)
+ align = MAX2(intel_aux_map_get_alignment(device->aux_map_ctx), align);
+
+ /* Opportunistically align addresses to 2Mb when above 1Mb. We do this
+ * because this gives an opportunity for the kernel to use Transparent Huge
+ * Pages (the 2MB page table layout) for faster memory access.
+ *
+ * Only available on ICL+.
+ */
+ if (device->info->ver >= 11 && bo->size >= 1 * 1024 * 1024)
+ align = MAX2(2 * 1024 * 1024, align);
+
+ if (alloc_flags & ANV_BO_ALLOC_FIXED_ADDRESS) {
+ bo->offset = intel_canonical_address(explicit_address);
+ } else {
+ bo->offset = anv_vma_alloc(device, bo->size, align, alloc_flags,
+ explicit_address, &bo->vma_heap);
+ if (bo->offset == 0) {
+ anv_bo_unmap_close(device, bo);
+ return vk_errorf(device, VK_ERROR_OUT_OF_DEVICE_MEMORY,
+ "failed to allocate virtual address for BO");
+ }
+ }
+
+ return VK_SUCCESS;
}
-static uint32_t
-anv_device_get_bo_align(struct anv_device *device,
- enum anv_bo_alloc_flags alloc_flags)
+enum intel_device_info_mmap_mode
+anv_bo_get_mmap_mode(struct anv_device *device, struct anv_bo *bo)
{
- /* Gfx12 CCS surface addresses need to be 64K aligned. */
- if (device->info.ver >= 12 && (alloc_flags & ANV_BO_ALLOC_IMPLICIT_CCS))
- return 64 * 1024;
+ enum anv_bo_alloc_flags alloc_flags = bo->alloc_flags;
+
+ if (device->info->has_set_pat_uapi)
+ return anv_device_get_pat_entry(device, alloc_flags)->mmap;
+
+ if (anv_physical_device_has_vram(device->physical)) {
+ if ((alloc_flags & ANV_BO_ALLOC_NO_LOCAL_MEM) ||
+ (alloc_flags & ANV_BO_ALLOC_IMPORTED))
+ return INTEL_DEVICE_INFO_MMAP_MODE_WB;
+
+ return INTEL_DEVICE_INFO_MMAP_MODE_WC;
+ }
+
+ /* Gfx9 Atom parts (no LLC) */
+ if (!device->info->has_llc) {
+ /* The user wants cached and coherent memory, but achieving that without
+ * LLC on older platforms requires DRM_IOCTL_I915_GEM_SET_CACHING to be
+ * supported and set.
+ */
+ if (alloc_flags & ANV_BO_ALLOC_HOST_CACHED)
+ return INTEL_DEVICE_INFO_MMAP_MODE_WB;
+
+ return INTEL_DEVICE_INFO_MMAP_MODE_WC;
+ }
+
+ if (alloc_flags & (ANV_BO_ALLOC_SCANOUT | ANV_BO_ALLOC_EXTERNAL))
+ return INTEL_DEVICE_INFO_MMAP_MODE_WC;
- return 4096;
+ return INTEL_DEVICE_INFO_MMAP_MODE_WB;
}
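
One way the mode returned above is consumed downstream is the i915 mmap-offset ioctl, where WB and WC select a write-back (CPU cached) versus write-combined mapping. The sketch below is an assumption about how an i915-style backend might translate the mode; anv's real gem_mmap implementations live in the kmd backends and also cover the Xe uAPI.

#include <stdint.h>
#include <drm/i915_drm.h>

/* Illustrative only: pick an i915 mmap-offset flag from the mmap mode
 * chosen above. The enum comes from the intel_device_info headers already
 * included via anv_private.h in this file.
 */
static uint32_t
mmap_mode_to_i915_offset_flag(enum intel_device_info_mmap_mode mode)
{
   return mode == INTEL_DEVICE_INFO_MMAP_MODE_WB ? I915_MMAP_OFFSET_WB
                                                 : I915_MMAP_OFFSET_WC;
}
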
VkResult
@@ -1621,57 +1565,70 @@ anv_device_alloc_bo(struct anv_device *device,
uint64_t explicit_address,
struct anv_bo **bo_out)
{
- if (!(alloc_flags & ANV_BO_ALLOC_LOCAL_MEM))
- anv_perf_warn(device, NULL, "system memory used");
+ /* A BO that needs CPU access must be HOST_CACHED, HOST_COHERENT, or both. */
+ assert((alloc_flags & ANV_BO_ALLOC_MAPPED) == 0 ||
+ (alloc_flags & (ANV_BO_ALLOC_HOST_CACHED | ANV_BO_ALLOC_HOST_COHERENT)));
- if (!device->physical->has_implicit_ccs)
- assert(!(alloc_flags & ANV_BO_ALLOC_IMPLICIT_CCS));
-
- const uint32_t bo_flags =
- anv_bo_alloc_flags_to_bo_flags(device, alloc_flags);
- assert(bo_flags == (bo_flags & ANV_BO_CACHE_SUPPORTED_FLAGS));
+ /* The KMD requires a valid PAT index, so set HOST_COHERENT (WC) on BOs
+ * that don't need CPU access.
+ */
+ if ((alloc_flags & ANV_BO_ALLOC_MAPPED) == 0)
+ alloc_flags |= ANV_BO_ALLOC_HOST_COHERENT;
- /* The kernel is going to give us whole pages anyway */
- size = align_u64(size, 4096);
+ /* On platforms with LLC we can promote all BOs to cached+coherent for free. */
+ const enum anv_bo_alloc_flags not_allowed_promotion = ANV_BO_ALLOC_SCANOUT |
+ ANV_BO_ALLOC_EXTERNAL |
+ ANV_BO_ALLOC_PROTECTED;
+ if (device->info->has_llc && ((alloc_flags & not_allowed_promotion) == 0))
+ alloc_flags |= ANV_BO_ALLOC_HOST_COHERENT;
- const uint32_t align = anv_device_get_bo_align(device, alloc_flags);
+ const uint32_t bo_flags =
+ device->kmd_backend->bo_alloc_flags_to_bo_flags(device, alloc_flags);
- uint64_t ccs_size = 0;
- if (device->info.has_aux_map && (alloc_flags & ANV_BO_ALLOC_IMPLICIT_CCS)) {
- /* Align the size up to the next multiple of 64K so we don't have any
- * AUX-TT entries pointing from a 64K page to itself.
- */
- size = align_u64(size, 64 * 1024);
+ /* The kernel is going to give us whole pages anyway. */
+ size = align64(size, 4096);
- /* See anv_bo::_ccs_size */
- ccs_size = align_u64(DIV_ROUND_UP(size, INTEL_AUX_MAP_GFX12_CCS_SCALE), 4096);
+ const uint64_t ccs_offset = size;
+ if (alloc_flags & ANV_BO_ALLOC_AUX_CCS) {
+ assert(device->info->has_aux_map);
+ size += DIV_ROUND_UP(size, intel_aux_get_main_to_aux_ratio(device->aux_map_ctx));
+ size = align64(size, 4096);
}
- uint32_t gem_handle;
+ const struct intel_memory_class_instance *regions[2];
+ uint32_t nregions = 0;
/* If we have vram size, we have multiple memory regions and should choose
* one of them.
*/
- if (device->physical->vram.size > 0) {
- struct drm_i915_gem_memory_class_instance regions[2];
- uint32_t nregions = 0;
-
- if (alloc_flags & ANV_BO_ALLOC_LOCAL_MEM) {
- /* For vram allocation, still use system memory as a fallback. */
- regions[nregions++] = device->physical->vram.region;
- regions[nregions++] = device->physical->sys.region;
- } else {
+ if (anv_physical_device_has_vram(device->physical)) {
+ /* This always tries to put the object in local memory. Here
+ * vram_non_mappable and vram_mappable are actually the same region.
+ */
+ if (alloc_flags & ANV_BO_ALLOC_NO_LOCAL_MEM)
regions[nregions++] = device->physical->sys.region;
- }
+ else
+ regions[nregions++] = device->physical->vram_non_mappable.region;
- gem_handle = anv_gem_create_regions(device, size + ccs_size,
- nregions, regions);
+ /* If the buffer is mapped on the host, add the system memory region.
+ * This ensures that if the buffer cannot live in mappable local memory,
+ * it can be spilled to system memory.
+ */
+ if (!(alloc_flags & ANV_BO_ALLOC_NO_LOCAL_MEM) &&
+ ((alloc_flags & ANV_BO_ALLOC_MAPPED) ||
+ (alloc_flags & ANV_BO_ALLOC_LOCAL_MEM_CPU_VISIBLE)))
+ regions[nregions++] = device->physical->sys.region;
} else {
- gem_handle = anv_gem_create(device, size + ccs_size);
+ regions[nregions++] = device->physical->sys.region;
}
+ uint64_t actual_size;
+ uint32_t gem_handle = device->kmd_backend->gem_create(device, regions,
+ nregions, size,
+ alloc_flags,
+ &actual_size);
if (gem_handle == 0)
- return vk_error(VK_ERROR_OUT_OF_DEVICE_MEMORY);
+ return vk_error(device, VK_ERROR_OUT_OF_DEVICE_MEMORY);
struct anv_bo new_bo = {
.name = name,
@@ -1679,67 +1636,32 @@ anv_device_alloc_bo(struct anv_device *device,
.refcount = 1,
.offset = -1,
.size = size,
- ._ccs_size = ccs_size,
+ .ccs_offset = ccs_offset,
+ .actual_size = actual_size,
.flags = bo_flags,
- .is_external = (alloc_flags & ANV_BO_ALLOC_EXTERNAL),
- .has_client_visible_address =
- (alloc_flags & ANV_BO_ALLOC_CLIENT_VISIBLE_ADDRESS) != 0,
- .has_implicit_ccs = ccs_size > 0,
+ .alloc_flags = alloc_flags,
};
if (alloc_flags & ANV_BO_ALLOC_MAPPED) {
- new_bo.map = anv_gem_mmap(device, new_bo.gem_handle, 0, size, 0);
- if (new_bo.map == MAP_FAILED) {
- anv_gem_close(device, new_bo.gem_handle);
- return vk_errorf(device, &device->vk.base,
- VK_ERROR_OUT_OF_HOST_MEMORY,
- "mmap failed: %m");
- }
- }
-
- if (alloc_flags & ANV_BO_ALLOC_SNOOPED) {
- assert(alloc_flags & ANV_BO_ALLOC_MAPPED);
- /* We don't want to change these defaults if it's going to be shared
- * with another process.
- */
- assert(!(alloc_flags & ANV_BO_ALLOC_EXTERNAL));
-
- /* Regular objects are created I915_CACHING_CACHED on LLC platforms and
- * I915_CACHING_NONE on non-LLC platforms. For many internal state
- * objects, we'd rather take the snooping overhead than risk forgetting
- * a CLFLUSH somewhere. Userptr objects are always created as
- * I915_CACHING_CACHED, which on non-LLC means snooped so there's no
- * need to do this there.
- */
- if (!device->info.has_llc) {
- anv_gem_set_caching(device, new_bo.gem_handle,
- I915_CACHING_CACHED);
+ VkResult result = anv_device_map_bo(device, &new_bo, 0, size,
+ NULL, &new_bo.map);
+ if (unlikely(result != VK_SUCCESS)) {
+ device->kmd_backend->gem_close(device, &new_bo);
+ return result;
}
}
- if (alloc_flags & ANV_BO_ALLOC_FIXED_ADDRESS) {
- new_bo.has_fixed_address = true;
- new_bo.offset = explicit_address;
- } else if (new_bo.flags & EXEC_OBJECT_PINNED) {
- new_bo.offset = anv_vma_alloc(device, new_bo.size + new_bo._ccs_size,
- align, alloc_flags, explicit_address);
- if (new_bo.offset == 0) {
- if (new_bo.map)
- anv_gem_munmap(device, new_bo.map, size);
- anv_gem_close(device, new_bo.gem_handle);
- return vk_errorf(device, NULL, VK_ERROR_OUT_OF_DEVICE_MEMORY,
- "failed to allocate virtual address for BO");
- }
- } else {
- assert(!new_bo.has_client_visible_address);
- }
+ VkResult result = anv_bo_vma_alloc_or_close(device, &new_bo,
+ alloc_flags,
+ explicit_address);
+ if (result != VK_SUCCESS)
+ return result;
- if (new_bo._ccs_size > 0) {
- assert(device->info.has_aux_map);
- intel_aux_map_add_mapping(device->aux_map_ctx,
- intel_canonical_address(new_bo.offset),
- intel_canonical_address(new_bo.offset + new_bo.size),
- new_bo.size, 0 /* format_bits */);
+ result = device->kmd_backend->vm_bind_bo(device, &new_bo);
+ if (result != VK_SUCCESS) {
+ anv_bo_vma_free(device, &new_bo);
+ anv_bo_unmap_close(device, &new_bo);
+ return result;
}
assert(new_bo.gem_handle);
@@ -1752,6 +1674,56 @@ anv_device_alloc_bo(struct anv_device *device,
*bo_out = bo;
+ ANV_RMV(bo_allocate, device, bo);
+
+ return VK_SUCCESS;
+}
+
+VkResult
+anv_device_map_bo(struct anv_device *device,
+ struct anv_bo *bo,
+ uint64_t offset,
+ size_t size,
+ void *placed_addr,
+ void **map_out)
+{
+ assert(!bo->from_host_ptr);
+ assert(size > 0);
+
+ void *map = device->kmd_backend->gem_mmap(device, bo, offset, size, placed_addr);
+ if (unlikely(map == MAP_FAILED))
+ return vk_errorf(device, VK_ERROR_MEMORY_MAP_FAILED, "mmap failed: %m");
+
+ assert(placed_addr == NULL || map == placed_addr);
+
+ assert(map != NULL);
+ VG(VALGRIND_MALLOCLIKE_BLOCK(map, size, 0, 1));
+
+ if (map_out)
+ *map_out = map;
+
+ return VK_SUCCESS;
+}
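
A minimal usage sketch pairing the two helpers in this hunk (anv_device_map_bo above and anv_device_unmap_bo below): map the whole BO, write it from the CPU, then unmap without the replace path. touch_bo_on_cpu is a hypothetical caller, and the BO is assumed to have been allocated with CPU-mappable flags.

#include <string.h>

/* Hypothetical caller of the helpers in this hunk. Error handling is
 * reduced to passing the VkResult through.
 */
static VkResult
touch_bo_on_cpu(struct anv_device *device, struct anv_bo *bo)
{
   void *map = NULL;
   VkResult result = anv_device_map_bo(device, bo, 0, bo->size, NULL, &map);
   if (result != VK_SUCCESS)
      return result;

   memset(map, 0, bo->size);   /* CPU writes go through the BO's mmap mode */

   return anv_device_unmap_bo(device, bo, map, bo->size, false /* replace */);
}
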
+
+VkResult
+anv_device_unmap_bo(struct anv_device *device,
+ struct anv_bo *bo,
+ void *map, size_t map_size,
+ bool replace)
+{
+ assert(!bo->from_host_ptr);
+
+ if (replace) {
+ map = mmap(map, map_size, PROT_NONE,
+ MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0);
+ if (map == MAP_FAILED) {
+ return vk_errorf(device, VK_ERROR_MEMORY_MAP_FAILED,
+ "Failed to map over original mapping");
+ }
+ } else {
+ VG(VALGRIND_FREELIKE_BLOCK(map, 0));
+ munmap(map, map_size);
+ }
return VK_SUCCESS;
}
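
The replace path above keeps the BO's virtual address range reserved by mapping anonymous PROT_NONE pages over it instead of unmapping it. The standalone POSIX sketch below shows that reservation trick in isolation; reserve_range is a made-up name and this is not anv code.

#include <stddef.h>
#include <sys/mman.h>

/* Replace an existing mapping with inaccessible anonymous pages.
 * MAP_FIXED makes mmap() swap the pages out atomically at the same
 * address, so no concurrent allocation can land in the hole the way it
 * could after a plain munmap().
 */
static int
reserve_range(void *addr, size_t size)
{
   void *p = mmap(addr, size, PROT_NONE,
                  MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0);
   return p == MAP_FAILED ? -1 : 0;
}
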
@@ -1763,25 +1735,35 @@ anv_device_import_bo_from_host_ptr(struct anv_device *device,
struct anv_bo **bo_out)
{
assert(!(alloc_flags & (ANV_BO_ALLOC_MAPPED |
- ANV_BO_ALLOC_SNOOPED |
+ ANV_BO_ALLOC_HOST_CACHED |
+ ANV_BO_ALLOC_HOST_COHERENT |
+ ANV_BO_ALLOC_AUX_CCS |
+ ANV_BO_ALLOC_PROTECTED |
ANV_BO_ALLOC_FIXED_ADDRESS)));
-
- /* We can't do implicit CCS with an aux table on shared memory */
- if (!device->physical->has_implicit_ccs || device->info.has_aux_map)
- assert(!(alloc_flags & ANV_BO_ALLOC_IMPLICIT_CCS));
+ assert(alloc_flags & ANV_BO_ALLOC_EXTERNAL);
struct anv_bo_cache *cache = &device->bo_cache;
const uint32_t bo_flags =
- anv_bo_alloc_flags_to_bo_flags(device, alloc_flags);
- assert(bo_flags == (bo_flags & ANV_BO_CACHE_SUPPORTED_FLAGS));
+ device->kmd_backend->bo_alloc_flags_to_bo_flags(device, alloc_flags);
- uint32_t gem_handle = anv_gem_userptr(device, host_ptr, size);
+ uint32_t gem_handle = device->kmd_backend->gem_create_userptr(device, host_ptr, size);
if (!gem_handle)
- return vk_error(VK_ERROR_INVALID_EXTERNAL_HANDLE);
+ return vk_error(device, VK_ERROR_INVALID_EXTERNAL_HANDLE);
pthread_mutex_lock(&cache->mutex);
- struct anv_bo *bo = anv_device_lookup_bo(device, gem_handle);
+ struct anv_bo *bo = NULL;
+ if (device->info->kmd_type == INTEL_KMD_TYPE_XE) {
+ bo = vk_zalloc(&device->vk.alloc, sizeof(*bo), 8,
+ VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
+ if (!bo) {
+ pthread_mutex_unlock(&cache->mutex);
+ return VK_ERROR_OUT_OF_HOST_MEMORY;
+ }
+ } else {
+ bo = anv_device_lookup_bo(device, gem_handle);
+ }
+
if (bo->refcount > 0) {
/* VK_EXT_external_memory_host doesn't require handling importing the
* same pointer twice at the same time, but we don't get in the way. If
@@ -1790,59 +1772,59 @@ anv_device_import_bo_from_host_ptr(struct anv_device *device,
assert(bo->gem_handle == gem_handle);
if (bo_flags != bo->flags) {
pthread_mutex_unlock(&cache->mutex);
- return vk_errorf(device, NULL, VK_ERROR_INVALID_EXTERNAL_HANDLE,
+ return vk_errorf(device, VK_ERROR_INVALID_EXTERNAL_HANDLE,
"same host pointer imported two different ways");
}
- if (bo->has_client_visible_address !=
- ((alloc_flags & ANV_BO_ALLOC_CLIENT_VISIBLE_ADDRESS) != 0)) {
+ if ((bo->alloc_flags & ANV_BO_ALLOC_CLIENT_VISIBLE_ADDRESS) !=
+ (alloc_flags & ANV_BO_ALLOC_CLIENT_VISIBLE_ADDRESS)) {
pthread_mutex_unlock(&cache->mutex);
- return vk_errorf(device, NULL, VK_ERROR_INVALID_EXTERNAL_HANDLE,
+ return vk_errorf(device, VK_ERROR_INVALID_EXTERNAL_HANDLE,
"The same BO was imported with and without buffer "
"device address");
}
if (client_address && client_address != intel_48b_address(bo->offset)) {
pthread_mutex_unlock(&cache->mutex);
- return vk_errorf(device, NULL, VK_ERROR_INVALID_EXTERNAL_HANDLE,
+ return vk_errorf(device, VK_ERROR_INVALID_EXTERNAL_HANDLE,
"The same BO was imported at two different "
"addresses");
}
__sync_fetch_and_add(&bo->refcount, 1);
} else {
+ alloc_flags |= ANV_BO_ALLOC_IMPORTED;
struct anv_bo new_bo = {
.name = "host-ptr",
.gem_handle = gem_handle,
.refcount = 1,
.offset = -1,
.size = size,
+ .actual_size = size,
.map = host_ptr,
.flags = bo_flags,
- .is_external = true,
+ .alloc_flags = alloc_flags,
.from_host_ptr = true,
- .has_client_visible_address =
- (alloc_flags & ANV_BO_ALLOC_CLIENT_VISIBLE_ADDRESS) != 0,
};
- assert(client_address == intel_48b_address(client_address));
- if (new_bo.flags & EXEC_OBJECT_PINNED) {
- assert(new_bo._ccs_size == 0);
- new_bo.offset = anv_vma_alloc(device, new_bo.size,
- anv_device_get_bo_align(device,
- alloc_flags),
- alloc_flags, client_address);
- if (new_bo.offset == 0) {
- anv_gem_close(device, new_bo.gem_handle);
- pthread_mutex_unlock(&cache->mutex);
- return vk_errorf(device, NULL, VK_ERROR_OUT_OF_DEVICE_MEMORY,
- "failed to allocate virtual address for BO");
- }
- } else {
- assert(!new_bo.has_client_visible_address);
+ VkResult result = anv_bo_vma_alloc_or_close(device, &new_bo,
+ alloc_flags,
+ client_address);
+ if (result != VK_SUCCESS) {
+ pthread_mutex_unlock(&cache->mutex);
+ return result;
+ }
+
+ result = device->kmd_backend->vm_bind_bo(device, &new_bo);
+ if (result != VK_SUCCESS) {
+ anv_bo_vma_free(device, &new_bo);
+ pthread_mutex_unlock(&cache->mutex);
+ return result;
}
*bo = new_bo;
+
+ ANV_RMV(bo_allocate, device, bo);
}
pthread_mutex_unlock(&cache->mutex);
@@ -1859,125 +1841,90 @@ anv_device_import_bo(struct anv_device *device,
struct anv_bo **bo_out)
{
assert(!(alloc_flags & (ANV_BO_ALLOC_MAPPED |
- ANV_BO_ALLOC_SNOOPED |
+ ANV_BO_ALLOC_HOST_CACHED |
+ ANV_BO_ALLOC_HOST_COHERENT |
ANV_BO_ALLOC_FIXED_ADDRESS)));
-
- /* We can't do implicit CCS with an aux table on shared memory */
- if (!device->physical->has_implicit_ccs || device->info.has_aux_map)
- assert(!(alloc_flags & ANV_BO_ALLOC_IMPLICIT_CCS));
+ assert(alloc_flags & ANV_BO_ALLOC_EXTERNAL);
struct anv_bo_cache *cache = &device->bo_cache;
- const uint32_t bo_flags =
- anv_bo_alloc_flags_to_bo_flags(device, alloc_flags);
- assert(bo_flags == (bo_flags & ANV_BO_CACHE_SUPPORTED_FLAGS));
pthread_mutex_lock(&cache->mutex);
uint32_t gem_handle = anv_gem_fd_to_handle(device, fd);
if (!gem_handle) {
pthread_mutex_unlock(&cache->mutex);
- return vk_error(VK_ERROR_INVALID_EXTERNAL_HANDLE);
+ return vk_error(device, VK_ERROR_INVALID_EXTERNAL_HANDLE);
}
struct anv_bo *bo = anv_device_lookup_bo(device, gem_handle);
- if (bo->refcount > 0) {
- /* We have to be careful how we combine flags so that it makes sense.
- * Really, though, if we get to this case and it actually matters, the
- * client has imported a BO twice in different ways and they get what
- * they have coming.
- */
- uint64_t new_flags = 0;
- new_flags |= (bo->flags | bo_flags) & EXEC_OBJECT_WRITE;
- new_flags |= (bo->flags & bo_flags) & EXEC_OBJECT_ASYNC;
- new_flags |= (bo->flags & bo_flags) & EXEC_OBJECT_SUPPORTS_48B_ADDRESS;
- new_flags |= (bo->flags | bo_flags) & EXEC_OBJECT_PINNED;
- new_flags |= (bo->flags | bo_flags) & EXEC_OBJECT_CAPTURE;
-
- /* It's theoretically possible for a BO to get imported such that it's
- * both pinned and not pinned. The only way this can happen is if it
- * gets imported as both a semaphore and a memory object and that would
- * be an application error. Just fail out in that case.
- */
- if ((bo->flags & EXEC_OBJECT_PINNED) !=
- (bo_flags & EXEC_OBJECT_PINNED)) {
- pthread_mutex_unlock(&cache->mutex);
- return vk_errorf(device, NULL, VK_ERROR_INVALID_EXTERNAL_HANDLE,
- "The same BO was imported two different ways");
- }
- /* It's also theoretically possible that someone could export a BO from
- * one heap and import it into another or to import the same BO into two
- * different heaps. If this happens, we could potentially end up both
- * allowing and disallowing 48-bit addresses. There's not much we can
- * do about it if we're pinning so we just throw an error and hope no
- * app is actually that stupid.
- */
- if ((new_flags & EXEC_OBJECT_PINNED) &&
- (bo->flags & EXEC_OBJECT_SUPPORTS_48B_ADDRESS) !=
- (bo_flags & EXEC_OBJECT_SUPPORTS_48B_ADDRESS)) {
- pthread_mutex_unlock(&cache->mutex);
- return vk_errorf(device, NULL, VK_ERROR_INVALID_EXTERNAL_HANDLE,
- "The same BO was imported on two different heaps");
- }
+ uint32_t bo_flags;
+ VkResult result = anv_gem_import_bo_alloc_flags_to_bo_flags(device, bo,
+ alloc_flags,
+ &bo_flags);
+ if (result != VK_SUCCESS) {
+ pthread_mutex_unlock(&cache->mutex);
+ return result;
+ }
- if (bo->has_client_visible_address !=
- ((alloc_flags & ANV_BO_ALLOC_CLIENT_VISIBLE_ADDRESS) != 0)) {
+ if (bo->refcount > 0) {
+ if ((bo->alloc_flags & ANV_BO_ALLOC_CLIENT_VISIBLE_ADDRESS) !=
+ (alloc_flags & ANV_BO_ALLOC_CLIENT_VISIBLE_ADDRESS)) {
pthread_mutex_unlock(&cache->mutex);
- return vk_errorf(device, NULL, VK_ERROR_INVALID_EXTERNAL_HANDLE,
+ return vk_errorf(device, VK_ERROR_INVALID_EXTERNAL_HANDLE,
"The same BO was imported with and without buffer "
"device address");
}
if (client_address && client_address != intel_48b_address(bo->offset)) {
pthread_mutex_unlock(&cache->mutex);
- return vk_errorf(device, NULL, VK_ERROR_INVALID_EXTERNAL_HANDLE,
+ return vk_errorf(device, VK_ERROR_INVALID_EXTERNAL_HANDLE,
"The same BO was imported at two different "
"addresses");
}
- bo->flags = new_flags;
-
__sync_fetch_and_add(&bo->refcount, 1);
} else {
- off_t size = lseek(fd, 0, SEEK_END);
- if (size == (off_t)-1) {
- anv_gem_close(device, gem_handle);
- pthread_mutex_unlock(&cache->mutex);
- return vk_error(VK_ERROR_INVALID_EXTERNAL_HANDLE);
- }
-
+ alloc_flags |= ANV_BO_ALLOC_IMPORTED;
struct anv_bo new_bo = {
.name = "imported",
.gem_handle = gem_handle,
.refcount = 1,
.offset = -1,
- .size = size,
- .flags = bo_flags,
- .is_external = true,
- .has_client_visible_address =
- (alloc_flags & ANV_BO_ALLOC_CLIENT_VISIBLE_ADDRESS) != 0,
+ .alloc_flags = alloc_flags,
};
- assert(client_address == intel_48b_address(client_address));
- if (new_bo.flags & EXEC_OBJECT_PINNED) {
- assert(new_bo._ccs_size == 0);
- new_bo.offset = anv_vma_alloc(device, new_bo.size,
- anv_device_get_bo_align(device,
- alloc_flags),
- alloc_flags, client_address);
- if (new_bo.offset == 0) {
- anv_gem_close(device, new_bo.gem_handle);
- pthread_mutex_unlock(&cache->mutex);
- return vk_errorf(device, NULL, VK_ERROR_OUT_OF_DEVICE_MEMORY,
- "failed to allocate virtual address for BO");
- }
- } else {
- assert(!new_bo.has_client_visible_address);
+ off_t size = lseek(fd, 0, SEEK_END);
+ if (size == (off_t)-1) {
+ device->kmd_backend->gem_close(device, &new_bo);
+ pthread_mutex_unlock(&cache->mutex);
+ return vk_error(device, VK_ERROR_INVALID_EXTERNAL_HANDLE);
+ }
+ new_bo.size = size;
+ new_bo.actual_size = size;
+
+ VkResult result = anv_bo_vma_alloc_or_close(device, &new_bo,
+ alloc_flags,
+ client_address);
+ if (result != VK_SUCCESS) {
+ pthread_mutex_unlock(&cache->mutex);
+ return result;
+ }
+
+ result = device->kmd_backend->vm_bind_bo(device, &new_bo);
+ if (result != VK_SUCCESS) {
+ anv_bo_vma_free(device, &new_bo);
+ pthread_mutex_unlock(&cache->mutex);
+ return result;
}
*bo = new_bo;
+
+ ANV_RMV(bo_allocate, device, bo);
}
+ bo->flags = bo_flags;
+
pthread_mutex_unlock(&cache->mutex);
*bo_out = bo;
@@ -1994,17 +1941,49 @@ anv_device_export_bo(struct anv_device *device,
* to export it. This is done based on external options passed into
* anv_AllocateMemory.
*/
- assert(bo->is_external);
+ assert(anv_bo_is_external(bo));
int fd = anv_gem_handle_to_fd(device, bo->gem_handle);
if (fd < 0)
- return vk_error(VK_ERROR_TOO_MANY_OBJECTS);
+ return vk_error(device, VK_ERROR_TOO_MANY_OBJECTS);
*fd_out = fd;
return VK_SUCCESS;
}
+VkResult
+anv_device_get_bo_tiling(struct anv_device *device,
+ struct anv_bo *bo,
+ enum isl_tiling *tiling_out)
+{
+ int i915_tiling = anv_gem_get_tiling(device, bo->gem_handle);
+ if (i915_tiling < 0) {
+ return vk_errorf(device, VK_ERROR_INVALID_EXTERNAL_HANDLE,
+ "failed to get BO tiling: %m");
+ }
+
+ *tiling_out = isl_tiling_from_i915_tiling(i915_tiling);
+
+ return VK_SUCCESS;
+}
+
+VkResult
+anv_device_set_bo_tiling(struct anv_device *device,
+ struct anv_bo *bo,
+ uint32_t row_pitch_B,
+ enum isl_tiling tiling)
+{
+ int ret = anv_gem_set_tiling(device, bo->gem_handle, row_pitch_B,
+ isl_tiling_to_i915_tiling(tiling));
+ if (ret) {
+ return vk_errorf(device, VK_ERROR_OUT_OF_DEVICE_MEMORY,
+ "failed to set BO tiling: %m");
+ }
+
+ return VK_SUCCESS;
+}
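
As a usage note, the gralloc import path later in this series turns the queried tiling into the single-bit isl_tiling_flags mask that image creation expects. A trimmed sketch, with query_imported_bo_tiling_flags as a hypothetical wrapper:

/* Hypothetical wrapper around anv_device_get_bo_tiling() above, mirroring
 * the "1u << tiling" pattern used by anv_image_init_from_gralloc().
 */
static VkResult
query_imported_bo_tiling_flags(struct anv_device *device, struct anv_bo *bo,
                               isl_tiling_flags_t *flags_out)
{
   enum isl_tiling tiling;
   VkResult result = anv_device_get_bo_tiling(device, bo, &tiling);
   if (result != VK_SUCCESS)
      return result;

   *flags_out = 1u << tiling;
   return VK_SUCCESS;
}
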
+
static bool
atomic_dec_not_one(uint32_t *counter)
{
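
The body of atomic_dec_not_one() is elided by the hunk below; a decrement-unless-one helper is typically a compare-and-swap loop along the lines of the sketch that follows (an illustration, not necessarily the exact anv implementation).

#include <stdbool.h>
#include <stdint.h>

/* Atomically decrement *counter unless it is already 1. Returns true if
 * the decrement happened (other references remain), false if the caller
 * saw the last reference and must perform the teardown under the lock.
 */
static bool
dec_not_one_sketch(uint32_t *counter)
{
   uint32_t old = *counter;
   while (old != 1) {
      uint32_t seen = __sync_val_compare_and_swap(counter, old, old - 1);
      if (seen == old)
         return true;
      old = seen;
   }
   return false;
}
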
@@ -2028,7 +2007,10 @@ anv_device_release_bo(struct anv_device *device,
struct anv_bo *bo)
{
struct anv_bo_cache *cache = &device->bo_cache;
- assert(anv_device_lookup_bo(device, bo->gem_handle) == bo);
+ const bool bo_is_xe_userptr = device->info->kmd_type == INTEL_KMD_TYPE_XE &&
+ bo->from_host_ptr;
+ assert(bo_is_xe_userptr ||
+ anv_device_lookup_bo(device, bo->gem_handle) == bo);
/* Try to decrement the counter but don't go below one. If this succeeds
* then the refcount has been decremented and we are not the last
@@ -2037,6 +2019,8 @@ anv_device_release_bo(struct anv_device *device,
if (atomic_dec_not_one(&bo->refcount))
return;
+ ANV_RMV(bo_destroy, device, bo);
+
pthread_mutex_lock(&cache->mutex);
/* We are probably the last reference since our attempt to decrement above
@@ -2051,33 +2035,21 @@ anv_device_release_bo(struct anv_device *device,
}
assert(bo->refcount == 0);
- if (bo->map && !bo->from_host_ptr)
- anv_gem_munmap(device, bo->map, bo->size);
-
- if (bo->_ccs_size > 0) {
- assert(device->physical->has_implicit_ccs);
- assert(device->info.has_aux_map);
- assert(bo->has_implicit_ccs);
- intel_aux_map_unmap_range(device->aux_map_ctx,
- intel_canonical_address(bo->offset),
- bo->size);
- }
-
- if ((bo->flags & EXEC_OBJECT_PINNED) && !bo->has_fixed_address)
- anv_vma_free(device, bo->offset, bo->size + bo->_ccs_size);
-
- uint32_t gem_handle = bo->gem_handle;
-
/* Memset the BO just in case. The refcount being zero should be enough to
* prevent someone from assuming the data is valid but it's safer to just
- * stomp to zero just in case. We explicitly do this *before* we close the
- * GEM handle to ensure that if anyone allocates something and gets the
- * same GEM handle, the memset has already happen and won't stomp all over
- * any data they may write in this BO.
+ * stomp to zero just in case. We explicitly do this *before* we actually
+ * close the GEM handle to ensure that if anyone allocates something and
+ * gets the same GEM handle, the memset has already happened and won't stomp
+ * all over any data they may write in this BO.
*/
- memset(bo, 0, sizeof(*bo));
+ struct anv_bo old_bo = *bo;
+
+ if (bo_is_xe_userptr)
+ vk_free(&device->vk.alloc, bo);
+ else
+ memset(bo, 0, sizeof(*bo));
- anv_gem_close(device, gem_handle);
+ anv_bo_finish(device, &old_bo);
/* Don't unlock until we've actually closed the BO. The whole point of
* the BO cache is to ensure that we correctly handle races with creating
diff --git a/src/intel/vulkan/anv_android.c b/src/intel/vulkan/anv_android.c
index 418e844c471..2cea3fc9f36 100644
--- a/src/intel/vulkan/anv_android.c
+++ b/src/intel/vulkan/anv_android.c
@@ -34,16 +34,14 @@
#include <sync/sync.h>
#include "anv_private.h"
+#include "vk_android.h"
+#include "vk_common_entrypoints.h"
#include "vk_util.h"
static int anv_hal_open(const struct hw_module_t* mod, const char* id, struct hw_device_t** dev);
static int anv_hal_close(struct hw_device_t *dev);
-static void UNUSED
-static_asserts(void)
-{
- STATIC_ASSERT(HWVULKAN_DISPATCH_MAGIC == ICD_LOADER_MAGIC);
-}
+static_assert(HWVULKAN_DISPATCH_MAGIC == ICD_LOADER_MAGIC, "");
PUBLIC struct hwvulkan_module_t HAL_MODULE_INFO_SYM = {
.common = {
@@ -109,52 +107,34 @@ anv_hal_close(struct hw_device_t *dev)
enum {
/* Usage bit equal to GRALLOC_USAGE_HW_CAMERA_MASK */
- AHARDWAREBUFFER_USAGE_CAMERA_MASK = 0x00060000U,
+ BUFFER_USAGE_CAMERA_MASK = 0x00060000U,
};
inline VkFormat
vk_format_from_android(unsigned android_format, unsigned android_usage)
{
switch (android_format) {
- case AHARDWAREBUFFER_FORMAT_R8G8B8A8_UNORM:
- return VK_FORMAT_R8G8B8A8_UNORM;
case AHARDWAREBUFFER_FORMAT_R8G8B8X8_UNORM:
- case AHARDWAREBUFFER_FORMAT_R8G8B8_UNORM:
return VK_FORMAT_R8G8B8_UNORM;
- case AHARDWAREBUFFER_FORMAT_R5G6B5_UNORM:
- return VK_FORMAT_R5G6B5_UNORM_PACK16;
- case AHARDWAREBUFFER_FORMAT_R16G16B16A16_FLOAT:
- return VK_FORMAT_R16G16B16A16_SFLOAT;
- case AHARDWAREBUFFER_FORMAT_R10G10B10A2_UNORM:
- return VK_FORMAT_A2B10G10R10_UNORM_PACK32;
case AHARDWAREBUFFER_FORMAT_Y8Cb8Cr8_420:
case HAL_PIXEL_FORMAT_NV12_Y_TILED_INTEL:
return VK_FORMAT_G8_B8R8_2PLANE_420_UNORM;
+ case AHARDWAREBUFFER_FORMAT_YV12:
+ return VK_FORMAT_G8_B8_R8_3PLANE_420_UNORM;
case AHARDWAREBUFFER_FORMAT_IMPLEMENTATION_DEFINED:
- if (android_usage & AHARDWAREBUFFER_USAGE_CAMERA_MASK)
+ if (android_usage & BUFFER_USAGE_CAMERA_MASK)
return VK_FORMAT_G8_B8R8_2PLANE_420_UNORM;
else
return VK_FORMAT_R8G8B8_UNORM;
- case AHARDWAREBUFFER_FORMAT_BLOB:
default:
- return VK_FORMAT_UNDEFINED;
+ return vk_ahb_format_to_image_format(android_format);
}
}
-static inline unsigned
-android_format_from_vk(unsigned vk_format)
+unsigned
+anv_ahb_format_for_vk_format(VkFormat vk_format)
{
switch (vk_format) {
- case VK_FORMAT_R8G8B8A8_UNORM:
- return AHARDWAREBUFFER_FORMAT_R8G8B8A8_UNORM;
- case VK_FORMAT_R8G8B8_UNORM:
- return AHARDWAREBUFFER_FORMAT_R8G8B8_UNORM;
- case VK_FORMAT_R5G6B5_UNORM_PACK16:
- return AHARDWAREBUFFER_FORMAT_R5G6B5_UNORM;
- case VK_FORMAT_R16G16B16A16_SFLOAT:
- return AHARDWAREBUFFER_FORMAT_R16G16B16A16_FLOAT;
- case VK_FORMAT_A2B10G10R10_UNORM_PACK32:
- return AHARDWAREBUFFER_FORMAT_R10G10B10A2_UNORM;
case VK_FORMAT_G8_B8R8_2PLANE_420_UNORM:
#ifdef HAVE_CROS_GRALLOC
return AHARDWAREBUFFER_FORMAT_Y8Cb8Cr8_420;
@@ -162,15 +142,15 @@ android_format_from_vk(unsigned vk_format)
return HAL_PIXEL_FORMAT_NV12_Y_TILED_INTEL;
#endif
default:
- return AHARDWAREBUFFER_FORMAT_BLOB;
+ return vk_image_format_to_ahb_format(vk_format);
}
}
static VkResult
-get_ahw_buffer_format_properties(
+get_ahw_buffer_format_properties2(
VkDevice device_h,
const struct AHardwareBuffer *buffer,
- VkAndroidHardwareBufferFormatPropertiesANDROID *pProperties)
+ VkAndroidHardwareBufferFormatProperties2ANDROID *pProperties)
{
ANV_FROM_HANDLE(anv_device, device, device_h);
@@ -191,12 +171,12 @@ get_ahw_buffer_format_properties(
return VK_ERROR_INVALID_EXTERNAL_HANDLE;
/* Fill properties fields based on description. */
- VkAndroidHardwareBufferFormatPropertiesANDROID *p = pProperties;
+ VkAndroidHardwareBufferFormatProperties2ANDROID *p = pProperties;
p->format = vk_format_from_android(desc.format, desc.usage);
+ p->externalFormat = p->format;
const struct anv_format *anv_format = anv_get_format(p->format);
- p->externalFormat = (uint64_t) (uintptr_t) anv_format;
/* Default to OPTIMAL tiling but set to linear in case
* of AHARDWAREBUFFER_USAGE_GPU_DATA_BUFFER usage.
@@ -207,8 +187,8 @@ get_ahw_buffer_format_properties(
tiling = VK_IMAGE_TILING_LINEAR;
p->formatFeatures =
- anv_get_image_format_features(&device->info, p->format, anv_format,
- tiling, NULL);
+ anv_get_image_format_features2(device->physical, p->format, anv_format,
+ tiling, NULL);
/* "Images can be created with an external format even if the Android hardware
* buffer has a format which has an equivalent Vulkan format to enable
@@ -223,7 +203,7 @@ get_ahw_buffer_format_properties(
* VK_FORMAT_FEATURE_COSITED_CHROMA_SAMPLES_BIT"
*/
p->formatFeatures |=
- VK_FORMAT_FEATURE_MIDPOINT_CHROMA_SAMPLES_BIT;
+ VK_FORMAT_FEATURE_2_MIDPOINT_CHROMA_SAMPLES_BIT;
/* "Implementations may not always be able to determine the color model,
* numerical range, or chroma offsets of the image contents, so the values
@@ -257,10 +237,30 @@ anv_GetAndroidHardwareBufferPropertiesANDROID(
VkAndroidHardwareBufferFormatPropertiesANDROID *format_prop =
vk_find_struct(pProperties->pNext,
ANDROID_HARDWARE_BUFFER_FORMAT_PROPERTIES_ANDROID);
-
/* Fill format properties of an Android hardware buffer. */
- if (format_prop)
- get_ahw_buffer_format_properties(device_h, buffer, format_prop);
+ if (format_prop) {
+ VkAndroidHardwareBufferFormatProperties2ANDROID format_prop2 = {
+ .sType = VK_STRUCTURE_TYPE_ANDROID_HARDWARE_BUFFER_FORMAT_PROPERTIES_2_ANDROID,
+ };
+ get_ahw_buffer_format_properties2(device_h, buffer, &format_prop2);
+
+ format_prop->format = format_prop2.format;
+ format_prop->externalFormat = format_prop2.externalFormat;
+ format_prop->formatFeatures =
+ vk_format_features2_to_features(format_prop2.formatFeatures);
+ format_prop->samplerYcbcrConversionComponents =
+ format_prop2.samplerYcbcrConversionComponents;
+ format_prop->suggestedYcbcrModel = format_prop2.suggestedYcbcrModel;
+ format_prop->suggestedYcbcrRange = format_prop2.suggestedYcbcrRange;
+ format_prop->suggestedXChromaOffset = format_prop2.suggestedXChromaOffset;
+ format_prop->suggestedYChromaOffset = format_prop2.suggestedYChromaOffset;
+ }
+
+ VkAndroidHardwareBufferFormatProperties2ANDROID *format_prop2 =
+ vk_find_struct(pProperties->pNext,
+ ANDROID_HARDWARE_BUFFER_FORMAT_PROPERTIES_2_ANDROID);
+ if (format_prop2)
+ get_ahw_buffer_format_properties2(device_h, buffer, format_prop2);
/* NOTE - We support buffers with only one handle but do not error on
* multiple handle case. Reason is that we want to support YUV formats
@@ -282,81 +282,21 @@ anv_GetAndroidHardwareBufferPropertiesANDROID(
return VK_SUCCESS;
}
-VkResult
-anv_GetMemoryAndroidHardwareBufferANDROID(
- VkDevice device_h,
- const VkMemoryGetAndroidHardwareBufferInfoANDROID *pInfo,
- struct AHardwareBuffer **pBuffer)
-{
- ANV_FROM_HANDLE(anv_device_memory, mem, pInfo->memory);
-
- /* Some quotes from Vulkan spec:
- *
- * "If the device memory was created by importing an Android hardware
- * buffer, vkGetMemoryAndroidHardwareBufferANDROID must return that same
- * Android hardware buffer object."
- *
- * "VK_EXTERNAL_MEMORY_HANDLE_TYPE_ANDROID_HARDWARE_BUFFER_BIT_ANDROID must
- * have been included in VkExportMemoryAllocateInfo::handleTypes when
- * memory was created."
- */
- if (mem->ahw) {
- *pBuffer = mem->ahw;
- /* Increase refcount. */
- AHardwareBuffer_acquire(mem->ahw);
- return VK_SUCCESS;
- }
-
- return VK_ERROR_OUT_OF_HOST_MEMORY;
-}
-
-#endif
-
-/* Construct ahw usage mask from image usage bits, see
- * 'AHardwareBuffer Usage Equivalence' in Vulkan spec.
- */
-uint64_t
-anv_ahw_usage_from_vk_usage(const VkImageCreateFlags vk_create,
- const VkImageUsageFlags vk_usage)
-{
- uint64_t ahw_usage = 0;
-#if ANDROID_API_LEVEL >= 26
- if (vk_usage & VK_IMAGE_USAGE_SAMPLED_BIT)
- ahw_usage |= AHARDWAREBUFFER_USAGE_GPU_SAMPLED_IMAGE;
-
- if (vk_usage & VK_IMAGE_USAGE_INPUT_ATTACHMENT_BIT)
- ahw_usage |= AHARDWAREBUFFER_USAGE_GPU_SAMPLED_IMAGE;
-
- if (vk_usage & VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT)
- ahw_usage |= AHARDWAREBUFFER_USAGE_GPU_COLOR_OUTPUT;
-
- if (vk_create & VK_IMAGE_CREATE_CUBE_COMPATIBLE_BIT)
- ahw_usage |= AHARDWAREBUFFER_USAGE_GPU_CUBE_MAP;
-
- if (vk_create & VK_IMAGE_CREATE_PROTECTED_BIT)
- ahw_usage |= AHARDWAREBUFFER_USAGE_PROTECTED_CONTENT;
-
- /* No usage bits set - set at least one GPU usage. */
- if (ahw_usage == 0)
- ahw_usage = AHARDWAREBUFFER_USAGE_GPU_SAMPLED_IMAGE;
#endif
- return ahw_usage;
-}
/*
* Called from anv_AllocateMemory when import AHardwareBuffer.
*/
VkResult
anv_import_ahw_memory(VkDevice device_h,
- struct anv_device_memory *mem,
- const VkImportAndroidHardwareBufferInfoANDROID *info)
+ struct anv_device_memory *mem)
{
#if ANDROID_API_LEVEL >= 26
ANV_FROM_HANDLE(anv_device, device, device_h);
/* Import from AHardwareBuffer to anv_device_memory. */
const native_handle_t *handle =
- AHardwareBuffer_getNativeHandle(info->buffer);
+ AHardwareBuffer_getNativeHandle(mem->vk.ahardware_buffer);
/* NOTE - We support buffers with only one handle but do not error on
* multiple handle case. Reason is that we want to support YUV formats
@@ -372,14 +312,6 @@ anv_import_ahw_memory(VkDevice device_h,
&mem->bo);
assert(result == VK_SUCCESS);
- /* "If the vkAllocateMemory command succeeds, the implementation must
- * acquire a reference to the imported hardware buffer, which it must
- * release when the device memory object is freed. If the command fails,
- * the implementation must not retain a reference."
- */
- AHardwareBuffer_acquire(info->buffer);
- mem->ahw = info->buffer;
-
return VK_SUCCESS;
#else
return VK_ERROR_EXTENSION_NOT_PRESENT;
@@ -387,80 +319,11 @@ anv_import_ahw_memory(VkDevice device_h,
}
VkResult
-anv_create_ahw_memory(VkDevice device_h,
- struct anv_device_memory *mem,
- const VkMemoryAllocateInfo *pAllocateInfo)
-{
-#if ANDROID_API_LEVEL >= 26
- const VkMemoryDedicatedAllocateInfo *dedicated_info =
- vk_find_struct_const(pAllocateInfo->pNext,
- MEMORY_DEDICATED_ALLOCATE_INFO);
-
- uint32_t w = 0;
- uint32_t h = 1;
- uint32_t layers = 1;
- uint32_t format = 0;
- uint64_t usage = 0;
-
- /* If caller passed dedicated information. */
- if (dedicated_info && dedicated_info->image) {
- ANV_FROM_HANDLE(anv_image, image, dedicated_info->image);
- w = image->vk.extent.width;
- h = image->vk.extent.height;
- layers = image->vk.array_layers;
- format = android_format_from_vk(image->vk.format);
- usage = anv_ahw_usage_from_vk_usage(image->vk.create_flags, image->vk.usage);
- } else if (dedicated_info && dedicated_info->buffer) {
- ANV_FROM_HANDLE(anv_buffer, buffer, dedicated_info->buffer);
- w = buffer->size;
- format = AHARDWAREBUFFER_FORMAT_BLOB;
- usage = AHARDWAREBUFFER_USAGE_CPU_READ_OFTEN |
- AHARDWAREBUFFER_USAGE_CPU_WRITE_OFTEN;
- } else {
- w = pAllocateInfo->allocationSize;
- format = AHARDWAREBUFFER_FORMAT_BLOB;
- usage = AHARDWAREBUFFER_USAGE_CPU_READ_OFTEN |
- AHARDWAREBUFFER_USAGE_CPU_WRITE_OFTEN;
- }
-
- struct AHardwareBuffer *ahw = NULL;
- struct AHardwareBuffer_Desc desc = {
- .width = w,
- .height = h,
- .layers = layers,
- .format = format,
- .usage = usage,
- };
-
- if (AHardwareBuffer_allocate(&desc, &ahw) != 0)
- return VK_ERROR_OUT_OF_HOST_MEMORY;
-
- const VkImportAndroidHardwareBufferInfoANDROID import_info = {
- .buffer = ahw,
- };
- VkResult result = anv_import_ahw_memory(device_h, mem, &import_info);
-
- /* Release a reference to avoid leak for AHB allocation. */
- AHardwareBuffer_release(ahw);
-
- return result;
-#else
- return VK_ERROR_EXTENSION_NOT_PRESENT;
-#endif
-
-}
-
-VkResult
-anv_image_from_gralloc(VkDevice device_h,
- const VkImageCreateInfo *base_info,
- const VkNativeBufferANDROID *gralloc_info,
- const VkAllocationCallbacks *alloc,
- VkImage *out_image_h)
-
+anv_image_init_from_gralloc(struct anv_device *device,
+ struct anv_image *image,
+ const VkImageCreateInfo *base_info,
+ const VkNativeBufferANDROID *gralloc_info)
{
- ANV_FROM_HANDLE(anv_device, device, device_h);
- VkImage image_h = VK_NULL_HANDLE;
- struct anv_image *image = NULL;
struct anv_bo *bo = NULL;
VkResult result;
@@ -469,13 +332,6 @@ anv_image_from_gralloc(VkDevice device_h,
.isl_extra_usage_flags = ISL_SURF_USAGE_DISABLE_AUX_BIT,
};
- if (gralloc_info->handle->numFds != 1) {
- return vk_errorf(device, &device->vk.base,
- VK_ERROR_INVALID_EXTERNAL_HANDLE,
- "VkNativeBufferANDROID::handle::numFds is %d, "
- "expected 1", gralloc_info->handle->numFds);
- }
-
/* Do not close the gralloc handle's dma_buf. The lifetime of the dma_buf
* must exceed that of the gralloc handle, and we do not own the gralloc
* handle.
@@ -492,69 +348,43 @@ anv_image_from_gralloc(VkDevice device_h,
*
*/
result = anv_device_import_bo(device, dma_buf,
+ ANV_BO_ALLOC_EXTERNAL |
ANV_BO_ALLOC_IMPLICIT_SYNC |
ANV_BO_ALLOC_IMPLICIT_WRITE,
0 /* client_address */,
&bo);
if (result != VK_SUCCESS) {
- return vk_errorf(device, &device->vk.base, result,
+ return vk_errorf(device, result,
"failed to import dma-buf from VkNativeBufferANDROID");
}
- int i915_tiling = anv_gem_get_tiling(device, bo->gem_handle);
- switch (i915_tiling) {
- case I915_TILING_NONE:
- anv_info.isl_tiling_flags = ISL_TILING_LINEAR_BIT;
- break;
- case I915_TILING_X:
- anv_info.isl_tiling_flags = ISL_TILING_X_BIT;
- break;
- case I915_TILING_Y:
- anv_info.isl_tiling_flags = ISL_TILING_Y0_BIT;
- break;
- case -1:
- result = vk_errorf(device, &device->vk.base,
- VK_ERROR_INVALID_EXTERNAL_HANDLE,
- "DRM_IOCTL_I915_GEM_GET_TILING failed for "
- "VkNativeBufferANDROID");
- goto fail_tiling;
- default:
- result = vk_errorf(device, &device->vk.base,
- VK_ERROR_INVALID_EXTERNAL_HANDLE,
- "DRM_IOCTL_I915_GEM_GET_TILING returned unknown "
- "tiling %d for VkNativeBufferANDROID", i915_tiling);
- goto fail_tiling;
+ enum isl_tiling tiling;
+ result = anv_device_get_bo_tiling(device, bo, &tiling);
+ if (result != VK_SUCCESS) {
+ return vk_errorf(device, result,
+ "failed to get tiling from VkNativeBufferANDROID");
}
+ anv_info.isl_tiling_flags = 1u << tiling;
- enum isl_format format = anv_get_isl_format(&device->info,
- base_info->format,
- VK_IMAGE_ASPECT_COLOR_BIT,
- base_info->tiling);
- assert(format != ISL_FORMAT_UNSUPPORTED);
+ anv_info.stride = gralloc_info->stride;
- result = anv_image_create(device_h, &anv_info, alloc, &image_h);
- image = anv_image_from_handle(image_h);
+ result = anv_image_init(device, image, &anv_info);
if (result != VK_SUCCESS)
- goto fail_create;
-
- VkImageMemoryRequirementsInfo2 mem_reqs_info = {
- .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_REQUIREMENTS_INFO_2,
- .image = image_h,
- };
+ goto fail_init;
VkMemoryRequirements2 mem_reqs = {
.sType = VK_STRUCTURE_TYPE_MEMORY_REQUIREMENTS_2,
};
- anv_GetImageMemoryRequirements2(device_h, &mem_reqs_info, &mem_reqs);
+ anv_image_get_memory_requirements(device, image, image->vk.aspects,
+ &mem_reqs);
VkDeviceSize aligned_image_size =
- align_u64(mem_reqs.memoryRequirements.size,
- mem_reqs.memoryRequirements.alignment);
+ align64(mem_reqs.memoryRequirements.size,
+ mem_reqs.memoryRequirements.alignment);
if (bo->size < aligned_image_size) {
- result = vk_errorf(device, &device->vk.base,
- VK_ERROR_INVALID_EXTERNAL_HANDLE,
+ result = vk_errorf(device, VK_ERROR_INVALID_EXTERNAL_HANDLE,
"dma-buf from VkNativeBufferANDROID is too small for "
"VkImage: %"PRIu64"B < %"PRIu64"B",
bo->size, aligned_image_size);
@@ -570,15 +400,11 @@ anv_image_from_gralloc(VkDevice device_h,
image->bindings[ANV_IMAGE_MEMORY_BINDING_MAIN].address.bo = bo;
image->from_gralloc = true;
- /* Don't clobber the out-parameter until success is certain. */
- *out_image_h = image_h;
-
return VK_SUCCESS;
fail_size:
- anv_DestroyImage(device_h, image_h, alloc);
- fail_create:
- fail_tiling:
+ anv_image_finish(image);
+ fail_init:
anv_device_release_bo(device, bo);
return result;
@@ -606,18 +432,19 @@ anv_image_bind_from_gralloc(struct anv_device *device,
*/
struct anv_bo *bo = NULL;
VkResult result = anv_device_import_bo(device, dma_buf,
+ ANV_BO_ALLOC_EXTERNAL |
ANV_BO_ALLOC_IMPLICIT_SYNC |
ANV_BO_ALLOC_IMPLICIT_WRITE,
0 /* client_address */,
&bo);
if (result != VK_SUCCESS) {
- return vk_errorf(device, &device->vk.base, result,
+ return vk_errorf(device, result,
"failed to import dma-buf from VkNativeBufferANDROID");
}
uint64_t img_size = image->bindings[ANV_IMAGE_MEMORY_BINDING_MAIN].memory_range.size;
if (img_size < bo->size) {
- result = vk_errorf(device, &device->vk.base, VK_ERROR_INVALID_EXTERNAL_HANDLE,
+ result = vk_errorf(device, VK_ERROR_INVALID_EXTERNAL_HANDLE,
"dma-buf from VkNativeBufferANDROID is too small for "
"VkImage: %"PRIu64"B < %"PRIu64"B",
bo->size, img_size);
@@ -661,7 +488,7 @@ format_supported_with_usage(VkDevice device_h, VkFormat format,
result = anv_GetPhysicalDeviceImageFormatProperties2(phys_dev_h,
&image_format_info, &image_format_props);
if (result != VK_SUCCESS) {
- return vk_errorf(device, &device->vk.base, result,
+ return vk_errorf(device, result,
"anv_GetPhysicalDeviceImageFormatProperties2 failed "
"inside %s", __func__);
}
@@ -700,7 +527,7 @@ setup_gralloc0_usage(struct anv_device *device, VkFormat format,
* gralloc swapchains.
*/
if (imageUsage != 0) {
- return vk_errorf(device, &device->vk.base, VK_ERROR_FORMAT_NOT_SUPPORTED,
+ return vk_errorf(device, VK_ERROR_FORMAT_NOT_SUPPORTED,
"unsupported VkImageUsageFlags(0x%x) for gralloc "
"swapchain", imageUsage);
}
@@ -745,7 +572,8 @@ VkResult anv_GetSwapchainGrallocUsage2ANDROID(
*grallocConsumerUsage = 0;
*grallocProducerUsage = 0;
- mesa_logd("%s: format=%d, usage=0x%x", __func__, format, imageUsage);
+ mesa_logd("%s: format=%d, usage=0x%x, swapchainUsage=0x%x", __func__, format,
+ imageUsage, swapchainImageUsage);
result = format_supported_with_usage(device_h, format, imageUsage);
if (result != VK_SUCCESS)
@@ -774,6 +602,13 @@ VkResult anv_GetSwapchainGrallocUsage2ANDROID(
*grallocConsumerUsage |= GRALLOC1_CONSUMER_USAGE_HWCOMPOSER;
}
+ if ((swapchainImageUsage & VK_SWAPCHAIN_IMAGE_USAGE_SHARED_BIT_ANDROID) &&
+ device->u_gralloc != NULL) {
+ uint64_t front_rendering_usage = 0;
+ u_gralloc_get_front_rendering_usage(device->u_gralloc, &front_rendering_usage);
+ *grallocProducerUsage |= front_rendering_usage;
+ }
+
return VK_SUCCESS;
}
#endif
@@ -796,115 +631,3 @@ VkResult anv_GetSwapchainGrallocUsageANDROID(
return setup_gralloc0_usage(device, format, imageUsage, grallocUsage);
}
-
-VkResult
-anv_AcquireImageANDROID(
- VkDevice device_h,
- VkImage image_h,
- int nativeFenceFd,
- VkSemaphore semaphore_h,
- VkFence fence_h)
-{
- VkResult result = VK_SUCCESS;
-
- /* From https://source.android.com/devices/graphics/implement-vulkan :
- *
- * "The driver takes ownership of the fence file descriptor and closes
- * the fence file descriptor when no longer needed. The driver must do
- * so even if neither a semaphore or fence object is provided, or even
- * if vkAcquireImageANDROID fails and returns an error."
- *
- * The Vulkan spec for VkImportFence/SemaphoreFdKHR(), however, requires
- * the file descriptor to be left alone on failure.
- */
- int semaphore_fd = -1, fence_fd = -1;
- if (nativeFenceFd >= 0) {
- if (semaphore_h != VK_NULL_HANDLE && fence_h != VK_NULL_HANDLE) {
- /* We have both so we have to import the sync file twice. One of
- * them needs to be a dup.
- */
- semaphore_fd = nativeFenceFd;
- fence_fd = dup(nativeFenceFd);
- if (fence_fd < 0) {
- VkResult err = (errno == EMFILE) ? VK_ERROR_TOO_MANY_OBJECTS :
- VK_ERROR_OUT_OF_HOST_MEMORY;
- close(nativeFenceFd);
- return vk_error(err);
- }
- } else if (semaphore_h != VK_NULL_HANDLE) {
- semaphore_fd = nativeFenceFd;
- } else if (fence_h != VK_NULL_HANDLE) {
- fence_fd = nativeFenceFd;
- } else {
- /* Nothing to import into so we have to close the file */
- close(nativeFenceFd);
- }
- }
-
- if (semaphore_h != VK_NULL_HANDLE) {
- const VkImportSemaphoreFdInfoKHR info = {
- .sType = VK_STRUCTURE_TYPE_IMPORT_SEMAPHORE_FD_INFO_KHR,
- .semaphore = semaphore_h,
- .flags = VK_SEMAPHORE_IMPORT_TEMPORARY_BIT,
- .handleType = VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_SYNC_FD_BIT,
- .fd = semaphore_fd,
- };
- result = anv_ImportSemaphoreFdKHR(device_h, &info);
- if (result == VK_SUCCESS)
- semaphore_fd = -1; /* ANV took ownership */
- }
-
- if (result == VK_SUCCESS && fence_h != VK_NULL_HANDLE) {
- const VkImportFenceFdInfoKHR info = {
- .sType = VK_STRUCTURE_TYPE_IMPORT_FENCE_FD_INFO_KHR,
- .fence = fence_h,
- .flags = VK_FENCE_IMPORT_TEMPORARY_BIT,
- .handleType = VK_EXTERNAL_FENCE_HANDLE_TYPE_SYNC_FD_BIT,
- .fd = fence_fd,
- };
- result = anv_ImportFenceFdKHR(device_h, &info);
- if (result == VK_SUCCESS)
- fence_fd = -1; /* ANV took ownership */
- }
-
- if (semaphore_fd >= 0)
- close(semaphore_fd);
- if (fence_fd >= 0)
- close(fence_fd);
-
- return result;
-}
-
-VkResult
-anv_QueueSignalReleaseImageANDROID(
- VkQueue queue,
- uint32_t waitSemaphoreCount,
- const VkSemaphore* pWaitSemaphores,
- VkImage image,
- int* pNativeFenceFd)
-{
- VkResult result;
-
- if (waitSemaphoreCount == 0)
- goto done;
-
- result = anv_QueueSubmit(queue, 1,
- &(VkSubmitInfo) {
- .sType = VK_STRUCTURE_TYPE_SUBMIT_INFO,
- .waitSemaphoreCount = 1,
- .pWaitSemaphores = pWaitSemaphores,
- },
- (VkFence) VK_NULL_HANDLE);
- if (result != VK_SUCCESS)
- return result;
-
- done:
- if (pNativeFenceFd) {
- /* We can rely implicit on sync because above we submitted all
- * semaphores to the queue.
- */
- *pNativeFenceFd = -1;
- }
-
- return VK_SUCCESS;
-}
diff --git a/src/intel/vulkan/anv_android.h b/src/intel/vulkan/anv_android.h
index 2e329b3029c..cbd0a0a1634 100644
--- a/src/intel/vulkan/anv_android.h
+++ b/src/intel/vulkan/anv_android.h
@@ -24,7 +24,9 @@
#ifndef ANV_ANDROID_H
#define ANV_ANDROID_H
-#if defined(ANDROID) && ANDROID_API_LEVEL >= 26
+#include "util/detect_os.h"
+
+#if DETECT_OS_ANDROID && ANDROID_API_LEVEL >= 26
#include <vndk/hardware_buffer.h>
#endif
#include <vulkan/vulkan.h>
@@ -35,30 +37,21 @@ struct anv_device_memory;
struct anv_device;
struct anv_image;
-VkResult anv_image_from_gralloc(VkDevice device_h,
- const VkImageCreateInfo *base_info,
- const VkNativeBufferANDROID *gralloc_info,
- const VkAllocationCallbacks *alloc,
- VkImage *pImage);
+VkResult anv_image_init_from_gralloc(struct anv_device *device,
+ struct anv_image *image,
+ const VkImageCreateInfo *base_info,
+ const VkNativeBufferANDROID *gralloc_info);
VkResult anv_image_bind_from_gralloc(struct anv_device *device,
struct anv_image *image,
const VkNativeBufferANDROID *gralloc_info);
-VkResult anv_image_from_external(VkDevice device_h,
- const VkImageCreateInfo *base_info,
- const VkExternalMemoryImageCreateInfo *create_info,
- const VkAllocationCallbacks *alloc,
- VkImage *out_image_h);
-
-uint64_t anv_ahw_usage_from_vk_usage(const VkImageCreateFlags vk_create,
- const VkImageUsageFlags vk_usage);
+unsigned anv_ahb_format_for_vk_format(VkFormat vk_format);
VkResult anv_import_ahw_memory(VkDevice device_h,
- struct anv_device_memory *mem,
- const VkImportAndroidHardwareBufferInfoANDROID *info);
+ struct anv_device_memory *mem);
VkResult anv_create_ahw_memory(VkDevice device_h,
struct anv_device_memory *mem,
- const VkMemoryAllocateInfo *pAllocateInfo);
+ const VkMemoryDedicatedAllocateInfo *dedicated_info);
#endif /* ANV_ANDROID_H */
diff --git a/src/intel/vulkan/anv_android_stubs.c b/src/intel/vulkan/anv_android_stubs.c
index f6b2d1c8dd1..f1b2ef6b6f8 100644
--- a/src/intel/vulkan/anv_android_stubs.c
+++ b/src/intel/vulkan/anv_android_stubs.c
@@ -24,11 +24,10 @@
#include "anv_android.h"
VkResult
-anv_image_from_gralloc(VkDevice device_h,
- const VkImageCreateInfo *base_info,
- const VkNativeBufferANDROID *gralloc_info,
- const VkAllocationCallbacks *alloc,
- VkImage *pImage)
+anv_image_init_from_gralloc(struct anv_device *device,
+ struct anv_image *image,
+ const VkImageCreateInfo *base_info,
+ const VkNativeBufferANDROID *gralloc_info)
{
return VK_ERROR_EXTENSION_NOT_PRESENT;
}
@@ -40,17 +39,14 @@ VkResult anv_image_bind_from_gralloc(struct anv_device *device,
return VK_ERROR_EXTENSION_NOT_PRESENT;
}
-uint64_t
-anv_ahw_usage_from_vk_usage(const VkImageCreateFlags vk_create,
- const VkImageUsageFlags vk_usage)
+unsigned anv_ahb_format_for_vk_format(VkFormat vk_format)
{
return 0;
}
VkResult
anv_import_ahw_memory(VkDevice device_h,
- struct anv_device_memory *mem,
- const VkImportAndroidHardwareBufferInfoANDROID *info)
+ struct anv_device_memory *mem)
{
return VK_ERROR_EXTENSION_NOT_PRESENT;
}
@@ -58,17 +54,7 @@ anv_import_ahw_memory(VkDevice device_h,
VkResult
anv_create_ahw_memory(VkDevice device_h,
struct anv_device_memory *mem,
- const VkMemoryAllocateInfo *pAllocateInfo)
-{
- return VK_ERROR_EXTENSION_NOT_PRESENT;
-}
-
-VkResult
-anv_image_from_external(VkDevice device_h,
- const VkImageCreateInfo *base_info,
- const VkExternalMemoryImageCreateInfo *create_info,
- const VkAllocationCallbacks *alloc,
- VkImage *out_image_h)
+ const VkMemoryDedicatedAllocateInfo *dedicated_info)
{
return VK_ERROR_EXTENSION_NOT_PRESENT;
}
diff --git a/src/intel/vulkan/anv_astc_emu.c b/src/intel/vulkan/anv_astc_emu.c
new file mode 100644
index 00000000000..7a0f354a5e5
--- /dev/null
+++ b/src/intel/vulkan/anv_astc_emu.c
@@ -0,0 +1,516 @@
+/*
+ * Copyright 2023 Google LLC
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "anv_private.h"
+
+#include "compiler/nir/nir_builder.h"
+
+static void
+astc_emu_init_image_view(struct anv_cmd_buffer *cmd_buffer,
+ struct anv_image_view *iview,
+ struct anv_image *image,
+ VkFormat format,
+ VkImageUsageFlags usage,
+ uint32_t level, uint32_t layer)
+{
+ struct anv_device *device = cmd_buffer->device;
+
+ const VkImageViewCreateInfo create_info = {
+ .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO,
+ .pNext = &(VkImageViewUsageCreateInfo){
+ .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_USAGE_CREATE_INFO,
+ .usage = usage,
+ },
+ .image = anv_image_to_handle(image),
+ /* XXX we only need 2D but the shader expects 2D_ARRAY */
+ .viewType = VK_IMAGE_VIEW_TYPE_2D_ARRAY,
+ .format = format,
+ .subresourceRange = {
+ .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT,
+ .baseMipLevel = level,
+ .levelCount = 1,
+ .baseArrayLayer = layer,
+ .layerCount = 1,
+ },
+ };
+
+ memset(iview, 0, sizeof(*iview));
+ anv_image_view_init(device, iview, &create_info,
+ &cmd_buffer->surface_state_stream);
+}
+
+static void
+astc_emu_init_push_descriptor_set(struct anv_cmd_buffer *cmd_buffer,
+ struct anv_push_descriptor_set *push_set,
+ VkDescriptorSetLayout _layout,
+ uint32_t write_count,
+ const VkWriteDescriptorSet *writes)
+{
+ struct anv_device *device = cmd_buffer->device;
+ struct anv_descriptor_set_layout *layout =
+ anv_descriptor_set_layout_from_handle(_layout);
+
+ memset(push_set, 0, sizeof(*push_set));
+ anv_push_descriptor_set_init(cmd_buffer, push_set, layout);
+
+ anv_descriptor_set_write(device, &push_set->set, write_count, writes);
+}
+
+static void
+astc_emu_init_flush_denorm_shader(nir_builder *b)
+{
+ b->shader->info.workgroup_size[0] = 8;
+ b->shader->info.workgroup_size[1] = 8;
+
+ const struct glsl_type *src_type =
+ glsl_sampler_type(GLSL_SAMPLER_DIM_2D, false, true, GLSL_TYPE_UINT);
+ nir_variable *src_var =
+ nir_variable_create(b->shader, nir_var_uniform, src_type, "src");
+ src_var->data.descriptor_set = 0;
+ src_var->data.binding = 0;
+
+ const struct glsl_type *dst_type =
+ glsl_image_type(GLSL_SAMPLER_DIM_2D, true, GLSL_TYPE_UINT);
+ nir_variable *dst_var =
+ nir_variable_create(b->shader, nir_var_uniform, dst_type, "dst");
+ dst_var->data.descriptor_set = 0;
+ dst_var->data.binding = 1;
+
+ nir_def *zero = nir_imm_int(b, 0);
+ nir_def *consts = nir_load_push_constant(b, 4, 32, zero, .range = 16);
+ nir_def *offset = nir_channels(b, consts, 0x3);
+ nir_def *extent = nir_channels(b, consts, 0x3 << 2);
+
+ nir_def *coord = nir_load_global_invocation_id(b, 32);
+ coord = nir_iadd(b, nir_channels(b, coord, 0x3), offset);
+
+ nir_def *cond = nir_ilt(b, coord, extent);
+ cond = nir_iand(b, nir_channel(b, cond, 0), nir_channel(b, cond, 1));
+ nir_push_if(b, cond);
+ {
+ const struct glsl_type *val_type = glsl_vector_type(GLSL_TYPE_UINT, 4);
+ nir_variable *val_var =
+ nir_variable_create(b->shader, nir_var_shader_temp, val_type, "val");
+
+ coord = nir_vec3(b, nir_channel(b, coord, 0), nir_channel(b, coord, 1),
+ zero);
+ nir_def *val =
+ nir_txf_deref(b, nir_build_deref_var(b, src_var), coord, zero);
+ nir_store_var(b, val_var, val, 0xf);
+
+ /* A void-extent block has this layout
+ *
+ * struct astc_void_extent_block {
+ * uint16_t header;
+ * uint16_t dontcare0;
+ * uint16_t dontcare1;
+ * uint16_t dontcare2;
+ * uint16_t R;
+ * uint16_t G;
+ * uint16_t B;
+ * uint16_t A;
+ * };
+ *
+ * where the lower 12 bits are 0xdfc for 2D LDR.
+ */
+ nir_def *block_mode = nir_iand_imm(b, nir_channel(b, val, 0), 0xfff);
+ nir_push_if(b, nir_ieq_imm(b, block_mode, 0xdfc));
+ {
+ nir_def *color = nir_channels(b, val, 0x3 << 2);
+ nir_def *comps = nir_unpack_64_4x16(b, nir_pack_64_2x32(b, color));
+
+ /* flush denorms */
+ comps = nir_bcsel(b, nir_ult_imm(b, comps, 4),
+ nir_imm_intN_t(b, 0, 16), comps);
+
+ color = nir_unpack_64_2x32(b, nir_pack_64_4x16(b, comps));
+ val = nir_vec4(b, nir_channel(b, val, 0), nir_channel(b, val, 1),
+ nir_channel(b, color, 0), nir_channel(b, color, 1));
+ nir_store_var(b, val_var, val, 0x3 << 2);
+ }
+ nir_pop_if(b, NULL);
+
+ nir_def *dst = &nir_build_deref_var(b, dst_var)->def;
+ coord = nir_pad_vector(b, coord, 4);
+ val = nir_load_var(b, val_var);
+ nir_image_deref_store(b, dst, coord, nir_undef(b, 1, 32), val, zero,
+ .image_dim = GLSL_SAMPLER_DIM_2D,
+ .image_array = true);
+ }
+ nir_pop_if(b, NULL);
+}
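+
For reference, the void-extent handling built by the shader above can be mirrored on the CPU. The sketch below only illustrates the block-layout comment (2D LDR void-extent header 0xDFC in the low 12 bits, 16-bit components below 4 flushed to zero); flush_void_extent_denorms is a made-up name and not part of the emulation path.

#include <stdint.h>

/* Words 0-1 of a void-extent block hold the header and don't-care fields,
 * words 2-3 hold the four 16-bit R/G/B/A values. Mirroring the shader,
 * components smaller than 4 are forced to zero.
 */
static void
flush_void_extent_denorms(uint32_t block[4])
{
   if ((block[0] & 0xfff) != 0xdfc)   /* not a 2D LDR void-extent block */
      return;

   for (unsigned i = 2; i < 4; i++) {
      uint16_t lo = block[i] & 0xffff;
      uint16_t hi = block[i] >> 16;
      if (lo < 4) lo = 0;
      if (hi < 4) hi = 0;
      block[i] = ((uint32_t)hi << 16) | lo;
   }
}
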
+
+static VkResult
+astc_emu_init_flush_denorm_pipeline_locked(struct anv_device *device)
+{
+ struct anv_device_astc_emu *astc_emu = &device->astc_emu;
+ VkDevice _device = anv_device_to_handle(device);
+ VkResult result = VK_SUCCESS;
+
+ if (astc_emu->ds_layout == VK_NULL_HANDLE) {
+ const VkDescriptorSetLayoutCreateInfo ds_layout_create_info = {
+ .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO,
+ .flags = VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR,
+ .bindingCount = 2,
+ .pBindings = (VkDescriptorSetLayoutBinding[]){
+ {
+ .binding = 0,
+ .descriptorType = VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE,
+ .descriptorCount = 1,
+ .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
+ },
+ {
+ .binding = 1,
+ .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE,
+ .descriptorCount = 1,
+ .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
+ },
+ },
+ };
+ result = anv_CreateDescriptorSetLayout(_device, &ds_layout_create_info,
+ NULL, &astc_emu->ds_layout);
+ if (result != VK_SUCCESS)
+ goto out;
+ }
+
+ if (astc_emu->pipeline_layout == VK_NULL_HANDLE) {
+ const VkPipelineLayoutCreateInfo pipeline_layout_create_info = {
+ .sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO,
+ .setLayoutCount = 1,
+ .pSetLayouts = &astc_emu->ds_layout,
+ .pushConstantRangeCount = 1,
+ .pPushConstantRanges = &(VkPushConstantRange){
+ .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
+ .size = sizeof(uint32_t) * 4,
+ },
+ };
+ result = anv_CreatePipelineLayout(_device, &pipeline_layout_create_info,
+ NULL, &astc_emu->pipeline_layout);
+ if (result != VK_SUCCESS)
+ goto out;
+ }
+
+ if (astc_emu->pipeline == VK_NULL_HANDLE) {
+ const struct nir_shader_compiler_options *options =
+ device->physical->compiler->nir_options[MESA_SHADER_COMPUTE];
+ nir_builder b = nir_builder_init_simple_shader(
+ MESA_SHADER_COMPUTE, options, "astc_emu_flush_denorm");
+ astc_emu_init_flush_denorm_shader(&b);
+
+ const VkComputePipelineCreateInfo pipeline_create_info = {
+ .sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO,
+ .stage =
+ (VkPipelineShaderStageCreateInfo){
+ .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
+ .stage = VK_SHADER_STAGE_COMPUTE_BIT,
+ .module = vk_shader_module_handle_from_nir(b.shader),
+ .pName = "main",
+ },
+ .layout = astc_emu->pipeline_layout,
+ };
+ result = anv_CreateComputePipelines(_device, VK_NULL_HANDLE, 1,
+ &pipeline_create_info, NULL,
+ &astc_emu->pipeline);
+ ralloc_free(b.shader);
+
+ if (result != VK_SUCCESS)
+ goto out;
+ }
+
+out:
+ return result;
+}
+
+static VkResult
+astc_emu_init_flush_denorm_pipeline(struct anv_device *device)
+{
+ struct anv_device_astc_emu *astc_emu = &device->astc_emu;
+ VkResult result = VK_SUCCESS;
+
+ simple_mtx_lock(&astc_emu->mutex);
+ if (!astc_emu->pipeline)
+ result = astc_emu_init_flush_denorm_pipeline_locked(device);
+ simple_mtx_unlock(&astc_emu->mutex);
+
+ return result;
+}
+
+static void
+astc_emu_flush_denorm_slice(struct anv_cmd_buffer *cmd_buffer,
+ VkFormat astc_format,
+ VkImageLayout layout,
+ VkImageView src_view,
+ VkImageView dst_view,
+ VkRect2D rect)
+{
+ struct anv_device *device = cmd_buffer->device;
+ struct anv_device_astc_emu *astc_emu = &device->astc_emu;
+ VkCommandBuffer cmd_buffer_ = anv_cmd_buffer_to_handle(cmd_buffer);
+
+ VkResult result = astc_emu_init_flush_denorm_pipeline(device);
+ if (result != VK_SUCCESS) {
+ anv_batch_set_error(&cmd_buffer->batch, result);
+ return;
+ }
+
+ const uint32_t push_const[] = {
+ rect.offset.x,
+ rect.offset.y,
+ rect.offset.x + rect.extent.width,
+ rect.offset.y + rect.extent.height,
+ };
+
+ const VkWriteDescriptorSet set_writes[] = {
+ {
+ .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET,
+ .dstBinding = 0,
+ .descriptorCount = 1,
+ .descriptorType = VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE,
+ .pImageInfo = &(VkDescriptorImageInfo){
+ .imageView = src_view,
+ .imageLayout = layout,
+ },
+ },
+ {
+ .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET,
+ .dstBinding = 1,
+ .descriptorCount = 1,
+ .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE,
+ .pImageInfo = &(VkDescriptorImageInfo){
+ .imageView = dst_view,
+ .imageLayout = VK_IMAGE_LAYOUT_GENERAL,
+ },
+ },
+ };
+ struct anv_push_descriptor_set push_set;
+ astc_emu_init_push_descriptor_set(cmd_buffer,
+ &push_set,
+ astc_emu->ds_layout,
+ ARRAY_SIZE(set_writes),
+ set_writes);
+ VkDescriptorSet set = anv_descriptor_set_to_handle(&push_set.set);
+
+ anv_CmdBindPipeline(cmd_buffer_, VK_PIPELINE_BIND_POINT_COMPUTE,
+ astc_emu->pipeline);
+
+ VkPushConstantsInfoKHR push_info = {
+ .sType = VK_STRUCTURE_TYPE_PUSH_CONSTANTS_INFO_KHR,
+ .layout = astc_emu->pipeline_layout,
+ .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
+ .offset = 0,
+ .size = sizeof(push_const),
+ .pValues = push_const,
+ };
+ anv_CmdPushConstants2KHR(cmd_buffer_, &push_info);
+
+ VkBindDescriptorSetsInfoKHR bind_info = {
+ .sType = VK_STRUCTURE_TYPE_BIND_DESCRIPTOR_SETS_INFO_KHR,
+ .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
+ .layout = astc_emu->pipeline_layout,
+ .firstSet = 0,
+ .descriptorSetCount = 1,
+ .pDescriptorSets = &set,
+ .dynamicOffsetCount = 0,
+ .pDynamicOffsets = NULL,
+ };
+ anv_CmdBindDescriptorSets2KHR(cmd_buffer_, &bind_info);
+
+ /* each workgroup processes 8x8 texel blocks */
+ rect.extent.width = DIV_ROUND_UP(rect.extent.width, 8);
+ rect.extent.height = DIV_ROUND_UP(rect.extent.height, 8);
+
+ anv_genX(device->info, CmdDispatchBase)(cmd_buffer_, 0, 0, 0,
+ rect.extent.width,
+ rect.extent.height,
+ 1);
+
+ anv_push_descriptor_set_finish(&push_set);
+}
+
+static void
+astc_emu_decompress_slice(struct anv_cmd_buffer *cmd_buffer,
+ VkFormat astc_format,
+ VkImageLayout layout,
+ VkImageView src_view,
+ VkImageView dst_view,
+ VkRect2D rect)
+{
+ struct anv_device *device = cmd_buffer->device;
+ struct anv_device_astc_emu *astc_emu = &device->astc_emu;
+ VkCommandBuffer cmd_buffer_ = anv_cmd_buffer_to_handle(cmd_buffer);
+
+ VkPipeline pipeline =
+ vk_texcompress_astc_get_decode_pipeline(&device->vk, &device->vk.alloc,
+ astc_emu->texcompress,
+ VK_NULL_HANDLE, astc_format);
+ if (pipeline == VK_NULL_HANDLE) {
+ anv_batch_set_error(&cmd_buffer->batch, VK_ERROR_UNKNOWN);
+ return;
+ }
+
+ anv_CmdBindPipeline(cmd_buffer_, VK_PIPELINE_BIND_POINT_COMPUTE, pipeline);
+
+ struct vk_texcompress_astc_write_descriptor_set writes;
+ vk_texcompress_astc_fill_write_descriptor_sets(astc_emu->texcompress,
+ &writes, src_view, layout,
+ dst_view, astc_format);
+
+ struct anv_push_descriptor_set push_set;
+ astc_emu_init_push_descriptor_set(cmd_buffer, &push_set,
+ astc_emu->texcompress->ds_layout,
+ ARRAY_SIZE(writes.descriptor_set),
+ writes.descriptor_set);
+
+ VkDescriptorSet set = anv_descriptor_set_to_handle(&push_set.set);
+
+ VkBindDescriptorSetsInfoKHR bind_info = {
+ .sType = VK_STRUCTURE_TYPE_BIND_DESCRIPTOR_SETS_INFO_KHR,
+ .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
+ .layout = astc_emu->texcompress->p_layout,
+ .firstSet = 0,
+ .descriptorSetCount = 1,
+ .pDescriptorSets = &set,
+ .dynamicOffsetCount = 0,
+ .pDynamicOffsets = NULL,
+ };
+ anv_CmdBindDescriptorSets2KHR(cmd_buffer_, &bind_info);
+
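+ /* The offset is passed in block coordinates while the upper bounds are
+ * scaled to texels using the ASTC block dimensions; the last entry is the
+ * is-3D flag (always false here since 3D views are not used).
+ */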
+ const uint32_t push_const[] = {
+ rect.offset.x,
+ rect.offset.y,
+ (rect.offset.x + rect.extent.width) *
+ vk_format_get_blockwidth(astc_format),
+ (rect.offset.y + rect.extent.height) *
+ vk_format_get_blockheight(astc_format),
+ false, /* we don't use VK_IMAGE_VIEW_TYPE_3D */
+ };
+ VkPushConstantsInfoKHR push_info = {
+ .sType = VK_STRUCTURE_TYPE_PUSH_CONSTANTS_INFO_KHR,
+ .layout = astc_emu->texcompress->p_layout,
+ .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
+ .offset = 0,
+ .size = sizeof(push_const),
+ .pValues = push_const,
+ };
+ anv_CmdPushConstants2KHR(cmd_buffer_, &push_info);
+
+ /* each workgroup processes 2x2 texel blocks */
+ rect.extent.width = DIV_ROUND_UP(rect.extent.width, 2);
+ rect.extent.height = DIV_ROUND_UP(rect.extent.height, 2);
+
+ anv_genX(device->info, CmdDispatchBase)(cmd_buffer_, 0, 0, 0,
+ rect.extent.width,
+ rect.extent.height,
+ 1);
+
+ anv_push_descriptor_set_finish(&push_set);
+}
+
+void
+anv_astc_emu_process(struct anv_cmd_buffer *cmd_buffer,
+ struct anv_image *image,
+ VkImageLayout layout,
+ const VkImageSubresourceLayers *subresource,
+ VkOffset3D block_offset,
+ VkExtent3D block_extent)
+{
+ const bool flush_denorms =
+ cmd_buffer->device->physical->flush_astc_ldr_void_extent_denorms;
+
+ assert(image->emu_plane_format != VK_FORMAT_UNDEFINED);
+
+ const VkRect2D rect = {
+ .offset = {
+ .x = block_offset.x,
+ .y = block_offset.y,
+ },
+ .extent = {
+ .width = block_extent.width,
+ .height = block_extent.height,
+ },
+ };
+
+ /* process one layer at a time because anv_image_fill_surface_state
+ * requires an uncompressed view of a compressed image to be a single
+ * layer
+ */
+ const bool is_3d = image->vk.image_type == VK_IMAGE_TYPE_3D;
+ const uint32_t slice_base = is_3d ?
+ block_offset.z : subresource->baseArrayLayer;
+ const uint32_t slice_count = is_3d ?
+ block_extent.depth : subresource->layerCount;
+
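+ /* Save the compute pipeline, descriptor set 0 and push constants so the
+ * internal dispatches below do not clobber the application's state.
+ */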
+ struct anv_cmd_saved_state saved;
+ anv_cmd_buffer_save_state(cmd_buffer,
+ ANV_CMD_SAVED_STATE_COMPUTE_PIPELINE |
+ ANV_CMD_SAVED_STATE_DESCRIPTOR_SET_0 |
+ ANV_CMD_SAVED_STATE_PUSH_CONSTANTS,
+ &saved);
+
+ for (uint32_t i = 0; i < slice_count; i++) {
+ struct anv_image_view src_view;
+ struct anv_image_view dst_view;
+ astc_emu_init_image_view(cmd_buffer, &src_view, image,
+ VK_FORMAT_R32G32B32A32_UINT,
+ VK_IMAGE_USAGE_SAMPLED_BIT,
+ subresource->mipLevel, slice_base + i);
+ astc_emu_init_image_view(cmd_buffer, &dst_view, image,
+ flush_denorms ? VK_FORMAT_R32G32B32A32_UINT
+ : VK_FORMAT_R8G8B8A8_UINT,
+ VK_IMAGE_USAGE_STORAGE_BIT,
+ subresource->mipLevel, slice_base + i);
+
+ if (flush_denorms) {
+ astc_emu_flush_denorm_slice(cmd_buffer, image->vk.format, layout,
+ anv_image_view_to_handle(&src_view),
+ anv_image_view_to_handle(&dst_view),
+ rect);
+ } else {
+ astc_emu_decompress_slice(cmd_buffer, image->vk.format, layout,
+ anv_image_view_to_handle(&src_view),
+ anv_image_view_to_handle(&dst_view),
+ rect);
+ }
+ }
+
+ anv_cmd_buffer_restore_state(cmd_buffer, &saved);
+}
+
+VkResult
+anv_device_init_astc_emu(struct anv_device *device)
+{
+ struct anv_device_astc_emu *astc_emu = &device->astc_emu;
+ VkResult result = VK_SUCCESS;
+
+ if (device->physical->flush_astc_ldr_void_extent_denorms)
+ simple_mtx_init(&astc_emu->mutex, mtx_plain);
+
+ if (device->physical->emu_astc_ldr) {
+ result = vk_texcompress_astc_init(&device->vk, &device->vk.alloc,
+ VK_NULL_HANDLE,
+ &astc_emu->texcompress);
+ }
+
+ return result;
+}
+
+void
+anv_device_finish_astc_emu(struct anv_device *device)
+{
+ struct anv_device_astc_emu *astc_emu = &device->astc_emu;
+
+ if (device->physical->flush_astc_ldr_void_extent_denorms) {
+ VkDevice _device = anv_device_to_handle(device);
+
+ anv_DestroyPipeline(_device, astc_emu->pipeline, NULL);
+ anv_DestroyPipelineLayout(_device, astc_emu->pipeline_layout, NULL);
+ anv_DestroyDescriptorSetLayout(_device, astc_emu->ds_layout, NULL);
+ simple_mtx_destroy(&astc_emu->mutex);
+ }
+
+ if (astc_emu->texcompress) {
+ vk_texcompress_astc_finish(&device->vk, &device->vk.alloc,
+ astc_emu->texcompress);
+ }
+}
diff --git a/src/intel/vulkan/anv_batch_chain.c b/src/intel/vulkan/anv_batch_chain.c
index e7eda9bf9fa..bb986847a08 100644
--- a/src/intel/vulkan/anv_batch_chain.c
+++ b/src/intel/vulkan/anv_batch_chain.c
@@ -27,22 +27,24 @@
#include <unistd.h>
#include <fcntl.h>
+#include <xf86drm.h>
+
#include "anv_private.h"
#include "anv_measure.h"
-#include "genxml/gen8_pack.h"
+#include "genxml/gen9_pack.h"
#include "genxml/genX_bits.h"
-#include "perf/intel_perf.h"
-#include "util/debug.h"
+#include "util/perf/u_trace.h"
/** \file anv_batch_chain.c
*
* This file contains functions related to anv_cmd_buffer as a data
* structure. This involves everything required to create and destroy
- * the actual batch buffers as well as link them together and handle
- * relocations and surface state. It specifically does *not* contain any
- * handling of actual vkCmd calls beyond vkCmdExecuteCommands.
+ * the actual batch buffers as well as link them together.
+ *
+ * It specifically does *not* contain any handling of actual vkCmd calls
+ * beyond vkCmdExecuteCommands.
*/
/*-----------------------------------------------------------------------*
@@ -51,49 +53,25 @@
VkResult
anv_reloc_list_init(struct anv_reloc_list *list,
- const VkAllocationCallbacks *alloc)
+ const VkAllocationCallbacks *alloc,
+ bool uses_relocs)
{
+ assert(alloc != NULL);
memset(list, 0, sizeof(*list));
+ list->uses_relocs = uses_relocs;
+ list->alloc = alloc;
return VK_SUCCESS;
}
static VkResult
anv_reloc_list_init_clone(struct anv_reloc_list *list,
- const VkAllocationCallbacks *alloc,
const struct anv_reloc_list *other_list)
{
- list->num_relocs = other_list->num_relocs;
- list->array_length = other_list->array_length;
-
- if (list->num_relocs > 0) {
- list->relocs =
- vk_alloc(alloc, list->array_length * sizeof(*list->relocs), 8,
- VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
- if (list->relocs == NULL)
- return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
-
- list->reloc_bos =
- vk_alloc(alloc, list->array_length * sizeof(*list->reloc_bos), 8,
- VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
- if (list->reloc_bos == NULL) {
- vk_free(alloc, list->relocs);
- return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
- }
-
- memcpy(list->relocs, other_list->relocs,
- list->array_length * sizeof(*list->relocs));
- memcpy(list->reloc_bos, other_list->reloc_bos,
- list->array_length * sizeof(*list->reloc_bos));
- } else {
- list->relocs = NULL;
- list->reloc_bos = NULL;
- }
-
list->dep_words = other_list->dep_words;
if (list->dep_words > 0) {
list->deps =
- vk_alloc(alloc, list->dep_words * sizeof(BITSET_WORD), 8,
+ vk_alloc(list->alloc, list->dep_words * sizeof(BITSET_WORD), 8,
VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
memcpy(list->deps, other_list->deps,
list->dep_words * sizeof(BITSET_WORD));
@@ -105,50 +83,13 @@ anv_reloc_list_init_clone(struct anv_reloc_list *list,
}
void
-anv_reloc_list_finish(struct anv_reloc_list *list,
- const VkAllocationCallbacks *alloc)
+anv_reloc_list_finish(struct anv_reloc_list *list)
{
- vk_free(alloc, list->relocs);
- vk_free(alloc, list->reloc_bos);
- vk_free(alloc, list->deps);
-}
-
-static VkResult
-anv_reloc_list_grow(struct anv_reloc_list *list,
- const VkAllocationCallbacks *alloc,
- size_t num_additional_relocs)
-{
- if (list->num_relocs + num_additional_relocs <= list->array_length)
- return VK_SUCCESS;
-
- size_t new_length = MAX2(16, list->array_length * 2);
- while (new_length < list->num_relocs + num_additional_relocs)
- new_length *= 2;
-
- struct drm_i915_gem_relocation_entry *new_relocs =
- vk_realloc(alloc, list->relocs,
- new_length * sizeof(*list->relocs), 8,
- VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
- if (new_relocs == NULL)
- return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
- list->relocs = new_relocs;
-
- struct anv_bo **new_reloc_bos =
- vk_realloc(alloc, list->reloc_bos,
- new_length * sizeof(*list->reloc_bos), 8,
- VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
- if (new_reloc_bos == NULL)
- return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
- list->reloc_bos = new_reloc_bos;
-
- list->array_length = new_length;
-
- return VK_SUCCESS;
+ vk_free(list->alloc, list->deps);
}
static VkResult
anv_reloc_list_grow_deps(struct anv_reloc_list *list,
- const VkAllocationCallbacks *alloc,
uint32_t min_num_words)
{
if (min_num_words <= list->dep_words)
@@ -159,10 +100,10 @@ anv_reloc_list_grow_deps(struct anv_reloc_list *list,
new_length *= 2;
BITSET_WORD *new_deps =
- vk_realloc(alloc, list->deps, new_length * sizeof(BITSET_WORD), 8,
+ vk_realloc(list->alloc, list->deps, new_length * sizeof(BITSET_WORD), 8,
VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
if (new_deps == NULL)
- return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+ return vk_error(NULL, VK_ERROR_OUT_OF_HOST_MEMORY);
list->deps = new_deps;
/* Zero out the new data */
@@ -173,18 +114,16 @@ anv_reloc_list_grow_deps(struct anv_reloc_list *list,
return VK_SUCCESS;
}
-#define READ_ONCE(x) (*(volatile __typeof__(x) *)&(x))
-
VkResult
-anv_reloc_list_add_bo(struct anv_reloc_list *list,
- const VkAllocationCallbacks *alloc,
- struct anv_bo *target_bo)
+anv_reloc_list_add_bo_impl(struct anv_reloc_list *list,
+ struct anv_bo *target_bo)
{
- assert(!target_bo->is_wrapper);
- assert(target_bo->flags & EXEC_OBJECT_PINNED);
+ /* This can happen with sparse resources. */
+ if (!target_bo)
+ return VK_SUCCESS;
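+ /* Dependencies are tracked as a bitset indexed by GEM handle. */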
uint32_t idx = target_bo->gem_handle;
- VkResult result = anv_reloc_list_grow_deps(list, alloc,
+ VkResult result = anv_reloc_list_grow_deps(list,
(idx / BITSET_WORDBITS) + 1);
if (unlikely(result != VK_SUCCESS))
return result;
@@ -194,75 +133,18 @@ anv_reloc_list_add_bo(struct anv_reloc_list *list,
return VK_SUCCESS;
}
-VkResult
-anv_reloc_list_add(struct anv_reloc_list *list,
- const VkAllocationCallbacks *alloc,
- uint32_t offset, struct anv_bo *target_bo, uint32_t delta,
- uint64_t *address_u64_out)
-{
- struct drm_i915_gem_relocation_entry *entry;
- int index;
-
- struct anv_bo *unwrapped_target_bo = anv_bo_unwrap(target_bo);
- uint64_t target_bo_offset = READ_ONCE(unwrapped_target_bo->offset);
- if (address_u64_out)
- *address_u64_out = target_bo_offset + delta;
-
- assert(unwrapped_target_bo->gem_handle > 0);
- assert(unwrapped_target_bo->refcount > 0);
-
- if (unwrapped_target_bo->flags & EXEC_OBJECT_PINNED)
- return anv_reloc_list_add_bo(list, alloc, unwrapped_target_bo);
-
- VkResult result = anv_reloc_list_grow(list, alloc, 1);
- if (result != VK_SUCCESS)
- return result;
-
- /* XXX: Can we use I915_EXEC_HANDLE_LUT? */
- index = list->num_relocs++;
- list->reloc_bos[index] = target_bo;
- entry = &list->relocs[index];
- entry->target_handle = -1; /* See also anv_cmd_buffer_process_relocs() */
- entry->delta = delta;
- entry->offset = offset;
- entry->presumed_offset = target_bo_offset;
- entry->read_domains = 0;
- entry->write_domain = 0;
- VG(VALGRIND_CHECK_MEM_IS_DEFINED(entry, sizeof(*entry)));
-
- return VK_SUCCESS;
-}
-
static void
anv_reloc_list_clear(struct anv_reloc_list *list)
{
- list->num_relocs = 0;
if (list->dep_words > 0)
memset(list->deps, 0, list->dep_words * sizeof(BITSET_WORD));
}
-static VkResult
+VkResult
anv_reloc_list_append(struct anv_reloc_list *list,
- const VkAllocationCallbacks *alloc,
- struct anv_reloc_list *other, uint32_t offset)
+ struct anv_reloc_list *other)
{
- VkResult result = anv_reloc_list_grow(list, alloc, other->num_relocs);
- if (result != VK_SUCCESS)
- return result;
-
- if (other->num_relocs > 0) {
- memcpy(&list->relocs[list->num_relocs], &other->relocs[0],
- other->num_relocs * sizeof(other->relocs[0]));
- memcpy(&list->reloc_bos[list->num_relocs], &other->reloc_bos[0],
- other->num_relocs * sizeof(other->reloc_bos[0]));
-
- for (uint32_t i = 0; i < other->num_relocs; i++)
- list->relocs[i + list->num_relocs].offset += offset;
-
- list->num_relocs += other->num_relocs;
- }
-
- anv_reloc_list_grow_deps(list, alloc, other->dep_words);
+ anv_reloc_list_grow_deps(list, other->dep_words);
for (uint32_t w = 0; w < other->dep_words; w++)
list->deps[w] |= other->deps[w];
@@ -273,15 +155,23 @@ anv_reloc_list_append(struct anv_reloc_list *list,
* Functions related to anv_batch
*-----------------------------------------------------------------------*/
+static VkResult
+anv_extend_batch(struct anv_batch *batch, uint32_t size)
+{
+ assert(batch->extend_cb != NULL);
+ VkResult result = batch->extend_cb(batch, size, batch->user_data);
+ if (result != VK_SUCCESS)
+ return anv_batch_set_error(batch, result);
+ return result;
+}
+
void *
anv_batch_emit_dwords(struct anv_batch *batch, int num_dwords)
{
- if (batch->next + num_dwords * 4 > batch->end) {
- VkResult result = batch->extend_cb(batch, batch->user_data);
- if (result != VK_SUCCESS) {
- anv_batch_set_error(batch, result);
+ uint32_t size = num_dwords * 4;
+ if (batch->next + size > batch->end) {
+ if (anv_extend_batch(batch, size) != VK_SUCCESS)
return NULL;
- }
}
void *p = batch->next;
@@ -292,10 +182,33 @@ anv_batch_emit_dwords(struct anv_batch *batch, int num_dwords)
return p;
}
+/* Ensure enough contiguous space is available */
+VkResult
+anv_batch_emit_ensure_space(struct anv_batch *batch, uint32_t size)
+{
+ if (batch->next + size > batch->end) {
+ VkResult result = anv_extend_batch(batch, size);
+ if (result != VK_SUCCESS)
+ return result;
+ }
+
+ assert(batch->next + size <= batch->end);
+
+ return VK_SUCCESS;
+}
+
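+/* Advance the batch pointer after the caller has written data directly into
+ * the batch. A typical usage sketch:
+ *
+ *    if (anv_batch_emit_ensure_space(batch, size) != VK_SUCCESS)
+ *       return;
+ *    memcpy(batch->next, data, size);
+ *    anv_batch_advance(batch, size);
+ */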
+void
+anv_batch_advance(struct anv_batch *batch, uint32_t size)
+{
+ assert(batch->next + size <= batch->end);
+
+ batch->next += size;
+}
+
struct anv_address
anv_batch_address(struct anv_batch *batch, void *batch_location)
{
- assert(batch->start < batch_location);
+ assert(batch->start <= batch_location);
/* Allow a jump at the current location of the batch. */
assert(batch->next >= batch_location);
@@ -306,17 +219,12 @@ anv_batch_address(struct anv_batch *batch, void *batch_location)
void
anv_batch_emit_batch(struct anv_batch *batch, struct anv_batch *other)
{
- uint32_t size, offset;
-
- size = other->next - other->start;
+ uint32_t size = other->next - other->start;
assert(size % 4 == 0);
if (batch->next + size > batch->end) {
- VkResult result = batch->extend_cb(batch, batch->user_data);
- if (result != VK_SUCCESS) {
- anv_batch_set_error(batch, result);
+ if (anv_extend_batch(batch, size) != VK_SUCCESS)
return;
- }
}
assert(batch->next + size <= batch->end);
@@ -324,9 +232,7 @@ anv_batch_emit_batch(struct anv_batch *batch, struct anv_batch *other)
VG(VALGRIND_CHECK_MEM_IS_DEFINED(other->start, size));
memcpy(batch->next, other->start, size);
- offset = batch->next - batch->start;
- VkResult result = anv_reloc_list_append(batch->relocs, batch->alloc,
- other->relocs, offset);
+ VkResult result = anv_reloc_list_append(batch->relocs, other->relocs);
if (result != VK_SUCCESS) {
anv_batch_set_error(batch, result);
return;
@@ -346,17 +252,18 @@ anv_batch_bo_create(struct anv_cmd_buffer *cmd_buffer,
{
VkResult result;
- struct anv_batch_bo *bbo = vk_alloc(&cmd_buffer->pool->alloc, sizeof(*bbo),
+ struct anv_batch_bo *bbo = vk_zalloc(&cmd_buffer->vk.pool->alloc, sizeof(*bbo),
8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
if (bbo == NULL)
- return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+ return vk_error(cmd_buffer, VK_ERROR_OUT_OF_HOST_MEMORY);
result = anv_bo_pool_alloc(&cmd_buffer->device->batch_bo_pool,
size, &bbo->bo);
if (result != VK_SUCCESS)
goto fail_alloc;
- result = anv_reloc_list_init(&bbo->relocs, &cmd_buffer->pool->alloc);
+ const bool uses_relocs = cmd_buffer->device->physical->uses_relocs;
+ result = anv_reloc_list_init(&bbo->relocs, &cmd_buffer->vk.pool->alloc, uses_relocs);
if (result != VK_SUCCESS)
goto fail_bo_alloc;
@@ -367,7 +274,7 @@ anv_batch_bo_create(struct anv_cmd_buffer *cmd_buffer,
fail_bo_alloc:
anv_bo_pool_free(&cmd_buffer->device->batch_bo_pool, bbo->bo);
fail_alloc:
- vk_free(&cmd_buffer->pool->alloc, bbo);
+ vk_free(&cmd_buffer->vk.pool->alloc, bbo);
return result;
}
@@ -379,18 +286,17 @@ anv_batch_bo_clone(struct anv_cmd_buffer *cmd_buffer,
{
VkResult result;
- struct anv_batch_bo *bbo = vk_alloc(&cmd_buffer->pool->alloc, sizeof(*bbo),
+ struct anv_batch_bo *bbo = vk_alloc(&cmd_buffer->vk.pool->alloc, sizeof(*bbo),
8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
if (bbo == NULL)
- return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+ return vk_error(cmd_buffer, VK_ERROR_OUT_OF_HOST_MEMORY);
result = anv_bo_pool_alloc(&cmd_buffer->device->batch_bo_pool,
other_bbo->bo->size, &bbo->bo);
if (result != VK_SUCCESS)
goto fail_alloc;
- result = anv_reloc_list_init_clone(&bbo->relocs, &cmd_buffer->pool->alloc,
- &other_bbo->relocs);
+ result = anv_reloc_list_init_clone(&bbo->relocs, &other_bbo->relocs);
if (result != VK_SUCCESS)
goto fail_bo_alloc;
@@ -403,7 +309,7 @@ anv_batch_bo_clone(struct anv_cmd_buffer *cmd_buffer,
fail_bo_alloc:
anv_bo_pool_free(&cmd_buffer->device->batch_bo_pool, bbo->bo);
fail_alloc:
- vk_free(&cmd_buffer->pool->alloc, bbo);
+ vk_free(&cmd_buffer->vk.pool->alloc, bbo);
return result;
}
@@ -437,37 +343,6 @@ anv_batch_bo_finish(struct anv_batch_bo *bbo, struct anv_batch *batch)
VG(VALGRIND_CHECK_MEM_IS_DEFINED(batch->start, bbo->length));
}
-static VkResult
-anv_batch_bo_grow(struct anv_cmd_buffer *cmd_buffer, struct anv_batch_bo *bbo,
- struct anv_batch *batch, size_t aditional,
- size_t batch_padding)
-{
- assert(batch->start == bbo->bo->map);
- bbo->length = batch->next - batch->start;
-
- size_t new_size = bbo->bo->size;
- while (new_size <= bbo->length + aditional + batch_padding)
- new_size *= 2;
-
- if (new_size == bbo->bo->size)
- return VK_SUCCESS;
-
- struct anv_bo *new_bo;
- VkResult result = anv_bo_pool_alloc(&cmd_buffer->device->batch_bo_pool,
- new_size, &new_bo);
- if (result != VK_SUCCESS)
- return result;
-
- memcpy(new_bo->map, bbo->bo->map, bbo->length);
-
- anv_bo_pool_free(&cmd_buffer->device->batch_bo_pool, bbo->bo);
-
- bbo->bo = new_bo;
- anv_batch_bo_continue(bbo, batch, batch_padding);
-
- return VK_SUCCESS;
-}
-
static void
anv_batch_bo_link(struct anv_cmd_buffer *cmd_buffer,
struct anv_batch_bo *prev_bbo,
@@ -475,39 +350,30 @@ anv_batch_bo_link(struct anv_cmd_buffer *cmd_buffer,
uint32_t next_bbo_offset)
{
const uint32_t bb_start_offset =
- prev_bbo->length - GFX8_MI_BATCH_BUFFER_START_length * 4;
+ prev_bbo->length - GFX9_MI_BATCH_BUFFER_START_length * 4;
ASSERTED const uint32_t *bb_start = prev_bbo->bo->map + bb_start_offset;
/* Make sure we're looking at a MI_BATCH_BUFFER_START */
assert(((*bb_start >> 29) & 0x07) == 0);
assert(((*bb_start >> 23) & 0x3f) == 49);
- if (cmd_buffer->device->physical->use_softpin) {
- assert(prev_bbo->bo->flags & EXEC_OBJECT_PINNED);
- assert(next_bbo->bo->flags & EXEC_OBJECT_PINNED);
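+ /* The address of the next batch BO is known up front, so write it
+ * directly into the MI_BATCH_BUFFER_START rather than emitting a
+ * relocation.
+ */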
+ uint64_t *map = prev_bbo->bo->map + bb_start_offset + 4;
+ *map = intel_canonical_address(next_bbo->bo->offset + next_bbo_offset);
- write_reloc(cmd_buffer->device,
- prev_bbo->bo->map + bb_start_offset + 4,
- next_bbo->bo->offset + next_bbo_offset, true);
- } else {
- uint32_t reloc_idx = prev_bbo->relocs.num_relocs - 1;
- assert(prev_bbo->relocs.relocs[reloc_idx].offset == bb_start_offset + 4);
-
- prev_bbo->relocs.reloc_bos[reloc_idx] = next_bbo->bo;
- prev_bbo->relocs.relocs[reloc_idx].delta = next_bbo_offset;
-
- /* Use a bogus presumed offset to force a relocation */
- prev_bbo->relocs.relocs[reloc_idx].presumed_offset = -1;
- }
+#ifdef SUPPORT_INTEL_INTEGRATED_GPUS
+ if (cmd_buffer->device->physical->memory.need_flush &&
+ anv_bo_needs_host_cache_flush(prev_bbo->bo->alloc_flags))
+ intel_flush_range(map, sizeof(uint64_t));
+#endif
}
static void
anv_batch_bo_destroy(struct anv_batch_bo *bbo,
struct anv_cmd_buffer *cmd_buffer)
{
- anv_reloc_list_finish(&bbo->relocs, &cmd_buffer->pool->alloc);
+ anv_reloc_list_finish(&bbo->relocs);
anv_bo_pool_free(&cmd_buffer->device->batch_bo_pool, bbo->bo);
- vk_free(&cmd_buffer->pool->alloc, bbo);
+ vk_free(&cmd_buffer->vk.pool->alloc, bbo);
}
static VkResult
@@ -550,13 +416,36 @@ anv_batch_bo_list_clone(const struct list_head *list,
static struct anv_batch_bo *
anv_cmd_buffer_current_batch_bo(struct anv_cmd_buffer *cmd_buffer)
{
- return LIST_ENTRY(struct anv_batch_bo, cmd_buffer->batch_bos.prev, link);
+ return list_entry(cmd_buffer->batch_bos.prev, struct anv_batch_bo, link);
+}
+
+static struct anv_batch_bo *
+anv_cmd_buffer_current_generation_batch_bo(struct anv_cmd_buffer *cmd_buffer)
+{
+ return list_entry(cmd_buffer->generation.batch_bos.prev, struct anv_batch_bo, link);
}
struct anv_address
anv_cmd_buffer_surface_base_address(struct anv_cmd_buffer *cmd_buffer)
{
- struct anv_state_pool *pool = anv_binding_table_pool(cmd_buffer->device);
+ /* Only graphics & compute queues need binding tables. */
+ if (!(cmd_buffer->queue_family->queueFlags & (VK_QUEUE_GRAPHICS_BIT |
+ VK_QUEUE_COMPUTE_BIT)))
+ return ANV_NULL_ADDRESS;
+
+ /* If we've never allocated a binding table block, do it now. Otherwise we
+ * would trigger another STATE_BASE_ADDRESS emission, which would require
+ * an additional set of flushes/stalls.
+ */
+ if (u_vector_length(&cmd_buffer->bt_block_states) == 0) {
+ VkResult result = anv_cmd_buffer_new_binding_table_block(cmd_buffer);
+ if (result != VK_SUCCESS) {
+ anv_batch_set_error(&cmd_buffer->batch, result);
+ return ANV_NULL_ADDRESS;
+ }
+ }
+
+ struct anv_state_pool *pool = &cmd_buffer->device->binding_table_pool;
struct anv_state *bt_block = u_vector_head(&cmd_buffer->bt_block_states);
return (struct anv_address) {
.bo = pool->block_pool.bo,
@@ -565,60 +454,57 @@ anv_cmd_buffer_surface_base_address(struct anv_cmd_buffer *cmd_buffer)
}
static void
-emit_batch_buffer_start(struct anv_cmd_buffer *cmd_buffer,
+emit_batch_buffer_start(struct anv_batch *batch,
struct anv_bo *bo, uint32_t offset)
{
- /* In gfx8+ the address field grew to two dwords to accomodate 48 bit
- * offsets. The high 16 bits are in the last dword, so we can use the gfx8
- * version in either case, as long as we set the instruction length in the
- * header accordingly. This means that we always emit three dwords here
- * and all the padding and adjustment we do in this file works for all
- * gens.
- */
-
-#define GFX7_MI_BATCH_BUFFER_START_length 2
-#define GFX7_MI_BATCH_BUFFER_START_length_bias 2
-
- const uint32_t gfx7_length =
- GFX7_MI_BATCH_BUFFER_START_length - GFX7_MI_BATCH_BUFFER_START_length_bias;
- const uint32_t gfx8_length =
- GFX8_MI_BATCH_BUFFER_START_length - GFX8_MI_BATCH_BUFFER_START_length_bias;
-
- anv_batch_emit(&cmd_buffer->batch, GFX8_MI_BATCH_BUFFER_START, bbs) {
- bbs.DWordLength = cmd_buffer->device->info.ver < 8 ?
- gfx7_length : gfx8_length;
+ anv_batch_emit(batch, GFX9_MI_BATCH_BUFFER_START, bbs) {
+ bbs.DWordLength = GFX9_MI_BATCH_BUFFER_START_length -
+ GFX9_MI_BATCH_BUFFER_START_length_bias;
bbs.SecondLevelBatchBuffer = Firstlevelbatch;
bbs.AddressSpaceIndicator = ASI_PPGTT;
bbs.BatchBufferStartAddress = (struct anv_address) { bo, offset };
}
}
+enum anv_cmd_buffer_batch {
+ ANV_CMD_BUFFER_BATCH_MAIN,
+ ANV_CMD_BUFFER_BATCH_GENERATION,
+};
+
static void
cmd_buffer_chain_to_batch_bo(struct anv_cmd_buffer *cmd_buffer,
- struct anv_batch_bo *bbo)
+ struct anv_batch_bo *bbo,
+ enum anv_cmd_buffer_batch batch_type)
{
- struct anv_batch *batch = &cmd_buffer->batch;
+ struct anv_batch *batch =
+ batch_type == ANV_CMD_BUFFER_BATCH_GENERATION ?
+ &cmd_buffer->generation.batch : &cmd_buffer->batch;
struct anv_batch_bo *current_bbo =
+ batch_type == ANV_CMD_BUFFER_BATCH_GENERATION ?
+ anv_cmd_buffer_current_generation_batch_bo(cmd_buffer) :
anv_cmd_buffer_current_batch_bo(cmd_buffer);
/* We set the end of the batch a little short so we would be sure we
* have room for the chaining command. Since we're about to emit the
* chaining command, let's set it back where it should go.
*/
- batch->end += GFX8_MI_BATCH_BUFFER_START_length * 4;
+ batch->end += GFX9_MI_BATCH_BUFFER_START_length * 4;
assert(batch->end == current_bbo->bo->map + current_bbo->bo->size);
- emit_batch_buffer_start(cmd_buffer, bbo->bo, 0);
+ emit_batch_buffer_start(batch, bbo->bo, 0);
anv_batch_bo_finish(current_bbo, batch);
+
+ /* Account for the amount of data written into current_bbo in the command
+ * buffer's total batch size.
+ */
+ cmd_buffer->total_batch_size += current_bbo->length;
}
static void
anv_cmd_buffer_record_chain_submit(struct anv_cmd_buffer *cmd_buffer_from,
struct anv_cmd_buffer *cmd_buffer_to)
{
- assert(cmd_buffer_from->device->physical->use_softpin);
-
uint32_t *bb_start = cmd_buffer_from->batch_end;
struct anv_batch_bo *last_bbo =
@@ -626,8 +512,8 @@ anv_cmd_buffer_record_chain_submit(struct anv_cmd_buffer *cmd_buffer_from,
struct anv_batch_bo *first_bbo =
list_first_entry(&cmd_buffer_to->batch_bos, struct anv_batch_bo, link);
- struct GFX8_MI_BATCH_BUFFER_START gen_bb_start = {
- __anv_cmd_header(GFX8_MI_BATCH_BUFFER_START),
+ struct GFX9_MI_BATCH_BUFFER_START gen_bb_start = {
+ __anv_cmd_header(GFX9_MI_BATCH_BUFFER_START),
.SecondLevelBatchBuffer = Firstlevelbatch,
.AddressSpaceIndicator = ASI_PPGTT,
.BatchBufferStartAddress = (struct anv_address) { first_bbo->bo, 0 },
@@ -636,10 +522,10 @@ anv_cmd_buffer_record_chain_submit(struct anv_cmd_buffer *cmd_buffer_from,
.start = last_bbo->bo->map,
.end = last_bbo->bo->map + last_bbo->bo->size,
.relocs = &last_bbo->relocs,
- .alloc = &cmd_buffer_from->pool->alloc,
+ .alloc = &cmd_buffer_from->vk.pool->alloc,
};
- __anv_cmd_pack(GFX8_MI_BATCH_BUFFER_START)(&local_batch, bb_start, &gen_bb_start);
+ __anv_cmd_pack(GFX9_MI_BATCH_BUFFER_START)(&local_batch, bb_start, &gen_bb_start);
last_bbo->chained = true;
}
@@ -647,56 +533,92 @@ anv_cmd_buffer_record_chain_submit(struct anv_cmd_buffer *cmd_buffer_from,
static void
anv_cmd_buffer_record_end_submit(struct anv_cmd_buffer *cmd_buffer)
{
- assert(cmd_buffer->device->physical->use_softpin);
-
struct anv_batch_bo *last_bbo =
list_last_entry(&cmd_buffer->batch_bos, struct anv_batch_bo, link);
last_bbo->chained = false;
uint32_t *batch = cmd_buffer->batch_end;
- anv_pack_struct(batch, GFX8_MI_BATCH_BUFFER_END,
- __anv_cmd_header(GFX8_MI_BATCH_BUFFER_END));
+ anv_pack_struct(batch, GFX9_MI_BATCH_BUFFER_END,
+ __anv_cmd_header(GFX9_MI_BATCH_BUFFER_END));
}
static VkResult
-anv_cmd_buffer_chain_batch(struct anv_batch *batch, void *_data)
+anv_cmd_buffer_chain_batch(struct anv_batch *batch, uint32_t size, void *_data)
{
+ /* The caller should not need that much space. Otherwise it should split
+ * its commands.
+ */
+ assert(size <= ANV_MAX_CMD_BUFFER_BATCH_SIZE);
+
struct anv_cmd_buffer *cmd_buffer = _data;
- struct anv_batch_bo *new_bbo;
+ struct anv_batch_bo *new_bbo = NULL;
+ /* Amount of reserved space at the end of the batch to account for the
+ * chaining instruction.
+ */
+ const uint32_t batch_padding = GFX9_MI_BATCH_BUFFER_START_length * 4;
/* Cap reallocation to chunk. */
- uint32_t alloc_size = MIN2(cmd_buffer->total_batch_size,
- ANV_MAX_CMD_BUFFER_BATCH_SIZE);
+ uint32_t alloc_size = MIN2(
+ MAX2(batch->allocated_batch_size, size + batch_padding),
+ ANV_MAX_CMD_BUFFER_BATCH_SIZE);
VkResult result = anv_batch_bo_create(cmd_buffer, alloc_size, &new_bbo);
if (result != VK_SUCCESS)
return result;
- cmd_buffer->total_batch_size += alloc_size;
+ batch->allocated_batch_size += alloc_size;
struct anv_batch_bo **seen_bbo = u_vector_add(&cmd_buffer->seen_bbos);
if (seen_bbo == NULL) {
anv_batch_bo_destroy(new_bbo, cmd_buffer);
- return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+ return vk_error(cmd_buffer, VK_ERROR_OUT_OF_HOST_MEMORY);
}
*seen_bbo = new_bbo;
- cmd_buffer_chain_to_batch_bo(cmd_buffer, new_bbo);
+ cmd_buffer_chain_to_batch_bo(cmd_buffer, new_bbo, ANV_CMD_BUFFER_BATCH_MAIN);
list_addtail(&new_bbo->link, &cmd_buffer->batch_bos);
- anv_batch_bo_start(new_bbo, batch, GFX8_MI_BATCH_BUFFER_START_length * 4);
+ anv_batch_bo_start(new_bbo, batch, batch_padding);
return VK_SUCCESS;
}
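+/* Same as anv_cmd_buffer_chain_batch() but operating on the command buffer's
+ * generation batch, which is chained independently of the main batch.
+ */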
static VkResult
-anv_cmd_buffer_grow_batch(struct anv_batch *batch, void *_data)
+anv_cmd_buffer_chain_generation_batch(struct anv_batch *batch, uint32_t size, void *_data)
{
+ /* The caller should not need that much space. Otherwise it should split
+ * its commands.
+ */
+ assert(size <= ANV_MAX_CMD_BUFFER_BATCH_SIZE);
+
struct anv_cmd_buffer *cmd_buffer = _data;
- struct anv_batch_bo *bbo = anv_cmd_buffer_current_batch_bo(cmd_buffer);
+ struct anv_batch_bo *new_bbo = NULL;
+ /* Cap reallocation to chunk. */
+ uint32_t alloc_size = MIN2(
+ MAX2(batch->allocated_batch_size, size),
+ ANV_MAX_CMD_BUFFER_BATCH_SIZE);
- anv_batch_bo_grow(cmd_buffer, bbo, &cmd_buffer->batch, 4096,
- GFX8_MI_BATCH_BUFFER_START_length * 4);
+ VkResult result = anv_batch_bo_create(cmd_buffer, alloc_size, &new_bbo);
+ if (result != VK_SUCCESS)
+ return result;
+
+ batch->allocated_batch_size += alloc_size;
+
+ struct anv_batch_bo **seen_bbo = u_vector_add(&cmd_buffer->seen_bbos);
+ if (seen_bbo == NULL) {
+ anv_batch_bo_destroy(new_bbo, cmd_buffer);
+ return vk_error(cmd_buffer, VK_ERROR_OUT_OF_HOST_MEMORY);
+ }
+ *seen_bbo = new_bbo;
+
+ if (!list_is_empty(&cmd_buffer->generation.batch_bos)) {
+ cmd_buffer_chain_to_batch_bo(cmd_buffer, new_bbo,
+ ANV_CMD_BUFFER_BATCH_GENERATION);
+ }
+
+ list_addtail(&new_bbo->link, &cmd_buffer->generation.batch_bos);
+
+ anv_batch_bo_start(new_bbo, batch, GFX9_MI_BATCH_BUFFER_START_length * 4);
return VK_SUCCESS;
}
@@ -759,9 +681,6 @@ anv_cmd_buffer_grow_batch(struct anv_batch *batch, void *_data)
* surface state offsets so that they are correct relative to our new surface
* state base address at the bottom of the binding table block.
*
- * \see adjust_relocations_from_block_pool()
- * \see adjust_relocations_too_block_pool()
- *
* \param[in] entries The number of surface state entries the binding
* table should be able to hold.
*
@@ -776,9 +695,12 @@ struct anv_state
anv_cmd_buffer_alloc_binding_table(struct anv_cmd_buffer *cmd_buffer,
uint32_t entries, uint32_t *state_offset)
{
+ if (u_vector_length(&cmd_buffer->bt_block_states) == 0)
+ return (struct anv_state) { 0 };
+
struct anv_state *bt_block = u_vector_head(&cmd_buffer->bt_block_states);
- uint32_t bt_size = align_u32(entries * 4, 32);
+ uint32_t bt_size = align(entries * 4, 32);
struct anv_state state = cmd_buffer->bt_next;
if (bt_size > state.alloc_size)
@@ -789,26 +711,131 @@ anv_cmd_buffer_alloc_binding_table(struct anv_cmd_buffer *cmd_buffer,
cmd_buffer->bt_next.map += bt_size;
cmd_buffer->bt_next.alloc_size -= bt_size;
- assert(bt_block->offset < 0);
- *state_offset = -bt_block->offset;
+ if (cmd_buffer->device->info->verx10 >= 125) {
+ /* We're using 3DSTATE_BINDING_TABLE_POOL_ALLOC to change the binding
+ * table address independently from surface state base address. We no
+ * longer need any sort of offsetting.
+ */
+ *state_offset = 0;
+ } else {
+ assert(bt_block->offset < 0);
+ *state_offset = -bt_block->offset;
+ }
return state;
}
struct anv_state
-anv_cmd_buffer_alloc_surface_state(struct anv_cmd_buffer *cmd_buffer)
+anv_cmd_buffer_alloc_surface_states(struct anv_cmd_buffer *cmd_buffer,
+ uint32_t count)
{
+ if (count == 0)
+ return ANV_STATE_NULL;
struct isl_device *isl_dev = &cmd_buffer->device->isl_dev;
- return anv_state_stream_alloc(&cmd_buffer->surface_state_stream,
- isl_dev->ss.size, isl_dev->ss.align);
+ struct anv_state state =
+ anv_state_stream_alloc(&cmd_buffer->surface_state_stream,
+ count * isl_dev->ss.size,
+ isl_dev->ss.align);
+ if (state.map == NULL)
+ anv_batch_set_error(&cmd_buffer->batch, VK_ERROR_OUT_OF_DEVICE_MEMORY);
+ return state;
}
struct anv_state
anv_cmd_buffer_alloc_dynamic_state(struct anv_cmd_buffer *cmd_buffer,
uint32_t size, uint32_t alignment)
{
- return anv_state_stream_alloc(&cmd_buffer->dynamic_state_stream,
- size, alignment);
+ if (size == 0)
+ return ANV_STATE_NULL;
+ assert(cmd_buffer->state.current_db_mode !=
+ ANV_CMD_DESCRIPTOR_BUFFER_MODE_UNKNOWN);
+ struct anv_state state =
+ anv_state_stream_alloc(cmd_buffer->state.current_db_mode ==
+ ANV_CMD_DESCRIPTOR_BUFFER_MODE_BUFFER ?
+ &cmd_buffer->dynamic_state_db_stream :
+ &cmd_buffer->dynamic_state_stream,
+ size, alignment);
+ if (state.map == NULL)
+ anv_batch_set_error(&cmd_buffer->batch, VK_ERROR_OUT_OF_DEVICE_MEMORY);
+ return state;
+}
+
+struct anv_state
+anv_cmd_buffer_alloc_general_state(struct anv_cmd_buffer *cmd_buffer,
+ uint32_t size, uint32_t alignment)
+{
+ if (size == 0)
+ return ANV_STATE_NULL;
+ struct anv_state state =
+ anv_state_stream_alloc(&cmd_buffer->general_state_stream,
+ size, alignment);
+ if (state.map == NULL)
+ anv_batch_set_error(&cmd_buffer->batch, VK_ERROR_OUT_OF_DEVICE_MEMORY);
+ return state;
+}
+
+/** Allocate space associated with a command buffer
+ *
+ * Some commands, like vkCmdBuildAccelerationStructuresKHR(), can end up
+ * needing large amounts of temporary memory. This function deals with those
+ * potentially larger allocations, using a side BO if needed.
+ */
+struct anv_cmd_alloc
+anv_cmd_buffer_alloc_space(struct anv_cmd_buffer *cmd_buffer,
+ size_t size, uint32_t alignment,
+ bool mapped)
+{
+ /* Below 16k, source memory from dynamic state, otherwise allocate a BO. */
+ if (size < 16 * 1024) {
+ struct anv_state state =
+ anv_state_stream_alloc(&cmd_buffer->dynamic_state_stream,
+ size, alignment);
+ if (state.map == NULL) {
+ anv_batch_set_error(&cmd_buffer->batch, VK_ERROR_OUT_OF_DEVICE_MEMORY);
+ return (struct anv_cmd_alloc) {
+ .address = ANV_NULL_ADDRESS,
+ };
+ }
+
+ return (struct anv_cmd_alloc) {
+ .address = anv_state_pool_state_address(
+ &cmd_buffer->device->dynamic_state_pool,
+ state),
+ .map = state.map,
+ .size = size,
+ };
+ }
+
+ assert(alignment <= 4096);
+
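+ /* Allocations that need a CPU mapping come from the batch BO pool,
+ * otherwise from the BVH BO pool.
+ */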
+ struct anv_bo *bo = NULL;
+ VkResult result =
+ anv_bo_pool_alloc(mapped ?
+ &cmd_buffer->device->batch_bo_pool :
+ &cmd_buffer->device->bvh_bo_pool,
+ align(size, 4096), &bo);
+ if (result != VK_SUCCESS) {
+ anv_batch_set_error(&cmd_buffer->batch, VK_ERROR_OUT_OF_DEVICE_MEMORY);
+ return ANV_EMPTY_ALLOC;
+ }
+
+ struct anv_bo **bo_entry =
+ u_vector_add(&cmd_buffer->dynamic_bos);
+ if (bo_entry == NULL) {
+ anv_batch_set_error(&cmd_buffer->batch, VK_ERROR_OUT_OF_HOST_MEMORY);
+ anv_bo_pool_free(bo->map != NULL ?
+ &cmd_buffer->device->batch_bo_pool :
+ &cmd_buffer->device->bvh_bo_pool, bo);
+ return ANV_EMPTY_ALLOC;
+ }
+ *bo_entry = bo;
+
+ return (struct anv_cmd_alloc) {
+ .address = (struct anv_address) { .bo = bo },
+ .map = bo->map,
+ .size = size,
+ };
}
VkResult
@@ -817,7 +844,7 @@ anv_cmd_buffer_new_binding_table_block(struct anv_cmd_buffer *cmd_buffer)
struct anv_state *bt_block = u_vector_add(&cmd_buffer->bt_block_states);
if (bt_block == NULL) {
anv_batch_set_error(&cmd_buffer->batch, VK_ERROR_OUT_OF_HOST_MEMORY);
- return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+ return vk_error(cmd_buffer, VK_ERROR_OUT_OF_HOST_MEMORY);
}
*bt_block = anv_binding_table_pool_alloc(cmd_buffer->device);
@@ -834,55 +861,58 @@ anv_cmd_buffer_new_binding_table_block(struct anv_cmd_buffer *cmd_buffer)
VkResult
anv_cmd_buffer_init_batch_bo_chain(struct anv_cmd_buffer *cmd_buffer)
{
- struct anv_batch_bo *batch_bo;
+ struct anv_batch_bo *batch_bo = NULL;
VkResult result;
list_inithead(&cmd_buffer->batch_bos);
- cmd_buffer->total_batch_size = ANV_MIN_CMD_BUFFER_BATCH_SIZE;
+ cmd_buffer->total_batch_size = 0;
result = anv_batch_bo_create(cmd_buffer,
- cmd_buffer->total_batch_size,
+ ANV_MIN_CMD_BUFFER_BATCH_SIZE,
&batch_bo);
if (result != VK_SUCCESS)
return result;
list_addtail(&batch_bo->link, &cmd_buffer->batch_bos);
- cmd_buffer->batch.alloc = &cmd_buffer->pool->alloc;
+ cmd_buffer->batch.alloc = &cmd_buffer->vk.pool->alloc;
cmd_buffer->batch.user_data = cmd_buffer;
+ cmd_buffer->batch.allocated_batch_size = ANV_MIN_CMD_BUFFER_BATCH_SIZE;
- if (cmd_buffer->device->can_chain_batches) {
- cmd_buffer->batch.extend_cb = anv_cmd_buffer_chain_batch;
- } else {
- cmd_buffer->batch.extend_cb = anv_cmd_buffer_grow_batch;
- }
+ cmd_buffer->batch.extend_cb = anv_cmd_buffer_chain_batch;
+ cmd_buffer->batch.engine_class = cmd_buffer->queue_family->engine_class;
anv_batch_bo_start(batch_bo, &cmd_buffer->batch,
- GFX8_MI_BATCH_BUFFER_START_length * 4);
+ GFX9_MI_BATCH_BUFFER_START_length * 4);
- int success = u_vector_init(&cmd_buffer->seen_bbos,
- sizeof(struct anv_bo *),
- 8 * sizeof(struct anv_bo *));
+ /* Generation batch is initialized empty since it's possible it won't be
+ * used.
+ */
+ list_inithead(&cmd_buffer->generation.batch_bos);
+
+ cmd_buffer->generation.batch.alloc = &cmd_buffer->vk.pool->alloc;
+ cmd_buffer->generation.batch.user_data = cmd_buffer;
+ cmd_buffer->generation.batch.allocated_batch_size = 0;
+ cmd_buffer->generation.batch.extend_cb = anv_cmd_buffer_chain_generation_batch;
+ cmd_buffer->generation.batch.engine_class =
+ cmd_buffer->queue_family->engine_class;
+
+ int success = u_vector_init_pow2(&cmd_buffer->seen_bbos, 8,
+ sizeof(struct anv_bo *));
if (!success)
goto fail_batch_bo;
*(struct anv_batch_bo **)u_vector_add(&cmd_buffer->seen_bbos) = batch_bo;
- /* u_vector requires power-of-two size elements */
- unsigned pow2_state_size = util_next_power_of_two(sizeof(struct anv_state));
- success = u_vector_init(&cmd_buffer->bt_block_states,
- pow2_state_size, 8 * pow2_state_size);
+ success = u_vector_init(&cmd_buffer->bt_block_states, 8,
+ sizeof(struct anv_state));
if (!success)
goto fail_seen_bbos;
+ const bool uses_relocs = cmd_buffer->device->physical->uses_relocs;
result = anv_reloc_list_init(&cmd_buffer->surface_relocs,
- &cmd_buffer->pool->alloc);
- if (result != VK_SUCCESS)
- goto fail_bt_blocks;
- cmd_buffer->last_ss_pool_center = 0;
-
- result = anv_cmd_buffer_new_binding_table_block(cmd_buffer);
+ &cmd_buffer->vk.pool->alloc, uses_relocs);
if (result != VK_SUCCESS)
goto fail_bt_blocks;
@@ -906,7 +936,7 @@ anv_cmd_buffer_fini_batch_bo_chain(struct anv_cmd_buffer *cmd_buffer)
anv_binding_table_pool_free(cmd_buffer->device, *bt_block);
u_vector_finish(&cmd_buffer->bt_block_states);
- anv_reloc_list_finish(&cmd_buffer->surface_relocs, &cmd_buffer->pool->alloc);
+ anv_reloc_list_finish(&cmd_buffer->surface_relocs);
u_vector_finish(&cmd_buffer->seen_bbos);
@@ -916,6 +946,17 @@ anv_cmd_buffer_fini_batch_bo_chain(struct anv_cmd_buffer *cmd_buffer)
list_del(&bbo->link);
anv_batch_bo_destroy(bbo, cmd_buffer);
}
+ /* Also destroy all generation batch buffers */
+ list_for_each_entry_safe(struct anv_batch_bo, bbo,
+ &cmd_buffer->generation.batch_bos, link) {
+ list_del(&bbo->link);
+ anv_batch_bo_destroy(bbo, cmd_buffer);
+ }
+
+ if (cmd_buffer->generation.ring_bo) {
+ anv_bo_pool_free(&cmd_buffer->device->batch_bo_pool,
+ cmd_buffer->generation.ring_bo);
+ }
}
void
@@ -932,18 +973,15 @@ anv_cmd_buffer_reset_batch_bo_chain(struct anv_cmd_buffer *cmd_buffer)
anv_batch_bo_start(anv_cmd_buffer_current_batch_bo(cmd_buffer),
&cmd_buffer->batch,
- GFX8_MI_BATCH_BUFFER_START_length * 4);
+ GFX9_MI_BATCH_BUFFER_START_length * 4);
- while (u_vector_length(&cmd_buffer->bt_block_states) > 1) {
+ while (u_vector_length(&cmd_buffer->bt_block_states) > 0) {
struct anv_state *bt_block = u_vector_remove(&cmd_buffer->bt_block_states);
anv_binding_table_pool_free(cmd_buffer->device, *bt_block);
}
- assert(u_vector_length(&cmd_buffer->bt_block_states) == 1);
- cmd_buffer->bt_next = *(struct anv_state *)u_vector_head(&cmd_buffer->bt_block_states);
- cmd_buffer->bt_next.offset = 0;
+ cmd_buffer->bt_next = ANV_STATE_NULL;
anv_reloc_list_clear(&cmd_buffer->surface_relocs);
- cmd_buffer->last_ss_pool_center = 0;
/* Reset the list of seen buffers */
cmd_buffer->seen_bbos.head = 0;
@@ -953,25 +991,45 @@ anv_cmd_buffer_reset_batch_bo_chain(struct anv_cmd_buffer *cmd_buffer)
*(struct anv_batch_bo **)u_vector_add(&cmd_buffer->seen_bbos) = first_bbo;
+ assert(first_bbo->bo->size == ANV_MIN_CMD_BUFFER_BATCH_SIZE);
+ cmd_buffer->batch.allocated_batch_size = first_bbo->bo->size;
+
+ /* Delete all generation batch bos */
+ list_for_each_entry_safe(struct anv_batch_bo, bbo,
+ &cmd_buffer->generation.batch_bos, link) {
+ list_del(&bbo->link);
+ anv_batch_bo_destroy(bbo, cmd_buffer);
+ }
+
+ /* And reset generation batch */
+ cmd_buffer->generation.batch.allocated_batch_size = 0;
+ cmd_buffer->generation.batch.start = NULL;
+ cmd_buffer->generation.batch.end = NULL;
+ cmd_buffer->generation.batch.next = NULL;
- assert(!cmd_buffer->device->can_chain_batches ||
- first_bbo->bo->size == ANV_MIN_CMD_BUFFER_BATCH_SIZE);
- cmd_buffer->total_batch_size = first_bbo->bo->size;
+ if (cmd_buffer->generation.ring_bo) {
+ anv_bo_pool_free(&cmd_buffer->device->batch_bo_pool,
+ cmd_buffer->generation.ring_bo);
+ cmd_buffer->generation.ring_bo = NULL;
+ }
+
+ cmd_buffer->total_batch_size = 0;
}
void
anv_cmd_buffer_end_batch_buffer(struct anv_cmd_buffer *cmd_buffer)
{
+ const struct intel_device_info *devinfo = cmd_buffer->device->info;
struct anv_batch_bo *batch_bo = anv_cmd_buffer_current_batch_bo(cmd_buffer);
- if (cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY) {
+ if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY) {
/* When we start a batch buffer, we subtract a certain amount of
* padding from the end to ensure that we always have room to emit a
* BATCH_BUFFER_START to chain to the next BO. We need to remove
* that padding before we end the batch; otherwise, we may end up
* with our BATCH_BUFFER_END in another BO.
*/
- cmd_buffer->batch.end += GFX8_MI_BATCH_BUFFER_START_length * 4;
+ cmd_buffer->batch.end += GFX9_MI_BATCH_BUFFER_START_length * 4;
assert(cmd_buffer->batch.start == batch_bo->bo->map);
assert(cmd_buffer->batch.end == batch_bo->bo->map + batch_bo->bo->size);
@@ -983,50 +1041,29 @@ anv_cmd_buffer_end_batch_buffer(struct anv_cmd_buffer *cmd_buffer)
*/
batch_bo->chained = anv_cmd_buffer_is_chainable(cmd_buffer);
if (batch_bo->chained)
- emit_batch_buffer_start(cmd_buffer, batch_bo->bo, 0);
+ emit_batch_buffer_start(&cmd_buffer->batch, batch_bo->bo, 0);
else
- anv_batch_emit(&cmd_buffer->batch, GFX8_MI_BATCH_BUFFER_END, bbe);
+ anv_batch_emit(&cmd_buffer->batch, GFX9_MI_BATCH_BUFFER_END, bbe);
/* Round batch up to an even number of dwords. */
if ((cmd_buffer->batch.next - cmd_buffer->batch.start) & 4)
- anv_batch_emit(&cmd_buffer->batch, GFX8_MI_NOOP, noop);
+ anv_batch_emit(&cmd_buffer->batch, GFX9_MI_NOOP, noop);
cmd_buffer->exec_mode = ANV_CMD_BUFFER_EXEC_MODE_PRIMARY;
} else {
- assert(cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY);
+ assert(cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY);
/* If this is a secondary command buffer, we need to determine the
* mode in which it will be executed with vkExecuteCommands. We
* determine this statically here so that this stays in sync with the
* actual ExecuteCommands implementation.
*/
const uint32_t length = cmd_buffer->batch.next - cmd_buffer->batch.start;
- if (!cmd_buffer->device->can_chain_batches) {
- cmd_buffer->exec_mode = ANV_CMD_BUFFER_EXEC_MODE_GROW_AND_EMIT;
- } else if (cmd_buffer->device->physical->use_call_secondary) {
+ if (cmd_buffer->device->physical->use_call_secondary) {
cmd_buffer->exec_mode = ANV_CMD_BUFFER_EXEC_MODE_CALL_AND_RETURN;
- /* If the secondary command buffer begins & ends in the same BO and
- * its length is less than the length of CS prefetch, add some NOOPs
- * instructions so the last MI_BATCH_BUFFER_START is outside the CS
- * prefetch.
- */
- if (cmd_buffer->batch_bos.next == cmd_buffer->batch_bos.prev) {
- const struct intel_device_info *devinfo = &cmd_buffer->device->info;
- /* Careful to have everything in signed integer. */
- int32_t prefetch_len = devinfo->cs_prefetch_size;
- int32_t batch_len =
- cmd_buffer->batch.next - cmd_buffer->batch.start;
-
- for (int32_t i = 0; i < (prefetch_len - batch_len); i += 4)
- anv_batch_emit(&cmd_buffer->batch, GFX8_MI_NOOP, noop);
- }
void *jump_addr =
- anv_batch_emitn(&cmd_buffer->batch,
- GFX8_MI_BATCH_BUFFER_START_length,
- GFX8_MI_BATCH_BUFFER_START,
- .AddressSpaceIndicator = ASI_PPGTT,
- .SecondLevelBatchBuffer = Firstlevelbatch) +
- (GFX8_MI_BATCH_BUFFER_START_BatchBufferStartAddress_start / 8);
+ anv_genX(devinfo, batch_emit_return)(&cmd_buffer->batch) +
+ (GFX9_MI_BATCH_BUFFER_START_BatchBufferStartAddress_start / 8);
cmd_buffer->return_addr = anv_batch_address(&cmd_buffer->batch, jump_addr);
/* The emit above may have caused us to chain batch buffers which
@@ -1054,11 +1091,11 @@ anv_cmd_buffer_end_batch_buffer(struct anv_cmd_buffer *cmd_buffer)
* have room for the chaining command. Since we're about to emit the
* chaining command, let's set it back where it should go.
*/
- cmd_buffer->batch.end += GFX8_MI_BATCH_BUFFER_START_length * 4;
+ cmd_buffer->batch.end += GFX9_MI_BATCH_BUFFER_START_length * 4;
assert(cmd_buffer->batch.start == batch_bo->bo->map);
assert(cmd_buffer->batch.end == batch_bo->bo->map + batch_bo->bo->size);
- emit_batch_buffer_start(cmd_buffer, batch_bo->bo, 0);
+ emit_batch_buffer_start(&cmd_buffer->batch, batch_bo->bo, 0);
assert(cmd_buffer->batch.start == batch_bo->bo->map);
} else {
cmd_buffer->exec_mode = ANV_CMD_BUFFER_EXEC_MODE_COPY_AND_CHAIN;
@@ -1066,6 +1103,11 @@ anv_cmd_buffer_end_batch_buffer(struct anv_cmd_buffer *cmd_buffer)
}
anv_batch_bo_finish(batch_bo, &cmd_buffer->batch);
+
+ /* Account for the amount of data written into batch_bo in the command
+ * buffer's total batch size.
+ */
+ cmd_buffer->total_batch_size += batch_bo->length;
}
static VkResult
@@ -1075,7 +1117,7 @@ anv_cmd_buffer_add_seen_bbos(struct anv_cmd_buffer *cmd_buffer,
list_for_each_entry(struct anv_batch_bo, bbo, list, link) {
struct anv_batch_bo **bbo_ptr = u_vector_add(&cmd_buffer->seen_bbos);
if (bbo_ptr == NULL)
- return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+ return vk_error(cmd_buffer, VK_ERROR_OUT_OF_HOST_MEMORY);
*bbo_ptr = bbo;
}
@@ -1092,21 +1134,13 @@ anv_cmd_buffer_add_secondary(struct anv_cmd_buffer *primary,
case ANV_CMD_BUFFER_EXEC_MODE_EMIT:
anv_batch_emit_batch(&primary->batch, &secondary->batch);
break;
- case ANV_CMD_BUFFER_EXEC_MODE_GROW_AND_EMIT: {
- struct anv_batch_bo *bbo = anv_cmd_buffer_current_batch_bo(primary);
- unsigned length = secondary->batch.end - secondary->batch.start;
- anv_batch_bo_grow(primary, bbo, &primary->batch, length,
- GFX8_MI_BATCH_BUFFER_START_length * 4);
- anv_batch_emit_batch(&primary->batch, &secondary->batch);
- break;
- }
case ANV_CMD_BUFFER_EXEC_MODE_CHAIN: {
struct anv_batch_bo *first_bbo =
list_first_entry(&secondary->batch_bos, struct anv_batch_bo, link);
struct anv_batch_bo *last_bbo =
list_last_entry(&secondary->batch_bos, struct anv_batch_bo, link);
- emit_batch_buffer_start(primary, first_bbo->bo, 0);
+ emit_batch_buffer_start(&primary->batch, first_bbo->bo, 0);
struct anv_batch_bo *this_bbo = anv_cmd_buffer_current_batch_bo(primary);
assert(primary->batch.start == this_bbo->bo->map);
@@ -1135,30 +1169,23 @@ anv_cmd_buffer_add_secondary(struct anv_cmd_buffer *primary,
struct anv_batch_bo *last_bbo =
list_last_entry(&copy_list, struct anv_batch_bo, link);
- cmd_buffer_chain_to_batch_bo(primary, first_bbo);
+ cmd_buffer_chain_to_batch_bo(primary, first_bbo,
+ ANV_CMD_BUFFER_BATCH_MAIN);
list_splicetail(&copy_list, &primary->batch_bos);
anv_batch_bo_continue(last_bbo, &primary->batch,
- GFX8_MI_BATCH_BUFFER_START_length * 4);
+ GFX9_MI_BATCH_BUFFER_START_length * 4);
break;
}
case ANV_CMD_BUFFER_EXEC_MODE_CALL_AND_RETURN: {
struct anv_batch_bo *first_bbo =
list_first_entry(&secondary->batch_bos, struct anv_batch_bo, link);
- uint64_t *write_return_addr =
- anv_batch_emitn(&primary->batch,
- GFX8_MI_STORE_DATA_IMM_length + 1 /* QWord write */,
- GFX8_MI_STORE_DATA_IMM,
- .Address = secondary->return_addr)
- + (GFX8_MI_STORE_DATA_IMM_ImmediateData_start / 8);
-
- emit_batch_buffer_start(primary, first_bbo->bo, 0);
-
- *write_return_addr =
- anv_address_physical(anv_batch_address(&primary->batch,
- primary->batch.next));
+ anv_genX(primary->device->info, batch_emit_secondary_call)(
+ &primary->batch,
+ (struct anv_address) { .bo = first_bbo->bo },
+ secondary->return_addr);
anv_cmd_buffer_add_seen_bbos(primary, &secondary->batch_bos);
break;
@@ -1167,904 +1194,524 @@ anv_cmd_buffer_add_secondary(struct anv_cmd_buffer *primary,
assert(!"Invalid execution mode");
}
- anv_reloc_list_append(&primary->surface_relocs, &primary->pool->alloc,
- &secondary->surface_relocs, 0);
-}
-
-struct anv_execbuf {
- struct drm_i915_gem_execbuffer2 execbuf;
-
- struct drm_i915_gem_execbuffer_ext_timeline_fences timeline_fences;
-
- struct drm_i915_gem_exec_object2 * objects;
- uint32_t bo_count;
- struct anv_bo ** bos;
-
- /* Allocated length of the 'objects' and 'bos' arrays */
- uint32_t array_length;
+ anv_reloc_list_append(&primary->surface_relocs, &secondary->surface_relocs);
- /* List of relocations for surface states, only used with platforms not
- * using softpin.
+ /* Add the amount of data written into the secondary command buffer to the
+ * primary command buffer's total.
*/
- void * surface_states_relocs;
-
- /* Indicates whether any of the command buffers have relocations. This
- * doesn't not necessarily mean we'll need the kernel to process them. It
- * might be that a previous execbuf has already placed things in the VMA
- * and we can make i915 skip the relocations.
- */
- bool has_relocs;
-
- const VkAllocationCallbacks * alloc;
- VkSystemAllocationScope alloc_scope;
-
- int perf_query_pass;
-};
-
-static void
-anv_execbuf_init(struct anv_execbuf *exec)
-{
- memset(exec, 0, sizeof(*exec));
-}
-
-static void
-anv_execbuf_finish(struct anv_execbuf *exec)
-{
- vk_free(exec->alloc, exec->surface_states_relocs);
- vk_free(exec->alloc, exec->objects);
- vk_free(exec->alloc, exec->bos);
+ primary->total_batch_size += secondary->total_batch_size;
}
-static void
-anv_execbuf_add_ext(struct anv_execbuf *exec,
- uint32_t ext_name,
- struct i915_user_extension *ext)
+void
+anv_cmd_buffer_chain_command_buffers(struct anv_cmd_buffer **cmd_buffers,
+ uint32_t num_cmd_buffers)
{
- __u64 *iter = &exec->execbuf.cliprects_ptr;
-
- exec->execbuf.flags |= I915_EXEC_USE_EXTENSIONS;
-
- while (*iter != 0) {
- iter = (__u64 *) &((struct i915_user_extension *)(uintptr_t)*iter)->next_extension;
+ if (!anv_cmd_buffer_is_chainable(cmd_buffers[0])) {
+ assert(num_cmd_buffers == 1);
+ return;
}
- ext->name = ext_name;
+ /* Chain the N-1 first batch buffers */
+ for (uint32_t i = 0; i < (num_cmd_buffers - 1); i++) {
+ assert(cmd_buffers[i]->companion_rcs_cmd_buffer == NULL);
+ anv_cmd_buffer_record_chain_submit(cmd_buffers[i], cmd_buffers[i + 1]);
+ }
- *iter = (uintptr_t) ext;
+ /* Put an end to the last one */
+ anv_cmd_buffer_record_end_submit(cmd_buffers[num_cmd_buffers - 1]);
}
-static VkResult
-anv_execbuf_add_bo_bitset(struct anv_device *device,
- struct anv_execbuf *exec,
- uint32_t dep_words,
- BITSET_WORD *deps,
- uint32_t extra_flags);
-
-static VkResult
-anv_execbuf_add_bo(struct anv_device *device,
- struct anv_execbuf *exec,
- struct anv_bo *bo,
- struct anv_reloc_list *relocs,
- uint32_t extra_flags)
+static void
+anv_print_batch(struct anv_device *device,
+ struct anv_queue *queue,
+ struct anv_cmd_buffer *cmd_buffer)
{
- struct drm_i915_gem_exec_object2 *obj = NULL;
-
- bo = anv_bo_unwrap(bo);
-
- if (bo->index < exec->bo_count && exec->bos[bo->index] == bo)
- obj = &exec->objects[bo->index];
-
- if (obj == NULL) {
- /* We've never seen this one before. Add it to the list and assign
- * an id that we can use later.
- */
- if (exec->bo_count >= exec->array_length) {
- uint32_t new_len = exec->objects ? exec->array_length * 2 : 64;
-
- struct drm_i915_gem_exec_object2 *new_objects =
- vk_alloc(exec->alloc, new_len * sizeof(*new_objects), 8, exec->alloc_scope);
- if (new_objects == NULL)
- return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
-
- struct anv_bo **new_bos =
- vk_alloc(exec->alloc, new_len * sizeof(*new_bos), 8, exec->alloc_scope);
- if (new_bos == NULL) {
- vk_free(exec->alloc, new_objects);
- return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
- }
-
- if (exec->objects) {
- memcpy(new_objects, exec->objects,
- exec->bo_count * sizeof(*new_objects));
- memcpy(new_bos, exec->bos,
- exec->bo_count * sizeof(*new_bos));
- }
-
- vk_free(exec->alloc, exec->objects);
- vk_free(exec->alloc, exec->bos);
-
- exec->objects = new_objects;
- exec->bos = new_bos;
- exec->array_length = new_len;
- }
-
- assert(exec->bo_count < exec->array_length);
-
- bo->index = exec->bo_count++;
- obj = &exec->objects[bo->index];
- exec->bos[bo->index] = bo;
-
- obj->handle = bo->gem_handle;
- obj->relocation_count = 0;
- obj->relocs_ptr = 0;
- obj->alignment = 0;
- obj->offset = bo->offset;
- obj->flags = bo->flags | extra_flags;
- obj->rsvd1 = 0;
- obj->rsvd2 = 0;
+ struct anv_batch_bo *bbo =
+ list_first_entry(&cmd_buffer->batch_bos, struct anv_batch_bo, link);
+ device->cmd_buffer_being_decoded = cmd_buffer;
+ struct intel_batch_decode_ctx *ctx = queue->decoder;
+
+ if (cmd_buffer->is_companion_rcs_cmd_buffer) {
+ int render_queue_idx =
+ anv_get_first_render_queue_index(device->physical);
+ ctx = &device->decoder[render_queue_idx];
}
- if (extra_flags & EXEC_OBJECT_WRITE) {
- obj->flags |= EXEC_OBJECT_WRITE;
- obj->flags &= ~EXEC_OBJECT_ASYNC;
+ if (INTEL_DEBUG(DEBUG_BATCH)) {
+ intel_print_batch(ctx, bbo->bo->map,
+ bbo->bo->size, bbo->bo->offset, false);
}
-
- if (relocs != NULL) {
- assert(obj->relocation_count == 0);
-
- if (relocs->num_relocs > 0) {
- /* This is the first time we've ever seen a list of relocations for
- * this BO. Go ahead and set the relocations and then walk the list
- * of relocations and add them all.
- */
- exec->has_relocs = true;
- obj->relocation_count = relocs->num_relocs;
- obj->relocs_ptr = (uintptr_t) relocs->relocs;
-
- for (size_t i = 0; i < relocs->num_relocs; i++) {
- VkResult result;
-
- /* A quick sanity check on relocations */
- assert(relocs->relocs[i].offset < bo->size);
- result = anv_execbuf_add_bo(device, exec, relocs->reloc_bos[i],
- NULL, extra_flags);
- if (result != VK_SUCCESS)
- return result;
- }
- }
-
- return anv_execbuf_add_bo_bitset(device, exec, relocs->dep_words,
- relocs->deps, extra_flags);
+ if (INTEL_DEBUG(DEBUG_BATCH_STATS)) {
+ intel_batch_stats(ctx, bbo->bo->map,
+ bbo->bo->size, bbo->bo->offset, false);
}
-
- return VK_SUCCESS;
+ device->cmd_buffer_being_decoded = NULL;
}
-/* Add BO dependencies to execbuf */
-static VkResult
-anv_execbuf_add_bo_bitset(struct anv_device *device,
- struct anv_execbuf *exec,
- uint32_t dep_words,
- BITSET_WORD *deps,
- uint32_t extra_flags)
+void
+anv_cmd_buffer_exec_batch_debug(struct anv_queue *queue,
+ uint32_t cmd_buffer_count,
+ struct anv_cmd_buffer **cmd_buffers,
+ struct anv_query_pool *perf_query_pool,
+ uint32_t perf_query_pass)
{
- for (uint32_t w = 0; w < dep_words; w++) {
- BITSET_WORD mask = deps[w];
- while (mask) {
- int i = u_bit_scan(&mask);
- uint32_t gem_handle = w * BITSET_WORDBITS + i;
- struct anv_bo *bo = anv_device_lookup_bo(device, gem_handle);
- assert(bo->refcount > 0);
- VkResult result =
- anv_execbuf_add_bo(device, exec, bo, NULL, extra_flags);
- if (result != VK_SUCCESS)
- return result;
- }
- }
-
- return VK_SUCCESS;
-}
+ if (!INTEL_DEBUG(DEBUG_BATCH | DEBUG_BATCH_STATS))
+ return;
-static void
-anv_cmd_buffer_process_relocs(struct anv_cmd_buffer *cmd_buffer,
- struct anv_reloc_list *list)
-{
- for (size_t i = 0; i < list->num_relocs; i++)
- list->relocs[i].target_handle = anv_bo_unwrap(list->reloc_bos[i])->index;
-}
+ struct anv_device *device = queue->device;
+ const bool has_perf_query = perf_query_pool && perf_query_pass >= 0 &&
+ cmd_buffer_count;
+ uint64_t frame_id = device->debug_frame_desc->frame_id;
-static void
-adjust_relocations_from_state_pool(struct anv_state_pool *pool,
- struct anv_reloc_list *relocs,
- uint32_t last_pool_center_bo_offset)
-{
- assert(last_pool_center_bo_offset <= pool->block_pool.center_bo_offset);
- uint32_t delta = pool->block_pool.center_bo_offset - last_pool_center_bo_offset;
-
- for (size_t i = 0; i < relocs->num_relocs; i++) {
- /* All of the relocations from this block pool to other BO's should
- * have been emitted relative to the surface block pool center. We
- * need to add the center offset to make them relative to the
- * beginning of the actual GEM bo.
- */
- relocs->relocs[i].offset += delta;
- }
-}
+ if (!intel_debug_batch_in_range(device->debug_frame_desc->frame_id))
+ return;
+ fprintf(stderr, "Batch for frame %"PRIu64" on queue %d\n",
+ frame_id, (int)(queue - device->queues));
-static void
-adjust_relocations_to_state_pool(struct anv_state_pool *pool,
- struct anv_bo *from_bo,
- struct anv_reloc_list *relocs,
- uint32_t last_pool_center_bo_offset)
-{
- assert(!from_bo->is_wrapper);
- assert(last_pool_center_bo_offset <= pool->block_pool.center_bo_offset);
- uint32_t delta = pool->block_pool.center_bo_offset - last_pool_center_bo_offset;
-
- /* When we initially emit relocations into a block pool, we don't
- * actually know what the final center_bo_offset will be so we just emit
- * it as if center_bo_offset == 0. Now that we know what the center
- * offset is, we need to walk the list of relocations and adjust any
- * relocations that point to the pool bo with the correct offset.
- */
- for (size_t i = 0; i < relocs->num_relocs; i++) {
- if (relocs->reloc_bos[i] == pool->block_pool.bo) {
- /* Adjust the delta value in the relocation to correctly
- * correspond to the new delta. Initially, this value may have
- * been negative (if treated as unsigned), but we trust in
- * uint32_t roll-over to fix that for us at this point.
- */
- relocs->relocs[i].delta += delta;
+ if (cmd_buffer_count) {
+ if (has_perf_query) {
+ struct anv_bo *pass_batch_bo = perf_query_pool->bo;
+ uint64_t pass_batch_offset =
+ khr_perf_query_preamble_offset(perf_query_pool, perf_query_pass);
- /* Since the delta has changed, we need to update the actual
- * relocated value with the new presumed value. This function
- * should only be called on batch buffers, so we know it isn't in
- * use by the GPU at the moment.
- */
- assert(relocs->relocs[i].offset < from_bo->size);
- write_reloc(pool->block_pool.device,
- from_bo->map + relocs->relocs[i].offset,
- relocs->relocs[i].presumed_offset +
- relocs->relocs[i].delta, false);
+ if (INTEL_DEBUG(DEBUG_BATCH)) {
+ intel_print_batch(queue->decoder,
+ pass_batch_bo->map + pass_batch_offset, 64,
+ pass_batch_bo->offset + pass_batch_offset, false);
+ }
}
- }
-}
-
-static void
-anv_reloc_list_apply(struct anv_device *device,
- struct anv_reloc_list *list,
- struct anv_bo *bo,
- bool always_relocate)
-{
- bo = anv_bo_unwrap(bo);
-
- for (size_t i = 0; i < list->num_relocs; i++) {
- struct anv_bo *target_bo = anv_bo_unwrap(list->reloc_bos[i]);
- if (list->relocs[i].presumed_offset == target_bo->offset &&
- !always_relocate)
- continue;
- void *p = bo->map + list->relocs[i].offset;
- write_reloc(device, p, target_bo->offset + list->relocs[i].delta, true);
- list->relocs[i].presumed_offset = target_bo->offset;
+ for (uint32_t i = 0; i < cmd_buffer_count; i++)
+ anv_print_batch(device, queue, cmd_buffers[i]);
+ } else if (INTEL_DEBUG(DEBUG_BATCH)) {
+ intel_print_batch(queue->decoder, device->trivial_batch_bo->map,
+ device->trivial_batch_bo->size,
+ device->trivial_batch_bo->offset, false);
}
}
-/**
- * This function applies the relocation for a command buffer and writes the
- * actual addresses into the buffers as per what we were told by the kernel on
- * the previous execbuf2 call. This should be safe to do because, for each
- * relocated address, we have two cases:
- *
- * 1) The target BO is inactive (as seen by the kernel). In this case, it is
- * not in use by the GPU so updating the address is 100% ok. It won't be
- * in-use by the GPU (from our context) again until the next execbuf2
- * happens. If the kernel decides to move it in the next execbuf2, it
- * will have to do the relocations itself, but that's ok because it should
- * have all of the information needed to do so.
+/* We lock around execbuf for two main reasons:
*
- * 2) The target BO is active (as seen by the kernel). In this case, it
- * hasn't moved since the last execbuffer2 call because GTT shuffling
- * *only* happens when the BO is idle. (From our perspective, it only
- * happens inside the execbuffer2 ioctl, but the shuffling may be
- * triggered by another ioctl, with full-ppgtt this is limited to only
- * execbuffer2 ioctls on the same context, or memory pressure.) Since the
- * target BO hasn't moved, our anv_bo::offset exactly matches the BO's GTT
- * address and the relocated value we are writing into the BO will be the
- * same as the value that is already there.
+ * 1) When a block pool is resized, we create a new gem handle with a
+ * different size and, in the case of surface states, possibly a different
+ * center offset but we re-use the same anv_bo struct when we do so. If
+ * this happens in the middle of setting up an execbuf, we could end up
+ * with our list of BOs out of sync with our list of gem handles.
*
- * There is also a possibility that the target BO is active but the exact
- * RENDER_SURFACE_STATE object we are writing the relocation into isn't in
- * use. In this case, the address currently in the RENDER_SURFACE_STATE
- * may be stale but it's still safe to write the relocation because that
- * particular RENDER_SURFACE_STATE object isn't in-use by the GPU and
- * won't be until the next execbuf2 call.
+ * 2) The algorithm we use for building the list of unique buffers isn't
+ * thread-safe. While the client is supposed to synchronize around
+ * QueueSubmit, this would be extremely difficult to debug if it ever came
+ * up in the wild due to a broken app. It's better to play it safe and
+ * just lock around QueueSubmit.
*
- * By doing relocations on the CPU, we can tell the kernel that it doesn't
- * need to bother. We want to do this because the surface state buffer is
- * used by every command buffer so, if the kernel does the relocations, it
- * will always be busy and the kernel will always stall. This is also
- * probably the fastest mechanism for doing relocations since the kernel would
- * have to make a full copy of all the relocations lists.
+ * Since the only other things that ever take the device lock, such as block
+ * pool resizes, happen only rarely, the lock will almost never be contended,
+ * so taking it isn't really an expensive operation in this case.
*/
-static bool
-execbuf_can_skip_relocations(struct anv_execbuf *exec)
+static inline VkResult
+anv_queue_exec_locked(struct anv_queue *queue,
+ uint32_t wait_count,
+ const struct vk_sync_wait *waits,
+ uint32_t cmd_buffer_count,
+ struct anv_cmd_buffer **cmd_buffers,
+ uint32_t signal_count,
+ const struct vk_sync_signal *signals,
+ struct anv_query_pool *perf_query_pool,
+ uint32_t perf_query_pass,
+ struct anv_utrace_submit *utrace_submit)
{
- if (!exec->has_relocs)
- return true;
-
- static int userspace_relocs = -1;
- if (userspace_relocs < 0)
- userspace_relocs = env_var_as_boolean("ANV_USERSPACE_RELOCS", true);
- if (!userspace_relocs)
- return false;
-
- /* First, we have to check to see whether or not we can even do the
- * relocation. New buffers which have never been submitted to the kernel
- * don't have a valid offset so we need to let the kernel do relocations so
- * that we can get offsets for them. On future execbuf2 calls, those
- * buffers will have offsets and we will be able to skip relocating.
- * Invalid offsets are indicated by anv_bo::offset == (uint64_t)-1.
- */
- for (uint32_t i = 0; i < exec->bo_count; i++) {
- assert(!exec->bos[i]->is_wrapper);
- if (exec->bos[i]->offset == (uint64_t)-1)
- return false;
- }
-
- return true;
-}
+ struct anv_device *device = queue->device;
+ VkResult result = VK_SUCCESS;
-static void
-relocate_cmd_buffer(struct anv_cmd_buffer *cmd_buffer,
- struct anv_execbuf *exec)
-{
- /* Since surface states are shared between command buffers and we don't
- * know what order they will be submitted to the kernel, we don't know
- * what address is actually written in the surface state object at any
- * given time. The only option is to always relocate them.
+ /* We only need to synchronize the main & companion command buffers if we
+ * have a companion command buffer somewhere in the list of command
+ * buffers.
*/
- struct anv_bo *surface_state_bo =
- anv_bo_unwrap(cmd_buffer->device->surface_state_pool.block_pool.bo);
- anv_reloc_list_apply(cmd_buffer->device, &cmd_buffer->surface_relocs,
- surface_state_bo,
- true /* always relocate surface states */);
-
- /* Since we own all of the batch buffers, we know what values are stored
- * in the relocated addresses and only have to update them if the offsets
- * have changed.
- */
- struct anv_batch_bo **bbo;
- u_vector_foreach(bbo, &cmd_buffer->seen_bbos) {
- anv_reloc_list_apply(cmd_buffer->device,
- &(*bbo)->relocs, (*bbo)->bo, false);
+ bool needs_companion_sync = false;
+ for (uint32_t i = 0; i < cmd_buffer_count; i++) {
+ if (cmd_buffers[i]->companion_rcs_cmd_buffer != NULL) {
+ needs_companion_sync = true;
+ break;
+ }
}
- for (uint32_t i = 0; i < exec->bo_count; i++)
- exec->objects[i].offset = exec->bos[i]->offset;
-}
-
-static void
-reset_cmd_buffer_surface_offsets(struct anv_cmd_buffer *cmd_buffer)
-{
- /* In the case where we fall back to doing kernel relocations, we need to
- * ensure that the relocation list is valid. All relocations on the batch
- * buffers are already valid and kept up-to-date. Since surface states are
- * shared between command buffers and we don't know what order they will be
- * submitted to the kernel, we don't know what address is actually written
- * in the surface state object at any given time. The only option is to set
- * a bogus presumed offset and let the kernel relocate them.
- */
- for (size_t i = 0; i < cmd_buffer->surface_relocs.num_relocs; i++)
- cmd_buffer->surface_relocs.relocs[i].presumed_offset = -1;
-}
-
-static VkResult
-setup_execbuf_for_cmd_buffer(struct anv_execbuf *execbuf,
- struct anv_cmd_buffer *cmd_buffer)
-{
- struct anv_state_pool *ss_pool =
- &cmd_buffer->device->surface_state_pool;
+ result =
+ device->kmd_backend->queue_exec_locked(
+ queue,
+ wait_count, waits,
+ cmd_buffer_count, cmd_buffers,
+ needs_companion_sync ? 0 : signal_count, signals,
+ perf_query_pool,
+ perf_query_pass,
+ utrace_submit);
+ if (result != VK_SUCCESS)
+ return result;
- adjust_relocations_from_state_pool(ss_pool, &cmd_buffer->surface_relocs,
- cmd_buffer->last_ss_pool_center);
- VkResult result;
- if (cmd_buffer->device->physical->use_softpin) {
- /* Add surface dependencies (BOs) to the execbuf */
- anv_execbuf_add_bo_bitset(cmd_buffer->device, execbuf,
- cmd_buffer->surface_relocs.dep_words,
- cmd_buffer->surface_relocs.deps, 0);
- } else {
- /* Since we aren't in the softpin case, all of our STATE_BASE_ADDRESS BOs
- * will get added automatically by processing relocations on the batch
- * buffer. We have to add the surface state BO manually because it has
- * relocations of its own that we need to be sure are processsed.
+ if (needs_companion_sync) {
+ struct vk_sync_wait companion_sync = {
+ .sync = queue->companion_sync,
+ };
+ /* If any of the command buffers had a companion batch, the submission
+ * backend will signal queue->companion_sync, so to ensure completion,
+ * we just need to wait on that fence.
*/
- result = anv_execbuf_add_bo(cmd_buffer->device, execbuf,
- ss_pool->block_pool.bo,
- &cmd_buffer->surface_relocs, 0);
- if (result != VK_SUCCESS)
- return result;
- }
-
- /* First, we walk over all of the bos we've seen and add them and their
- * relocations to the validate list.
- */
- struct anv_batch_bo **bbo;
- u_vector_foreach(bbo, &cmd_buffer->seen_bbos) {
- adjust_relocations_to_state_pool(ss_pool, (*bbo)->bo, &(*bbo)->relocs,
- cmd_buffer->last_ss_pool_center);
-
- result = anv_execbuf_add_bo(cmd_buffer->device, execbuf,
- (*bbo)->bo, &(*bbo)->relocs, 0);
- if (result != VK_SUCCESS)
- return result;
+ result =
+ device->kmd_backend->queue_exec_locked(queue,
+ 1, &companion_sync,
+ 0, NULL,
+ signal_count, signals,
+ NULL, 0,
+ NULL);
}
- /* Now that we've adjusted all of the surface state relocations, we need to
- * record the surface state pool center so future executions of the command
- * buffer can adjust correctly.
- */
- cmd_buffer->last_ss_pool_center = ss_pool->block_pool.center_bo_offset;
-
- return VK_SUCCESS;
+ return result;
}
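For readers following the companion-sync hand-off above, here is a minimal standalone model of the control flow. It is not driver code and the model_* names are hypothetical: when any command buffer brings a companion RCS batch, the first submission withholds the caller's signals and a second submission waits on the companion fence before signalling them.

#include <stdbool.h>
#include <stdio.h>

static void model_exec(int wait_count, int cmd_buffer_count, int signal_count)
{
   printf("exec: waits=%d cmd_buffers=%d signals=%d\n",
          wait_count, cmd_buffer_count, signal_count);
}

static void model_queue_exec_locked(bool needs_companion_sync, int wait_count,
                                    int cmd_buffer_count, int signal_count)
{
   /* Main submission: hold the signals back if a companion batch exists. */
   model_exec(wait_count, cmd_buffer_count,
              needs_companion_sync ? 0 : signal_count);

   /* Follow-up submission: wait on the companion fence, then signal. */
   if (needs_companion_sync)
      model_exec(1 /* companion_sync */, 0, signal_count);
}

int main(void)
{
   model_queue_exec_locked(true, 2, 3, 1);  /* prints two exec lines */
   model_queue_exec_locked(false, 2, 3, 1); /* prints one exec line */
   return 0;
}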
-static void
-chain_command_buffers(struct anv_cmd_buffer **cmd_buffers,
- uint32_t num_cmd_buffers)
+static inline bool
+can_chain_query_pools(struct anv_query_pool *p1, struct anv_query_pool *p2)
{
- if (!anv_cmd_buffer_is_chainable(cmd_buffers[0])) {
- assert(num_cmd_buffers == 1);
- return;
- }
-
- /* Chain the N-1 first batch buffers */
- for (uint32_t i = 0; i < (num_cmd_buffers - 1); i++)
- anv_cmd_buffer_record_chain_submit(cmd_buffers[i], cmd_buffers[i + 1]);
-
- /* Put an end to the last one */
- anv_cmd_buffer_record_end_submit(cmd_buffers[num_cmd_buffers - 1]);
+ return (!p1 || !p2 || p1 == p2);
}
static VkResult
-setup_execbuf_for_cmd_buffers(struct anv_execbuf *execbuf,
- struct anv_queue *queue,
- struct anv_cmd_buffer **cmd_buffers,
- uint32_t num_cmd_buffers)
+anv_queue_submit_sparse_bind_locked(struct anv_queue *queue,
+ struct vk_queue_submit *submit)
{
struct anv_device *device = queue->device;
- struct anv_state_pool *ss_pool = &device->surface_state_pool;
VkResult result;
- /* Edit the tail of the command buffers to chain them all together if they
- * can be.
+ /* When fake sparse is enabled, we do accept creating "sparse" resources
+ * but can't really handle sparse submission. Fake sparse is
+ * supposed to be used by applications that request sparse to be enabled
+ * but don't actually *use* it.
*/
- chain_command_buffers(cmd_buffers, num_cmd_buffers);
-
- for (uint32_t i = 0; i < num_cmd_buffers; i++) {
- result = setup_execbuf_for_cmd_buffer(execbuf, cmd_buffers[i]);
- if (result != VK_SUCCESS)
- return result;
+ if (device->physical->sparse_type == ANV_SPARSE_TYPE_NOT_SUPPORTED) {
+ if (INTEL_DEBUG(DEBUG_SPARSE))
+ fprintf(stderr, "=== application submitting sparse operations: "
+ "buffer_bind:%d image_opaque_bind:%d image_bind:%d\n",
+ submit->buffer_bind_count, submit->image_opaque_bind_count,
+ submit->image_bind_count);
+ return vk_queue_set_lost(&queue->vk, "Sparse binding not supported");
}
- /* Add all the global BOs to the object list for softpin case. */
- if (device->physical->use_softpin) {
- anv_block_pool_foreach_bo(bo, &ss_pool->block_pool) {
- result = anv_execbuf_add_bo(device, execbuf, bo, NULL, 0);
- if (result != VK_SUCCESS)
- return result;
- }
+ assert(submit->command_buffer_count == 0);
- struct anv_block_pool *pool;
- pool = &device->dynamic_state_pool.block_pool;
- anv_block_pool_foreach_bo(bo, pool) {
- result = anv_execbuf_add_bo(device, execbuf, bo, NULL, 0);
- if (result != VK_SUCCESS)
- return result;
- }
+ if (INTEL_DEBUG(DEBUG_SPARSE)) {
+ fprintf(stderr, "[sparse submission, buffers:%u opaque_images:%u "
+ "images:%u waits:%u signals:%u]\n",
+ submit->buffer_bind_count,
+ submit->image_opaque_bind_count,
+ submit->image_bind_count,
+ submit->wait_count, submit->signal_count);
+ }
- pool = &device->general_state_pool.block_pool;
- anv_block_pool_foreach_bo(bo, pool) {
- result = anv_execbuf_add_bo(device, execbuf, bo, NULL, 0);
- if (result != VK_SUCCESS)
- return result;
- }
+ struct anv_sparse_submission sparse_submit = {
+ .queue = queue,
+ .binds = NULL,
+ .binds_len = 0,
+ .binds_capacity = 0,
+ .wait_count = submit->wait_count,
+ .signal_count = submit->signal_count,
+ .waits = submit->waits,
+ .signals = submit->signals,
+ };
- pool = &device->instruction_state_pool.block_pool;
- anv_block_pool_foreach_bo(bo, pool) {
- result = anv_execbuf_add_bo(device, execbuf, bo, NULL, 0);
- if (result != VK_SUCCESS)
- return result;
- }
+ for (uint32_t i = 0; i < submit->buffer_bind_count; i++) {
+ VkSparseBufferMemoryBindInfo *bind_info = &submit->buffer_binds[i];
+ ANV_FROM_HANDLE(anv_buffer, buffer, bind_info->buffer);
- pool = &device->binding_table_pool.block_pool;
- anv_block_pool_foreach_bo(bo, pool) {
- result = anv_execbuf_add_bo(device, execbuf, bo, NULL, 0);
- if (result != VK_SUCCESS)
- return result;
- }
+ assert(anv_buffer_is_sparse(buffer));
- /* Add the BOs for all user allocated memory objects because we can't
- * track after binding updates of VK_EXT_descriptor_indexing.
- */
- list_for_each_entry(struct anv_device_memory, mem,
- &device->memory_objects, link) {
- result = anv_execbuf_add_bo(device, execbuf, mem->bo, NULL, 0);
+ for (uint32_t j = 0; j < bind_info->bindCount; j++) {
+ result = anv_sparse_bind_buffer(device, buffer,
+ &bind_info->pBinds[j],
+ &sparse_submit);
if (result != VK_SUCCESS)
- return result;
+ goto out_free_submit;
}
- } else {
- /* We do not support chaining primary command buffers without
- * softpin.
- */
- assert(num_cmd_buffers == 1);
}
- bool no_reloc = true;
- if (execbuf->has_relocs) {
- no_reloc = execbuf_can_skip_relocations(execbuf);
- if (no_reloc) {
- /* If we were able to successfully relocate everything, tell the
- * kernel that it can skip doing relocations. The requirement for
- * using NO_RELOC is:
- *
- * 1) The addresses written in the objects must match the
- * corresponding reloc.presumed_offset which in turn must match
- * the corresponding execobject.offset.
- *
- * 2) To avoid stalling, execobject.offset should match the current
- * address of that object within the active context.
- *
- * In order to satisfy all of the invariants that make userspace
- * relocations to be safe (see relocate_cmd_buffer()), we need to
- * further ensure that the addresses we use match those used by the
- * kernel for the most recent execbuf2.
- *
- * The kernel may still choose to do relocations anyway if something
- * has moved in the GTT. In this case, the relocation list still
- * needs to be valid. All relocations on the batch buffers are
- * already valid and kept up-to-date. For surface state relocations,
- * by applying the relocations in relocate_cmd_buffer, we ensured
- * that the address in the RENDER_SURFACE_STATE matches
- * presumed_offset, so it should be safe for the kernel to relocate
- * them as needed.
- */
- for (uint32_t i = 0; i < num_cmd_buffers; i++) {
- relocate_cmd_buffer(cmd_buffers[i], execbuf);
+ for (uint32_t i = 0; i < submit->image_bind_count; i++) {
+ VkSparseImageMemoryBindInfo *bind_info = &submit->image_binds[i];
+ ANV_FROM_HANDLE(anv_image, image, bind_info->image);
- anv_reloc_list_apply(device, &cmd_buffers[i]->surface_relocs,
- device->surface_state_pool.block_pool.bo,
- true /* always relocate surface states */);
- }
- } else {
- /* In the case where we fall back to doing kernel relocations, we
- * need to ensure that the relocation list is valid. All relocations
- * on the batch buffers are already valid and kept up-to-date. Since
- * surface states are shared between command buffers and we don't
- * know what order they will be submitted to the kernel, we don't
- * know what address is actually written in the surface state object
- * at any given time. The only option is to set a bogus presumed
- * offset and let the kernel relocate them.
- */
- for (uint32_t i = 0; i < num_cmd_buffers; i++)
- reset_cmd_buffer_surface_offsets(cmd_buffers[i]);
+ assert(anv_image_is_sparse(image));
+ assert(image->vk.create_flags & VK_IMAGE_CREATE_SPARSE_RESIDENCY_BIT);
+
+ for (uint32_t j = 0; j < bind_info->bindCount; j++) {
+ result = anv_sparse_bind_image_memory(queue, image,
+ &bind_info->pBinds[j],
+ &sparse_submit);
+ if (result != VK_SUCCESS)
+ goto out_free_submit;
}
}
- struct anv_batch_bo *first_batch_bo =
- list_first_entry(&cmd_buffers[0]->batch_bos, struct anv_batch_bo, link);
-
- /* The kernel requires that the last entry in the validation list be the
- * batch buffer to execute. We can simply swap the element
- * corresponding to the first batch_bo in the chain with the last
- * element in the list.
- */
- if (first_batch_bo->bo->index != execbuf->bo_count - 1) {
- uint32_t idx = first_batch_bo->bo->index;
- uint32_t last_idx = execbuf->bo_count - 1;
-
- struct drm_i915_gem_exec_object2 tmp_obj = execbuf->objects[idx];
- assert(execbuf->bos[idx] == first_batch_bo->bo);
+ for (uint32_t i = 0; i < submit->image_opaque_bind_count; i++) {
+ VkSparseImageOpaqueMemoryBindInfo *bind_info =
+ &submit->image_opaque_binds[i];
+ ANV_FROM_HANDLE(anv_image, image, bind_info->image);
- execbuf->objects[idx] = execbuf->objects[last_idx];
- execbuf->bos[idx] = execbuf->bos[last_idx];
- execbuf->bos[idx]->index = idx;
+ assert(anv_image_is_sparse(image));
- execbuf->objects[last_idx] = tmp_obj;
- execbuf->bos[last_idx] = first_batch_bo->bo;
- first_batch_bo->bo->index = last_idx;
+ for (uint32_t j = 0; j < bind_info->bindCount; j++) {
+ result = anv_sparse_bind_image_opaque(device, image,
+ &bind_info->pBinds[j],
+ &sparse_submit);
+ if (result != VK_SUCCESS)
+ goto out_free_submit;
+ }
}
- /* If we are pinning our BOs, we shouldn't have to relocate anything */
- if (device->physical->use_softpin)
- assert(!execbuf->has_relocs);
+ result = anv_sparse_bind(device, &sparse_submit);
- /* Now we go through and fixup all of the relocation lists to point to the
- * correct indices in the object array (I915_EXEC_HANDLE_LUT). We have to
- * do this after we reorder the list above as some of the indices may have
- * changed.
- */
- struct anv_batch_bo **bbo;
- if (execbuf->has_relocs) {
- assert(num_cmd_buffers == 1);
- u_vector_foreach(bbo, &cmd_buffers[0]->seen_bbos)
- anv_cmd_buffer_process_relocs(cmd_buffers[0], &(*bbo)->relocs);
+out_free_submit:
+ vk_free(&device->vk.alloc, sparse_submit.binds);
+ return result;
+}
- anv_cmd_buffer_process_relocs(cmd_buffers[0], &cmd_buffers[0]->surface_relocs);
- }
+static VkResult
+anv_queue_submit_cmd_buffers_locked(struct anv_queue *queue,
+ struct vk_queue_submit *submit,
+ struct anv_utrace_submit *utrace_submit)
+{
+ VkResult result;
- if (!device->info.has_llc) {
- __builtin_ia32_mfence();
- for (uint32_t i = 0; i < num_cmd_buffers; i++) {
- u_vector_foreach(bbo, &cmd_buffers[i]->seen_bbos) {
- for (uint32_t i = 0; i < (*bbo)->length; i += CACHELINE_SIZE)
- __builtin_ia32_clflush((*bbo)->bo->map + i);
+ if (submit->command_buffer_count == 0) {
+ result = anv_queue_exec_locked(queue, submit->wait_count, submit->waits,
+ 0 /* cmd_buffer_count */,
+ NULL /* cmd_buffers */,
+ submit->signal_count, submit->signals,
+ NULL /* perf_query_pool */,
+ 0 /* perf_query_pass */,
+ utrace_submit);
+ if (result != VK_SUCCESS)
+ return result;
+ } else {
+ /* Everything's easier if we don't have to bother with container_of() */
+ STATIC_ASSERT(offsetof(struct anv_cmd_buffer, vk) == 0);
+ struct vk_command_buffer **vk_cmd_buffers = submit->command_buffers;
+ struct anv_cmd_buffer **cmd_buffers = (void *)vk_cmd_buffers;
+ uint32_t start = 0;
+ uint32_t end = submit->command_buffer_count;
+ struct anv_query_pool *perf_query_pool =
+ cmd_buffers[start]->perf_query_pool;
+ for (uint32_t n = 0; n < end; n++) {
+ bool can_chain = false;
+ uint32_t next = n + 1;
+ /* Can we chain the last buffer into the next one? */
+ if (next < end &&
+ anv_cmd_buffer_is_chainable(cmd_buffers[n]) &&
+ anv_cmd_buffer_is_chainable(cmd_buffers[next]) &&
+ can_chain_query_pools
+ (cmd_buffers[next]->perf_query_pool, perf_query_pool)) {
+ can_chain = true;
+ perf_query_pool =
+ perf_query_pool ? perf_query_pool :
+ cmd_buffers[next]->perf_query_pool;
+ }
+ if (!can_chain) {
+ /* The next buffer cannot be chained, or we have reached the
+ * last buffer; submit what has been chained so far.
+ */
+ VkResult result =
+ anv_queue_exec_locked(queue,
+ start == 0 ? submit->wait_count : 0,
+ start == 0 ? submit->waits : NULL,
+ next - start, &cmd_buffers[start],
+ next == end ? submit->signal_count : 0,
+ next == end ? submit->signals : NULL,
+ perf_query_pool,
+ submit->perf_pass_index,
+ next == end ? utrace_submit : NULL);
+ if (result != VK_SUCCESS)
+ return result;
+ if (next < end) {
+ start = next;
+ perf_query_pool = cmd_buffers[start]->perf_query_pool;
+ }
}
}
}
+ for (uint32_t i = 0; i < submit->signal_count; i++) {
+ if (!vk_sync_is_anv_bo_sync(submit->signals[i].sync))
+ continue;
- struct anv_batch *batch = &cmd_buffers[0]->batch;
- execbuf->execbuf = (struct drm_i915_gem_execbuffer2) {
- .buffers_ptr = (uintptr_t) execbuf->objects,
- .buffer_count = execbuf->bo_count,
- .batch_start_offset = 0,
- /* On platforms that cannot chain batch buffers because of the i915
- * command parser, we have to provide the batch length. Everywhere else
- * we'll chain batches so no point in passing a length.
+ struct anv_bo_sync *bo_sync =
+ container_of(submit->signals[i].sync, struct anv_bo_sync, sync);
+
+ /* Once the execbuf has returned, we need to set the fence state to
+ * SUBMITTED. We can't do this before calling execbuf because
+ * anv_GetFenceStatus does take the global device lock before checking
+ * fence->state.
+ *
+ * We set the fence state to SUBMITTED regardless of whether or not the
+ * execbuf succeeds because we need to ensure that vkWaitForFences() and
+ * vkGetFenceStatus() return a valid result (VK_ERROR_DEVICE_LOST or
+ * VK_SUCCESS) in a finite amount of time even if execbuf fails.
*/
- .batch_len = device->can_chain_batches ? 0 : batch->next - batch->start,
- .cliprects_ptr = 0,
- .num_cliprects = 0,
- .DR1 = 0,
- .DR4 = 0,
- .flags = I915_EXEC_HANDLE_LUT | queue->exec_flags | (no_reloc ? I915_EXEC_NO_RELOC : 0),
- .rsvd1 = device->context_id,
- .rsvd2 = 0,
- };
-
- return VK_SUCCESS;
-}
-
-static VkResult
-setup_empty_execbuf(struct anv_execbuf *execbuf, struct anv_queue *queue)
-{
- struct anv_device *device = queue->device;
- VkResult result = anv_execbuf_add_bo(device, execbuf,
- device->trivial_batch_bo,
- NULL, 0);
- if (result != VK_SUCCESS)
- return result;
+ assert(bo_sync->state == ANV_BO_SYNC_STATE_RESET);
+ bo_sync->state = ANV_BO_SYNC_STATE_SUBMITTED;
+ }
- execbuf->execbuf = (struct drm_i915_gem_execbuffer2) {
- .buffers_ptr = (uintptr_t) execbuf->objects,
- .buffer_count = execbuf->bo_count,
- .batch_start_offset = 0,
- .batch_len = 8, /* GFX7_MI_BATCH_BUFFER_END and NOOP */
- .flags = I915_EXEC_HANDLE_LUT | queue->exec_flags | I915_EXEC_NO_RELOC,
- .rsvd1 = device->context_id,
- .rsvd2 = 0,
- };
+ pthread_cond_broadcast(&queue->device->queue_submit);
return VK_SUCCESS;
}
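The grouping loop above is the subtle part of this function, so here is a self-contained sketch of the same partitioning logic under hypothetical model_* names: consecutive chainable command buffers with compatible perf query pools are folded into a single submission, with the submit's waits attached to the first group and its signals to the last.

#include <stdbool.h>
#include <stdio.h>

struct model_cmd_buffer {
   bool chainable;
   int  perf_query_pool; /* 0 means "none" */
};

static bool pools_compatible(int p1, int p2)
{
   return p1 == 0 || p2 == 0 || p1 == p2;
}

static void submit_group(int start, int count, bool first, bool last)
{
   printf("submit [%d..%d): waits=%s signals=%s\n",
          start, start + count, first ? "yes" : "no", last ? "yes" : "no");
}

int main(void)
{
   struct model_cmd_buffer cbs[] = {
      { true, 0 }, { true, 1 }, { false, 0 }, { true, 0 },
   };
   int end = 4, start = 0, pool = cbs[0].perf_query_pool;

   for (int n = 0; n < end; n++) {
      int next = n + 1;
      bool can_chain = next < end && cbs[n].chainable && cbs[next].chainable &&
                       pools_compatible(cbs[next].perf_query_pool, pool);
      if (can_chain) {
         pool = pool ? pool : cbs[next].perf_query_pool;
         continue;
      }
      submit_group(start, next - start, start == 0, next == end);
      if (next < end) {
         start = next;
         pool = cbs[start].perf_query_pool;
      }
   }
   return 0;
}

With this example input the model prints three groups: [0..2) carrying the waits, [2..3), and [3..4) carrying the signals.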
-/* We lock around execbuf for three main reasons:
- *
- * 1) When a block pool is resized, we create a new gem handle with a
- * different size and, in the case of surface states, possibly a different
- * center offset but we re-use the same anv_bo struct when we do so. If
- * this happens in the middle of setting up an execbuf, we could end up
- * with our list of BOs out of sync with our list of gem handles.
- *
- * 2) The algorithm we use for building the list of unique buffers isn't
- * thread-safe. While the client is supposed to syncronize around
- * QueueSubmit, this would be extremely difficult to debug if it ever came
- * up in the wild due to a broken app. It's better to play it safe and
- * just lock around QueueSubmit.
- *
- * 3) The anv_cmd_buffer_execbuf function may perform relocations in
- * userspace. Due to the fact that the surface state buffer is shared
- * between batches, we can't afford to have that happen from multiple
- * threads at the same time. Even though the user is supposed to ensure
- * this doesn't happen, we play it safe as in (2) above.
- *
- * Since the only other things that ever take the device lock such as block
- * pool resize only rarely happen, this will almost never be contended so
- * taking a lock isn't really an expensive operation in this case.
- */
VkResult
-anv_queue_execbuf_locked(struct anv_queue *queue,
- struct anv_queue_submit *submit)
+anv_queue_submit(struct vk_queue *vk_queue,
+ struct vk_queue_submit *submit)
{
+ struct anv_queue *queue = container_of(vk_queue, struct anv_queue, vk);
struct anv_device *device = queue->device;
- struct anv_execbuf execbuf;
- anv_execbuf_init(&execbuf);
- execbuf.alloc = submit->alloc;
- execbuf.alloc_scope = submit->alloc_scope;
- execbuf.perf_query_pass = submit->perf_query_pass;
-
- /* Always add the workaround BO as it includes a driver identifier for the
- * error_state.
+ VkResult result;
+
+ if (queue->device->info->no_hw) {
+ for (uint32_t i = 0; i < submit->signal_count; i++) {
+ result = vk_sync_signal(&device->vk,
+ submit->signals[i].sync,
+ submit->signals[i].signal_value);
+ if (result != VK_SUCCESS)
+ return vk_queue_set_lost(&queue->vk, "vk_sync_signal failed");
+ }
+ return VK_SUCCESS;
+ }
+
+ /* Flush the trace points first before taking the lock as the flushing
+ * might try to take that same lock.
*/
- VkResult result =
- anv_execbuf_add_bo(device, &execbuf, device->workaround_bo, NULL, 0);
+ struct anv_utrace_submit *utrace_submit = NULL;
+ result = anv_device_utrace_flush_cmd_buffers(
+ queue,
+ submit->command_buffer_count,
+ (struct anv_cmd_buffer **)submit->command_buffers,
+ &utrace_submit);
if (result != VK_SUCCESS)
- goto error;
+ return result;
- for (uint32_t i = 0; i < submit->fence_bo_count; i++) {
- int signaled;
- struct anv_bo *bo = anv_unpack_ptr(submit->fence_bos[i], 1, &signaled);
+ pthread_mutex_lock(&device->mutex);
- result = anv_execbuf_add_bo(device, &execbuf, bo, NULL,
- signaled ? EXEC_OBJECT_WRITE : 0);
- if (result != VK_SUCCESS)
- goto error;
- }
+ uint64_t start_ts = intel_ds_begin_submit(&queue->ds);
- if (submit->cmd_buffer_count) {
- result = setup_execbuf_for_cmd_buffers(&execbuf, queue,
- submit->cmd_buffers,
- submit->cmd_buffer_count);
- } else if (submit->simple_bo) {
- result = anv_execbuf_add_bo(device, &execbuf, submit->simple_bo, NULL, 0);
- if (result != VK_SUCCESS)
- goto error;
-
- execbuf.execbuf = (struct drm_i915_gem_execbuffer2) {
- .buffers_ptr = (uintptr_t) execbuf.objects,
- .buffer_count = execbuf.bo_count,
- .batch_start_offset = 0,
- .batch_len = submit->simple_bo_size,
- .flags = I915_EXEC_HANDLE_LUT | queue->exec_flags | I915_EXEC_NO_RELOC,
- .rsvd1 = device->context_id,
- .rsvd2 = 0,
- };
+ if (submit->buffer_bind_count ||
+ submit->image_opaque_bind_count ||
+ submit->image_bind_count) {
+ result = anv_queue_submit_sparse_bind_locked(queue, submit);
} else {
- result = setup_empty_execbuf(&execbuf, queue);
+ result = anv_queue_submit_cmd_buffers_locked(queue, submit,
+ utrace_submit);
}
- if (result != VK_SUCCESS)
- goto error;
+ /* Take submission ID under lock */
+ intel_ds_end_submit(&queue->ds, start_ts);
- const bool has_perf_query =
- submit->perf_query_pass >= 0 &&
- submit->cmd_buffer_count &&
- submit->perf_query_pool;
+ pthread_mutex_unlock(&device->mutex);
- if (INTEL_DEBUG & DEBUG_SUBMIT) {
- fprintf(stderr, "Batch offset=0x%x len=0x%x on queue 0\n",
- execbuf.execbuf.batch_start_offset, execbuf.execbuf.batch_len);
- for (uint32_t i = 0; i < execbuf.bo_count; i++) {
- const struct anv_bo *bo = execbuf.bos[i];
+ intel_ds_device_process(&device->ds, true);
- fprintf(stderr, " BO: addr=0x%016"PRIx64" size=%010"PRIx64" handle=%05u name=%s\n",
- bo->offset, bo->size, bo->gem_handle, bo->name);
- }
- }
+ return result;
+}
- if (INTEL_DEBUG & DEBUG_BATCH) {
- fprintf(stderr, "Batch on queue %d\n", (int)(queue - device->queues));
- if (submit->cmd_buffer_count) {
- if (has_perf_query) {
- struct anv_query_pool *query_pool = submit->perf_query_pool;
- struct anv_bo *pass_batch_bo = query_pool->bo;
- uint64_t pass_batch_offset =
- khr_perf_query_preamble_offset(query_pool,
- submit->perf_query_pass);
-
- intel_print_batch(&device->decoder_ctx,
- pass_batch_bo->map + pass_batch_offset, 64,
- pass_batch_bo->offset + pass_batch_offset, false);
- }
+VkResult
+anv_queue_submit_simple_batch(struct anv_queue *queue,
+ struct anv_batch *batch,
+ bool is_companion_rcs_batch)
+{
+ struct anv_device *device = queue->device;
+ VkResult result = VK_SUCCESS;
- for (uint32_t i = 0; i < submit->cmd_buffer_count; i++) {
- struct anv_batch_bo **bo =
- u_vector_tail(&submit->cmd_buffers[i]->seen_bbos);
- device->cmd_buffer_being_decoded = submit->cmd_buffers[i];
- intel_print_batch(&device->decoder_ctx, (*bo)->bo->map,
- (*bo)->bo->size, (*bo)->bo->offset, false);
- device->cmd_buffer_being_decoded = NULL;
- }
- } else if (submit->simple_bo) {
- intel_print_batch(&device->decoder_ctx, submit->simple_bo->map,
- submit->simple_bo->size, submit->simple_bo->offset, false);
- } else {
- intel_print_batch(&device->decoder_ctx,
- device->trivial_batch_bo->map,
- device->trivial_batch_bo->size,
- device->trivial_batch_bo->offset, false);
- }
- }
+ if (anv_batch_has_error(batch))
+ return batch->status;
- if (submit->fence_count > 0) {
- if (device->has_thread_submit) {
- execbuf.timeline_fences.fence_count = submit->fence_count;
- execbuf.timeline_fences.handles_ptr = (uintptr_t)submit->fences;
- execbuf.timeline_fences.values_ptr = (uintptr_t)submit->fence_values;
- anv_execbuf_add_ext(&execbuf,
- DRM_I915_GEM_EXECBUFFER_EXT_TIMELINE_FENCES,
- &execbuf.timeline_fences.base);
- } else {
- execbuf.execbuf.flags |= I915_EXEC_FENCE_ARRAY;
- execbuf.execbuf.num_cliprects = submit->fence_count;
- execbuf.execbuf.cliprects_ptr = (uintptr_t)submit->fences;
- }
- }
+ if (queue->device->info->no_hw)
+ return VK_SUCCESS;
- if (submit->in_fence != -1) {
- assert(!device->has_thread_submit);
- execbuf.execbuf.flags |= I915_EXEC_FENCE_IN;
- execbuf.execbuf.rsvd2 |= (uint32_t)submit->in_fence;
- }
+ /* This is only used by device init so we can assume the queue is empty and
+ * we aren't fighting with a submit thread.
+ */
+ assert(vk_queue_is_empty(&queue->vk));
+
+ uint32_t batch_size = align(batch->next - batch->start, 8);
+
+ struct anv_bo *batch_bo = NULL;
+ result = anv_bo_pool_alloc(&device->batch_bo_pool, batch_size, &batch_bo);
+ if (result != VK_SUCCESS)
+ return result;
- if (submit->need_out_fence) {
- assert(!device->has_thread_submit);
- execbuf.execbuf.flags |= I915_EXEC_FENCE_OUT;
+ memcpy(batch_bo->map, batch->start, batch_size);
+#ifdef SUPPORT_INTEL_INTEGRATED_GPUS
+ if (device->physical->memory.need_flush &&
+ anv_bo_needs_host_cache_flush(batch_bo->alloc_flags))
+ intel_flush_range(batch_bo->map, batch_size);
+#endif
+
+ if (INTEL_DEBUG(DEBUG_BATCH) &&
+ intel_debug_batch_in_range(device->debug_frame_desc->frame_id)) {
+ int render_queue_idx =
+ anv_get_first_render_queue_index(device->physical);
+ struct intel_batch_decode_ctx *ctx = is_companion_rcs_batch ?
+ &device->decoder[render_queue_idx] :
+ queue->decoder;
+ intel_print_batch(ctx, batch_bo->map, batch_bo->size, batch_bo->offset,
+ false);
}
- if (has_perf_query) {
- struct anv_query_pool *query_pool = submit->perf_query_pool;
- assert(submit->perf_query_pass < query_pool->n_passes);
- struct intel_perf_query_info *query_info =
- query_pool->pass_query[submit->perf_query_pass];
+ result = device->kmd_backend->execute_simple_batch(queue, batch_bo,
+ batch_size,
+ is_companion_rcs_batch);
- /* Some performance queries just the pipeline statistic HW, no need for
- * OA in that case, so no need to reconfigure.
- */
- if ((INTEL_DEBUG & DEBUG_NO_OACONFIG) == 0 &&
- (query_info->kind == INTEL_PERF_QUERY_TYPE_OA ||
- query_info->kind == INTEL_PERF_QUERY_TYPE_RAW)) {
- int ret = intel_ioctl(device->perf_fd, I915_PERF_IOCTL_CONFIG,
- (void *)(uintptr_t) query_info->oa_metrics_set_id);
- if (ret < 0) {
- result = anv_device_set_lost(device,
- "i915-perf config failed: %s",
- strerror(errno));
- }
- }
+ anv_bo_pool_free(&device->batch_bo_pool, batch_bo);
- struct anv_bo *pass_batch_bo = query_pool->bo;
+ return result;
+}
- struct drm_i915_gem_exec_object2 query_pass_object = {
- .handle = pass_batch_bo->gem_handle,
- .offset = pass_batch_bo->offset,
- .flags = pass_batch_bo->flags,
- };
- struct drm_i915_gem_execbuffer2 query_pass_execbuf = {
- .buffers_ptr = (uintptr_t) &query_pass_object,
- .buffer_count = 1,
- .batch_start_offset = khr_perf_query_preamble_offset(query_pool,
- submit->perf_query_pass),
- .flags = I915_EXEC_HANDLE_LUT | queue->exec_flags,
- .rsvd1 = device->context_id,
- };
+VkResult
+anv_queue_submit_trtt_batch(struct anv_sparse_submission *submit,
+ struct anv_batch *batch)
+{
+ struct anv_queue *queue = submit->queue;
+ struct anv_device *device = queue->device;
+ VkResult result = VK_SUCCESS;
- int ret = queue->device->info.no_hw ? 0 :
- anv_gem_execbuffer(queue->device, &query_pass_execbuf);
- if (ret)
- result = anv_queue_set_lost(queue, "execbuf2 failed: %m");
- }
+ uint32_t batch_size = align(batch->next - batch->start, 8);
+ struct anv_trtt_batch_bo *trtt_bbo;
+ result = anv_trtt_batch_bo_new(device, batch_size, &trtt_bbo);
+ if (result != VK_SUCCESS)
+ return result;
- int ret = queue->device->info.no_hw ? 0 :
- anv_gem_execbuffer(queue->device, &execbuf.execbuf);
- if (ret)
- result = anv_queue_set_lost(queue, "execbuf2 failed: %m");
+ memcpy(trtt_bbo->bo->map, batch->start, trtt_bbo->size);
+#ifdef SUPPORT_INTEL_INTEGRATED_GPUS
+ if (device->physical->memory.need_flush &&
+ anv_bo_needs_host_cache_flush(trtt_bbo->bo->alloc_flags))
+ intel_flush_range(trtt_bbo->bo->map, trtt_bbo->size);
+#endif
- struct drm_i915_gem_exec_object2 *objects = execbuf.objects;
- for (uint32_t k = 0; k < execbuf.bo_count; k++) {
- if (execbuf.bos[k]->flags & EXEC_OBJECT_PINNED)
- assert(execbuf.bos[k]->offset == objects[k].offset);
- execbuf.bos[k]->offset = objects[k].offset;
+ if (INTEL_DEBUG(DEBUG_BATCH)) {
+ intel_print_batch(queue->decoder, trtt_bbo->bo->map, trtt_bbo->bo->size,
+ trtt_bbo->bo->offset, false);
}
- if (result == VK_SUCCESS && submit->need_out_fence)
- submit->out_fence = execbuf.execbuf.rsvd2 >> 32;
+ result = device->kmd_backend->execute_trtt_batch(submit, trtt_bbo);
- error:
- pthread_cond_broadcast(&device->queue_submit);
+ return result;
+}
- anv_execbuf_finish(&execbuf);
+void
+anv_cmd_buffer_clflush(struct anv_cmd_buffer **cmd_buffers,
+ uint32_t num_cmd_buffers)
+{
+#ifdef SUPPORT_INTEL_INTEGRATED_GPUS
+ struct anv_batch_bo **bbo;
- return result;
+ __builtin_ia32_mfence();
+
+ for (uint32_t i = 0; i < num_cmd_buffers; i++) {
+ u_vector_foreach(bbo, &cmd_buffers[i]->seen_bbos) {
+ intel_flush_range_no_fence((*bbo)->bo->map, (*bbo)->length);
+ }
+ }
+
+ __builtin_ia32_mfence();
+#endif
}
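For reference, here is a standalone sketch of the cacheline flush pattern used above on integrated GPUs without a coherent LLC, assuming x86 and a 64-byte cacheline; the intel_flush_range_no_fence() call in the driver is expected to do an equivalent walk over each batch BO map.

#include <stdint.h>
#include <stddef.h>
#include <stdio.h>

#define MODEL_CACHELINE_SIZE 64

/* Flush every cacheline covering [start, start + size), with fences around
 * the walk as in anv_cmd_buffer_clflush() above. */
static void model_flush_range(void *start, size_t size)
{
   char *p = (char *)((uintptr_t)start & ~(uintptr_t)(MODEL_CACHELINE_SIZE - 1));
   char *end = (char *)start + size;

   __builtin_ia32_mfence();
   for (; p < end; p += MODEL_CACHELINE_SIZE)
      __builtin_ia32_clflush(p);
   __builtin_ia32_mfence();
}

int main(void)
{
   static char batch[4096];
   model_flush_range(batch, sizeof(batch));
   printf("flushed %zu bytes\n", sizeof(batch));
   return 0;
}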
diff --git a/src/intel/vulkan/anv_blorp.c b/src/intel/vulkan/anv_blorp.c
index 765df4c5cf2..1fec49fdedd 100644
--- a/src/intel/vulkan/anv_blorp.c
+++ b/src/intel/vulkan/anv_blorp.c
@@ -22,6 +22,7 @@
*/
#include "anv_private.h"
+#include "genxml/gen8_pack.h"
static bool
lookup_blorp_shader(struct blorp_batch *batch,
@@ -31,11 +32,9 @@ lookup_blorp_shader(struct blorp_batch *batch,
struct blorp_context *blorp = batch->blorp;
struct anv_device *device = blorp->driver_ctx;
- /* The default cache must be a real cache */
- assert(device->default_pipeline_cache.cache);
-
struct anv_shader_bin *bin =
- anv_pipeline_cache_search(&device->default_pipeline_cache, key, key_size);
+ anv_device_search_for_kernel(device, device->internal_cache,
+ key, key_size, NULL);
if (!bin)
return false;
@@ -54,26 +53,29 @@ static bool
upload_blorp_shader(struct blorp_batch *batch, uint32_t stage,
const void *key, uint32_t key_size,
const void *kernel, uint32_t kernel_size,
- const struct brw_stage_prog_data *prog_data,
+ const void *prog_data,
uint32_t prog_data_size,
uint32_t *kernel_out, void *prog_data_out)
{
struct blorp_context *blorp = batch->blorp;
struct anv_device *device = blorp->driver_ctx;
- /* The blorp cache must be a real cache */
- assert(device->default_pipeline_cache.cache);
-
- struct anv_pipeline_bind_map bind_map = {
- .surface_count = 0,
- .sampler_count = 0,
+ struct anv_pipeline_bind_map empty_bind_map = {};
+ struct anv_push_descriptor_info empty_push_desc_info = {};
+ struct anv_shader_upload_params upload_params = {
+ .stage = stage,
+ .key_data = key,
+ .key_size = key_size,
+ .kernel_data = kernel,
+ .kernel_size = kernel_size,
+ .prog_data = prog_data,
+ .prog_data_size = prog_data_size,
+ .bind_map = &empty_bind_map,
+ .push_desc_info = &empty_push_desc_info,
};
struct anv_shader_bin *bin =
- anv_pipeline_cache_upload_kernel(&device->default_pipeline_cache, stage,
- key, key_size, kernel, kernel_size,
- prog_data, prog_data_size,
- NULL, 0, NULL, &bind_map);
+ anv_device_upload_kernel(device, device->internal_cache, &upload_params);
if (!bin)
return false;
@@ -89,84 +91,142 @@ upload_blorp_shader(struct blorp_batch *batch, uint32_t stage,
return true;
}
+static void
+upload_dynamic_state(struct blorp_context *context,
+ const void *data, uint32_t size,
+ uint32_t alignment, enum blorp_dynamic_state name)
+{
+ struct anv_device *device = context->driver_ctx;
+
+ device->blorp.dynamic_states[name].state =
+ anv_state_pool_emit_data(&device->dynamic_state_pool,
+ size, alignment, data);
+ if (device->vk.enabled_extensions.EXT_descriptor_buffer) {
+ device->blorp.dynamic_states[name].db_state =
+ anv_state_pool_emit_data(&device->dynamic_state_db_pool,
+ size, alignment, data);
+ }
+}
+
void
anv_device_init_blorp(struct anv_device *device)
{
- blorp_init(&device->blorp, device, &device->isl_dev);
- device->blorp.compiler = device->physical->compiler;
- device->blorp.lookup_shader = lookup_blorp_shader;
- device->blorp.upload_shader = upload_blorp_shader;
- switch (device->info.verx10) {
- case 70:
- device->blorp.exec = gfx7_blorp_exec;
- break;
- case 75:
- device->blorp.exec = gfx75_blorp_exec;
- break;
- case 80:
- device->blorp.exec = gfx8_blorp_exec;
- break;
- case 90:
- device->blorp.exec = gfx9_blorp_exec;
- break;
- case 110:
- device->blorp.exec = gfx11_blorp_exec;
- break;
- case 120:
- device->blorp.exec = gfx12_blorp_exec;
- break;
- case 125:
- device->blorp.exec = gfx125_blorp_exec;
- break;
- default:
- unreachable("Unknown hardware generation");
- }
+ const struct blorp_config config = {
+ .use_mesh_shading = device->vk.enabled_extensions.EXT_mesh_shader,
+ .use_unrestricted_depth_range =
+ device->vk.enabled_extensions.EXT_depth_range_unrestricted,
+ .use_cached_dynamic_states = true,
+ };
+
+ blorp_init_brw(&device->blorp.context, device, &device->isl_dev,
+ device->physical->compiler, &config);
+ device->blorp.context.lookup_shader = lookup_blorp_shader;
+ device->blorp.context.upload_shader = upload_blorp_shader;
+ device->blorp.context.enable_tbimr = device->physical->instance->enable_tbimr;
+ device->blorp.context.exec = anv_genX(device->info, blorp_exec);
+ device->blorp.context.upload_dynamic_state = upload_dynamic_state;
+
+ anv_genX(device->info, blorp_init_dynamic_states)(&device->blorp.context);
}
void
anv_device_finish_blorp(struct anv_device *device)
{
- blorp_finish(&device->blorp);
+#ifdef HAVE_VALGRIND
+ /* We only need to free these to prevent valgrind errors. The backing
+ * BO will go away in a couple of lines so we don't actually leak.
+ */
+ for (uint32_t i = 0; i < ARRAY_SIZE(device->blorp.dynamic_states); i++) {
+ anv_state_pool_free(&device->dynamic_state_pool,
+ device->blorp.dynamic_states[i].state);
+ if (device->vk.enabled_extensions.EXT_descriptor_buffer) {
+ anv_state_pool_free(&device->dynamic_state_db_pool,
+ device->blorp.dynamic_states[i].db_state);
+ }
+
+ }
+#endif
+ blorp_finish(&device->blorp.context);
+}
+
+static void
+anv_blorp_batch_init(struct anv_cmd_buffer *cmd_buffer,
+ struct blorp_batch *batch, enum blorp_batch_flags flags)
+{
+ VkQueueFlags queue_flags = cmd_buffer->queue_family->queueFlags;
+
+ if (queue_flags & VK_QUEUE_GRAPHICS_BIT) {
+ /* blorp runs on render engine by default */
+ } else if (queue_flags & VK_QUEUE_COMPUTE_BIT) {
+ flags |= BLORP_BATCH_USE_COMPUTE;
+ } else if (queue_flags & VK_QUEUE_TRANSFER_BIT) {
+ flags |= BLORP_BATCH_USE_BLITTER;
+ } else {
+ unreachable("unknown queue family");
+ }
+
+ /* Can't have both flags at the same time. */
+ assert((flags & BLORP_BATCH_USE_BLITTER) == 0 ||
+ (flags & BLORP_BATCH_USE_COMPUTE) == 0);
+
+ blorp_batch_init(&cmd_buffer->device->blorp.context, batch, cmd_buffer, flags);
}
static void
-get_blorp_surf_for_anv_buffer(struct anv_device *device,
- struct anv_buffer *buffer, uint64_t offset,
- uint32_t width, uint32_t height,
- uint32_t row_pitch, enum isl_format format,
- bool is_dest,
- struct blorp_surf *blorp_surf,
- struct isl_surf *isl_surf)
+anv_blorp_batch_finish(struct blorp_batch *batch)
{
- const struct isl_format_layout *fmtl =
- isl_format_get_layout(format);
- bool ok UNUSED;
+ blorp_batch_finish(batch);
+}
- /* ASTC is the only format which doesn't support linear layouts.
- * Create an equivalently sized surface with ISL to get around this.
- */
- if (fmtl->txc == ISL_TXC_ASTC) {
- /* Use an equivalently sized format */
- format = ISL_FORMAT_R32G32B32A32_UINT;
- assert(fmtl->bpb == isl_format_get_layout(format)->bpb);
+static isl_surf_usage_flags_t
+get_usage_flag_for_cmd_buffer(const struct anv_cmd_buffer *cmd_buffer,
+ bool is_dest)
+{
+ isl_surf_usage_flags_t usage;
- /* Shrink the dimensions for the new format */
- width = DIV_ROUND_UP(width, fmtl->bw);
- height = DIV_ROUND_UP(height, fmtl->bh);
+ switch (cmd_buffer->queue_family->engine_class) {
+ case INTEL_ENGINE_CLASS_RENDER:
+ usage = is_dest ? ISL_SURF_USAGE_RENDER_TARGET_BIT :
+ ISL_SURF_USAGE_TEXTURE_BIT;
+ break;
+ case INTEL_ENGINE_CLASS_COMPUTE:
+ usage = is_dest ? ISL_SURF_USAGE_STORAGE_BIT :
+ ISL_SURF_USAGE_TEXTURE_BIT;
+ break;
+ case INTEL_ENGINE_CLASS_COPY:
+ usage = is_dest ? ISL_SURF_USAGE_BLITTER_DST_BIT :
+ ISL_SURF_USAGE_BLITTER_SRC_BIT;
+ break;
+ default:
+ unreachable("Unhandled engine class");
}
+ return usage;
+}
+
+static void
+get_blorp_surf_for_anv_address(struct anv_cmd_buffer *cmd_buffer,
+ struct anv_address address,
+ uint32_t width, uint32_t height,
+ uint32_t row_pitch, enum isl_format format,
+ bool is_dest,
+ struct blorp_surf *blorp_surf,
+ struct isl_surf *isl_surf)
+{
+ bool ok UNUSED;
+ isl_surf_usage_flags_t usage =
+ get_usage_flag_for_cmd_buffer(cmd_buffer, is_dest);
+
*blorp_surf = (struct blorp_surf) {
.surf = isl_surf,
.addr = {
- .buffer = buffer->address.bo,
- .offset = buffer->address.offset + offset,
- .mocs = anv_mocs(device, buffer->address.bo,
- is_dest ? ISL_SURF_USAGE_RENDER_TARGET_BIT
- : ISL_SURF_USAGE_TEXTURE_BIT),
+ .buffer = address.bo,
+ .offset = address.offset,
+ .mocs = anv_mocs(cmd_buffer->device, address.bo, usage),
},
};
- ok = isl_surf_init(&device->isl_dev, isl_surf,
+ ok = isl_surf_init(&cmd_buffer->device->isl_dev, isl_surf,
.dim = ISL_SURF_DIM_2D,
.format = format,
.width = width,
@@ -176,12 +236,26 @@ get_blorp_surf_for_anv_buffer(struct anv_device *device,
.array_len = 1,
.samples = 1,
.row_pitch_B = row_pitch,
- .usage = is_dest ? ISL_SURF_USAGE_RENDER_TARGET_BIT
- : ISL_SURF_USAGE_TEXTURE_BIT,
+ .usage = usage,
.tiling_flags = ISL_TILING_LINEAR_BIT);
assert(ok);
}
+static void
+get_blorp_surf_for_anv_buffer(struct anv_cmd_buffer *cmd_buffer,
+ struct anv_buffer *buffer, uint64_t offset,
+ uint32_t width, uint32_t height,
+ uint32_t row_pitch, enum isl_format format,
+ bool is_dest,
+ struct blorp_surf *blorp_surf,
+ struct isl_surf *isl_surf)
+{
+ get_blorp_surf_for_anv_address(cmd_buffer,
+ anv_address_add(buffer->address, offset),
+ width, height, row_pitch, format,
+ is_dest, blorp_surf, isl_surf);
+}
+
/* Pick something high enough that it won't be used in core and low enough it
* will never map to an extension.
*/
@@ -197,7 +271,7 @@ anv_to_blorp_address(struct anv_address addr)
}
static void
-get_blorp_surf_for_anv_image(const struct anv_device *device,
+get_blorp_surf_for_anv_image(const struct anv_cmd_buffer *cmd_buffer,
const struct anv_image *image,
VkImageAspectFlags aspect,
VkImageUsageFlags usage,
@@ -205,18 +279,19 @@ get_blorp_surf_for_anv_image(const struct anv_device *device,
enum isl_aux_usage aux_usage,
struct blorp_surf *blorp_surf)
{
+ const struct anv_device *device = cmd_buffer->device;
const uint32_t plane = anv_image_aspect_to_plane(image, aspect);
if (layout != ANV_IMAGE_LAYOUT_EXPLICIT_AUX) {
assert(usage != 0);
- aux_usage = anv_layout_to_aux_usage(&device->info, image,
- aspect, usage, layout);
+ aux_usage = anv_layout_to_aux_usage(device->info, image,
+ aspect, usage, layout,
+ cmd_buffer->queue_family->queueFlags);
}
- isl_surf_usage_flags_t mocs_usage =
- (usage & VK_IMAGE_USAGE_TRANSFER_DST_BIT) ?
- ISL_SURF_USAGE_RENDER_TARGET_BIT : ISL_SURF_USAGE_TEXTURE_BIT;
-
+ isl_surf_usage_flags_t isl_usage =
+ get_usage_flag_for_cmd_buffer(cmd_buffer,
+ usage & VK_IMAGE_USAGE_TRANSFER_DST_BIT);
const struct anv_surface *surface = &image->planes[plane].primary_surface;
const struct anv_address address =
anv_image_address(image, &surface->memory_range);
@@ -226,7 +301,7 @@ get_blorp_surf_for_anv_image(const struct anv_device *device,
.addr = {
.buffer = address.bo,
.offset = address.offset,
- .mocs = anv_mocs(device, address.bo, mocs_usage),
+ .mocs = anv_mocs(device, address.bo, isl_usage),
},
};
@@ -242,7 +317,7 @@ get_blorp_surf_for_anv_image(const struct anv_device *device,
blorp_surf->aux_addr = (struct blorp_address) {
.buffer = aux_address.bo,
.offset = aux_address.offset,
- .mocs = anv_mocs(device, aux_address.bo, 0),
+ .mocs = anv_mocs(device, aux_address.bo, isl_usage),
};
}
@@ -267,33 +342,6 @@ get_blorp_surf_for_anv_image(const struct anv_device *device,
}
}
-static bool
-get_blorp_surf_for_anv_shadow_image(const struct anv_device *device,
- const struct anv_image *image,
- VkImageAspectFlags aspect,
- struct blorp_surf *blorp_surf)
-{
-
- const uint32_t plane = anv_image_aspect_to_plane(image, aspect);
- if (!anv_surface_is_valid(&image->planes[plane].shadow_surface))
- return false;
-
- const struct anv_surface *surface = &image->planes[plane].shadow_surface;
- const struct anv_address address =
- anv_image_address(image, &surface->memory_range);
-
- *blorp_surf = (struct blorp_surf) {
- .surf = &surface->isl,
- .addr = {
- .buffer = address.bo,
- .offset = address.offset,
- .mocs = anv_mocs(device, address.bo, ISL_SURF_USAGE_RENDER_TARGET_BIT),
- },
- };
-
- return true;
-}
-
static void
copy_image(struct anv_cmd_buffer *cmd_buffer,
struct blorp_batch *batch,
@@ -301,14 +349,14 @@ copy_image(struct anv_cmd_buffer *cmd_buffer,
VkImageLayout src_image_layout,
struct anv_image *dst_image,
VkImageLayout dst_image_layout,
- const VkImageCopy2KHR *region)
+ const VkImageCopy2 *region)
{
VkOffset3D srcOffset =
- anv_sanitize_image_offset(src_image->vk.image_type, region->srcOffset);
+ vk_image_sanitize_offset(&src_image->vk, region->srcOffset);
VkOffset3D dstOffset =
- anv_sanitize_image_offset(dst_image->vk.image_type, region->dstOffset);
+ vk_image_sanitize_offset(&dst_image->vk, region->dstOffset);
VkExtent3D extent =
- anv_sanitize_image_extent(src_image->vk.image_type, region->extent);
+ vk_image_sanitize_extent(&src_image->vk, region->extent);
const uint32_t dst_level = region->dstSubresource.mipLevel;
unsigned dst_base_layer, layer_count;
@@ -340,12 +388,12 @@ copy_image(struct anv_cmd_buffer *cmd_buffer,
if (util_bitcount(src_mask) > 1) {
anv_foreach_image_aspect_bit(aspect_bit, src_image, src_mask) {
struct blorp_surf src_surf, dst_surf;
- get_blorp_surf_for_anv_image(cmd_buffer->device,
+ get_blorp_surf_for_anv_image(cmd_buffer,
src_image, 1UL << aspect_bit,
VK_IMAGE_USAGE_TRANSFER_SRC_BIT,
src_image_layout, ISL_AUX_USAGE_NONE,
&src_surf);
- get_blorp_surf_for_anv_image(cmd_buffer->device,
+ get_blorp_surf_for_anv_image(cmd_buffer,
dst_image, 1UL << aspect_bit,
VK_IMAGE_USAGE_TRANSFER_DST_BIT,
dst_image_layout, ISL_AUX_USAGE_NONE,
@@ -362,28 +410,17 @@ copy_image(struct anv_cmd_buffer *cmd_buffer,
dstOffset.x, dstOffset.y,
extent.width, extent.height);
}
-
- struct blorp_surf dst_shadow_surf;
- if (get_blorp_surf_for_anv_shadow_image(cmd_buffer->device,
- dst_image,
- 1UL << aspect_bit,
- &dst_shadow_surf)) {
- for (unsigned i = 0; i < layer_count; i++) {
- blorp_copy(batch, &src_surf, src_level, src_base_layer + i,
- &dst_shadow_surf, dst_level, dst_base_layer + i,
- srcOffset.x, srcOffset.y,
- dstOffset.x, dstOffset.y,
- extent.width, extent.height);
- }
- }
}
} else {
+ /* This case handles YCbCr images; the aspect masks are compatible but
+ * don't need to be the same.
+ */
struct blorp_surf src_surf, dst_surf;
- get_blorp_surf_for_anv_image(cmd_buffer->device, src_image, src_mask,
+ get_blorp_surf_for_anv_image(cmd_buffer, src_image, src_mask,
VK_IMAGE_USAGE_TRANSFER_SRC_BIT,
src_image_layout, ISL_AUX_USAGE_NONE,
&src_surf);
- get_blorp_surf_for_anv_image(cmd_buffer->device, dst_image, dst_mask,
+ get_blorp_surf_for_anv_image(cmd_buffer, dst_image, dst_mask,
VK_IMAGE_USAGE_TRANSFER_DST_BIT,
dst_image_layout, ISL_AUX_USAGE_NONE,
&dst_surf);
@@ -398,32 +435,133 @@ copy_image(struct anv_cmd_buffer *cmd_buffer,
dstOffset.x, dstOffset.y,
extent.width, extent.height);
}
+ }
+}
- struct blorp_surf dst_shadow_surf;
- if (get_blorp_surf_for_anv_shadow_image(cmd_buffer->device,
- dst_image, dst_mask,
- &dst_shadow_surf)) {
- for (unsigned i = 0; i < layer_count; i++) {
- blorp_copy(batch, &src_surf, src_level, src_base_layer + i,
- &dst_shadow_surf, dst_level, dst_base_layer + i,
- srcOffset.x, srcOffset.y,
- dstOffset.x, dstOffset.y,
- extent.width, extent.height);
- }
+static struct anv_state
+record_main_rcs_cmd_buffer_done(struct anv_cmd_buffer *cmd_buffer)
+{
+ const struct intel_device_info *info = cmd_buffer->device->info;
+
+ const VkResult result = anv_cmd_buffer_ensure_rcs_companion(cmd_buffer);
+ if (result != VK_SUCCESS) {
+ anv_batch_set_error(&cmd_buffer->batch, result);
+ return ANV_STATE_NULL;
+ }
+
+ assert(cmd_buffer->companion_rcs_cmd_buffer != NULL);
+
+ /* Re-emit the aux table register in every command buffer. This way we
+ * ensure that we have the table even if this command buffer doesn't
+ * initialize any images.
+ */
+ if (cmd_buffer->device->info->has_aux_map) {
+ anv_add_pending_pipe_bits(cmd_buffer->companion_rcs_cmd_buffer,
+ ANV_PIPE_AUX_TABLE_INVALIDATE_BIT,
+ "new cmd buffer with aux-tt");
+ }
+
+ return anv_genX(info, cmd_buffer_begin_companion_rcs_syncpoint)(cmd_buffer);
+}
+
+static void
+end_main_rcs_cmd_buffer_done(struct anv_cmd_buffer *cmd_buffer,
+ struct anv_state syncpoint)
+{
+ const struct intel_device_info *info = cmd_buffer->device->info;
+ anv_genX(info, cmd_buffer_end_companion_rcs_syncpoint)(cmd_buffer,
+ syncpoint);
+}
+
+static bool
+anv_blorp_blitter_execute_on_companion(struct anv_cmd_buffer *cmd_buffer,
+ struct anv_image *image,
+ const VkCopyBufferToImageInfo2* pCopyBufferToImageInfo,
+ const VkCopyImageToBufferInfo2* pCopyImageToBufferInfo)
+{
+ if (!anv_cmd_buffer_is_blitter_queue(cmd_buffer))
+ return false;
+
+ assert((pCopyBufferToImageInfo && !pCopyImageToBufferInfo) ||
+ (pCopyImageToBufferInfo && !pCopyBufferToImageInfo));
+
+ bool blorp_execute_on_companion = false;
+ VkImageAspectFlags aspect_mask = VK_IMAGE_ASPECT_NONE;
+ const uint32_t region_count = pCopyBufferToImageInfo ?
+ pCopyBufferToImageInfo->regionCount :
+ pCopyImageToBufferInfo->regionCount;
+
+ for (unsigned r = 0; r < region_count &&
+ !blorp_execute_on_companion; r++) {
+ if (pCopyBufferToImageInfo) {
+ aspect_mask =
+ pCopyBufferToImageInfo->pRegions[r].imageSubresource.aspectMask;
+ } else {
+ aspect_mask =
+ pCopyImageToBufferInfo->pRegions[r].imageSubresource.aspectMask;
+ }
+
+ enum isl_format linear_format =
+ anv_get_isl_format(cmd_buffer->device->info, image->vk.format,
+ aspect_mask, VK_IMAGE_TILING_LINEAR);
+ const struct isl_format_layout *linear_fmtl =
+ isl_format_get_layout(linear_format);
+
+ switch (linear_fmtl->bpb) {
+ case 96:
+ /* We can only support linear mode for 96bpp on the blitter engine. */
+ blorp_execute_on_companion |=
+ image->vk.tiling != VK_IMAGE_TILING_LINEAR;
+ break;
+ default:
+ blorp_execute_on_companion |= linear_fmtl->bpb % 3 == 0;
+ break;
}
}
+
+ return blorp_execute_on_companion;
+}
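As a quick standalone illustration of the bpb check above (hypothetical model_* names): a 96 bpb copy avoids the companion RCS batch only when the image is linear, while any other bpb divisible by three, e.g. 24 or 48, which typically corresponds to 3-component formats, always routes to the companion path.

#include <stdbool.h>
#include <stdio.h>

static bool model_needs_companion(unsigned bpb, bool image_is_linear)
{
   if (bpb == 96)
      return !image_is_linear; /* blitter handles 96 bpb only when linear */
   return bpb % 3 == 0;        /* e.g. 24 or 48 bpb */
}

int main(void)
{
   const unsigned bpbs[] = { 8, 24, 32, 48, 96, 128 };
   for (unsigned i = 0; i < 6; i++)
      printf("bpb=%3u tiled -> companion RCS: %d\n",
             bpbs[i], model_needs_companion(bpbs[i], false));
   return 0;
}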
+
+static bool
+anv_blorp_execute_on_companion(struct anv_cmd_buffer *cmd_buffer,
+ struct anv_image *dst_image)
+{
+ /* MSAA images have to be dealt with on the companion RCS command buffer
+ * for both the CCS and BCS engines.
+ */
+ if ((anv_cmd_buffer_is_blitter_queue(cmd_buffer) ||
+ anv_cmd_buffer_is_compute_queue(cmd_buffer)) &&
+ dst_image->vk.samples > 1)
+ return true;
+
+ /* Emulation of formats is done through a compute shader, so we need
+ * the companion command buffer for the BCS engine.
+ */
+ if (anv_cmd_buffer_is_blitter_queue(cmd_buffer) &&
+ dst_image->emu_plane_format != VK_FORMAT_UNDEFINED)
+ return true;
+
+ return false;
}
-void anv_CmdCopyImage2KHR(
+void anv_CmdCopyImage2(
VkCommandBuffer commandBuffer,
- const VkCopyImageInfo2KHR* pCopyImageInfo)
+ const VkCopyImageInfo2* pCopyImageInfo)
{
ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
ANV_FROM_HANDLE(anv_image, src_image, pCopyImageInfo->srcImage);
ANV_FROM_HANDLE(anv_image, dst_image, pCopyImageInfo->dstImage);
+ struct anv_cmd_buffer *main_cmd_buffer = cmd_buffer;
+ UNUSED struct anv_state rcs_done = ANV_STATE_NULL;
+
+ if (anv_blorp_execute_on_companion(cmd_buffer, dst_image)) {
+ rcs_done = record_main_rcs_cmd_buffer_done(cmd_buffer);
+ cmd_buffer = cmd_buffer->companion_rcs_cmd_buffer;
+ }
+
struct blorp_batch batch;
- blorp_batch_init(&cmd_buffer->device->blorp, &batch, cmd_buffer, 0);
+ anv_blorp_batch_init(cmd_buffer, &batch, 0);
for (unsigned r = 0; r < pCopyImageInfo->regionCount; r++) {
copy_image(cmd_buffer, &batch,
@@ -432,7 +570,32 @@ void anv_CmdCopyImage2KHR(
&pCopyImageInfo->pRegions[r]);
}
- blorp_batch_finish(&batch);
+ anv_blorp_batch_finish(&batch);
+
+ if (dst_image->emu_plane_format != VK_FORMAT_UNDEFINED) {
+ assert(!anv_cmd_buffer_is_blitter_queue(cmd_buffer));
+ const enum anv_pipe_bits pipe_bits =
+ anv_cmd_buffer_is_compute_queue(cmd_buffer) ?
+ ANV_PIPE_HDC_PIPELINE_FLUSH_BIT :
+ ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT;
+ anv_add_pending_pipe_bits(cmd_buffer, pipe_bits,
+ "Copy flush before astc emu");
+
+ for (unsigned r = 0; r < pCopyImageInfo->regionCount; r++) {
+ const VkImageCopy2 *region = &pCopyImageInfo->pRegions[r];
+ const VkOffset3D block_offset = vk_image_offset_to_elements(
+ &dst_image->vk, region->dstOffset);
+ const VkExtent3D block_extent = vk_image_extent_to_elements(
+ &src_image->vk, region->extent);
+ anv_astc_emu_process(cmd_buffer, dst_image,
+ pCopyImageInfo->dstImageLayout,
+ &region->dstSubresource,
+ block_offset, block_extent);
+ }
+ }
+
+ if (rcs_done.alloc_size)
+ end_main_rcs_cmd_buffer_done(main_cmd_buffer, rcs_done);
}
static enum isl_format
@@ -459,7 +622,7 @@ copy_buffer_to_image(struct anv_cmd_buffer *cmd_buffer,
struct anv_buffer *anv_buffer,
struct anv_image *anv_image,
VkImageLayout image_layout,
- const VkBufferImageCopy2KHR* region,
+ const VkBufferImageCopy2* region,
bool buffer_to_image)
{
struct {
@@ -481,18 +644,18 @@ copy_buffer_to_image(struct anv_cmd_buffer *cmd_buffer,
const VkImageAspectFlags aspect = region->imageSubresource.aspectMask;
- get_blorp_surf_for_anv_image(cmd_buffer->device, anv_image, aspect,
+ get_blorp_surf_for_anv_image(cmd_buffer, anv_image, aspect,
buffer_to_image ?
VK_IMAGE_USAGE_TRANSFER_DST_BIT :
VK_IMAGE_USAGE_TRANSFER_SRC_BIT,
image_layout, ISL_AUX_USAGE_NONE,
&image.surf);
image.offset =
- anv_sanitize_image_offset(anv_image->vk.image_type, region->imageOffset);
+ vk_image_sanitize_offset(&anv_image->vk, region->imageOffset);
image.level = region->imageSubresource.mipLevel;
VkExtent3D extent =
- anv_sanitize_image_extent(anv_image->vk.image_type, region->imageExtent);
+ vk_image_sanitize_extent(&anv_image->vk, region->imageExtent);
if (anv_image->vk.image_type != VK_IMAGE_TYPE_3D) {
image.offset.z = region->imageSubresource.baseArrayLayer;
extent.depth =
@@ -501,32 +664,17 @@ copy_buffer_to_image(struct anv_cmd_buffer *cmd_buffer,
}
const enum isl_format linear_format =
- anv_get_isl_format(&cmd_buffer->device->info, anv_image->vk.format,
+ anv_get_isl_format(cmd_buffer->device->info, anv_image->vk.format,
aspect, VK_IMAGE_TILING_LINEAR);
const struct isl_format_layout *linear_fmtl =
isl_format_get_layout(linear_format);
- const uint32_t buffer_row_length =
- region->bufferRowLength ?
- region->bufferRowLength : extent.width;
-
- const uint32_t buffer_image_height =
- region->bufferImageHeight ?
- region->bufferImageHeight : extent.height;
-
- const uint32_t buffer_row_pitch =
- DIV_ROUND_UP(buffer_row_length, linear_fmtl->bw) *
- (linear_fmtl->bpb / 8);
-
- const uint32_t buffer_layer_stride =
- DIV_ROUND_UP(buffer_image_height, linear_fmtl->bh) *
- buffer_row_pitch;
+ const struct vk_image_buffer_layout buffer_layout =
+ vk_image_buffer_copy_layout(&anv_image->vk, region);
/* Some formats have additional restrictions which may cause ISL to
- * fail to create a surface for us. Some examples include:
- *
- * 1. ASTC formats are not allowed to be LINEAR and must be tiled
- * 2. YCbCr formats have to have 2-pixel aligned strides
+ * fail to create a surface for us. For example, YCbCr formats
+ * have to have 2-pixel aligned strides.
*
* To avoid these issues, we always bind the buffer as if it's a
* "normal" format like RGBA32_UINT. Since we're using blorp_copy,
@@ -540,14 +688,12 @@ copy_buffer_to_image(struct anv_cmd_buffer *cmd_buffer,
isl_format_for_size(linear_fmtl->bpb / 8);
struct isl_surf buffer_isl_surf;
- get_blorp_surf_for_anv_buffer(cmd_buffer->device,
+ get_blorp_surf_for_anv_buffer(cmd_buffer,
anv_buffer, region->bufferOffset,
buffer_extent.width, buffer_extent.height,
- buffer_row_pitch, buffer_format, false,
- &buffer.surf, &buffer_isl_surf);
+ buffer_layout.row_stride_B, buffer_format,
+ false, &buffer.surf, &buffer_isl_surf);
- bool dst_has_shadow = false;
- struct blorp_surf dst_shadow_surf;
if (&image == dst) {
/* In this case, the source is the buffer and, since blorp takes its
* copy dimensions in terms of the source format, we have to use the
@@ -561,11 +707,6 @@ copy_buffer_to_image(struct anv_cmd_buffer *cmd_buffer,
aspect, dst->surf.aux_usage,
dst->level,
dst->offset.z, extent.depth);
-
- dst_has_shadow =
- get_blorp_surf_for_anv_shadow_image(cmd_buffer->device,
- anv_image, aspect,
- &dst_shadow_surf);
}
for (unsigned z = 0; z < extent.depth; z++) {
@@ -574,29 +715,40 @@ copy_buffer_to_image(struct anv_cmd_buffer *cmd_buffer,
src->offset.x, src->offset.y, dst->offset.x, dst->offset.y,
extent.width, extent.height);
- if (dst_has_shadow) {
- blorp_copy(batch, &src->surf, src->level, src->offset.z,
- &dst_shadow_surf, dst->level, dst->offset.z,
- src->offset.x, src->offset.y,
- dst->offset.x, dst->offset.y,
- extent.width, extent.height);
- }
-
image.offset.z++;
- buffer.surf.addr.offset += buffer_layer_stride;
+ buffer.surf.addr.offset += buffer_layout.image_stride_B;
}
}
-void anv_CmdCopyBufferToImage2KHR(
+void anv_CmdCopyBufferToImage2(
VkCommandBuffer commandBuffer,
- const VkCopyBufferToImageInfo2KHR* pCopyBufferToImageInfo)
+ const VkCopyBufferToImageInfo2* pCopyBufferToImageInfo)
{
ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
ANV_FROM_HANDLE(anv_buffer, src_buffer, pCopyBufferToImageInfo->srcBuffer);
ANV_FROM_HANDLE(anv_image, dst_image, pCopyBufferToImageInfo->dstImage);
+ struct anv_cmd_buffer *main_cmd_buffer = cmd_buffer;
+ UNUSED struct anv_state rcs_done = ANV_STATE_NULL;
+
+ bool blorp_execute_on_companion =
+ anv_blorp_execute_on_companion(cmd_buffer, dst_image);
+
+ /* Check whether any of the aspects is incompatible with the blitter engine.
+ * If so, use the companion RCS command buffer for the blit, since the
+ * blitter has no native support for 3-component formats except 96bpp linear.
+ */
+ blorp_execute_on_companion |=
+ anv_blorp_blitter_execute_on_companion(cmd_buffer, dst_image,
+ pCopyBufferToImageInfo, NULL);
+
+ if (blorp_execute_on_companion) {
+ rcs_done = record_main_rcs_cmd_buffer_done(cmd_buffer);
+ cmd_buffer = cmd_buffer->companion_rcs_cmd_buffer;
+ }
+
struct blorp_batch batch;
- blorp_batch_init(&cmd_buffer->device->blorp, &batch, cmd_buffer, 0);
+ anv_blorp_batch_init(cmd_buffer, &batch, 0);
for (unsigned r = 0; r < pCopyBufferToImageInfo->regionCount; r++) {
copy_buffer_to_image(cmd_buffer, &batch, src_buffer, dst_image,
@@ -604,19 +756,76 @@ void anv_CmdCopyBufferToImage2KHR(
&pCopyBufferToImageInfo->pRegions[r], true);
}
- blorp_batch_finish(&batch);
+ anv_blorp_batch_finish(&batch);
+
+ if (dst_image->emu_plane_format != VK_FORMAT_UNDEFINED) {
+ assert(!anv_cmd_buffer_is_blitter_queue(cmd_buffer));
+ const enum anv_pipe_bits pipe_bits =
+ anv_cmd_buffer_is_compute_queue(cmd_buffer) ?
+ ANV_PIPE_HDC_PIPELINE_FLUSH_BIT :
+ ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT;
+ anv_add_pending_pipe_bits(cmd_buffer, pipe_bits,
+ "Copy flush before astc emu");
+
+ for (unsigned r = 0; r < pCopyBufferToImageInfo->regionCount; r++) {
+ const VkBufferImageCopy2 *region =
+ &pCopyBufferToImageInfo->pRegions[r];
+ const VkOffset3D block_offset = vk_image_offset_to_elements(
+ &dst_image->vk, region->imageOffset);
+ const VkExtent3D block_extent = vk_image_extent_to_elements(
+ &dst_image->vk, region->imageExtent);
+ anv_astc_emu_process(cmd_buffer, dst_image,
+ pCopyBufferToImageInfo->dstImageLayout,
+ &region->imageSubresource,
+ block_offset, block_extent);
+ }
+ }
+
+ if (rcs_done.alloc_size)
+ end_main_rcs_cmd_buffer_done(main_cmd_buffer, rcs_done);
}
-void anv_CmdCopyImageToBuffer2KHR(
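+/* Track pending buffer writes on the command buffer: non-graphics queues
+ * record compute write bits, graphics queues record render-target write
+ * bits.
+ */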
+static void
+anv_add_buffer_write_pending_bits(struct anv_cmd_buffer *cmd_buffer,
+ const char *reason)
+{
+ const struct intel_device_info *devinfo = cmd_buffer->device->info;
+
+ cmd_buffer->state.queries.buffer_write_bits |=
+ (cmd_buffer->queue_family->queueFlags & VK_QUEUE_GRAPHICS_BIT) == 0 ?
+ ANV_QUERY_COMPUTE_WRITES_PENDING_BITS :
+ ANV_QUERY_RENDER_TARGET_WRITES_PENDING_BITS(devinfo);
+}
+
+void anv_CmdCopyImageToBuffer2(
VkCommandBuffer commandBuffer,
- const VkCopyImageToBufferInfo2KHR* pCopyImageToBufferInfo)
+ const VkCopyImageToBufferInfo2* pCopyImageToBufferInfo)
{
ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
ANV_FROM_HANDLE(anv_image, src_image, pCopyImageToBufferInfo->srcImage);
ANV_FROM_HANDLE(anv_buffer, dst_buffer, pCopyImageToBufferInfo->dstBuffer);
+ UNUSED struct anv_cmd_buffer *main_cmd_buffer = cmd_buffer;
+ UNUSED struct anv_state rcs_done = ANV_STATE_NULL;
+
+ bool blorp_execute_on_companion =
+ anv_blorp_execute_on_companion(cmd_buffer, src_image);
+
+ /* Check whether any of the aspects is incompatible with the blitter engine.
+ * If so, use the companion RCS command buffer for the blit, since the
+ * blitter has no native support for 3-component formats except 96bpp linear.
+ */
+ blorp_execute_on_companion |=
+ anv_blorp_blitter_execute_on_companion(cmd_buffer, src_image, NULL,
+ pCopyImageToBufferInfo);
+
+ if (blorp_execute_on_companion) {
+ rcs_done = record_main_rcs_cmd_buffer_done(cmd_buffer);
+ cmd_buffer = cmd_buffer->companion_rcs_cmd_buffer;
+ }
+
struct blorp_batch batch;
- blorp_batch_init(&cmd_buffer->device->blorp, &batch, cmd_buffer, 0);
+ anv_blorp_batch_init(cmd_buffer, &batch, 0);
for (unsigned r = 0; r < pCopyImageToBufferInfo->regionCount; r++) {
copy_buffer_to_image(cmd_buffer, &batch, dst_buffer, src_image,
@@ -624,9 +833,12 @@ void anv_CmdCopyImageToBuffer2KHR(
&pCopyImageToBufferInfo->pRegions[r], false);
}
- blorp_batch_finish(&batch);
+ anv_add_buffer_write_pending_bits(cmd_buffer, "after copy image to buffer");
- cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_RENDER_TARGET_BUFFER_WRITES;
+ anv_blorp_batch_finish(&batch);
+
+ if (rcs_done.alloc_size)
+ end_main_rcs_cmd_buffer_done(main_cmd_buffer, rcs_done);
}
static bool
@@ -657,7 +869,7 @@ blit_image(struct anv_cmd_buffer *cmd_buffer,
VkImageLayout src_image_layout,
struct anv_image *dst_image,
VkImageLayout dst_image_layout,
- const VkImageBlit2KHR *region,
+ const VkImageBlit2 *region,
VkFilter filter)
{
const VkImageSubresourceLayers *src_res = &region->srcSubresource;
@@ -681,20 +893,35 @@ blit_image(struct anv_cmd_buffer *cmd_buffer,
dst_res->aspectMask));
anv_foreach_image_aspect_bit(aspect_bit, src_image, src_res->aspectMask) {
- get_blorp_surf_for_anv_image(cmd_buffer->device,
+ get_blorp_surf_for_anv_image(cmd_buffer,
src_image, 1U << aspect_bit,
VK_IMAGE_USAGE_TRANSFER_SRC_BIT,
src_image_layout, ISL_AUX_USAGE_NONE, &src);
- get_blorp_surf_for_anv_image(cmd_buffer->device,
+ get_blorp_surf_for_anv_image(cmd_buffer,
dst_image, 1U << aspect_bit,
VK_IMAGE_USAGE_TRANSFER_DST_BIT,
dst_image_layout, ISL_AUX_USAGE_NONE, &dst);
+ VkFormat src_vk_format = src_image->vk.format;
+
+ if (src_image->emu_plane_format != VK_FORMAT_UNDEFINED) {
+ /* redirect src to the hidden plane */
+ const uint32_t plane = src_image->n_planes;
+ const struct anv_surface *surface =
+ &src_image->planes[plane].primary_surface;
+ const struct anv_address address =
+ anv_image_address(src_image, &surface->memory_range);
+ src.surf = &surface->isl;
+ src.addr.offset = address.offset;
+
+ src_vk_format = src_image->emu_plane_format;
+ }
+
struct anv_format_plane src_format =
- anv_get_format_aspect(&cmd_buffer->device->info, src_image->vk.format,
+ anv_get_format_aspect(cmd_buffer->device->info, src_vk_format,
1U << aspect_bit, src_image->vk.tiling);
struct anv_format_plane dst_format =
- anv_get_format_aspect(&cmd_buffer->device->info, dst_image->vk.format,
+ anv_get_format_aspect(cmd_buffer->device->info, dst_image->vk.format,
1U << aspect_bit, dst_image->vk.tiling);
unsigned dst_start, dst_end;
@@ -768,16 +995,16 @@ blit_image(struct anv_cmd_buffer *cmd_buffer,
}
}
-void anv_CmdBlitImage2KHR(
+void anv_CmdBlitImage2(
VkCommandBuffer commandBuffer,
- const VkBlitImageInfo2KHR* pBlitImageInfo)
+ const VkBlitImageInfo2* pBlitImageInfo)
{
ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
ANV_FROM_HANDLE(anv_image, src_image, pBlitImageInfo->srcImage);
ANV_FROM_HANDLE(anv_image, dst_image, pBlitImageInfo->dstImage);
struct blorp_batch batch;
- blorp_batch_init(&cmd_buffer->device->blorp, &batch, cmd_buffer, 0);
+ anv_blorp_batch_init(cmd_buffer, &batch, 0);
for (unsigned r = 0; r < pBlitImageInfo->regionCount; r++) {
blit_image(cmd_buffer, &batch,
@@ -786,7 +1013,7 @@ void anv_CmdBlitImage2KHR(
&pBlitImageInfo->pRegions[r], pBlitImageInfo->filter);
}
- blorp_batch_finish(&batch);
+ anv_blorp_batch_finish(&batch);
}
/**
@@ -815,43 +1042,46 @@ copy_buffer(struct anv_device *device,
struct blorp_batch *batch,
struct anv_buffer *src_buffer,
struct anv_buffer *dst_buffer,
- const VkBufferCopy2KHR *region)
+ const VkBufferCopy2 *region)
{
struct blorp_address src = {
.buffer = src_buffer->address.bo,
.offset = src_buffer->address.offset + region->srcOffset,
.mocs = anv_mocs(device, src_buffer->address.bo,
- ISL_SURF_USAGE_TEXTURE_BIT),
+ blorp_batch_isl_copy_usage(batch, false /* is_dest */)),
};
struct blorp_address dst = {
.buffer = dst_buffer->address.bo,
.offset = dst_buffer->address.offset + region->dstOffset,
.mocs = anv_mocs(device, dst_buffer->address.bo,
- ISL_SURF_USAGE_RENDER_TARGET_BIT),
+ blorp_batch_isl_copy_usage(batch, true /* is_dest */)),
};
blorp_buffer_copy(batch, src, dst, region->size);
}
-void anv_CmdCopyBuffer2KHR(
+void anv_CmdCopyBuffer2(
VkCommandBuffer commandBuffer,
- const VkCopyBufferInfo2KHR* pCopyBufferInfo)
+ const VkCopyBufferInfo2* pCopyBufferInfo)
{
ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
ANV_FROM_HANDLE(anv_buffer, src_buffer, pCopyBufferInfo->srcBuffer);
ANV_FROM_HANDLE(anv_buffer, dst_buffer, pCopyBufferInfo->dstBuffer);
struct blorp_batch batch;
- blorp_batch_init(&cmd_buffer->device->blorp, &batch, cmd_buffer, 0);
+ anv_blorp_batch_init(cmd_buffer, &batch,
+ cmd_buffer->state.current_pipeline ==
+ cmd_buffer->device->physical->gpgpu_pipeline_value ?
+ BLORP_BATCH_USE_COMPUTE : 0);
for (unsigned r = 0; r < pCopyBufferInfo->regionCount; r++) {
copy_buffer(cmd_buffer->device, &batch, src_buffer, dst_buffer,
&pCopyBufferInfo->pRegions[r]);
}
- blorp_batch_finish(&batch);
+ anv_add_buffer_write_pending_bits(cmd_buffer, "after copy buffer");
- cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_RENDER_TARGET_BUFFER_WRITES;
+ anv_blorp_batch_finish(&batch);
}
@@ -866,7 +1096,10 @@ void anv_CmdUpdateBuffer(
ANV_FROM_HANDLE(anv_buffer, dst_buffer, dstBuffer);
struct blorp_batch batch;
- blorp_batch_init(&cmd_buffer->device->blorp, &batch, cmd_buffer, 0);
+ anv_blorp_batch_init(cmd_buffer, &batch,
+ cmd_buffer->state.current_pipeline ==
+ cmd_buffer->device->physical->gpgpu_pipeline_value ?
+ BLORP_BATCH_USE_COMPUTE : 0);
/* We can't quite grab a full block because the state stream needs a
* little data at the top to build its linked list.
@@ -887,21 +1120,25 @@ void anv_CmdUpdateBuffer(
const uint32_t copy_size = MIN2(dataSize, max_update_size);
struct anv_state tmp_data =
- anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, copy_size, 64);
+ anv_cmd_buffer_alloc_temporary_state(cmd_buffer, copy_size, 64);
+ struct anv_address tmp_addr =
+ anv_cmd_buffer_temporary_state_address(cmd_buffer, tmp_data);
memcpy(tmp_data.map, pData, copy_size);
struct blorp_address src = {
- .buffer = cmd_buffer->device->dynamic_state_pool.block_pool.bo,
- .offset = tmp_data.offset,
- .mocs = isl_mocs(&cmd_buffer->device->isl_dev,
- ISL_SURF_USAGE_TEXTURE_BIT, false)
+ .buffer = tmp_addr.bo,
+ .offset = tmp_addr.offset,
+ .mocs = anv_mocs(cmd_buffer->device, NULL,
+ get_usage_flag_for_cmd_buffer(cmd_buffer,
+ false /* is_dest */)),
};
struct blorp_address dst = {
.buffer = dst_buffer->address.bo,
.offset = dst_buffer->address.offset + dstOffset,
.mocs = anv_mocs(cmd_buffer->device, dst_buffer->address.bo,
- ISL_SURF_USAGE_RENDER_TARGET_BIT),
+ get_usage_flag_for_cmd_buffer(cmd_buffer,
+ true /* is_dest */)),
};
blorp_buffer_copy(&batch, src, dst, copy_size);
@@ -911,44 +1148,33 @@ void anv_CmdUpdateBuffer(
pData = (void *)pData + copy_size;
}
- blorp_batch_finish(&batch);
+ anv_add_buffer_write_pending_bits(cmd_buffer, "update buffer");
- cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_RENDER_TARGET_BUFFER_WRITES;
+ anv_blorp_batch_finish(&batch);
}
-void anv_CmdFillBuffer(
- VkCommandBuffer commandBuffer,
- VkBuffer dstBuffer,
- VkDeviceSize dstOffset,
- VkDeviceSize fillSize,
- uint32_t data)
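+/* Fill an arbitrary GPU address range with a 32-bit pattern using BLORP
+ * clears: pick the largest power-of-two block size compatible with the
+ * offset and size, clear full MAX_SURFACE_DIM rectangles, then a rectangle
+ * of whole rows, and finally the tail as a single row.
+ */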
+void
+anv_cmd_buffer_fill_area(struct anv_cmd_buffer *cmd_buffer,
+ struct anv_address address,
+ VkDeviceSize size,
+ uint32_t data)
{
- ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
- ANV_FROM_HANDLE(anv_buffer, dst_buffer, dstBuffer);
struct blorp_surf surf;
struct isl_surf isl_surf;
struct blorp_batch batch;
- blorp_batch_init(&cmd_buffer->device->blorp, &batch, cmd_buffer, 0);
-
- fillSize = anv_buffer_get_range(dst_buffer, dstOffset, fillSize);
-
- /* From the Vulkan spec:
- *
- * "size is the number of bytes to fill, and must be either a multiple
- * of 4, or VK_WHOLE_SIZE to fill the range from offset to the end of
- * the buffer. If VK_WHOLE_SIZE is used and the remaining size of the
- * buffer is not a multiple of 4, then the nearest smaller multiple is
- * used."
- */
- fillSize &= ~3ull;
+ anv_blorp_batch_init(cmd_buffer, &batch,
+ cmd_buffer->state.current_pipeline ==
+ cmd_buffer->device->physical->gpgpu_pipeline_value ?
+ BLORP_BATCH_USE_COMPUTE : 0);
/* First, we compute the biggest format that can be used with the
* given offsets and size.
*/
int bs = 16;
- bs = gcd_pow2_u64(bs, dstOffset);
- bs = gcd_pow2_u64(bs, fillSize);
+ uint64_t offset = address.offset;
+ bs = gcd_pow2_u64(bs, offset);
+ bs = gcd_pow2_u64(bs, size);
enum isl_format isl_format = isl_format_for_size(bs);
union isl_color_value color = {
@@ -956,53 +1182,89 @@ void anv_CmdFillBuffer(
};
const uint64_t max_fill_size = MAX_SURFACE_DIM * MAX_SURFACE_DIM * bs;
- while (fillSize >= max_fill_size) {
- get_blorp_surf_for_anv_buffer(cmd_buffer->device,
- dst_buffer, dstOffset,
- MAX_SURFACE_DIM, MAX_SURFACE_DIM,
- MAX_SURFACE_DIM * bs, isl_format, true,
- &surf, &isl_surf);
+ while (size >= max_fill_size) {
+ get_blorp_surf_for_anv_address(cmd_buffer,
+ (struct anv_address) {
+ .bo = address.bo, .offset = offset,
+ },
+ MAX_SURFACE_DIM, MAX_SURFACE_DIM,
+ MAX_SURFACE_DIM * bs, isl_format,
+ true /* is_dest */,
+ &surf, &isl_surf);
blorp_clear(&batch, &surf, isl_format, ISL_SWIZZLE_IDENTITY,
0, 0, 1, 0, 0, MAX_SURFACE_DIM, MAX_SURFACE_DIM,
- color, NULL);
- fillSize -= max_fill_size;
- dstOffset += max_fill_size;
+ color, 0 /* color_write_disable */);
+ size -= max_fill_size;
+ offset += max_fill_size;
}
- uint64_t height = fillSize / (MAX_SURFACE_DIM * bs);
+ uint64_t height = size / (MAX_SURFACE_DIM * bs);
assert(height < MAX_SURFACE_DIM);
if (height != 0) {
const uint64_t rect_fill_size = height * MAX_SURFACE_DIM * bs;
- get_blorp_surf_for_anv_buffer(cmd_buffer->device,
- dst_buffer, dstOffset,
- MAX_SURFACE_DIM, height,
- MAX_SURFACE_DIM * bs, isl_format, true,
- &surf, &isl_surf);
+ get_blorp_surf_for_anv_address(cmd_buffer,
+ (struct anv_address) {
+ .bo = address.bo, .offset = offset,
+ },
+ MAX_SURFACE_DIM, height,
+ MAX_SURFACE_DIM * bs, isl_format,
+ true /* is_dest */,
+ &surf, &isl_surf);
blorp_clear(&batch, &surf, isl_format, ISL_SWIZZLE_IDENTITY,
0, 0, 1, 0, 0, MAX_SURFACE_DIM, height,
- color, NULL);
- fillSize -= rect_fill_size;
- dstOffset += rect_fill_size;
+ color, 0 /* color_write_disable */);
+ size -= rect_fill_size;
+ offset += rect_fill_size;
}
- if (fillSize != 0) {
- const uint32_t width = fillSize / bs;
- get_blorp_surf_for_anv_buffer(cmd_buffer->device,
- dst_buffer, dstOffset,
- width, 1,
- width * bs, isl_format, true,
- &surf, &isl_surf);
+ if (size != 0) {
+ const uint32_t width = size / bs;
+ get_blorp_surf_for_anv_address(cmd_buffer,
+ (struct anv_address) {
+ .bo = address.bo, .offset = offset,
+ },
+ width, 1,
+ width * bs, isl_format,
+ true /* is_dest */,
+ &surf, &isl_surf);
blorp_clear(&batch, &surf, isl_format, ISL_SWIZZLE_IDENTITY,
0, 0, 1, 0, 0, width, 1,
- color, NULL);
+ color, 0 /* color_write_disable */);
}
- blorp_batch_finish(&batch);
+ anv_blorp_batch_finish(&batch);
+}
+
+void anv_CmdFillBuffer(
+ VkCommandBuffer commandBuffer,
+ VkBuffer dstBuffer,
+ VkDeviceSize dstOffset,
+ VkDeviceSize fillSize,
+ uint32_t data)
+{
+ ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+ ANV_FROM_HANDLE(anv_buffer, dst_buffer, dstBuffer);
+
+ fillSize = vk_buffer_range(&dst_buffer->vk, dstOffset, fillSize);
+
+ /* From the Vulkan spec:
+ *
+ * "size is the number of bytes to fill, and must be either a multiple
+ * of 4, or VK_WHOLE_SIZE to fill the range from offset to the end of
+ * the buffer. If VK_WHOLE_SIZE is used and the remaining size of the
+ * buffer is not a multiple of 4, then the nearest smaller multiple is
+ * used."
+ */
+ fillSize &= ~3ull;
+
+ anv_cmd_buffer_fill_area(cmd_buffer,
+ anv_address_add(dst_buffer->address, dstOffset),
+ fillSize, data);
- cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_RENDER_TARGET_BUFFER_WRITES;
+ anv_add_buffer_write_pending_bits(cmd_buffer, "after fill buffer");
}
void anv_CmdClearColorImage(
@@ -1016,11 +1278,16 @@ void anv_CmdClearColorImage(
ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
ANV_FROM_HANDLE(anv_image, image, _image);
- static const bool color_write_disable[4] = { false, false, false, false };
+ struct anv_cmd_buffer *main_cmd_buffer = cmd_buffer;
+ UNUSED struct anv_state rcs_done = ANV_STATE_NULL;
- struct blorp_batch batch;
- blorp_batch_init(&cmd_buffer->device->blorp, &batch, cmd_buffer, 0);
+ if (anv_blorp_execute_on_companion(cmd_buffer, image)) {
+ rcs_done = record_main_rcs_cmd_buffer_done(cmd_buffer);
+ cmd_buffer = cmd_buffer->companion_rcs_cmd_buffer;
+ }
+ struct blorp_batch batch;
+ anv_blorp_batch_init(cmd_buffer, &batch, 0);
for (unsigned r = 0; r < rangeCount; r++) {
if (pRanges[r].aspectMask == 0)
@@ -1029,13 +1296,13 @@ void anv_CmdClearColorImage(
assert(pRanges[r].aspectMask & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV);
struct blorp_surf surf;
- get_blorp_surf_for_anv_image(cmd_buffer->device,
+ get_blorp_surf_for_anv_image(cmd_buffer,
image, pRanges[r].aspectMask,
VK_IMAGE_USAGE_TRANSFER_DST_BIT,
imageLayout, ISL_AUX_USAGE_NONE, &surf);
struct anv_format_plane src_format =
- anv_get_format_aspect(&cmd_buffer->device->info, image->vk.format,
+ anv_get_format_aspect(cmd_buffer->device->info, image->vk.format,
VK_IMAGE_ASPECT_COLOR_BIT, image->vk.tiling);
unsigned base_layer = pRanges[r].baseArrayLayer;
@@ -1046,12 +1313,12 @@ void anv_CmdClearColorImage(
for (uint32_t i = 0; i < level_count; i++) {
const unsigned level = pRanges[r].baseMipLevel + i;
- const unsigned level_width = anv_minify(image->vk.extent.width, level);
- const unsigned level_height = anv_minify(image->vk.extent.height, level);
+ const unsigned level_width = u_minify(image->vk.extent.width, level);
+ const unsigned level_height = u_minify(image->vk.extent.height, level);
if (image->vk.image_type == VK_IMAGE_TYPE_3D) {
base_layer = 0;
- layer_count = anv_minify(image->vk.extent.depth, level);
+ layer_count = u_minify(image->vk.extent.depth, level);
}
anv_cmd_buffer_mark_image_written(cmd_buffer, image,
@@ -1063,11 +1330,14 @@ void anv_CmdClearColorImage(
src_format.isl_format, src_format.swizzle,
level, base_layer, layer_count,
0, 0, level_width, level_height,
- vk_to_isl_color(*pColor), color_write_disable);
+ vk_to_isl_color(*pColor), 0 /* color_write_disable */);
}
}
- blorp_batch_finish(&batch);
+ anv_blorp_batch_finish(&batch);
+
+ if (rcs_done.alloc_size)
+ end_main_rcs_cmd_buffer_done(main_cmd_buffer, rcs_done);
}
void anv_CmdClearDepthStencilImage(
@@ -1082,11 +1352,12 @@ void anv_CmdClearDepthStencilImage(
ANV_FROM_HANDLE(anv_image, image, image_h);
struct blorp_batch batch;
- blorp_batch_init(&cmd_buffer->device->blorp, &batch, cmd_buffer, 0);
+ anv_blorp_batch_init(cmd_buffer, &batch, 0);
+ assert((batch.flags & BLORP_BATCH_USE_COMPUTE) == 0);
- struct blorp_surf depth, stencil, stencil_shadow;
+ struct blorp_surf depth, stencil;
if (image->vk.aspects & VK_IMAGE_ASPECT_DEPTH_BIT) {
- get_blorp_surf_for_anv_image(cmd_buffer->device,
+ get_blorp_surf_for_anv_image(cmd_buffer,
image, VK_IMAGE_ASPECT_DEPTH_BIT,
VK_IMAGE_USAGE_TRANSFER_DST_BIT,
imageLayout, ISL_AUX_USAGE_NONE, &depth);
@@ -1094,17 +1365,11 @@ void anv_CmdClearDepthStencilImage(
memset(&depth, 0, sizeof(depth));
}
- bool has_stencil_shadow = false;
if (image->vk.aspects & VK_IMAGE_ASPECT_STENCIL_BIT) {
- get_blorp_surf_for_anv_image(cmd_buffer->device,
+ get_blorp_surf_for_anv_image(cmd_buffer,
image, VK_IMAGE_ASPECT_STENCIL_BIT,
VK_IMAGE_USAGE_TRANSFER_DST_BIT,
imageLayout, ISL_AUX_USAGE_NONE, &stencil);
-
- has_stencil_shadow =
- get_blorp_surf_for_anv_shadow_image(cmd_buffer->device, image,
- VK_IMAGE_ASPECT_STENCIL_BIT,
- &stencil_shadow);
} else {
memset(&stencil, 0, sizeof(stencil));
}
@@ -1124,11 +1389,11 @@ void anv_CmdClearDepthStencilImage(
for (uint32_t i = 0; i < level_count; i++) {
const unsigned level = pRanges[r].baseMipLevel + i;
- const unsigned level_width = anv_minify(image->vk.extent.width, level);
- const unsigned level_height = anv_minify(image->vk.extent.height, level);
+ const unsigned level_width = u_minify(image->vk.extent.width, level);
+ const unsigned level_height = u_minify(image->vk.extent.height, level);
if (image->vk.image_type == VK_IMAGE_TYPE_3D)
- layer_count = anv_minify(image->vk.extent.depth, level);
+ layer_count = u_minify(image->vk.extent.depth, level);
blorp_clear_depth_stencil(&batch, &depth, &stencil,
level, base_layer, layer_count,
@@ -1136,21 +1401,10 @@ void anv_CmdClearDepthStencilImage(
clear_depth, pDepthStencil->depth,
clear_stencil ? 0xff : 0,
pDepthStencil->stencil);
-
- if (clear_stencil && has_stencil_shadow) {
- union isl_color_value stencil_color = {
- .u32 = { pDepthStencil->stencil, },
- };
- blorp_clear(&batch, &stencil_shadow,
- ISL_FORMAT_R8_UINT, ISL_SWIZZLE_IDENTITY,
- level, base_layer, layer_count,
- 0, 0, level_width, level_height,
- stencil_color, NULL);
- }
}
}
- blorp_batch_finish(&batch);
+ anv_blorp_batch_finish(&batch);
}
VkResult
@@ -1170,7 +1424,7 @@ anv_cmd_buffer_alloc_blorp_binding_table(struct anv_cmd_buffer *cmd_buffer,
/* Re-emit state base addresses so we get the new surface state base
* address before we start emitting binding tables etc.
*/
- anv_cmd_buffer_emit_state_base_address(cmd_buffer);
+ anv_cmd_buffer_emit_bt_pool_base_address(cmd_buffer);
*bt_state = anv_cmd_buffer_alloc_binding_table(cmd_buffer, num_entries,
state_offset);
@@ -1201,43 +1455,418 @@ binding_table_for_surface_state(struct anv_cmd_buffer *cmd_buffer,
return VK_SUCCESS;
}
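+/* Check whether a vkCmdClearAttachments color clear can be turned into a
+ * CCS/MCS fast clear of the bound color attachment.
+ */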
+static bool
+can_fast_clear_color_att(struct anv_cmd_buffer *cmd_buffer,
+ struct blorp_batch *batch,
+ const struct anv_attachment *att,
+ const VkClearAttachment *attachment,
+ uint32_t rectCount, const VkClearRect *pRects)
+{
+ union isl_color_value clear_color =
+ vk_to_isl_color(attachment->clearValue.color);
+
+ if (INTEL_DEBUG(DEBUG_NO_FAST_CLEAR))
+ return false;
+
+ /* We don't support fast clearing with conditional rendering at the
+ * moment. All the tracking done around fast clears (clear color updates
+ * and fast-clear type updates) happens unconditionally.
+ */
+ if (batch->flags & BLORP_BATCH_PREDICATE_ENABLE)
+ return false;
+
+ if (rectCount > 1) {
+ anv_perf_warn(VK_LOG_OBJS(&cmd_buffer->device->vk.base),
+ "Fast clears for vkCmdClearAttachments supported only for rectCount == 1");
+ return false;
+ }
+
+ /* We only support fast-clears on the first layer */
+ if (pRects[0].layerCount > 1 || pRects[0].baseArrayLayer > 0)
+ return false;
+
+ bool is_multiview = cmd_buffer->state.gfx.view_mask != 0;
+ if (is_multiview && (cmd_buffer->state.gfx.view_mask != 1))
+ return false;
+
+ return anv_can_fast_clear_color_view(cmd_buffer->device,
+ (struct anv_image_view *)att->iview,
+ att->layout,
+ clear_color,
+ pRects->layerCount,
+ pRects->rect,
+ cmd_buffer->queue_family->queueFlags);
+}
+
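+/* Run a CCS operation (fast clear, resolve or ambiguate) on a single-sampled
+ * color image, bracketed by the pipe controls required around fast clears
+ * and resolves.
+ */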
+static void
+exec_ccs_op(struct anv_cmd_buffer *cmd_buffer,
+ struct blorp_batch *batch,
+ const struct anv_image *image,
+ enum isl_format format, struct isl_swizzle swizzle,
+ VkImageAspectFlagBits aspect, uint32_t level,
+ uint32_t base_layer, uint32_t layer_count,
+ enum isl_aux_op ccs_op, union isl_color_value *clear_value)
+{
+ assert(image->vk.aspects & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV);
+ assert(image->vk.samples == 1);
+ assert(level < anv_image_aux_levels(image, aspect));
+ /* Multi-LOD YCbCr is not allowed */
+ assert(image->n_planes == 1 || level == 0);
+ assert(base_layer + layer_count <=
+ anv_image_aux_layers(image, aspect, level));
+
+ const uint32_t plane = anv_image_aspect_to_plane(image, aspect);
+ const struct intel_device_info *devinfo = cmd_buffer->device->info;
+
+ struct blorp_surf surf;
+ get_blorp_surf_for_anv_image(cmd_buffer, image, aspect,
+ 0, ANV_IMAGE_LAYOUT_EXPLICIT_AUX,
+ image->planes[plane].aux_usage,
+ &surf);
+
+ uint32_t level_width = u_minify(surf.surf->logical_level0_px.w, level);
+ uint32_t level_height = u_minify(surf.surf->logical_level0_px.h, level);
+
+ /* Blorp will store the clear color for us if we provide the clear color
+ * address and we are doing a fast clear. So we save the clear value into
+ * the blorp surface.
+ */
+ if (clear_value)
+ surf.clear_color = *clear_value;
+
+ char flush_reason[64];
+ int ret =
+ snprintf(flush_reason, sizeof(flush_reason),
+ "ccs op start: %s", isl_aux_op_to_name(ccs_op));
+ assert(ret < sizeof(flush_reason));
+
+ /* From the Sky Lake PRM Vol. 7, "Render Target Fast Clear":
+ *
+ * "After Render target fast clear, pipe-control with color cache
+ * write-flush must be issued before sending any DRAW commands on
+ * that render target."
+ *
+ * This comment is a bit cryptic and doesn't really tell you what's going
+ * on or what's really needed. It appears that fast clear ops are not
+ * properly synchronized with other drawing. This means that we cannot
+ * have a fast clear operation in the pipe at the same time as other
+ * regular drawing operations. We need to use a PIPE_CONTROL to ensure
+ * that the contents of the previous draw hit the render target before we
+ * resolve and then use a second PIPE_CONTROL after the resolve to ensure
+ * that it is completed before any additional drawing occurs.
+ */
+ anv_add_pending_pipe_bits(cmd_buffer,
+ ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT |
+ ANV_PIPE_TILE_CACHE_FLUSH_BIT |
+ (devinfo->verx10 == 120 ?
+ ANV_PIPE_DEPTH_STALL_BIT : 0) |
+ (devinfo->verx10 == 125 ?
+ ANV_PIPE_HDC_PIPELINE_FLUSH_BIT |
+ ANV_PIPE_DATA_CACHE_FLUSH_BIT : 0) |
+ ANV_PIPE_PSS_STALL_SYNC_BIT |
+ ANV_PIPE_END_OF_PIPE_SYNC_BIT,
+ flush_reason);
+
+ switch (ccs_op) {
+ case ISL_AUX_OP_FAST_CLEAR:
+ /* From the ICL PRMs, Volume 9: Render Engine, State Caching :
+ *
+ * "Any values referenced by pointers within the RENDER_SURFACE_STATE
+ * or SAMPLER_STATE (e.g. Clear Color Pointer, Border Color or
+ * Indirect State Pointer) are considered to be part of that state
+ * and any changes to these referenced values requires an
+ * invalidation of the L1 state cache to ensure the new values are
+ * being used as part of the state. In the case of surface data
+ * pointed to by the Surface Base Address in RENDER SURFACE STATE,
+ * the Texture Cache must be invalidated if the surface data
+ * changes."
+ *
+ * and From the Render Target Fast Clear section,
+ *
+ * "HwManaged FastClear allows SW to store FastClearValue in separate
+ * graphics allocation, instead of keeping them in
+ * RENDER_SURFACE_STATE. This behavior can be enabled by setting
+ * ClearValueAddressEnable in RENDER_SURFACE_STATE.
+ *
+ * Proper sequence of commands is as follows:
+ *
+ * 1. Storing clear color to allocation
+ * 2. Ensuring that step 1. is finished and visible for TextureCache
+ * 3. Performing FastClear
+ *
+ * Step 2. is required on products with ClearColorConversion feature.
+ * This feature is enabled by setting ClearColorConversionEnable.
+ * This causes HW to read stored color from ClearColorAllocation and
+ * write back with the native format or RenderTarget - and clear
+ * color needs to be present and visible. Reading is done from
+ * TextureCache, writing is done to RenderCache."
+ *
+ * We're going to change the clear color. Invalidate the texture cache
+ * now to ensure the clear color conversion feature works properly.
+ * Although the docs seem to require invalidating the texture cache
+ * after updating the clear color allocation, we can do this beforehand
+ * so long as we ensure:
+ *
+ * 1. Step 1 is complete before the texture cache is accessed in step 3
+ * 2. We don't access the texture cache between invalidation and step 3
+ *
+ * The second requirement is satisfied because we'll be performing step
+ * 1 and 3 right after invalidating. The first is satisfied because
+ * BLORP updates the clear color before performing the fast clear and it
+ * performs the synchronizations suggested by the Render Target Fast
+ * Clear section (not quoted here) to ensure its completion.
+ *
+ * While we're here, also invalidate the state cache as suggested.
+ */
+ if (devinfo->ver >= 11) {
+ anv_add_pending_pipe_bits(cmd_buffer,
+ ANV_PIPE_STATE_CACHE_INVALIDATE_BIT |
+ ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT,
+ "before blorp clear color update");
+ }
+
+ blorp_fast_clear(batch, &surf, format, swizzle,
+ level, base_layer, layer_count,
+ 0, 0, level_width, level_height);
+ break;
+ case ISL_AUX_OP_FULL_RESOLVE:
+ case ISL_AUX_OP_PARTIAL_RESOLVE: {
+ /* Wa_1508744258: Enable RHWO optimization for resolves */
+ const bool enable_rhwo_opt =
+ intel_needs_workaround(cmd_buffer->device->info, 1508744258);
+
+ if (enable_rhwo_opt)
+ cmd_buffer->state.pending_rhwo_optimization_enabled = true;
+
+ blorp_ccs_resolve(batch, &surf, level, base_layer, layer_count,
+ format, ccs_op);
+
+ if (enable_rhwo_opt)
+ cmd_buffer->state.pending_rhwo_optimization_enabled = false;
+ break;
+ }
+ case ISL_AUX_OP_AMBIGUATE:
+ for (uint32_t a = 0; a < layer_count; a++) {
+ const uint32_t layer = base_layer + a;
+ blorp_ccs_ambiguate(batch, &surf, level, layer);
+ }
+ break;
+ default:
+ unreachable("Unsupported CCS operation");
+ }
+
+ anv_add_pending_pipe_bits(cmd_buffer,
+ ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT |
+ (devinfo->verx10 == 120 ?
+ ANV_PIPE_TILE_CACHE_FLUSH_BIT |
+ ANV_PIPE_DEPTH_STALL_BIT : 0) |
+ ANV_PIPE_PSS_STALL_SYNC_BIT |
+ ANV_PIPE_END_OF_PIPE_SYNC_BIT,
+ "ccs op finish");
+}
+
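+/* Run an MCS operation (fast clear, partial resolve or ambiguate) on a
+ * multisampled color image, with the same pipe-control bracketing as the
+ * CCS operations above.
+ */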
+static void
+exec_mcs_op(struct anv_cmd_buffer *cmd_buffer,
+ struct blorp_batch *batch,
+ const struct anv_image *image,
+ enum isl_format format, struct isl_swizzle swizzle,
+ VkImageAspectFlagBits aspect,
+ uint32_t base_layer, uint32_t layer_count,
+ enum isl_aux_op mcs_op, union isl_color_value *clear_value)
+{
+ assert(image->vk.aspects == VK_IMAGE_ASPECT_COLOR_BIT);
+ assert(image->vk.samples > 1);
+ assert(base_layer + layer_count <= anv_image_aux_layers(image, aspect, 0));
+
+ /* Multisampling with multi-planar formats is not supported */
+ assert(image->n_planes == 1);
+
+ const struct intel_device_info *devinfo = cmd_buffer->device->info;
+ struct blorp_surf surf;
+ get_blorp_surf_for_anv_image(cmd_buffer, image, aspect,
+ 0, ANV_IMAGE_LAYOUT_EXPLICIT_AUX,
+ ISL_AUX_USAGE_MCS, &surf);
+
+ /* Blorp will store the clear color for us if we provide the clear color
+ * address and we are doing a fast clear. So we save the clear value into
+ * the blorp surface.
+ */
+ if (clear_value)
+ surf.clear_color = *clear_value;
+
+ /* From the Sky Lake PRM Vol. 7, "Render Target Fast Clear":
+ *
+ * "After Render target fast clear, pipe-control with color cache
+ * write-flush must be issued before sending any DRAW commands on
+ * that render target."
+ *
+ * This comment is a bit cryptic and doesn't really tell you what's going
+ * on or what's really needed. It appears that fast clear ops are not
+ * properly synchronized with other drawing. This means that we cannot
+ * have a fast clear operation in the pipe at the same time as other
+ * regular drawing operations. We need to use a PIPE_CONTROL to ensure
+ * that the contents of the previous draw hit the render target before we
+ * resolve and then use a second PIPE_CONTROL after the resolve to ensure
+ * that it is completed before any additional drawing occurs.
+ */
+ anv_add_pending_pipe_bits(cmd_buffer,
+ ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT |
+ ANV_PIPE_TILE_CACHE_FLUSH_BIT |
+ (devinfo->verx10 == 120 ?
+ ANV_PIPE_DEPTH_STALL_BIT : 0) |
+ (devinfo->verx10 == 125 ?
+ ANV_PIPE_HDC_PIPELINE_FLUSH_BIT |
+ ANV_PIPE_DATA_CACHE_FLUSH_BIT : 0) |
+ ANV_PIPE_PSS_STALL_SYNC_BIT |
+ ANV_PIPE_END_OF_PIPE_SYNC_BIT,
+ "before fast clear mcs");
+
+ switch (mcs_op) {
+ case ISL_AUX_OP_FAST_CLEAR:
+ /* From the ICL PRMs, Volume 9: Render Engine, State Caching :
+ *
+ * "Any values referenced by pointers within the RENDER_SURFACE_STATE
+ * or SAMPLER_STATE (e.g. Clear Color Pointer, Border Color or
+ * Indirect State Pointer) are considered to be part of that state
+ * and any changes to these referenced values requires an
+ * invalidation of the L1 state cache to ensure the new values are
+ * being used as part of the state. In the case of surface data
+ * pointed to by the Surface Base Address in RENDER SURFACE STATE,
+ * the Texture Cache must be invalidated if the surface data
+ * changes."
+ *
+ * and From the Render Target Fast Clear section,
+ *
+ * "HwManaged FastClear allows SW to store FastClearValue in separate
+ * graphics allocation, instead of keeping them in
+ * RENDER_SURFACE_STATE. This behavior can be enabled by setting
+ * ClearValueAddressEnable in RENDER_SURFACE_STATE.
+ *
+ * Proper sequence of commands is as follows:
+ *
+ * 1. Storing clear color to allocation
+ * 2. Ensuring that step 1. is finished and visible for TextureCache
+ * 3. Performing FastClear
+ *
+ * Step 2. is required on products with ClearColorConversion feature.
+ * This feature is enabled by setting ClearColorConversionEnable.
+ * This causes HW to read stored color from ClearColorAllocation and
+ * write back with the native format or RenderTarget - and clear
+ * color needs to be present and visible. Reading is done from
+ * TextureCache, writing is done to RenderCache."
+ *
+ * We're going to change the clear color. Invalidate the texture cache
+ * now to ensure the clear color conversion feature works properly.
+ * Although the docs seem to require invalidating the texture cache
+ * after updating the clear color allocation, we can do this beforehand
+ * so long as we ensure:
+ *
+ * 1. Step 1 is complete before the texture cache is accessed in step 3
+ * 2. We don't access the texture cache between invalidation and step 3
+ *
+ * The second requirement is satisfied because we'll be performing step
+ * 1 and 3 right after invalidating. The first is satisfied because
+ * BLORP updates the clear color before performing the fast clear and it
+ * performs the synchronizations suggested by the Render Target Fast
+ * Clear section (not quoted here) to ensure its completion.
+ *
+ * While we're here, also invalidate the state cache as suggested.
+ */
+ if (devinfo->ver >= 11) {
+ anv_add_pending_pipe_bits(cmd_buffer,
+ ANV_PIPE_STATE_CACHE_INVALIDATE_BIT |
+ ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT,
+ "before blorp clear color update");
+ }
+
+ blorp_fast_clear(batch, &surf, format, swizzle,
+ 0, base_layer, layer_count,
+ 0, 0, image->vk.extent.width, image->vk.extent.height);
+ break;
+ case ISL_AUX_OP_PARTIAL_RESOLVE:
+ blorp_mcs_partial_resolve(batch, &surf, format,
+ base_layer, layer_count);
+ break;
+ case ISL_AUX_OP_AMBIGUATE:
+ blorp_mcs_ambiguate(batch, &surf, base_layer, layer_count);
+ break;
+ case ISL_AUX_OP_FULL_RESOLVE:
+ default:
+ unreachable("Unsupported MCS operation");
+ }
+
+ anv_add_pending_pipe_bits(cmd_buffer,
+ ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT |
+ (devinfo->verx10 == 120 ?
+ ANV_PIPE_TILE_CACHE_FLUSH_BIT |
+ ANV_PIPE_DEPTH_STALL_BIT : 0) |
+ ANV_PIPE_PSS_STALL_SYNC_BIT |
+ ANV_PIPE_END_OF_PIPE_SYNC_BIT,
+ "after fast clear mcs");
+}
+
static void
clear_color_attachment(struct anv_cmd_buffer *cmd_buffer,
struct blorp_batch *batch,
const VkClearAttachment *attachment,
uint32_t rectCount, const VkClearRect *pRects)
{
- const struct anv_subpass *subpass = cmd_buffer->state.subpass;
- const uint32_t color_att = attachment->colorAttachment;
- assert(color_att < subpass->color_count);
- const uint32_t att_idx = subpass->color_attachments[color_att].attachment;
+ struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
+ const uint32_t att_idx = attachment->colorAttachment;
+ assert(att_idx < gfx->color_att_count);
+ const struct anv_attachment *att = &gfx->color_att[att_idx];
- if (att_idx == VK_ATTACHMENT_UNUSED)
+ if (att->vk_format == VK_FORMAT_UNDEFINED)
return;
- struct anv_render_pass_attachment *pass_att =
- &cmd_buffer->state.pass->attachments[att_idx];
- struct anv_attachment_state *att_state =
- &cmd_buffer->state.attachments[att_idx];
+ union isl_color_value clear_color =
+ vk_to_isl_color(attachment->clearValue.color);
+
+ const struct anv_image_view *iview = att->iview;
+ if (iview &&
+ can_fast_clear_color_att(cmd_buffer, batch, att,
+ attachment, rectCount, pRects)) {
+ if (iview->image->vk.samples == 1) {
+ exec_ccs_op(cmd_buffer, batch, iview->image,
+ iview->planes[0].isl.format,
+ iview->planes[0].isl.swizzle,
+ VK_IMAGE_ASPECT_COLOR_BIT,
+ 0, 0, 1, ISL_AUX_OP_FAST_CLEAR,
+ &clear_color);
+ } else {
+ exec_mcs_op(cmd_buffer, batch, iview->image,
+ iview->planes[0].isl.format,
+ iview->planes[0].isl.swizzle,
+ VK_IMAGE_ASPECT_COLOR_BIT,
+ 0, 1, ISL_AUX_OP_FAST_CLEAR,
+ &clear_color);
+ }
+
+ anv_cmd_buffer_mark_image_fast_cleared(cmd_buffer, iview->image,
+ iview->planes[0].isl.format,
+ clear_color);
+ anv_cmd_buffer_load_clear_color_from_image(cmd_buffer,
+ att->surface_state.state,
+ iview->image);
+ return;
+ }
uint32_t binding_table;
VkResult result =
- binding_table_for_surface_state(cmd_buffer, att_state->color.state,
+ binding_table_for_surface_state(cmd_buffer, att->surface_state.state,
&binding_table);
if (result != VK_SUCCESS)
return;
- union isl_color_value clear_color =
- vk_to_isl_color(attachment->clearValue.color);
-
/* If multiview is enabled we ignore baseArrayLayer and layerCount */
- if (subpass->view_mask) {
- u_foreach_bit(view_idx, subpass->view_mask) {
+ if (gfx->view_mask) {
+ u_foreach_bit(view_idx, gfx->view_mask) {
for (uint32_t r = 0; r < rectCount; ++r) {
const VkOffset2D offset = pRects[r].rect.offset;
const VkExtent2D extent = pRects[r].rect.extent;
blorp_clear_attachments(batch, binding_table,
- ISL_FORMAT_UNSUPPORTED, pass_att->samples,
+ ISL_FORMAT_UNSUPPORTED,
+ gfx->samples,
view_idx, 1,
offset.x, offset.y,
offset.x + extent.width,
@@ -1253,7 +1882,8 @@ clear_color_attachment(struct anv_cmd_buffer *cmd_buffer,
const VkExtent2D extent = pRects[r].rect.extent;
assert(pRects[r].layerCount != VK_REMAINING_ARRAY_LAYERS);
blorp_clear_attachments(batch, binding_table,
- ISL_FORMAT_UNSUPPORTED, pass_att->samples,
+ ISL_FORMAT_UNSUPPORTED,
+ gfx->samples,
pRects[r].baseArrayLayer,
pRects[r].layerCount,
offset.x, offset.y,
@@ -1263,28 +1893,213 @@ clear_color_attachment(struct anv_cmd_buffer *cmd_buffer,
}
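+/* Fast clear a depth/stencil image through the HiZ path, emitting the
+ * depth-cache flushes the PRMs require before and after the clear.
+ */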
static void
+anv_fast_clear_depth_stencil(struct anv_cmd_buffer *cmd_buffer,
+ struct blorp_batch *batch,
+ const struct anv_image *image,
+ VkImageAspectFlags aspects,
+ uint32_t level,
+ uint32_t base_layer, uint32_t layer_count,
+ VkRect2D area, uint8_t stencil_value)
+{
+ assert(image->vk.aspects & (VK_IMAGE_ASPECT_DEPTH_BIT |
+ VK_IMAGE_ASPECT_STENCIL_BIT));
+
+ struct blorp_surf depth = {};
+ if (aspects & VK_IMAGE_ASPECT_DEPTH_BIT) {
+ const uint32_t plane =
+ anv_image_aspect_to_plane(image, VK_IMAGE_ASPECT_DEPTH_BIT);
+ assert(base_layer + layer_count <=
+ anv_image_aux_layers(image, VK_IMAGE_ASPECT_DEPTH_BIT, level));
+ get_blorp_surf_for_anv_image(cmd_buffer,
+ image, VK_IMAGE_ASPECT_DEPTH_BIT,
+ 0, ANV_IMAGE_LAYOUT_EXPLICIT_AUX,
+ image->planes[plane].aux_usage, &depth);
+ }
+
+ struct blorp_surf stencil = {};
+ if (aspects & VK_IMAGE_ASPECT_STENCIL_BIT) {
+ const uint32_t plane =
+ anv_image_aspect_to_plane(image, VK_IMAGE_ASPECT_STENCIL_BIT);
+ get_blorp_surf_for_anv_image(cmd_buffer,
+ image, VK_IMAGE_ASPECT_STENCIL_BIT,
+ 0, ANV_IMAGE_LAYOUT_EXPLICIT_AUX,
+ image->planes[plane].aux_usage, &stencil);
+ }
+
+ /* From the Sky Lake PRM Volume 7, "Depth Buffer Clear":
+ *
+ * "The following is required when performing a depth buffer clear with
+ * using the WM_STATE or 3DSTATE_WM:
+ *
+ * * If other rendering operations have preceded this clear, a
+ * PIPE_CONTROL with depth cache flush enabled, Depth Stall bit
+ * enabled must be issued before the rectangle primitive used for
+ * the depth buffer clear operation.
+ * * [...]"
+ *
+ * Even though the PRM only says that this is required if using 3DSTATE_WM
+ * and a 3DPRIMITIVE, the GPU appears to also need this to avoid occasional
+ * hangs when doing a clear with WM_HZ_OP.
+ */
+ anv_add_pending_pipe_bits(cmd_buffer,
+ ANV_PIPE_DEPTH_CACHE_FLUSH_BIT |
+ ANV_PIPE_DEPTH_STALL_BIT,
+ "before clear hiz");
+
+ if ((aspects & VK_IMAGE_ASPECT_DEPTH_BIT) &&
+ depth.aux_usage == ISL_AUX_USAGE_HIZ_CCS_WT) {
+ /* From Bspec 47010 (Depth Buffer Clear):
+ *
+ * Since the fast clear cycles to CCS are not cached in TileCache,
+ * any previous depth buffer writes to overlapping pixels must be
+ * flushed out of TileCache before a succeeding Depth Buffer Clear.
+ * This restriction only applies to Depth Buffer with write-thru
+ * enabled, since fast clears to CCS only occur for write-thru mode.
+ *
+ * There may have been a write to this depth buffer. Flush it from the
+ * tile cache just in case.
+ *
+ * Set the CS stall bit to guarantee that the fast clear starts execution
+ * only after the tile cache flush has completed.
+ *
+ * There is no Bspec requirement to flush the data cache, but experiments
+ * show that flushing the data cache helps to resolve the corruption.
+ */
+ unsigned wa_flush = cmd_buffer->device->info->verx10 >= 125 ?
+ ANV_PIPE_DATA_CACHE_FLUSH_BIT : 0;
+ anv_add_pending_pipe_bits(cmd_buffer,
+ ANV_PIPE_DEPTH_CACHE_FLUSH_BIT |
+ ANV_PIPE_CS_STALL_BIT |
+ ANV_PIPE_TILE_CACHE_FLUSH_BIT |
+ wa_flush,
+ "before clear hiz_ccs_wt");
+ }
+
+ blorp_hiz_clear_depth_stencil(batch, &depth, &stencil,
+ level, base_layer, layer_count,
+ area.offset.x, area.offset.y,
+ area.offset.x + area.extent.width,
+ area.offset.y + area.extent.height,
+ aspects & VK_IMAGE_ASPECT_DEPTH_BIT,
+ ANV_HZ_FC_VAL,
+ aspects & VK_IMAGE_ASPECT_STENCIL_BIT,
+ stencil_value);
+
+ /* From the SKL PRM, Depth Buffer Clear:
+ *
+ * "Depth Buffer Clear Workaround
+ *
+ * Depth buffer clear pass using any of the methods (WM_STATE,
+ * 3DSTATE_WM or 3DSTATE_WM_HZ_OP) must be followed by a PIPE_CONTROL
+ * command with DEPTH_STALL bit and Depth FLUSH bits “set” before
+ * starting to render. DepthStall and DepthFlush are not needed between
+ * consecutive depth clear passes nor is it required if the depth-clear
+ * pass was done with “full_surf_clear” bit set in the
+ * 3DSTATE_WM_HZ_OP."
+ *
+ * Even though the PRM provides a bunch of conditions under which this is
+ * supposedly unnecessary, we choose to perform the flush unconditionally
+ * just to be safe.
+ *
+ * From Bspec 46959, a programming note applicable to Gfx12+:
+ *
+ * "Since HZ_OP has to be sent twice (first time set the clear/resolve state
+ * and 2nd time to clear the state), and HW internally flushes the depth
+ * cache on HZ_OP, there is no need to explicitly send a Depth Cache flush
+ * after Clear or Resolve."
+ */
+ if (cmd_buffer->device->info->verx10 < 120) {
+ anv_add_pending_pipe_bits(cmd_buffer,
+ ANV_PIPE_DEPTH_CACHE_FLUSH_BIT |
+ ANV_PIPE_DEPTH_STALL_BIT,
+ "after clear hiz");
+ }
+}
+
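+/* Check whether a vkCmdClearAttachments depth/stencil clear can take the HiZ
+ * fast clear path for the currently bound depth/stencil attachment.
+ */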
+static bool
+can_hiz_clear_att(struct anv_cmd_buffer *cmd_buffer,
+ struct blorp_batch *batch,
+ const struct anv_attachment *ds_att,
+ const VkClearAttachment *attachment,
+ uint32_t rectCount, const VkClearRect *pRects)
+{
+ if (INTEL_DEBUG(DEBUG_NO_FAST_CLEAR))
+ return false;
+
+ /* From Bspec's section MI_PREDICATE:
+ *
+ * "The MI_PREDICATE command is used to control the Predicate state bit,
+ * which in turn can be used to enable/disable the processing of
+ * 3DPRIMITIVE commands."
+ *
+ * Also from BDW/CHV Bspec's 3DSTATE_WM_HZ_OP programming notes:
+ *
+ * "This command does NOT support predication from the use of the
+ * MI_PREDICATE register. To predicate depth clears and resolves on you
+ * must fall back to using the 3D_PRIMITIVE or GPGPU_WALKER commands."
+ *
+ * Since BLORP's predication is currently dependent on MI_PREDICATE, fall
+ * back to the slow depth clear path when the BLORP_BATCH_PREDICATE_ENABLE
+ * flag is set.
+ */
+ if (batch->flags & BLORP_BATCH_PREDICATE_ENABLE)
+ return false;
+
+ if (rectCount > 1) {
+ anv_perf_warn(VK_LOG_OBJS(&cmd_buffer->device->vk.base),
+ "Fast clears for vkCmdClearAttachments supported only for rectCount == 1");
+ return false;
+ }
+
+ /* When the BLORP_BATCH_NO_EMIT_DEPTH_STENCIL flag is set, BLORP can only
+ * clear the first slice of the currently configured depth/stencil view.
+ */
+ assert(batch->flags & BLORP_BATCH_NO_EMIT_DEPTH_STENCIL);
+ if (pRects[0].layerCount > 1 || pRects[0].baseArrayLayer > 0)
+ return false;
+
+ return anv_can_hiz_clear_ds_view(cmd_buffer->device, ds_att->iview,
+ ds_att->layout,
+ attachment->aspectMask,
+ attachment->clearValue.depthStencil.depth,
+ pRects->rect,
+ cmd_buffer->queue_family->queueFlags);
+}
+
+static void
clear_depth_stencil_attachment(struct anv_cmd_buffer *cmd_buffer,
struct blorp_batch *batch,
const VkClearAttachment *attachment,
uint32_t rectCount, const VkClearRect *pRects)
{
static const union isl_color_value color_value = { .u32 = { 0, } };
- const struct anv_subpass *subpass = cmd_buffer->state.subpass;
- if (!subpass->depth_stencil_attachment)
+ struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
+ const struct anv_attachment *d_att = &gfx->depth_att;
+ const struct anv_attachment *s_att = &gfx->stencil_att;
+ if (d_att->vk_format == VK_FORMAT_UNDEFINED &&
+ s_att->vk_format == VK_FORMAT_UNDEFINED)
return;
- const uint32_t att_idx = subpass->depth_stencil_attachment->attachment;
- assert(att_idx != VK_ATTACHMENT_UNUSED);
- struct anv_render_pass_attachment *pass_att =
- &cmd_buffer->state.pass->attachments[att_idx];
+ const struct anv_attachment *ds_att = d_att->iview ? d_att : s_att;
+ if (ds_att->iview &&
+ can_hiz_clear_att(cmd_buffer, batch, ds_att, attachment, rectCount, pRects)) {
+ anv_fast_clear_depth_stencil(cmd_buffer, batch, ds_att->iview->image,
+ attachment->aspectMask,
+ ds_att->iview->planes[0].isl.base_level,
+ ds_att->iview->planes[0].isl.base_array_layer,
+ pRects[0].layerCount, pRects->rect,
+ attachment->clearValue.depthStencil.stencil);
+ return;
+ }
bool clear_depth = attachment->aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT;
bool clear_stencil = attachment->aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT;
enum isl_format depth_format = ISL_FORMAT_UNSUPPORTED;
- if (clear_depth) {
- depth_format = anv_get_isl_format(&cmd_buffer->device->info,
- pass_att->format,
+ if (d_att->vk_format != VK_FORMAT_UNDEFINED) {
+ depth_format = anv_get_isl_format(cmd_buffer->device->info,
+ d_att->vk_format,
VK_IMAGE_ASPECT_DEPTH_BIT,
VK_IMAGE_TILING_OPTIMAL);
}
@@ -1292,20 +2107,21 @@ clear_depth_stencil_attachment(struct anv_cmd_buffer *cmd_buffer,
uint32_t binding_table;
VkResult result =
binding_table_for_surface_state(cmd_buffer,
- cmd_buffer->state.null_surface_state,
+ gfx->null_surface_state,
&binding_table);
if (result != VK_SUCCESS)
return;
/* If multiview is enabled we ignore baseArrayLayer and layerCount */
- if (subpass->view_mask) {
- u_foreach_bit(view_idx, subpass->view_mask) {
+ if (gfx->view_mask) {
+ u_foreach_bit(view_idx, gfx->view_mask) {
for (uint32_t r = 0; r < rectCount; ++r) {
const VkOffset2D offset = pRects[r].rect.offset;
const VkExtent2D extent = pRects[r].rect.extent;
VkClearDepthStencilValue value = attachment->clearValue.depthStencil;
blorp_clear_attachments(batch, binding_table,
- depth_format, pass_att->samples,
+ depth_format,
+ gfx->samples,
view_idx, 1,
offset.x, offset.y,
offset.x + extent.width,
@@ -1324,7 +2140,8 @@ clear_depth_stencil_attachment(struct anv_cmd_buffer *cmd_buffer,
VkClearDepthStencilValue value = attachment->clearValue.depthStencil;
assert(pRects[r].layerCount != VK_REMAINING_ARRAY_LAYERS);
blorp_clear_attachments(batch, binding_table,
- depth_format, pass_att->samples,
+ depth_format,
+ gfx->samples,
pRects[r].baseArrayLayer,
pRects[r].layerCount,
offset.x, offset.y,
@@ -1353,7 +2170,7 @@ void anv_CmdClearAttachments(
anv_cmd_emit_conditional_render_predicate(cmd_buffer);
flags |= BLORP_BATCH_PREDICATE_ENABLE;
}
- blorp_batch_init(&cmd_buffer->device->blorp, &batch, cmd_buffer, flags);
+ anv_blorp_batch_init(cmd_buffer, &batch, flags);
for (uint32_t a = 0; a < attachmentCount; ++a) {
if (pAttachments[a].aspectMask & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV) {
@@ -1368,21 +2185,17 @@ void anv_CmdClearAttachments(
}
}
- blorp_batch_finish(&batch);
+ anv_blorp_batch_finish(&batch);
}
-enum subpass_stage {
- SUBPASS_STAGE_LOAD,
- SUBPASS_STAGE_DRAW,
- SUBPASS_STAGE_RESOLVE,
-};
-
-void
+static void
anv_image_msaa_resolve(struct anv_cmd_buffer *cmd_buffer,
const struct anv_image *src_image,
+ enum isl_format src_format_override,
enum isl_aux_usage src_aux_usage,
uint32_t src_level, uint32_t src_base_layer,
const struct anv_image *dst_image,
+ enum isl_format dst_format_override,
enum isl_aux_usage dst_aux_usage,
uint32_t dst_level, uint32_t dst_base_layer,
VkImageAspectFlagBits aspect,
@@ -1393,16 +2206,16 @@ anv_image_msaa_resolve(struct anv_cmd_buffer *cmd_buffer,
enum blorp_filter filter)
{
struct blorp_batch batch;
- blorp_batch_init(&cmd_buffer->device->blorp, &batch, cmd_buffer, 0);
+ anv_blorp_batch_init(cmd_buffer, &batch, 0);
+ assert((batch.flags & BLORP_BATCH_USE_COMPUTE) == 0);
assert(src_image->vk.image_type == VK_IMAGE_TYPE_2D);
assert(src_image->vk.samples > 1);
assert(dst_image->vk.image_type == VK_IMAGE_TYPE_2D);
assert(dst_image->vk.samples == 1);
- assert(src_image->n_planes == dst_image->n_planes);
struct blorp_surf src_surf, dst_surf;
- get_blorp_surf_for_anv_image(cmd_buffer->device, src_image, aspect,
+ get_blorp_surf_for_anv_image(cmd_buffer, src_image, aspect,
VK_IMAGE_USAGE_TRANSFER_SRC_BIT,
ANV_IMAGE_LAYOUT_EXPLICIT_AUX,
src_aux_usage, &src_surf);
@@ -1411,7 +2224,7 @@ anv_image_msaa_resolve(struct anv_cmd_buffer *cmd_buffer,
anv_image_get_clear_color_addr(cmd_buffer->device, src_image,
VK_IMAGE_ASPECT_COLOR_BIT));
}
- get_blorp_surf_for_anv_image(cmd_buffer->device, dst_image, aspect,
+ get_blorp_surf_for_anv_image(cmd_buffer, dst_image, aspect,
VK_IMAGE_USAGE_TRANSFER_DST_BIT,
ANV_IMAGE_LAYOUT_EXPLICIT_AUX,
dst_aux_usage, &dst_surf);
@@ -1435,15 +2248,105 @@ anv_image_msaa_resolve(struct anv_cmd_buffer *cmd_buffer,
for (uint32_t l = 0; l < layer_count; l++) {
blorp_blit(&batch,
&src_surf, src_level, src_base_layer + l,
- ISL_FORMAT_UNSUPPORTED, ISL_SWIZZLE_IDENTITY,
+ src_format_override, ISL_SWIZZLE_IDENTITY,
&dst_surf, dst_level, dst_base_layer + l,
- ISL_FORMAT_UNSUPPORTED, ISL_SWIZZLE_IDENTITY,
+ dst_format_override, ISL_SWIZZLE_IDENTITY,
src_x, src_y, src_x + width, src_y + height,
dst_x, dst_y, dst_x + width, dst_y + height,
filter, false, false);
}
- blorp_batch_finish(&batch);
+ anv_blorp_batch_finish(&batch);
+}
+
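+/* Translate a Vulkan resolve mode into the BLORP filter used for the
+ * multisample resolve blit.
+ */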
+static enum blorp_filter
+vk_to_blorp_resolve_mode(VkResolveModeFlagBits vk_mode)
+{
+ switch (vk_mode) {
+ case VK_RESOLVE_MODE_SAMPLE_ZERO_BIT:
+ return BLORP_FILTER_SAMPLE_0;
+ case VK_RESOLVE_MODE_AVERAGE_BIT:
+ return BLORP_FILTER_AVERAGE;
+ case VK_RESOLVE_MODE_MIN_BIT:
+ return BLORP_FILTER_MIN_SAMPLE;
+ case VK_RESOLVE_MODE_MAX_BIT:
+ return BLORP_FILTER_MAX_SAMPLE;
+ default:
+ return BLORP_FILTER_NONE;
+ }
+}
+
+void
+anv_attachment_msaa_resolve(struct anv_cmd_buffer *cmd_buffer,
+ const struct anv_attachment *att,
+ VkImageLayout layout,
+ VkImageAspectFlagBits aspect)
+{
+ struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
+ const struct anv_image_view *src_iview = att->iview;
+ const struct anv_image_view *dst_iview = att->resolve_iview;
+
+ enum isl_aux_usage src_aux_usage =
+ anv_layout_to_aux_usage(cmd_buffer->device->info,
+ src_iview->image, aspect,
+ VK_IMAGE_USAGE_TRANSFER_SRC_BIT,
+ layout,
+ cmd_buffer->queue_family->queueFlags);
+
+ enum isl_aux_usage dst_aux_usage =
+ anv_layout_to_aux_usage(cmd_buffer->device->info,
+ dst_iview->image, aspect,
+ VK_IMAGE_USAGE_TRANSFER_DST_BIT,
+ att->resolve_layout,
+ cmd_buffer->queue_family->queueFlags);
+
+ enum blorp_filter filter = vk_to_blorp_resolve_mode(att->resolve_mode);
+
+ /* Depth/stencil resolves should not use the view format override because
+ * the two aspects come in pairs.
+ */
+ enum isl_format src_format = ISL_FORMAT_UNSUPPORTED;
+ enum isl_format dst_format = ISL_FORMAT_UNSUPPORTED;
+ if (!(aspect & (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT))) {
+ src_format = src_iview->planes[0].isl.format;
+ dst_format = dst_iview->planes[0].isl.format;
+ }
+
+ const VkRect2D render_area = gfx->render_area;
+ if (gfx->view_mask == 0) {
+ anv_image_msaa_resolve(cmd_buffer,
+ src_iview->image, src_format, src_aux_usage,
+ src_iview->planes[0].isl.base_level,
+ src_iview->planes[0].isl.base_array_layer,
+ dst_iview->image, dst_format, dst_aux_usage,
+ dst_iview->planes[0].isl.base_level,
+ dst_iview->planes[0].isl.base_array_layer,
+ aspect,
+ render_area.offset.x, render_area.offset.y,
+ render_area.offset.x, render_area.offset.y,
+ render_area.extent.width,
+ render_area.extent.height,
+ gfx->layer_count, filter);
+ } else {
+ uint32_t res_view_mask = gfx->view_mask;
+ while (res_view_mask) {
+ int i = u_bit_scan(&res_view_mask);
+
+ anv_image_msaa_resolve(cmd_buffer,
+ src_iview->image, src_format, src_aux_usage,
+ src_iview->planes[0].isl.base_level,
+ src_iview->planes[0].isl.base_array_layer + i,
+ dst_iview->image, dst_format, dst_aux_usage,
+ dst_iview->planes[0].isl.base_level,
+ dst_iview->planes[0].isl.base_array_layer + i,
+ aspect,
+ render_area.offset.x, render_area.offset.y,
+ render_area.offset.x, render_area.offset.y,
+ render_area.extent.width,
+ render_area.extent.height,
+ 1, filter);
+ }
+ }
}
static void
@@ -1452,7 +2355,7 @@ resolve_image(struct anv_cmd_buffer *cmd_buffer,
VkImageLayout src_image_layout,
struct anv_image *dst_image,
VkImageLayout dst_image_layout,
- const VkImageResolve2KHR *region)
+ const VkImageResolve2 *region)
{
assert(region->srcSubresource.aspectMask == region->dstSubresource.aspectMask);
assert(vk_image_subresource_layer_count(&src_image->vk, &region->srcSubresource) ==
@@ -1464,21 +2367,23 @@ resolve_image(struct anv_cmd_buffer *cmd_buffer,
anv_foreach_image_aspect_bit(aspect_bit, src_image,
region->srcSubresource.aspectMask) {
enum isl_aux_usage src_aux_usage =
- anv_layout_to_aux_usage(&cmd_buffer->device->info, src_image,
+ anv_layout_to_aux_usage(cmd_buffer->device->info, src_image,
(1 << aspect_bit),
VK_IMAGE_USAGE_TRANSFER_SRC_BIT,
- src_image_layout);
+ src_image_layout,
+ cmd_buffer->queue_family->queueFlags);
enum isl_aux_usage dst_aux_usage =
- anv_layout_to_aux_usage(&cmd_buffer->device->info, dst_image,
+ anv_layout_to_aux_usage(cmd_buffer->device->info, dst_image,
(1 << aspect_bit),
VK_IMAGE_USAGE_TRANSFER_DST_BIT,
- dst_image_layout);
+ dst_image_layout,
+ cmd_buffer->queue_family->queueFlags);
anv_image_msaa_resolve(cmd_buffer,
- src_image, src_aux_usage,
+ src_image, ISL_FORMAT_UNSUPPORTED, src_aux_usage,
region->srcSubresource.mipLevel,
region->srcSubresource.baseArrayLayer,
- dst_image, dst_aux_usage,
+ dst_image, ISL_FORMAT_UNSUPPORTED, dst_aux_usage,
region->dstSubresource.mipLevel,
region->dstSubresource.baseArrayLayer,
(1 << aspect_bit),
@@ -1492,9 +2397,9 @@ resolve_image(struct anv_cmd_buffer *cmd_buffer,
}
}
-void anv_CmdResolveImage2KHR(
+void anv_CmdResolveImage2(
VkCommandBuffer commandBuffer,
- const VkResolveImageInfo2KHR* pResolveImageInfo)
+ const VkResolveImageInfo2* pResolveImageInfo)
{
ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
ANV_FROM_HANDLE(anv_image, src_image, pResolveImageInfo->srcImage);
@@ -1509,63 +2414,6 @@ void anv_CmdResolveImage2KHR(
}
void
-anv_image_copy_to_shadow(struct anv_cmd_buffer *cmd_buffer,
- const struct anv_image *image,
- VkImageAspectFlagBits aspect,
- uint32_t base_level, uint32_t level_count,
- uint32_t base_layer, uint32_t layer_count)
-{
- struct blorp_batch batch;
- blorp_batch_init(&cmd_buffer->device->blorp, &batch, cmd_buffer, 0);
-
- /* We don't know who touched the main surface last so flush a bunch of
- * caches to ensure we get good data.
- */
- anv_add_pending_pipe_bits(cmd_buffer,
- ANV_PIPE_DEPTH_CACHE_FLUSH_BIT |
- ANV_PIPE_HDC_PIPELINE_FLUSH_BIT |
- ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT |
- ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT,
- "before copy_to_shadow");
-
- struct blorp_surf surf;
- get_blorp_surf_for_anv_image(cmd_buffer->device,
- image, aspect,
- VK_IMAGE_USAGE_TRANSFER_SRC_BIT,
- VK_IMAGE_LAYOUT_GENERAL,
- ISL_AUX_USAGE_NONE, &surf);
- assert(surf.aux_usage == ISL_AUX_USAGE_NONE);
-
- struct blorp_surf shadow_surf;
- get_blorp_surf_for_anv_shadow_image(cmd_buffer->device,
- image, aspect, &shadow_surf);
-
- for (uint32_t l = 0; l < level_count; l++) {
- const uint32_t level = base_level + l;
-
- const VkExtent3D extent = vk_image_mip_level_extent(&image->vk, level);
-
- if (image->vk.image_type == VK_IMAGE_TYPE_3D)
- layer_count = extent.depth;
-
- for (uint32_t a = 0; a < layer_count; a++) {
- const uint32_t layer = base_layer + a;
-
- blorp_copy(&batch, &surf, level, layer,
- &shadow_surf, level, layer,
- 0, 0, 0, 0, extent.width, extent.height);
- }
- }
-
- /* We just wrote to the buffer with the render cache. Flush it. */
- anv_add_pending_pipe_bits(cmd_buffer,
- ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT,
- "after copy_to_shadow");
-
- blorp_batch_finish(&batch);
-}
-
-void
anv_image_clear_color(struct anv_cmd_buffer *cmd_buffer,
const struct anv_image *image,
VkImageAspectFlagBits aspect,
@@ -1580,10 +2428,10 @@ anv_image_clear_color(struct anv_cmd_buffer *cmd_buffer,
assert(image->n_planes == 1);
struct blorp_batch batch;
- blorp_batch_init(&cmd_buffer->device->blorp, &batch, cmd_buffer, 0);
+ anv_blorp_batch_init(cmd_buffer, &batch, 0);
struct blorp_surf surf;
- get_blorp_surf_for_anv_image(cmd_buffer->device, image, aspect,
+ get_blorp_surf_for_anv_image(cmd_buffer, image, aspect,
VK_IMAGE_USAGE_TRANSFER_DST_BIT,
ANV_IMAGE_LAYOUT_EXPLICIT_AUX,
aux_usage, &surf);
@@ -1595,9 +2443,9 @@ anv_image_clear_color(struct anv_cmd_buffer *cmd_buffer,
area.offset.x, area.offset.y,
area.offset.x + area.extent.width,
area.offset.y + area.extent.height,
- clear_color, NULL);
+ clear_color, 0 /* color_write_disable */);
- blorp_batch_finish(&batch);
+ anv_blorp_batch_finish(&batch);
}
void
@@ -1614,11 +2462,12 @@ anv_image_clear_depth_stencil(struct anv_cmd_buffer *cmd_buffer,
VK_IMAGE_ASPECT_STENCIL_BIT));
struct blorp_batch batch;
- blorp_batch_init(&cmd_buffer->device->blorp, &batch, cmd_buffer, 0);
+ anv_blorp_batch_init(cmd_buffer, &batch, 0);
+ assert((batch.flags & BLORP_BATCH_USE_COMPUTE) == 0);
struct blorp_surf depth = {};
if (aspects & VK_IMAGE_ASPECT_DEPTH_BIT) {
- get_blorp_surf_for_anv_image(cmd_buffer->device,
+ get_blorp_surf_for_anv_image(cmd_buffer,
image, VK_IMAGE_ASPECT_DEPTH_BIT,
0, ANV_IMAGE_LAYOUT_EXPLICIT_AUX,
depth_aux_usage, &depth);
@@ -1628,7 +2477,7 @@ anv_image_clear_depth_stencil(struct anv_cmd_buffer *cmd_buffer,
if (aspects & VK_IMAGE_ASPECT_STENCIL_BIT) {
const uint32_t plane =
anv_image_aspect_to_plane(image, VK_IMAGE_ASPECT_STENCIL_BIT);
- get_blorp_surf_for_anv_image(cmd_buffer->device,
+ get_blorp_surf_for_anv_image(cmd_buffer,
image, VK_IMAGE_ASPECT_STENCIL_BIT,
0, ANV_IMAGE_LAYOUT_EXPLICIT_AUX,
image->planes[plane].aux_usage, &stencil);
@@ -1659,28 +2508,10 @@ anv_image_clear_depth_stencil(struct anv_cmd_buffer *cmd_buffer,
*/
anv_add_pending_pipe_bits(cmd_buffer,
ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT |
- ANV_PIPE_TILE_CACHE_FLUSH_BIT |
ANV_PIPE_END_OF_PIPE_SYNC_BIT,
"after clear DS");
- struct blorp_surf stencil_shadow;
- if ((aspects & VK_IMAGE_ASPECT_STENCIL_BIT) &&
- get_blorp_surf_for_anv_shadow_image(cmd_buffer->device, image,
- VK_IMAGE_ASPECT_STENCIL_BIT,
- &stencil_shadow)) {
- union isl_color_value stencil_color = {
- .u32 = { stencil_value },
- };
- blorp_clear(&batch, &stencil_shadow,
- ISL_FORMAT_R8_UINT, ISL_SWIZZLE_IDENTITY,
- level, base_layer, layer_count,
- area.offset.x, area.offset.y,
- area.offset.x + area.extent.width,
- area.offset.y + area.extent.height,
- stencil_color, NULL);
- }
-
- blorp_batch_finish(&batch);
+ anv_blorp_batch_finish(&batch);
}
void
@@ -1696,17 +2527,18 @@ anv_image_hiz_op(struct anv_cmd_buffer *cmd_buffer,
assert(plane == 0);
struct blorp_batch batch;
- blorp_batch_init(&cmd_buffer->device->blorp, &batch, cmd_buffer, 0);
+ anv_blorp_batch_init(cmd_buffer, &batch, 0);
+ assert((batch.flags & BLORP_BATCH_USE_COMPUTE) == 0);
struct blorp_surf surf;
- get_blorp_surf_for_anv_image(cmd_buffer->device,
+ get_blorp_surf_for_anv_image(cmd_buffer,
image, VK_IMAGE_ASPECT_DEPTH_BIT,
0, ANV_IMAGE_LAYOUT_EXPLICIT_AUX,
image->planes[plane].aux_usage, &surf);
blorp_hiz_op(&batch, &surf, level, base_layer, layer_count, hiz_op);
- blorp_batch_finish(&batch);
+ anv_blorp_batch_finish(&batch);
}
void
@@ -1717,86 +2549,14 @@ anv_image_hiz_clear(struct anv_cmd_buffer *cmd_buffer,
uint32_t base_layer, uint32_t layer_count,
VkRect2D area, uint8_t stencil_value)
{
- assert(image->vk.aspects & (VK_IMAGE_ASPECT_DEPTH_BIT |
- VK_IMAGE_ASPECT_STENCIL_BIT));
-
struct blorp_batch batch;
- blorp_batch_init(&cmd_buffer->device->blorp, &batch, cmd_buffer, 0);
+ anv_blorp_batch_init(cmd_buffer, &batch, 0);
+ assert((batch.flags & BLORP_BATCH_USE_COMPUTE) == 0);
- struct blorp_surf depth = {};
- if (aspects & VK_IMAGE_ASPECT_DEPTH_BIT) {
- const uint32_t plane =
- anv_image_aspect_to_plane(image, VK_IMAGE_ASPECT_DEPTH_BIT);
- assert(base_layer + layer_count <=
- anv_image_aux_layers(image, VK_IMAGE_ASPECT_DEPTH_BIT, level));
- get_blorp_surf_for_anv_image(cmd_buffer->device,
- image, VK_IMAGE_ASPECT_DEPTH_BIT,
- 0, ANV_IMAGE_LAYOUT_EXPLICIT_AUX,
- image->planes[plane].aux_usage, &depth);
- }
-
- struct blorp_surf stencil = {};
- if (aspects & VK_IMAGE_ASPECT_STENCIL_BIT) {
- const uint32_t plane =
- anv_image_aspect_to_plane(image, VK_IMAGE_ASPECT_STENCIL_BIT);
- get_blorp_surf_for_anv_image(cmd_buffer->device,
- image, VK_IMAGE_ASPECT_STENCIL_BIT,
- 0, ANV_IMAGE_LAYOUT_EXPLICIT_AUX,
- image->planes[plane].aux_usage, &stencil);
- }
-
- /* From the Sky Lake PRM Volume 7, "Depth Buffer Clear":
- *
- * "The following is required when performing a depth buffer clear with
- * using the WM_STATE or 3DSTATE_WM:
- *
- * * If other rendering operations have preceded this clear, a
- * PIPE_CONTROL with depth cache flush enabled, Depth Stall bit
- * enabled must be issued before the rectangle primitive used for
- * the depth buffer clear operation.
- * * [...]"
- *
- * Even though the PRM only says that this is required if using 3DSTATE_WM
- * and a 3DPRIMITIVE, the GPU appears to also need this to avoid occasional
- * hangs when doing a clear with WM_HZ_OP.
- */
- anv_add_pending_pipe_bits(cmd_buffer,
- ANV_PIPE_DEPTH_CACHE_FLUSH_BIT |
- ANV_PIPE_DEPTH_STALL_BIT,
- "before clear hiz");
+ anv_fast_clear_depth_stencil(cmd_buffer, &batch, image, aspects, level,
+ base_layer, layer_count, area, stencil_value);
- blorp_hiz_clear_depth_stencil(&batch, &depth, &stencil,
- level, base_layer, layer_count,
- area.offset.x, area.offset.y,
- area.offset.x + area.extent.width,
- area.offset.y + area.extent.height,
- aspects & VK_IMAGE_ASPECT_DEPTH_BIT,
- ANV_HZ_FC_VAL,
- aspects & VK_IMAGE_ASPECT_STENCIL_BIT,
- stencil_value);
-
- blorp_batch_finish(&batch);
-
- /* From the SKL PRM, Depth Buffer Clear:
- *
- * "Depth Buffer Clear Workaround
- *
- * Depth buffer clear pass using any of the methods (WM_STATE,
- * 3DSTATE_WM or 3DSTATE_WM_HZ_OP) must be followed by a PIPE_CONTROL
- * command with DEPTH_STALL bit and Depth FLUSH bits “set” before
- * starting to render. DepthStall and DepthFlush are not needed between
- * consecutive depth clear passes nor is it required if the depth-clear
- * pass was done with “full_surf_clear” bit set in the
- * 3DSTATE_WM_HZ_OP."
- *
- * Even though the PRM provides a bunch of conditions under which this is
- * supposedly unnecessary, we choose to perform the flush unconditionally
- * just to be safe.
- */
- anv_add_pending_pipe_bits(cmd_buffer,
- ANV_PIPE_DEPTH_CACHE_FLUSH_BIT |
- ANV_PIPE_DEPTH_STALL_BIT,
- "after clear hiz");
+ anv_blorp_batch_finish(&batch);
}
void
@@ -1808,73 +2568,16 @@ anv_image_mcs_op(struct anv_cmd_buffer *cmd_buffer,
enum isl_aux_op mcs_op, union isl_color_value *clear_value,
bool predicate)
{
- assert(image->vk.aspects == VK_IMAGE_ASPECT_COLOR_BIT);
- assert(image->vk.samples > 1);
- assert(base_layer + layer_count <= anv_image_aux_layers(image, aspect, 0));
-
- /* Multisampling with multi-planar formats is not supported */
- assert(image->n_planes == 1);
-
struct blorp_batch batch;
- blorp_batch_init(&cmd_buffer->device->blorp, &batch, cmd_buffer,
- BLORP_BATCH_PREDICATE_ENABLE * predicate +
- BLORP_BATCH_NO_UPDATE_CLEAR_COLOR * !clear_value);
-
- struct blorp_surf surf;
- get_blorp_surf_for_anv_image(cmd_buffer->device, image, aspect,
- 0, ANV_IMAGE_LAYOUT_EXPLICIT_AUX,
- ISL_AUX_USAGE_MCS, &surf);
-
- /* Blorp will store the clear color for us if we provide the clear color
- * address and we are doing a fast clear. So we save the clear value into
- * the blorp surface.
- */
- if (clear_value)
- surf.clear_color = *clear_value;
-
- /* From the Sky Lake PRM Vol. 7, "Render Target Fast Clear":
- *
- * "After Render target fast clear, pipe-control with color cache
- * write-flush must be issued before sending any DRAW commands on
- * that render target."
- *
- * This comment is a bit cryptic and doesn't really tell you what's going
- * or what's really needed. It appears that fast clear ops are not
- * properly synchronized with other drawing. This means that we cannot
- * have a fast clear operation in the pipe at the same time as other
- * regular drawing operations. We need to use a PIPE_CONTROL to ensure
- * that the contents of the previous draw hit the render target before we
- * resolve and then use a second PIPE_CONTROL after the resolve to ensure
- * that it is completed before any additional drawing occurs.
- */
- anv_add_pending_pipe_bits(cmd_buffer,
- ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT |
- ANV_PIPE_TILE_CACHE_FLUSH_BIT |
- ANV_PIPE_END_OF_PIPE_SYNC_BIT,
- "before fast clear mcs");
-
- switch (mcs_op) {
- case ISL_AUX_OP_FAST_CLEAR:
- blorp_fast_clear(&batch, &surf, format, swizzle,
- 0, base_layer, layer_count,
- 0, 0, image->vk.extent.width, image->vk.extent.height);
- break;
- case ISL_AUX_OP_PARTIAL_RESOLVE:
- blorp_mcs_partial_resolve(&batch, &surf, format,
- base_layer, layer_count);
- break;
- case ISL_AUX_OP_FULL_RESOLVE:
- case ISL_AUX_OP_AMBIGUATE:
- default:
- unreachable("Unsupported MCS operation");
- }
+ anv_blorp_batch_init(cmd_buffer, &batch,
+ BLORP_BATCH_PREDICATE_ENABLE * predicate +
+ BLORP_BATCH_NO_UPDATE_CLEAR_COLOR * !clear_value);
+ assert((batch.flags & BLORP_BATCH_USE_COMPUTE) == 0);
- anv_add_pending_pipe_bits(cmd_buffer,
- ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT |
- ANV_PIPE_END_OF_PIPE_SYNC_BIT,
- "after fast clear mcs");
+ exec_mcs_op(cmd_buffer, &batch, image, format, swizzle, aspect,
+ base_layer, layer_count, mcs_op, clear_value);
- blorp_batch_finish(&batch);
+ anv_blorp_batch_finish(&batch);
}
void
@@ -1886,83 +2589,14 @@ anv_image_ccs_op(struct anv_cmd_buffer *cmd_buffer,
enum isl_aux_op ccs_op, union isl_color_value *clear_value,
bool predicate)
{
- assert(image->vk.aspects & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV);
- assert(image->vk.samples == 1);
- assert(level < anv_image_aux_levels(image, aspect));
- /* Multi-LOD YcBcR is not allowed */
- assert(image->n_planes == 1 || level == 0);
- assert(base_layer + layer_count <=
- anv_image_aux_layers(image, aspect, level));
-
- const uint32_t plane = anv_image_aspect_to_plane(image, aspect);
-
struct blorp_batch batch;
- blorp_batch_init(&cmd_buffer->device->blorp, &batch, cmd_buffer,
- BLORP_BATCH_PREDICATE_ENABLE * predicate +
- BLORP_BATCH_NO_UPDATE_CLEAR_COLOR * !clear_value);
+ anv_blorp_batch_init(cmd_buffer, &batch,
+ BLORP_BATCH_PREDICATE_ENABLE * predicate +
+ BLORP_BATCH_NO_UPDATE_CLEAR_COLOR * !clear_value);
+ assert((batch.flags & BLORP_BATCH_USE_COMPUTE) == 0);
- struct blorp_surf surf;
- get_blorp_surf_for_anv_image(cmd_buffer->device, image, aspect,
- 0, ANV_IMAGE_LAYOUT_EXPLICIT_AUX,
- image->planes[plane].aux_usage,
- &surf);
-
- uint32_t level_width = anv_minify(surf.surf->logical_level0_px.w, level);
- uint32_t level_height = anv_minify(surf.surf->logical_level0_px.h, level);
-
- /* Blorp will store the clear color for us if we provide the clear color
- * address and we are doing a fast clear. So we save the clear value into
- * the blorp surface.
- */
- if (clear_value)
- surf.clear_color = *clear_value;
-
- /* From the Sky Lake PRM Vol. 7, "Render Target Fast Clear":
- *
- * "After Render target fast clear, pipe-control with color cache
- * write-flush must be issued before sending any DRAW commands on
- * that render target."
- *
- * This comment is a bit cryptic and doesn't really tell you what's going
- * or what's really needed. It appears that fast clear ops are not
- * properly synchronized with other drawing. This means that we cannot
- * have a fast clear operation in the pipe at the same time as other
- * regular drawing operations. We need to use a PIPE_CONTROL to ensure
- * that the contents of the previous draw hit the render target before we
- * resolve and then use a second PIPE_CONTROL after the resolve to ensure
- * that it is completed before any additional drawing occurs.
- */
- anv_add_pending_pipe_bits(cmd_buffer,
- ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT |
- ANV_PIPE_TILE_CACHE_FLUSH_BIT |
- ANV_PIPE_END_OF_PIPE_SYNC_BIT,
- "before fast clear ccs");
-
- switch (ccs_op) {
- case ISL_AUX_OP_FAST_CLEAR:
- blorp_fast_clear(&batch, &surf, format, swizzle,
- level, base_layer, layer_count,
- 0, 0, level_width, level_height);
- break;
- case ISL_AUX_OP_FULL_RESOLVE:
- case ISL_AUX_OP_PARTIAL_RESOLVE:
- blorp_ccs_resolve(&batch, &surf, level, base_layer, layer_count,
- format, ccs_op);
- break;
- case ISL_AUX_OP_AMBIGUATE:
- for (uint32_t a = 0; a < layer_count; a++) {
- const uint32_t layer = base_layer + a;
- blorp_ccs_ambiguate(&batch, &surf, level, layer);
- }
- break;
- default:
- unreachable("Unsupported CCS operation");
- }
-
- anv_add_pending_pipe_bits(cmd_buffer,
- ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT |
- ANV_PIPE_END_OF_PIPE_SYNC_BIT,
- "after fast clear ccs");
+ exec_ccs_op(cmd_buffer, &batch, image, format, swizzle, aspect, level,
+ base_layer, layer_count, ccs_op, clear_value);
- blorp_batch_finish(&batch);
+ anv_blorp_batch_finish(&batch);
}
diff --git a/src/intel/vulkan/anv_bo_sync.c b/src/intel/vulkan/anv_bo_sync.c
new file mode 100644
index 00000000000..af12c6d61dd
--- /dev/null
+++ b/src/intel/vulkan/anv_bo_sync.c
@@ -0,0 +1,240 @@
+/*
+ * Copyright © 2021 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "anv_private.h"
+
+#include "util/os_time.h"
+#include "util/perf/cpu_trace.h"
+
+static struct anv_bo_sync *
+to_anv_bo_sync(struct vk_sync *sync)
+{
+ assert(sync->type == &anv_bo_sync_type);
+ return container_of(sync, struct anv_bo_sync, sync);
+}
+
+static VkResult
+anv_bo_sync_init(struct vk_device *vk_device,
+ struct vk_sync *vk_sync,
+ uint64_t initial_value)
+{
+ struct anv_device *device = container_of(vk_device, struct anv_device, vk);
+ struct anv_bo_sync *sync = to_anv_bo_sync(vk_sync);
+
+ sync->state = initial_value ? ANV_BO_SYNC_STATE_SIGNALED :
+ ANV_BO_SYNC_STATE_RESET;
+
+ return anv_device_alloc_bo(device, "bo-sync", 4096,
+ ANV_BO_ALLOC_EXTERNAL |
+ ANV_BO_ALLOC_IMPLICIT_SYNC |
+ ANV_BO_ALLOC_INTERNAL,
+ 0 /* explicit_address */,
+ &sync->bo);
+}
+
+static void
+anv_bo_sync_finish(struct vk_device *vk_device,
+ struct vk_sync *vk_sync)
+{
+ struct anv_device *device = container_of(vk_device, struct anv_device, vk);
+ struct anv_bo_sync *sync = to_anv_bo_sync(vk_sync);
+
+ anv_device_release_bo(device, sync->bo);
+}
+
+static VkResult
+anv_bo_sync_reset(struct vk_device *vk_device,
+ struct vk_sync *vk_sync)
+{
+ struct anv_bo_sync *sync = to_anv_bo_sync(vk_sync);
+
+ sync->state = ANV_BO_SYNC_STATE_RESET;
+
+ return VK_SUCCESS;
+}
+
+static int64_t
+anv_get_relative_timeout(uint64_t abs_timeout)
+{
+ uint64_t now = os_time_get_nano();
+
+ /* We don't want negative timeouts.
+ *
+ * DRM_IOCTL_I915_GEM_WAIT uses a signed 64-bit timeout and is
+ * supposed to block indefinitely for timeouts < 0. Unfortunately,
+ * this was broken for a couple of kernel releases. Since there's
+ * no way to know whether or not the kernel we're using is one of
+ * the broken ones, the best we can do is to clamp the timeout to
+ * INT64_MAX. This limits the maximum timeout from 584 years to
+ * 292 years - likely not a big deal.
+ */
+ if (abs_timeout < now)
+ return 0;
+
+ uint64_t rel_timeout = abs_timeout - now;
+ if (rel_timeout > (uint64_t) INT64_MAX)
+ rel_timeout = INT64_MAX;
+
+ return rel_timeout;
+}
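
A side note on the arithmetic in the comment above: a 64-bit nanosecond count covers roughly 584 years, so clamping to the signed range (INT64_MAX) halves that to roughly 292. A minimal, standalone check of those figures (illustration only, not driver code):

   #include <stdint.h>
   #include <stdio.h>

   int main(void)
   {
      /* Approximate a year as 365.25 days, expressed in nanoseconds. */
      const double ns_per_year = 365.25 * 24.0 * 3600.0 * 1e9;
      printf("uint64 range: ~%.1f years\n", UINT64_MAX / ns_per_year); /* ~584.5 */
      printf("int64 range:  ~%.1f years\n", INT64_MAX / ns_per_year);  /* ~292.3 */
      return 0;
   }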
+
+static VkResult
+anv_bo_sync_wait(struct vk_device *vk_device,
+ uint32_t wait_count,
+ const struct vk_sync_wait *waits,
+ enum vk_sync_wait_flags wait_flags,
+ uint64_t abs_timeout_ns)
+{
+ struct anv_device *device = container_of(vk_device, struct anv_device, vk);
+ VkResult result;
+ MESA_TRACE_FUNC();
+
+ uint32_t pending = wait_count;
+ while (pending) {
+ pending = 0;
+ bool signaled = false;
+ for (uint32_t i = 0; i < wait_count; i++) {
+ struct anv_bo_sync *sync = to_anv_bo_sync(waits[i].sync);
+ switch (sync->state) {
+ case ANV_BO_SYNC_STATE_RESET:
+ /* This fence hasn't been submitted yet, we'll catch it the next
+ * time around. Yes, this may mean we dead-loop but, short of
+ * lots of locking and a condition variable, there's not much that
+ * we can do about that.
+ */
+ assert(!(wait_flags & VK_SYNC_WAIT_PENDING));
+ pending++;
+ continue;
+
+ case ANV_BO_SYNC_STATE_SIGNALED:
+ /* This fence is not pending. If waitAll isn't set, we can return
+ * early. Otherwise, we have to keep going.
+ */
+ if (wait_flags & VK_SYNC_WAIT_ANY)
+ return VK_SUCCESS;
+ continue;
+
+ case ANV_BO_SYNC_STATE_SUBMITTED:
+ /* These are the fences we really care about. Go ahead and wait
+ * on it until we hit a timeout.
+ */
+ if (!(wait_flags & VK_SYNC_WAIT_PENDING)) {
+ uint64_t rel_timeout = anv_get_relative_timeout(abs_timeout_ns);
+ result = anv_device_wait(device, sync->bo, rel_timeout);
+ /* This also covers VK_TIMEOUT */
+ if (result != VK_SUCCESS)
+ return result;
+
+ sync->state = ANV_BO_SYNC_STATE_SIGNALED;
+ signaled = true;
+ }
+ if (wait_flags & VK_SYNC_WAIT_ANY)
+ return VK_SUCCESS;
+ break;
+
+ default:
+ unreachable("Invalid BO sync state");
+ }
+ }
+
+ if (pending && !signaled) {
+ /* If we've hit this then someone decided to vkWaitForFences before
+ * they've actually submitted any of them to a queue. This is a
+ * fairly pessimal case, so it's ok to lock here and use a standard
+ * pthreads condition variable.
+ */
+ pthread_mutex_lock(&device->mutex);
+
+ /* It's possible that some of the fences have changed state since the
+ * last time we checked. Now that we have the lock, check for
+ * pending fences again and don't wait if it's changed.
+ */
+ uint32_t now_pending = 0;
+ for (uint32_t i = 0; i < wait_count; i++) {
+ struct anv_bo_sync *sync = to_anv_bo_sync(waits[i].sync);
+ if (sync->state == ANV_BO_SYNC_STATE_RESET)
+ now_pending++;
+ }
+ assert(now_pending <= pending);
+
+ if (now_pending == pending) {
+ struct timespec abstime = {
+ .tv_sec = abs_timeout_ns / NSEC_PER_SEC,
+ .tv_nsec = abs_timeout_ns % NSEC_PER_SEC,
+ };
+
+ ASSERTED int ret;
+ ret = pthread_cond_timedwait(&device->queue_submit,
+ &device->mutex, &abstime);
+ assert(ret != EINVAL);
+ if (os_time_get_nano() >= abs_timeout_ns) {
+ pthread_mutex_unlock(&device->mutex);
+ return VK_TIMEOUT;
+ }
+ }
+
+ pthread_mutex_unlock(&device->mutex);
+ }
+ }
+
+ return VK_SUCCESS;
+}
+
+const struct vk_sync_type anv_bo_sync_type = {
+ .size = sizeof(struct anv_bo_sync),
+ .features = VK_SYNC_FEATURE_BINARY |
+ VK_SYNC_FEATURE_GPU_WAIT |
+ VK_SYNC_FEATURE_GPU_MULTI_WAIT |
+ VK_SYNC_FEATURE_CPU_WAIT |
+ VK_SYNC_FEATURE_CPU_RESET |
+ VK_SYNC_FEATURE_WAIT_ANY |
+ VK_SYNC_FEATURE_WAIT_PENDING,
+ .init = anv_bo_sync_init,
+ .finish = anv_bo_sync_finish,
+ .reset = anv_bo_sync_reset,
+ .wait_many = anv_bo_sync_wait,
+};
+
+VkResult
+anv_create_sync_for_memory(struct vk_device *device,
+ VkDeviceMemory memory,
+ bool signal_memory,
+ struct vk_sync **sync_out)
+{
+ ANV_FROM_HANDLE(anv_device_memory, mem, memory);
+ struct anv_bo_sync *bo_sync;
+
+ bo_sync = vk_zalloc(&device->alloc, sizeof(*bo_sync), 8,
+ VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
+ if (bo_sync == NULL)
+ return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
+
+ bo_sync->sync.type = &anv_bo_sync_type;
+ bo_sync->state = signal_memory ? ANV_BO_SYNC_STATE_RESET :
+ ANV_BO_SYNC_STATE_SUBMITTED;
+ bo_sync->bo = anv_bo_ref(mem->bo);
+
+ *sync_out = &bo_sync->sync;
+
+ return VK_SUCCESS;
+}
diff --git a/src/intel/vulkan/anv_cmd_buffer.c b/src/intel/vulkan/anv_cmd_buffer.c
index ece9dd32f96..25a79f3e52f 100644
--- a/src/intel/vulkan/anv_cmd_buffer.c
+++ b/src/intel/vulkan/anv_cmd_buffer.c
@@ -41,181 +41,6 @@
* is concerned, most of anv_cmd_buffer is magic.
*/
-/* TODO: These are taken from GLES. We should check the Vulkan spec */
-const struct anv_dynamic_state default_dynamic_state = {
- .viewport = {
- .count = 0,
- },
- .scissor = {
- .count = 0,
- },
- .line_width = 1.0f,
- .depth_bias = {
- .bias = 0.0f,
- .clamp = 0.0f,
- .slope = 0.0f,
- },
- .blend_constants = { 0.0f, 0.0f, 0.0f, 0.0f },
- .depth_bounds = {
- .min = 0.0f,
- .max = 1.0f,
- },
- .stencil_compare_mask = {
- .front = ~0u,
- .back = ~0u,
- },
- .stencil_write_mask = {
- .front = ~0u,
- .back = ~0u,
- },
- .stencil_reference = {
- .front = 0u,
- .back = 0u,
- },
- .stencil_op = {
- .front = {
- .fail_op = 0,
- .pass_op = 0,
- .depth_fail_op = 0,
- .compare_op = 0,
- },
- .back = {
- .fail_op = 0,
- .pass_op = 0,
- .depth_fail_op = 0,
- .compare_op = 0,
- },
- },
- .line_stipple = {
- .factor = 0u,
- .pattern = 0u,
- },
- .cull_mode = 0,
- .front_face = 0,
- .primitive_topology = 0,
- .depth_test_enable = 0,
- .depth_write_enable = 0,
- .depth_compare_op = 0,
- .depth_bounds_test_enable = 0,
- .stencil_test_enable = 0,
- .dyn_vbo_stride = 0,
- .dyn_vbo_size = 0,
- .color_writes = 0xff,
- .raster_discard = 0,
- .depth_bias_enable = 0,
- .primitive_restart_enable = 0,
- .logic_op = 0,
-};
-
-/**
- * Copy the dynamic state from src to dest based on the copy_mask.
- *
- * Avoid copying states that have not changed, except for VIEWPORT, SCISSOR and
- * BLEND_CONSTANTS (always copy them if they are in the copy_mask).
- *
- * Returns a mask of the states which changed.
- */
-anv_cmd_dirty_mask_t
-anv_dynamic_state_copy(struct anv_dynamic_state *dest,
- const struct anv_dynamic_state *src,
- anv_cmd_dirty_mask_t copy_mask)
-{
- anv_cmd_dirty_mask_t changed = 0;
-
- if (copy_mask & ANV_CMD_DIRTY_DYNAMIC_VIEWPORT) {
- dest->viewport.count = src->viewport.count;
- typed_memcpy(dest->viewport.viewports, src->viewport.viewports,
- src->viewport.count);
- changed |= ANV_CMD_DIRTY_DYNAMIC_VIEWPORT;
- }
-
- if (copy_mask & ANV_CMD_DIRTY_DYNAMIC_SCISSOR) {
- dest->scissor.count = src->scissor.count;
- typed_memcpy(dest->scissor.scissors, src->scissor.scissors,
- src->scissor.count);
- changed |= ANV_CMD_DIRTY_DYNAMIC_SCISSOR;
- }
-
- if (copy_mask & ANV_CMD_DIRTY_DYNAMIC_BLEND_CONSTANTS) {
- typed_memcpy(dest->blend_constants, src->blend_constants, 4);
- changed |= ANV_CMD_DIRTY_DYNAMIC_BLEND_CONSTANTS;
- }
-
-#define ANV_CMP_COPY(field, flag) \
- if (copy_mask & flag) { \
- if (dest->field != src->field) { \
- dest->field = src->field; \
- changed |= flag; \
- } \
- }
-
- ANV_CMP_COPY(line_width, ANV_CMD_DIRTY_DYNAMIC_LINE_WIDTH);
-
- ANV_CMP_COPY(depth_bias.bias, ANV_CMD_DIRTY_DYNAMIC_DEPTH_BIAS);
- ANV_CMP_COPY(depth_bias.clamp, ANV_CMD_DIRTY_DYNAMIC_DEPTH_BIAS);
- ANV_CMP_COPY(depth_bias.slope, ANV_CMD_DIRTY_DYNAMIC_DEPTH_BIAS);
-
- ANV_CMP_COPY(depth_bounds.min, ANV_CMD_DIRTY_DYNAMIC_DEPTH_BOUNDS);
- ANV_CMP_COPY(depth_bounds.max, ANV_CMD_DIRTY_DYNAMIC_DEPTH_BOUNDS);
-
- ANV_CMP_COPY(stencil_compare_mask.front, ANV_CMD_DIRTY_DYNAMIC_STENCIL_COMPARE_MASK);
- ANV_CMP_COPY(stencil_compare_mask.back, ANV_CMD_DIRTY_DYNAMIC_STENCIL_COMPARE_MASK);
-
- ANV_CMP_COPY(stencil_write_mask.front, ANV_CMD_DIRTY_DYNAMIC_STENCIL_WRITE_MASK);
- ANV_CMP_COPY(stencil_write_mask.back, ANV_CMD_DIRTY_DYNAMIC_STENCIL_WRITE_MASK);
-
- ANV_CMP_COPY(stencil_reference.front, ANV_CMD_DIRTY_DYNAMIC_STENCIL_REFERENCE);
- ANV_CMP_COPY(stencil_reference.back, ANV_CMD_DIRTY_DYNAMIC_STENCIL_REFERENCE);
-
- ANV_CMP_COPY(line_stipple.factor, ANV_CMD_DIRTY_DYNAMIC_LINE_STIPPLE);
- ANV_CMP_COPY(line_stipple.pattern, ANV_CMD_DIRTY_DYNAMIC_LINE_STIPPLE);
-
- ANV_CMP_COPY(cull_mode, ANV_CMD_DIRTY_DYNAMIC_CULL_MODE);
- ANV_CMP_COPY(front_face, ANV_CMD_DIRTY_DYNAMIC_FRONT_FACE);
- ANV_CMP_COPY(primitive_topology, ANV_CMD_DIRTY_DYNAMIC_PRIMITIVE_TOPOLOGY);
- ANV_CMP_COPY(depth_test_enable, ANV_CMD_DIRTY_DYNAMIC_DEPTH_TEST_ENABLE);
- ANV_CMP_COPY(depth_write_enable, ANV_CMD_DIRTY_DYNAMIC_DEPTH_WRITE_ENABLE);
- ANV_CMP_COPY(depth_compare_op, ANV_CMD_DIRTY_DYNAMIC_DEPTH_COMPARE_OP);
- ANV_CMP_COPY(depth_bounds_test_enable, ANV_CMD_DIRTY_DYNAMIC_DEPTH_BOUNDS_TEST_ENABLE);
- ANV_CMP_COPY(stencil_test_enable, ANV_CMD_DIRTY_DYNAMIC_STENCIL_TEST_ENABLE);
-
- if (copy_mask & VK_DYNAMIC_STATE_STENCIL_OP_EXT) {
- ANV_CMP_COPY(stencil_op.front.fail_op, ANV_CMD_DIRTY_DYNAMIC_STENCIL_OP);
- ANV_CMP_COPY(stencil_op.front.pass_op, ANV_CMD_DIRTY_DYNAMIC_STENCIL_OP);
- ANV_CMP_COPY(stencil_op.front.depth_fail_op, ANV_CMD_DIRTY_DYNAMIC_STENCIL_OP);
- ANV_CMP_COPY(stencil_op.front.compare_op, ANV_CMD_DIRTY_DYNAMIC_STENCIL_OP);
- ANV_CMP_COPY(stencil_op.back.fail_op, ANV_CMD_DIRTY_DYNAMIC_STENCIL_OP);
- ANV_CMP_COPY(stencil_op.back.pass_op, ANV_CMD_DIRTY_DYNAMIC_STENCIL_OP);
- ANV_CMP_COPY(stencil_op.back.depth_fail_op, ANV_CMD_DIRTY_DYNAMIC_STENCIL_OP);
- ANV_CMP_COPY(stencil_op.back.compare_op, ANV_CMD_DIRTY_DYNAMIC_STENCIL_OP);
- }
-
- ANV_CMP_COPY(dyn_vbo_stride, ANV_CMD_DIRTY_DYNAMIC_VERTEX_INPUT_BINDING_STRIDE);
- ANV_CMP_COPY(dyn_vbo_size, ANV_CMD_DIRTY_DYNAMIC_VERTEX_INPUT_BINDING_STRIDE);
-
- ANV_CMP_COPY(raster_discard, ANV_CMD_DIRTY_DYNAMIC_RASTERIZER_DISCARD_ENABLE);
- ANV_CMP_COPY(depth_bias_enable, ANV_CMD_DIRTY_DYNAMIC_DEPTH_BIAS_ENABLE);
- ANV_CMP_COPY(primitive_restart_enable, ANV_CMD_DIRTY_DYNAMIC_PRIMITIVE_RESTART_ENABLE);
- ANV_CMP_COPY(logic_op, ANV_CMD_DIRTY_DYNAMIC_LOGIC_OP);
-
- if (copy_mask & ANV_CMD_DIRTY_DYNAMIC_SAMPLE_LOCATIONS) {
- dest->sample_locations.samples = src->sample_locations.samples;
- typed_memcpy(dest->sample_locations.locations,
- src->sample_locations.locations,
- dest->sample_locations.samples);
- changed |= ANV_CMD_DIRTY_DYNAMIC_SAMPLE_LOCATIONS;
- }
-
- ANV_CMP_COPY(color_writes, ANV_CMD_DIRTY_DYNAMIC_COLOR_BLEND_STATE);
-
- ANV_CMP_COPY(fragment_shading_rate.width, ANV_CMD_DIRTY_DYNAMIC_SHADING_RATE);
- ANV_CMP_COPY(fragment_shading_rate.height, ANV_CMD_DIRTY_DYNAMIC_SHADING_RATE);
-
-#undef ANV_CMP_COPY
-
- return changed;
-}
-
static void
anv_cmd_state_init(struct anv_cmd_buffer *cmd_buffer)
{
@@ -224,21 +49,20 @@ anv_cmd_state_init(struct anv_cmd_buffer *cmd_buffer)
memset(state, 0, sizeof(*state));
state->current_pipeline = UINT32_MAX;
- state->restart_index = UINT32_MAX;
- state->gfx.dynamic = default_dynamic_state;
+ state->gfx.restart_index = UINT32_MAX;
+ state->gfx.object_preemption = true;
+ state->gfx.dirty = 0;
+
+ memcpy(state->gfx.dyn_state.dirty,
+ cmd_buffer->device->gfx_dirty_state,
+ sizeof(state->gfx.dyn_state.dirty));
}
static void
anv_cmd_pipeline_state_finish(struct anv_cmd_buffer *cmd_buffer,
struct anv_cmd_pipeline_state *pipe_state)
{
- for (uint32_t i = 0; i < ARRAY_SIZE(pipe_state->push_descriptors); i++) {
- if (pipe_state->push_descriptors[i]) {
- anv_descriptor_set_layout_unref(cmd_buffer->device,
- pipe_state->push_descriptors[i]->set.layout);
- vk_free(&cmd_buffer->pool->alloc, pipe_state->push_descriptors[i]);
- }
- }
+ anv_push_descriptor_set_finish(&pipe_state->push_descriptor);
}
static void
@@ -248,8 +72,6 @@ anv_cmd_state_finish(struct anv_cmd_buffer *cmd_buffer)
anv_cmd_pipeline_state_finish(cmd_buffer, &state->gfx.base);
anv_cmd_pipeline_state_finish(cmd_buffer, &state->compute.base);
-
- vk_free(&cmd_buffer->pool->alloc, state->attachments);
}
static void
@@ -257,158 +79,262 @@ anv_cmd_state_reset(struct anv_cmd_buffer *cmd_buffer)
{
anv_cmd_state_finish(cmd_buffer);
anv_cmd_state_init(cmd_buffer);
+
+ cmd_buffer->last_compute_walker = NULL;
+ cmd_buffer->last_indirect_dispatch = NULL;
}
-static VkResult anv_create_cmd_buffer(
- struct anv_device * device,
- struct anv_cmd_pool * pool,
- VkCommandBufferLevel level,
- VkCommandBuffer* pCommandBuffer)
+VkResult
+anv_cmd_buffer_ensure_rcs_companion(struct anv_cmd_buffer *cmd_buffer)
{
+ if (cmd_buffer->companion_rcs_cmd_buffer)
+ return VK_SUCCESS;
+
+ VkResult result = VK_SUCCESS;
+ pthread_mutex_lock(&cmd_buffer->device->mutex);
+ VK_FROM_HANDLE(vk_command_pool, pool,
+ cmd_buffer->device->companion_rcs_cmd_pool);
+ assert(pool != NULL);
+
+ struct vk_command_buffer *tmp_cmd_buffer = NULL;
+ result = pool->command_buffer_ops->create(pool, cmd_buffer->vk.level, &tmp_cmd_buffer);
+
+ if (result != VK_SUCCESS)
+ goto unlock_and_return;
+
+ cmd_buffer->companion_rcs_cmd_buffer =
+ container_of(tmp_cmd_buffer, struct anv_cmd_buffer, vk);
+ anv_genX(cmd_buffer->device->info, cmd_buffer_begin_companion)(
+ cmd_buffer->companion_rcs_cmd_buffer, cmd_buffer->vk.level);
+
+unlock_and_return:
+ pthread_mutex_unlock(&cmd_buffer->device->mutex);
+ return result;
+}
+
+static VkResult
+anv_create_cmd_buffer(struct vk_command_pool *pool,
+ VkCommandBufferLevel level,
+ struct vk_command_buffer **cmd_buffer_out)
+{
+ struct anv_device *device =
+ container_of(pool->base.device, struct anv_device, vk);
struct anv_cmd_buffer *cmd_buffer;
VkResult result;
- cmd_buffer = vk_object_alloc(&device->vk, &pool->alloc, sizeof(*cmd_buffer),
- VK_OBJECT_TYPE_COMMAND_BUFFER);
+ cmd_buffer = vk_zalloc(&pool->alloc, sizeof(*cmd_buffer), 8,
+ VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
if (cmd_buffer == NULL)
- return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+ return vk_error(pool, VK_ERROR_OUT_OF_HOST_MEMORY);
+
+ result = vk_command_buffer_init(pool, &cmd_buffer->vk,
+ &anv_cmd_buffer_ops, level);
+ if (result != VK_SUCCESS)
+ goto fail_alloc;
+
+ cmd_buffer->vk.dynamic_graphics_state.ms.sample_locations =
+ &cmd_buffer->state.gfx.sample_locations;
+ cmd_buffer->vk.dynamic_graphics_state.vi =
+ &cmd_buffer->state.gfx.vertex_input;
cmd_buffer->batch.status = VK_SUCCESS;
+ cmd_buffer->generation.batch.status = VK_SUCCESS;
cmd_buffer->device = device;
- cmd_buffer->pool = pool;
- cmd_buffer->level = level;
+
+ assert(pool->queue_family_index < device->physical->queue.family_count);
+ cmd_buffer->queue_family =
+ &device->physical->queue.families[pool->queue_family_index];
result = anv_cmd_buffer_init_batch_bo_chain(cmd_buffer);
if (result != VK_SUCCESS)
- goto fail;
+ goto fail_vk;
anv_state_stream_init(&cmd_buffer->surface_state_stream,
- &device->surface_state_pool, 4096);
+ &device->internal_surface_state_pool, 4096);
anv_state_stream_init(&cmd_buffer->dynamic_state_stream,
&device->dynamic_state_pool, 16384);
+ anv_state_stream_init(&cmd_buffer->dynamic_state_db_stream,
+ &device->dynamic_state_db_pool, 16384);
anv_state_stream_init(&cmd_buffer->general_state_stream,
&device->general_state_pool, 16384);
+ anv_state_stream_init(&cmd_buffer->indirect_push_descriptor_stream,
+ &device->indirect_push_descriptor_pool, 4096);
+ anv_state_stream_init(&cmd_buffer->push_descriptor_buffer_stream,
+ &device->push_descriptor_buffer_pool, 4096);
- cmd_buffer->self_mod_locations = NULL;
-
- anv_cmd_state_init(cmd_buffer);
+ int success = u_vector_init_pow2(&cmd_buffer->dynamic_bos, 8,
+ sizeof(struct anv_bo *));
+ if (!success)
+ goto fail_batch_bo;
- list_addtail(&cmd_buffer->pool_link, &pool->cmd_buffers);
+ cmd_buffer->self_mod_locations = NULL;
+ cmd_buffer->companion_rcs_cmd_buffer = NULL;
+ cmd_buffer->is_companion_rcs_cmd_buffer = false;
- anv_measure_init(cmd_buffer);
+ cmd_buffer->generation.jump_addr = ANV_NULL_ADDRESS;
+ cmd_buffer->generation.return_addr = ANV_NULL_ADDRESS;
- *pCommandBuffer = anv_cmd_buffer_to_handle(cmd_buffer);
+ cmd_buffer->last_compute_walker = NULL;
+ cmd_buffer->last_indirect_dispatch = NULL;
- return VK_SUCCESS;
+ memset(&cmd_buffer->generation.shader_state, 0,
+ sizeof(cmd_buffer->generation.shader_state));
- fail:
- vk_free(&cmd_buffer->pool->alloc, cmd_buffer);
+ anv_cmd_state_init(cmd_buffer);
- return result;
-}
+ anv_measure_init(cmd_buffer);
-VkResult anv_AllocateCommandBuffers(
- VkDevice _device,
- const VkCommandBufferAllocateInfo* pAllocateInfo,
- VkCommandBuffer* pCommandBuffers)
-{
- ANV_FROM_HANDLE(anv_device, device, _device);
- ANV_FROM_HANDLE(anv_cmd_pool, pool, pAllocateInfo->commandPool);
+ u_trace_init(&cmd_buffer->trace, &device->ds.trace_context);
- VkResult result = VK_SUCCESS;
- uint32_t i;
+ *cmd_buffer_out = &cmd_buffer->vk;
- for (i = 0; i < pAllocateInfo->commandBufferCount; i++) {
- result = anv_create_cmd_buffer(device, pool, pAllocateInfo->level,
- &pCommandBuffers[i]);
- if (result != VK_SUCCESS)
- break;
- }
+ return VK_SUCCESS;
- if (result != VK_SUCCESS) {
- anv_FreeCommandBuffers(_device, pAllocateInfo->commandPool,
- i, pCommandBuffers);
- for (i = 0; i < pAllocateInfo->commandBufferCount; i++)
- pCommandBuffers[i] = VK_NULL_HANDLE;
- }
+ fail_batch_bo:
+ anv_cmd_buffer_fini_batch_bo_chain(cmd_buffer);
+ fail_vk:
+ vk_command_buffer_finish(&cmd_buffer->vk);
+ fail_alloc:
+ vk_free2(&device->vk.alloc, &pool->alloc, cmd_buffer);
return result;
}
static void
-anv_cmd_buffer_destroy(struct anv_cmd_buffer *cmd_buffer)
+destroy_cmd_buffer(struct anv_cmd_buffer *cmd_buffer)
{
- anv_measure_destroy(cmd_buffer);
+ u_trace_fini(&cmd_buffer->trace);
- list_del(&cmd_buffer->pool_link);
+ anv_measure_destroy(cmd_buffer);
anv_cmd_buffer_fini_batch_bo_chain(cmd_buffer);
anv_state_stream_finish(&cmd_buffer->surface_state_stream);
anv_state_stream_finish(&cmd_buffer->dynamic_state_stream);
+ anv_state_stream_finish(&cmd_buffer->dynamic_state_db_stream);
anv_state_stream_finish(&cmd_buffer->general_state_stream);
+ anv_state_stream_finish(&cmd_buffer->indirect_push_descriptor_stream);
+ anv_state_stream_finish(&cmd_buffer->push_descriptor_buffer_stream);
+
+ while (u_vector_length(&cmd_buffer->dynamic_bos) > 0) {
+ struct anv_bo **bo = u_vector_remove(&cmd_buffer->dynamic_bos);
+ anv_bo_pool_free((*bo)->map != NULL ?
+ &cmd_buffer->device->batch_bo_pool :
+ &cmd_buffer->device->bvh_bo_pool, *bo);
+ }
+ u_vector_finish(&cmd_buffer->dynamic_bos);
anv_cmd_state_finish(cmd_buffer);
- vk_free(&cmd_buffer->pool->alloc, cmd_buffer->self_mod_locations);
+ vk_free(&cmd_buffer->vk.pool->alloc, cmd_buffer->self_mod_locations);
- vk_object_free(&cmd_buffer->device->vk, &cmd_buffer->pool->alloc, cmd_buffer);
+ vk_command_buffer_finish(&cmd_buffer->vk);
+ vk_free(&cmd_buffer->vk.pool->alloc, cmd_buffer);
}
-void anv_FreeCommandBuffers(
- VkDevice device,
- VkCommandPool commandPool,
- uint32_t commandBufferCount,
- const VkCommandBuffer* pCommandBuffers)
+static void
+anv_cmd_buffer_destroy(struct vk_command_buffer *vk_cmd_buffer)
{
- for (uint32_t i = 0; i < commandBufferCount; i++) {
- ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, pCommandBuffers[i]);
-
- if (!cmd_buffer)
- continue;
+ struct anv_cmd_buffer *cmd_buffer =
+ container_of(vk_cmd_buffer, struct anv_cmd_buffer, vk);
+ struct anv_device *device = cmd_buffer->device;
- anv_cmd_buffer_destroy(cmd_buffer);
+ pthread_mutex_lock(&device->mutex);
+ if (cmd_buffer->companion_rcs_cmd_buffer) {
+ destroy_cmd_buffer(cmd_buffer->companion_rcs_cmd_buffer);
+ cmd_buffer->companion_rcs_cmd_buffer = NULL;
}
+
+ ANV_RMV(cmd_buffer_destroy, cmd_buffer->device, cmd_buffer);
+
+ destroy_cmd_buffer(cmd_buffer);
+ pthread_mutex_unlock(&device->mutex);
}
-VkResult
-anv_cmd_buffer_reset(struct anv_cmd_buffer *cmd_buffer)
+static void
+reset_cmd_buffer(struct anv_cmd_buffer *cmd_buffer,
+ UNUSED VkCommandBufferResetFlags flags)
{
+ vk_command_buffer_reset(&cmd_buffer->vk);
+
cmd_buffer->usage_flags = 0;
cmd_buffer->perf_query_pool = NULL;
+ cmd_buffer->is_companion_rcs_cmd_buffer = false;
anv_cmd_buffer_reset_batch_bo_chain(cmd_buffer);
anv_cmd_state_reset(cmd_buffer);
+ memset(&cmd_buffer->generation.shader_state, 0,
+ sizeof(cmd_buffer->generation.shader_state));
+
+ cmd_buffer->generation.jump_addr = ANV_NULL_ADDRESS;
+ cmd_buffer->generation.return_addr = ANV_NULL_ADDRESS;
+
anv_state_stream_finish(&cmd_buffer->surface_state_stream);
anv_state_stream_init(&cmd_buffer->surface_state_stream,
- &cmd_buffer->device->surface_state_pool, 4096);
+ &cmd_buffer->device->internal_surface_state_pool, 4096);
anv_state_stream_finish(&cmd_buffer->dynamic_state_stream);
anv_state_stream_init(&cmd_buffer->dynamic_state_stream,
&cmd_buffer->device->dynamic_state_pool, 16384);
+ anv_state_stream_finish(&cmd_buffer->dynamic_state_db_stream);
+ anv_state_stream_init(&cmd_buffer->dynamic_state_db_stream,
+ &cmd_buffer->device->dynamic_state_db_pool, 16384);
+
anv_state_stream_finish(&cmd_buffer->general_state_stream);
anv_state_stream_init(&cmd_buffer->general_state_stream,
&cmd_buffer->device->general_state_pool, 16384);
+ anv_state_stream_finish(&cmd_buffer->indirect_push_descriptor_stream);
+ anv_state_stream_init(&cmd_buffer->indirect_push_descriptor_stream,
+ &cmd_buffer->device->indirect_push_descriptor_pool,
+ 4096);
+
+ anv_state_stream_finish(&cmd_buffer->push_descriptor_buffer_stream);
+ anv_state_stream_init(&cmd_buffer->push_descriptor_buffer_stream,
+ &cmd_buffer->device->push_descriptor_buffer_pool, 4096);
+
+ while (u_vector_length(&cmd_buffer->dynamic_bos) > 0) {
+ struct anv_bo **bo = u_vector_remove(&cmd_buffer->dynamic_bos);
+ anv_device_release_bo(cmd_buffer->device, *bo);
+ }
+
anv_measure_reset(cmd_buffer);
- return VK_SUCCESS;
+
+ u_trace_fini(&cmd_buffer->trace);
+ u_trace_init(&cmd_buffer->trace, &cmd_buffer->device->ds.trace_context);
}
-VkResult anv_ResetCommandBuffer(
- VkCommandBuffer commandBuffer,
- VkCommandBufferResetFlags flags)
+void
+anv_cmd_buffer_reset(struct vk_command_buffer *vk_cmd_buffer,
+ UNUSED VkCommandBufferResetFlags flags)
{
- ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
- return anv_cmd_buffer_reset(cmd_buffer);
+ struct anv_cmd_buffer *cmd_buffer =
+ container_of(vk_cmd_buffer, struct anv_cmd_buffer, vk);
+
+ if (cmd_buffer->companion_rcs_cmd_buffer) {
+ reset_cmd_buffer(cmd_buffer->companion_rcs_cmd_buffer, flags);
+ destroy_cmd_buffer(cmd_buffer->companion_rcs_cmd_buffer);
+ cmd_buffer->companion_rcs_cmd_buffer = NULL;
+ }
+
+ ANV_RMV(cmd_buffer_destroy, cmd_buffer->device, cmd_buffer);
+
+ reset_cmd_buffer(cmd_buffer, flags);
}
+const struct vk_command_buffer_ops anv_cmd_buffer_ops = {
+ .create = anv_create_cmd_buffer,
+ .reset = anv_cmd_buffer_reset,
+ .destroy = anv_cmd_buffer_destroy,
+};
+
void
-anv_cmd_buffer_emit_state_base_address(struct anv_cmd_buffer *cmd_buffer)
+anv_cmd_buffer_emit_bt_pool_base_address(struct anv_cmd_buffer *cmd_buffer)
{
- const struct intel_device_info *devinfo = &cmd_buffer->device->info;
- anv_genX(devinfo, cmd_buffer_emit_state_base_address)(cmd_buffer);
+ const struct intel_device_info *devinfo = cmd_buffer->device->info;
+ anv_genX(devinfo, cmd_buffer_emit_bt_pool_base_address)(cmd_buffer);
}
void
@@ -420,7 +346,7 @@ anv_cmd_buffer_mark_image_written(struct anv_cmd_buffer *cmd_buffer,
uint32_t base_layer,
uint32_t layer_count)
{
- const struct intel_device_info *devinfo = &cmd_buffer->device->info;
+ const struct intel_device_info *devinfo = cmd_buffer->device->info;
anv_genX(devinfo, cmd_buffer_mark_image_written)(cmd_buffer, image,
aspect, aux_usage,
level, base_layer,
@@ -428,12 +354,63 @@ anv_cmd_buffer_mark_image_written(struct anv_cmd_buffer *cmd_buffer,
}
void
+anv_cmd_buffer_mark_image_fast_cleared(struct anv_cmd_buffer *cmd_buffer,
+ const struct anv_image *image,
+ const enum isl_format format,
+ union isl_color_value clear_color)
+{
+ const struct intel_device_info *devinfo = cmd_buffer->device->info;
+ anv_genX(devinfo, set_fast_clear_state)(cmd_buffer, image, format,
+ clear_color);
+}
+
+void
+anv_cmd_buffer_load_clear_color_from_image(struct anv_cmd_buffer *cmd_buffer,
+ struct anv_state state,
+ const struct anv_image *image)
+{
+ const struct intel_device_info *devinfo = cmd_buffer->device->info;
+ anv_genX(devinfo, load_image_clear_color)(cmd_buffer, state, image);
+}
+
+void
anv_cmd_emit_conditional_render_predicate(struct anv_cmd_buffer *cmd_buffer)
{
- const struct intel_device_info *devinfo = &cmd_buffer->device->info;
+ const struct intel_device_info *devinfo = cmd_buffer->device->info;
anv_genX(devinfo, cmd_emit_conditional_render_predicate)(cmd_buffer);
}
+static void
+clear_pending_query_bits(enum anv_query_bits *query_bits,
+ enum anv_pipe_bits flushed_bits)
+{
+ if (flushed_bits & ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT)
+ *query_bits &= ~ANV_QUERY_WRITES_RT_FLUSH;
+
+ if (flushed_bits & ANV_PIPE_TILE_CACHE_FLUSH_BIT)
+ *query_bits &= ~ANV_QUERY_WRITES_TILE_FLUSH;
+
+ if ((flushed_bits & ANV_PIPE_DATA_CACHE_FLUSH_BIT) &&
+ (flushed_bits & ANV_PIPE_HDC_PIPELINE_FLUSH_BIT) &&
+ (flushed_bits & ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT))
+ *query_bits &= ~ANV_QUERY_WRITES_TILE_FLUSH;
+
+ /* Once RT/TILE have been flushed, we can consider the CS_STALL flush */
+ if ((*query_bits & (ANV_QUERY_WRITES_TILE_FLUSH |
+ ANV_QUERY_WRITES_RT_FLUSH |
+ ANV_QUERY_WRITES_DATA_FLUSH)) == 0 &&
+ (flushed_bits & (ANV_PIPE_END_OF_PIPE_SYNC_BIT | ANV_PIPE_CS_STALL_BIT)))
+ *query_bits &= ~ANV_QUERY_WRITES_CS_STALL;
+}
+
+void
+anv_cmd_buffer_update_pending_query_bits(struct anv_cmd_buffer *cmd_buffer,
+ enum anv_pipe_bits flushed_bits)
+{
+ clear_pending_query_bits(&cmd_buffer->state.queries.clear_bits, flushed_bits);
+ clear_pending_query_bits(&cmd_buffer->state.queries.buffer_write_bits, flushed_bits);
+}
+
static bool
mem_update(void *dst, const void *src, size_t size)
{
@@ -465,6 +442,184 @@ set_dirty_for_bind_map(struct anv_cmd_buffer *cmd_buffer,
cmd_buffer->state.push_constants_dirty |= mesa_to_vk_shader_stage(stage);
}
+static void
+anv_cmd_buffer_set_ray_query_buffer(struct anv_cmd_buffer *cmd_buffer,
+ struct anv_cmd_pipeline_state *pipeline_state,
+ struct anv_pipeline *pipeline,
+ VkShaderStageFlags stages)
+{
+ struct anv_device *device = cmd_buffer->device;
+
+ uint64_t ray_shadow_size =
+ align64(brw_rt_ray_queries_shadow_stacks_size(device->info,
+ pipeline->ray_queries),
+ 4096);
+ if (ray_shadow_size > 0 &&
+ (!cmd_buffer->state.ray_query_shadow_bo ||
+ cmd_buffer->state.ray_query_shadow_bo->size < ray_shadow_size)) {
+ unsigned shadow_size_log2 = MAX2(util_logbase2_ceil(ray_shadow_size), 16);
+ unsigned bucket = shadow_size_log2 - 16;
+ assert(bucket < ARRAY_SIZE(device->ray_query_shadow_bos));
+
+ struct anv_bo *bo = p_atomic_read(&device->ray_query_shadow_bos[bucket]);
+ if (bo == NULL) {
+ struct anv_bo *new_bo;
+ VkResult result = anv_device_alloc_bo(device, "RT queries shadow",
+ ray_shadow_size,
+ ANV_BO_ALLOC_INTERNAL, /* alloc_flags */
+ 0, /* explicit_address */
+ &new_bo);
+ if (result != VK_SUCCESS) {
+ anv_batch_set_error(&cmd_buffer->batch, result);
+ return;
+ }
+
+ bo = p_atomic_cmpxchg(&device->ray_query_shadow_bos[bucket], NULL, new_bo);
+ if (bo != NULL) {
+ anv_device_release_bo(device, bo);
+ } else {
+ bo = new_bo;
+ }
+ }
+ cmd_buffer->state.ray_query_shadow_bo = bo;
+
+ /* Add the ray query buffers to the batch list. */
+ anv_reloc_list_add_bo(cmd_buffer->batch.relocs,
+ cmd_buffer->state.ray_query_shadow_bo);
+ }
+
+ /* Add the HW buffer to the list of BOs used. */
+ anv_reloc_list_add_bo(cmd_buffer->batch.relocs,
+ device->ray_query_bo);
+
+ /* Fill the push constants & mark them dirty. */
+ struct anv_address ray_query_globals_addr =
+ anv_genX(device->info, cmd_buffer_ray_query_globals)(cmd_buffer);
+ pipeline_state->push_constants.ray_query_globals =
+ anv_address_physical(ray_query_globals_addr);
+ cmd_buffer->state.push_constants_dirty |= stages;
+ pipeline_state->push_constants_data_dirty = true;
+}
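
The bucketing above rounds the shadow-stack size up to a power of two, with 64KiB (2^16) as the smallest allocation, so bucket 0 holds BOs up to 64KiB, bucket 1 up to 128KiB, and so on. A hedged, standalone sketch of that sizing logic, with local helpers standing in for Mesa's MAX2() and util_logbase2_ceil():

   #include <stdint.h>

   /* Smallest n such that (1ull << n) >= size (size > 0 assumed). */
   static unsigned
   log2_ceil(uint64_t size)
   {
      unsigned n = 0;
      while (((uint64_t)1 << n) < size)
         n++;
      return n;
   }

   /* E.g. a 1MiB shadow size gives log2 == 20, i.e. bucket 4. */
   static unsigned
   shadow_bucket(uint64_t ray_shadow_size)
   {
      unsigned shadow_size_log2 = log2_ceil(ray_shadow_size);
      if (shadow_size_log2 < 16)
         shadow_size_log2 = 16;
      return shadow_size_log2 - 16;
   }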
+
+/**
+ * This function computes the changes between two pipelines and flags the dirty HW
+ * state appropriately.
+ */
+static void
+anv_cmd_buffer_flush_pipeline_state(struct anv_cmd_buffer *cmd_buffer,
+ struct anv_graphics_pipeline *old_pipeline,
+ struct anv_graphics_pipeline *new_pipeline)
+{
+ struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
+ struct anv_gfx_dynamic_state *hw_state = &gfx->dyn_state;
+
+#define diff_fix_state(bit, name) \
+ do { \
+ /* Fixed states should always have matching sizes */ \
+ assert(old_pipeline == NULL || \
+ old_pipeline->name.len == new_pipeline->name.len); \
+ /* Don't bother memcmp if the state is already dirty */ \
+ if (!BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_##bit) && \
+ (old_pipeline == NULL || \
+ memcmp(&old_pipeline->batch_data[old_pipeline->name.offset], \
+ &new_pipeline->batch_data[new_pipeline->name.offset], \
+ 4 * new_pipeline->name.len) != 0)) \
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_##bit); \
+ } while (0)
+#define diff_var_state(bit, name) \
+ do { \
+ /* Don't bother memcmp if the state is already dirty */ \
+ /* Also if the new state is empty, avoid marking dirty */ \
+ if (!BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_##bit) && \
+ new_pipeline->name.len != 0 && \
+ (old_pipeline == NULL || \
+ old_pipeline->name.len != new_pipeline->name.len || \
+ memcmp(&old_pipeline->batch_data[old_pipeline->name.offset], \
+ &new_pipeline->batch_data[new_pipeline->name.offset], \
+ 4 * new_pipeline->name.len) != 0)) \
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_##bit); \
+ } while (0)
+#define assert_identical(bit, name) \
+ do { \
+ /* Fixed states should always have matching sizes */ \
+ assert(old_pipeline == NULL || \
+ old_pipeline->name.len == new_pipeline->name.len); \
+ assert(old_pipeline == NULL || \
+ memcmp(&old_pipeline->batch_data[old_pipeline->name.offset], \
+ &new_pipeline->batch_data[new_pipeline->name.offset], \
+ 4 * new_pipeline->name.len) == 0); \
+ } while (0)
+#define assert_empty(name) assert(new_pipeline->name.len == 0)
+
+ /* Compare all states, including partially packed ones; their dynamic part
+ * is left at 0, but the static part could still change.
+ */
+ diff_fix_state(URB, final.urb);
+ diff_fix_state(VF_SGVS, final.vf_sgvs);
+ if (cmd_buffer->device->info->ver >= 11)
+ diff_fix_state(VF_SGVS_2, final.vf_sgvs_2);
+ if (cmd_buffer->device->info->ver >= 12)
+ diff_fix_state(PRIMITIVE_REPLICATION, final.primitive_replication);
+ diff_fix_state(SBE, final.sbe);
+ diff_fix_state(SBE_SWIZ, final.sbe_swiz);
+ diff_fix_state(VS, final.vs);
+ diff_fix_state(HS, final.hs);
+ diff_fix_state(DS, final.ds);
+
+ diff_fix_state(CLIP, partial.clip);
+ diff_fix_state(SF, partial.sf);
+ diff_fix_state(RASTER, partial.raster);
+ diff_fix_state(MULTISAMPLE, partial.ms);
+ diff_fix_state(WM, partial.wm);
+ diff_fix_state(STREAMOUT, partial.so);
+ diff_fix_state(GS, partial.gs);
+ diff_fix_state(TE, partial.te);
+ diff_fix_state(VFG, partial.vfg);
+ diff_fix_state(PS, partial.ps);
+ diff_fix_state(PS_EXTRA, partial.ps_extra);
+
+ if (cmd_buffer->device->vk.enabled_extensions.EXT_mesh_shader) {
+ diff_fix_state(TASK_CONTROL, final.task_control);
+ diff_fix_state(TASK_SHADER, final.task_shader);
+ diff_fix_state(TASK_REDISTRIB, final.task_redistrib);
+ diff_fix_state(MESH_CONTROL, final.mesh_control);
+ diff_fix_state(MESH_SHADER, final.mesh_shader);
+ diff_fix_state(MESH_DISTRIB, final.mesh_distrib);
+ diff_fix_state(CLIP_MESH, final.clip_mesh);
+ diff_fix_state(SBE_MESH, final.sbe_mesh);
+ } else {
+ assert_empty(final.task_control);
+ assert_empty(final.task_shader);
+ assert_empty(final.task_redistrib);
+ assert_empty(final.mesh_control);
+ assert_empty(final.mesh_shader);
+ assert_empty(final.mesh_distrib);
+ assert_empty(final.clip_mesh);
+ assert_empty(final.sbe_mesh);
+ }
+
+ /* States that should never vary between pipelines, but can be affected by
+ * blorp etc...
+ */
+ assert_identical(VF_STATISTICS, final.vf_statistics);
+
+ /* States that can vary in length */
+ diff_var_state(VF_SGVS_INSTANCING, final.vf_sgvs_instancing);
+ diff_var_state(SO_DECL_LIST, final.so_decl_list);
+
+#undef diff_fix_state
+#undef diff_var_state
+#undef assert_identical
+#undef assert_empty
+
+ /* We're not diffing the following:
+ * - anv_graphics_pipeline::vertex_input_data
+ * - anv_graphics_pipeline::final::vf_instancing
+ *
+ * since they are tracked by the runtime.
+ */
+}
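
The diff_fix_state()/diff_var_state() macros above boil down to one pattern: compare the pre-packed dword ranges of the old and new pipeline and only flag the corresponding HW state dirty when the bytes actually differ. A minimal sketch of that pattern, with hypothetical types standing in for the anv ones:

   #include <stdbool.h>
   #include <stdint.h>
   #include <string.h>

   struct packed_range {
      uint32_t offset; /* in dwords, into the pipeline's batch data */
      uint32_t len;    /* in dwords */
   };

   /* Returns true when the packed dwords backing one piece of HW state
    * differ between two pipelines, i.e. when it needs to be re-emitted.
    */
   static bool
   state_needs_reemit(const uint32_t *old_data, struct packed_range old_range,
                      const uint32_t *new_data, struct packed_range new_range)
   {
      if (old_data == NULL)                /* no pipeline previously bound */
         return true;
      if (old_range.len != new_range.len)  /* variable-length states */
         return true;
      return memcmp(&old_data[old_range.offset],
                    &new_data[new_range.offset],
                    4 * new_range.len) != 0;
   }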
+
void anv_CmdBindPipeline(
VkCommandBuffer commandBuffer,
VkPipelineBindPoint pipelineBindPoint,
@@ -472,463 +627,194 @@ void anv_CmdBindPipeline(
{
ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
ANV_FROM_HANDLE(anv_pipeline, pipeline, _pipeline);
+ struct anv_cmd_pipeline_state *state;
+ VkShaderStageFlags stages = 0;
switch (pipelineBindPoint) {
case VK_PIPELINE_BIND_POINT_COMPUTE: {
- struct anv_compute_pipeline *compute_pipeline =
- anv_pipeline_to_compute(pipeline);
- if (cmd_buffer->state.compute.pipeline == compute_pipeline)
+ if (cmd_buffer->state.compute.base.pipeline == pipeline)
return;
- cmd_buffer->state.compute.pipeline = compute_pipeline;
+ cmd_buffer->state.compute.base.pipeline = pipeline;
cmd_buffer->state.compute.pipeline_dirty = true;
+
+ struct anv_compute_pipeline *compute_pipeline =
+ anv_pipeline_to_compute(pipeline);
set_dirty_for_bind_map(cmd_buffer, MESA_SHADER_COMPUTE,
&compute_pipeline->cs->bind_map);
+
+ state = &cmd_buffer->state.compute.base;
+ stages = VK_SHADER_STAGE_COMPUTE_BIT;
break;
}
case VK_PIPELINE_BIND_POINT_GRAPHICS: {
- struct anv_graphics_pipeline *gfx_pipeline =
+ struct anv_graphics_pipeline *new_pipeline =
anv_pipeline_to_graphics(pipeline);
- if (cmd_buffer->state.gfx.pipeline == gfx_pipeline)
+
+ /* Apply the non-dynamic state from the pipeline */
+ vk_cmd_set_dynamic_graphics_state(&cmd_buffer->vk,
+ &new_pipeline->dynamic_state);
+
+ if (cmd_buffer->state.gfx.base.pipeline == pipeline)
return;
- cmd_buffer->state.gfx.pipeline = gfx_pipeline;
- cmd_buffer->state.gfx.vb_dirty |= gfx_pipeline->vb_used;
+ struct anv_graphics_pipeline *old_pipeline =
+ cmd_buffer->state.gfx.base.pipeline == NULL ? NULL :
+ anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
+
+ cmd_buffer->state.gfx.base.pipeline = pipeline;
cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_PIPELINE;
- anv_foreach_stage(stage, gfx_pipeline->active_stages) {
+ anv_foreach_stage(stage, new_pipeline->base.base.active_stages) {
set_dirty_for_bind_map(cmd_buffer, stage,
- &gfx_pipeline->shaders[stage]->bind_map);
+ &new_pipeline->base.shaders[stage]->bind_map);
+ }
+
+ state = &cmd_buffer->state.gfx.base;
+ stages = new_pipeline->base.base.active_stages;
+
+ /* When the pipeline is using independent states and dynamic buffers,
+ * this will trigger an update of anv_push_constants::dynamic_base_index
+ * & anv_push_constants::dynamic_offsets.
+ */
+ struct anv_push_constants *push =
+ &cmd_buffer->state.gfx.base.push_constants;
+ struct anv_pipeline_sets_layout *layout = &new_pipeline->base.base.layout;
+ if (layout->independent_sets && layout->num_dynamic_buffers > 0) {
+ bool modified = false;
+ for (uint32_t s = 0; s < layout->num_sets; s++) {
+ if (layout->set[s].layout == NULL)
+ continue;
+
+ assert(layout->set[s].dynamic_offset_start < MAX_DYNAMIC_BUFFERS);
+ if (layout->set[s].layout->dynamic_offset_count > 0 &&
+ (push->desc_surface_offsets[s] & ANV_DESCRIPTOR_SET_DYNAMIC_INDEX_MASK) !=
+ layout->set[s].dynamic_offset_start) {
+ push->desc_surface_offsets[s] &= ~ANV_DESCRIPTOR_SET_DYNAMIC_INDEX_MASK;
+ push->desc_surface_offsets[s] |= (layout->set[s].dynamic_offset_start &
+ ANV_DESCRIPTOR_SET_DYNAMIC_INDEX_MASK);
+ modified = true;
+ }
+ }
+ if (modified) {
+ cmd_buffer->state.push_constants_dirty |= stages;
+ state->push_constants_data_dirty = true;
+ }
}
- /* Apply the dynamic state from the pipeline */
- cmd_buffer->state.gfx.dirty |=
- anv_dynamic_state_copy(&cmd_buffer->state.gfx.dynamic,
- &gfx_pipeline->dynamic_state,
- gfx_pipeline->dynamic_state_mask);
+ anv_cmd_buffer_flush_pipeline_state(cmd_buffer, old_pipeline, new_pipeline);
break;
}
case VK_PIPELINE_BIND_POINT_RAY_TRACING_KHR: {
- struct anv_ray_tracing_pipeline *rt_pipeline =
- anv_pipeline_to_ray_tracing(pipeline);
- if (cmd_buffer->state.rt.pipeline == rt_pipeline)
+ if (cmd_buffer->state.rt.base.pipeline == pipeline)
return;
- cmd_buffer->state.rt.pipeline = rt_pipeline;
+ cmd_buffer->state.rt.base.pipeline = pipeline;
cmd_buffer->state.rt.pipeline_dirty = true;
+ struct anv_ray_tracing_pipeline *rt_pipeline =
+ anv_pipeline_to_ray_tracing(pipeline);
if (rt_pipeline->stack_size > 0) {
anv_CmdSetRayTracingPipelineStackSizeKHR(commandBuffer,
rt_pipeline->stack_size);
}
+
+ state = &cmd_buffer->state.rt.base;
break;
}
default:
- assert(!"invalid bind point");
+ unreachable("invalid bind point");
break;
}
-}
-
-void anv_CmdSetRasterizerDiscardEnableEXT(
- VkCommandBuffer commandBuffer,
- VkBool32 rasterizerDiscardEnable)
-{
- ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
-
- cmd_buffer->state.gfx.dynamic.raster_discard = rasterizerDiscardEnable;
-
- cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_DYNAMIC_RASTERIZER_DISCARD_ENABLE;
-}
-
-void anv_CmdSetDepthBiasEnableEXT(
- VkCommandBuffer commandBuffer,
- VkBool32 depthBiasEnable)
-{
- ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
-
- cmd_buffer->state.gfx.dynamic.depth_bias_enable = depthBiasEnable;
-
- cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_DYNAMIC_DEPTH_BIAS_ENABLE;
-}
-
-void anv_CmdSetPrimitiveRestartEnableEXT(
- VkCommandBuffer commandBuffer,
- VkBool32 primitiveRestartEnable)
-{
- ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
-
- cmd_buffer->state.gfx.dynamic.primitive_restart_enable = primitiveRestartEnable;
-
- cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_DYNAMIC_PRIMITIVE_RESTART_ENABLE;
-}
-
-void anv_CmdSetLogicOpEXT(
- VkCommandBuffer commandBuffer,
- VkLogicOp logicOp)
-{
- ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
-
- cmd_buffer->state.gfx.dynamic.logic_op = logicOp;
-
- cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_DYNAMIC_LOGIC_OP;
-}
-
-void anv_CmdSetPatchControlPointsEXT(
- VkCommandBuffer commandBuffer,
- uint32_t patchControlPoints)
-{
- ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
- anv_batch_set_error(&cmd_buffer->batch, VK_ERROR_FEATURE_NOT_PRESENT);
-}
-
-void anv_CmdSetViewport(
- VkCommandBuffer commandBuffer,
- uint32_t firstViewport,
- uint32_t viewportCount,
- const VkViewport* pViewports)
-{
- ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
-
- const uint32_t total_count = firstViewport + viewportCount;
- if (cmd_buffer->state.gfx.dynamic.viewport.count < total_count)
- cmd_buffer->state.gfx.dynamic.viewport.count = total_count;
-
- memcpy(cmd_buffer->state.gfx.dynamic.viewport.viewports + firstViewport,
- pViewports, viewportCount * sizeof(*pViewports));
-
- cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_DYNAMIC_VIEWPORT;
-}
-
-void anv_CmdSetViewportWithCountEXT(
- VkCommandBuffer commandBuffer,
- uint32_t viewportCount,
- const VkViewport* pViewports)
-{
- ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
-
- cmd_buffer->state.gfx.dynamic.viewport.count = viewportCount;
-
- memcpy(cmd_buffer->state.gfx.dynamic.viewport.viewports,
- pViewports, viewportCount * sizeof(*pViewports));
-
- cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_DYNAMIC_VIEWPORT;
-}
-
-void anv_CmdSetScissor(
- VkCommandBuffer commandBuffer,
- uint32_t firstScissor,
- uint32_t scissorCount,
- const VkRect2D* pScissors)
-{
- ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
-
- const uint32_t total_count = firstScissor + scissorCount;
- if (cmd_buffer->state.gfx.dynamic.scissor.count < total_count)
- cmd_buffer->state.gfx.dynamic.scissor.count = total_count;
-
- memcpy(cmd_buffer->state.gfx.dynamic.scissor.scissors + firstScissor,
- pScissors, scissorCount * sizeof(*pScissors));
-
- cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_DYNAMIC_SCISSOR;
-}
-void anv_CmdSetScissorWithCountEXT(
- VkCommandBuffer commandBuffer,
- uint32_t scissorCount,
- const VkRect2D* pScissors)
-{
- ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
-
- cmd_buffer->state.gfx.dynamic.scissor.count = scissorCount;
-
- memcpy(cmd_buffer->state.gfx.dynamic.scissor.scissors,
- pScissors, scissorCount * sizeof(*pScissors));
-
- cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_DYNAMIC_SCISSOR;
+ if (pipeline->ray_queries > 0)
+ anv_cmd_buffer_set_ray_query_buffer(cmd_buffer, state, pipeline, stages);
}
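
The dynamic-offset fixup in anv_CmdBindPipeline above packs a per-set dynamic-offset index into the low bits of anv_push_constants::desc_surface_offsets, leaving the high bits for the heap-relative offset. A minimal sketch of that packing, assuming an illustrative 6-bit index mask rather than the driver's actual ANV_DESCRIPTOR_SET_DYNAMIC_INDEX_MASK value:

   #include <assert.h>
   #include <stdint.h>

   #define DYN_INDEX_MASK 0x3fu /* hypothetical: low 6 bits carry the index */

   /* Replace the dynamic-offset index without disturbing the offset bits. */
   static uint32_t
   repack_dynamic_index(uint32_t field, uint32_t dynamic_offset_start)
   {
      assert((dynamic_offset_start & ~DYN_INDEX_MASK) == 0);
      field &= ~DYN_INDEX_MASK;          /* clear the previous index */
      field |= dynamic_offset_start;     /* insert the new one */
      return field;
   }
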
-void anv_CmdSetPrimitiveTopologyEXT(
- VkCommandBuffer commandBuffer,
- VkPrimitiveTopology primitiveTopology)
+static struct anv_cmd_pipeline_state *
+anv_cmd_buffer_get_pipeline_layout_state(struct anv_cmd_buffer *cmd_buffer,
+ VkPipelineBindPoint bind_point,
+ const struct anv_descriptor_set_layout *set_layout,
+ VkShaderStageFlags *out_stages)
{
- ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
-
- cmd_buffer->state.gfx.dynamic.primitive_topology = primitiveTopology;
-
- cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_DYNAMIC_PRIMITIVE_TOPOLOGY;
-}
-
-void anv_CmdSetLineWidth(
- VkCommandBuffer commandBuffer,
- float lineWidth)
-{
- ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
-
- cmd_buffer->state.gfx.dynamic.line_width = lineWidth;
- cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_DYNAMIC_LINE_WIDTH;
-}
-
-void anv_CmdSetDepthBias(
- VkCommandBuffer commandBuffer,
- float depthBiasConstantFactor,
- float depthBiasClamp,
- float depthBiasSlopeFactor)
-{
- ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
-
- cmd_buffer->state.gfx.dynamic.depth_bias.bias = depthBiasConstantFactor;
- cmd_buffer->state.gfx.dynamic.depth_bias.clamp = depthBiasClamp;
- cmd_buffer->state.gfx.dynamic.depth_bias.slope = depthBiasSlopeFactor;
-
- cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_DYNAMIC_DEPTH_BIAS;
-}
-
-void anv_CmdSetBlendConstants(
- VkCommandBuffer commandBuffer,
- const float blendConstants[4])
-{
- ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
-
- memcpy(cmd_buffer->state.gfx.dynamic.blend_constants,
- blendConstants, sizeof(float) * 4);
-
- cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_DYNAMIC_BLEND_CONSTANTS;
-}
-
-void anv_CmdSetDepthBounds(
- VkCommandBuffer commandBuffer,
- float minDepthBounds,
- float maxDepthBounds)
-{
- ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
-
- cmd_buffer->state.gfx.dynamic.depth_bounds.min = minDepthBounds;
- cmd_buffer->state.gfx.dynamic.depth_bounds.max = maxDepthBounds;
-
- cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_DYNAMIC_DEPTH_BOUNDS;
-}
-
-void anv_CmdSetStencilCompareMask(
- VkCommandBuffer commandBuffer,
- VkStencilFaceFlags faceMask,
- uint32_t compareMask)
-{
- ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
-
- if (faceMask & VK_STENCIL_FACE_FRONT_BIT)
- cmd_buffer->state.gfx.dynamic.stencil_compare_mask.front = compareMask;
- if (faceMask & VK_STENCIL_FACE_BACK_BIT)
- cmd_buffer->state.gfx.dynamic.stencil_compare_mask.back = compareMask;
-
- cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_DYNAMIC_STENCIL_COMPARE_MASK;
-}
-
-void anv_CmdSetStencilWriteMask(
- VkCommandBuffer commandBuffer,
- VkStencilFaceFlags faceMask,
- uint32_t writeMask)
-{
- ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
-
- if (faceMask & VK_STENCIL_FACE_FRONT_BIT)
- cmd_buffer->state.gfx.dynamic.stencil_write_mask.front = writeMask;
- if (faceMask & VK_STENCIL_FACE_BACK_BIT)
- cmd_buffer->state.gfx.dynamic.stencil_write_mask.back = writeMask;
+ *out_stages = set_layout->shader_stages;
- cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_DYNAMIC_STENCIL_WRITE_MASK;
-}
-
-void anv_CmdSetStencilReference(
- VkCommandBuffer commandBuffer,
- VkStencilFaceFlags faceMask,
- uint32_t reference)
-{
- ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
-
- if (faceMask & VK_STENCIL_FACE_FRONT_BIT)
- cmd_buffer->state.gfx.dynamic.stencil_reference.front = reference;
- if (faceMask & VK_STENCIL_FACE_BACK_BIT)
- cmd_buffer->state.gfx.dynamic.stencil_reference.back = reference;
-
- cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_DYNAMIC_STENCIL_REFERENCE;
-}
-
-void anv_CmdSetSampleLocationsEXT(
- VkCommandBuffer commandBuffer,
- const VkSampleLocationsInfoEXT* pSampleLocationsInfo)
-{
- ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
-
- struct anv_dynamic_state *dyn_state = &cmd_buffer->state.gfx.dynamic;
- uint32_t samples = pSampleLocationsInfo->sampleLocationsPerPixel;
-
- dyn_state->sample_locations.samples = samples;
- typed_memcpy(dyn_state->sample_locations.locations,
- pSampleLocationsInfo->pSampleLocations, samples);
-
- cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_DYNAMIC_SAMPLE_LOCATIONS;
-}
-
-void anv_CmdSetLineStippleEXT(
- VkCommandBuffer commandBuffer,
- uint32_t lineStippleFactor,
- uint16_t lineStipplePattern)
-{
- ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
-
- cmd_buffer->state.gfx.dynamic.line_stipple.factor = lineStippleFactor;
- cmd_buffer->state.gfx.dynamic.line_stipple.pattern = lineStipplePattern;
-
- cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_DYNAMIC_LINE_STIPPLE;
-}
-
-void anv_CmdSetCullModeEXT(
- VkCommandBuffer commandBuffer,
- VkCullModeFlags cullMode)
-{
- ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
-
- cmd_buffer->state.gfx.dynamic.cull_mode = cullMode;
-
- cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_DYNAMIC_CULL_MODE;
-}
-
-void anv_CmdSetFrontFaceEXT(
- VkCommandBuffer commandBuffer,
- VkFrontFace frontFace)
-{
- ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
-
- cmd_buffer->state.gfx.dynamic.front_face = frontFace;
-
- cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_DYNAMIC_FRONT_FACE;
-}
-
-void anv_CmdSetDepthTestEnableEXT(
- VkCommandBuffer commandBuffer,
- VkBool32 depthTestEnable)
-
-{
- ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
-
- cmd_buffer->state.gfx.dynamic.depth_test_enable = depthTestEnable;
-
- cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_DYNAMIC_DEPTH_TEST_ENABLE;
-}
-
-void anv_CmdSetDepthWriteEnableEXT(
- VkCommandBuffer commandBuffer,
- VkBool32 depthWriteEnable)
-{
- ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
-
- cmd_buffer->state.gfx.dynamic.depth_write_enable = depthWriteEnable;
-
- cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_DYNAMIC_DEPTH_WRITE_ENABLE;
-}
-
-void anv_CmdSetDepthCompareOpEXT(
- VkCommandBuffer commandBuffer,
- VkCompareOp depthCompareOp)
-{
- ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
-
- cmd_buffer->state.gfx.dynamic.depth_compare_op = depthCompareOp;
-
- cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_DYNAMIC_DEPTH_COMPARE_OP;
-}
-
-void anv_CmdSetDepthBoundsTestEnableEXT(
- VkCommandBuffer commandBuffer,
- VkBool32 depthBoundsTestEnable)
-{
- ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
-
- cmd_buffer->state.gfx.dynamic.depth_bounds_test_enable = depthBoundsTestEnable;
-
- cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_DYNAMIC_DEPTH_BOUNDS_TEST_ENABLE;
-}
+ switch (bind_point) {
+ case VK_PIPELINE_BIND_POINT_GRAPHICS:
+ *out_stages &= VK_SHADER_STAGE_ALL_GRAPHICS |
+ (cmd_buffer->device->vk.enabled_extensions.EXT_mesh_shader ?
+ (VK_SHADER_STAGE_TASK_BIT_EXT |
+ VK_SHADER_STAGE_MESH_BIT_EXT) : 0);
+ return &cmd_buffer->state.gfx.base;
-void anv_CmdSetStencilTestEnableEXT(
- VkCommandBuffer commandBuffer,
- VkBool32 stencilTestEnable)
-{
- ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+ case VK_PIPELINE_BIND_POINT_COMPUTE:
+ *out_stages &= VK_SHADER_STAGE_COMPUTE_BIT;
+ return &cmd_buffer->state.compute.base;
- cmd_buffer->state.gfx.dynamic.stencil_test_enable = stencilTestEnable;
+ case VK_PIPELINE_BIND_POINT_RAY_TRACING_KHR:
+ *out_stages &= VK_SHADER_STAGE_RAYGEN_BIT_KHR |
+ VK_SHADER_STAGE_ANY_HIT_BIT_KHR |
+ VK_SHADER_STAGE_CLOSEST_HIT_BIT_KHR |
+ VK_SHADER_STAGE_MISS_BIT_KHR |
+ VK_SHADER_STAGE_INTERSECTION_BIT_KHR |
+ VK_SHADER_STAGE_CALLABLE_BIT_KHR;
+ return &cmd_buffer->state.rt.base;
- cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_DYNAMIC_STENCIL_TEST_ENABLE;
+ default:
+ unreachable("invalid bind point");
+ }
}
-void anv_CmdSetStencilOpEXT(
- VkCommandBuffer commandBuffer,
- VkStencilFaceFlags faceMask,
- VkStencilOp failOp,
- VkStencilOp passOp,
- VkStencilOp depthFailOp,
- VkCompareOp compareOp)
+static void
+anv_cmd_buffer_maybe_dirty_descriptor_mode(struct anv_cmd_buffer *cmd_buffer,
+ enum anv_cmd_descriptor_buffer_mode new_mode)
{
- ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
-
- if (faceMask & VK_STENCIL_FACE_FRONT_BIT) {
- cmd_buffer->state.gfx.dynamic.stencil_op.front.fail_op = failOp;
- cmd_buffer->state.gfx.dynamic.stencil_op.front.pass_op = passOp;
- cmd_buffer->state.gfx.dynamic.stencil_op.front.depth_fail_op = depthFailOp;
- cmd_buffer->state.gfx.dynamic.stencil_op.front.compare_op = compareOp;
- }
-
- if (faceMask & VK_STENCIL_FACE_BACK_BIT) {
- cmd_buffer->state.gfx.dynamic.stencil_op.back.fail_op = failOp;
- cmd_buffer->state.gfx.dynamic.stencil_op.back.pass_op = passOp;
- cmd_buffer->state.gfx.dynamic.stencil_op.back.depth_fail_op = depthFailOp;
- cmd_buffer->state.gfx.dynamic.stencil_op.back.compare_op = compareOp;
- }
+ if (cmd_buffer->state.current_db_mode == new_mode)
+ return;
- cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_DYNAMIC_STENCIL_OP;
+ /* Ensure we program the STATE_BASE_ADDRESS properly at least once */
+ cmd_buffer->state.descriptor_buffers.dirty = true;
+ cmd_buffer->state.pending_db_mode = new_mode;
}
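
The helper above only records the requested descriptor-buffer mode; the actual STATE_BASE_ADDRESS reprogramming is deferred until the command buffer next flushes its state. A hedged sketch of that deferred-switch pattern, with illustrative names in place of the driver's anv_cmd_state fields:

   #include <stdbool.h>

   enum db_mode { DB_MODE_LEGACY, DB_MODE_BUFFER };

   struct db_switch {
      enum db_mode current;  /* mode last programmed on the GPU */
      enum db_mode pending;  /* mode requested by the latest bind */
      bool dirty;            /* reprogram needed at flush time */
   };

   static void
   request_mode(struct db_switch *s, enum db_mode new_mode)
   {
      if (s->current == new_mode)
         return;
      s->pending = new_mode;
      s->dirty = true;       /* consumed when base addresses are re-emitted */
   }

   static void
   flush_mode(struct db_switch *s)
   {
      if (!s->dirty)
         return;
      /* emit STATE_BASE_ADDRESS for s->pending here */
      s->current = s->pending;
      s->dirty = false;
   }
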
static void
anv_cmd_buffer_bind_descriptor_set(struct anv_cmd_buffer *cmd_buffer,
VkPipelineBindPoint bind_point,
- struct anv_pipeline_layout *layout,
+ struct anv_pipeline_sets_layout *layout,
uint32_t set_index,
struct anv_descriptor_set *set,
uint32_t *dynamic_offset_count,
const uint32_t **dynamic_offsets)
{
+ /* Either we have no pool because it's a push descriptor or the pool is not
+ * host-only:
+ *
+ * VUID-vkCmdBindDescriptorSets-pDescriptorSets-04616:
+ *
+ * "Each element of pDescriptorSets must not have been allocated from a
+ * VkDescriptorPool with the
+ * VK_DESCRIPTOR_POOL_CREATE_HOST_ONLY_BIT_EXT flag set"
+ */
+ assert(!set->pool || !set->pool->host_only);
+
struct anv_descriptor_set_layout *set_layout =
layout->set[set_index].layout;
- VkShaderStageFlags stages = set_layout->shader_stages;
- struct anv_cmd_pipeline_state *pipe_state;
+ anv_cmd_buffer_maybe_dirty_descriptor_mode(
+ cmd_buffer,
+ (set->layout->flags &
+ VK_DESCRIPTOR_SET_LAYOUT_CREATE_DESCRIPTOR_BUFFER_BIT_EXT) != 0 ?
+ ANV_CMD_DESCRIPTOR_BUFFER_MODE_BUFFER :
+ ANV_CMD_DESCRIPTOR_BUFFER_MODE_LEGACY);
- switch (bind_point) {
- case VK_PIPELINE_BIND_POINT_GRAPHICS:
- stages &= VK_SHADER_STAGE_ALL_GRAPHICS;
- pipe_state = &cmd_buffer->state.gfx.base;
- break;
-
- case VK_PIPELINE_BIND_POINT_COMPUTE:
- stages &= VK_SHADER_STAGE_COMPUTE_BIT;
- pipe_state = &cmd_buffer->state.compute.base;
- break;
-
- case VK_PIPELINE_BIND_POINT_RAY_TRACING_KHR:
- stages &= VK_SHADER_STAGE_RAYGEN_BIT_KHR |
- VK_SHADER_STAGE_ANY_HIT_BIT_KHR |
- VK_SHADER_STAGE_CLOSEST_HIT_BIT_KHR |
- VK_SHADER_STAGE_MISS_BIT_KHR |
- VK_SHADER_STAGE_INTERSECTION_BIT_KHR |
- VK_SHADER_STAGE_CALLABLE_BIT_KHR;
- pipe_state = &cmd_buffer->state.rt.base;
- break;
-
- default:
- unreachable("invalid bind point");
- }
+ VkShaderStageFlags stages;
+ struct anv_cmd_pipeline_state *pipe_state =
+ anv_cmd_buffer_get_pipeline_layout_state(cmd_buffer, bind_point,
+ set_layout, &stages);
VkShaderStageFlags dirty_stages = 0;
/* If it's a push descriptor set, we have to flag things as dirty
@@ -936,23 +822,58 @@ anv_cmd_buffer_bind_descriptor_set(struct anv_cmd_buffer *cmd_buffer,
* may have edited in-place.
*/
if (pipe_state->descriptors[set_index] != set ||
- anv_descriptor_set_is_push(set)) {
+ anv_descriptor_set_is_push(set)) {
pipe_state->descriptors[set_index] = set;
- /* Ray-tracing shaders are entirely bindless and so they don't have
- * access to HW binding tables. This means that we have to upload the
- * descriptor set as an 64-bit address in the push constants.
- */
- if (bind_point == VK_PIPELINE_BIND_POINT_RAY_TRACING_KHR) {
- struct anv_push_constants *push = &pipe_state->push_constants;
+ if (set->layout->flags & VK_DESCRIPTOR_SET_LAYOUT_CREATE_DESCRIPTOR_BUFFER_BIT_EXT) {
+ assert(set->is_push);
- struct anv_address addr = anv_descriptor_set_address(set);
- push->desc_sets[set_index] = anv_address_physical(addr);
+ pipe_state->descriptor_buffers[set_index].buffer_index = -1;
+ pipe_state->descriptor_buffers[set_index].buffer_offset = set->desc_offset;
+ pipe_state->descriptor_buffers[set_index].bound = true;
+ cmd_buffer->state.descriptors_dirty |= stages;
+ cmd_buffer->state.descriptor_buffers.offsets_dirty |= stages;
+ } else {
+ /* When using indirect descriptors, stages that have access to the HW
+ * binding tables never need to read the
+ * anv_push_constants::desc_offsets fields, because any data they
+ * need from the descriptor buffer is reachable through a binding
+ * table entry. For the "bindless" stages (Mesh/Task/RT), we do need
+ * to provide anv_push_constants::desc_offsets matching the bound
+ * descriptor so that shaders can access the descriptor buffer
+ * through A64 messages.
+ *
+ * With direct descriptors, shaders use
+ * anv_push_constants::desc_offsets to build bindless offsets, so we
+ * always need to update the push constant data.
+ */
+ bool update_desc_sets =
+ !cmd_buffer->device->physical->indirect_descriptors ||
+ (stages & (VK_SHADER_STAGE_TASK_BIT_EXT |
+ VK_SHADER_STAGE_MESH_BIT_EXT |
+ VK_SHADER_STAGE_RAYGEN_BIT_KHR |
+ VK_SHADER_STAGE_ANY_HIT_BIT_KHR |
+ VK_SHADER_STAGE_CLOSEST_HIT_BIT_KHR |
+ VK_SHADER_STAGE_MISS_BIT_KHR |
+ VK_SHADER_STAGE_INTERSECTION_BIT_KHR |
+ VK_SHADER_STAGE_CALLABLE_BIT_KHR));
+
+ if (update_desc_sets) {
+ struct anv_push_constants *push = &pipe_state->push_constants;
+ uint64_t offset =
+ anv_address_physical(set->desc_surface_addr) -
+ cmd_buffer->device->physical->va.internal_surface_state_pool.addr;
+ assert((offset & ~ANV_DESCRIPTOR_SET_OFFSET_MASK) == 0);
+ push->desc_surface_offsets[set_index] &= ~ANV_DESCRIPTOR_SET_OFFSET_MASK;
+ push->desc_surface_offsets[set_index] |= offset;
+ push->desc_sampler_offsets[set_index] |=
+ anv_address_physical(set->desc_sampler_addr) -
+ cmd_buffer->device->physical->va.dynamic_state_pool.addr;
- if (addr.bo) {
anv_reloc_list_add_bo(cmd_buffer->batch.relocs,
- cmd_buffer->batch.alloc,
- addr.bo);
+ set->desc_surface_addr.bo);
+ anv_reloc_list_add_bo(cmd_buffer->batch.relocs,
+ set->desc_sampler_addr.bo);
}
}
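
For the bindless stages described above, the push constants carry heap-relative 32-bit offsets rather than full 64-bit pointers. A small sketch of that reduction, assuming a hypothetical offset mask that is the complement of the dynamic-index bits shown earlier:

   #include <assert.h>
   #include <stdint.h>

   #define SET_OFFSET_MASK 0xffffffc0u /* hypothetical: high bits hold the offset */

   /* Fold a heap-relative descriptor offset into the push-constant field,
    * preserving whatever dynamic-offset index sits in the low bits. */
   static uint32_t
   pack_heap_offset(uint32_t field, uint64_t desc_addr, uint64_t heap_base)
   {
      uint64_t offset = desc_addr - heap_base;
      assert((offset & ~(uint64_t)SET_OFFSET_MASK) == 0);
      field &= ~SET_OFFSET_MASK;
      field |= (uint32_t)offset;
      return field;
   }
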
@@ -967,6 +888,11 @@ anv_cmd_buffer_bind_descriptor_set(struct anv_cmd_buffer *cmd_buffer,
uint32_t *push_offsets =
&push->dynamic_offsets[dynamic_offset_start];
+ memcpy(pipe_state->dynamic_offsets[set_index].offsets,
+ *dynamic_offsets,
+ sizeof(uint32_t) * MIN2(*dynamic_offset_count,
+ set_layout->dynamic_offset_count));
+
/* Assert that everything is in range */
assert(set_layout->dynamic_offset_count <= *dynamic_offset_count);
assert(dynamic_offset_start + set_layout->dynamic_offset_count <=
@@ -974,7 +900,8 @@ anv_cmd_buffer_bind_descriptor_set(struct anv_cmd_buffer *cmd_buffer,
for (uint32_t i = 0; i < set_layout->dynamic_offset_count; i++) {
if (push_offsets[i] != (*dynamic_offsets)[i]) {
- push_offsets[i] = (*dynamic_offsets)[i];
+ pipe_state->dynamic_offsets[set_index].offsets[i] =
+ push_offsets[i] = (*dynamic_offsets)[i];
/* dynamic_offset_stages[] elements could contain blanket
* values like VK_SHADER_STAGE_ALL, so limit this to the
* binding point's bits.
@@ -988,35 +915,187 @@ anv_cmd_buffer_bind_descriptor_set(struct anv_cmd_buffer *cmd_buffer,
}
}
- cmd_buffer->state.descriptors_dirty |= dirty_stages;
+ if (set->is_push)
+ cmd_buffer->state.push_descriptors_dirty |= dirty_stages;
+ else
+ cmd_buffer->state.descriptors_dirty |= dirty_stages;
cmd_buffer->state.push_constants_dirty |= dirty_stages;
+ pipe_state->push_constants_data_dirty = true;
}
-void anv_CmdBindDescriptorSets(
+#define ANV_GRAPHICS_STAGE_BITS \
+ (VK_SHADER_STAGE_ALL_GRAPHICS | \
+ VK_SHADER_STAGE_MESH_BIT_EXT | \
+ VK_SHADER_STAGE_TASK_BIT_EXT)
+
+#define ANV_RT_STAGE_BITS \
+ (VK_SHADER_STAGE_RAYGEN_BIT_KHR | \
+ VK_SHADER_STAGE_ANY_HIT_BIT_KHR | \
+ VK_SHADER_STAGE_CLOSEST_HIT_BIT_KHR | \
+ VK_SHADER_STAGE_MISS_BIT_KHR | \
+ VK_SHADER_STAGE_INTERSECTION_BIT_KHR | \
+ VK_SHADER_STAGE_CALLABLE_BIT_KHR)
+
+void anv_CmdBindDescriptorSets2KHR(
VkCommandBuffer commandBuffer,
- VkPipelineBindPoint pipelineBindPoint,
- VkPipelineLayout _layout,
- uint32_t firstSet,
- uint32_t descriptorSetCount,
- const VkDescriptorSet* pDescriptorSets,
- uint32_t dynamicOffsetCount,
- const uint32_t* pDynamicOffsets)
+ const VkBindDescriptorSetsInfoKHR* pInfo)
{
ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
- ANV_FROM_HANDLE(anv_pipeline_layout, layout, _layout);
+ ANV_FROM_HANDLE(anv_pipeline_layout, pipeline_layout, pInfo->layout);
+ struct anv_pipeline_sets_layout *layout = &pipeline_layout->sets_layout;
+
+ assert(pInfo->firstSet + pInfo->descriptorSetCount <= MAX_SETS);
+
+ if (pInfo->stageFlags & VK_SHADER_STAGE_COMPUTE_BIT) {
+ uint32_t dynamicOffsetCount = pInfo->dynamicOffsetCount;
+ const uint32_t *pDynamicOffsets = pInfo->pDynamicOffsets;
+
+ for (uint32_t i = 0; i < pInfo->descriptorSetCount; i++) {
+ ANV_FROM_HANDLE(anv_descriptor_set, set, pInfo->pDescriptorSets[i]);
+ if (set == NULL)
+ continue;
+ anv_cmd_buffer_bind_descriptor_set(cmd_buffer,
+ VK_PIPELINE_BIND_POINT_COMPUTE,
+ layout, pInfo->firstSet + i, set,
+ &dynamicOffsetCount,
+ &pDynamicOffsets);
+ }
+ }
+ if (pInfo->stageFlags & ANV_GRAPHICS_STAGE_BITS) {
+ uint32_t dynamicOffsetCount = pInfo->dynamicOffsetCount;
+ const uint32_t *pDynamicOffsets = pInfo->pDynamicOffsets;
+
+ for (uint32_t i = 0; i < pInfo->descriptorSetCount; i++) {
+ ANV_FROM_HANDLE(anv_descriptor_set, set, pInfo->pDescriptorSets[i]);
+ if (set == NULL)
+ continue;
+ anv_cmd_buffer_bind_descriptor_set(cmd_buffer,
+ VK_PIPELINE_BIND_POINT_GRAPHICS,
+ layout, pInfo->firstSet + i, set,
+ &dynamicOffsetCount,
+ &pDynamicOffsets);
+ }
+ }
+ if (pInfo->stageFlags & ANV_RT_STAGE_BITS) {
+ uint32_t dynamicOffsetCount = pInfo->dynamicOffsetCount;
+ const uint32_t *pDynamicOffsets = pInfo->pDynamicOffsets;
+
+ for (uint32_t i = 0; i < pInfo->descriptorSetCount; i++) {
+ ANV_FROM_HANDLE(anv_descriptor_set, set, pInfo->pDescriptorSets[i]);
+ if (set == NULL)
+ continue;
+ anv_cmd_buffer_bind_descriptor_set(cmd_buffer,
+ VK_PIPELINE_BIND_POINT_RAY_TRACING_KHR,
+ layout, pInfo->firstSet + i, set,
+ &dynamicOffsetCount,
+ &pDynamicOffsets);
+ }
+ }
+}
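
Since the entry point above filters by stageFlags, one call can bind the same sets at several bind points, and the dynamic-offset cursor restarts for each of them. A usage sketch from the application side, assuming VK_KHR_maintenance6 is enabled and the extension entry point has been resolved; the handles are placeholders:

   #include <vulkan/vulkan.h>

   /* Bind one set for both compute and all graphics stages in a single call. */
   static void
   bind_set_everywhere(VkCommandBuffer cmd, VkPipelineLayout layout,
                       VkDescriptorSet set)
   {
      const VkBindDescriptorSetsInfoKHR info = {
         .sType = VK_STRUCTURE_TYPE_BIND_DESCRIPTOR_SETS_INFO_KHR,
         .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT | VK_SHADER_STAGE_ALL_GRAPHICS,
         .layout = layout,
         .firstSet = 0,
         .descriptorSetCount = 1,
         .pDescriptorSets = &set,
         .dynamicOffsetCount = 0,
      };
      vkCmdBindDescriptorSets2KHR(cmd, &info);
   }
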
- assert(firstSet + descriptorSetCount <= MAX_SETS);
+void anv_CmdBindDescriptorBuffersEXT(
+ VkCommandBuffer commandBuffer,
+ uint32_t bufferCount,
+ const VkDescriptorBufferBindingInfoEXT* pBindingInfos)
+{
+ ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+ struct anv_cmd_state *state = &cmd_buffer->state;
- for (uint32_t i = 0; i < descriptorSetCount; i++) {
- ANV_FROM_HANDLE(anv_descriptor_set, set, pDescriptorSets[i]);
- anv_cmd_buffer_bind_descriptor_set(cmd_buffer, pipelineBindPoint,
- layout, firstSet + i, set,
- &dynamicOffsetCount,
- &pDynamicOffsets);
+ for (uint32_t i = 0; i < bufferCount; i++) {
+ assert(pBindingInfos[i].address >= cmd_buffer->device->physical->va.descriptor_buffer_pool.addr &&
+ pBindingInfos[i].address < (cmd_buffer->device->physical->va.descriptor_buffer_pool.addr +
+ cmd_buffer->device->physical->va.descriptor_buffer_pool.size));
+
+ if (state->descriptor_buffers.address[i] != pBindingInfos[i].address) {
+ state->descriptor_buffers.address[i] = pBindingInfos[i].address;
+ if (pBindingInfos[i].usage & VK_BUFFER_USAGE_RESOURCE_DESCRIPTOR_BUFFER_BIT_EXT)
+ state->descriptor_buffers.surfaces_address = pBindingInfos[i].address;
+ if (pBindingInfos[i].usage & VK_BUFFER_USAGE_SAMPLER_DESCRIPTOR_BUFFER_BIT_EXT)
+ state->descriptor_buffers.samplers_address = pBindingInfos[i].address;
+ state->descriptor_buffers.dirty = true;
+ state->descriptor_buffers.offsets_dirty = ~0;
+ }
+ }
+
+ anv_cmd_buffer_maybe_dirty_descriptor_mode(cmd_buffer,
+ ANV_CMD_DESCRIPTOR_BUFFER_MODE_BUFFER);
+}
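
The asserts in the function above only sanity-check that each bound descriptor buffer falls inside the VA window the driver reserves for descriptor buffers. A standalone sketch of that containment test, with a made-up base address and window size:

   #include <stdbool.h>
   #include <stdint.h>

   /* Hypothetical reserved VA window for descriptor buffers. */
   #define DESC_POOL_BASE 0x0000200000000000ull
   #define DESC_POOL_SIZE 0x0000000100000000ull /* 4 GiB */

   static bool
   address_in_descriptor_pool(uint64_t addr)
   {
      return addr >= DESC_POOL_BASE &&
             addr < DESC_POOL_BASE + DESC_POOL_SIZE;
   }
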
+
+static void
+anv_cmd_buffer_set_descriptor_buffer_offsets(struct anv_cmd_buffer *cmd_buffer,
+ VkPipelineBindPoint bind_point,
+ struct anv_pipeline_layout *layout,
+ uint32_t first_set,
+ uint32_t set_count,
+ const VkDeviceSize *buffer_offsets,
+ const uint32_t *buffer_indices)
+{
+ for (uint32_t i = 0; i < set_count; i++) {
+ const uint32_t set_index = first_set + i;
+
+ const struct anv_descriptor_set_layout *set_layout =
+ layout->sets_layout.set[set_index].layout;
+ VkShaderStageFlags stages;
+ struct anv_cmd_pipeline_state *pipe_state =
+ anv_cmd_buffer_get_pipeline_layout_state(cmd_buffer, bind_point,
+ set_layout, &stages);
+
+ if (buffer_offsets[i] != pipe_state->descriptor_buffers[set_index].buffer_offset ||
+ buffer_indices[i] != pipe_state->descriptor_buffers[set_index].buffer_index ||
+ !pipe_state->descriptor_buffers[set_index].bound) {
+ pipe_state->descriptor_buffers[set_index].buffer_index = buffer_indices[i];
+ pipe_state->descriptor_buffers[set_index].buffer_offset = buffer_offsets[i];
+ cmd_buffer->state.descriptors_dirty |= stages;
+ cmd_buffer->state.descriptor_buffers.offsets_dirty |= stages;
+ }
+ pipe_state->descriptor_buffers[set_index].bound = true;
+ }
+}
+
+void anv_CmdSetDescriptorBufferOffsets2EXT(
+ VkCommandBuffer commandBuffer,
+ const VkSetDescriptorBufferOffsetsInfoEXT* pSetDescriptorBufferOffsetsInfo)
+{
+ ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+ ANV_FROM_HANDLE(anv_pipeline_layout, layout, pSetDescriptorBufferOffsetsInfo->layout);
+
+ if (pSetDescriptorBufferOffsetsInfo->stageFlags & VK_SHADER_STAGE_COMPUTE_BIT) {
+ anv_cmd_buffer_set_descriptor_buffer_offsets(cmd_buffer,
+ VK_PIPELINE_BIND_POINT_COMPUTE,
+ layout,
+ pSetDescriptorBufferOffsetsInfo->firstSet,
+ pSetDescriptorBufferOffsetsInfo->setCount,
+ pSetDescriptorBufferOffsetsInfo->pOffsets,
+ pSetDescriptorBufferOffsetsInfo->pBufferIndices);
+ }
+ if (pSetDescriptorBufferOffsetsInfo->stageFlags & ANV_GRAPHICS_STAGE_BITS) {
+ anv_cmd_buffer_set_descriptor_buffer_offsets(cmd_buffer,
+ VK_PIPELINE_BIND_POINT_GRAPHICS,
+ layout,
+ pSetDescriptorBufferOffsetsInfo->firstSet,
+ pSetDescriptorBufferOffsetsInfo->setCount,
+ pSetDescriptorBufferOffsetsInfo->pOffsets,
+ pSetDescriptorBufferOffsetsInfo->pBufferIndices);
+ }
+ if (pSetDescriptorBufferOffsetsInfo->stageFlags & ANV_RT_STAGE_BITS) {
+ anv_cmd_buffer_set_descriptor_buffer_offsets(cmd_buffer,
+ VK_PIPELINE_BIND_POINT_RAY_TRACING_KHR,
+ layout,
+ pSetDescriptorBufferOffsetsInfo->firstSet,
+ pSetDescriptorBufferOffsetsInfo->setCount,
+ pSetDescriptorBufferOffsetsInfo->pOffsets,
+ pSetDescriptorBufferOffsetsInfo->pBufferIndices);
}
}
-void anv_CmdBindVertexBuffers2EXT(
+void anv_CmdBindDescriptorBufferEmbeddedSamplers2EXT(
+ VkCommandBuffer commandBuffer,
+ const VkBindDescriptorBufferEmbeddedSamplersInfoEXT* pBindDescriptorBufferEmbeddedSamplersInfo)
+{
+ /* no-op */
+}
+
+void anv_CmdBindVertexBuffers2(
VkCommandBuffer commandBuffer,
uint32_t firstBinding,
uint32_t bindingCount,
@@ -1031,31 +1110,29 @@ void anv_CmdBindVertexBuffers2EXT(
/* We have to defer setting up vertex buffer since we need the buffer
* stride from the pipeline. */
- if (pSizes)
- cmd_buffer->state.gfx.dynamic.dyn_vbo_size = true;
- if (pStrides)
- cmd_buffer->state.gfx.dynamic.dyn_vbo_stride = true;
-
assert(firstBinding + bindingCount <= MAX_VBS);
for (uint32_t i = 0; i < bindingCount; i++) {
- vb[firstBinding + i].buffer = anv_buffer_from_handle(pBuffers[i]);
- vb[firstBinding + i].offset = pOffsets[i];
- vb[firstBinding + i].size = pSizes ? pSizes[i] : 0;
- vb[firstBinding + i].stride = pStrides ? pStrides[i] : 0;
+ ANV_FROM_HANDLE(anv_buffer, buffer, pBuffers[i]);
+
+ if (buffer == NULL) {
+ vb[firstBinding + i] = (struct anv_vertex_binding) {
+ .buffer = NULL,
+ };
+ } else {
+ vb[firstBinding + i] = (struct anv_vertex_binding) {
+ .buffer = buffer,
+ .offset = pOffsets[i],
+ .size = vk_buffer_range(&buffer->vk, pOffsets[i],
+ pSizes ? pSizes[i] : VK_WHOLE_SIZE),
+ };
+ }
cmd_buffer->state.gfx.vb_dirty |= 1 << (firstBinding + i);
}
-}
-void anv_CmdBindVertexBuffers(
- VkCommandBuffer commandBuffer,
- uint32_t firstBinding,
- uint32_t bindingCount,
- const VkBuffer* pBuffers,
- const VkDeviceSize* pOffsets)
-{
- return anv_CmdBindVertexBuffers2EXT(commandBuffer, firstBinding,
- bindingCount, pBuffers, pOffsets,
- NULL, NULL);
+ if (pStrides != NULL) {
+ vk_cmd_set_vertex_binding_strides(&cmd_buffer->vk, firstBinding,
+ bindingCount, pStrides);
+ }
}
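
The size stored above comes from vk_buffer_range(), which resolves VK_WHOLE_SIZE against the buffer's creation size. A conceptual re-statement of that helper, assuming it validates rather than clamps out-of-range inputs and taking a plain byte size instead of the runtime's vk_buffer object:

   #include <assert.h>
   #include <stdint.h>

   #define WHOLE_SIZE (~0ull) /* stand-in for VK_WHOLE_SIZE */

   /* Resolve an (offset, range) pair against the buffer's total size in bytes. */
   static uint64_t
   resolve_buffer_range(uint64_t buffer_size, uint64_t offset, uint64_t range)
   {
      assert(offset <= buffer_size);
      if (range == WHOLE_SIZE)
         return buffer_size - offset;       /* the rest of the buffer */
      assert(offset + range >= range);      /* no overflow */
      assert(offset + range <= buffer_size);
      return range;
   }
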
void anv_CmdBindTransformFeedbackBuffersEXT(
@@ -1081,8 +1158,8 @@ void anv_CmdBindTransformFeedbackBuffersEXT(
xfb[firstBinding + i].buffer = buffer;
xfb[firstBinding + i].offset = pOffsets[i];
xfb[firstBinding + i].size =
- anv_buffer_get_range(buffer, pOffsets[i],
- pSizes ? pSizes[i] : VK_WHOLE_SIZE);
+ vk_buffer_range(&buffer->vk, pOffsets[i],
+ pSizes ? pSizes[i] : VK_WHOLE_SIZE);
}
}
}
@@ -1146,9 +1223,12 @@ anv_cmd_buffer_gfx_push_constants(struct anv_cmd_buffer *cmd_buffer)
&cmd_buffer->state.gfx.base.push_constants;
struct anv_state state =
- anv_cmd_buffer_alloc_dynamic_state(cmd_buffer,
- sizeof(struct anv_push_constants),
- 32 /* bottom 5 bits MBZ */);
+ anv_cmd_buffer_alloc_temporary_state(cmd_buffer,
+ sizeof(struct anv_push_constants),
+ 32 /* bottom 5 bits MBZ */);
+ if (state.alloc_size == 0)
+ return state;
+
memcpy(state.map, data, sizeof(struct anv_push_constants));
return state;
@@ -1157,22 +1237,22 @@ anv_cmd_buffer_gfx_push_constants(struct anv_cmd_buffer *cmd_buffer)
struct anv_state
anv_cmd_buffer_cs_push_constants(struct anv_cmd_buffer *cmd_buffer)
{
- const struct intel_device_info *devinfo = &cmd_buffer->device->info;
- struct anv_push_constants *data =
- &cmd_buffer->state.compute.base.push_constants;
- struct anv_compute_pipeline *pipeline = cmd_buffer->state.compute.pipeline;
+ const struct intel_device_info *devinfo = cmd_buffer->device->info;
+ struct anv_cmd_pipeline_state *pipe_state = &cmd_buffer->state.compute.base;
+ struct anv_push_constants *data = &pipe_state->push_constants;
+ struct anv_compute_pipeline *pipeline =
+ anv_pipeline_to_compute(cmd_buffer->state.compute.base.pipeline);
const struct brw_cs_prog_data *cs_prog_data = get_cs_prog_data(pipeline);
const struct anv_push_range *range = &pipeline->cs->bind_map.push_ranges[0];
- const struct brw_cs_dispatch_info dispatch =
+ const struct intel_cs_dispatch_info dispatch =
brw_cs_get_dispatch_info(devinfo, cs_prog_data, NULL);
const unsigned total_push_constants_size =
brw_cs_push_const_total_size(cs_prog_data, dispatch.threads);
if (total_push_constants_size == 0)
return (struct anv_state) { .offset = 0 };
- const unsigned push_constant_alignment =
- cmd_buffer->device->info.ver < 8 ? 32 : 64;
+ const unsigned push_constant_alignment = 64;
const unsigned aligned_total_push_constants_size =
ALIGN(total_push_constants_size, push_constant_alignment);
struct anv_state state;
@@ -1185,6 +1265,8 @@ anv_cmd_buffer_cs_push_constants(struct anv_cmd_buffer *cmd_buffer)
aligned_total_push_constants_size,
push_constant_alignment);
}
+ if (state.map == NULL)
+ return state;
void *dst = state.map;
const void *src = (char *)data + (range->start * 32);
@@ -1211,394 +1293,131 @@ anv_cmd_buffer_cs_push_constants(struct anv_cmd_buffer *cmd_buffer)
return state;
}
-void anv_CmdPushConstants(
+void anv_CmdPushConstants2KHR(
VkCommandBuffer commandBuffer,
- VkPipelineLayout layout,
- VkShaderStageFlags stageFlags,
- uint32_t offset,
- uint32_t size,
- const void* pValues)
+ const VkPushConstantsInfoKHR* pInfo)
{
ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
- if (stageFlags & VK_SHADER_STAGE_ALL_GRAPHICS) {
+ if (pInfo->stageFlags & ANV_GRAPHICS_STAGE_BITS) {
struct anv_cmd_pipeline_state *pipe_state =
&cmd_buffer->state.gfx.base;
- memcpy(pipe_state->push_constants.client_data + offset, pValues, size);
+ memcpy(pipe_state->push_constants.client_data + pInfo->offset,
+ pInfo->pValues, pInfo->size);
+ pipe_state->push_constants_data_dirty = true;
}
- if (stageFlags & VK_SHADER_STAGE_COMPUTE_BIT) {
+ if (pInfo->stageFlags & VK_SHADER_STAGE_COMPUTE_BIT) {
struct anv_cmd_pipeline_state *pipe_state =
&cmd_buffer->state.compute.base;
- memcpy(pipe_state->push_constants.client_data + offset, pValues, size);
+ memcpy(pipe_state->push_constants.client_data + pInfo->offset,
+ pInfo->pValues, pInfo->size);
+ pipe_state->push_constants_data_dirty = true;
}
- if (stageFlags & (VK_SHADER_STAGE_RAYGEN_BIT_KHR |
- VK_SHADER_STAGE_ANY_HIT_BIT_KHR |
- VK_SHADER_STAGE_CLOSEST_HIT_BIT_KHR |
- VK_SHADER_STAGE_MISS_BIT_KHR |
- VK_SHADER_STAGE_INTERSECTION_BIT_KHR |
- VK_SHADER_STAGE_CALLABLE_BIT_KHR)) {
+ if (pInfo->stageFlags & ANV_RT_STAGE_BITS) {
struct anv_cmd_pipeline_state *pipe_state =
&cmd_buffer->state.rt.base;
- memcpy(pipe_state->push_constants.client_data + offset, pValues, size);
+ memcpy(pipe_state->push_constants.client_data + pInfo->offset,
+ pInfo->pValues, pInfo->size);
+ pipe_state->push_constants_data_dirty = true;
}
- cmd_buffer->state.push_constants_dirty |= stageFlags;
-}
-
-VkResult anv_CreateCommandPool(
- VkDevice _device,
- const VkCommandPoolCreateInfo* pCreateInfo,
- const VkAllocationCallbacks* pAllocator,
- VkCommandPool* pCmdPool)
-{
- ANV_FROM_HANDLE(anv_device, device, _device);
- struct anv_cmd_pool *pool;
-
- pool = vk_object_alloc(&device->vk, pAllocator, sizeof(*pool),
- VK_OBJECT_TYPE_COMMAND_POOL);
- if (pool == NULL)
- return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
-
- if (pAllocator)
- pool->alloc = *pAllocator;
- else
- pool->alloc = device->vk.alloc;
-
- list_inithead(&pool->cmd_buffers);
-
- pool->flags = pCreateInfo->flags;
-
- *pCmdPool = anv_cmd_pool_to_handle(pool);
-
- return VK_SUCCESS;
+ cmd_buffer->state.push_constants_dirty |= pInfo->stageFlags;
}
-void anv_DestroyCommandPool(
- VkDevice _device,
- VkCommandPool commandPool,
- const VkAllocationCallbacks* pAllocator)
+static struct anv_cmd_pipeline_state *
+anv_cmd_buffer_get_pipe_state(struct anv_cmd_buffer *cmd_buffer,
+ VkPipelineBindPoint bind_point)
{
- ANV_FROM_HANDLE(anv_device, device, _device);
- ANV_FROM_HANDLE(anv_cmd_pool, pool, commandPool);
-
- if (!pool)
- return;
-
- list_for_each_entry_safe(struct anv_cmd_buffer, cmd_buffer,
- &pool->cmd_buffers, pool_link) {
- anv_cmd_buffer_destroy(cmd_buffer);
- }
-
- vk_object_free(&device->vk, pAllocator, pool);
-}
-
-VkResult anv_ResetCommandPool(
- VkDevice device,
- VkCommandPool commandPool,
- VkCommandPoolResetFlags flags)
-{
- ANV_FROM_HANDLE(anv_cmd_pool, pool, commandPool);
-
- list_for_each_entry(struct anv_cmd_buffer, cmd_buffer,
- &pool->cmd_buffers, pool_link) {
- anv_cmd_buffer_reset(cmd_buffer);
- }
-
- return VK_SUCCESS;
-}
-
-void anv_TrimCommandPool(
- VkDevice device,
- VkCommandPool commandPool,
- VkCommandPoolTrimFlags flags)
-{
- /* Nothing for us to do here. Our pools stay pretty tidy. */
-}
-
-/**
- * Return NULL if the current subpass has no depthstencil attachment.
- */
-const struct anv_image_view *
-anv_cmd_buffer_get_depth_stencil_view(const struct anv_cmd_buffer *cmd_buffer)
-{
- const struct anv_subpass *subpass = cmd_buffer->state.subpass;
-
- if (subpass->depth_stencil_attachment == NULL)
- return NULL;
-
- const struct anv_image_view *iview =
- cmd_buffer->state.attachments[subpass->depth_stencil_attachment->attachment].image_view;
-
- assert(iview->vk.aspects & (VK_IMAGE_ASPECT_DEPTH_BIT |
- VK_IMAGE_ASPECT_STENCIL_BIT));
-
- return iview;
-}
-
-static struct anv_descriptor_set *
-anv_cmd_buffer_push_descriptor_set(struct anv_cmd_buffer *cmd_buffer,
- VkPipelineBindPoint bind_point,
- struct anv_descriptor_set_layout *layout,
- uint32_t _set)
-{
- struct anv_cmd_pipeline_state *pipe_state;
-
switch (bind_point) {
case VK_PIPELINE_BIND_POINT_GRAPHICS:
- pipe_state = &cmd_buffer->state.gfx.base;
- break;
-
+ return &cmd_buffer->state.gfx.base;
case VK_PIPELINE_BIND_POINT_COMPUTE:
- pipe_state = &cmd_buffer->state.compute.base;
- break;
-
+ return &cmd_buffer->state.compute.base;
case VK_PIPELINE_BIND_POINT_RAY_TRACING_KHR:
- pipe_state = &cmd_buffer->state.rt.base;
+ return &cmd_buffer->state.rt.base;
break;
-
default:
unreachable("invalid bind point");
}
-
- struct anv_push_descriptor_set **push_set =
- &pipe_state->push_descriptors[_set];
-
- if (*push_set == NULL) {
- *push_set = vk_zalloc(&cmd_buffer->pool->alloc,
- sizeof(struct anv_push_descriptor_set), 8,
- VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
- if (*push_set == NULL) {
- anv_batch_set_error(&cmd_buffer->batch, VK_ERROR_OUT_OF_HOST_MEMORY);
- return NULL;
- }
- }
-
- struct anv_descriptor_set *set = &(*push_set)->set;
-
- if (set->layout != layout) {
- if (set->layout)
- anv_descriptor_set_layout_unref(cmd_buffer->device, set->layout);
- anv_descriptor_set_layout_ref(layout);
- set->layout = layout;
- }
- set->size = anv_descriptor_set_layout_size(layout, 0);
- set->buffer_view_count = layout->buffer_view_count;
- set->descriptor_count = layout->descriptor_count;
- set->buffer_views = (*push_set)->buffer_views;
-
- if (layout->descriptor_buffer_size &&
- ((*push_set)->set_used_on_gpu ||
- set->desc_mem.alloc_size < layout->descriptor_buffer_size)) {
- /* The previous buffer is either actively used by some GPU command (so
- * we can't modify it) or is too small. Allocate a new one.
- */
- struct anv_state desc_mem =
- anv_state_stream_alloc(&cmd_buffer->dynamic_state_stream,
- anv_descriptor_set_layout_descriptor_buffer_size(layout, 0),
- ANV_UBO_ALIGNMENT);
- if (set->desc_mem.alloc_size) {
- /* TODO: Do we really need to copy all the time? */
- memcpy(desc_mem.map, set->desc_mem.map,
- MIN2(desc_mem.alloc_size, set->desc_mem.alloc_size));
- }
- set->desc_mem = desc_mem;
-
- set->desc_addr = (struct anv_address) {
- .bo = cmd_buffer->dynamic_state_stream.state_pool->block_pool.bo,
- .offset = set->desc_mem.offset,
- };
-
- enum isl_format format =
- anv_isl_format_for_descriptor_type(cmd_buffer->device,
- VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER);
-
- const struct isl_device *isl_dev = &cmd_buffer->device->isl_dev;
- set->desc_surface_state =
- anv_state_stream_alloc(&cmd_buffer->surface_state_stream,
- isl_dev->ss.size, isl_dev->ss.align);
- anv_fill_buffer_surface_state(cmd_buffer->device,
- set->desc_surface_state, format,
- ISL_SURF_USAGE_CONSTANT_BUFFER_BIT,
- set->desc_addr,
- layout->descriptor_buffer_size, 1);
- }
-
- return set;
}
-void anv_CmdPushDescriptorSetKHR(
- VkCommandBuffer commandBuffer,
- VkPipelineBindPoint pipelineBindPoint,
- VkPipelineLayout _layout,
- uint32_t _set,
- uint32_t descriptorWriteCount,
- const VkWriteDescriptorSet* pDescriptorWrites)
-{
- ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
- ANV_FROM_HANDLE(anv_pipeline_layout, layout, _layout);
-
- assert(_set < MAX_SETS);
-
- struct anv_descriptor_set_layout *set_layout = layout->set[_set].layout;
-
- struct anv_descriptor_set *set =
- anv_cmd_buffer_push_descriptor_set(cmd_buffer, pipelineBindPoint,
- set_layout, _set);
- if (!set)
- return;
-
- /* Go through the user supplied descriptors. */
- for (uint32_t i = 0; i < descriptorWriteCount; i++) {
- const VkWriteDescriptorSet *write = &pDescriptorWrites[i];
-
- switch (write->descriptorType) {
- case VK_DESCRIPTOR_TYPE_SAMPLER:
- case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER:
- case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE:
- case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE:
- case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT:
- for (uint32_t j = 0; j < write->descriptorCount; j++) {
- anv_descriptor_set_write_image_view(cmd_buffer->device, set,
- write->pImageInfo + j,
- write->descriptorType,
- write->dstBinding,
- write->dstArrayElement + j);
- }
- break;
-
- case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER:
- case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER:
- for (uint32_t j = 0; j < write->descriptorCount; j++) {
- ANV_FROM_HANDLE(anv_buffer_view, bview,
- write->pTexelBufferView[j]);
-
- anv_descriptor_set_write_buffer_view(cmd_buffer->device, set,
- write->descriptorType,
- bview,
- write->dstBinding,
- write->dstArrayElement + j);
- }
- break;
-
- case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER:
- case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER:
- case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC:
- case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC:
- for (uint32_t j = 0; j < write->descriptorCount; j++) {
- ANV_FROM_HANDLE(anv_buffer, buffer, write->pBufferInfo[j].buffer);
-
- anv_descriptor_set_write_buffer(cmd_buffer->device, set,
- &cmd_buffer->surface_state_stream,
- write->descriptorType,
- buffer,
- write->dstBinding,
- write->dstArrayElement + j,
- write->pBufferInfo[j].offset,
- write->pBufferInfo[j].range);
- }
- break;
-
- case VK_DESCRIPTOR_TYPE_ACCELERATION_STRUCTURE_KHR: {
- const VkWriteDescriptorSetAccelerationStructureKHR *accel_write =
- vk_find_struct_const(write, WRITE_DESCRIPTOR_SET_ACCELERATION_STRUCTURE_KHR);
- assert(accel_write->accelerationStructureCount ==
- write->descriptorCount);
- for (uint32_t j = 0; j < write->descriptorCount; j++) {
- ANV_FROM_HANDLE(anv_acceleration_structure, accel,
- accel_write->pAccelerationStructures[j]);
- anv_descriptor_set_write_acceleration_structure(cmd_buffer->device,
- set, accel,
- write->dstBinding,
- write->dstArrayElement + j);
- }
- break;
- }
-
- default:
- break;
- }
- }
-
- anv_cmd_buffer_bind_descriptor_set(cmd_buffer, pipelineBindPoint,
- layout, _set, set, NULL, NULL);
-}
-
-void anv_CmdPushDescriptorSetWithTemplateKHR(
- VkCommandBuffer commandBuffer,
- VkDescriptorUpdateTemplate descriptorUpdateTemplate,
- VkPipelineLayout _layout,
- uint32_t _set,
- const void* pData)
+static void
+anv_cmd_buffer_push_descriptor_sets(struct anv_cmd_buffer *cmd_buffer,
+ VkPipelineBindPoint bind_point,
+ const VkPushDescriptorSetInfoKHR *pInfo)
{
- ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
- ANV_FROM_HANDLE(anv_descriptor_update_template, template,
- descriptorUpdateTemplate);
- ANV_FROM_HANDLE(anv_pipeline_layout, layout, _layout);
+ ANV_FROM_HANDLE(anv_pipeline_layout, pipeline_layout, pInfo->layout);
+ struct anv_pipeline_sets_layout *layout = &pipeline_layout->sets_layout;
- assert(_set < MAX_PUSH_DESCRIPTORS);
+ assert(pInfo->set < MAX_SETS);
- struct anv_descriptor_set_layout *set_layout = layout->set[_set].layout;
-
- struct anv_descriptor_set *set =
- anv_cmd_buffer_push_descriptor_set(cmd_buffer, template->bind_point,
- set_layout, _set);
- if (!set)
+ struct anv_descriptor_set_layout *set_layout = layout->set[pInfo->set].layout;
+ struct anv_push_descriptor_set *push_set =
+ &anv_cmd_buffer_get_pipe_state(cmd_buffer,
+ bind_point)->push_descriptor;
+ if (!anv_push_descriptor_set_init(cmd_buffer, push_set, set_layout))
return;
- anv_descriptor_set_write_template(cmd_buffer->device, set,
- &cmd_buffer->surface_state_stream,
- template,
- pData);
-
- anv_cmd_buffer_bind_descriptor_set(cmd_buffer, template->bind_point,
- layout, _set, set, NULL, NULL);
-}
+ anv_descriptor_set_write(cmd_buffer->device, &push_set->set,
+ pInfo->descriptorWriteCount,
+ pInfo->pDescriptorWrites);
-void anv_CmdSetDeviceMask(
- VkCommandBuffer commandBuffer,
- uint32_t deviceMask)
-{
- /* No-op */
+ anv_cmd_buffer_bind_descriptor_set(cmd_buffer, bind_point,
+ layout, pInfo->set, &push_set->set,
+ NULL, NULL);
}
-void anv_CmdSetColorWriteEnableEXT(
- VkCommandBuffer commandBuffer,
- uint32_t attachmentCount,
- const VkBool32* pColorWriteEnables)
+void anv_CmdPushDescriptorSet2KHR(
+ VkCommandBuffer commandBuffer,
+ const VkPushDescriptorSetInfoKHR* pInfo)
{
ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
- assert(attachmentCount < MAX_RTS);
-
- uint8_t color_writes = 0;
- for (uint32_t i = 0; i < attachmentCount; i++)
- color_writes |= pColorWriteEnables[i] ? (1 << i) : 0;
-
- if (cmd_buffer->state.gfx.dynamic.color_writes != color_writes) {
- cmd_buffer->state.gfx.dynamic.color_writes = color_writes;
- cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_DYNAMIC_COLOR_BLEND_STATE;
- }
+ if (pInfo->stageFlags & VK_SHADER_STAGE_COMPUTE_BIT)
+ anv_cmd_buffer_push_descriptor_sets(cmd_buffer,
+ VK_PIPELINE_BIND_POINT_COMPUTE,
+ pInfo);
+ if (pInfo->stageFlags & ANV_GRAPHICS_STAGE_BITS)
+ anv_cmd_buffer_push_descriptor_sets(cmd_buffer,
+ VK_PIPELINE_BIND_POINT_GRAPHICS,
+ pInfo);
+ if (pInfo->stageFlags & ANV_RT_STAGE_BITS)
+ anv_cmd_buffer_push_descriptor_sets(cmd_buffer,
+ VK_PIPELINE_BIND_POINT_RAY_TRACING_KHR,
+ pInfo);
}
-void anv_CmdSetFragmentShadingRateKHR(
- VkCommandBuffer commandBuffer,
- const VkExtent2D* pFragmentSize,
- const VkFragmentShadingRateCombinerOpKHR combinerOps[2])
+void anv_CmdPushDescriptorSetWithTemplate2KHR(
+ VkCommandBuffer commandBuffer,
+ const VkPushDescriptorSetWithTemplateInfoKHR* pInfo)
{
ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+ VK_FROM_HANDLE(vk_descriptor_update_template, template,
+ pInfo->descriptorUpdateTemplate);
+ ANV_FROM_HANDLE(anv_pipeline_layout, pipeline_layout, pInfo->layout);
+ struct anv_pipeline_sets_layout *layout = &pipeline_layout->sets_layout;
+
+ assert(pInfo->set < MAX_PUSH_DESCRIPTORS);
+
+ struct anv_descriptor_set_layout *set_layout = layout->set[pInfo->set].layout;
+ UNUSED VkShaderStageFlags stages;
+ struct anv_cmd_pipeline_state *pipe_state =
+ anv_cmd_buffer_get_pipeline_layout_state(cmd_buffer, template->bind_point,
+ set_layout, &stages);
+ struct anv_push_descriptor_set *push_set = &pipe_state->push_descriptor;
+ if (!anv_push_descriptor_set_init(cmd_buffer, push_set, set_layout))
+ return;
- cmd_buffer->state.gfx.dynamic.fragment_shading_rate = *pFragmentSize;
- cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_DYNAMIC_SHADING_RATE;
-}
+ anv_descriptor_set_write_template(cmd_buffer->device, &push_set->set,
+ template,
+ pInfo->pData);
-static inline uint32_t
-ilog2_round_up(uint32_t value)
-{
- assert(value != 0);
- return 32 - __builtin_clz(value - 1);
+ anv_cmd_buffer_bind_descriptor_set(cmd_buffer, template->bind_point,
+ layout, pInfo->set, &push_set->set,
+ NULL, NULL);
}
void anv_CmdSetRayTracingPipelineStackSizeKHR(
@@ -1614,14 +1433,14 @@ void anv_CmdSetRayTracingPipelineStackSizeKHR(
uint32_t stack_ids_per_dss = 2048; /* TODO */
- unsigned stack_size_log2 = ilog2_round_up(pipelineStackSize);
+ unsigned stack_size_log2 = util_logbase2_ceil(pipelineStackSize);
if (stack_size_log2 < 10)
stack_size_log2 = 10;
if (rt->scratch.layout.total_size == 1 << stack_size_log2)
return;
- brw_rt_compute_scratch_layout(&rt->scratch.layout, &device->info,
+ brw_rt_compute_scratch_layout(&rt->scratch.layout, device->info,
stack_ids_per_dss, 1 << stack_size_log2);
unsigned bucket = stack_size_log2 - 10;
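
As a worked example of the rounding above: pipelineStackSize = 1500 gives util_logbase2_ceil(1500) = 11, which is already at least the floor of 10, so each stack gets 1 << 11 = 2048 bytes and the bucket index is 11 - 10 = 1. A portable sketch of the same computation, with a naive log2-ceiling standing in for Mesa's util_logbase2_ceil():

   #include <stdint.h>

   /* Smallest n such that (1u << n) >= value, for value > 0. */
   static unsigned
   log2_ceil(uint32_t value)
   {
      unsigned n = 0;
      while ((1ull << n) < value)
         n++;
      return n;
   }

   static unsigned
   stack_bucket(uint32_t pipeline_stack_size)
   {
      unsigned log2_size = log2_ceil(pipeline_stack_size);
      if (log2_size < 10)
         log2_size = 10;       /* never go below 1 KiB per stack */
      return log2_size - 10;   /* bucket 0 == 1 KiB, 1 == 2 KiB, ... */
   }
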
@@ -1632,7 +1451,7 @@ void anv_CmdSetRayTracingPipelineStackSizeKHR(
struct anv_bo *new_bo;
VkResult result = anv_device_alloc_bo(device, "RT scratch",
rt->scratch.layout.total_size,
- 0, /* alloc_flags */
+ ANV_BO_ALLOC_INTERNAL, /* alloc_flags */
0, /* explicit_address */
&new_bo);
if (result != VK_SUCCESS) {
@@ -1651,3 +1470,69 @@ void anv_CmdSetRayTracingPipelineStackSizeKHR(
rt->scratch.bo = bo;
}
+
+void
+anv_cmd_buffer_save_state(struct anv_cmd_buffer *cmd_buffer,
+ uint32_t flags,
+ struct anv_cmd_saved_state *state)
+{
+ state->flags = flags;
+
+ /* We only support the compute pipeline at the moment. */
+ assert(state->flags & ANV_CMD_SAVED_STATE_COMPUTE_PIPELINE);
+ const struct anv_cmd_pipeline_state *pipe_state =
+ &cmd_buffer->state.compute.base;
+
+ if (state->flags & ANV_CMD_SAVED_STATE_COMPUTE_PIPELINE)
+ state->pipeline = pipe_state->pipeline;
+
+ if (state->flags & ANV_CMD_SAVED_STATE_DESCRIPTOR_SET_0)
+ state->descriptor_set = pipe_state->descriptors[0];
+
+ if (state->flags & ANV_CMD_SAVED_STATE_PUSH_CONSTANTS) {
+ memcpy(state->push_constants, pipe_state->push_constants.client_data,
+ sizeof(state->push_constants));
+ }
+}
+
+void
+anv_cmd_buffer_restore_state(struct anv_cmd_buffer *cmd_buffer,
+ struct anv_cmd_saved_state *state)
+{
+ VkCommandBuffer cmd_buffer_ = anv_cmd_buffer_to_handle(cmd_buffer);
+
+ assert(state->flags & ANV_CMD_SAVED_STATE_COMPUTE_PIPELINE);
+ const VkPipelineBindPoint bind_point = VK_PIPELINE_BIND_POINT_COMPUTE;
+ const VkShaderStageFlags stage_flags = VK_SHADER_STAGE_COMPUTE_BIT;
+ struct anv_cmd_pipeline_state *pipe_state = &cmd_buffer->state.compute.base;
+
+ if (state->flags & ANV_CMD_SAVED_STATE_COMPUTE_PIPELINE) {
+ if (state->pipeline) {
+ anv_CmdBindPipeline(cmd_buffer_, bind_point,
+ anv_pipeline_to_handle(state->pipeline));
+ } else {
+ pipe_state->pipeline = NULL;
+ }
+ }
+
+ if (state->flags & ANV_CMD_SAVED_STATE_DESCRIPTOR_SET_0) {
+ if (state->descriptor_set) {
+ anv_cmd_buffer_bind_descriptor_set(cmd_buffer, bind_point, NULL, 0,
+ state->descriptor_set, NULL, NULL);
+ } else {
+ pipe_state->descriptors[0] = NULL;
+ }
+ }
+
+ if (state->flags & ANV_CMD_SAVED_STATE_PUSH_CONSTANTS) {
+ VkPushConstantsInfoKHR push_info = {
+ .sType = VK_STRUCTURE_TYPE_PUSH_CONSTANTS_INFO_KHR,
+ .layout = VK_NULL_HANDLE,
+ .stageFlags = stage_flags,
+ .offset = 0,
+ .size = sizeof(state->push_constants),
+ .pValues = state->push_constants,
+ };
+ anv_CmdPushConstants2KHR(cmd_buffer_, &push_info);
+ }
+}
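
The save/restore pair above lets internal meta operations temporarily take over the compute bind point and then put the application's state back. A hedged sketch of the intended call pattern, assuming the usual anv_private.h types are available; the dispatch in the middle stands in for whatever internal kernel is being run:

   /* Illustrative only: wraps an internal compute dispatch so that the
    * application's pipeline, set 0 and push constants survive it. */
   static void
   run_internal_compute_op(struct anv_cmd_buffer *cmd_buffer)
   {
      struct anv_cmd_saved_state saved;
      const uint32_t flags = ANV_CMD_SAVED_STATE_COMPUTE_PIPELINE |
                             ANV_CMD_SAVED_STATE_DESCRIPTOR_SET_0 |
                             ANV_CMD_SAVED_STATE_PUSH_CONSTANTS;

      anv_cmd_buffer_save_state(cmd_buffer, flags, &saved);

      /* bind the internal pipeline/descriptors and dispatch here */

      anv_cmd_buffer_restore_state(cmd_buffer, &saved);
   }
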
diff --git a/src/intel/vulkan/anv_descriptor_set.c b/src/intel/vulkan/anv_descriptor_set.c
index cab5402e51b..21fa7f534ca 100644
--- a/src/intel/vulkan/anv_descriptor_set.c
+++ b/src/intel/vulkan/anv_descriptor_set.c
@@ -36,215 +36,465 @@
* Descriptor set layouts.
*/
+static void
+anv_descriptor_data_alignment(enum anv_descriptor_data data,
+ enum anv_descriptor_set_layout_type layout_type,
+ unsigned *out_surface_align,
+ unsigned *out_sampler_align)
+{
+ unsigned surface_align = 1, sampler_align = 1;
+
+ if (data & (ANV_DESCRIPTOR_INDIRECT_SAMPLED_IMAGE |
+ ANV_DESCRIPTOR_INDIRECT_STORAGE_IMAGE |
+ ANV_DESCRIPTOR_INDIRECT_ADDRESS_RANGE))
+ surface_align = MAX2(surface_align, 8);
+
+ if (data & ANV_DESCRIPTOR_SURFACE)
+ surface_align = MAX2(surface_align, ANV_SURFACE_STATE_SIZE);
+
+ if (data & ANV_DESCRIPTOR_SURFACE_SAMPLER) {
+ surface_align = MAX2(surface_align, ANV_SURFACE_STATE_SIZE);
+ if (layout_type == ANV_PIPELINE_DESCRIPTOR_SET_LAYOUT_TYPE_DIRECT)
+ sampler_align = MAX2(sampler_align, ANV_SAMPLER_STATE_SIZE);
+ }
+
+ if (data & ANV_DESCRIPTOR_SAMPLER) {
+ if (layout_type == ANV_PIPELINE_DESCRIPTOR_SET_LAYOUT_TYPE_DIRECT)
+ sampler_align = MAX2(sampler_align, ANV_SAMPLER_STATE_SIZE);
+ else
+ surface_align = MAX2(surface_align, ANV_SAMPLER_STATE_SIZE);
+ }
+
+ if (data & ANV_DESCRIPTOR_INLINE_UNIFORM)
+ surface_align = MAX2(surface_align, ANV_UBO_ALIGNMENT);
+
+ *out_surface_align = surface_align;
+ *out_sampler_align = sampler_align;
+}
+
static enum anv_descriptor_data
-anv_descriptor_data_for_type(const struct anv_physical_device *device,
- VkDescriptorType type)
+anv_indirect_descriptor_data_for_type(VkDescriptorType type)
{
enum anv_descriptor_data data = 0;
switch (type) {
case VK_DESCRIPTOR_TYPE_SAMPLER:
- data = ANV_DESCRIPTOR_SAMPLER_STATE;
- if (device->has_bindless_samplers)
- data |= ANV_DESCRIPTOR_SAMPLED_IMAGE;
+ data = ANV_DESCRIPTOR_BTI_SAMPLER_STATE |
+ ANV_DESCRIPTOR_INDIRECT_SAMPLED_IMAGE;
break;
case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER:
- data = ANV_DESCRIPTOR_SURFACE_STATE |
- ANV_DESCRIPTOR_SAMPLER_STATE;
- if (device->has_bindless_images || device->has_bindless_samplers)
- data |= ANV_DESCRIPTOR_SAMPLED_IMAGE;
+ data = ANV_DESCRIPTOR_BTI_SURFACE_STATE |
+ ANV_DESCRIPTOR_BTI_SAMPLER_STATE |
+ ANV_DESCRIPTOR_INDIRECT_SAMPLED_IMAGE;
break;
case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE:
case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER:
- data = ANV_DESCRIPTOR_SURFACE_STATE;
- if (device->has_bindless_images)
- data |= ANV_DESCRIPTOR_SAMPLED_IMAGE;
- break;
-
case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT:
- data = ANV_DESCRIPTOR_SURFACE_STATE;
+ data = ANV_DESCRIPTOR_BTI_SURFACE_STATE |
+ ANV_DESCRIPTOR_INDIRECT_SAMPLED_IMAGE;
break;
case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE:
case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER:
- data = ANV_DESCRIPTOR_SURFACE_STATE;
- if (device->info.ver < 9)
- data |= ANV_DESCRIPTOR_IMAGE_PARAM;
- if (device->has_bindless_images)
- data |= ANV_DESCRIPTOR_STORAGE_IMAGE;
+ data = ANV_DESCRIPTOR_BTI_SURFACE_STATE |
+ ANV_DESCRIPTOR_INDIRECT_STORAGE_IMAGE;
break;
case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER:
case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER:
- data = ANV_DESCRIPTOR_SURFACE_STATE |
+ data = ANV_DESCRIPTOR_BTI_SURFACE_STATE |
ANV_DESCRIPTOR_BUFFER_VIEW;
break;
case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC:
case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC:
- data = ANV_DESCRIPTOR_SURFACE_STATE;
+ data = ANV_DESCRIPTOR_BTI_SURFACE_STATE;
break;
- case VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT:
+ case VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK:
data = ANV_DESCRIPTOR_INLINE_UNIFORM;
break;
case VK_DESCRIPTOR_TYPE_ACCELERATION_STRUCTURE_KHR:
- data = ANV_DESCRIPTOR_ADDRESS_RANGE;
+ data = ANV_DESCRIPTOR_INDIRECT_ADDRESS_RANGE;
break;
default:
unreachable("Unsupported descriptor type");
}
- /* On gfx8 and above when we have softpin enabled, we also need to push
- * SSBO address ranges so that we can use A64 messages in the shader.
- */
- if (device->has_a64_buffer_access &&
- (type == VK_DESCRIPTOR_TYPE_STORAGE_BUFFER ||
- type == VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC ||
- type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER ||
- type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC))
- data |= ANV_DESCRIPTOR_ADDRESS_RANGE;
-
- /* On Ivy Bridge and Bay Trail, we need swizzles textures in the shader
- * Do not handle VK_DESCRIPTOR_TYPE_STORAGE_IMAGE and
- * VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT because they already must
- * have identity swizzle.
+ /* We also need to push SSBO address ranges so that we can use A64
+ * messages in the shader.
*/
- if (device->info.verx10 == 70 &&
- (type == VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE ||
- type == VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER))
- data |= ANV_DESCRIPTOR_TEXTURE_SWIZZLE;
+ if (type == VK_DESCRIPTOR_TYPE_STORAGE_BUFFER ||
+ type == VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC ||
+ type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER ||
+ type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC)
+ data |= ANV_DESCRIPTOR_INDIRECT_ADDRESS_RANGE;
+
+ return data;
+}
+
+static enum anv_descriptor_data
+anv_direct_descriptor_data_for_type(const struct anv_physical_device *device,
+ enum anv_descriptor_set_layout_type layout_type,
+ VkDescriptorSetLayoutCreateFlags set_flags,
+ VkDescriptorType type)
+{
+ enum anv_descriptor_data data = 0;
+
+ switch (type) {
+ case VK_DESCRIPTOR_TYPE_SAMPLER:
+ if (set_flags & VK_DESCRIPTOR_SET_LAYOUT_CREATE_EMBEDDED_IMMUTABLE_SAMPLERS_BIT_EXT)
+ return 0;
+ data = ANV_DESCRIPTOR_BTI_SAMPLER_STATE |
+ ANV_DESCRIPTOR_SAMPLER;
+ break;
+
+ case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER:
+ if (layout_type == ANV_PIPELINE_DESCRIPTOR_SET_LAYOUT_TYPE_DIRECT) {
+ data = ANV_DESCRIPTOR_BTI_SURFACE_STATE |
+ ANV_DESCRIPTOR_BTI_SAMPLER_STATE |
+ ANV_DESCRIPTOR_SURFACE |
+ ANV_DESCRIPTOR_SAMPLER;
+ } else {
+ data = ANV_DESCRIPTOR_BTI_SURFACE_STATE |
+ ANV_DESCRIPTOR_BTI_SAMPLER_STATE |
+ ANV_DESCRIPTOR_SURFACE_SAMPLER;
+ }
+ break;
+
+ case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE:
+ case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER:
+ case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT:
+ case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER:
+ case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE:
+ case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER:
+ case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER:
+ case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC:
+ case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC:
+ data = ANV_DESCRIPTOR_BTI_SURFACE_STATE |
+ ANV_DESCRIPTOR_SURFACE;
+ break;
+
+ case VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK:
+ data = ANV_DESCRIPTOR_INLINE_UNIFORM;
+ break;
+
+ case VK_DESCRIPTOR_TYPE_ACCELERATION_STRUCTURE_KHR:
+ data = ANV_DESCRIPTOR_INDIRECT_ADDRESS_RANGE;
+ break;
+
+ default:
+ unreachable("Unsupported descriptor type");
+ }
+
+ if (layout_type == ANV_PIPELINE_DESCRIPTOR_SET_LAYOUT_TYPE_BUFFER) {
+ if (set_flags & VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR) {
+ /* Push descriptors are special with descriptor buffers. On Gfx12.5+
+ * they have their own pool and are not reachable by the binding
+ * table. On previous generations, they are only reachable through
+ * the binding table.
+ */
+ if (device->uses_ex_bso) {
+ data &= ~(ANV_DESCRIPTOR_BTI_SURFACE_STATE |
+ ANV_DESCRIPTOR_BTI_SAMPLER_STATE);
+ }
+ } else {
+ /* Non-push descriptor buffers cannot be accessed through the binding
+ * table on any platform.
+ */
+ data &= ~(ANV_DESCRIPTOR_BTI_SURFACE_STATE |
+ ANV_DESCRIPTOR_BTI_SAMPLER_STATE);
+ }
+ }
return data;
}
-static unsigned
-anv_descriptor_data_size(enum anv_descriptor_data data)
+static enum anv_descriptor_data
+anv_descriptor_data_for_type(const struct anv_physical_device *device,
+ enum anv_descriptor_set_layout_type layout_type,
+ VkDescriptorSetLayoutCreateFlags set_flags,
+ VkDescriptorType type)
{
- unsigned size = 0;
+ if (layout_type == ANV_PIPELINE_DESCRIPTOR_SET_LAYOUT_TYPE_BUFFER)
+ return anv_direct_descriptor_data_for_type(device, layout_type, set_flags, type);
+ else if (device->indirect_descriptors)
+ return anv_indirect_descriptor_data_for_type(type);
+ else
+ return anv_direct_descriptor_data_for_type(device, layout_type, set_flags, type);
+}
- if (data & ANV_DESCRIPTOR_SAMPLED_IMAGE)
- size += sizeof(struct anv_sampled_image_descriptor);
+static enum anv_descriptor_data
+anv_descriptor_data_for_mutable_type(const struct anv_physical_device *device,
+ enum anv_descriptor_set_layout_type layout_type,
+ VkDescriptorSetLayoutCreateFlags set_flags,
+ const VkMutableDescriptorTypeCreateInfoEXT *mutable_info,
+ int binding)
+{
+ enum anv_descriptor_data desc_data = 0;
- if (data & ANV_DESCRIPTOR_STORAGE_IMAGE)
- size += sizeof(struct anv_storage_image_descriptor);
+ if (!mutable_info || mutable_info->mutableDescriptorTypeListCount <= binding) {
+ for(VkDescriptorType i = 0; i <= VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT; i++) {
+ if (i == VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC ||
+ i == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC ||
+ i == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK)
+ continue;
- if (data & ANV_DESCRIPTOR_IMAGE_PARAM)
- size += BRW_IMAGE_PARAM_SIZE * 4;
+ desc_data |= anv_descriptor_data_for_type(device, layout_type, set_flags, i);
+ }
- if (data & ANV_DESCRIPTOR_ADDRESS_RANGE)
- size += sizeof(struct anv_address_range_descriptor);
+ desc_data |= anv_descriptor_data_for_type(
+ device, layout_type, set_flags, VK_DESCRIPTOR_TYPE_ACCELERATION_STRUCTURE_KHR);
- if (data & ANV_DESCRIPTOR_TEXTURE_SWIZZLE)
- size += sizeof(struct anv_texture_swizzle_descriptor);
+ return desc_data;
+ }
- return size;
+ const VkMutableDescriptorTypeListEXT *type_list =
+ &mutable_info->pMutableDescriptorTypeLists[binding];
+ for (uint32_t i = 0; i < type_list->descriptorTypeCount; i++) {
+ desc_data |=
+ anv_descriptor_data_for_type(device, layout_type, set_flags,
+ type_list->pDescriptorTypes[i]);
+ }
+
+ return desc_data;
+}
+
+static void
+anv_descriptor_data_size(enum anv_descriptor_data data,
+ enum anv_descriptor_set_layout_type layout_type,
+ uint16_t *out_surface_size,
+ uint16_t *out_sampler_size)
+{
+ unsigned surface_size = 0;
+ unsigned sampler_size = 0;
+
+ if (data & ANV_DESCRIPTOR_INDIRECT_SAMPLED_IMAGE)
+ surface_size += sizeof(struct anv_sampled_image_descriptor);
+
+ if (data & ANV_DESCRIPTOR_INDIRECT_STORAGE_IMAGE)
+ surface_size += sizeof(struct anv_storage_image_descriptor);
+
+ if (data & ANV_DESCRIPTOR_INDIRECT_ADDRESS_RANGE)
+ surface_size += sizeof(struct anv_address_range_descriptor);
+
+ if (data & ANV_DESCRIPTOR_SURFACE)
+ surface_size += ANV_SURFACE_STATE_SIZE;
+
+ /* Direct descriptors have sampler states stored separately */
+ if (layout_type == ANV_PIPELINE_DESCRIPTOR_SET_LAYOUT_TYPE_DIRECT) {
+ if (data & ANV_DESCRIPTOR_SAMPLER)
+ sampler_size += ANV_SAMPLER_STATE_SIZE;
+
+ if (data & ANV_DESCRIPTOR_SURFACE_SAMPLER) {
+ surface_size += ANV_SURFACE_STATE_SIZE;
+ sampler_size += ANV_SAMPLER_STATE_SIZE;
+ }
+ } else {
+ if (data & ANV_DESCRIPTOR_SAMPLER)
+ surface_size += ANV_SAMPLER_STATE_SIZE;
+
+ if (data & ANV_DESCRIPTOR_SURFACE_SAMPLER) {
+ surface_size += ALIGN(ANV_SURFACE_STATE_SIZE + ANV_SAMPLER_STATE_SIZE,
+ ANV_SURFACE_STATE_SIZE);
+ }
+ }
+
+ *out_surface_size = surface_size;
+ *out_sampler_size = sampler_size;
}
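
The split between the two out-parameters above is easiest to see on a combined image/sampler descriptor: a direct layout keeps the sampler state in its own heap, while a descriptor-buffer layout packs it next to the surface state and pads the pair up to the surface state size. Below is a standalone sketch with made-up sizes; the FAKE_* constants are stand-ins, not the driver's ANV_SURFACE_STATE_SIZE / ANV_SAMPLER_STATE_SIZE values.

/* Illustrative only: the constants are not the driver's real state sizes. */
#include <stdio.h>

#define FAKE_SURFACE_STATE_SIZE 64u
#define FAKE_SAMPLER_STATE_SIZE 32u
#define ALIGN_UP(v, a) (((v) + (a) - 1) / (a) * (a))

int main(void)
{
   /* Direct layout, ANV_DESCRIPTOR_SURFACE_SAMPLER: the sampler state goes
    * to its own heap, so the two sizes are reported separately.
    */
   unsigned direct_surface = FAKE_SURFACE_STATE_SIZE;
   unsigned direct_sampler = FAKE_SAMPLER_STATE_SIZE;

   /* Descriptor-buffer layout: the sampler state is packed next to the
    * surface state in the same buffer, padded up to the surface state size.
    */
   unsigned buffer_surface = ALIGN_UP(FAKE_SURFACE_STATE_SIZE +
                                      FAKE_SAMPLER_STATE_SIZE,
                                      FAKE_SURFACE_STATE_SIZE);
   unsigned buffer_sampler = 0;

   printf("direct: %u/%u  buffer: %u/%u\n",
          direct_surface, direct_sampler, buffer_surface, buffer_sampler);
   return 0;
}
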
static bool
anv_needs_descriptor_buffer(VkDescriptorType desc_type,
+ enum anv_descriptor_set_layout_type layout_type,
enum anv_descriptor_data desc_data)
{
- if (desc_type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT ||
- anv_descriptor_data_size(desc_data) > 0)
+ if (desc_type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK)
return true;
- return false;
+
+ uint16_t surface_size, sampler_size;
+ anv_descriptor_data_size(desc_data, layout_type,
+ &surface_size, &sampler_size);
+ return surface_size > 0 || sampler_size > 0;
}
/** Returns the size in bytes of each descriptor with the given layout */
-unsigned
-anv_descriptor_size(const struct anv_descriptor_set_binding_layout *layout)
+static void
+anv_descriptor_size(const struct anv_descriptor_set_binding_layout *layout,
+ enum anv_descriptor_set_layout_type layout_type,
+ uint16_t *out_surface_stride,
+ uint16_t *out_sampler_stride)
{
if (layout->data & ANV_DESCRIPTOR_INLINE_UNIFORM) {
assert(layout->data == ANV_DESCRIPTOR_INLINE_UNIFORM);
- return layout->array_size;
+ assert(layout->array_size <= UINT16_MAX);
+ *out_surface_stride = layout->array_size;
+ *out_sampler_stride = 0;
+ return;
}
- unsigned size = anv_descriptor_data_size(layout->data);
-
- /* For multi-planar bindings, we make every descriptor consume the maximum
- * number of planes so we don't have to bother with walking arrays and
- * adding things up every time. Fortunately, YCbCr samplers aren't all
- * that common and likely won't be in the middle of big arrays.
- */
- if (layout->max_plane_count > 1)
- size *= layout->max_plane_count;
-
- return size;
+ anv_descriptor_data_size(layout->data, layout_type,
+ out_surface_stride,
+ out_sampler_stride);
}
-/** Returns the size in bytes of each descriptor of the given type
- *
- * This version of the function does not have access to the entire layout so
- * it may only work on certain descriptor types where the descriptor size is
- * entirely determined by the descriptor type. Whenever possible, code should
- * use anv_descriptor_size() instead.
- */
-unsigned
-anv_descriptor_type_size(const struct anv_physical_device *pdevice,
- VkDescriptorType type)
+/** Returns size in bytes of the biggest descriptor in the given layout */
+static void
+anv_descriptor_size_for_mutable_type(const struct anv_physical_device *device,
+ enum anv_descriptor_set_layout_type layout_type,
+ VkDescriptorSetLayoutCreateFlags set_flags,
+ const VkMutableDescriptorTypeCreateInfoEXT *mutable_info,
+ int binding,
+ uint16_t *out_surface_stride,
+ uint16_t *out_sampler_stride)
{
- assert(type != VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT &&
- type != VK_DESCRIPTOR_TYPE_SAMPLER &&
- type != VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE &&
- type != VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER);
+ *out_surface_stride = 0;
+ *out_sampler_stride = 0;
+
+ if (!mutable_info ||
+ mutable_info->mutableDescriptorTypeListCount <= binding) {
+ for(VkDescriptorType i = 0; i <= VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT; i++) {
+
+ if (i == VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC ||
+ i == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC ||
+ i == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK)
+ continue;
+
+ enum anv_descriptor_data desc_data =
+ anv_descriptor_data_for_type(device, layout_type, set_flags, i);
+ uint16_t surface_stride, sampler_stride;
+ anv_descriptor_data_size(desc_data, layout_type,
+ &surface_stride, &sampler_stride);
+
+ *out_surface_stride = MAX2(*out_surface_stride, surface_stride);
+ *out_sampler_stride = MAX2(*out_sampler_stride, sampler_stride);
+ }
+
+ enum anv_descriptor_data desc_data =
+ anv_descriptor_data_for_type(device, layout_type, set_flags,
+ VK_DESCRIPTOR_TYPE_ACCELERATION_STRUCTURE_KHR);
+ uint16_t surface_stride, sampler_stride;
+ anv_descriptor_data_size(desc_data, layout_type,
+ &surface_stride, &sampler_stride);
+
+ *out_surface_stride = MAX2(*out_surface_stride, surface_stride);
+ *out_sampler_stride = MAX2(*out_sampler_stride, sampler_stride);
- return anv_descriptor_data_size(anv_descriptor_data_for_type(pdevice, type));
+ return;
+ }
+
+ const VkMutableDescriptorTypeListEXT *type_list =
+ &mutable_info->pMutableDescriptorTypeLists[binding];
+ for (uint32_t i = 0; i < type_list->descriptorTypeCount; i++) {
+ enum anv_descriptor_data desc_data =
+ anv_descriptor_data_for_type(device, layout_type, set_flags,
+ type_list->pDescriptorTypes[i]);
+
+ uint16_t surface_stride, sampler_stride;
+ anv_descriptor_data_size(desc_data, layout_type,
+ &surface_stride, &sampler_stride);
+
+ *out_surface_stride = MAX2(*out_surface_stride, surface_stride);
+ *out_sampler_stride = MAX2(*out_sampler_stride, sampler_stride);
+ }
}
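
Note that for a mutable binding the resulting stride is the maximum over the types in the list, not their sum, so any listed type can later be written in place. A toy version of that reduction, with invented per-type sizes:

/* Illustrative reduction over a mutable type list; the sizes are made up. */
#include <stdio.h>

#define MAX2(a, b) ((a) > (b) ? (a) : (b))

int main(void)
{
   /* Pretend the list is { SAMPLED_IMAGE, STORAGE_BUFFER, ACCELERATION_STRUCTURE }
    * and these are their per-descriptor surface/sampler sizes.
    */
   const unsigned surface_sizes[] = { 64, 64, 32 };
   const unsigned sampler_sizes[] = { 32, 0, 0 };

   unsigned surface_stride = 0, sampler_stride = 0;
   for (unsigned i = 0; i < 3; i++) {
      surface_stride = MAX2(surface_stride, surface_sizes[i]);
      sampler_stride = MAX2(sampler_stride, sampler_sizes[i]);
   }

   /* Every array element of the mutable binding is laid out with the
    * worst-case stride so any of the listed types fits in place.
    */
   printf("strides: surface=%u sampler=%u\n", surface_stride, sampler_stride);
   return 0;
}
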
static bool
anv_descriptor_data_supports_bindless(const struct anv_physical_device *pdevice,
- enum anv_descriptor_data data,
- bool sampler)
+ VkDescriptorSetLayoutCreateFlags set_flags,
+ enum anv_descriptor_data data)
{
- if (data & ANV_DESCRIPTOR_ADDRESS_RANGE) {
- assert(pdevice->has_a64_buffer_access);
- return true;
- }
+ if (set_flags & VK_DESCRIPTOR_SET_LAYOUT_CREATE_DESCRIPTOR_BUFFER_BIT_EXT) {
+ /* When using descriptor buffers, on platforms that don't have extended
+ * bindless offset, all push descriptors have to go through the binding
+ * tables.
+ */
+ if (!pdevice->uses_ex_bso &&
+ (set_flags & VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR)) {
+ return data & (ANV_DESCRIPTOR_INDIRECT_ADDRESS_RANGE |
+ ANV_DESCRIPTOR_INDIRECT_SAMPLED_IMAGE |
+ ANV_DESCRIPTOR_INDIRECT_STORAGE_IMAGE);
+ }
- if (data & ANV_DESCRIPTOR_SAMPLED_IMAGE) {
- assert(pdevice->has_bindless_images || pdevice->has_bindless_samplers);
- return sampler ? pdevice->has_bindless_samplers :
- pdevice->has_bindless_images;
- }
+ /* Otherwise we can do bindless for everything */
+ return true;
+ } else {
+ if (pdevice->indirect_descriptors) {
+ return data & (ANV_DESCRIPTOR_INDIRECT_ADDRESS_RANGE |
+ ANV_DESCRIPTOR_INDIRECT_SAMPLED_IMAGE |
+ ANV_DESCRIPTOR_INDIRECT_STORAGE_IMAGE);
+ }
- if (data & ANV_DESCRIPTOR_STORAGE_IMAGE) {
- assert(pdevice->has_bindless_images);
+ /* Direct descriptors support bindless for everything */
return true;
}
-
- return false;
}
bool
anv_descriptor_supports_bindless(const struct anv_physical_device *pdevice,
- const struct anv_descriptor_set_binding_layout *binding,
- bool sampler)
+ const struct anv_descriptor_set_layout *set,
+ const struct anv_descriptor_set_binding_layout *binding)
{
- return anv_descriptor_data_supports_bindless(pdevice, binding->data,
- sampler);
+ return anv_descriptor_data_supports_bindless(pdevice, set->flags, binding->data);
}
bool
anv_descriptor_requires_bindless(const struct anv_physical_device *pdevice,
- const struct anv_descriptor_set_binding_layout *binding,
- bool sampler)
+ const struct anv_descriptor_set_layout *set,
+ const struct anv_descriptor_set_binding_layout *binding)
{
if (pdevice->always_use_bindless)
- return anv_descriptor_supports_bindless(pdevice, binding, sampler);
+ return anv_descriptor_supports_bindless(pdevice, set, binding);
- static const VkDescriptorBindingFlagBitsEXT flags_requiring_bindless =
- VK_DESCRIPTOR_BINDING_UPDATE_AFTER_BIND_BIT_EXT |
- VK_DESCRIPTOR_BINDING_UPDATE_UNUSED_WHILE_PENDING_BIT_EXT |
- VK_DESCRIPTOR_BINDING_PARTIALLY_BOUND_BIT_EXT;
+ if (set->flags & VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR)
+ return false;
+
+ if (set->flags & (VK_DESCRIPTOR_SET_LAYOUT_CREATE_DESCRIPTOR_BUFFER_BIT_EXT |
+ VK_DESCRIPTOR_SET_LAYOUT_CREATE_EMBEDDED_IMMUTABLE_SAMPLERS_BIT_EXT))
+ return true;
+
+ static const VkDescriptorBindingFlagBits flags_requiring_bindless =
+ VK_DESCRIPTOR_BINDING_UPDATE_AFTER_BIND_BIT |
+ VK_DESCRIPTOR_BINDING_UPDATE_UNUSED_WHILE_PENDING_BIT |
+ VK_DESCRIPTOR_BINDING_PARTIALLY_BOUND_BIT;
return (binding->flags & flags_requiring_bindless) != 0;
}
+static enum anv_descriptor_set_layout_type
+anv_descriptor_set_layout_type_for_flags(const struct anv_physical_device *device,
+ const VkDescriptorSetLayoutCreateInfo *pCreateInfo)
+{
+ if (pCreateInfo->flags & VK_DESCRIPTOR_SET_LAYOUT_CREATE_DESCRIPTOR_BUFFER_BIT_EXT)
+ return ANV_PIPELINE_DESCRIPTOR_SET_LAYOUT_TYPE_BUFFER;
+ else if (device->indirect_descriptors)
+ return ANV_PIPELINE_DESCRIPTOR_SET_LAYOUT_TYPE_INDIRECT;
+ else
+ return ANV_PIPELINE_DESCRIPTOR_SET_LAYOUT_TYPE_DIRECT;
+}
+
+static bool
+mutable_list_includes_type(const VkMutableDescriptorTypeCreateInfoEXT *mutable_info,
+ uint32_t binding, VkDescriptorType type)
+{
+ if (!mutable_info || mutable_info->mutableDescriptorTypeListCount == 0)
+ return true;
+
+ const VkMutableDescriptorTypeListEXT *type_list =
+ &mutable_info->pMutableDescriptorTypeLists[binding];
+ for (uint32_t i = 0; i < type_list->descriptorTypeCount; i++) {
+ if (type_list->pDescriptorTypes[i] == type)
+ return true;
+ }
+
+ return false;
+}
+
void anv_GetDescriptorSetLayoutSupport(
VkDevice _device,
const VkDescriptorSetLayoutCreateInfo* pCreateInfo,
@@ -260,6 +510,12 @@ void anv_GetDescriptorSetLayoutSupport(
const VkDescriptorSetLayoutBindingFlagsCreateInfo *binding_flags_info =
vk_find_struct_const(pCreateInfo->pNext,
DESCRIPTOR_SET_LAYOUT_BINDING_FLAGS_CREATE_INFO);
+ const VkMutableDescriptorTypeCreateInfoEXT *mutable_info =
+ vk_find_struct_const(pCreateInfo->pNext,
+ MUTABLE_DESCRIPTOR_TYPE_CREATE_INFO_EXT);
+
+ enum anv_descriptor_set_layout_type layout_type =
+ anv_descriptor_set_layout_type_for_flags(pdevice, pCreateInfo);
for (uint32_t b = 0; b < pCreateInfo->bindingCount; b++) {
const VkDescriptorSetLayoutBinding *binding = &pCreateInfo->pBindings[b];
@@ -270,10 +526,32 @@ void anv_GetDescriptorSetLayoutSupport(
flags = binding_flags_info->pBindingFlags[b];
}
+ /* Combined image/sampler descriptors are not supported with descriptor
+ * buffers & mutable descriptor types because we cannot know from the
+ * shader where to find the sampler structure. It can be written to the
+ * beginning of the descriptor (at offset 0) or in the second part (at
+ * offset 64 bytes).
+ */
+ if ((pCreateInfo->flags &
+ VK_DESCRIPTOR_SET_LAYOUT_CREATE_DESCRIPTOR_BUFFER_BIT_EXT) &&
+ binding->descriptorType == VK_DESCRIPTOR_TYPE_MUTABLE_EXT &&
+ mutable_list_includes_type(mutable_info, b,
+ VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER)) {
+ pSupport->supported = false;
+ return;
+ }
+
enum anv_descriptor_data desc_data =
- anv_descriptor_data_for_type(pdevice, binding->descriptorType);
+ binding->descriptorType == VK_DESCRIPTOR_TYPE_MUTABLE_EXT ?
+ anv_descriptor_data_for_mutable_type(pdevice, layout_type,
+ pCreateInfo->flags,
+ mutable_info, b) :
+ anv_descriptor_data_for_type(pdevice, layout_type,
+ pCreateInfo->flags,
+ binding->descriptorType);
- if (anv_needs_descriptor_buffer(binding->descriptorType, desc_data))
+ if (anv_needs_descriptor_buffer(binding->descriptorType,
+ layout_type, desc_data))
needs_descriptor_buffer = true;
if (flags & VK_DESCRIPTOR_BINDING_VARIABLE_DESCRIPTOR_COUNT_BIT)
@@ -284,12 +562,14 @@ void anv_GetDescriptorSetLayoutSupport(
/* There is no real limit on samplers */
break;
- case VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT:
+ case VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK:
/* Inline uniforms don't use a binding */
break;
case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER:
- if (anv_descriptor_data_supports_bindless(pdevice, desc_data, false))
+ if (anv_descriptor_data_supports_bindless(pdevice,
+ pCreateInfo->flags,
+ desc_data))
break;
if (binding->pImmutableSamplers) {
@@ -306,7 +586,9 @@ void anv_GetDescriptorSetLayoutSupport(
break;
default:
- if (anv_descriptor_data_supports_bindless(pdevice, desc_data, false))
+ if (anv_descriptor_data_supports_bindless(pdevice,
+ pCreateInfo->flags,
+ desc_data))
break;
anv_foreach_stage(s, binding->stageFlags)
@@ -324,7 +606,7 @@ void anv_GetDescriptorSetLayoutSupport(
vk_find_struct(pSupport->pNext,
DESCRIPTOR_SET_VARIABLE_DESCRIPTOR_COUNT_LAYOUT_SUPPORT);
if (vdcls != NULL) {
- if (varying_desc_type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT) {
+ if (varying_desc_type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK) {
vdcls->maxVariableDescriptorCount = MAX_INLINE_UNIFORM_BLOCK_SIZE;
} else if (varying_desc_type != VK_DESCRIPTOR_TYPE_MAX_ENUM) {
vdcls->maxVariableDescriptorCount = UINT16_MAX;
@@ -378,7 +660,7 @@ VkResult anv_CreateDescriptorSetLayout(
immutable_sampler_count += pCreateInfo->pBindings[j].descriptorCount;
}
- /* We need to allocate decriptor set layouts off the device allocator
+ /* We need to allocate descriptor set layouts off the device allocator
+ /* We need to allocate descriptor set layouts off the device allocator
* with DEVICE scope because they are reference counted and may not be
* destroyed when vkDestroyDescriptorSetLayout is called.
*/
@@ -391,10 +673,13 @@ VkResult anv_CreateDescriptorSetLayout(
if (!vk_object_multizalloc(&device->vk, &ma, NULL,
VK_OBJECT_TYPE_DESCRIPTOR_SET_LAYOUT))
- return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+ return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
set_layout->ref_cnt = 1;
set_layout->binding_count = num_bindings;
+ set_layout->flags = pCreateInfo->flags;
+ set_layout->type = anv_descriptor_set_layout_type_for_flags(device->physical,
+ pCreateInfo);
for (uint32_t b = 0; b < num_bindings; b++) {
/* Initialize all binding_layout entries to -1 */
@@ -412,7 +697,9 @@ VkResult anv_CreateDescriptorSetLayout(
uint32_t buffer_view_count = 0;
uint32_t dynamic_offset_count = 0;
- uint32_t descriptor_buffer_size = 0;
+ uint32_t descriptor_buffer_surface_size = 0;
+ uint32_t descriptor_buffer_sampler_size = 0;
+ uint32_t sampler_count = 0;
for (uint32_t j = 0; j < pCreateInfo->bindingCount; j++) {
const VkDescriptorSetLayoutBinding *binding = &pCreateInfo->pBindings[j];
@@ -424,9 +711,13 @@ VkResult anv_CreateDescriptorSetLayout(
set_layout->binding[b].immutable_samplers = (void *)(uintptr_t)(j + 1);
}
- const VkDescriptorSetLayoutBindingFlagsCreateInfoEXT *binding_flags_info =
+ const VkDescriptorSetLayoutBindingFlagsCreateInfo *binding_flags_info =
vk_find_struct_const(pCreateInfo->pNext,
- DESCRIPTOR_SET_LAYOUT_BINDING_FLAGS_CREATE_INFO_EXT);
+ DESCRIPTOR_SET_LAYOUT_BINDING_FLAGS_CREATE_INFO);
+
+ const VkMutableDescriptorTypeCreateInfoEXT *mutable_info =
+ vk_find_struct_const(pCreateInfo->pNext,
+ MUTABLE_DESCRIPTOR_TYPE_CREATE_INFO_EXT);
for (uint32_t b = 0; b < num_bindings; b++) {
/* We stashed the pCreateInfo->pBindings[] index (plus one) in the
@@ -466,13 +757,21 @@ VkResult anv_CreateDescriptorSetLayout(
assert(!(set_layout->binding[b].flags &
(VK_DESCRIPTOR_BINDING_UPDATE_AFTER_BIND_BIT |
VK_DESCRIPTOR_BINDING_UPDATE_UNUSED_WHILE_PENDING_BIT |
- VK_DESCRIPTOR_BINDING_VARIABLE_DESCRIPTOR_COUNT_BIT_EXT)));
+ VK_DESCRIPTOR_BINDING_VARIABLE_DESCRIPTOR_COUNT_BIT)));
}
}
set_layout->binding[b].data =
+ binding->descriptorType == VK_DESCRIPTOR_TYPE_MUTABLE_EXT ?
+ anv_descriptor_data_for_mutable_type(device->physical,
+ set_layout->type,
+ pCreateInfo->flags,
+ mutable_info, b) :
anv_descriptor_data_for_type(device->physical,
+ set_layout->type,
+ pCreateInfo->flags,
binding->descriptorType);
+
set_layout->binding[b].array_size = binding->descriptorCount;
set_layout->binding[b].descriptor_index = set_layout->descriptor_count;
set_layout->descriptor_count += binding->descriptorCount;
@@ -485,6 +784,7 @@ VkResult anv_CreateDescriptorSetLayout(
switch (binding->descriptorType) {
case VK_DESCRIPTOR_TYPE_SAMPLER:
case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER:
+ case VK_DESCRIPTOR_TYPE_MUTABLE_EXT:
set_layout->binding[b].max_plane_count = 1;
if (binding->pImmutableSamplers) {
set_layout->binding[b].immutable_samplers = samplers;
@@ -522,27 +822,77 @@ VkResult anv_CreateDescriptorSetLayout(
break;
}
- if (binding->descriptorType ==
- VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT) {
- /* Inline uniform blocks are specified to use the descriptor array
- * size as the size in bytes of the block.
- */
- descriptor_buffer_size = align_u32(descriptor_buffer_size,
- ANV_UBO_ALIGNMENT);
- set_layout->binding[b].descriptor_offset = descriptor_buffer_size;
- descriptor_buffer_size += binding->descriptorCount;
+ if (binding->descriptorType == VK_DESCRIPTOR_TYPE_MUTABLE_EXT) {
+ anv_descriptor_size_for_mutable_type(
+ device->physical, set_layout->type,
+ pCreateInfo->flags, mutable_info, b,
+ &set_layout->binding[b].descriptor_data_surface_size,
+ &set_layout->binding[b].descriptor_data_sampler_size);
+ } else {
+ anv_descriptor_size(&set_layout->binding[b],
+ set_layout->type,
+ &set_layout->binding[b].descriptor_data_surface_size,
+ &set_layout->binding[b].descriptor_data_sampler_size);
+ }
+
+ /* For multi-planar bindings, we make every descriptor consume the maximum
+ * number of planes so we don't have to bother with walking arrays and
+ * adding things up every time. Fortunately, YCbCr samplers aren't all
+ * that common and likely won't be in the middle of big arrays.
+ */
+ set_layout->binding[b].descriptor_surface_stride =
+ MAX2(set_layout->binding[b].max_plane_count, 1) *
+ set_layout->binding[b].descriptor_data_surface_size;
+ set_layout->binding[b].descriptor_sampler_stride =
+ MAX2(set_layout->binding[b].max_plane_count, 1) *
+ set_layout->binding[b].descriptor_data_sampler_size;
+
+ if (binding->descriptorType == VK_DESCRIPTOR_TYPE_SAMPLER) {
+ sampler_count += binding->descriptorCount *
+ set_layout->binding[b].max_plane_count;
+ }
+
+ unsigned surface_align, sampler_align;
+ anv_descriptor_data_alignment(set_layout->binding[b].data,
+ set_layout->type,
+ &surface_align,
+ &sampler_align);
+ descriptor_buffer_surface_size =
+ align(descriptor_buffer_surface_size, surface_align);
+ descriptor_buffer_sampler_size =
+ align(descriptor_buffer_sampler_size, sampler_align);
+
+ if (binding->descriptorType == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK) {
+ set_layout->binding[b].descriptor_surface_offset = descriptor_buffer_surface_size;
+ descriptor_buffer_surface_size += binding->descriptorCount;
} else {
- set_layout->binding[b].descriptor_offset = descriptor_buffer_size;
- descriptor_buffer_size += anv_descriptor_size(&set_layout->binding[b]) *
- binding->descriptorCount;
+ set_layout->binding[b].descriptor_surface_offset = descriptor_buffer_surface_size;
+ descriptor_buffer_surface_size +=
+ set_layout->binding[b].descriptor_surface_stride * binding->descriptorCount;
}
+ set_layout->binding[b].descriptor_sampler_offset = descriptor_buffer_sampler_size;
+ descriptor_buffer_sampler_size +=
+ set_layout->binding[b].descriptor_sampler_stride * binding->descriptorCount;
+
set_layout->shader_stages |= binding->stageFlags;
}
+ /* Sanity checks */
+ assert(descriptor_buffer_sampler_size == 0 ||
+ set_layout->type == ANV_PIPELINE_DESCRIPTOR_SET_LAYOUT_TYPE_DIRECT);
+
set_layout->buffer_view_count = buffer_view_count;
set_layout->dynamic_offset_count = dynamic_offset_count;
- set_layout->descriptor_buffer_size = descriptor_buffer_size;
+ set_layout->descriptor_buffer_surface_size = descriptor_buffer_surface_size;
+ set_layout->descriptor_buffer_sampler_size = descriptor_buffer_sampler_size;
+
+ if (pCreateInfo->flags &
+ VK_DESCRIPTOR_SET_LAYOUT_CREATE_EMBEDDED_IMMUTABLE_SAMPLERS_BIT_EXT) {
+ assert(set_layout->descriptor_buffer_surface_size == 0);
+ assert(set_layout->descriptor_buffer_sampler_size == 0);
+ set_layout->embedded_sampler_count = sampler_count;
+ }
*pSetLayout = anv_descriptor_set_layout_to_handle(set_layout);
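
To make the multi-planar stride rule in the hunk above concrete: a binding whose immutable YCbCr sampler needs three planes gets a per-element stride of three times the single-plane data size, for every element of the array. A standalone sketch with placeholder sizes (the real ones come out of anv_descriptor_data_size()):

/* Placeholder numbers; only the stride arithmetic mirrors the hunk above. */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
   const uint16_t data_surface_size = 64;  /* per-plane surface data, made up */
   const uint16_t data_sampler_size = 32;  /* per-plane sampler data, made up */
   const uint8_t  max_plane_count   = 3;   /* e.g. a 3-plane YCbCr format */
   const uint32_t array_size        = 4;   /* descriptorCount of the binding */

   uint32_t surface_stride = max_plane_count * data_surface_size;  /* 192 */
   uint32_t sampler_stride = max_plane_count * data_sampler_size;  /* 96  */

   /* The binding then consumes stride * descriptorCount bytes in each heap. */
   printf("surface bytes = %u, sampler bytes = %u\n",
          surface_stride * array_size, sampler_stride * array_size);
   return 0;
}
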
@@ -583,7 +933,7 @@ set_layout_descriptor_count(const struct anv_descriptor_set_layout *set_layout,
assert(var_desc_count <= dynamic_binding->array_size);
uint32_t shrink = dynamic_binding->array_size - var_desc_count;
- if (dynamic_binding->type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT)
+ if (dynamic_binding->type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK)
return set_layout->descriptor_count;
return set_layout->descriptor_count - shrink;
@@ -607,30 +957,50 @@ set_layout_buffer_view_count(const struct anv_descriptor_set_layout *set_layout,
return set_layout->buffer_view_count - shrink;
}
-uint32_t
+static bool
+anv_descriptor_set_layout_empty(const struct anv_descriptor_set_layout *set_layout)
+{
+ return set_layout->binding_count == 0;
+}
+
+static void
anv_descriptor_set_layout_descriptor_buffer_size(const struct anv_descriptor_set_layout *set_layout,
- uint32_t var_desc_count)
+ uint32_t var_desc_count,
+ uint32_t *out_surface_size,
+ uint32_t *out_sampler_size)
{
const struct anv_descriptor_set_binding_layout *dynamic_binding =
set_layout_dynamic_binding(set_layout);
- if (dynamic_binding == NULL)
- return ALIGN(set_layout->descriptor_buffer_size, ANV_UBO_ALIGNMENT);
+ if (dynamic_binding == NULL) {
+ *out_surface_size = ALIGN(set_layout->descriptor_buffer_surface_size,
+ ANV_UBO_ALIGNMENT);
+ *out_sampler_size = set_layout->descriptor_buffer_sampler_size;
+ return;
+ }
assert(var_desc_count <= dynamic_binding->array_size);
uint32_t shrink = dynamic_binding->array_size - var_desc_count;
- uint32_t set_size;
+ uint32_t set_surface_size, set_sampler_size;
- if (dynamic_binding->type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT) {
+ if (dynamic_binding->type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK) {
/* Inline uniform blocks are specified to use the descriptor array
* size as the size in bytes of the block.
*/
- set_size = set_layout->descriptor_buffer_size - shrink;
+ set_surface_size = set_layout->descriptor_buffer_surface_size - shrink;
+ set_sampler_size = 0;
} else {
- set_size = set_layout->descriptor_buffer_size -
- shrink * anv_descriptor_size(dynamic_binding);
+ set_surface_size =
+ set_layout->descriptor_buffer_surface_size > 0 ?
+ (set_layout->descriptor_buffer_surface_size -
+ shrink * dynamic_binding->descriptor_surface_stride) : 0;
+ set_sampler_size =
+ set_layout->descriptor_buffer_sampler_size > 0 ?
+ (set_layout->descriptor_buffer_sampler_size -
+ shrink * dynamic_binding->descriptor_sampler_stride) : 0;
}
- return ALIGN(set_size, ANV_UBO_ALIGNMENT);
+ *out_surface_size = ALIGN(set_surface_size, ANV_UBO_ALIGNMENT);
+ *out_sampler_size = set_sampler_size;
}
void anv_DestroyDescriptorSetLayout(
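
The shrink logic in anv_descriptor_set_layout_descriptor_buffer_size above is plain arithmetic: a variable-count binding declared for 1024 descriptors but allocated with only 16 drops (1024 - 16) * surface_stride bytes from the surface buffer, which is then re-aligned to the UBO alignment. A hedged, standalone sketch with invented numbers (FAKE_UBO_ALIGNMENT is not the driver's constant):

/* Invented numbers; only the shrink arithmetic mirrors the function above. */
#include <stdio.h>
#include <stdint.h>

#define FAKE_UBO_ALIGNMENT 64u
#define ALIGN_POT(v, a) (((v) + (a) - 1) & ~((a) - 1))

int main(void)
{
   const uint32_t layout_surface_size = 70000; /* full-size layout, made up */
   const uint32_t surface_stride      = 64;    /* stride of the dynamic binding */
   const uint32_t array_size          = 1024;  /* declared descriptorCount */
   const uint32_t var_desc_count      = 16;    /* count requested at allocation */

   uint32_t shrink = array_size - var_desc_count;
   uint32_t set_surface_size = layout_surface_size - shrink * surface_stride;

   printf("allocated surface bytes = %u\n",
          ALIGN_POT(set_surface_size, FAKE_UBO_ALIGNMENT));
   return 0;
}
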
@@ -647,23 +1017,52 @@ void anv_DestroyDescriptorSetLayout(
anv_descriptor_set_layout_unref(device, set_layout);
}
+void
+anv_descriptor_set_layout_print(const struct anv_descriptor_set_layout *layout)
+{
+ fprintf(stderr, "set layout:\n");
+ for (uint32_t b = 0; b < layout->binding_count; b++) {
+ fprintf(stderr, " binding%03u: offsets=0x%08x/0x%08x sizes=%04u/%04u strides=%03u/%03u planes=%hhu count=%03u\n",
+ b,
+ layout->binding[b].descriptor_surface_offset,
+ layout->binding[b].descriptor_sampler_offset,
+ layout->binding[b].descriptor_data_surface_size,
+ layout->binding[b].descriptor_data_sampler_size,
+ layout->binding[b].descriptor_surface_stride,
+ layout->binding[b].descriptor_sampler_stride,
+ layout->binding[b].max_plane_count,
+ layout->binding[b].array_size);
+ }
+}
+
#define SHA1_UPDATE_VALUE(ctx, x) _mesa_sha1_update(ctx, &(x), sizeof(x));
static void
sha1_update_immutable_sampler(struct mesa_sha1 *ctx,
+ bool embedded_sampler,
const struct anv_sampler *sampler)
{
- if (!sampler->conversion)
+ if (!sampler->vk.ycbcr_conversion)
return;
- /* The only thing that affects the shader is ycbcr conversion */
- _mesa_sha1_update(ctx, sampler->conversion,
- sizeof(*sampler->conversion));
+ /* Hash the conversion, if any, as this affects the placement of
+ * descriptors in the set due to the number of planes.
+ */
+ SHA1_UPDATE_VALUE(ctx, sampler->vk.ycbcr_conversion->state);
+
+ /* For embedded samplers, we need to hash the sampler parameters as the
+ * sampler handle is baked into the shader and this ultimately is part of
+ * the shader hash key. We can only consider 2 shaders identical if all
+ * their embedded samplers parameters are identical.
+ */
+ if (embedded_sampler)
+ SHA1_UPDATE_VALUE(ctx, sampler->sha1);
}
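
One consequence of hashing the embedded sampler parameters is that two layouts differing only in those baked-in samplers yield different shader cache keys. The sketch below uses a toy FNV-1a hash purely as a stand-in for the mesa_sha1 helpers to illustrate that property:

/* Illustration only: a toy FNV-1a hash standing in for the SHA-1 used above. */
#include <stdio.h>
#include <stdint.h>

struct fake_sampler_params { float max_anisotropy; uint32_t address_mode; };

static uint64_t
fnv1a(const void *data, size_t len, uint64_t h)
{
   const uint8_t *p = data;
   for (size_t i = 0; i < len; i++)
      h = (h ^ p[i]) * 0x100000001b3ull;
   return h;
}

int main(void)
{
   struct fake_sampler_params a = { 16.0f, 0 };
   struct fake_sampler_params b = { 1.0f, 2 };

   /* Same "layout", different embedded sampler parameters -> different keys,
    * so shaders baked against them are never treated as identical.
    */
   uint64_t key_a = fnv1a(&a, sizeof(a), 0xcbf29ce484222325ull);
   uint64_t key_b = fnv1a(&b, sizeof(b), 0xcbf29ce484222325ull);
   printf("key_a=%016llx key_b=%016llx\n",
          (unsigned long long)key_a, (unsigned long long)key_b);
   return 0;
}
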
static void
sha1_update_descriptor_set_binding_layout(struct mesa_sha1 *ctx,
- const struct anv_descriptor_set_binding_layout *layout)
+ bool embedded_samplers,
+ const struct anv_descriptor_set_binding_layout *layout)
{
SHA1_UPDATE_VALUE(ctx, layout->flags);
SHA1_UPDATE_VALUE(ctx, layout->data);
@@ -672,11 +1071,14 @@ sha1_update_descriptor_set_binding_layout(struct mesa_sha1 *ctx,
SHA1_UPDATE_VALUE(ctx, layout->descriptor_index);
SHA1_UPDATE_VALUE(ctx, layout->dynamic_offset_index);
SHA1_UPDATE_VALUE(ctx, layout->buffer_view_index);
- SHA1_UPDATE_VALUE(ctx, layout->descriptor_offset);
+ SHA1_UPDATE_VALUE(ctx, layout->descriptor_surface_offset);
+ SHA1_UPDATE_VALUE(ctx, layout->descriptor_sampler_offset);
if (layout->immutable_samplers) {
- for (uint16_t i = 0; i < layout->array_size; i++)
- sha1_update_immutable_sampler(ctx, layout->immutable_samplers[i]);
+ for (uint16_t i = 0; i < layout->array_size; i++) {
+ sha1_update_immutable_sampler(ctx, embedded_samplers,
+ layout->immutable_samplers[i]);
+ }
}
}
@@ -684,15 +1086,22 @@ static void
sha1_update_descriptor_set_layout(struct mesa_sha1 *ctx,
const struct anv_descriptor_set_layout *layout)
{
+ SHA1_UPDATE_VALUE(ctx, layout->flags);
SHA1_UPDATE_VALUE(ctx, layout->binding_count);
SHA1_UPDATE_VALUE(ctx, layout->descriptor_count);
SHA1_UPDATE_VALUE(ctx, layout->shader_stages);
SHA1_UPDATE_VALUE(ctx, layout->buffer_view_count);
SHA1_UPDATE_VALUE(ctx, layout->dynamic_offset_count);
- SHA1_UPDATE_VALUE(ctx, layout->descriptor_buffer_size);
+ SHA1_UPDATE_VALUE(ctx, layout->descriptor_buffer_surface_size);
+ SHA1_UPDATE_VALUE(ctx, layout->descriptor_buffer_sampler_size);
- for (uint16_t i = 0; i < layout->binding_count; i++)
- sha1_update_descriptor_set_binding_layout(ctx, &layout->binding[i]);
+ bool embedded_samplers =
+ layout->flags & VK_DESCRIPTOR_SET_LAYOUT_CREATE_EMBEDDED_IMMUTABLE_SAMPLERS_BIT_EXT;
+
+ for (uint16_t i = 0; i < layout->binding_count; i++) {
+ sha1_update_descriptor_set_binding_layout(ctx, embedded_samplers,
+ &layout->binding[i]);
+ }
}
/*
@@ -700,6 +1109,107 @@ sha1_update_descriptor_set_layout(struct mesa_sha1 *ctx,
* just multiple descriptor set layouts pasted together
*/
+void
+anv_pipeline_sets_layout_init(struct anv_pipeline_sets_layout *layout,
+ struct anv_device *device,
+ bool independent_sets)
+{
+ memset(layout, 0, sizeof(*layout));
+
+ layout->device = device;
+ layout->push_descriptor_set_index = -1;
+ layout->independent_sets = independent_sets;
+}
+
+void
+anv_pipeline_sets_layout_add(struct anv_pipeline_sets_layout *layout,
+ uint32_t set_idx,
+ struct anv_descriptor_set_layout *set_layout)
+{
+ if (layout->set[set_idx].layout)
+ return;
+
+ /* Workaround for CTS: internal CTS issue 3584 */
+ if (layout->independent_sets && anv_descriptor_set_layout_empty(set_layout))
+ return;
+
+ if (layout->type == ANV_PIPELINE_DESCRIPTOR_SET_LAYOUT_TYPE_UNKNOWN)
+ layout->type = set_layout->type;
+ else
+ assert(layout->type == set_layout->type);
+
+ layout->num_sets = MAX2(set_idx + 1, layout->num_sets);
+
+ layout->set[set_idx].layout =
+ anv_descriptor_set_layout_ref(set_layout);
+
+ layout->set[set_idx].dynamic_offset_start = layout->num_dynamic_buffers;
+ layout->num_dynamic_buffers += set_layout->dynamic_offset_count;
+
+ assert(layout->num_dynamic_buffers < MAX_DYNAMIC_BUFFERS);
+
+ if (set_layout->flags &
+ VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR) {
+ assert(layout->push_descriptor_set_index == -1);
+ layout->push_descriptor_set_index = set_idx;
+ }
+}
+
+uint32_t
+anv_pipeline_sets_layout_embedded_sampler_count(const struct anv_pipeline_sets_layout *layout)
+{
+ uint32_t count = 0;
+ for (unsigned s = 0; s < layout->num_sets; s++) {
+ if (!layout->set[s].layout)
+ continue;
+ count += layout->set[s].layout->embedded_sampler_count;
+ }
+ return count;
+}
+
+void
+anv_pipeline_sets_layout_hash(struct anv_pipeline_sets_layout *layout)
+{
+ struct mesa_sha1 ctx;
+ _mesa_sha1_init(&ctx);
+ for (unsigned s = 0; s < layout->num_sets; s++) {
+ if (!layout->set[s].layout)
+ continue;
+ sha1_update_descriptor_set_layout(&ctx, layout->set[s].layout);
+ _mesa_sha1_update(&ctx, &layout->set[s].dynamic_offset_start,
+ sizeof(layout->set[s].dynamic_offset_start));
+ }
+ _mesa_sha1_update(&ctx, &layout->num_sets, sizeof(layout->num_sets));
+ _mesa_sha1_final(&ctx, layout->sha1);
+}
+
+void
+anv_pipeline_sets_layout_fini(struct anv_pipeline_sets_layout *layout)
+{
+ for (unsigned s = 0; s < layout->num_sets; s++) {
+ if (!layout->set[s].layout)
+ continue;
+
+ anv_descriptor_set_layout_unref(layout->device, layout->set[s].layout);
+ }
+}
+
+void
+anv_pipeline_sets_layout_print(const struct anv_pipeline_sets_layout *layout)
+{
+ fprintf(stderr, "layout: dyn_count=%u sets=%u ind=%u\n",
+ layout->num_dynamic_buffers,
+ layout->num_sets,
+ layout->independent_sets);
+ for (unsigned s = 0; s < layout->num_sets; s++) {
+ if (!layout->set[s].layout)
+ continue;
+
+ fprintf(stderr, " set%i: dyn_start=%u flags=0x%x\n",
+ s, layout->set[s].dynamic_offset_start, layout->set[s].layout->flags);
+ }
+}
+
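
The helpers above form a small lifecycle: init once, add each set layout at its index, hash after the last add, and fini when the owning object is destroyed. anv_CreatePipelineLayout below is the real user; this condensed sketch (driver-internal types assumed, error handling omitted) only shows the expected ordering:

/* Sketch of the expected call order; only meaningful inside the driver tree
 * (anv_private.h) and with error handling omitted.
 */
static void
example_build_sets_layout(struct anv_device *device,
                          struct anv_descriptor_set_layout **set_layouts,
                          uint32_t set_count)
{
   struct anv_pipeline_sets_layout sets_layout;

   anv_pipeline_sets_layout_init(&sets_layout, device,
                                 false /* independent_sets */);

   for (uint32_t s = 0; s < set_count; s++) {
      if (set_layouts[s] != NULL)
         anv_pipeline_sets_layout_add(&sets_layout, s, set_layouts[s]);
   }

   /* Hash once every set has been added; the SHA-1 feeds pipeline caching. */
   anv_pipeline_sets_layout_hash(&sets_layout);

   /* ... use sets_layout ... */

   anv_pipeline_sets_layout_fini(&sets_layout);
}
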
VkResult anv_CreatePipelineLayout(
VkDevice _device,
const VkPipelineLayoutCreateInfo* pCreateInfo,
@@ -711,40 +1221,33 @@ VkResult anv_CreatePipelineLayout(
assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO);
- layout = vk_object_alloc(&device->vk, pAllocator, sizeof(*layout),
- VK_OBJECT_TYPE_PIPELINE_LAYOUT);
+ layout = vk_object_zalloc(&device->vk, pAllocator, sizeof(*layout),
+ VK_OBJECT_TYPE_PIPELINE_LAYOUT);
if (layout == NULL)
- return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
-
- layout->num_sets = pCreateInfo->setLayoutCount;
+ return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
- unsigned dynamic_offset_count = 0;
+ anv_pipeline_sets_layout_init(&layout->sets_layout, device,
+ pCreateInfo->flags & VK_PIPELINE_LAYOUT_CREATE_INDEPENDENT_SETS_BIT_EXT);
for (uint32_t set = 0; set < pCreateInfo->setLayoutCount; set++) {
ANV_FROM_HANDLE(anv_descriptor_set_layout, set_layout,
pCreateInfo->pSetLayouts[set]);
- layout->set[set].layout = set_layout;
- anv_descriptor_set_layout_ref(set_layout);
- layout->set[set].dynamic_offset_start = dynamic_offset_count;
- for (uint32_t b = 0; b < set_layout->binding_count; b++) {
- if (set_layout->binding[b].dynamic_offset_index < 0)
- continue;
+ /* VUID-VkPipelineLayoutCreateInfo-graphicsPipelineLibrary-06753
+ *
+ * "If graphicsPipelineLibrary is not enabled, elements of
+ * pSetLayouts must be valid VkDescriptorSetLayout objects"
+ *
+ * As a result of supporting graphicsPipelineLibrary, we need to allow
+ * null descriptor set layouts.
+ */
+ if (set_layout == NULL)
+ continue;
- dynamic_offset_count += set_layout->binding[b].array_size;
- }
+ anv_pipeline_sets_layout_add(&layout->sets_layout, set, set_layout);
}
- assert(dynamic_offset_count < MAX_DYNAMIC_BUFFERS);
- struct mesa_sha1 ctx;
- _mesa_sha1_init(&ctx);
- for (unsigned s = 0; s < layout->num_sets; s++) {
- sha1_update_descriptor_set_layout(&ctx, layout->set[s].layout);
- _mesa_sha1_update(&ctx, &layout->set[s].dynamic_offset_start,
- sizeof(layout->set[s].dynamic_offset_start));
- }
- _mesa_sha1_update(&ctx, &layout->num_sets, sizeof(layout->num_sets));
- _mesa_sha1_final(&ctx, layout->sha1);
+ anv_pipeline_sets_layout_hash(&layout->sets_layout);
*pPipelineLayout = anv_pipeline_layout_to_handle(layout);
@@ -757,30 +1260,29 @@ void anv_DestroyPipelineLayout(
const VkAllocationCallbacks* pAllocator)
{
ANV_FROM_HANDLE(anv_device, device, _device);
- ANV_FROM_HANDLE(anv_pipeline_layout, pipeline_layout, _pipelineLayout);
+ ANV_FROM_HANDLE(anv_pipeline_layout, layout, _pipelineLayout);
- if (!pipeline_layout)
+ if (!layout)
return;
- for (uint32_t i = 0; i < pipeline_layout->num_sets; i++)
- anv_descriptor_set_layout_unref(device, pipeline_layout->set[i].layout);
+ anv_pipeline_sets_layout_fini(&layout->sets_layout);
- vk_object_free(&device->vk, pAllocator, pipeline_layout);
+ vk_object_free(&device->vk, pAllocator, layout);
}
/*
* Descriptor pools.
*
- * These are implemented using a big pool of memory and a free-list for the
+ * These are implemented using a big pool of memory and a vma heap for the
* host memory allocations and a state_stream and a free list for the buffer
* view surface state. The spec allows us to fail to allocate due to
* fragmentation in all cases but two: 1) after pool reset, allocating up
* until the pool size with no freeing must succeed and 2) allocating and
- * freeing only descriptor sets with the same layout. Case 1) is easy enogh,
- * and the free lists lets us recycle blocks for case 2).
+ * freeing only descriptor sets with the same layout. Case 1) is easy enough,
+ * and the vma heap ensures case 2).
*/
-/* The vma heap reserves 0 to mean NULL; we have to offset by some ammount to
+/* The vma heap reserves 0 to mean NULL; we have to offset by some amount to
* ensure we can allocate the entire BO without hitting zero. The actual
* amount doesn't matter.
*/
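
The reservation of 0 is the whole point of that base offset: the heap covers [POOL_HEAP_OFFSET, POOL_HEAP_OFFSET + size), so a successful allocation can never be 0, 0 keeps meaning failure, and the stored state offset is simply the returned address minus the base. A standalone toy bump allocator showing the same pattern (the real code below uses util_vma_heap):

/* Toy stand-in for util_vma_heap, just to show why the base offset matters. */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define FAKE_POOL_HEAP_OFFSET 64u  /* made-up value; only needs to be non-zero */

struct toy_heap { uint64_t next, end; };

static uint64_t
toy_alloc(struct toy_heap *h, uint64_t size)
{
   if (h->next + size > h->end)
      return 0; /* 0 unambiguously means "allocation failed" */
   uint64_t addr = h->next;
   h->next += size;
   return addr;
}

int main(void)
{
   struct toy_heap h = {
      .next = FAKE_POOL_HEAP_OFFSET,
      .end  = FAKE_POOL_HEAP_OFFSET + 4096,
   };

   uint64_t vma = toy_alloc(&h, 256);
   assert(vma != 0);

   /* The descriptor state stores the offset relative to the pool memory. */
   uint32_t state_offset = vma - FAKE_POOL_HEAP_OFFSET;
   printf("heap address %llu -> state offset %u\n",
          (unsigned long long)vma, state_offset);
   return 0;
}
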
@@ -788,6 +1290,108 @@ void anv_DestroyPipelineLayout(
#define EMPTY 1
+static VkResult
+anv_descriptor_pool_heap_init(struct anv_device *device,
+ struct anv_descriptor_pool_heap *heap,
+ uint32_t size,
+ bool host_only,
+ bool samplers)
+{
+ if (size == 0)
+ return VK_SUCCESS;
+
+ if (host_only) {
+ heap->size = size;
+ heap->host_mem = vk_zalloc(&device->vk.alloc, size, 8,
+ VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
+ if (heap->host_mem == NULL)
+ return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
+ } else {
+ const char *bo_name =
+ device->physical->indirect_descriptors ? "indirect descriptors" :
+ samplers ? "direct sampler" : "direct surfaces";
+
+ heap->size = align(size, 4096);
+
+ VkResult result = anv_device_alloc_bo(device,
+ bo_name, heap->size,
+ ANV_BO_ALLOC_CAPTURE |
+ ANV_BO_ALLOC_MAPPED |
+ ANV_BO_ALLOC_HOST_CACHED_COHERENT |
+ (samplers ?
+ ANV_BO_ALLOC_SAMPLER_POOL :
+ ANV_BO_ALLOC_DESCRIPTOR_POOL),
+ 0 /* explicit_address */,
+ &heap->bo);
+ if (result != VK_SUCCESS)
+ return vk_error(device, VK_ERROR_OUT_OF_DEVICE_MEMORY);
+ }
+
+ util_vma_heap_init(&heap->heap, POOL_HEAP_OFFSET, heap->size);
+
+ return VK_SUCCESS;
+}
+
+static void
+anv_descriptor_pool_heap_fini(struct anv_device *device,
+ struct anv_descriptor_pool_heap *heap)
+{
+ if (heap->size == 0)
+ return;
+
+ util_vma_heap_finish(&heap->heap);
+
+ if (heap->bo)
+ anv_device_release_bo(device, heap->bo);
+
+ if (heap->host_mem)
+ vk_free(&device->vk.alloc, heap->host_mem);
+}
+
+static void
+anv_descriptor_pool_heap_reset(struct anv_device *device,
+ struct anv_descriptor_pool_heap *heap)
+{
+ if (heap->size == 0)
+ return;
+
+ util_vma_heap_finish(&heap->heap);
+ util_vma_heap_init(&heap->heap, POOL_HEAP_OFFSET, heap->size);
+}
+
+static VkResult
+anv_descriptor_pool_heap_alloc(struct anv_descriptor_pool *pool,
+ struct anv_descriptor_pool_heap *heap,
+ uint32_t size, uint32_t alignment,
+ struct anv_state *state)
+{
+ uint64_t pool_vma_offset =
+ util_vma_heap_alloc(&heap->heap, size, alignment);
+ if (pool_vma_offset == 0)
+ return vk_error(pool, VK_ERROR_FRAGMENTED_POOL);
+
+ assert(pool_vma_offset >= POOL_HEAP_OFFSET &&
+ pool_vma_offset - POOL_HEAP_OFFSET <= INT32_MAX);
+
+ state->offset = pool_vma_offset - POOL_HEAP_OFFSET;
+ state->alloc_size = size;
+ if (heap->host_mem)
+ state->map = heap->host_mem + state->offset;
+ else
+ state->map = heap->bo->map + state->offset;
+
+ return VK_SUCCESS;
+}
+
+static void
+anv_descriptor_pool_heap_free(struct anv_descriptor_pool_heap *heap,
+ struct anv_state state)
+{
+ util_vma_heap_free(&heap->heap,
+ (uint64_t)state.offset + POOL_HEAP_OFFSET,
+ state.alloc_size);
+}
+
VkResult anv_CreateDescriptorPool(
VkDevice _device,
const VkDescriptorPoolCreateInfo* pCreateInfo,
@@ -797,41 +1401,71 @@ VkResult anv_CreateDescriptorPool(
ANV_FROM_HANDLE(anv_device, device, _device);
struct anv_descriptor_pool *pool;
- const VkDescriptorPoolInlineUniformBlockCreateInfoEXT *inline_info =
+ const VkDescriptorPoolInlineUniformBlockCreateInfo *inline_info =
+ vk_find_struct_const(pCreateInfo->pNext,
+ DESCRIPTOR_POOL_INLINE_UNIFORM_BLOCK_CREATE_INFO);
+ const VkMutableDescriptorTypeCreateInfoEXT *mutable_info =
vk_find_struct_const(pCreateInfo->pNext,
- DESCRIPTOR_POOL_INLINE_UNIFORM_BLOCK_CREATE_INFO_EXT);
+ MUTABLE_DESCRIPTOR_TYPE_CREATE_INFO_EXT);
uint32_t descriptor_count = 0;
uint32_t buffer_view_count = 0;
- uint32_t descriptor_bo_size = 0;
+ uint32_t descriptor_bo_surface_size = 0;
+ uint32_t descriptor_bo_sampler_size = 0;
+
+ const enum anv_descriptor_set_layout_type layout_type =
+ device->physical->indirect_descriptors ?
+ ANV_PIPELINE_DESCRIPTOR_SET_LAYOUT_TYPE_INDIRECT :
+ ANV_PIPELINE_DESCRIPTOR_SET_LAYOUT_TYPE_DIRECT;
+
for (uint32_t i = 0; i < pCreateInfo->poolSizeCount; i++) {
enum anv_descriptor_data desc_data =
- anv_descriptor_data_for_type(device->physical,
+ pCreateInfo->pPoolSizes[i].type == VK_DESCRIPTOR_TYPE_MUTABLE_EXT ?
+ anv_descriptor_data_for_mutable_type(device->physical, layout_type,
+ pCreateInfo->flags,
+ mutable_info, i) :
+ anv_descriptor_data_for_type(device->physical, layout_type,
+ pCreateInfo->flags,
pCreateInfo->pPoolSizes[i].type);
if (desc_data & ANV_DESCRIPTOR_BUFFER_VIEW)
buffer_view_count += pCreateInfo->pPoolSizes[i].descriptorCount;
- unsigned desc_data_size = anv_descriptor_data_size(desc_data) *
- pCreateInfo->pPoolSizes[i].descriptorCount;
+ uint16_t desc_surface_size, desc_sampler_size;
+ if (pCreateInfo->pPoolSizes[i].type == VK_DESCRIPTOR_TYPE_MUTABLE_EXT) {
+ anv_descriptor_size_for_mutable_type(device->physical, layout_type,
+ pCreateInfo->flags, mutable_info, i,
+ &desc_surface_size, &desc_sampler_size);
+ } else {
+ anv_descriptor_data_size(desc_data, layout_type,
+ &desc_surface_size, &desc_sampler_size);
+ }
+
+ uint32_t desc_data_surface_size =
+ desc_surface_size * pCreateInfo->pPoolSizes[i].descriptorCount;
+ uint32_t desc_data_sampler_size =
+ desc_sampler_size * pCreateInfo->pPoolSizes[i].descriptorCount;
/* Combined image sampler descriptors can take up to 3 slots if they
* hold a YCbCr image.
*/
if (pCreateInfo->pPoolSizes[i].type ==
- VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER)
- desc_data_size *= 3;
+ VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER) {
+ desc_data_surface_size *= 3;
+ desc_data_sampler_size *= 3;
+ }
if (pCreateInfo->pPoolSizes[i].type ==
- VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT) {
+ VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK) {
/* Inline uniform blocks are specified to use the descriptor array
* size as the size in bytes of the block.
*/
assert(inline_info);
- desc_data_size += pCreateInfo->pPoolSizes[i].descriptorCount;
+ desc_data_surface_size += pCreateInfo->pPoolSizes[i].descriptorCount;
}
- descriptor_bo_size += desc_data_size;
+ descriptor_bo_surface_size += desc_data_surface_size;
+ descriptor_bo_sampler_size += desc_data_sampler_size;
descriptor_count += pCreateInfo->pPoolSizes[i].descriptorCount;
}
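
Worked through with invented numbers, the sizing loop above behaves as follows for a pool holding 8 combined image/samplers plus one 256-byte inline uniform block; the factor of 3 is the worst-case 3-plane YCbCr multiplier and the inline block contributes its byte size directly (the per-set and per-inline alignment slack is added in the hunk that follows). Sizes are placeholders, not driver constants.

/* Placeholder sizes; only the shape of the computation matches the loop above. */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
   const uint32_t surface_size = 64;  /* made-up per-descriptor surface bytes */
   const uint32_t sampler_size = 32;  /* made-up per-descriptor sampler bytes */

   uint32_t surface_total = 0, sampler_total = 0;

   /* VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, descriptorCount = 8:
    * worst case is a 3-plane YCbCr image, hence the * 3.
    */
   surface_total += 8 * surface_size * 3;
   sampler_total += 8 * sampler_size * 3;

   /* VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK, descriptorCount = 256:
    * the count is the block size in bytes.
    */
   surface_total += 256;

   printf("surface heap >= %u bytes, sampler heap >= %u bytes\n",
          surface_total, sampler_total);
   return 0;
}
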
@@ -843,53 +1477,70 @@ VkResult anv_CreateDescriptorPool(
* extra space that we can chop it into maxSets pieces and align each one
* of them to 32B.
*/
- descriptor_bo_size += ANV_UBO_ALIGNMENT * pCreateInfo->maxSets;
+ descriptor_bo_surface_size += ANV_UBO_ALIGNMENT * pCreateInfo->maxSets;
/* We align inline uniform blocks to ANV_UBO_ALIGNMENT */
if (inline_info) {
- descriptor_bo_size +=
+ descriptor_bo_surface_size +=
ANV_UBO_ALIGNMENT * inline_info->maxInlineUniformBlockBindings;
}
- descriptor_bo_size = ALIGN(descriptor_bo_size, 4096);
- const size_t pool_size =
+ const bool host_only =
+ pCreateInfo->flags & VK_DESCRIPTOR_POOL_CREATE_HOST_ONLY_BIT_EXT;
+
+ /* For host_only pools, allocate some memory to hold the written surface
+ * states of the internal anv_buffer_view. With normal pools, the memory
+ * holding surface state is allocated from the device surface_state_pool.
+ */
+ const size_t host_mem_size =
pCreateInfo->maxSets * sizeof(struct anv_descriptor_set) +
descriptor_count * sizeof(struct anv_descriptor) +
- buffer_view_count * sizeof(struct anv_buffer_view);
- const size_t total_size = sizeof(*pool) + pool_size;
+ buffer_view_count * sizeof(struct anv_buffer_view) +
+ (host_only ? buffer_view_count * ANV_SURFACE_STATE_SIZE : 0);
- pool = vk_object_alloc(&device->vk, pAllocator, total_size,
- VK_OBJECT_TYPE_DESCRIPTOR_POOL);
+ pool = vk_object_zalloc(&device->vk, pAllocator,
+ sizeof(*pool) + host_mem_size,
+ VK_OBJECT_TYPE_DESCRIPTOR_POOL);
if (!pool)
- return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+ return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
- pool->size = pool_size;
- pool->next = 0;
- pool->free_list = EMPTY;
+ pool->host_mem_size = host_mem_size;
+ util_vma_heap_init(&pool->host_heap, POOL_HEAP_OFFSET, host_mem_size);
- if (descriptor_bo_size > 0) {
- VkResult result = anv_device_alloc_bo(device,
- "descriptors",
- descriptor_bo_size,
- ANV_BO_ALLOC_MAPPED |
- ANV_BO_ALLOC_SNOOPED,
- 0 /* explicit_address */,
- &pool->bo);
- if (result != VK_SUCCESS) {
- vk_object_free(&device->vk, pAllocator, pool);
- return result;
- }
+ pool->host_only = host_only;
- util_vma_heap_init(&pool->bo_heap, POOL_HEAP_OFFSET, descriptor_bo_size);
- } else {
- pool->bo = NULL;
+ VkResult result = anv_descriptor_pool_heap_init(device,
+ &pool->surfaces,
+ descriptor_bo_surface_size,
+ pool->host_only,
+ false /* samplers */);
+ if (result != VK_SUCCESS) {
+ vk_object_free(&device->vk, pAllocator, pool);
+ return result;
}
+ result = anv_descriptor_pool_heap_init(device,
+ &pool->samplers,
+ descriptor_bo_sampler_size,
+ pool->host_only,
+ true /* samplers */);
+ if (result != VK_SUCCESS) {
+ anv_descriptor_pool_heap_fini(device, &pool->surfaces);
+ vk_object_free(&device->vk, pAllocator, pool);
+ return result;
+ }
+
+ /* All the surface states allocated by the descriptor pool are internal. We
+ * have to allocate them because we do not have surface states for
+ * VkBuffers.
+ */
anv_state_stream_init(&pool->surface_state_stream,
- &device->surface_state_pool, 4096);
+ &device->internal_surface_state_pool, 4096);
pool->surface_state_free_list = NULL;
list_inithead(&pool->desc_sets);
+ ANV_RMV(descriptor_pool_create, device, pCreateInfo, pool, false);
+
*pDescriptorPool = anv_descriptor_pool_to_handle(pool);
return VK_SUCCESS;
@@ -906,17 +1557,20 @@ void anv_DestroyDescriptorPool(
if (!pool)
return;
+ ANV_RMV(resource_destroy, device, pool);
+
list_for_each_entry_safe(struct anv_descriptor_set, set,
&pool->desc_sets, pool_link) {
anv_descriptor_set_layout_unref(device, set->layout);
}
- if (pool->bo) {
- util_vma_heap_finish(&pool->bo_heap);
- anv_device_release_bo(device, pool->bo);
- }
+ util_vma_heap_finish(&pool->host_heap);
+
anv_state_stream_finish(&pool->surface_state_stream);
+ anv_descriptor_pool_heap_fini(device, &pool->surfaces);
+ anv_descriptor_pool_heap_fini(device, &pool->samplers);
+
vk_object_free(&device->vk, pAllocator, pool);
}
@@ -934,73 +1588,51 @@ VkResult anv_ResetDescriptorPool(
}
list_inithead(&pool->desc_sets);
- pool->next = 0;
- pool->free_list = EMPTY;
+ util_vma_heap_finish(&pool->host_heap);
+ util_vma_heap_init(&pool->host_heap, POOL_HEAP_OFFSET, pool->host_mem_size);
- if (pool->bo) {
- util_vma_heap_finish(&pool->bo_heap);
- util_vma_heap_init(&pool->bo_heap, POOL_HEAP_OFFSET, pool->bo->size);
- }
+ anv_descriptor_pool_heap_reset(device, &pool->surfaces);
+ anv_descriptor_pool_heap_reset(device, &pool->samplers);
anv_state_stream_finish(&pool->surface_state_stream);
anv_state_stream_init(&pool->surface_state_stream,
- &device->surface_state_pool, 4096);
+ &device->internal_surface_state_pool, 4096);
pool->surface_state_free_list = NULL;
return VK_SUCCESS;
}
-struct pool_free_list_entry {
- uint32_t next;
- uint32_t size;
-};
-
static VkResult
anv_descriptor_pool_alloc_set(struct anv_descriptor_pool *pool,
uint32_t size,
struct anv_descriptor_set **set)
{
- if (size <= pool->size - pool->next) {
- *set = (struct anv_descriptor_set *) (pool->data + pool->next);
- (*set)->size = size;
- pool->next += size;
- return VK_SUCCESS;
- } else {
- struct pool_free_list_entry *entry;
- uint32_t *link = &pool->free_list;
- for (uint32_t f = pool->free_list; f != EMPTY; f = entry->next) {
- entry = (struct pool_free_list_entry *) (pool->data + f);
- if (size <= entry->size) {
- *link = entry->next;
- *set = (struct anv_descriptor_set *) entry;
- (*set)->size = entry->size;
- return VK_SUCCESS;
- }
- link = &entry->next;
- }
+ uint64_t vma_offset = util_vma_heap_alloc(&pool->host_heap, size, 1);
- if (pool->free_list != EMPTY) {
- return vk_error(VK_ERROR_FRAGMENTED_POOL);
+ if (vma_offset == 0) {
+ if (size <= pool->host_heap.free_size) {
+ return VK_ERROR_FRAGMENTED_POOL;
} else {
- return vk_error(VK_ERROR_OUT_OF_POOL_MEMORY);
+ return VK_ERROR_OUT_OF_POOL_MEMORY;
}
}
+
+ assert(vma_offset >= POOL_HEAP_OFFSET);
+ uint64_t host_mem_offset = vma_offset - POOL_HEAP_OFFSET;
+
+ *set = (struct anv_descriptor_set *) (pool->host_mem + host_mem_offset);
+ (*set)->size = size;
+
+ return VK_SUCCESS;
}
static void
anv_descriptor_pool_free_set(struct anv_descriptor_pool *pool,
struct anv_descriptor_set *set)
{
- /* Put the descriptor set allocation back on the free list. */
- const uint32_t index = (char *) set - pool->data;
- if (index + set->size == pool->next) {
- pool->next = index;
- } else {
- struct pool_free_list_entry *entry = (struct pool_free_list_entry *) set;
- entry->next = pool->free_list;
- entry->size = set->size;
- pool->free_list = (char *) entry - pool->data;
- }
+ util_vma_heap_free(&pool->host_heap,
+ ((char *) set - pool->host_mem) + POOL_HEAP_OFFSET,
+ set->size);
}
struct surface_state_free_list_entry {
@@ -1011,16 +1643,21 @@ struct surface_state_free_list_entry {
static struct anv_state
anv_descriptor_pool_alloc_state(struct anv_descriptor_pool *pool)
{
+ assert(!pool->host_only);
+
struct surface_state_free_list_entry *entry =
pool->surface_state_free_list;
if (entry) {
struct anv_state state = entry->state;
pool->surface_state_free_list = entry->next;
- assert(state.alloc_size == 64);
+ assert(state.alloc_size == ANV_SURFACE_STATE_SIZE);
return state;
} else {
- return anv_state_stream_alloc(&pool->surface_state_stream, 64, 64);
+ struct anv_state state =
+ anv_state_stream_alloc(&pool->surface_state_stream,
+ ANV_SURFACE_STATE_SIZE, 64);
+ return state;
}
}
@@ -1028,6 +1665,7 @@ static void
anv_descriptor_pool_free_state(struct anv_descriptor_pool *pool,
struct anv_state state)
{
+ assert(state.alloc_size);
/* Put the buffer view surface state back on the free list. */
struct surface_state_free_list_entry *entry = state.map;
entry->next = pool->surface_state_free_list;
@@ -1035,9 +1673,9 @@ anv_descriptor_pool_free_state(struct anv_descriptor_pool *pool,
pool->surface_state_free_list = entry;
}
-size_t
+static size_t
anv_descriptor_set_layout_size(const struct anv_descriptor_set_layout *layout,
- uint32_t var_desc_count)
+ bool host_only, uint32_t var_desc_count)
{
const uint32_t descriptor_count =
set_layout_descriptor_count(layout, var_desc_count);
@@ -1046,10 +1684,11 @@ anv_descriptor_set_layout_size(const struct anv_descriptor_set_layout *layout,
return sizeof(struct anv_descriptor_set) +
descriptor_count * sizeof(struct anv_descriptor) +
- buffer_view_count * sizeof(struct anv_buffer_view);
+ buffer_view_count * sizeof(struct anv_buffer_view) +
+ (host_only ? buffer_view_count * ANV_SURFACE_STATE_SIZE : 0);
}
-VkResult
+static VkResult
anv_descriptor_set_create(struct anv_device *device,
struct anv_descriptor_pool *pool,
struct anv_descriptor_set_layout *layout,
@@ -1057,46 +1696,78 @@ anv_descriptor_set_create(struct anv_device *device,
struct anv_descriptor_set **out_set)
{
struct anv_descriptor_set *set;
- const size_t size = anv_descriptor_set_layout_size(layout, var_desc_count);
+ const size_t size = anv_descriptor_set_layout_size(layout,
+ pool->host_only,
+ var_desc_count);
VkResult result = anv_descriptor_pool_alloc_set(pool, size, &set);
if (result != VK_SUCCESS)
return result;
- uint32_t descriptor_buffer_size =
- anv_descriptor_set_layout_descriptor_buffer_size(layout, var_desc_count);
- if (descriptor_buffer_size) {
- uint64_t pool_vma_offset =
- util_vma_heap_alloc(&pool->bo_heap, descriptor_buffer_size,
- ANV_UBO_ALIGNMENT);
- if (pool_vma_offset == 0) {
+ uint32_t descriptor_buffer_surface_size, descriptor_buffer_sampler_size;
+ anv_descriptor_set_layout_descriptor_buffer_size(layout, var_desc_count,
+ &descriptor_buffer_surface_size,
+ &descriptor_buffer_sampler_size);
+
+ set->desc_surface_state = ANV_STATE_NULL;
+ set->is_push = false;
+
+ if (descriptor_buffer_surface_size) {
+ result = anv_descriptor_pool_heap_alloc(pool, &pool->surfaces,
+ descriptor_buffer_surface_size,
+ ANV_UBO_ALIGNMENT,
+ &set->desc_surface_mem);
+ if (result != VK_SUCCESS) {
anv_descriptor_pool_free_set(pool, set);
- return vk_error(VK_ERROR_FRAGMENTED_POOL);
+ return result;
}
- assert(pool_vma_offset >= POOL_HEAP_OFFSET &&
- pool_vma_offset - POOL_HEAP_OFFSET <= INT32_MAX);
- set->desc_mem.offset = pool_vma_offset - POOL_HEAP_OFFSET;
- set->desc_mem.alloc_size = descriptor_buffer_size;
- set->desc_mem.map = pool->bo->map + set->desc_mem.offset;
-
- set->desc_addr = (struct anv_address) {
- .bo = pool->bo,
- .offset = set->desc_mem.offset,
+
+ set->desc_surface_addr = (struct anv_address) {
+ .bo = pool->surfaces.bo,
+ .offset = set->desc_surface_mem.offset,
};
+ set->desc_offset = anv_address_physical(set->desc_surface_addr) -
+ device->physical->va.internal_surface_state_pool.addr;
enum isl_format format =
anv_isl_format_for_descriptor_type(device,
VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER);
- set->desc_surface_state = anv_descriptor_pool_alloc_state(pool);
- anv_fill_buffer_surface_state(device, set->desc_surface_state, format,
- ISL_SURF_USAGE_CONSTANT_BUFFER_BIT,
- set->desc_addr,
- descriptor_buffer_size, 1);
+ if (!pool->host_only) {
+ set->desc_surface_state = anv_descriptor_pool_alloc_state(pool);
+ if (set->desc_surface_state.map == NULL) {
+ anv_descriptor_pool_free_set(pool, set);
+ return vk_error(pool, VK_ERROR_OUT_OF_DEVICE_MEMORY);
+ }
+
+ anv_fill_buffer_surface_state(device, set->desc_surface_state.map,
+ format, ISL_SWIZZLE_IDENTITY,
+ ISL_SURF_USAGE_CONSTANT_BUFFER_BIT,
+ set->desc_surface_addr,
+ descriptor_buffer_surface_size, 1);
+ }
} else {
- set->desc_mem = ANV_STATE_NULL;
- set->desc_addr = (struct anv_address) { .bo = NULL, .offset = 0 };
- set->desc_surface_state = ANV_STATE_NULL;
+ set->desc_surface_mem = ANV_STATE_NULL;
+ set->desc_surface_addr = ANV_NULL_ADDRESS;
+ }
+
+ if (descriptor_buffer_sampler_size) {
+ result = anv_descriptor_pool_heap_alloc(pool, &pool->samplers,
+ descriptor_buffer_sampler_size,
+ ANV_SAMPLER_STATE_SIZE,
+ &set->desc_sampler_mem);
+ if (result != VK_SUCCESS) {
+ anv_descriptor_pool_free_set(pool, set);
+ return result;
+ }
+
+ set->desc_sampler_addr = (struct anv_address) {
+ .bo = pool->samplers.bo,
+ .offset = set->desc_sampler_mem.offset,
+ };
+ } else {
+ set->desc_sampler_mem = ANV_STATE_NULL;
+ set->desc_sampler_addr = ANV_NULL_ADDRESS;
}
vk_object_base_init(&device->vk, &set->base,
@@ -1120,7 +1791,6 @@ anv_descriptor_set_create(struct anv_device *device,
sizeof(struct anv_descriptor) * set->descriptor_count);
/* Go through and fill out immutable samplers if we have any */
- struct anv_descriptor *desc = set->descriptors;
for (uint32_t b = 0; b < layout->binding_count; b++) {
if (layout->binding[b].immutable_samplers) {
for (uint32_t i = 0; i < layout->binding[b].array_size; i++) {
@@ -1139,13 +1809,30 @@ anv_descriptor_set_create(struct anv_device *device,
b, i);
}
}
- desc += layout->binding[b].array_size;
}
- /* Allocate surface state for the buffer views. */
- for (uint32_t b = 0; b < set->buffer_view_count; b++) {
- set->buffer_views[b].surface_state =
- anv_descriptor_pool_alloc_state(pool);
+ /* Allocate surface states for real descriptor sets if we're using indirect
+ * descriptors. For host-only sets, we just store the surface state data in
+ * malloc memory.
+ */
+ if (device->physical->indirect_descriptors) {
+ if (!pool->host_only) {
+ for (uint32_t b = 0; b < set->buffer_view_count; b++) {
+ set->buffer_views[b].general.state =
+ anv_descriptor_pool_alloc_state(pool);
+ }
+ } else {
+ void *host_surface_states =
+ set->buffer_views + set->buffer_view_count;
+ memset(host_surface_states, 0,
+ set->buffer_view_count * ANV_SURFACE_STATE_SIZE);
+ for (uint32_t b = 0; b < set->buffer_view_count; b++) {
+ set->buffer_views[b].general.state = (struct anv_state) {
+ .alloc_size = ANV_SURFACE_STATE_SIZE,
+ .map = host_surface_states + b * ANV_SURFACE_STATE_SIZE,
+ };
+ }
+ }
}
list_addtail(&set->pool_link, &pool->desc_sets);
@@ -1155,22 +1842,32 @@ anv_descriptor_set_create(struct anv_device *device,
return VK_SUCCESS;
}
-void
+static void
anv_descriptor_set_destroy(struct anv_device *device,
struct anv_descriptor_pool *pool,
struct anv_descriptor_set *set)
{
anv_descriptor_set_layout_unref(device, set->layout);
- if (set->desc_mem.alloc_size) {
- util_vma_heap_free(&pool->bo_heap,
- (uint64_t)set->desc_mem.offset + POOL_HEAP_OFFSET,
- set->desc_mem.alloc_size);
- anv_descriptor_pool_free_state(pool, set->desc_surface_state);
+ if (set->desc_surface_mem.alloc_size) {
+ anv_descriptor_pool_heap_free(&pool->surfaces, set->desc_surface_mem);
+ if (set->desc_surface_state.alloc_size)
+ anv_descriptor_pool_free_state(pool, set->desc_surface_state);
}
- for (uint32_t b = 0; b < set->buffer_view_count; b++)
- anv_descriptor_pool_free_state(pool, set->buffer_views[b].surface_state);
+ if (set->desc_sampler_mem.alloc_size)
+ anv_descriptor_pool_heap_free(&pool->samplers, set->desc_sampler_mem);
+
+ if (device->physical->indirect_descriptors) {
+ if (!pool->host_only) {
+ for (uint32_t b = 0; b < set->buffer_view_count; b++) {
+ if (set->buffer_views[b].general.state.alloc_size) {
+ anv_descriptor_pool_free_state(
+ pool, set->buffer_views[b].general.state);
+ }
+ }
+ }
+ }
list_del(&set->pool_link);
@@ -1187,7 +1884,7 @@ VkResult anv_AllocateDescriptorSets(
ANV_FROM_HANDLE(anv_descriptor_pool, pool, pAllocateInfo->descriptorPool);
VkResult result = VK_SUCCESS;
- struct anv_descriptor_set *set;
+ struct anv_descriptor_set *set = NULL;
uint32_t i;
const VkDescriptorSetVariableDescriptorCountAllocateInfo *vdcai =
@@ -1212,9 +1909,20 @@ VkResult anv_AllocateDescriptorSets(
pDescriptorSets[i] = anv_descriptor_set_to_handle(set);
}
- if (result != VK_SUCCESS)
+ if (result != VK_SUCCESS) {
anv_FreeDescriptorSets(_device, pAllocateInfo->descriptorPool,
i, pDescriptorSets);
+ /* The Vulkan 1.3.228 spec, section 14.2.3. Allocation of Descriptor Sets:
+ *
+ * "If the creation of any of those descriptor sets fails, then the
+ * implementation must destroy all successfully created descriptor
+ * set objects from this command, set all entries of the
+ * pDescriptorSets array to VK_NULL_HANDLE and return the error."
+ */
+ for (i = 0; i < pAllocateInfo->descriptorSetCount; i++)
+ pDescriptorSets[i] = VK_NULL_HANDLE;
+
+ }
return result;
}
@@ -1240,34 +1948,194 @@ VkResult anv_FreeDescriptorSets(
return VK_SUCCESS;
}
-static void
-anv_descriptor_set_write_image_param(uint32_t *param_desc_map,
- const struct brw_image_param *param)
+bool
+anv_push_descriptor_set_init(struct anv_cmd_buffer *cmd_buffer,
+ struct anv_push_descriptor_set *push_set,
+ struct anv_descriptor_set_layout *layout)
{
-#define WRITE_PARAM_FIELD(field, FIELD) \
- for (unsigned i = 0; i < ARRAY_SIZE(param->field); i++) \
- param_desc_map[BRW_IMAGE_PARAM_##FIELD##_OFFSET + i] = param->field[i]
+ struct anv_descriptor_set *set = &push_set->set;
+ /* Only copy the old descriptor data if needed:
+ * - not if there was no previous layout
+ * - not if the layout is different (the descriptor set data becomes
+ * undefined)
+ * - not if there is only one descriptor, since the entire data will
+ * be replaced
+ *
+ * TODO: we could optimize further, e.g. keep a copy of the old data on
+ * the host, or copy only the bits that were not newly written, ...
+ */
+ const bool copy_old_descriptors = set->layout != NULL &&
+ set->layout == layout &&
+ layout->descriptor_count > 1;
- WRITE_PARAM_FIELD(offset, OFFSET);
- WRITE_PARAM_FIELD(size, SIZE);
- WRITE_PARAM_FIELD(stride, STRIDE);
- WRITE_PARAM_FIELD(tiling, TILING);
- WRITE_PARAM_FIELD(swizzling, SWIZZLING);
- WRITE_PARAM_FIELD(size, SIZE);
+ if (set->layout != layout) {
+ if (set->layout) {
+ anv_descriptor_set_layout_unref(cmd_buffer->device, set->layout);
+ } else {
+ /* one-time initialization */
+ vk_object_base_init(&cmd_buffer->device->vk, &set->base,
+ VK_OBJECT_TYPE_DESCRIPTOR_SET);
+ set->is_push = true;
+ set->buffer_views = push_set->buffer_views;
+ }
-#undef WRITE_PARAM_FIELD
+ anv_descriptor_set_layout_ref(layout);
+ set->layout = layout;
+ set->generate_surface_states = 0;
+ }
+
+ assert(set->is_push && set->buffer_views);
+ set->size = anv_descriptor_set_layout_size(layout, false /* host_only */, 0);
+ set->buffer_view_count = layout->buffer_view_count;
+ set->descriptor_count = layout->descriptor_count;
+
+ if (layout->descriptor_buffer_surface_size &&
+ (push_set->set_used_on_gpu ||
+ set->desc_surface_mem.alloc_size < layout->descriptor_buffer_surface_size)) {
+ struct anv_physical_device *pdevice = cmd_buffer->device->physical;
+ struct anv_state_stream *push_stream;
+ uint64_t push_base_address;
+
+ if (layout->flags & VK_DESCRIPTOR_SET_LAYOUT_CREATE_DESCRIPTOR_BUFFER_BIT_EXT) {
+ push_stream = pdevice->uses_ex_bso ?
+ &cmd_buffer->push_descriptor_buffer_stream :
+ &cmd_buffer->surface_state_stream;
+ push_base_address = pdevice->uses_ex_bso ?
+ pdevice->va.push_descriptor_buffer_pool.addr :
+ pdevice->va.internal_surface_state_pool.addr;
+ } else {
+ push_stream = pdevice->indirect_descriptors ?
+ &cmd_buffer->indirect_push_descriptor_stream :
+ &cmd_buffer->surface_state_stream;
+ push_base_address = pdevice->indirect_descriptors ?
+ pdevice->va.indirect_push_descriptor_pool.addr :
+ pdevice->va.internal_surface_state_pool.addr;
+ }
+
+ uint32_t surface_size, sampler_size;
+ anv_descriptor_set_layout_descriptor_buffer_size(layout, 0,
+ &surface_size,
+ &sampler_size);
+
+ /* The previous buffer is either actively used by some GPU command (so
+ * we can't modify it) or is too small. Allocate a new one.
+ */
+ struct anv_state desc_surface_mem =
+ anv_state_stream_alloc(push_stream, surface_size, ANV_UBO_ALIGNMENT);
+ if (desc_surface_mem.map == NULL)
+ return false;
+
+ if (copy_old_descriptors) {
+ memcpy(desc_surface_mem.map, set->desc_surface_mem.map,
+ MIN2(desc_surface_mem.alloc_size,
+ set->desc_surface_mem.alloc_size));
+ }
+ set->desc_surface_mem = desc_surface_mem;
+
+ set->desc_surface_addr = anv_state_pool_state_address(
+ push_stream->state_pool,
+ set->desc_surface_mem);
+ set->desc_offset = anv_address_physical(set->desc_surface_addr) -
+ push_base_address;
+ }
+
+ if (layout->descriptor_buffer_sampler_size &&
+ (push_set->set_used_on_gpu ||
+ set->desc_sampler_mem.alloc_size < layout->descriptor_buffer_sampler_size)) {
+ struct anv_physical_device *pdevice = cmd_buffer->device->physical;
+ assert(!pdevice->indirect_descriptors);
+ struct anv_state_stream *push_stream = &cmd_buffer->dynamic_state_stream;
+
+ uint32_t surface_size, sampler_size;
+ anv_descriptor_set_layout_descriptor_buffer_size(layout, 0,
+ &surface_size,
+ &sampler_size);
+
+ /* The previous buffer is either actively used by some GPU command (so
+ * we can't modify it) or is too small. Allocate a new one.
+ */
+ struct anv_state desc_sampler_mem =
+ anv_state_stream_alloc(push_stream, sampler_size, ANV_SAMPLER_STATE_SIZE);
+ if (desc_sampler_mem.map == NULL)
+ return false;
+
+ if (copy_old_descriptors) {
+ memcpy(desc_sampler_mem.map, set->desc_sampler_mem.map,
+ MIN2(desc_sampler_mem.alloc_size,
+ set->desc_sampler_mem.alloc_size));
+ }
+ set->desc_sampler_mem = desc_sampler_mem;
+
+ set->desc_sampler_addr = anv_state_pool_state_address(
+ push_stream->state_pool,
+ set->desc_sampler_mem);
+ }
+
+ if (push_set->set_used_on_gpu) {
+ set->desc_surface_state = ANV_STATE_NULL;
+ push_set->set_used_on_gpu = false;
+ }
+
+ return true;
+}
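The copy_old_descriptors logic above covers partial pushes: vkCmdPushDescriptorSetKHR may rewrite only some bindings, and data pushed earlier has to survive when a new backing allocation is taken. A minimal app-side sketch of that case (cmd, pipeline_layout and buffer_info are hypothetical handles, not taken from this change):

    /* A first push filled bindings 0 and 1; this second push rewrites only
     * binding 1, so the driver must carry binding 0 over into any freshly
     * allocated desc_surface_mem (the memcpy above). */
    VkWriteDescriptorSet write = {
       .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET,
       .dstBinding = 1,
       .descriptorCount = 1,
       .descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER,
       .pBufferInfo = &buffer_info,
    };
    vkCmdPushDescriptorSetKHR(cmd, VK_PIPELINE_BIND_POINT_GRAPHICS,
                              pipeline_layout, /*set*/ 0, 1, &write);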
+
+void
+anv_push_descriptor_set_finish(struct anv_push_descriptor_set *push_set)
+{
+ struct anv_descriptor_set *set = &push_set->set;
+ if (set->layout) {
+ struct anv_device *device =
+ container_of(set->base.device, struct anv_device, vk);
+ anv_descriptor_set_layout_unref(device, set->layout);
+ }
}
static uint32_t
-anv_surface_state_to_handle(struct anv_state state)
+anv_surface_state_to_handle(struct anv_physical_device *device,
+ struct anv_state state)
{
/* Bits 31:12 of the bindless surface offset in the extended message
* descriptor is bits 25:6 of the byte-based address.
*/
assert(state.offset >= 0);
uint32_t offset = state.offset;
- assert((offset & 0x3f) == 0 && offset < (1 << 26));
- return offset << 6;
+ if (device->uses_ex_bso) {
+ assert((offset & 0x3f) == 0);
+ return offset;
+ } else {
+ assert((offset & 0x3f) == 0 && offset < (1 << 26));
+ return offset << 6;
+ }
+}
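As a sanity check on the bit packing above, here is the same computation as a standalone sketch (illustration only; it mirrors the two branches and their asserts, nothing here is new driver behavior):

    #include <assert.h>
    #include <stdbool.h>
    #include <stdint.h>

    /* Convert a 64B-aligned surface state offset into the handle written
     * into descriptors, following anv_surface_state_to_handle() above. */
    static uint32_t
    bindless_handle_for_offset(uint32_t offset, bool uses_ex_bso)
    {
       assert((offset & 0x3f) == 0);   /* surface states are 64B aligned */
       if (uses_ex_bso)
          return offset;               /* byte offset is used directly */
       assert(offset < (1u << 26));    /* 26-bit byte offset limit */
       return offset << 6;             /* address bits 25:6 -> handle bits 31:12 */
    }

    /* Example: a state at byte offset 0x1040 becomes 0x41000 without
     * ex_bso and stays 0x1040 with it. */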
+
+static const void *
+anv_image_view_surface_data_for_plane_layout(struct anv_image_view *image_view,
+ VkDescriptorType desc_type,
+ unsigned plane,
+ VkImageLayout layout)
+{
+ if (desc_type == VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER ||
+ desc_type == VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE ||
+ desc_type == VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT) {
+ return layout == VK_IMAGE_LAYOUT_GENERAL ?
+ &image_view->planes[plane].general_sampler.state_data :
+ &image_view->planes[plane].optimal_sampler.state_data;
+ }
+
+ if (desc_type == VK_DESCRIPTOR_TYPE_STORAGE_IMAGE)
+ return &image_view->planes[plane].storage.state_data;
+
+ unreachable("Invalid descriptor type");
+}
+
+static const uint32_t *
+anv_sampler_state_for_descriptor_set(const struct anv_sampler *sampler,
+ const struct anv_descriptor_set *set,
+ uint32_t plane)
+{
+ if (set->layout->flags & VK_DESCRIPTOR_SET_LAYOUT_CREATE_DESCRIPTOR_BUFFER_BIT_EXT)
+ return sampler->db_state[plane];
+ return sampler->state[plane];
}
void
@@ -1289,7 +2157,8 @@ anv_descriptor_set_write_image_view(struct anv_device *device,
* set initialization to set the bindless samplers.
*/
assert(type == bind_layout->type ||
- type == VK_DESCRIPTOR_TYPE_SAMPLER);
+ type == VK_DESCRIPTOR_TYPE_SAMPLER ||
+ bind_layout->type == VK_DESCRIPTOR_TYPE_MUTABLE_EXT);
switch (type) {
case VK_DESCRIPTOR_TYPE_SAMPLER:
@@ -1322,21 +2191,27 @@ anv_descriptor_set_write_image_view(struct anv_device *device,
.sampler = sampler,
};
- void *desc_map = set->desc_mem.map + bind_layout->descriptor_offset +
- element * anv_descriptor_size(bind_layout);
- memset(desc_map, 0, anv_descriptor_size(bind_layout));
+ void *desc_surface_map = set->desc_surface_mem.map +
+ bind_layout->descriptor_surface_offset +
+ element * bind_layout->descriptor_surface_stride;
- if (bind_layout->data & ANV_DESCRIPTOR_SAMPLED_IMAGE) {
+ enum anv_descriptor_data data =
+ bind_layout->type == VK_DESCRIPTOR_TYPE_MUTABLE_EXT ?
+ anv_descriptor_data_for_type(device->physical, set->layout->type,
+ set->layout->flags, type) :
+ bind_layout->data;
+
+ if (data & ANV_DESCRIPTOR_INDIRECT_SAMPLED_IMAGE) {
struct anv_sampled_image_descriptor desc_data[3];
memset(desc_data, 0, sizeof(desc_data));
if (image_view) {
for (unsigned p = 0; p < image_view->n_planes; p++) {
- struct anv_surface_state sstate =
- (desc->layout == VK_IMAGE_LAYOUT_GENERAL) ?
- image_view->planes[p].general_sampler_surface_state :
- image_view->planes[p].optimal_sampler_surface_state;
- desc_data[p].image = anv_surface_state_to_handle(sstate.state);
+ const struct anv_surface_state *sstate =
+ anv_image_view_texture_surface_state(image_view, p,
+ desc->layout);
+ desc_data[p].image =
+ anv_surface_state_to_handle(device->physical, sstate->state);
}
}
@@ -1349,55 +2224,101 @@ anv_descriptor_set_write_image_view(struct anv_device *device,
* can be no more than the size of our array of handles.
*/
assert(bind_layout->max_plane_count <= ARRAY_SIZE(desc_data));
- memcpy(desc_map, desc_data,
+ memcpy(desc_surface_map, desc_data,
MAX2(1, bind_layout->max_plane_count) * sizeof(desc_data[0]));
}
- if (image_view == NULL)
- return;
+ if (data & ANV_DESCRIPTOR_INDIRECT_STORAGE_IMAGE) {
+ if (image_view) {
+ assert(image_view->n_planes == 1);
+ struct anv_storage_image_descriptor desc_data = {
+ .vanilla = anv_surface_state_to_handle(
+ device->physical,
+ anv_image_view_storage_surface_state(image_view)->state),
+ .image_depth = image_view->vk.storage.z_slice_count,
+ };
+ memcpy(desc_surface_map, &desc_data, sizeof(desc_data));
+ } else {
+ memset(desc_surface_map, 0, bind_layout->descriptor_surface_stride);
+ }
+ }
- if (bind_layout->data & ANV_DESCRIPTOR_STORAGE_IMAGE) {
- assert(!(bind_layout->data & ANV_DESCRIPTOR_IMAGE_PARAM));
- assert(image_view->n_planes == 1);
- struct anv_storage_image_descriptor desc_data = {
- .read_write = anv_surface_state_to_handle(
- image_view->planes[0].storage_surface_state.state),
- .write_only = anv_surface_state_to_handle(
- image_view->planes[0].writeonly_storage_surface_state.state),
- };
- memcpy(desc_map, &desc_data, sizeof(desc_data));
+ if (data & ANV_DESCRIPTOR_SAMPLER) {
+ void *sampler_map =
+ set->layout->type == ANV_PIPELINE_DESCRIPTOR_SET_LAYOUT_TYPE_DIRECT ?
+ (set->desc_sampler_mem.map +
+ bind_layout->descriptor_sampler_offset +
+ element * bind_layout->descriptor_sampler_stride) : desc_surface_map;
+ if (sampler) {
+ for (unsigned p = 0; p < sampler->n_planes; p++) {
+ memcpy(sampler_map + p * ANV_SAMPLER_STATE_SIZE,
+ anv_sampler_state_for_descriptor_set(sampler, set, p),
+ ANV_SAMPLER_STATE_SIZE);
+ }
+ } else {
+ memset(sampler_map, 0, bind_layout->descriptor_sampler_stride);
+ }
}
- if (bind_layout->data & ANV_DESCRIPTOR_IMAGE_PARAM) {
- /* Storage images can only ever have one plane */
- assert(image_view->n_planes == 1);
- const struct brw_image_param *image_param =
- &image_view->planes[0].storage_image_param;
+ if (data & ANV_DESCRIPTOR_SURFACE) {
+ unsigned max_plane_count = image_view ? image_view->n_planes : 1;
+
+ for (unsigned p = 0; p < max_plane_count; p++) {
+ void *plane_map = desc_surface_map + p * ANV_SURFACE_STATE_SIZE;
- anv_descriptor_set_write_image_param(desc_map, image_param);
+ if (image_view) {
+ memcpy(plane_map,
+ anv_image_view_surface_data_for_plane_layout(image_view, type,
+ p, desc->layout),
+ ANV_SURFACE_STATE_SIZE);
+ } else {
+ memcpy(plane_map, &device->host_null_surface_state, ANV_SURFACE_STATE_SIZE);
+ }
+ }
}
- if (bind_layout->data & ANV_DESCRIPTOR_TEXTURE_SWIZZLE) {
- assert(!(bind_layout->data & ANV_DESCRIPTOR_SAMPLED_IMAGE));
- assert(image_view);
- struct anv_texture_swizzle_descriptor desc_data[3];
- memset(desc_data, 0, sizeof(desc_data));
+ if (data & ANV_DESCRIPTOR_SURFACE_SAMPLER) {
+ unsigned max_plane_count =
+ MAX2(image_view ? image_view->n_planes : 1,
+ sampler ? sampler->n_planes : 1);
- for (unsigned p = 0; p < image_view->n_planes; p++) {
- desc_data[p] = (struct anv_texture_swizzle_descriptor) {
- .swizzle = {
- (uint8_t)image_view->planes[p].isl.swizzle.r,
- (uint8_t)image_view->planes[p].isl.swizzle.g,
- (uint8_t)image_view->planes[p].isl.swizzle.b,
- (uint8_t)image_view->planes[p].isl.swizzle.a,
- },
- };
+ for (unsigned p = 0; p < max_plane_count; p++) {
+ void *plane_map = desc_surface_map + p * 2 * ANV_SURFACE_STATE_SIZE;
+
+ if (image_view) {
+ memcpy(plane_map,
+ anv_image_view_surface_data_for_plane_layout(image_view, type,
+ p, desc->layout),
+ ANV_SURFACE_STATE_SIZE);
+ } else {
+ memcpy(plane_map, &device->host_null_surface_state, ANV_SURFACE_STATE_SIZE);
+ }
+
+ if (sampler) {
+ memcpy(plane_map + ANV_SURFACE_STATE_SIZE,
+ anv_sampler_state_for_descriptor_set(sampler, set, p),
+ ANV_SAMPLER_STATE_SIZE);
+ } else {
+ memset(plane_map + ANV_SURFACE_STATE_SIZE, 0,
+ ANV_SAMPLER_STATE_SIZE);
+ }
}
- memcpy(desc_map, desc_data,
- MAX2(1, bind_layout->max_plane_count) * sizeof(desc_data[0]));
}
}
+static const void *
+anv_buffer_view_surface_data(struct anv_buffer_view *buffer_view,
+ VkDescriptorType desc_type)
+{
+ if (desc_type == VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER)
+ return &buffer_view->general.state_data;
+
+ if (desc_type == VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER)
+ return &buffer_view->storage.state_data;
+
+ unreachable("Invalid descriptor type");
+}
+
void
anv_descriptor_set_write_buffer_view(struct anv_device *device,
struct anv_descriptor_set *set,
@@ -1411,50 +2332,79 @@ anv_descriptor_set_write_buffer_view(struct anv_device *device,
struct anv_descriptor *desc =
&set->descriptors[bind_layout->descriptor_index + element];
- assert(type == bind_layout->type);
-
- void *desc_map = set->desc_mem.map + bind_layout->descriptor_offset +
- element * anv_descriptor_size(bind_layout);
-
- if (buffer_view == NULL) {
- *desc = (struct anv_descriptor) { .type = type, };
- memset(desc_map, 0, anv_descriptor_size(bind_layout));
- return;
- }
+ assert(type == bind_layout->type ||
+ bind_layout->type == VK_DESCRIPTOR_TYPE_MUTABLE_EXT);
*desc = (struct anv_descriptor) {
.type = type,
.buffer_view = buffer_view,
};
- if (bind_layout->data & ANV_DESCRIPTOR_SAMPLED_IMAGE) {
+ enum anv_descriptor_data data =
+ bind_layout->type == VK_DESCRIPTOR_TYPE_MUTABLE_EXT ?
+ anv_descriptor_data_for_type(device->physical, set->layout->type,
+ set->layout->flags, type) :
+ bind_layout->data;
+
+ void *desc_map = set->desc_surface_mem.map +
+ bind_layout->descriptor_surface_offset +
+ element * bind_layout->descriptor_surface_stride;
+
+ if (buffer_view == NULL) {
+ if (data & ANV_DESCRIPTOR_SURFACE)
+ memcpy(desc_map, &device->host_null_surface_state, ANV_SURFACE_STATE_SIZE);
+ else
+ memset(desc_map, 0, bind_layout->descriptor_surface_stride);
+ return;
+ }
+
+ if (data & ANV_DESCRIPTOR_INDIRECT_SAMPLED_IMAGE) {
struct anv_sampled_image_descriptor desc_data = {
- .image = anv_surface_state_to_handle(buffer_view->surface_state),
+ .image = anv_surface_state_to_handle(
+ device->physical, buffer_view->general.state),
};
memcpy(desc_map, &desc_data, sizeof(desc_data));
}
- if (bind_layout->data & ANV_DESCRIPTOR_STORAGE_IMAGE) {
- assert(!(bind_layout->data & ANV_DESCRIPTOR_IMAGE_PARAM));
+ if (data & ANV_DESCRIPTOR_INDIRECT_STORAGE_IMAGE) {
struct anv_storage_image_descriptor desc_data = {
- .read_write = anv_surface_state_to_handle(
- buffer_view->storage_surface_state),
- .write_only = anv_surface_state_to_handle(
- buffer_view->writeonly_storage_surface_state),
+ .vanilla = anv_surface_state_to_handle(
+ device->physical, buffer_view->storage.state),
};
memcpy(desc_map, &desc_data, sizeof(desc_data));
}
- if (bind_layout->data & ANV_DESCRIPTOR_IMAGE_PARAM) {
- anv_descriptor_set_write_image_param(desc_map,
- &buffer_view->storage_image_param);
+ if (data & ANV_DESCRIPTOR_SURFACE) {
+ memcpy(desc_map,
+ anv_buffer_view_surface_data(buffer_view, type),
+ ANV_SURFACE_STATE_SIZE);
}
}
void
+anv_descriptor_write_surface_state(struct anv_device *device,
+ struct anv_descriptor *desc,
+ struct anv_state surface_state)
+{
+ assert(surface_state.alloc_size);
+
+ struct anv_buffer_view *bview = desc->buffer_view;
+
+ bview->general.state = surface_state;
+
+ isl_surf_usage_flags_t usage =
+ anv_isl_usage_for_descriptor_type(desc->type);
+
+ enum isl_format format =
+ anv_isl_format_for_descriptor_type(device, desc->type);
+ anv_fill_buffer_surface_state(device, bview->general.state.map,
+ format, ISL_SWIZZLE_IDENTITY,
+ usage, bview->address, bview->vk.range, 1);
+}
+
+void
anv_descriptor_set_write_buffer(struct anv_device *device,
struct anv_descriptor_set *set,
- struct anv_state_stream *alloc_stream,
VkDescriptorType type,
struct anv_buffer *buffer,
uint32_t binding,
@@ -1464,76 +2414,95 @@ anv_descriptor_set_write_buffer(struct anv_device *device,
{
const struct anv_descriptor_set_binding_layout *bind_layout =
&set->layout->binding[binding];
- struct anv_descriptor *desc =
- &set->descriptors[bind_layout->descriptor_index + element];
+ const uint32_t descriptor_index = bind_layout->descriptor_index + element;
+ struct anv_descriptor *desc = &set->descriptors[descriptor_index];
+
+ assert(type == bind_layout->type ||
+ bind_layout->type == VK_DESCRIPTOR_TYPE_MUTABLE_EXT);
+
+ *desc = (struct anv_descriptor) {
+ .type = type,
+ .offset = offset,
+ .range = range,
+ .buffer = buffer,
+ };
- assert(type == bind_layout->type);
+ enum anv_descriptor_data data =
+ bind_layout->type == VK_DESCRIPTOR_TYPE_MUTABLE_EXT ?
+ anv_descriptor_data_for_type(device->physical, set->layout->type,
+ set->layout->flags, type) :
+ bind_layout->data;
- void *desc_map = set->desc_mem.map + bind_layout->descriptor_offset +
- element * anv_descriptor_size(bind_layout);
+ void *desc_map = set->desc_surface_mem.map +
+ bind_layout->descriptor_surface_offset +
+ element * bind_layout->descriptor_surface_stride;
if (buffer == NULL) {
- *desc = (struct anv_descriptor) { .type = type, };
- memset(desc_map, 0, anv_descriptor_size(bind_layout));
+ if (data & ANV_DESCRIPTOR_SURFACE)
+ memcpy(desc_map, &device->host_null_surface_state, ANV_SURFACE_STATE_SIZE);
+ else
+ memset(desc_map, 0, bind_layout->descriptor_surface_stride);
return;
}
struct anv_address bind_addr = anv_address_add(buffer->address, offset);
- uint64_t bind_range = anv_buffer_get_range(buffer, offset, range);
+ desc->bind_range = vk_buffer_range(&buffer->vk, offset, range);
/* We report a bounds checking alignment of 32B for the sake of block
* messages which read an entire register worth at a time.
*/
if (type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER ||
type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC)
- bind_range = align_u64(bind_range, ANV_UBO_ALIGNMENT);
+ desc->bind_range = align64(desc->bind_range, ANV_UBO_ALIGNMENT);
- if (type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC ||
- type == VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC) {
- *desc = (struct anv_descriptor) {
- .type = type,
- .buffer = buffer,
- .offset = offset,
- .range = range,
+ if (data & ANV_DESCRIPTOR_INDIRECT_ADDRESS_RANGE) {
+ struct anv_address_range_descriptor desc_data = {
+ .address = anv_address_physical(bind_addr),
+ .range = desc->bind_range,
};
- } else {
- assert(bind_layout->data & ANV_DESCRIPTOR_BUFFER_VIEW);
+ memcpy(desc_map, &desc_data, sizeof(desc_data));
+ }
+
+ if (data & ANV_DESCRIPTOR_SURFACE) {
+ isl_surf_usage_flags_t usage =
+ anv_isl_usage_for_descriptor_type(desc->type);
+
+ enum isl_format format =
+ anv_isl_format_for_descriptor_type(device, desc->type);
+
+ isl_buffer_fill_state(&device->isl_dev, desc_map,
+ .address = anv_address_physical(bind_addr),
+ .mocs = isl_mocs(&device->isl_dev, usage,
+ bind_addr.bo && anv_bo_is_external(bind_addr.bo)),
+ .size_B = desc->bind_range,
+ .format = format,
+ .swizzle = ISL_SWIZZLE_IDENTITY,
+ .stride_B = 1);
+ }
+
+ if (type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC ||
+ type == VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC)
+ return;
+
+ if (data & ANV_DESCRIPTOR_BUFFER_VIEW) {
struct anv_buffer_view *bview =
&set->buffer_views[bind_layout->buffer_view_index + element];
- bview->format = anv_isl_format_for_descriptor_type(device, type);
- bview->range = bind_range;
- bview->address = bind_addr;
-
- /* If we're writing descriptors through a push command, we need to
- * allocate the surface state from the command buffer. Otherwise it will
- * be allocated by the descriptor pool when calling
- * vkAllocateDescriptorSets. */
- if (alloc_stream)
- bview->surface_state = anv_state_stream_alloc(alloc_stream, 64, 64);
+ desc->set_buffer_view = bview;
- isl_surf_usage_flags_t usage =
- (type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER ||
- type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC) ?
- ISL_SURF_USAGE_CONSTANT_BUFFER_BIT :
- ISL_SURF_USAGE_STORAGE_BIT;
-
- anv_fill_buffer_surface_state(device, bview->surface_state,
- bview->format, usage,
- bind_addr, bind_range, 1);
-
- *desc = (struct anv_descriptor) {
- .type = type,
- .buffer_view = bview,
- };
- }
+ bview->vk.range = desc->bind_range;
+ bview->address = bind_addr;
- if (bind_layout->data & ANV_DESCRIPTOR_ADDRESS_RANGE) {
- struct anv_address_range_descriptor desc_data = {
- .address = anv_address_physical(bind_addr),
- .range = bind_range,
- };
- memcpy(desc_map, &desc_data, sizeof(desc_data));
+ if (set->is_push) {
+ set->generate_surface_states |= BITFIELD_BIT(descriptor_index);
+ /* Reset the surface state to make sure
+ * genX(cmd_buffer_emit_push_descriptor_surfaces) generates a new
+ * one.
+ */
+ bview->general.state = ANV_STATE_NULL;
+ } else {
+ anv_descriptor_write_surface_state(device, desc, bview->general.state);
+ }
}
}
@@ -1550,7 +2519,8 @@ anv_descriptor_set_write_inline_uniform_data(struct anv_device *device,
assert(bind_layout->data & ANV_DESCRIPTOR_INLINE_UNIFORM);
- void *desc_map = set->desc_mem.map + bind_layout->descriptor_offset;
+ void *desc_map = set->desc_surface_mem.map +
+ bind_layout->descriptor_surface_offset;
memcpy(desc_map + offset, data, size);
}
@@ -1558,7 +2528,7 @@ anv_descriptor_set_write_inline_uniform_data(struct anv_device *device,
void
anv_descriptor_set_write_acceleration_structure(struct anv_device *device,
struct anv_descriptor_set *set,
- struct anv_acceleration_structure *accel,
+ struct vk_acceleration_structure *accel,
uint32_t binding,
uint32_t element)
{
@@ -1567,35 +2537,36 @@ anv_descriptor_set_write_acceleration_structure(struct anv_device *device,
struct anv_descriptor *desc =
&set->descriptors[bind_layout->descriptor_index + element];
- assert(bind_layout->type == VK_DESCRIPTOR_TYPE_ACCELERATION_STRUCTURE_KHR);
+ assert(bind_layout->data & ANV_DESCRIPTOR_INDIRECT_ADDRESS_RANGE);
*desc = (struct anv_descriptor) {
.type = VK_DESCRIPTOR_TYPE_ACCELERATION_STRUCTURE_KHR,
+ .accel_struct = accel,
};
struct anv_address_range_descriptor desc_data = { };
if (accel != NULL) {
- desc_data.address = anv_address_physical(accel->address);
+ desc_data.address = vk_acceleration_structure_get_va(accel);
desc_data.range = accel->size;
}
- assert(anv_descriptor_size(bind_layout) == sizeof(desc_data));
+ assert(sizeof(desc_data) <= bind_layout->descriptor_surface_stride);
- void *desc_map = set->desc_mem.map + bind_layout->descriptor_offset +
- element * sizeof(desc_data);
+ void *desc_map = set->desc_surface_mem.map +
+ bind_layout->descriptor_surface_offset +
+ element * bind_layout->descriptor_surface_stride;
memcpy(desc_map, &desc_data, sizeof(desc_data));
}
-void anv_UpdateDescriptorSets(
- VkDevice _device,
- uint32_t descriptorWriteCount,
- const VkWriteDescriptorSet* pDescriptorWrites,
- uint32_t descriptorCopyCount,
- const VkCopyDescriptorSet* pDescriptorCopies)
+void
+anv_descriptor_set_write(struct anv_device *device,
+ struct anv_descriptor_set *set_override,
+ uint32_t write_count,
+ const VkWriteDescriptorSet *writes)
{
- ANV_FROM_HANDLE(anv_device, device, _device);
-
- for (uint32_t i = 0; i < descriptorWriteCount; i++) {
- const VkWriteDescriptorSet *write = &pDescriptorWrites[i];
- ANV_FROM_HANDLE(anv_descriptor_set, set, write->dstSet);
+ for (uint32_t i = 0; i < write_count; i++) {
+ const VkWriteDescriptorSet *write = &writes[i];
+ struct anv_descriptor_set *set = unlikely(set_override) ?
+ set_override :
+ anv_descriptor_set_from_handle(write->dstSet);
switch (write->descriptorType) {
case VK_DESCRIPTOR_TYPE_SAMPLER:
@@ -1634,7 +2605,6 @@ void anv_UpdateDescriptorSets(
ANV_FROM_HANDLE(anv_buffer, buffer, write->pBufferInfo[j].buffer);
anv_descriptor_set_write_buffer(device, set,
- NULL,
write->descriptorType,
buffer,
write->dstBinding,
@@ -1644,10 +2614,10 @@ void anv_UpdateDescriptorSets(
}
break;
- case VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT: {
- const VkWriteDescriptorSetInlineUniformBlockEXT *inline_write =
+ case VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK: {
+ const VkWriteDescriptorSetInlineUniformBlock *inline_write =
vk_find_struct_const(write->pNext,
- WRITE_DESCRIPTOR_SET_INLINE_UNIFORM_BLOCK_EXT);
+ WRITE_DESCRIPTOR_SET_INLINE_UNIFORM_BLOCK);
assert(inline_write->dataSize == write->descriptorCount);
anv_descriptor_set_write_inline_uniform_data(device, set,
write->dstBinding,
@@ -1663,7 +2633,7 @@ void anv_UpdateDescriptorSets(
assert(accel_write->accelerationStructureCount ==
write->descriptorCount);
for (uint32_t j = 0; j < write->descriptorCount; j++) {
- ANV_FROM_HANDLE(anv_acceleration_structure, accel,
+ ANV_FROM_HANDLE(vk_acceleration_structure, accel,
accel_write->pAccelerationStructures[j]);
anv_descriptor_set_write_acceleration_structure(device, set, accel,
write->dstBinding,
@@ -1676,6 +2646,19 @@ void anv_UpdateDescriptorSets(
break;
}
}
+}
+
+void anv_UpdateDescriptorSets(
+ VkDevice _device,
+ uint32_t descriptorWriteCount,
+ const VkWriteDescriptorSet* pDescriptorWrites,
+ uint32_t descriptorCopyCount,
+ const VkCopyDescriptorSet* pDescriptorCopies)
+{
+ ANV_FROM_HANDLE(anv_device, device, _device);
+
+ anv_descriptor_set_write(device, NULL, descriptorWriteCount,
+ pDescriptorWrites);
for (uint32_t i = 0; i < descriptorCopyCount; i++) {
const VkCopyDescriptorSet *copy = &pDescriptorCopies[i];
@@ -1684,35 +2667,85 @@ void anv_UpdateDescriptorSets(
const struct anv_descriptor_set_binding_layout *src_layout =
&src->layout->binding[copy->srcBinding];
- struct anv_descriptor *src_desc =
- &src->descriptors[src_layout->descriptor_index];
- src_desc += copy->srcArrayElement;
-
const struct anv_descriptor_set_binding_layout *dst_layout =
&dst->layout->binding[copy->dstBinding];
- struct anv_descriptor *dst_desc =
- &dst->descriptors[dst_layout->descriptor_index];
- dst_desc += copy->dstArrayElement;
-
- if (src_layout->data & ANV_DESCRIPTOR_INLINE_UNIFORM) {
- assert(src_layout->data == ANV_DESCRIPTOR_INLINE_UNIFORM);
- memcpy(dst->desc_mem.map + dst_layout->descriptor_offset +
- copy->dstArrayElement,
- src->desc_mem.map + src_layout->descriptor_offset +
- copy->srcArrayElement,
- copy->descriptorCount);
- } else {
- for (uint32_t j = 0; j < copy->descriptorCount; j++)
- dst_desc[j] = src_desc[j];
-
- unsigned desc_size = anv_descriptor_size(src_layout);
- if (desc_size > 0) {
- assert(desc_size == anv_descriptor_size(dst_layout));
- memcpy(dst->desc_mem.map + dst_layout->descriptor_offset +
- copy->dstArrayElement * desc_size,
- src->desc_mem.map + src_layout->descriptor_offset +
- copy->srcArrayElement * desc_size,
- copy->descriptorCount * desc_size);
+
+ if (src_layout->type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK) {
+ anv_descriptor_set_write_inline_uniform_data(device, dst,
+ copy->dstBinding,
+ src->desc_surface_mem.map +
+ src_layout->descriptor_surface_offset + copy->srcArrayElement,
+ copy->dstArrayElement,
+ copy->descriptorCount);
+ continue;
+ }
+
+ uint32_t copy_surface_element_size =
+ MIN2(src_layout->descriptor_surface_stride,
+ dst_layout->descriptor_surface_stride);
+ uint32_t copy_sampler_element_size =
+ MIN2(src_layout->descriptor_sampler_stride,
+ dst_layout->descriptor_sampler_stride);
+ for (uint32_t j = 0; j < copy->descriptorCount; j++) {
+ struct anv_descriptor *src_desc =
+ &src->descriptors[src_layout->descriptor_index +
+ copy->srcArrayElement + j];
+ struct anv_descriptor *dst_desc =
+ &dst->descriptors[dst_layout->descriptor_index +
+ copy->dstArrayElement + j];
+
+ /* Copy the memory containing one of the following structures read by
+ * the shaders:
+ * - anv_sampled_image_descriptor
+ * - anv_storage_image_descriptor
+ * - anv_address_range_descriptor
+ * - RENDER_SURFACE_STATE
+ * - SAMPLER_STATE
+ */
+ memcpy(dst->desc_surface_mem.map +
+ dst_layout->descriptor_surface_offset +
+ (copy->dstArrayElement + j) * dst_layout->descriptor_surface_stride,
+ src->desc_surface_mem.map +
+ src_layout->descriptor_surface_offset +
+ (copy->srcArrayElement + j) * src_layout->descriptor_surface_stride,
+ copy_surface_element_size);
+ memcpy(dst->desc_sampler_mem.map +
+ dst_layout->descriptor_sampler_offset +
+ (copy->dstArrayElement + j) * dst_layout->descriptor_sampler_stride,
+ src->desc_sampler_mem.map +
+ src_layout->descriptor_sampler_offset +
+ (copy->srcArrayElement + j) * src_layout->descriptor_sampler_stride,
+ copy_sampler_element_size);
+
+ /* Copy the CPU-side anv_descriptor data */
+ *dst_desc = *src_desc;
+
+ /* If the CPU-side descriptor may contain a buffer view, we need to
+ * copy that as well.
+ */
+ const enum anv_descriptor_data data =
+ src_layout->type == VK_DESCRIPTOR_TYPE_MUTABLE_EXT ?
+ anv_descriptor_data_for_type(device->physical,
+ src->layout->type,
+ src->layout->flags,
+ src_desc->type) :
+ src_layout->data;
+ if (data & ANV_DESCRIPTOR_BUFFER_VIEW) {
+ struct anv_buffer_view *src_bview =
+ &src->buffer_views[src_layout->buffer_view_index +
+ copy->srcArrayElement + j];
+ struct anv_buffer_view *dst_bview =
+ &dst->buffer_views[dst_layout->buffer_view_index +
+ copy->dstArrayElement + j];
+
+ dst_desc->set_buffer_view = dst_bview;
+
+ dst_bview->vk.range = src_bview->vk.range;
+ dst_bview->address = src_bview->address;
+
+ memcpy(dst_bview->general.state.map,
+ src_bview->general.state.map,
+ ANV_SURFACE_STATE_SIZE);
}
}
}
@@ -1725,12 +2758,11 @@ void anv_UpdateDescriptorSets(
void
anv_descriptor_set_write_template(struct anv_device *device,
struct anv_descriptor_set *set,
- struct anv_state_stream *alloc_stream,
- const struct anv_descriptor_update_template *template,
+ const struct vk_descriptor_update_template *template,
const void *data)
{
for (uint32_t i = 0; i < template->entry_count; i++) {
- const struct anv_descriptor_template_entry *entry =
+ const struct vk_descriptor_template_entry *entry =
&template->entries[i];
switch (entry->type) {
@@ -1774,7 +2806,6 @@ anv_descriptor_set_write_template(struct anv_device *device,
ANV_FROM_HANDLE(anv_buffer, buffer, info->buffer);
anv_descriptor_set_write_buffer(device, set,
- alloc_stream,
entry->type,
buffer,
entry->binding,
@@ -1783,7 +2814,7 @@ anv_descriptor_set_write_template(struct anv_device *device,
}
break;
- case VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT:
+ case VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK:
anv_descriptor_set_write_inline_uniform_data(device, set,
entry->binding,
data + entry->offset,
@@ -1791,79 +2822,241 @@ anv_descriptor_set_write_template(struct anv_device *device,
entry->array_count);
break;
+ case VK_DESCRIPTOR_TYPE_ACCELERATION_STRUCTURE_KHR:
+ for (uint32_t j = 0; j < entry->array_count; j++) {
+ VkAccelerationStructureKHR *accel_obj =
+ (VkAccelerationStructureKHR *)(data + entry->offset + j * entry->stride);
+ ANV_FROM_HANDLE(vk_acceleration_structure, accel, *accel_obj);
+
+ anv_descriptor_set_write_acceleration_structure(device, set,
+ accel,
+ entry->binding,
+ entry->array_element + j);
+ }
+ break;
+
default:
break;
}
}
}
-VkResult anv_CreateDescriptorUpdateTemplate(
+void anv_UpdateDescriptorSetWithTemplate(
VkDevice _device,
- const VkDescriptorUpdateTemplateCreateInfo* pCreateInfo,
- const VkAllocationCallbacks* pAllocator,
- VkDescriptorUpdateTemplate* pDescriptorUpdateTemplate)
+ VkDescriptorSet descriptorSet,
+ VkDescriptorUpdateTemplate descriptorUpdateTemplate,
+ const void* pData)
{
ANV_FROM_HANDLE(anv_device, device, _device);
- struct anv_descriptor_update_template *template;
+ ANV_FROM_HANDLE(anv_descriptor_set, set, descriptorSet);
+ VK_FROM_HANDLE(vk_descriptor_update_template, template,
+ descriptorUpdateTemplate);
- size_t size = sizeof(*template) +
- pCreateInfo->descriptorUpdateEntryCount * sizeof(template->entries[0]);
- template = vk_object_alloc(&device->vk, pAllocator, size,
- VK_OBJECT_TYPE_DESCRIPTOR_UPDATE_TEMPLATE);
- if (template == NULL)
- return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+ anv_descriptor_set_write_template(device, set, template, pData);
+}
- template->bind_point = pCreateInfo->pipelineBindPoint;
+void anv_GetDescriptorSetLayoutSizeEXT(
+ VkDevice device,
+ VkDescriptorSetLayout layout,
+ VkDeviceSize* pLayoutSizeInBytes)
+{
+ ANV_FROM_HANDLE(anv_descriptor_set_layout, set_layout, layout);
- if (pCreateInfo->templateType == VK_DESCRIPTOR_UPDATE_TEMPLATE_TYPE_DESCRIPTOR_SET)
- template->set = pCreateInfo->set;
+ *pLayoutSizeInBytes = set_layout->descriptor_buffer_surface_size;
+}
- template->entry_count = pCreateInfo->descriptorUpdateEntryCount;
- for (uint32_t i = 0; i < template->entry_count; i++) {
- const VkDescriptorUpdateTemplateEntry *pEntry =
- &pCreateInfo->pDescriptorUpdateEntries[i];
-
- template->entries[i] = (struct anv_descriptor_template_entry) {
- .type = pEntry->descriptorType,
- .binding = pEntry->dstBinding,
- .array_element = pEntry->dstArrayElement,
- .array_count = pEntry->descriptorCount,
- .offset = pEntry->offset,
- .stride = pEntry->stride,
- };
- }
+void anv_GetDescriptorSetLayoutBindingOffsetEXT(
+ VkDevice device,
+ VkDescriptorSetLayout layout,
+ uint32_t binding,
+ VkDeviceSize* pOffset)
+{
+ ANV_FROM_HANDLE(anv_descriptor_set_layout, set_layout, layout);
+ assert(binding < set_layout->binding_count);
+ const struct anv_descriptor_set_binding_layout *bind_layout =
+ &set_layout->binding[binding];
- *pDescriptorUpdateTemplate =
- anv_descriptor_update_template_to_handle(template);
+ *pOffset = bind_layout->descriptor_surface_offset;
+}
- return VK_SUCCESS;
+static bool
+address_info_is_null(const VkDescriptorAddressInfoEXT *addr_info)
+{
+ return addr_info == NULL || addr_info->address == 0 || addr_info->range == 0;
}
-void anv_DestroyDescriptorUpdateTemplate(
+void anv_GetDescriptorEXT(
VkDevice _device,
- VkDescriptorUpdateTemplate descriptorUpdateTemplate,
- const VkAllocationCallbacks* pAllocator)
+ const VkDescriptorGetInfoEXT* pDescriptorInfo,
+ size_t dataSize,
+ void* pDescriptor)
{
ANV_FROM_HANDLE(anv_device, device, _device);
- ANV_FROM_HANDLE(anv_descriptor_update_template, template,
- descriptorUpdateTemplate);
+ struct anv_sampler *sampler;
+ struct anv_image_view *image_view;
- if (!template)
- return;
+ switch (pDescriptorInfo->type) {
+ case VK_DESCRIPTOR_TYPE_SAMPLER:
+ if (pDescriptorInfo->data.pSampler &&
+ (sampler = anv_sampler_from_handle(*pDescriptorInfo->data.pSampler))) {
+ memcpy(pDescriptor, sampler->db_state[0], ANV_SAMPLER_STATE_SIZE);
+ } else {
+ memset(pDescriptor, 0, ANV_SAMPLER_STATE_SIZE);
+ }
+ break;
- vk_object_free(&device->vk, pAllocator, template);
-}
+ case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER:
+ for (uint32_t i = 0; i < dataSize / (2 * ANV_SURFACE_STATE_SIZE); i++) {
+ uint32_t desc_offset = i * 2 * ANV_SURFACE_STATE_SIZE;
+
+ if (pDescriptorInfo->data.pCombinedImageSampler &&
+ (image_view = anv_image_view_from_handle(
+ pDescriptorInfo->data.pCombinedImageSampler->imageView))) {
+ const VkImageLayout layout =
+ pDescriptorInfo->data.pCombinedImageSampler->imageLayout;
+ memcpy(pDescriptor + desc_offset,
+ anv_image_view_surface_data_for_plane_layout(image_view,
+ pDescriptorInfo->type,
+ i,
+ layout),
+ ANV_SURFACE_STATE_SIZE);
+ } else {
+ memcpy(pDescriptor + desc_offset,
+ device->host_null_surface_state,
+ ANV_SURFACE_STATE_SIZE);
+ }
-void anv_UpdateDescriptorSetWithTemplate(
- VkDevice _device,
- VkDescriptorSet descriptorSet,
- VkDescriptorUpdateTemplate descriptorUpdateTemplate,
- const void* pData)
-{
- ANV_FROM_HANDLE(anv_device, device, _device);
- ANV_FROM_HANDLE(anv_descriptor_set, set, descriptorSet);
- ANV_FROM_HANDLE(anv_descriptor_update_template, template,
- descriptorUpdateTemplate);
+ if (pDescriptorInfo->data.pCombinedImageSampler &&
+ (sampler = anv_sampler_from_handle(
+ pDescriptorInfo->data.pCombinedImageSampler->sampler))) {
+ memcpy(pDescriptor + desc_offset + ANV_SURFACE_STATE_SIZE,
+ sampler->db_state[i],
+ ANV_SAMPLER_STATE_SIZE);
+ } else {
+ memset(pDescriptor + desc_offset + ANV_SURFACE_STATE_SIZE,
+ 0, ANV_SAMPLER_STATE_SIZE);
+ }
+ }
+ break;
+
+ case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE:
+ case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE:
+ case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT:
+ if (pDescriptorInfo->data.pSampledImage &&
+ (image_view = anv_image_view_from_handle(
+ pDescriptorInfo->data.pSampledImage->imageView))) {
+ const VkImageLayout layout =
+ pDescriptorInfo->data.pSampledImage->imageLayout;
+
+ memcpy(pDescriptor,
+ anv_image_view_surface_data_for_plane_layout(image_view,
+ pDescriptorInfo->type,
+ 0,
+ layout),
+ ANV_SURFACE_STATE_SIZE);
+ } else {
+ memcpy(pDescriptor, device->host_null_surface_state,
+ ANV_SURFACE_STATE_SIZE);
+ }
+ break;
+
+ case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER: {
+ const VkDescriptorAddressInfoEXT *addr_info =
+ pDescriptorInfo->data.pUniformTexelBuffer;
+
+ if (!address_info_is_null(addr_info)) {
+ struct anv_format_plane format =
+ anv_get_format_plane(device->info,
+ addr_info->format,
+ 0, VK_IMAGE_TILING_LINEAR);
+ const uint32_t format_bs =
+ isl_format_get_layout(format.isl_format)->bpb / 8;
+
+ anv_fill_buffer_surface_state(device, pDescriptor,
+ format.isl_format, format.swizzle,
+ ISL_SURF_USAGE_TEXTURE_BIT,
+ anv_address_from_u64(addr_info->address),
+ align_down_npot_u32(addr_info->range, format_bs),
+ format_bs);
+ } else {
+ memcpy(pDescriptor, device->host_null_surface_state,
+ ANV_SURFACE_STATE_SIZE);
+ }
+ break;
+ }
+
+ case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER: {
+ const VkDescriptorAddressInfoEXT *addr_info =
+ pDescriptorInfo->data.pStorageTexelBuffer;
+
+ if (!address_info_is_null(addr_info)) {
+ struct anv_format_plane format =
+ anv_get_format_plane(device->info,
+ addr_info->format,
+ 0, VK_IMAGE_TILING_LINEAR);
+ const uint32_t format_bs =
+ isl_format_get_layout(format.isl_format)->bpb / 8;
+
+ anv_fill_buffer_surface_state(device, pDescriptor,
+ format.isl_format, format.swizzle,
+ ISL_SURF_USAGE_STORAGE_BIT,
+ anv_address_from_u64(addr_info->address),
+ align_down_npot_u32(addr_info->range, format_bs),
+ format_bs);
+ } else {
+ memcpy(pDescriptor, device->host_null_surface_state,
+ ANV_SURFACE_STATE_SIZE);
+ }
+ break;
+ }
+
+ case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER:
+ case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER: {
+ const VkDescriptorAddressInfoEXT *addr_info =
+ pDescriptorInfo->data.pStorageBuffer;
+
+ if (!address_info_is_null(addr_info)) {
+ VkDeviceSize range = addr_info->range;
- anv_descriptor_set_write_template(device, set, NULL, template, pData);
+ /* We report a bounds checking alignment of 32B for the sake of block
+ * messages which read an entire register worth at a time.
+ */
+ if (pDescriptorInfo->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER)
+ range = align64(range, ANV_UBO_ALIGNMENT);
+
+ isl_surf_usage_flags_t usage =
+ pDescriptorInfo->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER ?
+ ISL_SURF_USAGE_CONSTANT_BUFFER_BIT :
+ ISL_SURF_USAGE_STORAGE_BIT;
+
+ enum isl_format format =
+ anv_isl_format_for_descriptor_type(device, pDescriptorInfo->type);
+
+ isl_buffer_fill_state(&device->isl_dev, pDescriptor,
+ .address = addr_info->address,
+ .mocs = isl_mocs(&device->isl_dev, usage, false),
+ .size_B = range,
+ .format = format,
+ .swizzle = ISL_SWIZZLE_IDENTITY,
+ .stride_B = 1);
+ } else {
+ memcpy(pDescriptor, device->host_null_surface_state,
+ ANV_SURFACE_STATE_SIZE);
+ }
+ break;
+ }
+
+ case VK_DESCRIPTOR_TYPE_ACCELERATION_STRUCTURE_KHR: {
+ struct anv_address_range_descriptor desc_data = {
+ .address = pDescriptorInfo->data.accelerationStructure,
+ .range = 0,
+ };
+
+ memcpy(pDescriptor, &desc_data, sizeof(desc_data));
+ break;
+ }
+
+ default:
+ unreachable("Invalid descriptor type");
+ }
}
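The three entry points above form the VK_EXT_descriptor_buffer path: the application queries the layout size and per-binding offsets, then asks the driver to write raw descriptor payloads into a host-mapped buffer. A sketch of the expected call sequence (device, set_layout, ubo_device_address, descriptor_buffer_map and a descriptor size queried from VkPhysicalDeviceDescriptorBufferPropertiesEXT are assumed to exist):

    VkDeviceSize layout_size, binding0_offset;
    vkGetDescriptorSetLayoutSizeEXT(device, set_layout, &layout_size);
    vkGetDescriptorSetLayoutBindingOffsetEXT(device, set_layout, /*binding*/ 0,
                                             &binding0_offset);

    VkDescriptorAddressInfoEXT addr_info = {
       .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_ADDRESS_INFO_EXT,
       .address = ubo_device_address,   /* VkDeviceAddress of the buffer */
       .range = 256,
    };
    VkDescriptorGetInfoEXT get_info = {
       .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_GET_INFO_EXT,
       .type = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER,
       .data.pUniformBuffer = &addr_info,
    };
    /* Writes a surface-state sized blob at the binding's offset inside the
     * host-mapped descriptor buffer. */
    vkGetDescriptorEXT(device, &get_info, uniform_buffer_descriptor_size,
                       (char *)descriptor_buffer_map + binding0_offset);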
diff --git a/src/intel/vulkan/anv_device.c b/src/intel/vulkan/anv_device.c
index 3d3ad15151e..507be254624 100644
--- a/src/intel/vulkan/anv_device.c
+++ b/src/intel/vulkan/anv_device.c
@@ -22,6 +22,7 @@
*/
#include <assert.h>
+#include <inttypes.h>
#include <stdbool.h>
#include <string.h>
#ifdef MAJOR_IN_MKDEV
@@ -40,35 +41,77 @@
#include "anv_private.h"
#include "anv_measure.h"
-#include "util/debug.h"
+#include "util/u_debug.h"
#include "util/build_id.h"
#include "util/disk_cache.h"
#include "util/mesa-sha1.h"
#include "util/os_file.h"
#include "util/os_misc.h"
#include "util/u_atomic.h"
+#if DETECT_OS_ANDROID
+#include "util/u_gralloc/u_gralloc.h"
+#endif
#include "util/u_string.h"
#include "util/driconf.h"
#include "git_sha1.h"
+#include "vk_common_entrypoints.h"
#include "vk_util.h"
#include "vk_deferred_operation.h"
+#include "vk_drm_syncobj.h"
#include "common/intel_aux_map.h"
-#include "common/intel_defines.h"
#include "common/intel_uuid.h"
#include "perf/intel_perf.h"
+#include "i915/anv_device.h"
+#include "xe/anv_device.h"
+#include "xe/anv_queue.h"
+
#include "genxml/gen7_pack.h"
+#include "genxml/genX_bits.h"
static const driOptionDescription anv_dri_options[] = {
DRI_CONF_SECTION_PERFORMANCE
+ DRI_CONF_ADAPTIVE_SYNC(true)
DRI_CONF_VK_X11_OVERRIDE_MIN_IMAGE_COUNT(0)
DRI_CONF_VK_X11_STRICT_IMAGE_COUNT(false)
- DRI_CONF_VK_XWAYLAND_WAIT_READY(true)
+ DRI_CONF_VK_KHR_PRESENT_WAIT(false)
+ DRI_CONF_VK_XWAYLAND_WAIT_READY(false)
+ DRI_CONF_ANV_ASSUME_FULL_SUBGROUPS(0)
+ DRI_CONF_ANV_DISABLE_FCV(false)
+ DRI_CONF_ANV_EXTERNAL_MEMORY_IMPLICIT_SYNC(true)
+ DRI_CONF_ANV_SAMPLE_MASK_OUT_OPENGL_BEHAVIOUR(false)
+ DRI_CONF_ANV_FORCE_FILTER_ADDR_ROUNDING(false)
+ DRI_CONF_ANV_FP64_WORKAROUND_ENABLED(false)
+ DRI_CONF_ANV_GENERATED_INDIRECT_THRESHOLD(4)
+ DRI_CONF_ANV_GENERATED_INDIRECT_RING_THRESHOLD(100)
+ DRI_CONF_NO_16BIT(false)
+ DRI_CONF_INTEL_ENABLE_WA_14018912822(false)
+ DRI_CONF_ANV_QUERY_CLEAR_WITH_BLORP_THRESHOLD(6)
+ DRI_CONF_ANV_QUERY_COPY_WITH_SHADER_THRESHOLD(6)
+ DRI_CONF_ANV_FORCE_INDIRECT_DESCRIPTORS(false)
+ DRI_CONF_SHADER_SPILLING_RATE(0)
+ DRI_CONF_OPT_B(intel_tbimr, true, "Enable TBIMR tiled rendering")
+ DRI_CONF_ANV_COMPRESSION_CONTROL_ENABLED(false)
DRI_CONF_SECTION_END
DRI_CONF_SECTION_DEBUG
DRI_CONF_ALWAYS_FLUSH_CACHE(false)
DRI_CONF_VK_WSI_FORCE_BGRA8_UNORM_FIRST(false)
+ DRI_CONF_VK_WSI_FORCE_SWAPCHAIN_TO_CURRENT_EXTENT(false)
+ DRI_CONF_VK_X11_IGNORE_SUBOPTIMAL(false)
+ DRI_CONF_LIMIT_TRIG_INPUT_RANGE(false)
+ DRI_CONF_ANV_MESH_CONV_PRIM_ATTRS_TO_VERT_ATTRS(-2)
+ DRI_CONF_FORCE_VK_VENDOR(0)
+ DRI_CONF_FAKE_SPARSE(false)
+#if DETECT_OS_ANDROID && ANDROID_API_LEVEL >= 34
+ DRI_CONF_VK_REQUIRE_ASTC(true)
+#else
+ DRI_CONF_VK_REQUIRE_ASTC(false)
+#endif
+ DRI_CONF_SECTION_END
+
+ DRI_CONF_SECTION_QUALITY
+ DRI_CONF_PP_LOWER_DEPTH_RANGE_RATE()
DRI_CONF_SECTION_END
};
@@ -77,9 +120,6 @@ static const driOptionDescription anv_dri_options[] = {
*/
#define MAX_DEBUG_MESSAGE_LENGTH 4096
-/* Render engine timestamp register */
-#define TIMESTAMP 0x2358
-
/* The "RAW" clocks on Linux are called "FAST" on FreeBSD */
#if !defined(CLOCK_MONOTONIC_RAW) && defined(CLOCK_MONOTONIC_FAST)
#define CLOCK_MONOTONIC_RAW CLOCK_MONOTONIC_FAST
@@ -90,19 +130,14 @@ compiler_debug_log(void *data, UNUSED unsigned *id, const char *fmt, ...)
{
char str[MAX_DEBUG_MESSAGE_LENGTH];
struct anv_device *device = (struct anv_device *)data;
- struct anv_instance *instance = device->physical->instance;
-
- if (list_is_empty(&instance->vk.debug_report.callbacks))
- return;
+ UNUSED struct anv_instance *instance = device->physical->instance;
va_list args;
va_start(args, fmt);
(void) vsnprintf(str, MAX_DEBUG_MESSAGE_LENGTH, fmt, args);
va_end(args);
- vk_debug_report(&instance->vk,
- VK_DEBUG_REPORT_DEBUG_BIT_EXT,
- NULL, 0, 0, "anv", str);
+ //vk_logd(VK_LOG_NO_OBJS(&instance->vk), "%s", str);
}
static void
@@ -111,7 +146,7 @@ compiler_perf_log(UNUSED void *data, UNUSED unsigned *id, const char *fmt, ...)
va_list args;
va_start(args, fmt);
- if (INTEL_DEBUG & DEBUG_PERF)
+ if (INTEL_DEBUG(DEBUG_PERF))
mesa_logd_v(fmt, args);
va_end(args);
@@ -124,10 +159,14 @@ compiler_perf_log(UNUSED void *data, UNUSED unsigned *id, const char *fmt, ...)
#define ANV_USE_WSI_PLATFORM
#endif
-#ifdef ANDROID
+#ifdef ANDROID_STRICT
+#if ANDROID_API_LEVEL >= 33
+#define ANV_API_VERSION VK_MAKE_VERSION(1, 3, VK_HEADER_VERSION)
+#else
#define ANV_API_VERSION VK_MAKE_VERSION(1, 1, VK_HEADER_VERSION)
+#endif
#else
-#define ANV_API_VERSION VK_MAKE_VERSION(1, 2, VK_HEADER_VERSION)
+#define ANV_API_VERSION VK_MAKE_VERSION(1, 3, VK_HEADER_VERSION)
#endif
VkResult anv_EnumerateInstanceVersion(
@@ -144,11 +183,14 @@ static const struct vk_instance_extension_table instance_extensions = {
.KHR_external_semaphore_capabilities = true,
.KHR_get_physical_device_properties2 = true,
.EXT_debug_report = true,
+ .EXT_debug_utils = true,
#ifdef ANV_USE_WSI_PLATFORM
.KHR_get_surface_capabilities2 = true,
.KHR_surface = true,
.KHR_surface_protected_capabilities = true,
+ .EXT_surface_maintenance1 = true,
+ .EXT_swapchain_colorspace = true,
#endif
#ifdef VK_USE_PLATFORM_WAYLAND_KHR
.KHR_wayland_surface = true,
@@ -169,18 +211,29 @@ static const struct vk_instance_extension_table instance_extensions = {
.EXT_display_surface_counter = true,
.EXT_acquire_drm_display = true,
#endif
+#ifndef VK_USE_PLATFORM_WIN32_KHR
+ .EXT_headless_surface = true,
+#endif
};
static void
get_device_extensions(const struct anv_physical_device *device,
struct vk_device_extension_table *ext)
{
+ const bool has_syncobj_wait =
+ (device->sync_syncobj_type.features & VK_SYNC_FEATURE_CPU_WAIT) != 0;
+
+ const bool rt_enabled = ANV_SUPPORT_RT && device->info.has_ray_tracing;
+
*ext = (struct vk_device_extension_table) {
- .KHR_8bit_storage = device->info.ver >= 8,
- .KHR_16bit_storage = device->info.ver >= 8,
+ .KHR_8bit_storage = true,
+ .KHR_16bit_storage = !device->instance->no_16bit,
+ .KHR_acceleration_structure = rt_enabled,
.KHR_bind_memory2 = true,
- .KHR_buffer_device_address = device->has_a64_buffer_access,
+ .KHR_buffer_device_address = true,
+ .KHR_calibrated_timestamps = device->has_reg_timestamp,
.KHR_copy_commands2 = true,
+ .KHR_cooperative_matrix = anv_has_cooperative_matrix(device),
.KHR_create_renderpass2 = true,
.KHR_dedicated_allocation = true,
.KHR_deferred_host_operations = true,
@@ -189,43 +242,74 @@ get_device_extensions(const struct anv_physical_device *device,
.KHR_device_group = true,
.KHR_draw_indirect_count = true,
.KHR_driver_properties = true,
- .KHR_external_fence = device->has_syncobj_wait,
- .KHR_external_fence_fd = device->has_syncobj_wait,
+ .KHR_dynamic_rendering = true,
+ .KHR_external_fence = has_syncobj_wait,
+ .KHR_external_fence_fd = has_syncobj_wait,
.KHR_external_memory = true,
.KHR_external_memory_fd = true,
.KHR_external_semaphore = true,
.KHR_external_semaphore_fd = true,
+ .KHR_format_feature_flags2 = true,
.KHR_fragment_shading_rate = device->info.ver >= 11,
.KHR_get_memory_requirements2 = true,
+ .KHR_global_priority = device->max_context_priority >=
+ VK_QUEUE_GLOBAL_PRIORITY_MEDIUM_KHR,
.KHR_image_format_list = true,
.KHR_imageless_framebuffer = true,
#ifdef ANV_USE_WSI_PLATFORM
.KHR_incremental_present = true,
#endif
+ .KHR_index_type_uint8 = true,
+ .KHR_line_rasterization = true,
+ .KHR_load_store_op_none = true,
.KHR_maintenance1 = true,
.KHR_maintenance2 = true,
.KHR_maintenance3 = true,
+ .KHR_maintenance4 = true,
+ .KHR_maintenance5 = true,
+ .KHR_maintenance6 = true,
+ .KHR_map_memory2 = true,
.KHR_multiview = true,
.KHR_performance_query =
- device->use_softpin && device->perf &&
+ device->perf &&
(device->perf->i915_perf_version >= 3 ||
- INTEL_DEBUG & DEBUG_NO_OACONFIG) &&
+ INTEL_DEBUG(DEBUG_NO_OACONFIG)) &&
device->use_call_secondary,
.KHR_pipeline_executable_properties = true,
+ .KHR_pipeline_library = true,
+ /* Hide these behind dri configs for now since we cannot implement them reliably on
+ * all surfaces yet. There is no surface capability query for present wait/id,
+ * but the feature is useful enough to expose behind an opt-in mechanism.
+ * If the instance only enables surface extensions that unconditionally support present wait,
+ * we can also expose the extension that way. */
+ .KHR_present_id =
+ driQueryOptionb(&device->instance->dri_options, "vk_khr_present_wait") ||
+ wsi_common_vk_instance_supports_present_wait(&device->instance->vk),
+ .KHR_present_wait =
+ driQueryOptionb(&device->instance->dri_options, "vk_khr_present_wait") ||
+ wsi_common_vk_instance_supports_present_wait(&device->instance->vk),
.KHR_push_descriptor = true,
+ .KHR_ray_query = rt_enabled,
+ .KHR_ray_tracing_maintenance1 = rt_enabled,
+ .KHR_ray_tracing_pipeline = rt_enabled,
+ .KHR_ray_tracing_position_fetch = rt_enabled,
.KHR_relaxed_block_layout = true,
.KHR_sampler_mirror_clamp_to_edge = true,
.KHR_sampler_ycbcr_conversion = true,
.KHR_separate_depth_stencil_layouts = true,
- .KHR_shader_atomic_int64 = device->info.ver >= 9 &&
- device->use_softpin,
+ .KHR_shader_atomic_int64 = true,
.KHR_shader_clock = true,
.KHR_shader_draw_parameters = true,
- .KHR_shader_float16_int8 = device->info.ver >= 8,
- .KHR_shader_float_controls = device->info.ver >= 8,
+ .KHR_shader_expect_assume = true,
+ .KHR_shader_float16_int8 = !device->instance->no_16bit,
+ .KHR_shader_float_controls = true,
+ .KHR_shader_float_controls2 = true,
.KHR_shader_integer_dot_product = true,
+ .KHR_shader_maximal_reconvergence = true,
.KHR_shader_non_semantic_info = true,
- .KHR_shader_subgroup_extended_types = device->info.ver >= 8,
+ .KHR_shader_quad_control = true,
+ .KHR_shader_subgroup_extended_types = true,
+ .KHR_shader_subgroup_rotate = true,
.KHR_shader_subgroup_uniform_control_flow = true,
.KHR_shader_terminate_invocation = true,
.KHR_spirv_1_4 = true,
@@ -234,64 +318,110 @@ get_device_extensions(const struct anv_physical_device *device,
.KHR_swapchain = true,
.KHR_swapchain_mutable_format = true,
#endif
+ .KHR_synchronization2 = true,
.KHR_timeline_semaphore = true,
.KHR_uniform_buffer_standard_layout = true,
.KHR_variable_pointers = true,
+ .KHR_vertex_attribute_divisor = true,
+ .KHR_video_queue = device->video_decode_enabled,
+ .KHR_video_decode_queue = device->video_decode_enabled,
+ .KHR_video_decode_h264 = VIDEO_CODEC_H264DEC && device->video_decode_enabled,
+ .KHR_video_decode_h265 = VIDEO_CODEC_H265DEC && device->video_decode_enabled,
.KHR_vulkan_memory_model = true,
.KHR_workgroup_memory_explicit_layout = true,
.KHR_zero_initialize_workgroup_memory = true,
.EXT_4444_formats = true,
- .EXT_buffer_device_address = device->has_a64_buffer_access,
+ .EXT_attachment_feedback_loop_layout = true,
+ .EXT_attachment_feedback_loop_dynamic_state = true,
+ .EXT_border_color_swizzle = true,
+ .EXT_buffer_device_address = true,
.EXT_calibrated_timestamps = device->has_reg_timestamp,
.EXT_color_write_enable = true,
- .EXT_conditional_rendering = device->info.verx10 >= 75,
- .EXT_conservative_rasterization = device->info.ver >= 9,
- .EXT_custom_border_color = device->info.ver >= 8,
+ .EXT_conditional_rendering = true,
+ .EXT_conservative_rasterization = true,
+ .EXT_custom_border_color = true,
+ .EXT_depth_bias_control = true,
+ .EXT_depth_clamp_zero_one = true,
+ .EXT_depth_clip_control = true,
+ .EXT_depth_range_unrestricted = device->info.ver >= 20,
.EXT_depth_clip_enable = true,
- .EXT_descriptor_indexing = device->has_a64_buffer_access &&
- device->has_bindless_images,
+ .EXT_descriptor_buffer = true,
+ .EXT_descriptor_indexing = true,
#ifdef VK_USE_PLATFORM_DISPLAY_KHR
.EXT_display_control = true,
#endif
+ .EXT_dynamic_rendering_unused_attachments = true,
.EXT_extended_dynamic_state = true,
.EXT_extended_dynamic_state2 = true,
+ .EXT_extended_dynamic_state3 = true,
.EXT_external_memory_dma_buf = true,
.EXT_external_memory_host = true,
- .EXT_fragment_shader_interlock = device->info.ver >= 9,
- .EXT_global_priority = device->has_context_priority,
+ .EXT_fragment_shader_interlock = true,
+ .EXT_global_priority = device->max_context_priority >=
+ VK_QUEUE_GLOBAL_PRIORITY_MEDIUM_KHR,
+ .EXT_global_priority_query = device->max_context_priority >=
+ VK_QUEUE_GLOBAL_PRIORITY_MEDIUM_KHR,
+ .EXT_graphics_pipeline_library = !debug_get_bool_option("ANV_NO_GPL", false),
.EXT_host_query_reset = true,
+ .EXT_image_2d_view_of_3d = true,
+ .EXT_image_compression_control = device->instance->compression_control_enabled,
.EXT_image_robustness = true,
.EXT_image_drm_format_modifier = true,
+ .EXT_image_sliced_view_of_3d = true,
+ .EXT_image_view_min_lod = true,
.EXT_index_type_uint8 = true,
.EXT_inline_uniform_block = true,
.EXT_line_rasterization = true,
- .EXT_memory_budget = device->sys.available,
+ .EXT_load_store_op_none = true,
+ .EXT_map_memory_placed = device->info.has_mmap_offset,
+ /* Enable the extension only if we have support for both local &
+ * system memory
+ */
+ .EXT_memory_budget = (!device->info.has_local_mem ||
+ device->vram_mappable.available > 0) &&
+ device->sys.available,
+ .EXT_mesh_shader = device->info.has_mesh_shading,
+ .EXT_mutable_descriptor_type = true,
+ .EXT_nested_command_buffer = true,
+ .EXT_non_seamless_cube_map = true,
.EXT_pci_bus_info = true,
.EXT_physical_device_drm = true,
.EXT_pipeline_creation_cache_control = true,
.EXT_pipeline_creation_feedback = true,
- .EXT_post_depth_coverage = device->info.ver >= 9,
+ .EXT_pipeline_library_group_handles = rt_enabled,
+ .EXT_pipeline_robustness = true,
+ .EXT_post_depth_coverage = true,
+ .EXT_primitives_generated_query = true,
+ .EXT_primitive_topology_list_restart = true,
.EXT_private_data = true,
.EXT_provoking_vertex = true,
.EXT_queue_family_foreign = true,
.EXT_robustness2 = true,
.EXT_sample_locations = true,
- .EXT_sampler_filter_minmax = device->info.ver >= 9,
+ .EXT_sampler_filter_minmax = true,
.EXT_scalar_block_layout = true,
.EXT_separate_stencil_usage = true,
.EXT_shader_atomic_float = true,
- .EXT_shader_atomic_float2 = device->info.ver >= 9,
+ .EXT_shader_atomic_float2 = true,
.EXT_shader_demote_to_helper_invocation = true,
- .EXT_shader_stencil_export = device->info.ver >= 9,
+ .EXT_shader_module_identifier = true,
+ .EXT_shader_stencil_export = true,
.EXT_shader_subgroup_ballot = true,
.EXT_shader_subgroup_vote = true,
.EXT_shader_viewport_index_layer = true,
.EXT_subgroup_size_control = true,
+#ifdef ANV_USE_WSI_PLATFORM
+ .EXT_swapchain_maintenance1 = true,
+#endif
.EXT_texel_buffer_alignment = true,
+ .EXT_tooling_info = true,
.EXT_transform_feedback = true,
.EXT_vertex_attribute_divisor = true,
+ .EXT_vertex_input_dynamic_state = true,
.EXT_ycbcr_image_arrays = true,
-#ifdef ANDROID
+ .AMD_buffer_marker = true,
+ .AMD_texture_gather_bias_lod = device->info.ver >= 20,
+#if DETECT_OS_ANDROID
.ANDROID_external_memory_android_hardware_buffer = true,
.ANDROID_native_buffer = true,
#endif
@@ -300,173 +430,1484 @@ get_device_extensions(const struct anv_physical_device *device,
.GOOGLE_user_type = true,
.INTEL_performance_query = device->perf &&
device->perf->i915_perf_version >= 3,
- .INTEL_shader_integer_functions2 = device->info.ver >= 8,
+ .INTEL_shader_integer_functions2 = true,
.EXT_multi_draw = true,
.NV_compute_shader_derivatives = true,
+ .VALVE_mutable_descriptor_type = true,
};
}
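/*
 * Illustrative app-side sketch, not driver code from this patch: the table
 * filled in by get_device_extensions() is what an application ultimately sees
 * through vkEnumerateDeviceExtensionProperties(). The physical-device handle
 * and the chosen extension name below are only examples; the ANV_NO_GPL
 * environment toggle above simply hides VK_EXT_graphics_pipeline_library from
 * this list.
 */
#include <stdbool.h>
#include <stdlib.h>
#include <string.h>
#include <vulkan/vulkan.h>

static bool
has_device_extension(VkPhysicalDevice pdev, const char *name)
{
   uint32_t count = 0;
   vkEnumerateDeviceExtensionProperties(pdev, NULL, &count, NULL);

   VkExtensionProperties *props = calloc(count, sizeof(*props));
   if (!props)
      return false;
   vkEnumerateDeviceExtensionProperties(pdev, NULL, &count, props);

   bool found = false;
   for (uint32_t i = 0; i < count; i++) {
      if (strcmp(props[i].extensionName, name) == 0) {
         found = true;
         break;
      }
   }
   free(props);
   return found;
}

/* Example: only take the GPL path if the driver actually exposes it. */
/* bool use_gpl = has_device_extension(pdev,
 *                   VK_EXT_GRAPHICS_PIPELINE_LIBRARY_EXTENSION_NAME); */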
-static uint64_t
-anv_compute_sys_heap_size(struct anv_physical_device *device,
- uint64_t total_ram)
+static void
+get_features(const struct anv_physical_device *pdevice,
+ struct vk_features *features)
{
- /* We don't want to burn too much ram with the GPU. If the user has 4GiB
- * or less, we use at most half. If they have more than 4GiB, we use 3/4.
+ struct vk_app_info *app_info = &pdevice->instance->vk.app_info;
+
+ const bool rt_enabled = ANV_SUPPORT_RT && pdevice->info.has_ray_tracing;
+
+ const bool mesh_shader =
+ pdevice->vk.supported_extensions.EXT_mesh_shader;
+
+ const bool has_sparse_or_fake = pdevice->sparse_type != ANV_SPARSE_TYPE_NOT_SUPPORTED;
+
+ *features = (struct vk_features) {
+ /* Vulkan 1.0 */
+ .robustBufferAccess = true,
+ .fullDrawIndexUint32 = true,
+ .imageCubeArray = true,
+ .independentBlend = true,
+ .geometryShader = true,
+ .tessellationShader = true,
+ .sampleRateShading = true,
+ .dualSrcBlend = true,
+ .logicOp = true,
+ .multiDrawIndirect = true,
+ .drawIndirectFirstInstance = true,
+ .depthClamp = true,
+ .depthBiasClamp = true,
+ .fillModeNonSolid = true,
+ .depthBounds = pdevice->info.ver >= 12,
+ .wideLines = true,
+ .largePoints = true,
+ .alphaToOne = true,
+ .multiViewport = true,
+ .samplerAnisotropy = true,
+ .textureCompressionETC2 = true,
+ .textureCompressionASTC_LDR = pdevice->has_astc_ldr ||
+ pdevice->emu_astc_ldr,
+ .textureCompressionBC = true,
+ .occlusionQueryPrecise = true,
+ .pipelineStatisticsQuery = true,
+ .vertexPipelineStoresAndAtomics = true,
+ .fragmentStoresAndAtomics = true,
+ .shaderTessellationAndGeometryPointSize = true,
+ .shaderImageGatherExtended = true,
+ .shaderStorageImageExtendedFormats = true,
+ .shaderStorageImageMultisample = false,
+      /* Gfx12.5 has all the required formats supported in HW for typed
+       * read/writes.
+       */
+ .shaderStorageImageReadWithoutFormat = pdevice->info.verx10 >= 125,
+ .shaderStorageImageWriteWithoutFormat = true,
+ .shaderUniformBufferArrayDynamicIndexing = true,
+ .shaderSampledImageArrayDynamicIndexing = true,
+ .shaderStorageBufferArrayDynamicIndexing = true,
+ .shaderStorageImageArrayDynamicIndexing = true,
+ .shaderClipDistance = true,
+ .shaderCullDistance = true,
+ .shaderFloat64 = pdevice->info.has_64bit_float ||
+ pdevice->instance->fp64_workaround_enabled,
+ .shaderInt64 = true,
+ .shaderInt16 = true,
+ .shaderResourceMinLod = true,
+ .shaderResourceResidency = has_sparse_or_fake,
+ .sparseBinding = has_sparse_or_fake,
+ .sparseResidencyAliased = has_sparse_or_fake,
+ .sparseResidencyBuffer = has_sparse_or_fake,
+ .sparseResidencyImage2D = has_sparse_or_fake,
+ .sparseResidencyImage3D = has_sparse_or_fake,
+ .sparseResidency2Samples = false,
+ .sparseResidency4Samples = false,
+ .sparseResidency8Samples = false,
+ .sparseResidency16Samples = false,
+ .variableMultisampleRate = true,
+ .inheritedQueries = true,
+
+ /* Vulkan 1.1 */
+ .storageBuffer16BitAccess = !pdevice->instance->no_16bit,
+ .uniformAndStorageBuffer16BitAccess = !pdevice->instance->no_16bit,
+ .storagePushConstant16 = true,
+ .storageInputOutput16 = false,
+ .multiview = true,
+ .multiviewGeometryShader = true,
+ .multiviewTessellationShader = true,
+ .variablePointersStorageBuffer = true,
+ .variablePointers = true,
+ .protectedMemory = pdevice->has_protected_contexts,
+ .samplerYcbcrConversion = true,
+ .shaderDrawParameters = true,
+
+ /* Vulkan 1.2 */
+ .samplerMirrorClampToEdge = true,
+ .drawIndirectCount = true,
+ .storageBuffer8BitAccess = true,
+ .uniformAndStorageBuffer8BitAccess = true,
+ .storagePushConstant8 = true,
+ .shaderBufferInt64Atomics = true,
+ .shaderSharedInt64Atomics = false,
+ .shaderFloat16 = !pdevice->instance->no_16bit,
+ .shaderInt8 = !pdevice->instance->no_16bit,
+
+ .descriptorIndexing = true,
+ .shaderInputAttachmentArrayDynamicIndexing = false,
+ .shaderUniformTexelBufferArrayDynamicIndexing = true,
+ .shaderStorageTexelBufferArrayDynamicIndexing = true,
+ .shaderUniformBufferArrayNonUniformIndexing = true,
+ .shaderSampledImageArrayNonUniformIndexing = true,
+ .shaderStorageBufferArrayNonUniformIndexing = true,
+ .shaderStorageImageArrayNonUniformIndexing = true,
+ .shaderInputAttachmentArrayNonUniformIndexing = false,
+ .shaderUniformTexelBufferArrayNonUniformIndexing = true,
+ .shaderStorageTexelBufferArrayNonUniformIndexing = true,
+ .descriptorBindingUniformBufferUpdateAfterBind = true,
+ .descriptorBindingSampledImageUpdateAfterBind = true,
+ .descriptorBindingStorageImageUpdateAfterBind = true,
+ .descriptorBindingStorageBufferUpdateAfterBind = true,
+ .descriptorBindingUniformTexelBufferUpdateAfterBind = true,
+ .descriptorBindingStorageTexelBufferUpdateAfterBind = true,
+ .descriptorBindingUpdateUnusedWhilePending = true,
+ .descriptorBindingPartiallyBound = true,
+ .descriptorBindingVariableDescriptorCount = true,
+ .runtimeDescriptorArray = true,
+
+ .samplerFilterMinmax = true,
+ .scalarBlockLayout = true,
+ .imagelessFramebuffer = true,
+ .uniformBufferStandardLayout = true,
+ .shaderSubgroupExtendedTypes = true,
+ .separateDepthStencilLayouts = true,
+ .hostQueryReset = true,
+ .timelineSemaphore = true,
+ .bufferDeviceAddress = true,
+ .bufferDeviceAddressCaptureReplay = true,
+ .bufferDeviceAddressMultiDevice = false,
+ .vulkanMemoryModel = true,
+ .vulkanMemoryModelDeviceScope = true,
+ .vulkanMemoryModelAvailabilityVisibilityChains = true,
+ .shaderOutputViewportIndex = true,
+ .shaderOutputLayer = true,
+ .subgroupBroadcastDynamicId = true,
+
+ /* Vulkan 1.3 */
+ .robustImageAccess = true,
+ .inlineUniformBlock = true,
+ .descriptorBindingInlineUniformBlockUpdateAfterBind = true,
+ .pipelineCreationCacheControl = true,
+ .privateData = true,
+ .shaderDemoteToHelperInvocation = true,
+ .shaderTerminateInvocation = true,
+ .subgroupSizeControl = true,
+ .computeFullSubgroups = true,
+ .synchronization2 = true,
+ .textureCompressionASTC_HDR = false,
+ .shaderZeroInitializeWorkgroupMemory = true,
+ .dynamicRendering = true,
+ .shaderIntegerDotProduct = true,
+ .maintenance4 = true,
+
+ /* VK_EXT_4444_formats */
+ .formatA4R4G4B4 = true,
+ .formatA4B4G4R4 = false,
+
+ /* VK_KHR_acceleration_structure */
+ .accelerationStructure = rt_enabled,
+ .accelerationStructureCaptureReplay = false, /* TODO */
+ .accelerationStructureIndirectBuild = false, /* TODO */
+ .accelerationStructureHostCommands = false,
+ .descriptorBindingAccelerationStructureUpdateAfterBind = rt_enabled,
+
+ /* VK_EXT_border_color_swizzle */
+ .borderColorSwizzle = true,
+ .borderColorSwizzleFromImage = true,
+
+ /* VK_EXT_color_write_enable */
+ .colorWriteEnable = true,
+
+ /* VK_EXT_image_2d_view_of_3d */
+ .image2DViewOf3D = true,
+ .sampler2DViewOf3D = true,
+
+ /* VK_EXT_image_sliced_view_of_3d */
+ .imageSlicedViewOf3D = true,
+
+ /* VK_NV_compute_shader_derivatives */
+ .computeDerivativeGroupQuads = true,
+ .computeDerivativeGroupLinear = true,
+
+ /* VK_EXT_conditional_rendering */
+ .conditionalRendering = true,
+ .inheritedConditionalRendering = true,
+
+ /* VK_EXT_custom_border_color */
+ .customBorderColors = true,
+ .customBorderColorWithoutFormat = true,
+
+ /* VK_EXT_depth_clamp_zero_one */
+ .depthClampZeroOne = true,
+
+ /* VK_EXT_depth_clip_enable */
+ .depthClipEnable = true,
+
+ /* VK_EXT_fragment_shader_interlock */
+ .fragmentShaderSampleInterlock = true,
+ .fragmentShaderPixelInterlock = true,
+ .fragmentShaderShadingRateInterlock = false,
+
+ /* VK_EXT_global_priority_query */
+ .globalPriorityQuery = true,
+
+ /* VK_EXT_graphics_pipeline_library */
+ .graphicsPipelineLibrary =
+ pdevice->vk.supported_extensions.EXT_graphics_pipeline_library,
+
+ /* VK_KHR_fragment_shading_rate */
+ .pipelineFragmentShadingRate = true,
+ .primitiveFragmentShadingRate =
+ pdevice->info.has_coarse_pixel_primitive_and_cb,
+ .attachmentFragmentShadingRate =
+ pdevice->info.has_coarse_pixel_primitive_and_cb,
+
+ /* VK_EXT_image_view_min_lod */
+ .minLod = true,
+
+ /* VK_EXT_index_type_uint8 */
+ .indexTypeUint8 = true,
+
+ /* VK_EXT_line_rasterization */
+ /* Rectangular lines must use the strict algorithm, which is not
+ * supported for wide lines prior to ICL. See rasterization_mode for
+ * details and how the HW states are programmed.
+ */
+ .rectangularLines = pdevice->info.ver >= 10,
+ .bresenhamLines = true,
+ /* Support for Smooth lines with MSAA was removed on gfx11. From the
+ * BSpec section "Multisample ModesState" table for "AA Line Support
+ * Requirements":
+ *
+ * GFX10:BUG:######## NUM_MULTISAMPLES == 1
+ *
+ * Fortunately, this isn't a case most people care about.
+ */
+ .smoothLines = pdevice->info.ver < 10,
+ .stippledRectangularLines = false,
+ .stippledBresenhamLines = true,
+ .stippledSmoothLines = false,
+
+ /* VK_NV_mesh_shader */
+ .taskShaderNV = false,
+ .meshShaderNV = false,
+
+ /* VK_EXT_mesh_shader */
+ .taskShader = mesh_shader,
+ .meshShader = mesh_shader,
+ .multiviewMeshShader = false,
+ .primitiveFragmentShadingRateMeshShader = mesh_shader,
+ .meshShaderQueries = false,
+
+ /* VK_EXT_mutable_descriptor_type */
+ .mutableDescriptorType = true,
+
+ /* VK_KHR_performance_query */
+ .performanceCounterQueryPools = true,
+ /* HW only supports a single configuration at a time. */
+ .performanceCounterMultipleQueryPools = false,
+
+ /* VK_KHR_pipeline_executable_properties */
+ .pipelineExecutableInfo = true,
+
+ /* VK_EXT_primitives_generated_query */
+ .primitivesGeneratedQuery = true,
+ .primitivesGeneratedQueryWithRasterizerDiscard = false,
+ .primitivesGeneratedQueryWithNonZeroStreams = false,
+
+ /* VK_EXT_pipeline_library_group_handles */
+ .pipelineLibraryGroupHandles = true,
+
+ /* VK_EXT_provoking_vertex */
+ .provokingVertexLast = true,
+ .transformFeedbackPreservesProvokingVertex = true,
+
+ /* VK_KHR_ray_query */
+ .rayQuery = rt_enabled,
+
+ /* VK_KHR_ray_tracing_maintenance1 */
+ .rayTracingMaintenance1 = rt_enabled,
+ .rayTracingPipelineTraceRaysIndirect2 = rt_enabled,
+
+ /* VK_KHR_ray_tracing_pipeline */
+ .rayTracingPipeline = rt_enabled,
+ .rayTracingPipelineShaderGroupHandleCaptureReplay = false,
+ .rayTracingPipelineShaderGroupHandleCaptureReplayMixed = false,
+ .rayTracingPipelineTraceRaysIndirect = rt_enabled,
+ .rayTraversalPrimitiveCulling = rt_enabled,
+
+ /* VK_EXT_robustness2 */
+ .robustBufferAccess2 = true,
+ .robustImageAccess2 = true,
+ .nullDescriptor = true,
+
+ /* VK_EXT_shader_atomic_float */
+ .shaderBufferFloat32Atomics = true,
+ .shaderBufferFloat32AtomicAdd = pdevice->info.has_lsc,
+ .shaderBufferFloat64Atomics =
+ pdevice->info.has_64bit_float && pdevice->info.has_lsc,
+ .shaderBufferFloat64AtomicAdd = false,
+ .shaderSharedFloat32Atomics = true,
+ .shaderSharedFloat32AtomicAdd = false,
+ .shaderSharedFloat64Atomics = false,
+ .shaderSharedFloat64AtomicAdd = false,
+ .shaderImageFloat32Atomics = true,
+ .shaderImageFloat32AtomicAdd = pdevice->info.ver >= 20,
+ .sparseImageFloat32Atomics = false,
+ .sparseImageFloat32AtomicAdd = false,
+
+ /* VK_EXT_shader_atomic_float2 */
+ .shaderBufferFloat16Atomics = pdevice->info.has_lsc,
+ .shaderBufferFloat16AtomicAdd = false,
+ .shaderBufferFloat16AtomicMinMax = pdevice->info.has_lsc,
+ .shaderBufferFloat32AtomicMinMax = true,
+ .shaderBufferFloat64AtomicMinMax =
+ pdevice->info.has_64bit_float && pdevice->info.has_lsc,
+ .shaderSharedFloat16Atomics = pdevice->info.has_lsc,
+ .shaderSharedFloat16AtomicAdd = false,
+ .shaderSharedFloat16AtomicMinMax = pdevice->info.has_lsc,
+ .shaderSharedFloat32AtomicMinMax = true,
+ .shaderSharedFloat64AtomicMinMax = false,
+ .shaderImageFloat32AtomicMinMax = false,
+ .sparseImageFloat32AtomicMinMax = false,
+
+ /* VK_KHR_shader_clock */
+ .shaderSubgroupClock = true,
+ .shaderDeviceClock = false,
+
+ /* VK_INTEL_shader_integer_functions2 */
+ .shaderIntegerFunctions2 = true,
+
+ /* VK_EXT_shader_module_identifier */
+ .shaderModuleIdentifier = true,
+
+ /* VK_KHR_shader_subgroup_uniform_control_flow */
+ .shaderSubgroupUniformControlFlow = true,
+
+ /* VK_EXT_texel_buffer_alignment */
+ .texelBufferAlignment = true,
+
+ /* VK_EXT_transform_feedback */
+ .transformFeedback = true,
+ .geometryStreams = true,
+
+ /* VK_KHR_vertex_attribute_divisor */
+ .vertexAttributeInstanceRateDivisor = true,
+ .vertexAttributeInstanceRateZeroDivisor = true,
+
+ /* VK_KHR_workgroup_memory_explicit_layout */
+ .workgroupMemoryExplicitLayout = true,
+ .workgroupMemoryExplicitLayoutScalarBlockLayout = true,
+ .workgroupMemoryExplicitLayout8BitAccess = true,
+ .workgroupMemoryExplicitLayout16BitAccess = true,
+
+ /* VK_EXT_ycbcr_image_arrays */
+ .ycbcrImageArrays = true,
+
+ /* VK_EXT_extended_dynamic_state */
+ .extendedDynamicState = true,
+
+ /* VK_EXT_extended_dynamic_state2 */
+ .extendedDynamicState2 = true,
+ .extendedDynamicState2LogicOp = true,
+ .extendedDynamicState2PatchControlPoints = true,
+
+ /* VK_EXT_extended_dynamic_state3 */
+ .extendedDynamicState3PolygonMode = true,
+ .extendedDynamicState3TessellationDomainOrigin = true,
+ .extendedDynamicState3RasterizationStream = true,
+ .extendedDynamicState3LineStippleEnable = true,
+ .extendedDynamicState3LineRasterizationMode = true,
+ .extendedDynamicState3LogicOpEnable = true,
+ .extendedDynamicState3AlphaToOneEnable = true,
+ .extendedDynamicState3DepthClipEnable = true,
+ .extendedDynamicState3DepthClampEnable = true,
+ .extendedDynamicState3DepthClipNegativeOneToOne = true,
+ .extendedDynamicState3ProvokingVertexMode = true,
+ .extendedDynamicState3ColorBlendEnable = true,
+ .extendedDynamicState3ColorWriteMask = true,
+ .extendedDynamicState3ColorBlendEquation = true,
+ .extendedDynamicState3SampleLocationsEnable = true,
+ .extendedDynamicState3SampleMask = true,
+ .extendedDynamicState3ConservativeRasterizationMode = true,
+ .extendedDynamicState3AlphaToCoverageEnable = true,
+ .extendedDynamicState3RasterizationSamples = true,
+
+ .extendedDynamicState3ExtraPrimitiveOverestimationSize = false,
+ .extendedDynamicState3ViewportWScalingEnable = false,
+ .extendedDynamicState3ViewportSwizzle = false,
+ .extendedDynamicState3ShadingRateImageEnable = false,
+ .extendedDynamicState3CoverageToColorEnable = false,
+ .extendedDynamicState3CoverageToColorLocation = false,
+ .extendedDynamicState3CoverageModulationMode = false,
+ .extendedDynamicState3CoverageModulationTableEnable = false,
+ .extendedDynamicState3CoverageModulationTable = false,
+ .extendedDynamicState3CoverageReductionMode = false,
+ .extendedDynamicState3RepresentativeFragmentTestEnable = false,
+ .extendedDynamicState3ColorBlendAdvanced = false,
+
+ /* VK_EXT_multi_draw */
+ .multiDraw = true,
+
+ /* VK_EXT_non_seamless_cube_map */
+ .nonSeamlessCubeMap = true,
+
+ /* VK_EXT_primitive_topology_list_restart */
+ .primitiveTopologyListRestart = true,
+ .primitiveTopologyPatchListRestart = true,
+
+ /* VK_EXT_depth_clip_control */
+ .depthClipControl = true,
+
+ /* VK_KHR_present_id */
+ .presentId = pdevice->vk.supported_extensions.KHR_present_id,
+
+ /* VK_KHR_present_wait */
+ .presentWait = pdevice->vk.supported_extensions.KHR_present_wait,
+
+ /* VK_EXT_vertex_input_dynamic_state */
+ .vertexInputDynamicState = true,
+
+ /* VK_KHR_ray_tracing_position_fetch */
+ .rayTracingPositionFetch = rt_enabled,
+
+ /* VK_EXT_dynamic_rendering_unused_attachments */
+ .dynamicRenderingUnusedAttachments = true,
+
+ /* VK_EXT_depth_bias_control */
+ .depthBiasControl = true,
+ .floatRepresentation = true,
+ .leastRepresentableValueForceUnormRepresentation = false,
+ .depthBiasExact = true,
+
+ /* VK_EXT_pipeline_robustness */
+ .pipelineRobustness = true,
+
+ /* VK_KHR_maintenance5 */
+ .maintenance5 = true,
+
+ /* VK_KHR_maintenance6 */
+ .maintenance6 = true,
+
+ /* VK_EXT_nested_command_buffer */
+ .nestedCommandBuffer = true,
+ .nestedCommandBufferRendering = true,
+ .nestedCommandBufferSimultaneousUse = false,
+
+ /* VK_KHR_cooperative_matrix */
+ .cooperativeMatrix = anv_has_cooperative_matrix(pdevice),
+
+ /* VK_KHR_shader_maximal_reconvergence */
+ .shaderMaximalReconvergence = true,
+
+ /* VK_KHR_shader_subgroup_rotate */
+ .shaderSubgroupRotate = true,
+ .shaderSubgroupRotateClustered = true,
+
+ /* VK_EXT_attachment_feedback_loop_layout */
+ .attachmentFeedbackLoopLayout = true,
+
+ /* VK_EXT_attachment_feedback_loop_dynamic_state */
+ .attachmentFeedbackLoopDynamicState = true,
+
+ /* VK_KHR_shader_expect_assume */
+ .shaderExpectAssume = true,
+
+ /* VK_EXT_descriptor_buffer */
+ .descriptorBuffer = true,
+ .descriptorBufferCaptureReplay = true,
+ .descriptorBufferImageLayoutIgnored = false,
+ .descriptorBufferPushDescriptors = true,
+
+ /* VK_EXT_map_memory_placed */
+ .memoryMapPlaced = true,
+ .memoryMapRangePlaced = false,
+ .memoryUnmapReserve = true,
+
+ /* VK_KHR_shader_quad_control */
+ .shaderQuadControl = true,
+
+#ifdef ANV_USE_WSI_PLATFORM
+ /* VK_EXT_swapchain_maintenance1 */
+ .swapchainMaintenance1 = true,
+#endif
+
+ /* VK_EXT_image_compression_control */
+ .imageCompressionControl = true,
+
+ /* VK_KHR_shader_float_controls2 */
+ .shaderFloatControls2 = true,
+ };
+
+ /* The new DOOM and Wolfenstein games require depthBounds without
+ * checking for it. They seem to run fine without it so just claim it's
+ * there and accept the consequences.
*/
- uint64_t available_ram;
- if (total_ram <= 4ull * 1024ull * 1024ull * 1024ull)
- available_ram = total_ram / 2;
- else
- available_ram = total_ram * 3 / 4;
+ if (app_info->engine_name && strcmp(app_info->engine_name, "idTech") == 0)
+ features->depthBounds = true;
+}
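/*
 * Illustrative app-side sketch, not driver code from this patch: how the
 * feature bits assembled in get_features() are read back by an application.
 * The VkPhysicalDevice handle is assumed to be valid; the struct and entry
 * point are standard Vulkan 1.1 / VK_EXT_mesh_shader API.
 */
#include <vulkan/vulkan.h>

static void
query_mesh_shader_support(VkPhysicalDevice pdev)
{
   VkPhysicalDeviceMeshShaderFeaturesEXT mesh = {
      .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MESH_SHADER_FEATURES_EXT,
   };
   VkPhysicalDeviceFeatures2 features = {
      .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2,
      .pNext = &mesh,
   };
   vkGetPhysicalDeviceFeatures2(pdev, &features);

   /* mesh.taskShader / mesh.meshShader mirror the EXT_mesh_shader support
    * computed above; features.features.depthBounds may additionally be
    * forced on for idTech titles by the workaround at the end of
    * get_features(). */
}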
+
+#define MAX_PER_STAGE_DESCRIPTOR_UNIFORM_BUFFERS 64
+
+#define MAX_PER_STAGE_DESCRIPTOR_INPUT_ATTACHMENTS 64
+#define MAX_DESCRIPTOR_SET_INPUT_ATTACHMENTS 256
+
+#define MAX_CUSTOM_BORDER_COLORS 4096
+
+static VkDeviceSize
+anx_get_physical_device_max_heap_size(const struct anv_physical_device *pdevice)
+{
+ VkDeviceSize ret = 0;
+
+ for (uint32_t i = 0; i < pdevice->memory.heap_count; i++) {
+ if (pdevice->memory.heaps[i].size > ret)
+ ret = pdevice->memory.heaps[i].size;
+ }
+
+ return ret;
+}
+
+static void
+get_properties_1_1(const struct anv_physical_device *pdevice,
+ struct vk_properties *p)
+{
+ memcpy(p->deviceUUID, pdevice->device_uuid, VK_UUID_SIZE);
+ memcpy(p->driverUUID, pdevice->driver_uuid, VK_UUID_SIZE);
+ memset(p->deviceLUID, 0, VK_LUID_SIZE);
+ p->deviceNodeMask = 0;
+ p->deviceLUIDValid = false;
- /* We also want to leave some padding for things we allocate in the driver,
- * so don't go over 3/4 of the GTT either.
+ p->subgroupSize = BRW_SUBGROUP_SIZE;
+ VkShaderStageFlags scalar_stages = 0;
+ for (unsigned stage = 0; stage < MESA_SHADER_STAGES; stage++) {
+ scalar_stages |= mesa_to_vk_shader_stage(stage);
+ }
+ if (pdevice->vk.supported_extensions.KHR_ray_tracing_pipeline) {
+ scalar_stages |= VK_SHADER_STAGE_RAYGEN_BIT_KHR |
+ VK_SHADER_STAGE_ANY_HIT_BIT_KHR |
+ VK_SHADER_STAGE_CLOSEST_HIT_BIT_KHR |
+ VK_SHADER_STAGE_MISS_BIT_KHR |
+ VK_SHADER_STAGE_INTERSECTION_BIT_KHR |
+ VK_SHADER_STAGE_CALLABLE_BIT_KHR;
+ }
+ if (pdevice->vk.supported_extensions.EXT_mesh_shader) {
+ scalar_stages |= VK_SHADER_STAGE_TASK_BIT_EXT |
+ VK_SHADER_STAGE_MESH_BIT_EXT;
+ }
+ p->subgroupSupportedStages = scalar_stages;
+ p->subgroupSupportedOperations = VK_SUBGROUP_FEATURE_BASIC_BIT |
+ VK_SUBGROUP_FEATURE_VOTE_BIT |
+ VK_SUBGROUP_FEATURE_BALLOT_BIT |
+ VK_SUBGROUP_FEATURE_SHUFFLE_BIT |
+ VK_SUBGROUP_FEATURE_SHUFFLE_RELATIVE_BIT |
+ VK_SUBGROUP_FEATURE_QUAD_BIT |
+ VK_SUBGROUP_FEATURE_ARITHMETIC_BIT |
+ VK_SUBGROUP_FEATURE_CLUSTERED_BIT |
+ VK_SUBGROUP_FEATURE_ROTATE_BIT_KHR |
+ VK_SUBGROUP_FEATURE_ROTATE_CLUSTERED_BIT_KHR;
+ p->subgroupQuadOperationsInAllStages = true;
+
+ p->pointClippingBehavior = VK_POINT_CLIPPING_BEHAVIOR_USER_CLIP_PLANES_ONLY;
+ p->maxMultiviewViewCount = 16;
+ p->maxMultiviewInstanceIndex = UINT32_MAX / 16;
+   /* Our protected implementation is a memory encryption mechanism. It
+    * shouldn't page fault, but it hangs the HW, so in terms of user
+    * visibility it's similar to a fault.
*/
- available_ram = MIN2(available_ram, device->gtt_size * 3 / 4);
+ p->protectedNoFault = false;
+ /* This value doesn't matter for us today as our per-stage descriptors are
+ * the real limit.
+ */
+ p->maxPerSetDescriptors = 1024;
- if (available_ram > (2ull << 30) && !device->supports_48bit_addresses) {
- /* When running with an overridden PCI ID, we may get a GTT size from
- * the kernel that is greater than 2 GiB but the execbuf check for 48bit
- * address support can still fail. Just clamp the address space size to
- * 2 GiB if we don't have 48-bit support.
- */
- mesa_logw("%s:%d: The kernel reported a GTT size larger than 2 GiB but "
- "not support for 48-bit addresses",
- __FILE__, __LINE__);
- available_ram = 2ull << 30;
+ for (uint32_t i = 0; i < pdevice->memory.heap_count; i++) {
+ p->maxMemoryAllocationSize = MAX2(p->maxMemoryAllocationSize,
+ pdevice->memory.heaps[i].size);
}
+}
- return available_ram;
+static void
+get_properties_1_2(const struct anv_physical_device *pdevice,
+ struct vk_properties *p)
+{
+ p->driverID = VK_DRIVER_ID_INTEL_OPEN_SOURCE_MESA;
+ memset(p->driverName, 0, sizeof(p->driverName));
+ snprintf(p->driverName, VK_MAX_DRIVER_NAME_SIZE,
+ "Intel open-source Mesa driver");
+ memset(p->driverInfo, 0, sizeof(p->driverInfo));
+ snprintf(p->driverInfo, VK_MAX_DRIVER_INFO_SIZE,
+ "Mesa " PACKAGE_VERSION MESA_GIT_SHA1);
+
+ p->conformanceVersion = (VkConformanceVersion) {
+ .major = 1,
+ .minor = 3,
+ .subminor = 6,
+ .patch = 0,
+ };
+
+ p->denormBehaviorIndependence =
+ VK_SHADER_FLOAT_CONTROLS_INDEPENDENCE_ALL;
+ p->roundingModeIndependence =
+ VK_SHADER_FLOAT_CONTROLS_INDEPENDENCE_NONE;
+
+   /* Broadwell does not support HF denorms and there are restrictions on
+    * other gens. According to Kabylake's PRM:
+ *
+ * "math - Extended Math Function
+ * [...]
+ * Restriction : Half-float denorms are always retained."
+ */
+ p->shaderDenormFlushToZeroFloat16 = false;
+ p->shaderDenormPreserveFloat16 = true;
+ p->shaderRoundingModeRTEFloat16 = true;
+ p->shaderRoundingModeRTZFloat16 = true;
+ p->shaderSignedZeroInfNanPreserveFloat16 = true;
+
+ p->shaderDenormFlushToZeroFloat32 = true;
+ p->shaderDenormPreserveFloat32 = true;
+ p->shaderRoundingModeRTEFloat32 = true;
+ p->shaderRoundingModeRTZFloat32 = true;
+ p->shaderSignedZeroInfNanPreserveFloat32 = true;
+
+ p->shaderDenormFlushToZeroFloat64 = true;
+ p->shaderDenormPreserveFloat64 = true;
+ p->shaderRoundingModeRTEFloat64 = true;
+ p->shaderRoundingModeRTZFloat64 = true;
+ p->shaderSignedZeroInfNanPreserveFloat64 = true;
+
+ /* It's a bit hard to exactly map our implementation to the limits
+ * described by Vulkan. The bindless surface handle in the extended
+ * message descriptors is 20 bits and it's an index into the table of
+ * RENDER_SURFACE_STATE structs that starts at bindless surface base
+    * address. This means that we can have at most 1M surface states
+ * allocated at any given time. Since most image views take two
+ * descriptors, this means we have a limit of about 500K image views.
+ *
+ * However, since we allocate surface states at vkCreateImageView time,
+ * this means our limit is actually something on the order of 500K image
+    * views allocated at any time. The actual limit described by Vulkan, on
+ * the other hand, is a limit of how many you can have in a descriptor set.
+ * Assuming anyone using 1M descriptors will be using the same image view
+ * twice a bunch of times (or a bunch of null descriptors), we can safely
+ * advertise a larger limit here.
+ */
+ const unsigned max_bindless_views =
+ anv_physical_device_bindless_heap_size(pdevice, false) / ANV_SURFACE_STATE_SIZE;
+ p->maxUpdateAfterBindDescriptorsInAllPools = max_bindless_views;
+ p->shaderUniformBufferArrayNonUniformIndexingNative = false;
+ p->shaderSampledImageArrayNonUniformIndexingNative = false;
+ p->shaderStorageBufferArrayNonUniformIndexingNative = true;
+ p->shaderStorageImageArrayNonUniformIndexingNative = false;
+ p->shaderInputAttachmentArrayNonUniformIndexingNative = false;
+ p->robustBufferAccessUpdateAfterBind = true;
+ p->quadDivergentImplicitLod = false;
+ p->maxPerStageDescriptorUpdateAfterBindSamplers = max_bindless_views;
+ p->maxPerStageDescriptorUpdateAfterBindUniformBuffers = MAX_PER_STAGE_DESCRIPTOR_UNIFORM_BUFFERS;
+ p->maxPerStageDescriptorUpdateAfterBindStorageBuffers = UINT32_MAX;
+ p->maxPerStageDescriptorUpdateAfterBindSampledImages = max_bindless_views;
+ p->maxPerStageDescriptorUpdateAfterBindStorageImages = max_bindless_views;
+ p->maxPerStageDescriptorUpdateAfterBindInputAttachments = MAX_PER_STAGE_DESCRIPTOR_INPUT_ATTACHMENTS;
+ p->maxPerStageUpdateAfterBindResources = UINT32_MAX;
+ p->maxDescriptorSetUpdateAfterBindSamplers = max_bindless_views;
+ p->maxDescriptorSetUpdateAfterBindUniformBuffers = 6 * MAX_PER_STAGE_DESCRIPTOR_UNIFORM_BUFFERS;
+ p->maxDescriptorSetUpdateAfterBindUniformBuffersDynamic = MAX_DYNAMIC_BUFFERS / 2;
+ p->maxDescriptorSetUpdateAfterBindStorageBuffers = UINT32_MAX;
+ p->maxDescriptorSetUpdateAfterBindStorageBuffersDynamic = MAX_DYNAMIC_BUFFERS / 2;
+ p->maxDescriptorSetUpdateAfterBindSampledImages = max_bindless_views;
+ p->maxDescriptorSetUpdateAfterBindStorageImages = max_bindless_views;
+ p->maxDescriptorSetUpdateAfterBindInputAttachments = MAX_DESCRIPTOR_SET_INPUT_ATTACHMENTS;
+
+ /* We support all of the depth resolve modes */
+ p->supportedDepthResolveModes = VK_RESOLVE_MODE_SAMPLE_ZERO_BIT |
+ VK_RESOLVE_MODE_AVERAGE_BIT |
+ VK_RESOLVE_MODE_MIN_BIT |
+ VK_RESOLVE_MODE_MAX_BIT;
+ /* Average doesn't make sense for stencil so we don't support that */
+ p->supportedStencilResolveModes = VK_RESOLVE_MODE_SAMPLE_ZERO_BIT |
+ VK_RESOLVE_MODE_MIN_BIT |
+ VK_RESOLVE_MODE_MAX_BIT;
+ p->independentResolveNone = true;
+ p->independentResolve = true;
+
+ p->filterMinmaxSingleComponentFormats = true;
+ p->filterMinmaxImageComponentMapping = true;
+
+ p->maxTimelineSemaphoreValueDifference = UINT64_MAX;
+
+ p->framebufferIntegerColorSampleCounts =
+ isl_device_get_sample_counts(&pdevice->isl_dev);
}
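/*
 * Worked example of the bindless limit used above. The 20-bit surface handle
 * and the "roughly two descriptors per image view" figures come from the
 * comment in get_properties_1_2(); the 1 GiB heap and 64-byte surface state
 * size below are only illustrative assumptions, not values from this patch.
 */
#include <stdint.h>

static uint64_t
example_max_bindless_views(void)
{
   const uint64_t bindless_heap_size = 1ull << 30;   /* assumed 1 GiB heap */
   const uint64_t surface_state_size = 64;           /* assumed ANV_SURFACE_STATE_SIZE */

   /* Mirrors: max_bindless_views =
    *    anv_physical_device_bindless_heap_size() / ANV_SURFACE_STATE_SIZE */
   uint64_t views = bindless_heap_size / surface_state_size;  /* 16M with these numbers */

   /* For context: the 20-bit bindless handle caps live surface states at
    * 2^20 (~1M), i.e. on the order of 500K image views at two states per
    * view, which is why advertising a large update-after-bind limit is
    * considered safe. */
   return views;
}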
-static VkResult MUST_CHECK
-anv_gather_meminfo(struct anv_physical_device *device, int fd, bool update)
+static void
+get_properties_1_3(const struct anv_physical_device *pdevice,
+ struct vk_properties *p)
{
- char sys_mem_regions[sizeof(struct drm_i915_query_memory_regions) +
- sizeof(struct drm_i915_memory_region_info)];
-
- struct drm_i915_query_memory_regions *mem_regions =
- intel_i915_query_alloc(fd, DRM_I915_QUERY_MEMORY_REGIONS);
- if (mem_regions == NULL) {
- if (device->info.has_local_mem) {
- return vk_errorfi(device->instance, NULL,
- VK_ERROR_INCOMPATIBLE_DRIVER,
- "failed to memory regions: %m");
- }
+ if (pdevice->info.ver >= 20)
+ p->minSubgroupSize = 16;
+ else
+ p->minSubgroupSize = 8;
+ p->maxSubgroupSize = 32;
+ p->maxComputeWorkgroupSubgroups = pdevice->info.max_cs_workgroup_threads;
+ p->requiredSubgroupSizeStages = VK_SHADER_STAGE_COMPUTE_BIT |
+ VK_SHADER_STAGE_TASK_BIT_EXT |
+ VK_SHADER_STAGE_MESH_BIT_EXT;
+
+ p->maxInlineUniformBlockSize = MAX_INLINE_UNIFORM_BLOCK_SIZE;
+ p->maxPerStageDescriptorInlineUniformBlocks =
+ MAX_INLINE_UNIFORM_BLOCK_DESCRIPTORS;
+ p->maxPerStageDescriptorUpdateAfterBindInlineUniformBlocks =
+ MAX_INLINE_UNIFORM_BLOCK_DESCRIPTORS;
+ p->maxDescriptorSetInlineUniformBlocks =
+ MAX_INLINE_UNIFORM_BLOCK_DESCRIPTORS;
+ p->maxDescriptorSetUpdateAfterBindInlineUniformBlocks =
+ MAX_INLINE_UNIFORM_BLOCK_DESCRIPTORS;
+ p->maxInlineUniformTotalSize = UINT16_MAX;
+
+ p->integerDotProduct8BitUnsignedAccelerated = false;
+ p->integerDotProduct8BitSignedAccelerated = false;
+ p->integerDotProduct8BitMixedSignednessAccelerated = false;
+ p->integerDotProduct4x8BitPackedUnsignedAccelerated = pdevice->info.ver >= 12;
+ p->integerDotProduct4x8BitPackedSignedAccelerated = pdevice->info.ver >= 12;
+ p->integerDotProduct4x8BitPackedMixedSignednessAccelerated = pdevice->info.ver >= 12;
+ p->integerDotProduct16BitUnsignedAccelerated = false;
+ p->integerDotProduct16BitSignedAccelerated = false;
+ p->integerDotProduct16BitMixedSignednessAccelerated = false;
+ p->integerDotProduct32BitUnsignedAccelerated = false;
+ p->integerDotProduct32BitSignedAccelerated = false;
+ p->integerDotProduct32BitMixedSignednessAccelerated = false;
+ p->integerDotProduct64BitUnsignedAccelerated = false;
+ p->integerDotProduct64BitSignedAccelerated = false;
+ p->integerDotProduct64BitMixedSignednessAccelerated = false;
+ p->integerDotProductAccumulatingSaturating8BitUnsignedAccelerated = false;
+ p->integerDotProductAccumulatingSaturating8BitSignedAccelerated = false;
+ p->integerDotProductAccumulatingSaturating8BitMixedSignednessAccelerated = false;
+ p->integerDotProductAccumulatingSaturating4x8BitPackedUnsignedAccelerated = pdevice->info.ver >= 12;
+ p->integerDotProductAccumulatingSaturating4x8BitPackedSignedAccelerated = pdevice->info.ver >= 12;
+ p->integerDotProductAccumulatingSaturating4x8BitPackedMixedSignednessAccelerated = pdevice->info.ver >= 12;
+ p->integerDotProductAccumulatingSaturating16BitUnsignedAccelerated = false;
+ p->integerDotProductAccumulatingSaturating16BitSignedAccelerated = false;
+ p->integerDotProductAccumulatingSaturating16BitMixedSignednessAccelerated = false;
+ p->integerDotProductAccumulatingSaturating32BitUnsignedAccelerated = false;
+ p->integerDotProductAccumulatingSaturating32BitSignedAccelerated = false;
+ p->integerDotProductAccumulatingSaturating32BitMixedSignednessAccelerated = false;
+ p->integerDotProductAccumulatingSaturating64BitUnsignedAccelerated = false;
+ p->integerDotProductAccumulatingSaturating64BitSignedAccelerated = false;
+ p->integerDotProductAccumulatingSaturating64BitMixedSignednessAccelerated = false;
+
+ /* From the SKL PRM Vol. 2d, docs for RENDER_SURFACE_STATE::Surface
+ * Base Address:
+ *
+ * "For SURFTYPE_BUFFER non-rendertarget surfaces, this field
+ * specifies the base address of the first element of the surface,
+ * computed in software by adding the surface base address to the
+ * byte offset of the element in the buffer. The base address must
+ * be aligned to element size."
+ *
+ * The typed dataport messages require that things be texel aligned.
+ * Otherwise, we may just load/store the wrong data or, in the worst
+ * case, there may be hangs.
+ */
+ p->storageTexelBufferOffsetAlignmentBytes = 16;
+ p->storageTexelBufferOffsetSingleTexelAlignment = true;
- uint64_t total_phys;
- if (!os_get_total_physical_memory(&total_phys)) {
- return vk_errorfi(device->instance, NULL,
- VK_ERROR_INITIALIZATION_FAILED,
- "failed to get total physical memory: %m");
- }
+ /* The sampler, however, is much more forgiving and it can handle
+ * arbitrary byte alignment for linear and buffer surfaces. It's
+ * hard to find a good PRM citation for this but years of empirical
+ * experience demonstrate that this is true.
+ */
+ p->uniformTexelBufferOffsetAlignmentBytes = 1;
+ p->uniformTexelBufferOffsetSingleTexelAlignment = true;
+
+ p->maxBufferSize = pdevice->isl_dev.max_buffer_size;
+}
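/*
 * Illustrative app-side sketch, not driver code from this patch: honoring the
 * 16-byte storage texel buffer offset alignment advertised above. The Vulkan
 * 1.3 properties struct is standard API; the device handle is assumed valid
 * and the rounding assumes the reported alignment is a power of two (it is
 * 16 here).
 */
#include <vulkan/vulkan.h>

static VkDeviceSize
align_storage_texel_offset(VkPhysicalDevice pdev, VkDeviceSize offset)
{
   VkPhysicalDeviceVulkan13Properties props13 = {
      .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_3_PROPERTIES,
   };
   VkPhysicalDeviceProperties2 props = {
      .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROPERTIES_2,
      .pNext = &props13,
   };
   vkGetPhysicalDeviceProperties2(pdev, &props);

   /* Round VkBufferViewCreateInfo::offset up to the required alignment,
    * since the dataport needs texel-aligned addresses. */
   const VkDeviceSize align = props13.storageTexelBufferOffsetAlignmentBytes;
   return (offset + align - 1) & ~(align - 1);
}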
+
+static void
+get_properties(const struct anv_physical_device *pdevice,
+ struct vk_properties *props)
+{
+
+ const struct intel_device_info *devinfo = &pdevice->info;
+
+ const uint32_t max_ssbos = UINT16_MAX;
+ const uint32_t max_textures = UINT16_MAX;
+ const uint32_t max_samplers = UINT16_MAX;
+ const uint32_t max_images = UINT16_MAX;
+ const VkDeviceSize max_heap_size = anx_get_physical_device_max_heap_size(pdevice);
+
+ /* Claim a high per-stage limit since we have bindless. */
+ const uint32_t max_per_stage = UINT32_MAX;
- uint64_t available;
- if (!os_get_available_system_memory(&available))
- available = 0; /* Silently disable VK_EXT_memory_budget */
+ const uint32_t max_workgroup_size =
+ MIN2(1024, 32 * devinfo->max_cs_workgroup_threads);
- /* The kernel query failed. Fake it using OS memory queries. This
- * should be roughly the same for integrated GPUs.
+ const bool has_sparse_or_fake = pdevice->sparse_type != ANV_SPARSE_TYPE_NOT_SUPPORTED;
+ const bool sparse_uses_trtt = pdevice->sparse_type == ANV_SPARSE_TYPE_TRTT;
+
+ uint64_t sparse_addr_space_size =
+ !has_sparse_or_fake ? 0 :
+ sparse_uses_trtt ? pdevice->va.trtt.size :
+ pdevice->va.high_heap.size;
+
+ VkSampleCountFlags sample_counts =
+ isl_device_get_sample_counts(&pdevice->isl_dev);
+
+
+ *props = (struct vk_properties) {
+ .apiVersion = ANV_API_VERSION,
+ .driverVersion = vk_get_driver_version(),
+ .vendorID = pdevice->instance->force_vk_vendor != 0 ?
+ pdevice->instance->force_vk_vendor : 0x8086,
+ .deviceID = pdevice->info.pci_device_id,
+ .deviceType = pdevice->info.has_local_mem ?
+ VK_PHYSICAL_DEVICE_TYPE_DISCRETE_GPU :
+ VK_PHYSICAL_DEVICE_TYPE_INTEGRATED_GPU,
+
+ /* Limits: */
+ .maxImageDimension1D = (1 << 14),
+ .maxImageDimension2D = (1 << 14),
+ .maxImageDimension3D = (1 << 11),
+ .maxImageDimensionCube = (1 << 14),
+ .maxImageArrayLayers = (1 << 11),
+ .maxTexelBufferElements = 128 * 1024 * 1024,
+ .maxUniformBufferRange = pdevice->compiler->indirect_ubos_use_sampler ? (1u << 27) : (1u << 30),
+ .maxStorageBufferRange = MIN3(pdevice->isl_dev.max_buffer_size, max_heap_size, UINT32_MAX),
+ .maxPushConstantsSize = MAX_PUSH_CONSTANTS_SIZE,
+ .maxMemoryAllocationCount = UINT32_MAX,
+ .maxSamplerAllocationCount = 64 * 1024,
+ .bufferImageGranularity = 1,
+ .sparseAddressSpaceSize = sparse_addr_space_size,
+ .maxBoundDescriptorSets = MAX_SETS,
+ .maxPerStageDescriptorSamplers = max_samplers,
+ .maxPerStageDescriptorUniformBuffers = MAX_PER_STAGE_DESCRIPTOR_UNIFORM_BUFFERS,
+ .maxPerStageDescriptorStorageBuffers = max_ssbos,
+ .maxPerStageDescriptorSampledImages = max_textures,
+ .maxPerStageDescriptorStorageImages = max_images,
+ .maxPerStageDescriptorInputAttachments = MAX_PER_STAGE_DESCRIPTOR_INPUT_ATTACHMENTS,
+ .maxPerStageResources = max_per_stage,
+ .maxDescriptorSetSamplers = 6 * max_samplers, /* number of stages * maxPerStageDescriptorSamplers */
+ .maxDescriptorSetUniformBuffers = 6 * MAX_PER_STAGE_DESCRIPTOR_UNIFORM_BUFFERS, /* number of stages * maxPerStageDescriptorUniformBuffers */
+ .maxDescriptorSetUniformBuffersDynamic = MAX_DYNAMIC_BUFFERS / 2,
+ .maxDescriptorSetStorageBuffers = 6 * max_ssbos, /* number of stages * maxPerStageDescriptorStorageBuffers */
+ .maxDescriptorSetStorageBuffersDynamic = MAX_DYNAMIC_BUFFERS / 2,
+ .maxDescriptorSetSampledImages = 6 * max_textures, /* number of stages * maxPerStageDescriptorSampledImages */
+ .maxDescriptorSetStorageImages = 6 * max_images, /* number of stages * maxPerStageDescriptorStorageImages */
+ .maxDescriptorSetInputAttachments = MAX_DESCRIPTOR_SET_INPUT_ATTACHMENTS,
+ .maxVertexInputAttributes = MAX_VES,
+ .maxVertexInputBindings = MAX_VBS,
+ /* Broadwell PRMs: Volume 2d: Command Reference: Structures:
+ *
+ * VERTEX_ELEMENT_STATE::Source Element Offset: [0,2047]
*/
- mem_regions = (void *)sys_mem_regions;
- mem_regions->num_regions = 1;
- mem_regions->regions[0] = (struct drm_i915_memory_region_info) {
- .region.memory_class = I915_MEMORY_CLASS_SYSTEM,
- .probed_size = total_phys,
- .unallocated_size = available,
- };
- }
+ .maxVertexInputAttributeOffset = 2047,
+ /* Skylake PRMs: Volume 2d: Command Reference: Structures:
+ *
+ * VERTEX_BUFFER_STATE::Buffer Pitch: [0,4095]
+ */
+ .maxVertexInputBindingStride = 4095,
+ .maxVertexOutputComponents = 128,
+ .maxTessellationGenerationLevel = 64,
+ .maxTessellationPatchSize = 32,
+ .maxTessellationControlPerVertexInputComponents = 128,
+ .maxTessellationControlPerVertexOutputComponents = 128,
+ .maxTessellationControlPerPatchOutputComponents = 128,
+ .maxTessellationControlTotalOutputComponents = 2048,
+ .maxTessellationEvaluationInputComponents = 128,
+ .maxTessellationEvaluationOutputComponents = 128,
+ .maxGeometryShaderInvocations = 32,
+ .maxGeometryInputComponents = 128,
+ .maxGeometryOutputComponents = 128,
+ .maxGeometryOutputVertices = 256,
+ .maxGeometryTotalOutputComponents = 1024,
+ .maxFragmentInputComponents = 116, /* 128 components - (PSIZ, CLIP_DIST0, CLIP_DIST1) */
+ .maxFragmentOutputAttachments = 8,
+ .maxFragmentDualSrcAttachments = 1,
+ .maxFragmentCombinedOutputResources = MAX_RTS + max_ssbos + max_images,
+ .maxComputeSharedMemorySize = 64 * 1024,
+ .maxComputeWorkGroupCount = { 65535, 65535, 65535 },
+ .maxComputeWorkGroupInvocations = max_workgroup_size,
+ .maxComputeWorkGroupSize = {
+ max_workgroup_size,
+ max_workgroup_size,
+ max_workgroup_size,
+ },
+ .subPixelPrecisionBits = 8,
+ .subTexelPrecisionBits = 8,
+ .mipmapPrecisionBits = 8,
+ .maxDrawIndexedIndexValue = UINT32_MAX,
+ .maxDrawIndirectCount = UINT32_MAX,
+ .maxSamplerLodBias = 16,
+ .maxSamplerAnisotropy = 16,
+ .maxViewports = MAX_VIEWPORTS,
+ .maxViewportDimensions = { (1 << 14), (1 << 14) },
+ .viewportBoundsRange = { INT16_MIN, INT16_MAX },
+ .viewportSubPixelBits = 13, /* We take a float? */
+ .minMemoryMapAlignment = 4096, /* A page */
+ /* The dataport requires texel alignment so we need to assume a worst
+ * case of R32G32B32A32 which is 16 bytes.
+ */
+ .minTexelBufferOffsetAlignment = 16,
+ .minUniformBufferOffsetAlignment = ANV_UBO_ALIGNMENT,
+ .minStorageBufferOffsetAlignment = ANV_SSBO_ALIGNMENT,
+ .minTexelOffset = -8,
+ .maxTexelOffset = 7,
+ .minTexelGatherOffset = -32,
+ .maxTexelGatherOffset = 31,
+ .minInterpolationOffset = -0.5,
+ .maxInterpolationOffset = 0.4375,
+ .subPixelInterpolationOffsetBits = 4,
+ .maxFramebufferWidth = (1 << 14),
+ .maxFramebufferHeight = (1 << 14),
+ .maxFramebufferLayers = (1 << 11),
+ .framebufferColorSampleCounts = sample_counts,
+ .framebufferDepthSampleCounts = sample_counts,
+ .framebufferStencilSampleCounts = sample_counts,
+ .framebufferNoAttachmentsSampleCounts = sample_counts,
+ .maxColorAttachments = MAX_RTS,
+ .sampledImageColorSampleCounts = sample_counts,
+ .sampledImageIntegerSampleCounts = sample_counts,
+ .sampledImageDepthSampleCounts = sample_counts,
+ .sampledImageStencilSampleCounts = sample_counts,
+ .storageImageSampleCounts = VK_SAMPLE_COUNT_1_BIT,
+ .maxSampleMaskWords = 1,
+ .timestampComputeAndGraphics = true,
+ .timestampPeriod = 1000000000.0 / devinfo->timestamp_frequency,
+ .maxClipDistances = 8,
+ .maxCullDistances = 8,
+ .maxCombinedClipAndCullDistances = 8,
+ .discreteQueuePriorities = 2,
+ .pointSizeRange = { 0.125, 255.875 },
+ /* While SKL and up support much wider lines than we are setting here,
+ * in practice we run into conformance issues if we go past this limit.
+ * Since the Windows driver does the same, it's probably fair to assume
+ * that no one needs more than this.
+ */
+ .lineWidthRange = { 0.0, 8.0 },
+ .pointSizeGranularity = (1.0 / 8.0),
+ .lineWidthGranularity = (1.0 / 128.0),
+ .strictLines = false,
+ .standardSampleLocations = true,
+ .optimalBufferCopyOffsetAlignment = 128,
+ .optimalBufferCopyRowPitchAlignment = 128,
+ .nonCoherentAtomSize = 64,
- for(int i = 0; i < mem_regions->num_regions; i++) {
- struct drm_i915_memory_region_info *info = &mem_regions->regions[i];
+ /* Sparse: */
+ .sparseResidencyStandard2DBlockShape = has_sparse_or_fake,
+ .sparseResidencyStandard2DMultisampleBlockShape = false,
+ .sparseResidencyStandard3DBlockShape = has_sparse_or_fake,
+ .sparseResidencyAlignedMipSize = false,
+ .sparseResidencyNonResidentStrict = has_sparse_or_fake,
- struct anv_memregion *region;
- switch (info->region.memory_class) {
- case I915_MEMORY_CLASS_SYSTEM:
- region = &device->sys;
- break;
- case I915_MEMORY_CLASS_DEVICE:
- region = &device->vram;
- break;
- default:
- /* We don't know what kind of memory this is */
- continue;
- }
+ /* VK_KHR_cooperative_matrix */
+ .cooperativeMatrixSupportedStages = VK_SHADER_STAGE_COMPUTE_BIT,
+ };
- uint64_t size = info->probed_size;
- if (info->region.memory_class == I915_MEMORY_CLASS_SYSTEM)
- size = anv_compute_sys_heap_size(device, size);
+ snprintf(props->deviceName, sizeof(props->deviceName),
+ "%s", pdevice->info.name);
+ memcpy(props->pipelineCacheUUID,
+ pdevice->pipeline_cache_uuid, VK_UUID_SIZE);
- uint64_t available = MIN2(size, info->unallocated_size);
+ get_properties_1_1(pdevice, props);
+ get_properties_1_2(pdevice, props);
+ get_properties_1_3(pdevice, props);
+
+ /* VK_KHR_acceleration_structure */
+ {
+ props->maxGeometryCount = (1u << 24) - 1;
+ props->maxInstanceCount = (1u << 24) - 1;
+ props->maxPrimitiveCount = (1u << 29) - 1;
+ props->maxPerStageDescriptorAccelerationStructures = UINT16_MAX;
+ props->maxPerStageDescriptorUpdateAfterBindAccelerationStructures = UINT16_MAX;
+ props->maxDescriptorSetAccelerationStructures = UINT16_MAX;
+ props->maxDescriptorSetUpdateAfterBindAccelerationStructures = UINT16_MAX;
+ props->minAccelerationStructureScratchOffsetAlignment = 64;
+ }
- if (update) {
- assert(region->region.memory_class == info->region.memory_class);
- assert(region->region.memory_instance == info->region.memory_instance);
- assert(region->size == size);
+ /* VK_KHR_fragment_shading_rate */
+ {
+ props->primitiveFragmentShadingRateWithMultipleViewports =
+ pdevice->info.has_coarse_pixel_primitive_and_cb;
+ props->layeredShadingRateAttachments =
+ pdevice->info.has_coarse_pixel_primitive_and_cb;
+ props->fragmentShadingRateNonTrivialCombinerOps =
+ pdevice->info.has_coarse_pixel_primitive_and_cb;
+ props->maxFragmentSize = (VkExtent2D) { 4, 4 };
+ props->maxFragmentSizeAspectRatio =
+ pdevice->info.has_coarse_pixel_primitive_and_cb ?
+ 2 : 4;
+ props->maxFragmentShadingRateCoverageSamples = 4 * 4 *
+ (pdevice->info.has_coarse_pixel_primitive_and_cb ? 4 : 16);
+ props->maxFragmentShadingRateRasterizationSamples =
+ pdevice->info.has_coarse_pixel_primitive_and_cb ?
+ VK_SAMPLE_COUNT_4_BIT : VK_SAMPLE_COUNT_16_BIT;
+ props->fragmentShadingRateWithShaderDepthStencilWrites = false;
+ props->fragmentShadingRateWithSampleMask = true;
+ props->fragmentShadingRateWithShaderSampleMask = false;
+ props->fragmentShadingRateWithConservativeRasterization = true;
+ props->fragmentShadingRateWithFragmentShaderInterlock = true;
+ props->fragmentShadingRateWithCustomSampleLocations = true;
+ props->fragmentShadingRateStrictMultiplyCombiner = true;
+
+ if (pdevice->info.has_coarse_pixel_primitive_and_cb) {
+ props->minFragmentShadingRateAttachmentTexelSize = (VkExtent2D) { 8, 8 };
+ props->maxFragmentShadingRateAttachmentTexelSize = (VkExtent2D) { 8, 8 };
+ props->maxFragmentShadingRateAttachmentTexelSizeAspectRatio = 1;
} else {
- region->region = info->region;
- region->size = size;
+ /* Those must be 0 if attachmentFragmentShadingRate is not supported. */
+ props->minFragmentShadingRateAttachmentTexelSize = (VkExtent2D) { 0, 0 };
+ props->maxFragmentShadingRateAttachmentTexelSize = (VkExtent2D) { 0, 0 };
+ props->maxFragmentShadingRateAttachmentTexelSizeAspectRatio = 0;
}
- region->available = available;
}
- if (mem_regions != (void *)sys_mem_regions)
- free(mem_regions);
+ /* VK_KHR_maintenance5 */
+ {
+ props->earlyFragmentMultisampleCoverageAfterSampleCounting = false;
+ props->earlyFragmentSampleMaskTestBeforeSampleCounting = false;
+ props->depthStencilSwizzleOneSupport = true;
+ props->polygonModePointSize = true;
+ props->nonStrictSinglePixelWideLinesUseParallelogram = false;
+ props->nonStrictWideLinesUseParallelogram = false;
+ }
- return VK_SUCCESS;
+ /* VK_KHR_maintenance6 */
+ {
+ props->blockTexelViewCompatibleMultipleLayers = true;
+ props->maxCombinedImageSamplerDescriptorCount = 3;
+ props->fragmentShadingRateClampCombinerInputs = true;
+ }
+
+ /* VK_KHR_performance_query */
+ {
+ props->allowCommandBufferQueryCopies = false;
+ }
+
+ /* VK_KHR_push_descriptor */
+ {
+ props->maxPushDescriptors = MAX_PUSH_DESCRIPTORS;
+ }
+
+ /* VK_KHR_ray_tracing_pipeline */
+ {
+ /* TODO */
+ props->shaderGroupHandleSize = 32;
+ props->maxRayRecursionDepth = 31;
+ /* MemRay::hitGroupSRStride is 16 bits */
+ props->maxShaderGroupStride = UINT16_MAX;
+ /* MemRay::hitGroupSRBasePtr requires 16B alignment */
+ props->shaderGroupBaseAlignment = 16;
+ props->shaderGroupHandleAlignment = 16;
+ props->shaderGroupHandleCaptureReplaySize = 32;
+ props->maxRayDispatchInvocationCount = 1U << 30; /* required min limit */
+ props->maxRayHitAttributeSize = BRW_RT_SIZEOF_HIT_ATTRIB_DATA;
+ }
+
+ /* VK_KHR_vertex_attribute_divisor */
+ {
+ props->maxVertexAttribDivisor = UINT32_MAX / 16;
+ props->supportsNonZeroFirstInstance = true;
+ }
+
+ /* VK_EXT_conservative_rasterization */
+ {
+ /* There's nothing in the public docs about this value as far as I can
+ * tell. However, this is the value the Windows driver reports and
+ * there's a comment on a rejected HW feature in the internal docs that
+ * says:
+ *
+ * "This is similar to conservative rasterization, except the
+ * primitive area is not extended by 1/512 and..."
+ *
+ * That's a bit of an obtuse reference but it's the best we've got for
+ * now.
+ */
+ props->primitiveOverestimationSize = 1.0f / 512.0f;
+ props->maxExtraPrimitiveOverestimationSize = 0.0f;
+ props->extraPrimitiveOverestimationSizeGranularity = 0.0f;
+ props->primitiveUnderestimation = false;
+ props->conservativePointAndLineRasterization = false;
+ props->degenerateTrianglesRasterized = true;
+ props->degenerateLinesRasterized = false;
+ props->fullyCoveredFragmentShaderInputVariable = false;
+ props->conservativeRasterizationPostDepthCoverage = true;
+ }
+
+ /* VK_EXT_custom_border_color */
+ {
+ props->maxCustomBorderColorSamplers = MAX_CUSTOM_BORDER_COLORS;
+ }
+
+ /* VK_EXT_descriptor_buffer */
+ {
+ props->combinedImageSamplerDescriptorSingleArray = true;
+ props->bufferlessPushDescriptors = true;
+ /* Written to the buffer before a timeline semaphore is signaled, but
+ * after vkQueueSubmit().
+ */
+ props->allowSamplerImageViewPostSubmitCreation = true;
+ props->descriptorBufferOffsetAlignment = ANV_SURFACE_STATE_SIZE;
+
+ if (pdevice->uses_ex_bso) {
+ props->maxDescriptorBufferBindings = MAX_SETS;
+ props->maxResourceDescriptorBufferBindings = MAX_SETS;
+ props->maxSamplerDescriptorBufferBindings = MAX_SETS;
+ props->maxEmbeddedImmutableSamplerBindings = MAX_SETS;
+ } else {
+ props->maxDescriptorBufferBindings = 3; /* resources, samplers, push (we don't care about push) */
+ props->maxResourceDescriptorBufferBindings = 1;
+ props->maxSamplerDescriptorBufferBindings = 1;
+ props->maxEmbeddedImmutableSamplerBindings = 1;
+ }
+ props->maxEmbeddedImmutableSamplers = MAX_EMBEDDED_SAMPLERS;
+
+ /* Storing a 64bit address */
+ props->bufferCaptureReplayDescriptorDataSize = 8;
+ props->imageCaptureReplayDescriptorDataSize = 8;
+ /* Offset inside the reserved border color pool */
+ props->samplerCaptureReplayDescriptorDataSize = 4;
+
+ /* Not affected by replay */
+ props->imageViewCaptureReplayDescriptorDataSize = 0;
+ /* The acceleration structure virtual address backing is coming from a
+ * buffer, so as long as that buffer is captured/replayed correctly we
+ * should always get the same address.
+ */
+ props->accelerationStructureCaptureReplayDescriptorDataSize = 0;
+
+ props->samplerDescriptorSize = ANV_SAMPLER_STATE_SIZE;
+ props->combinedImageSamplerDescriptorSize = align(ANV_SURFACE_STATE_SIZE + ANV_SAMPLER_STATE_SIZE,
+ ANV_SURFACE_STATE_SIZE);
+ props->sampledImageDescriptorSize = ANV_SURFACE_STATE_SIZE;
+ props->storageImageDescriptorSize = ANV_SURFACE_STATE_SIZE;
+ props->uniformTexelBufferDescriptorSize = ANV_SURFACE_STATE_SIZE;
+ props->robustUniformTexelBufferDescriptorSize = ANV_SURFACE_STATE_SIZE;
+ props->storageTexelBufferDescriptorSize = ANV_SURFACE_STATE_SIZE;
+ props->robustStorageTexelBufferDescriptorSize = ANV_SURFACE_STATE_SIZE;
+ props->uniformBufferDescriptorSize = ANV_SURFACE_STATE_SIZE;
+ props->robustUniformBufferDescriptorSize = ANV_SURFACE_STATE_SIZE;
+ props->storageBufferDescriptorSize = ANV_SURFACE_STATE_SIZE;
+ props->robustStorageBufferDescriptorSize = ANV_SURFACE_STATE_SIZE;
+ props->inputAttachmentDescriptorSize = ANV_SURFACE_STATE_SIZE;
+ props->accelerationStructureDescriptorSize = sizeof(struct anv_address_range_descriptor);
+ props->maxSamplerDescriptorBufferRange = pdevice->va.descriptor_buffer_pool.size;
+ props->maxResourceDescriptorBufferRange = anv_physical_device_bindless_heap_size(pdevice,
+ true);
+ props->resourceDescriptorBufferAddressSpaceSize = pdevice->va.descriptor_buffer_pool.size;
+ props->descriptorBufferAddressSpaceSize = pdevice->va.descriptor_buffer_pool.size;
+ props->samplerDescriptorBufferAddressSpaceSize = pdevice->va.descriptor_buffer_pool.size;
+ }
+
+ /* VK_EXT_extended_dynamic_state3 */
+ {
+ props->dynamicPrimitiveTopologyUnrestricted = true;
+ }
+
+ /* VK_EXT_external_memory_host */
+ {
+ props->minImportedHostPointerAlignment = 4096;
+ }
+
+ /* VK_EXT_graphics_pipeline_library */
+ {
+ props->graphicsPipelineLibraryFastLinking = true;
+ props->graphicsPipelineLibraryIndependentInterpolationDecoration = true;
+ }
+
+ /* VK_EXT_line_rasterization */
+ {
+ /* In the Skylake PRM Vol. 7, subsection titled "GIQ (Diamond) Sampling
+ * Rules - Legacy Mode", it says the following:
+ *
+ * "Note that the device divides a pixel into a 16x16 array of
+ * subpixels, referenced by their upper left corners."
+ *
+ * This is the only known reference in the PRMs to the subpixel
+ * precision of line rasterization and a "16x16 array of subpixels"
+ * implies 4 subpixel precision bits. Empirical testing has shown that 4
+ * subpixel precision bits applies to all line rasterization types.
+ */
+ props->lineSubPixelPrecisionBits = 4;
+ }
+
+ /* VK_EXT_map_memory_placed */
+ {
+ props->minPlacedMemoryMapAlignment = 4096;
+ }
+
+ /* VK_EXT_mesh_shader */
+ {
+ /* Bounded by the maximum representable size in
+ * 3DSTATE_MESH_SHADER_BODY::SharedLocalMemorySize. Same for Task.
+ */
+ const uint32_t max_slm_size = 64 * 1024;
+
+ /* Bounded by the maximum representable size in
+ * 3DSTATE_MESH_SHADER_BODY::LocalXMaximum. Same for Task.
+ */
+ const uint32_t max_workgroup_size = 1 << 10;
+
+ /* 3DMESH_3D limitation. */
+ const uint32_t max_threadgroup_count = 1 << 22;
+
+ /* 3DMESH_3D limitation. */
+ const uint32_t max_threadgroup_xyz = 65535;
+
+ const uint32_t max_urb_size = 64 * 1024;
+
+ props->maxTaskWorkGroupTotalCount = max_threadgroup_count;
+ props->maxTaskWorkGroupCount[0] = max_threadgroup_xyz;
+ props->maxTaskWorkGroupCount[1] = max_threadgroup_xyz;
+ props->maxTaskWorkGroupCount[2] = max_threadgroup_xyz;
+
+ props->maxTaskWorkGroupInvocations = max_workgroup_size;
+ props->maxTaskWorkGroupSize[0] = max_workgroup_size;
+ props->maxTaskWorkGroupSize[1] = max_workgroup_size;
+ props->maxTaskWorkGroupSize[2] = max_workgroup_size;
+
+ /* TUE header with padding */
+ const uint32_t task_payload_reserved = 32;
+
+ props->maxTaskPayloadSize = max_urb_size - task_payload_reserved;
+ props->maxTaskSharedMemorySize = max_slm_size;
+ props->maxTaskPayloadAndSharedMemorySize =
+ props->maxTaskPayloadSize +
+ props->maxTaskSharedMemorySize;
+
+ props->maxMeshWorkGroupTotalCount = max_threadgroup_count;
+ props->maxMeshWorkGroupCount[0] = max_threadgroup_xyz;
+ props->maxMeshWorkGroupCount[1] = max_threadgroup_xyz;
+ props->maxMeshWorkGroupCount[2] = max_threadgroup_xyz;
+
+ props->maxMeshWorkGroupInvocations = max_workgroup_size;
+ props->maxMeshWorkGroupSize[0] = max_workgroup_size;
+ props->maxMeshWorkGroupSize[1] = max_workgroup_size;
+ props->maxMeshWorkGroupSize[2] = max_workgroup_size;
+
+ props->maxMeshSharedMemorySize = max_slm_size;
+ props->maxMeshPayloadAndSharedMemorySize =
+ props->maxTaskPayloadSize +
+ props->maxMeshSharedMemorySize;
+
+      /* Unfortunately the spec's formula for the max output size doesn't match our hardware
+ * (because some per-primitive and per-vertex attributes have alignment restrictions),
+ * so we have to advertise the minimum value mandated by the spec to not overflow it.
+ */
+ props->maxMeshOutputPrimitives = 256;
+ props->maxMeshOutputVertices = 256;
+
+ /* NumPrim + Primitive Data List */
+ const uint32_t max_indices_memory =
+ ALIGN(sizeof(uint32_t) +
+ sizeof(uint32_t) * props->maxMeshOutputVertices, 32);
+
+ props->maxMeshOutputMemorySize = MIN2(max_urb_size - max_indices_memory, 32768);
+
+ props->maxMeshPayloadAndOutputMemorySize =
+ props->maxTaskPayloadSize +
+ props->maxMeshOutputMemorySize;
+
+ props->maxMeshOutputComponents = 128;
+
+ /* RTAIndex is 11-bits wide */
+ props->maxMeshOutputLayers = 1 << 11;
+
+ props->maxMeshMultiviewViewCount = 1;
+
+ /* Elements in Vertex Data Array must be aligned to 32 bytes (8 dwords). */
+ props->meshOutputPerVertexGranularity = 8;
+ /* Elements in Primitive Data Array must be aligned to 32 bytes (8 dwords). */
+ props->meshOutputPerPrimitiveGranularity = 8;
+
+ /* SIMD16 */
+ props->maxPreferredTaskWorkGroupInvocations = 16;
+ props->maxPreferredMeshWorkGroupInvocations = 16;
+
+ props->prefersLocalInvocationVertexOutput = false;
+ props->prefersLocalInvocationPrimitiveOutput = false;
+ props->prefersCompactVertexOutput = false;
+ props->prefersCompactPrimitiveOutput = false;
+
+ /* Spec minimum values */
+ assert(props->maxTaskWorkGroupTotalCount >= (1U << 22));
+ assert(props->maxTaskWorkGroupCount[0] >= 65535);
+ assert(props->maxTaskWorkGroupCount[1] >= 65535);
+ assert(props->maxTaskWorkGroupCount[2] >= 65535);
+
+ assert(props->maxTaskWorkGroupInvocations >= 128);
+ assert(props->maxTaskWorkGroupSize[0] >= 128);
+ assert(props->maxTaskWorkGroupSize[1] >= 128);
+ assert(props->maxTaskWorkGroupSize[2] >= 128);
+
+ assert(props->maxTaskPayloadSize >= 16384);
+ assert(props->maxTaskSharedMemorySize >= 32768);
+ assert(props->maxTaskPayloadAndSharedMemorySize >= 32768);
+
+
+ assert(props->maxMeshWorkGroupTotalCount >= (1U << 22));
+ assert(props->maxMeshWorkGroupCount[0] >= 65535);
+ assert(props->maxMeshWorkGroupCount[1] >= 65535);
+ assert(props->maxMeshWorkGroupCount[2] >= 65535);
+
+ assert(props->maxMeshWorkGroupInvocations >= 128);
+ assert(props->maxMeshWorkGroupSize[0] >= 128);
+ assert(props->maxMeshWorkGroupSize[1] >= 128);
+ assert(props->maxMeshWorkGroupSize[2] >= 128);
+
+ assert(props->maxMeshSharedMemorySize >= 28672);
+ assert(props->maxMeshPayloadAndSharedMemorySize >= 28672);
+ assert(props->maxMeshOutputMemorySize >= 32768);
+ assert(props->maxMeshPayloadAndOutputMemorySize >= 48128);
+
+ assert(props->maxMeshOutputComponents >= 128);
+
+ assert(props->maxMeshOutputVertices >= 256);
+ assert(props->maxMeshOutputPrimitives >= 256);
+ assert(props->maxMeshOutputLayers >= 8);
+ assert(props->maxMeshMultiviewViewCount >= 1);
+ }
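/*
 * Worked numbers for the mesh output size computation above, using only the
 * constants from this block (64 KiB URB, 256 output vertices, 32-byte
 * reserved TUE header):
 *
 *   max_indices_memory = ALIGN(4 + 4 * 256, 32) = ALIGN(1028, 32) = 1056
 *   maxMeshOutputMemorySize = MIN2(64 * 1024 - 1056, 32768)
 *                           = MIN2(64480, 32768) = 32768
 *   maxTaskPayloadSize = 64 * 1024 - 32 = 65504
 *   maxMeshPayloadAndOutputMemorySize = 65504 + 32768 = 98272
 *
 * which satisfies the spec minimums asserted at the end of the block
 * (32768 and 48128 respectively).
 */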
+
+ /* VK_EXT_multi_draw */
+ {
+ props->maxMultiDrawCount = 2048;
+ }
+
+ /* VK_EXT_nested_command_buffer */
+ {
+ props->maxCommandBufferNestingLevel = UINT32_MAX;
+ }
+
+ /* VK_EXT_pci_bus_info */
+ {
+ props->pciDomain = pdevice->info.pci_domain;
+ props->pciBus = pdevice->info.pci_bus;
+ props->pciDevice = pdevice->info.pci_dev;
+ props->pciFunction = pdevice->info.pci_func;
+ }
+
+ /* VK_EXT_physical_device_drm */
+ {
+ props->drmHasPrimary = pdevice->has_master;
+ props->drmPrimaryMajor = pdevice->master_major;
+ props->drmPrimaryMinor = pdevice->master_minor;
+ props->drmHasRender = pdevice->has_local;
+ props->drmRenderMajor = pdevice->local_major;
+ props->drmRenderMinor = pdevice->local_minor;
+ }
+
+ /* VK_EXT_pipeline_robustness */
+ {
+ props->defaultRobustnessStorageBuffers =
+ VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_DISABLED_EXT;
+ props->defaultRobustnessUniformBuffers =
+ VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_DISABLED_EXT;
+ props->defaultRobustnessVertexInputs =
+ VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_ROBUST_BUFFER_ACCESS_2_EXT;
+ props->defaultRobustnessImages =
+ VK_PIPELINE_ROBUSTNESS_IMAGE_BEHAVIOR_ROBUST_IMAGE_ACCESS_2_EXT;
+ }
+
+ /* VK_EXT_provoking_vertex */
+ {
+ props->provokingVertexModePerPipeline = true;
+ props->transformFeedbackPreservesTriangleFanProvokingVertex = false;
+ }
+
+ /* VK_EXT_robustness2 */
+ {
+ props->robustStorageBufferAccessSizeAlignment =
+ ANV_SSBO_BOUNDS_CHECK_ALIGNMENT;
+ props->robustUniformBufferAccessSizeAlignment =
+ ANV_UBO_ALIGNMENT;
+ }
+
+ /* VK_EXT_sample_locations */
+ {
+ props->sampleLocationSampleCounts =
+ isl_device_get_sample_counts(&pdevice->isl_dev);
+
+ /* See also anv_GetPhysicalDeviceMultisamplePropertiesEXT */
+ props->maxSampleLocationGridSize.width = 1;
+ props->maxSampleLocationGridSize.height = 1;
+
+ props->sampleLocationCoordinateRange[0] = 0;
+ props->sampleLocationCoordinateRange[1] = 0.9375;
+ props->sampleLocationSubPixelBits = 4;
+
+ props->variableSampleLocations = true;
+ }
+
+ /* VK_EXT_shader_module_identifier */
+ {
+ STATIC_ASSERT(sizeof(vk_shaderModuleIdentifierAlgorithmUUID) ==
+ sizeof(props->shaderModuleIdentifierAlgorithmUUID));
+ memcpy(props->shaderModuleIdentifierAlgorithmUUID,
+ vk_shaderModuleIdentifierAlgorithmUUID,
+ sizeof(props->shaderModuleIdentifierAlgorithmUUID));
+ }
+
+ /* VK_EXT_transform_feedback */
+ {
+ props->maxTransformFeedbackStreams = MAX_XFB_STREAMS;
+ props->maxTransformFeedbackBuffers = MAX_XFB_BUFFERS;
+ props->maxTransformFeedbackBufferSize = (1ull << 32);
+ props->maxTransformFeedbackStreamDataSize = 128 * 4;
+ props->maxTransformFeedbackBufferDataSize = 128 * 4;
+ props->maxTransformFeedbackBufferDataStride = 2048;
+ props->transformFeedbackQueries = true;
+ props->transformFeedbackStreamsLinesTriangles = false;
+ props->transformFeedbackRasterizationStreamSelect = false;
+ props->transformFeedbackDraw = true;
+ }
}
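For context, the limits filled in above are read back by applications through the standard pNext chain; a minimal consumer-side sketch (not part of this patch, assuming only a valid VkPhysicalDevice and Vulkan 1.1 headers) for the VK_EXT_transform_feedback block could look like:

#include <vulkan/vulkan.h>
#include <stdio.h>

/* Illustrative only: reads the transform feedback limits reported by the
 * property code above, via the standard pNext chain. */
static void
print_xfb_limits(VkPhysicalDevice pdev)
{
   VkPhysicalDeviceTransformFeedbackPropertiesEXT xfb_props = {
      .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_TRANSFORM_FEEDBACK_PROPERTIES_EXT,
   };
   VkPhysicalDeviceProperties2 props2 = {
      .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROPERTIES_2,
      .pNext = &xfb_props,
   };

   vkGetPhysicalDeviceProperties2(pdev, &props2);

   printf("XFB streams: %u, buffers: %u\n",
          xfb_props.maxTransformFeedbackStreams,
          xfb_props.maxTransformFeedbackBuffers);
}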
static VkResult MUST_CHECK
anv_init_meminfo(struct anv_physical_device *device, int fd)
{
- return anv_gather_meminfo(device, fd, false);
+ const struct intel_device_info *devinfo = &device->info;
+
+ device->sys.region = &devinfo->mem.sram.mem;
+ device->sys.size = devinfo->mem.sram.mappable.size;
+ device->sys.available = devinfo->mem.sram.mappable.free;
+
+ device->vram_mappable.region = &devinfo->mem.vram.mem;
+ device->vram_mappable.size = devinfo->mem.vram.mappable.size;
+ device->vram_mappable.available = devinfo->mem.vram.mappable.free;
+
+ device->vram_non_mappable.region = &devinfo->mem.vram.mem;
+ device->vram_non_mappable.size = devinfo->mem.vram.unmappable.size;
+ device->vram_non_mappable.available = devinfo->mem.vram.unmappable.free;
+
+ return VK_SUCCESS;
}
static void
anv_update_meminfo(struct anv_physical_device *device, int fd)
{
- ASSERTED VkResult result = anv_gather_meminfo(device, fd, true);
- assert(result == VK_SUCCESS);
-}
+ if (!intel_device_info_update_memory_info(&device->info, fd))
+ return;
+ const struct intel_device_info *devinfo = &device->info;
+ device->sys.available = devinfo->mem.sram.mappable.free;
+ device->vram_mappable.available = devinfo->mem.vram.mappable.free;
+ device->vram_non_mappable.available = devinfo->mem.vram.unmappable.free;
+}
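The available values refreshed here are the driver's per-region view of free memory; on the application side this kind of information is typically surfaced through VK_EXT_memory_budget. A minimal illustrative query (not part of this patch, assuming the extension is supported on the device) could be:

#include <vulkan/vulkan.h>
#include <stdio.h>

/* Illustrative only: query per-heap budget/usage via VK_EXT_memory_budget. */
static void
print_memory_budget(VkPhysicalDevice pdev)
{
   VkPhysicalDeviceMemoryBudgetPropertiesEXT budget = {
      .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MEMORY_BUDGET_PROPERTIES_EXT,
   };
   VkPhysicalDeviceMemoryProperties2 mem_props = {
      .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MEMORY_PROPERTIES_2,
      .pNext = &budget,
   };

   vkGetPhysicalDeviceMemoryProperties2(pdev, &mem_props);

   for (uint32_t i = 0; i < mem_props.memoryProperties.memoryHeapCount; i++) {
      printf("heap %u: budget %llu, usage %llu\n", i,
             (unsigned long long)budget.heapBudget[i],
             (unsigned long long)budget.heapUsage[i]);
   }
}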
static VkResult
anv_physical_device_init_heaps(struct anv_physical_device *device, int fd)
{
- if (anv_gem_get_context_param(fd, 0, I915_CONTEXT_PARAM_GTT_SIZE,
- &device->gtt_size) == -1) {
- /* If, for whatever reason, we can't actually get the GTT size from the
- * kernel (too old?) fall back to the aperture size.
- */
- anv_perf_warn(NULL, NULL,
- "Failed to get I915_CONTEXT_PARAM_GTT_SIZE: %m");
-
- if (intel_get_aperture_size(fd, &device->gtt_size) == -1) {
- return vk_errorfi(device->instance, NULL,
- VK_ERROR_INITIALIZATION_FAILED,
- "failed to get aperture size: %m");
- }
- }
-
- /* We only allow 48-bit addresses with softpin because knowing the actual
- * address is required for the vertex cache flush workaround.
- */
- device->supports_48bit_addresses = (device->info.ver >= 8) &&
- device->gtt_size > (4ULL << 30 /* GiB */);
-
VkResult result = anv_init_meminfo(device, fd);
if (result != VK_SUCCESS)
return result;
assert(device->sys.size != 0);
- if (device->vram.size > 0) {
- /* We can create 2 different heaps when we have local memory support,
- * first heap with local memory size and second with system memory size.
+ if (anv_physical_device_has_vram(device)) {
+      /* We can create 2 or 3 different heaps when we have local memory
+       * support: the first heap has the local memory size, the second the
+       * system memory size, and a third is added only if part of the vram
+       * is mappable to the host.
*/
device->memory.heap_count = 2;
device->memory.heaps[0] = (struct anv_memory_heap) {
- .size = device->vram.size,
+         /* If there is a vram_non_mappable, use that for the device-only
+ * heap. Otherwise use the vram_mappable.
+ */
+ .size = device->vram_non_mappable.size != 0 ?
+ device->vram_non_mappable.size : device->vram_mappable.size,
.flags = VK_MEMORY_HEAP_DEVICE_LOCAL_BIT,
.is_local_mem = true,
};
@@ -475,43 +1916,17 @@ anv_physical_device_init_heaps(struct anv_physical_device *device, int fd)
.flags = 0,
.is_local_mem = false,
};
-
- device->memory.type_count = 3;
- device->memory.types[0] = (struct anv_memory_type) {
- .propertyFlags = VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT,
- .heapIndex = 0,
- };
- device->memory.types[1] = (struct anv_memory_type) {
- .propertyFlags = VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT |
- VK_MEMORY_PROPERTY_HOST_COHERENT_BIT |
- VK_MEMORY_PROPERTY_HOST_CACHED_BIT,
- .heapIndex = 1,
- };
- device->memory.types[2] = (struct anv_memory_type) {
- .propertyFlags = VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT |
- VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT |
- VK_MEMORY_PROPERTY_HOST_COHERENT_BIT,
- .heapIndex = 0,
- };
- } else if (device->info.has_llc) {
- device->memory.heap_count = 1;
- device->memory.heaps[0] = (struct anv_memory_heap) {
- .size = device->sys.size,
- .flags = VK_MEMORY_HEAP_DEVICE_LOCAL_BIT,
- .is_local_mem = false,
- };
-
- /* Big core GPUs share LLC with the CPU and thus one memory type can be
- * both cached and coherent at the same time.
+ /* Add an additional smaller vram mappable heap if we can't map all the
+ * vram to the host.
*/
- device->memory.type_count = 1;
- device->memory.types[0] = (struct anv_memory_type) {
- .propertyFlags = VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT |
- VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT |
- VK_MEMORY_PROPERTY_HOST_COHERENT_BIT |
- VK_MEMORY_PROPERTY_HOST_CACHED_BIT,
- .heapIndex = 0,
- };
+ if (device->vram_non_mappable.size > 0) {
+ device->memory.heap_count++;
+ device->memory.heaps[2] = (struct anv_memory_heap) {
+ .size = device->vram_mappable.size,
+ .flags = VK_MEMORY_HEAP_DEVICE_LOCAL_BIT,
+ .is_local_mem = true,
+ };
+ }
} else {
device->memory.heap_count = 1;
device->memory.heaps[0] = (struct anv_memory_heap) {
@@ -519,33 +1934,60 @@ anv_physical_device_init_heaps(struct anv_physical_device *device, int fd)
.flags = VK_MEMORY_HEAP_DEVICE_LOCAL_BIT,
.is_local_mem = false,
};
+ }
- /* The spec requires that we expose a host-visible, coherent memory
- * type, but Atom GPUs don't share LLC. Thus we offer two memory types
- * to give the application a choice between cached, but not coherent and
- * coherent but uncached (WC though).
- */
- device->memory.type_count = 2;
- device->memory.types[0] = (struct anv_memory_type) {
- .propertyFlags = VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT |
- VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT |
- VK_MEMORY_PROPERTY_HOST_CACHED_BIT,
- .heapIndex = 0,
- };
- device->memory.types[1] = (struct anv_memory_type) {
- .propertyFlags = VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT |
- VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT |
- VK_MEMORY_PROPERTY_HOST_COHERENT_BIT,
- .heapIndex = 0,
- };
+ switch (device->info.kmd_type) {
+ case INTEL_KMD_TYPE_XE:
+ result = anv_xe_physical_device_init_memory_types(device);
+ break;
+ case INTEL_KMD_TYPE_I915:
+ default:
+ result = anv_i915_physical_device_init_memory_types(device);
+ break;
+ }
+
+ if (result != VK_SUCCESS)
+ return result;
+
+   /* Replicate all non-protected memory types for descriptor buffers so that
+    * we can identify those memory allocations and place them in the right
+    * memory heap.
+ */
+ device->memory.default_buffer_mem_types =
+ BITFIELD_RANGE(0, device->memory.type_count);
+ device->memory.protected_mem_types = 0;
+ device->memory.desc_buffer_mem_types = 0;
+
+ uint32_t base_types_count = device->memory.type_count;
+ for (int i = 0; i < base_types_count; i++) {
+ if (device->memory.types[i].propertyFlags &
+ VK_MEMORY_PROPERTY_PROTECTED_BIT) {
+ device->memory.protected_mem_types |= BITFIELD_BIT(i);
+ device->memory.default_buffer_mem_types &= (~BITFIELD_BIT(i));
+ continue;
+ }
+
+ assert(device->memory.type_count < ARRAY_SIZE(device->memory.types));
+
+ device->memory.desc_buffer_mem_types |=
+ BITFIELD_BIT(device->memory.type_count);
+
+ struct anv_memory_type *new_type =
+ &device->memory.types[device->memory.type_count++];
+ *new_type = device->memory.types[i];
+ new_type->descriptor_buffer = true;
}
- device->memory.need_clflush = false;
for (unsigned i = 0; i < device->memory.type_count; i++) {
VkMemoryPropertyFlags props = device->memory.types[i].propertyFlags;
if ((props & VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT) &&
!(props & VK_MEMORY_PROPERTY_HOST_COHERENT_BIT))
- device->memory.need_clflush = true;
+#ifdef SUPPORT_INTEL_INTEGRATED_GPUS
+ device->memory.need_flush = true;
+#else
+ return vk_errorf(device, VK_ERROR_INITIALIZATION_FAILED,
+ "Memory configuration requires flushing, but it's not implemented for this architecture");
+#endif
}
return VK_SUCCESS;
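The descriptor-buffer replication loop above maintains three bit masks over the memory type array; the following standalone sketch (with local stand-ins for Mesa's BITFIELD_* helpers so it builds outside the tree) walks the same bookkeeping over a toy three-type layout:

#include <stdint.h>
#include <stdbool.h>
#include <stdio.h>

/* Local stand-ins for Mesa's BITFIELD_* helpers, for illustration only. */
#define BIT(b)        (1u << (b))
#define BIT_RANGE(n)  ((n) >= 32 ? ~0u : (BIT(n) - 1u))

struct toy_type { bool is_protected; bool descriptor_buffer; };

int
main(void)
{
   /* Three base types; index 1 is protected. */
   struct toy_type types[8] = { {false}, {true}, {false} };
   uint32_t count = 3;

   uint32_t default_mask = BIT_RANGE(count);   /* 0b111 */
   uint32_t protected_mask = 0, desc_buffer_mask = 0;
   uint32_t base_count = count;

   for (uint32_t i = 0; i < base_count; i++) {
      if (types[i].is_protected) {
         protected_mask |= BIT(i);
         default_mask &= ~BIT(i);
         continue;
      }
      /* Duplicate the type and flag the copy for descriptor buffers. */
      desc_buffer_mask |= BIT(count);
      types[count] = types[i];
      types[count++].descriptor_buffer = true;
   }

   /* Prints: default=0x5 protected=0x2 desc_buffer=0x18 count=5 */
   printf("default=0x%x protected=0x%x desc_buffer=0x%x count=%u\n",
          default_mask, protected_mask, desc_buffer_mask, count);
   return 0;
}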
@@ -557,16 +1999,14 @@ anv_physical_device_init_uuids(struct anv_physical_device *device)
const struct build_id_note *note =
build_id_find_nhdr_for_addr(anv_physical_device_init_uuids);
if (!note) {
- return vk_errorfi(device->instance, NULL,
- VK_ERROR_INITIALIZATION_FAILED,
- "Failed to find build-id");
+ return vk_errorf(device, VK_ERROR_INITIALIZATION_FAILED,
+ "Failed to find build-id");
}
unsigned build_id_len = build_id_length(note);
if (build_id_len < 20) {
- return vk_errorfi(device->instance, NULL,
- VK_ERROR_INITIALIZATION_FAILED,
- "build-id too short. It needs to be a SHA");
+ return vk_errorf(device, VK_ERROR_INITIALIZATION_FAILED,
+ "build-id too short. It needs to be a SHA");
}
memcpy(device->driver_build_sha1, build_id_data(note), 20);
@@ -580,21 +2020,14 @@ anv_physical_device_init_uuids(struct anv_physical_device *device)
*/
_mesa_sha1_init(&sha1_ctx);
_mesa_sha1_update(&sha1_ctx, build_id_data(note), build_id_len);
- _mesa_sha1_update(&sha1_ctx, &device->info.chipset_id,
- sizeof(device->info.chipset_id));
+ brw_device_sha1_update(&sha1_ctx, &device->info);
_mesa_sha1_update(&sha1_ctx, &device->always_use_bindless,
sizeof(device->always_use_bindless));
- _mesa_sha1_update(&sha1_ctx, &device->has_a64_buffer_access,
- sizeof(device->has_a64_buffer_access));
- _mesa_sha1_update(&sha1_ctx, &device->has_bindless_images,
- sizeof(device->has_bindless_images));
- _mesa_sha1_update(&sha1_ctx, &device->has_bindless_samplers,
- sizeof(device->has_bindless_samplers));
_mesa_sha1_final(&sha1_ctx, sha1);
memcpy(device->pipeline_cache_uuid, sha1, VK_UUID_SIZE);
intel_uuid_compute_driver_id(device->driver_uuid, &device->info, VK_UUID_SIZE);
- intel_uuid_compute_device_id(device->device_uuid, &device->isl_dev, VK_UUID_SIZE);
+ intel_uuid_compute_device_id(device->device_uuid, &device->info, VK_UUID_SIZE);
return VK_SUCCESS;
}
@@ -605,7 +2038,7 @@ anv_physical_device_init_disk_cache(struct anv_physical_device *device)
#ifdef ENABLE_SHADER_CACHE
char renderer[10];
ASSERTED int len = snprintf(renderer, sizeof(renderer), "anv_%04x",
- device->info.chipset_id);
+ device->info.pci_device_id);
assert(len == sizeof(renderer) - 2);
char timestamp[41];
@@ -613,9 +2046,7 @@ anv_physical_device_init_disk_cache(struct anv_physical_device *device)
const uint64_t driver_flags =
brw_get_compiler_config_value(device->compiler);
- device->disk_cache = disk_cache_create(renderer, timestamp, driver_flags);
-#else
- device->disk_cache = NULL;
+ device->vk.disk_cache = disk_cache_create(renderer, timestamp, driver_flags);
#endif
}
@@ -623,10 +2054,12 @@ static void
anv_physical_device_free_disk_cache(struct anv_physical_device *device)
{
#ifdef ENABLE_SHADER_CACHE
- if (device->disk_cache)
- disk_cache_destroy(device->disk_cache);
+ if (device->vk.disk_cache) {
+ disk_cache_destroy(device->vk.disk_cache);
+ device->vk.disk_cache = NULL;
+ }
#else
- assert(device->disk_cache == NULL);
+ assert(device->vk.disk_cache == NULL);
#endif
}
@@ -637,6 +2070,7 @@ anv_physical_device_free_disk_cache(struct anv_physical_device *device)
* * "gc" is for graphics queues with compute support
* * "g" is for graphics queues with no compute support
* * "c" is for compute queues with no graphics support
+ * * "v" is for video queues with no graphics support
*
* For example, ANV_QUEUE_OVERRIDE=gc=2,c=1 would override the number of
* advertised queues to be 2 queues with graphics+compute support, and 1 queue
@@ -651,11 +2085,12 @@ anv_physical_device_free_disk_cache(struct anv_physical_device *device)
* number of graphics+compute queues to be 0.
*/
static void
-anv_override_engine_counts(int *gc_count, int *g_count, int *c_count)
+anv_override_engine_counts(int *gc_count, int *g_count, int *c_count, int *v_count)
{
int gc_override = -1;
int g_override = -1;
int c_override = -1;
+ int v_override = -1;
char *env = getenv("ANV_QUEUE_OVERRIDE");
if (env == NULL)
@@ -671,6 +2106,8 @@ anv_override_engine_counts(int *gc_count, int *g_count, int *c_count)
g_override = strtol(next + 2, NULL, 0);
} else if (strncmp(next, "c=", 2) == 0) {
c_override = strtol(next + 2, NULL, 0);
+ } else if (strncmp(next, "v=", 2) == 0) {
+ v_override = strtol(next + 2, NULL, 0);
} else {
mesa_logw("Ignoring unsupported ANV_QUEUE_OVERRIDE token: %s", next);
}
@@ -686,58 +2123,119 @@ anv_override_engine_counts(int *gc_count, int *g_count, int *c_count)
"Vulkan specification");
if (c_override >= 0)
*c_count = c_override;
+ if (v_override >= 0)
+ *v_count = v_override;
}
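The new v= token follows the existing ANV_QUEUE_OVERRIDE grammar; purely as an illustration (not part of the change), a test harness could pin the advertised queue layout before instance creation like so:

#include <stdlib.h>

/* Illustrative only: advertise 2 graphics+compute queues, 1 compute-only
 * queue and no video queues, regardless of what the kernel exposes.
 * Must be set before the Vulkan instance/physical device is created. */
static void
force_anv_queue_layout(void)
{
   setenv("ANV_QUEUE_OVERRIDE", "gc=2,c=1,v=0", 1 /* overwrite */);
}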
static void
anv_physical_device_init_queue_families(struct anv_physical_device *pdevice)
{
uint32_t family_count = 0;
+ VkQueueFlags sparse_flags = pdevice->sparse_type != ANV_SPARSE_TYPE_NOT_SUPPORTED ?
+ VK_QUEUE_SPARSE_BINDING_BIT : 0;
+ VkQueueFlags protected_flag = pdevice->has_protected_contexts ?
+ VK_QUEUE_PROTECTED_BIT : 0;
if (pdevice->engine_info) {
int gc_count =
- anv_gem_count_engines(pdevice->engine_info, I915_ENGINE_CLASS_RENDER);
+ intel_engines_count(pdevice->engine_info,
+ INTEL_ENGINE_CLASS_RENDER);
+ int v_count =
+ intel_engines_count(pdevice->engine_info, INTEL_ENGINE_CLASS_VIDEO);
int g_count = 0;
int c_count = 0;
+ const bool kernel_supports_non_render_engines = pdevice->has_vm_control;
+ const bool sparse_supports_non_render_engines =
+ pdevice->sparse_type != ANV_SPARSE_TYPE_TRTT;
+ const bool can_use_non_render_engines =
+ kernel_supports_non_render_engines &&
+ sparse_supports_non_render_engines;
- anv_override_engine_counts(&gc_count, &g_count, &c_count);
+ if (can_use_non_render_engines) {
+ c_count = intel_engines_supported_count(pdevice->local_fd,
+ &pdevice->info,
+ pdevice->engine_info,
+ INTEL_ENGINE_CLASS_COMPUTE);
+ }
+ enum intel_engine_class compute_class =
+ c_count < 1 ? INTEL_ENGINE_CLASS_RENDER : INTEL_ENGINE_CLASS_COMPUTE;
+
+ int blit_count = 0;
+ if (pdevice->info.verx10 >= 125 && can_use_non_render_engines) {
+ blit_count = intel_engines_supported_count(pdevice->local_fd,
+ &pdevice->info,
+ pdevice->engine_info,
+ INTEL_ENGINE_CLASS_COPY);
+ }
+
+ anv_override_engine_counts(&gc_count, &g_count, &c_count, &v_count);
if (gc_count > 0) {
pdevice->queue.families[family_count++] = (struct anv_queue_family) {
.queueFlags = VK_QUEUE_GRAPHICS_BIT |
VK_QUEUE_COMPUTE_BIT |
- VK_QUEUE_TRANSFER_BIT,
+ VK_QUEUE_TRANSFER_BIT |
+ sparse_flags |
+ protected_flag,
.queueCount = gc_count,
- .engine_class = I915_ENGINE_CLASS_RENDER,
+ .engine_class = INTEL_ENGINE_CLASS_RENDER,
};
}
if (g_count > 0) {
pdevice->queue.families[family_count++] = (struct anv_queue_family) {
.queueFlags = VK_QUEUE_GRAPHICS_BIT |
- VK_QUEUE_TRANSFER_BIT,
+ VK_QUEUE_TRANSFER_BIT |
+ sparse_flags |
+ protected_flag,
.queueCount = g_count,
- .engine_class = I915_ENGINE_CLASS_RENDER,
+ .engine_class = INTEL_ENGINE_CLASS_RENDER,
};
}
if (c_count > 0) {
pdevice->queue.families[family_count++] = (struct anv_queue_family) {
.queueFlags = VK_QUEUE_COMPUTE_BIT |
- VK_QUEUE_TRANSFER_BIT,
+ VK_QUEUE_TRANSFER_BIT |
+ sparse_flags |
+ protected_flag,
.queueCount = c_count,
- .engine_class = I915_ENGINE_CLASS_RENDER,
+ .engine_class = compute_class,
+ };
+ }
+ if (v_count > 0 && pdevice->video_decode_enabled) {
+ /* HEVC support on Gfx9 is only available on VCS0. So limit the number of video queues
+ * to the first VCS engine instance.
+ *
+ * We should be able to query HEVC support from the kernel using the engine query uAPI,
+       * but this appears to be broken:
+       * https://gitlab.freedesktop.org/drm/intel/-/issues/8832
+       *
+       * When this bug is fixed, we should be able to check HEVC support to determine the
+ * correct number of queues.
+ */
+ /* TODO: enable protected content on video queue */
+ pdevice->queue.families[family_count++] = (struct anv_queue_family) {
+ .queueFlags = VK_QUEUE_VIDEO_DECODE_BIT_KHR,
+ .queueCount = pdevice->info.ver == 9 ? MIN2(1, v_count) : v_count,
+ .engine_class = INTEL_ENGINE_CLASS_VIDEO,
+ };
+ }
+ if (blit_count > 0) {
+ pdevice->queue.families[family_count++] = (struct anv_queue_family) {
+ .queueFlags = VK_QUEUE_TRANSFER_BIT |
+ protected_flag,
+ .queueCount = blit_count,
+ .engine_class = INTEL_ENGINE_CLASS_COPY,
};
}
- /* Increase count below when other families are added as a reminder to
- * increase the ANV_MAX_QUEUE_FAMILIES value.
- */
- STATIC_ASSERT(ANV_MAX_QUEUE_FAMILIES >= 3);
} else {
/* Default to a single render queue */
pdevice->queue.families[family_count++] = (struct anv_queue_family) {
.queueFlags = VK_QUEUE_GRAPHICS_BIT |
VK_QUEUE_COMPUTE_BIT |
- VK_QUEUE_TRANSFER_BIT,
+ VK_QUEUE_TRANSFER_BIT |
+ sparse_flags,
.queueCount = 1,
- .engine_class = I915_ENGINE_CLASS_RENDER,
+ .engine_class = INTEL_ENGINE_CLASS_RENDER,
};
family_count = 1;
}
@@ -746,45 +2244,79 @@ anv_physical_device_init_queue_families(struct anv_physical_device *pdevice)
}
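The video queue families advertised here (behind ANV_VIDEO_DECODE) are discovered by applications with the standard queue family query; a minimal sketch of the consumer side, assuming only a valid VkPhysicalDevice:

#include <vulkan/vulkan.h>

/* Illustrative only: return the first queue family exposing video decode,
 * or -1 if the driver does not advertise one. */
static int
find_video_decode_family(VkPhysicalDevice pdev)
{
   uint32_t count = 0;
   vkGetPhysicalDeviceQueueFamilyProperties2(pdev, &count, NULL);

   VkQueueFamilyProperties2 props[16];
   if (count > 16)
      count = 16;
   for (uint32_t i = 0; i < count; i++) {
      props[i] = (VkQueueFamilyProperties2) {
         .sType = VK_STRUCTURE_TYPE_QUEUE_FAMILY_PROPERTIES_2,
      };
   }
   vkGetPhysicalDeviceQueueFamilyProperties2(pdev, &count, props);

   for (uint32_t i = 0; i < count; i++) {
      if (props[i].queueFamilyProperties.queueFlags &
          VK_QUEUE_VIDEO_DECODE_BIT_KHR)
         return (int)i;
   }
   return -1;
}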
static VkResult
-anv_physical_device_try_create(struct anv_instance *instance,
- drmDevicePtr drm_device,
- struct anv_physical_device **device_out)
+anv_physical_device_get_parameters(struct anv_physical_device *device)
+{
+ switch (device->info.kmd_type) {
+ case INTEL_KMD_TYPE_I915:
+ return anv_i915_physical_device_get_parameters(device);
+ case INTEL_KMD_TYPE_XE:
+ return anv_xe_physical_device_get_parameters(device);
+ default:
+ unreachable("Missing");
+ return VK_ERROR_UNKNOWN;
+ }
+}
+
+static VkResult
+anv_physical_device_try_create(struct vk_instance *vk_instance,
+ struct _drmDevice *drm_device,
+ struct vk_physical_device **out)
{
+ struct anv_instance *instance =
+ container_of(vk_instance, struct anv_instance, vk);
+
+ if (!(drm_device->available_nodes & (1 << DRM_NODE_RENDER)) ||
+ drm_device->bustype != DRM_BUS_PCI ||
+ drm_device->deviceinfo.pci->vendor_id != 0x8086)
+ return VK_ERROR_INCOMPATIBLE_DRIVER;
+
const char *primary_path = drm_device->nodes[DRM_NODE_PRIMARY];
const char *path = drm_device->nodes[DRM_NODE_RENDER];
VkResult result;
int fd;
int master_fd = -1;
- brw_process_intel_debug_variable();
+ process_intel_debug_variable();
fd = open(path, O_RDWR | O_CLOEXEC);
if (fd < 0) {
if (errno == ENOMEM) {
- return vk_errorfi(instance, NULL, VK_ERROR_OUT_OF_HOST_MEMORY,
- "Unable to open device %s: out of memory", path);
+ return vk_errorf(instance, VK_ERROR_OUT_OF_HOST_MEMORY,
+ "Unable to open device %s: out of memory", path);
}
- return vk_errorfi(instance, NULL, VK_ERROR_INCOMPATIBLE_DRIVER,
- "Unable to open device %s: %m", path);
+ return vk_errorf(instance, VK_ERROR_INCOMPATIBLE_DRIVER,
+ "Unable to open device %s: %m", path);
}
struct intel_device_info devinfo;
- if (!intel_get_device_info_from_fd(fd, &devinfo)) {
- result = vk_error(VK_ERROR_INCOMPATIBLE_DRIVER);
+ if (!intel_get_device_info_from_fd(fd, &devinfo, 9, -1)) {
+ result = VK_ERROR_INCOMPATIBLE_DRIVER;
goto fail_fd;
}
- if (devinfo.is_haswell) {
- mesa_logw("Haswell Vulkan support is incomplete");
- } else if (devinfo.ver == 7 && !devinfo.is_baytrail) {
- mesa_logw("Ivy Bridge Vulkan support is incomplete");
- } else if (devinfo.ver == 7 && devinfo.is_baytrail) {
- mesa_logw("Bay Trail Vulkan support is incomplete");
- } else if (devinfo.ver >= 8 && devinfo.ver <= 12) {
- /* Gfx8-12 fully supported */
- } else {
- result = vk_errorfi(instance, NULL, VK_ERROR_INCOMPATIBLE_DRIVER,
- "Vulkan not yet supported on %s", devinfo.name);
+ if (devinfo.ver == 20) {
+ mesa_logw("Vulkan not yet supported on %s", devinfo.name);
+ } else if (devinfo.ver > 12) {
+ result = vk_errorf(instance, VK_ERROR_INCOMPATIBLE_DRIVER,
+ "Vulkan not yet supported on %s", devinfo.name);
+ goto fail_fd;
+ } else if (devinfo.ver < 9) {
+ /* Silently fail here, hasvk should pick up this device. */
+ result = VK_ERROR_INCOMPATIBLE_DRIVER;
+ goto fail_fd;
+ }
+
+ /* Disable Wa_16013994831 on Gfx12.0 because we found other cases where we
+    * need to always disable preemption:
+ * - https://gitlab.freedesktop.org/mesa/mesa/-/issues/5963
+ * - https://gitlab.freedesktop.org/mesa/mesa/-/issues/5662
+ */
+ if (devinfo.verx10 == 120)
+ BITSET_CLEAR(devinfo.workarounds, INTEL_WA_16013994831);
+
+ if (!devinfo.has_context_isolation) {
+ result = vk_errorf(instance, VK_ERROR_INCOMPATIBLE_DRIVER,
+ "Vulkan requires context isolation for %s", devinfo.name);
goto fail_fd;
}
@@ -792,19 +2324,21 @@ anv_physical_device_try_create(struct anv_instance *instance,
vk_zalloc(&instance->vk.alloc, sizeof(*device), 8,
VK_SYSTEM_ALLOCATION_SCOPE_INSTANCE);
if (device == NULL) {
- result = vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+ result = vk_error(instance, VK_ERROR_OUT_OF_HOST_MEMORY);
goto fail_fd;
}
struct vk_physical_device_dispatch_table dispatch_table;
vk_physical_device_dispatch_table_from_entrypoints(
&dispatch_table, &anv_physical_device_entrypoints, true);
+ vk_physical_device_dispatch_table_from_entrypoints(
+ &dispatch_table, &wsi_physical_device_entrypoints, false);
result = vk_physical_device_init(&device->vk, &instance->vk,
- NULL, /* We set up extensions later */
+ NULL, NULL, NULL, /* We set up extensions later */
&dispatch_table);
if (result != VK_SUCCESS) {
- vk_error(result);
+ vk_error(instance, result);
goto fail_alloc;
}
device->instance = instance;
@@ -814,175 +2348,156 @@ anv_physical_device_try_create(struct anv_instance *instance,
device->info = devinfo;
- device->pci_info.domain = drm_device->businfo.pci->domain;
- device->pci_info.bus = drm_device->businfo.pci->bus;
- device->pci_info.device = drm_device->businfo.pci->dev;
- device->pci_info.function = drm_device->businfo.pci->func;
-
- device->cmd_parser_version = -1;
- if (device->info.ver == 7) {
- device->cmd_parser_version =
- anv_gem_get_param(fd, I915_PARAM_CMD_PARSER_VERSION);
- if (device->cmd_parser_version == -1) {
- result = vk_errorfi(device->instance, NULL,
- VK_ERROR_INITIALIZATION_FAILED,
- "failed to get command parser version");
- goto fail_base;
- }
- }
-
- if (!anv_gem_get_param(fd, I915_PARAM_HAS_WAIT_TIMEOUT)) {
- result = vk_errorfi(device->instance, NULL,
- VK_ERROR_INITIALIZATION_FAILED,
- "kernel missing gem wait");
+ device->local_fd = fd;
+ result = anv_physical_device_get_parameters(device);
+ if (result != VK_SUCCESS)
goto fail_base;
- }
- if (!anv_gem_get_param(fd, I915_PARAM_HAS_EXECBUF2)) {
- result = vk_errorfi(device->instance, NULL,
- VK_ERROR_INITIALIZATION_FAILED,
- "kernel missing execbuf2");
- goto fail_base;
- }
+ device->gtt_size = device->info.gtt_size ? device->info.gtt_size :
+ device->info.aperture_bytes;
- if (!device->info.has_llc &&
- anv_gem_get_param(fd, I915_PARAM_MMAP_VERSION) < 1) {
- result = vk_errorfi(device->instance, NULL,
- VK_ERROR_INITIALIZATION_FAILED,
- "kernel missing wc mmap");
+ if (device->gtt_size < (4ULL << 30 /* GiB */)) {
+ vk_errorf(instance, VK_ERROR_INCOMPATIBLE_DRIVER,
+ "GTT size too small: 0x%016"PRIx64, device->gtt_size);
goto fail_base;
}
- if (device->info.ver >= 8 && !device->info.is_cherryview &&
- !anv_gem_get_param(fd, I915_PARAM_HAS_EXEC_SOFTPIN)) {
- result = vk_errorfi(device->instance, NULL,
- VK_ERROR_INITIALIZATION_FAILED,
- "kernel missing softpin");
- goto fail_alloc;
+ /* We currently only have the right bits for instructions in Gen12+. If the
+ * kernel ever starts supporting that feature on previous generations,
+    * we'll need to edit genxml before enabling it here.
+ */
+ device->has_protected_contexts = device->info.ver >= 12 &&
+ intel_gem_supports_protected_context(fd, device->info.kmd_type);
+
+ /* Just pick one; they're all the same */
+ device->has_astc_ldr =
+ isl_format_supports_sampling(&device->info,
+ ISL_FORMAT_ASTC_LDR_2D_4X4_FLT16);
+ if (!device->has_astc_ldr &&
+ driQueryOptionb(&device->instance->dri_options, "vk_require_astc"))
+ device->emu_astc_ldr = true;
+ if (devinfo.ver == 9 && !intel_device_info_is_9lp(&devinfo)) {
+ device->flush_astc_ldr_void_extent_denorms =
+ device->has_astc_ldr && !device->emu_astc_ldr;
}
+ device->disable_fcv = device->info.verx10 >= 125 ||
+ instance->disable_fcv;
- if (!anv_gem_get_param(fd, I915_PARAM_HAS_EXEC_FENCE_ARRAY)) {
- result = vk_errorfi(device->instance, NULL,
- VK_ERROR_INITIALIZATION_FAILED,
- "kernel missing syncobj support");
+ result = anv_physical_device_init_heaps(device, fd);
+ if (result != VK_SUCCESS)
goto fail_base;
- }
- device->has_exec_async = anv_gem_get_param(fd, I915_PARAM_HAS_EXEC_ASYNC);
- device->has_exec_capture = anv_gem_get_param(fd, I915_PARAM_HAS_EXEC_CAPTURE);
- device->has_exec_fence = anv_gem_get_param(fd, I915_PARAM_HAS_EXEC_FENCE);
- device->has_syncobj_wait = anv_gem_supports_syncobj_wait(fd);
- device->has_syncobj_wait_available =
- anv_gem_get_drm_cap(fd, DRM_CAP_SYNCOBJ_TIMELINE) != 0;
+ if (debug_get_bool_option("ANV_QUEUE_THREAD_DISABLE", false))
+ device->has_exec_timeline = false;
- device->has_context_priority = anv_gem_has_context_priority(fd);
+ device->has_cooperative_matrix =
+ device->info.cooperative_matrix_configurations[0].scope != INTEL_CMAT_SCOPE_NONE;
- /* Initialize memory regions struct to 0. */
- memset(&device->vram, 0, sizeof(device->vram));
- memset(&device->sys, 0, sizeof(device->sys));
+ unsigned st_idx = 0;
- result = anv_physical_device_init_heaps(device, fd);
- if (result != VK_SUCCESS)
- goto fail_base;
+ device->sync_syncobj_type = vk_drm_syncobj_get_type(fd);
+ if (!device->has_exec_timeline)
+ device->sync_syncobj_type.features &= ~VK_SYNC_FEATURE_TIMELINE;
+ device->sync_types[st_idx++] = &device->sync_syncobj_type;
- device->use_softpin = device->info.ver >= 8 &&
- !device->info.is_cherryview;
- assert(device->use_softpin == device->supports_48bit_addresses);
+ /* anv_bo_sync_type is only supported with i915 for now */
+ if (device->info.kmd_type == INTEL_KMD_TYPE_I915) {
+ if (!(device->sync_syncobj_type.features & VK_SYNC_FEATURE_CPU_WAIT))
+ device->sync_types[st_idx++] = &anv_bo_sync_type;
- device->has_context_isolation =
- anv_gem_get_param(fd, I915_PARAM_HAS_CONTEXT_ISOLATION);
+ if (!(device->sync_syncobj_type.features & VK_SYNC_FEATURE_TIMELINE)) {
+ device->sync_timeline_type = vk_sync_timeline_get_type(&anv_bo_sync_type);
+ device->sync_types[st_idx++] = &device->sync_timeline_type.sync;
+ }
+ } else {
+ assert(vk_sync_type_is_drm_syncobj(&device->sync_syncobj_type));
+ assert(device->sync_syncobj_type.features & VK_SYNC_FEATURE_TIMELINE);
+ assert(device->sync_syncobj_type.features & VK_SYNC_FEATURE_CPU_WAIT);
+ }
- device->has_exec_timeline =
- anv_gem_get_param(fd, I915_PARAM_HAS_EXEC_TIMELINE_FENCES);
- if (env_var_as_boolean("ANV_QUEUE_THREAD_DISABLE", false))
- device->has_exec_timeline = false;
+ device->sync_types[st_idx++] = NULL;
+ assert(st_idx <= ARRAY_SIZE(device->sync_types));
+ device->vk.supported_sync_types = device->sync_types;
- device->has_thread_submit =
- device->has_syncobj_wait_available && device->has_exec_timeline;
+ device->vk.pipeline_cache_import_ops = anv_cache_import_ops;
device->always_use_bindless =
- env_var_as_boolean("ANV_ALWAYS_BINDLESS", false);
+ debug_get_bool_option("ANV_ALWAYS_BINDLESS", false);
device->use_call_secondary =
- device->use_softpin &&
- !env_var_as_boolean("ANV_DISABLE_SECONDARY_CMD_BUFFER_CALLS", false);
+ !debug_get_bool_option("ANV_DISABLE_SECONDARY_CMD_BUFFER_CALLS", false);
- /* We first got the A64 messages on broadwell and we can only use them if
- * we can pass addresses directly into the shader which requires softpin.
- */
- device->has_a64_buffer_access = device->info.ver >= 8 &&
- device->use_softpin;
+ device->video_decode_enabled = debug_get_bool_option("ANV_VIDEO_DECODE", false);
- /* We first get bindless image access on Skylake.
- */
- device->has_bindless_images = device->info.ver >= 9;
+ device->uses_ex_bso = device->info.verx10 >= 125;
- /* We've had bindless samplers since Ivy Bridge (forever in Vulkan terms)
- * because it's just a matter of setting the sampler address in the sample
- * message header. However, we've not bothered to wire it up for vec4 so
- * we leave it disabled on gfx7.
+ /* For now always use indirect descriptors. We'll update this
+ * to !uses_ex_bso when all the infrastructure is built up.
*/
- device->has_bindless_samplers = device->info.ver >= 8;
-
- device->has_implicit_ccs = device->info.has_aux_map;
+ device->indirect_descriptors =
+ !device->uses_ex_bso ||
+ driQueryOptionb(&instance->dri_options, "force_indirect_descriptors");
+ device->alloc_aux_tt_mem =
+ device->info.has_aux_map && device->info.verx10 >= 125;
/* Check if we can read the GPU timestamp register from the CPU */
uint64_t u64_ignore;
- device->has_reg_timestamp = anv_gem_reg_read(fd, TIMESTAMP | I915_REG_READ_8B_WA,
- &u64_ignore) == 0;
+ device->has_reg_timestamp = intel_gem_read_render_timestamp(fd,
+ device->info.kmd_type,
+ &u64_ignore);
+
+ device->uses_relocs = device->info.kmd_type != INTEL_KMD_TYPE_XE;
+
+ /* While xe.ko can use both vm_bind and TR-TT, i915.ko only has TR-TT. */
+ if (device->info.kmd_type == INTEL_KMD_TYPE_XE) {
+ if (debug_get_bool_option("ANV_SPARSE_USE_TRTT", false))
+ device->sparse_type = ANV_SPARSE_TYPE_TRTT;
+ else
+ device->sparse_type = ANV_SPARSE_TYPE_VM_BIND;
+ } else {
+ if (device->info.ver >= 12 &&
+ device->has_exec_timeline &&
+ debug_get_bool_option("ANV_SPARSE", true)) {
+ device->sparse_type = ANV_SPARSE_TYPE_TRTT;
+ } else if (instance->has_fake_sparse) {
+ device->sparse_type = ANV_SPARSE_TYPE_FAKE;
+ } else {
+ device->sparse_type = ANV_SPARSE_TYPE_NOT_SUPPORTED;
+ }
+ }
- device->always_flush_cache =
+ device->always_flush_cache = INTEL_DEBUG(DEBUG_STALL) ||
driQueryOptionb(&instance->dri_options, "always_flush_cache");
- device->has_mmap_offset =
- anv_gem_get_param(fd, I915_PARAM_MMAP_GTT_VERSION) >= 4;
-
- device->has_userptr_probe =
- anv_gem_get_param(fd, I915_PARAM_HAS_USERPTR_PROBE);
-
device->compiler = brw_compiler_create(NULL, &device->info);
if (device->compiler == NULL) {
- result = vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+ result = vk_error(instance, VK_ERROR_OUT_OF_HOST_MEMORY);
goto fail_base;
}
device->compiler->shader_debug_log = compiler_debug_log;
device->compiler->shader_perf_log = compiler_perf_log;
- device->compiler->supports_pull_constants = false;
- device->compiler->constant_buffer_0_is_relative =
- device->info.ver < 8 || !device->has_context_isolation;
- device->compiler->supports_shader_constants = true;
- device->compiler->compact_params = false;
device->compiler->indirect_ubos_use_sampler = device->info.ver < 12;
+ device->compiler->extended_bindless_surface_offset = device->uses_ex_bso;
+ device->compiler->use_bindless_sampler_offset = false;
+ device->compiler->spilling_rate =
+ driQueryOptioni(&instance->dri_options, "shader_spilling_rate");
- /* Broadwell PRM says:
- *
- * "Before Gfx8, there was a historical configuration control field to
- * swizzle address bit[6] for in X/Y tiling modes. This was set in three
- * different places: TILECTL[1:0], ARB_MODE[5:4], and
- * DISP_ARB_CTL[14:13].
- *
- * For Gfx8 and subsequent generations, the swizzle fields are all
- * reserved, and the CPU's memory controller performs all address
- * swizzling modifications."
- */
- bool swizzled =
- device->info.ver < 8 && anv_gem_get_bit6_swizzle(fd, I915_TILING_X);
-
- isl_device_init(&device->isl_dev, &device->info, swizzled);
+ isl_device_init(&device->isl_dev, &device->info);
+ device->isl_dev.buffer_length_in_aux_addr = true;
result = anv_physical_device_init_uuids(device);
if (result != VK_SUCCESS)
goto fail_compiler;
+ anv_physical_device_init_va_ranges(device);
+
anv_physical_device_init_disk_cache(device);
if (instance->vk.enabled_extensions.KHR_display) {
master_fd = open(primary_path, O_RDWR | O_CLOEXEC);
if (master_fd >= 0) {
- /* prod the device with a GETPARAM call which will fail if
- * we don't have permission to even render on this device
- */
- if (anv_gem_get_param(master_fd, I915_PARAM_CHIPSET_ID) == 0) {
+ /* fail if we don't have permission to even render on this device */
+ if (!intel_gem_can_render_on_fd(master_fd, device->info.kmd_type)) {
close(master_fd);
master_fd = -1;
}
@@ -990,25 +2505,15 @@ anv_physical_device_try_create(struct anv_instance *instance,
}
device->master_fd = master_fd;
- device->engine_info = anv_gem_get_engine_info(fd);
+ device->engine_info = intel_engine_get_info(fd, device->info.kmd_type);
+ device->info.has_compute_engine = device->engine_info &&
+ intel_engines_count(device->engine_info,
+ INTEL_ENGINE_CLASS_COMPUTE);
anv_physical_device_init_queue_families(device);
- result = anv_init_wsi(device);
- if (result != VK_SUCCESS)
- goto fail_engine_info;
-
anv_physical_device_init_perf(device, fd);
- anv_measure_device_init(device);
-
- get_device_extensions(device, &device->vk.supported_extensions);
-
- device->local_fd = fd;
-
- anv_genX(&device->info, init_physical_device_state)(device);
-
- *device_out = device;
-
+ /* Gather major/minor before WSI. */
struct stat st;
if (stat(primary_path, &st) == 0) {
@@ -1031,9 +2536,24 @@ anv_physical_device_try_create(struct anv_instance *instance,
device->local_minor = 0;
}
+ get_device_extensions(device, &device->vk.supported_extensions);
+ get_features(device, &device->vk.supported_features);
+ get_properties(device, &device->vk.properties);
+
+ result = anv_init_wsi(device);
+ if (result != VK_SUCCESS)
+ goto fail_perf;
+
+ anv_measure_device_init(device);
+
+ anv_genX(&device->info, init_physical_device_state)(device);
+
+ *out = &device->vk;
+
return VK_SUCCESS;
-fail_engine_info:
+fail_perf:
+ ralloc_free(device->perf);
free(device->engine_info);
anv_physical_device_free_disk_cache(device);
fail_compiler:
@@ -1050,8 +2570,11 @@ fail_fd:
}
static void
-anv_physical_device_destroy(struct anv_physical_device *device)
+anv_physical_device_destroy(struct vk_physical_device *vk_device)
{
+ struct anv_physical_device *device =
+ container_of(vk_device, struct anv_physical_device, vk);
+
anv_finish_wsi(device);
anv_measure_device_destroy(device);
free(device->engine_info);
@@ -1071,7 +2594,7 @@ VkResult anv_EnumerateInstanceExtensionProperties(
VkExtensionProperties* pProperties)
{
if (pLayerName)
- return vk_error(VK_ERROR_LAYER_NOT_PRESENT);
+ return vk_error(NULL, VK_ERROR_LAYER_NOT_PRESENT);
return vk_enumerate_instance_extension_properties(
&instance_extensions, pPropertyCount, pProperties);
@@ -1088,6 +2611,44 @@ anv_init_dri_options(struct anv_instance *instance)
instance->vk.app_info.app_version,
instance->vk.app_info.engine_name,
instance->vk.app_info.engine_version);
+
+ instance->assume_full_subgroups =
+ driQueryOptioni(&instance->dri_options, "anv_assume_full_subgroups");
+ instance->limit_trig_input_range =
+ driQueryOptionb(&instance->dri_options, "limit_trig_input_range");
+ instance->sample_mask_out_opengl_behaviour =
+ driQueryOptionb(&instance->dri_options, "anv_sample_mask_out_opengl_behaviour");
+ instance->force_filter_addr_rounding =
+ driQueryOptionb(&instance->dri_options, "anv_force_filter_addr_rounding");
+ instance->lower_depth_range_rate =
+ driQueryOptionf(&instance->dri_options, "lower_depth_range_rate");
+ instance->no_16bit =
+ driQueryOptionb(&instance->dri_options, "no_16bit");
+ instance->intel_enable_wa_14018912822 =
+ driQueryOptionb(&instance->dri_options, "intel_enable_wa_14018912822");
+ instance->mesh_conv_prim_attrs_to_vert_attrs =
+ driQueryOptioni(&instance->dri_options, "anv_mesh_conv_prim_attrs_to_vert_attrs");
+ instance->fp64_workaround_enabled =
+ driQueryOptionb(&instance->dri_options, "fp64_workaround_enabled");
+ instance->generated_indirect_threshold =
+ driQueryOptioni(&instance->dri_options, "generated_indirect_threshold");
+ instance->generated_indirect_ring_threshold =
+ driQueryOptioni(&instance->dri_options, "generated_indirect_ring_threshold");
+ instance->query_clear_with_blorp_threshold =
+ driQueryOptioni(&instance->dri_options, "query_clear_with_blorp_threshold");
+ instance->query_copy_with_shader_threshold =
+ driQueryOptioni(&instance->dri_options, "query_copy_with_shader_threshold");
+ instance->force_vk_vendor =
+ driQueryOptioni(&instance->dri_options, "force_vk_vendor");
+ instance->has_fake_sparse =
+ driQueryOptionb(&instance->dri_options, "fake_sparse");
+ instance->enable_tbimr = driQueryOptionb(&instance->dri_options, "intel_tbimr");
+ instance->disable_fcv =
+ driQueryOptionb(&instance->dri_options, "anv_disable_fcv");
+ instance->external_memory_implicit_sync =
+ driQueryOptionb(&instance->dri_options, "anv_external_memory_implicit_sync");
+ instance->compression_control_enabled =
+ driQueryOptionb(&instance->dri_options, "compression_control_enabled");
}
VkResult anv_CreateInstance(
@@ -1106,29 +2667,30 @@ VkResult anv_CreateInstance(
instance = vk_alloc(pAllocator, sizeof(*instance), 8,
VK_SYSTEM_ALLOCATION_SCOPE_INSTANCE);
if (!instance)
- return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+ return vk_error(NULL, VK_ERROR_OUT_OF_HOST_MEMORY);
struct vk_instance_dispatch_table dispatch_table;
vk_instance_dispatch_table_from_entrypoints(
&dispatch_table, &anv_instance_entrypoints, true);
+ vk_instance_dispatch_table_from_entrypoints(
+ &dispatch_table, &wsi_instance_entrypoints, false);
result = vk_instance_init(&instance->vk, &instance_extensions,
&dispatch_table, pCreateInfo, pAllocator);
if (result != VK_SUCCESS) {
vk_free(pAllocator, instance);
- return vk_error(result);
+ return vk_error(NULL, result);
}
- instance->physical_devices_enumerated = false;
- list_inithead(&instance->physical_devices);
-
- instance->pipeline_cache_enabled =
- env_var_as_boolean("ANV_ENABLE_PIPELINE_CACHE", true);
+ instance->vk.physical_devices.try_create_for_drm = anv_physical_device_try_create;
+ instance->vk.physical_devices.destroy = anv_physical_device_destroy;
VG(VALGRIND_CREATE_MEMPOOL(instance, 0, false));
anv_init_dri_options(instance);
+ intel_driver_ds_init();
+
*pInstance = anv_instance_to_handle(instance);
return VK_SUCCESS;
@@ -1143,10 +2705,6 @@ void anv_DestroyInstance(
if (!instance)
return;
- list_for_each_entry_safe(struct anv_physical_device, pdevice,
- &instance->physical_devices, link)
- anv_physical_device_destroy(pdevice);
-
VG(VALGRIND_DESTROY_MEMPOOL(instance));
driDestroyOptionCache(&instance->dri_options);
@@ -1156,1644 +2714,71 @@ void anv_DestroyInstance(
vk_free(&instance->vk.alloc, instance);
}
-static VkResult
-anv_enumerate_physical_devices(struct anv_instance *instance)
-{
- if (instance->physical_devices_enumerated)
- return VK_SUCCESS;
-
- instance->physical_devices_enumerated = true;
-
- /* TODO: Check for more devices ? */
- drmDevicePtr devices[8];
- int max_devices;
-
- max_devices = drmGetDevices2(0, devices, ARRAY_SIZE(devices));
- if (max_devices < 1)
- return VK_SUCCESS;
-
- VkResult result = VK_SUCCESS;
- for (unsigned i = 0; i < (unsigned)max_devices; i++) {
- if (devices[i]->available_nodes & 1 << DRM_NODE_RENDER &&
- devices[i]->bustype == DRM_BUS_PCI &&
- devices[i]->deviceinfo.pci->vendor_id == 0x8086) {
-
- struct anv_physical_device *pdevice;
- result = anv_physical_device_try_create(instance, devices[i],
- &pdevice);
- /* Incompatible DRM device, skip. */
- if (result == VK_ERROR_INCOMPATIBLE_DRIVER) {
- result = VK_SUCCESS;
- continue;
- }
-
- /* Error creating the physical device, report the error. */
- if (result != VK_SUCCESS)
- break;
-
- list_addtail(&pdevice->link, &instance->physical_devices);
- }
- }
- drmFreeDevices(devices, max_devices);
-
- /* If we successfully enumerated any devices, call it success */
- return result;
-}
-
-VkResult anv_EnumeratePhysicalDevices(
- VkInstance _instance,
- uint32_t* pPhysicalDeviceCount,
- VkPhysicalDevice* pPhysicalDevices)
-{
- ANV_FROM_HANDLE(anv_instance, instance, _instance);
- VK_OUTARRAY_MAKE(out, pPhysicalDevices, pPhysicalDeviceCount);
-
- VkResult result = anv_enumerate_physical_devices(instance);
- if (result != VK_SUCCESS)
- return result;
-
- list_for_each_entry(struct anv_physical_device, pdevice,
- &instance->physical_devices, link) {
- vk_outarray_append(&out, i) {
- *i = anv_physical_device_to_handle(pdevice);
- }
- }
-
- return vk_outarray_status(&out);
-}
-
-VkResult anv_EnumeratePhysicalDeviceGroups(
- VkInstance _instance,
- uint32_t* pPhysicalDeviceGroupCount,
- VkPhysicalDeviceGroupProperties* pPhysicalDeviceGroupProperties)
-{
- ANV_FROM_HANDLE(anv_instance, instance, _instance);
- VK_OUTARRAY_MAKE(out, pPhysicalDeviceGroupProperties,
- pPhysicalDeviceGroupCount);
-
- VkResult result = anv_enumerate_physical_devices(instance);
- if (result != VK_SUCCESS)
- return result;
-
- list_for_each_entry(struct anv_physical_device, pdevice,
- &instance->physical_devices, link) {
- vk_outarray_append(&out, p) {
- p->physicalDeviceCount = 1;
- memset(p->physicalDevices, 0, sizeof(p->physicalDevices));
- p->physicalDevices[0] = anv_physical_device_to_handle(pdevice);
- p->subsetAllocation = false;
-
- vk_foreach_struct(ext, p->pNext)
- anv_debug_ignored_stype(ext->sType);
- }
- }
-
- return vk_outarray_status(&out);
-}
-
-void anv_GetPhysicalDeviceFeatures(
- VkPhysicalDevice physicalDevice,
- VkPhysicalDeviceFeatures* pFeatures)
-{
- ANV_FROM_HANDLE(anv_physical_device, pdevice, physicalDevice);
-
- *pFeatures = (VkPhysicalDeviceFeatures) {
- .robustBufferAccess = true,
- .fullDrawIndexUint32 = true,
- .imageCubeArray = true,
- .independentBlend = true,
- .geometryShader = true,
- .tessellationShader = true,
- .sampleRateShading = true,
- .dualSrcBlend = true,
- .logicOp = true,
- .multiDrawIndirect = true,
- .drawIndirectFirstInstance = true,
- .depthClamp = true,
- .depthBiasClamp = true,
- .fillModeNonSolid = true,
- .depthBounds = pdevice->info.ver >= 12,
- .wideLines = true,
- .largePoints = true,
- .alphaToOne = true,
- .multiViewport = true,
- .samplerAnisotropy = true,
- .textureCompressionETC2 = pdevice->info.ver >= 8 ||
- pdevice->info.is_baytrail,
- .textureCompressionASTC_LDR = pdevice->info.ver >= 9, /* FINISHME CHV */
- .textureCompressionBC = true,
- .occlusionQueryPrecise = true,
- .pipelineStatisticsQuery = true,
- .fragmentStoresAndAtomics = true,
- .shaderTessellationAndGeometryPointSize = true,
- .shaderImageGatherExtended = true,
- .shaderStorageImageExtendedFormats = true,
- .shaderStorageImageMultisample = false,
- .shaderStorageImageReadWithoutFormat = false,
- .shaderStorageImageWriteWithoutFormat = true,
- .shaderUniformBufferArrayDynamicIndexing = true,
- .shaderSampledImageArrayDynamicIndexing = true,
- .shaderStorageBufferArrayDynamicIndexing = true,
- .shaderStorageImageArrayDynamicIndexing = true,
- .shaderClipDistance = true,
- .shaderCullDistance = true,
- .shaderFloat64 = pdevice->info.ver >= 8 &&
- pdevice->info.has_64bit_float,
- .shaderInt64 = pdevice->info.ver >= 8,
- .shaderInt16 = pdevice->info.ver >= 8,
- .shaderResourceMinLod = pdevice->info.ver >= 9,
- .variableMultisampleRate = true,
- .inheritedQueries = true,
- };
-
- /* We can't do image stores in vec4 shaders */
- pFeatures->vertexPipelineStoresAndAtomics =
- pdevice->compiler->scalar_stage[MESA_SHADER_VERTEX] &&
- pdevice->compiler->scalar_stage[MESA_SHADER_GEOMETRY];
-
- struct vk_app_info *app_info = &pdevice->instance->vk.app_info;
-
- /* The new DOOM and Wolfenstein games require depthBounds without
- * checking for it. They seem to run fine without it so just claim it's
- * there and accept the consequences.
- */
- if (app_info->engine_name && strcmp(app_info->engine_name, "idTech") == 0)
- pFeatures->depthBounds = true;
-}
-
-static void
-anv_get_physical_device_features_1_1(struct anv_physical_device *pdevice,
- VkPhysicalDeviceVulkan11Features *f)
-{
- assert(f->sType == VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_1_FEATURES);
-
- f->storageBuffer16BitAccess = pdevice->info.ver >= 8;
- f->uniformAndStorageBuffer16BitAccess = pdevice->info.ver >= 8;
- f->storagePushConstant16 = pdevice->info.ver >= 8;
- f->storageInputOutput16 = false;
- f->multiview = true;
- f->multiviewGeometryShader = true;
- f->multiviewTessellationShader = true;
- f->variablePointersStorageBuffer = true;
- f->variablePointers = true;
- f->protectedMemory = false;
- f->samplerYcbcrConversion = true;
- f->shaderDrawParameters = true;
-}
-
-static void
-anv_get_physical_device_features_1_2(struct anv_physical_device *pdevice,
- VkPhysicalDeviceVulkan12Features *f)
-{
- assert(f->sType == VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_2_FEATURES);
-
- f->samplerMirrorClampToEdge = true;
- f->drawIndirectCount = true;
- f->storageBuffer8BitAccess = pdevice->info.ver >= 8;
- f->uniformAndStorageBuffer8BitAccess = pdevice->info.ver >= 8;
- f->storagePushConstant8 = pdevice->info.ver >= 8;
- f->shaderBufferInt64Atomics = pdevice->info.ver >= 9 &&
- pdevice->use_softpin;
- f->shaderSharedInt64Atomics = false;
- f->shaderFloat16 = pdevice->info.ver >= 8;
- f->shaderInt8 = pdevice->info.ver >= 8;
-
- bool descIndexing = pdevice->has_a64_buffer_access &&
- pdevice->has_bindless_images;
- f->descriptorIndexing = descIndexing;
- f->shaderInputAttachmentArrayDynamicIndexing = false;
- f->shaderUniformTexelBufferArrayDynamicIndexing = descIndexing;
- f->shaderStorageTexelBufferArrayDynamicIndexing = descIndexing;
- f->shaderUniformBufferArrayNonUniformIndexing = false;
- f->shaderSampledImageArrayNonUniformIndexing = descIndexing;
- f->shaderStorageBufferArrayNonUniformIndexing = descIndexing;
- f->shaderStorageImageArrayNonUniformIndexing = descIndexing;
- f->shaderInputAttachmentArrayNonUniformIndexing = false;
- f->shaderUniformTexelBufferArrayNonUniformIndexing = descIndexing;
- f->shaderStorageTexelBufferArrayNonUniformIndexing = descIndexing;
- f->descriptorBindingUniformBufferUpdateAfterBind = false;
- f->descriptorBindingSampledImageUpdateAfterBind = descIndexing;
- f->descriptorBindingStorageImageUpdateAfterBind = descIndexing;
- f->descriptorBindingStorageBufferUpdateAfterBind = descIndexing;
- f->descriptorBindingUniformTexelBufferUpdateAfterBind = descIndexing;
- f->descriptorBindingStorageTexelBufferUpdateAfterBind = descIndexing;
- f->descriptorBindingUpdateUnusedWhilePending = descIndexing;
- f->descriptorBindingPartiallyBound = descIndexing;
- f->descriptorBindingVariableDescriptorCount = descIndexing;
- f->runtimeDescriptorArray = descIndexing;
-
- f->samplerFilterMinmax = pdevice->info.ver >= 9;
- f->scalarBlockLayout = true;
- f->imagelessFramebuffer = true;
- f->uniformBufferStandardLayout = true;
- f->shaderSubgroupExtendedTypes = true;
- f->separateDepthStencilLayouts = true;
- f->hostQueryReset = true;
- f->timelineSemaphore = true;
- f->bufferDeviceAddress = pdevice->has_a64_buffer_access;
- f->bufferDeviceAddressCaptureReplay = pdevice->has_a64_buffer_access;
- f->bufferDeviceAddressMultiDevice = false;
- f->vulkanMemoryModel = true;
- f->vulkanMemoryModelDeviceScope = true;
- f->vulkanMemoryModelAvailabilityVisibilityChains = true;
- f->shaderOutputViewportIndex = true;
- f->shaderOutputLayer = true;
- f->subgroupBroadcastDynamicId = true;
-}
-
-void anv_GetPhysicalDeviceFeatures2(
- VkPhysicalDevice physicalDevice,
- VkPhysicalDeviceFeatures2* pFeatures)
-{
- ANV_FROM_HANDLE(anv_physical_device, pdevice, physicalDevice);
- anv_GetPhysicalDeviceFeatures(physicalDevice, &pFeatures->features);
-
- VkPhysicalDeviceVulkan11Features core_1_1 = {
- .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_1_FEATURES,
- };
- anv_get_physical_device_features_1_1(pdevice, &core_1_1);
-
- VkPhysicalDeviceVulkan12Features core_1_2 = {
- .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_2_FEATURES,
- };
- anv_get_physical_device_features_1_2(pdevice, &core_1_2);
-
-#define CORE_FEATURE(major, minor, feature) \
- features->feature = core_##major##_##minor.feature
-
-
- vk_foreach_struct(ext, pFeatures->pNext) {
- switch (ext->sType) {
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_4444_FORMATS_FEATURES_EXT: {
- VkPhysicalDevice4444FormatsFeaturesEXT *features =
- (VkPhysicalDevice4444FormatsFeaturesEXT *)ext;
- features->formatA4R4G4B4 = true;
- features->formatA4B4G4R4 = false;
- break;
- }
-
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_8BIT_STORAGE_FEATURES_KHR: {
- VkPhysicalDevice8BitStorageFeaturesKHR *features =
- (VkPhysicalDevice8BitStorageFeaturesKHR *)ext;
- CORE_FEATURE(1, 2, storageBuffer8BitAccess);
- CORE_FEATURE(1, 2, uniformAndStorageBuffer8BitAccess);
- CORE_FEATURE(1, 2, storagePushConstant8);
- break;
- }
-
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_16BIT_STORAGE_FEATURES: {
- VkPhysicalDevice16BitStorageFeatures *features =
- (VkPhysicalDevice16BitStorageFeatures *)ext;
- CORE_FEATURE(1, 1, storageBuffer16BitAccess);
- CORE_FEATURE(1, 1, uniformAndStorageBuffer16BitAccess);
- CORE_FEATURE(1, 1, storagePushConstant16);
- CORE_FEATURE(1, 1, storageInputOutput16);
- break;
- }
-
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_ACCELERATION_STRUCTURE_FEATURES_KHR: {
- VkPhysicalDeviceAccelerationStructureFeaturesKHR *features = (void *)ext;
- features->accelerationStructure = false;
- features->accelerationStructureCaptureReplay = false;
- features->accelerationStructureIndirectBuild = false;
- features->accelerationStructureHostCommands = false;
- features->descriptorBindingAccelerationStructureUpdateAfterBind = true;
- break;
- }
-
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_BUFFER_DEVICE_ADDRESS_FEATURES_EXT: {
- VkPhysicalDeviceBufferDeviceAddressFeaturesEXT *features = (void *)ext;
- features->bufferDeviceAddress = pdevice->has_a64_buffer_access;
- features->bufferDeviceAddressCaptureReplay = false;
- features->bufferDeviceAddressMultiDevice = false;
- break;
- }
-
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_BUFFER_DEVICE_ADDRESS_FEATURES_KHR: {
- VkPhysicalDeviceBufferDeviceAddressFeaturesKHR *features = (void *)ext;
- CORE_FEATURE(1, 2, bufferDeviceAddress);
- CORE_FEATURE(1, 2, bufferDeviceAddressCaptureReplay);
- CORE_FEATURE(1, 2, bufferDeviceAddressMultiDevice);
- break;
- }
-
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COLOR_WRITE_ENABLE_FEATURES_EXT: {
- VkPhysicalDeviceColorWriteEnableFeaturesEXT *features =
- (VkPhysicalDeviceColorWriteEnableFeaturesEXT *)ext;
- features->colorWriteEnable = true;
- break;
- }
-
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COMPUTE_SHADER_DERIVATIVES_FEATURES_NV: {
- VkPhysicalDeviceComputeShaderDerivativesFeaturesNV *features =
- (VkPhysicalDeviceComputeShaderDerivativesFeaturesNV *)ext;
- features->computeDerivativeGroupQuads = true;
- features->computeDerivativeGroupLinear = true;
- break;
- }
-
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_CONDITIONAL_RENDERING_FEATURES_EXT: {
- VkPhysicalDeviceConditionalRenderingFeaturesEXT *features =
- (VkPhysicalDeviceConditionalRenderingFeaturesEXT*)ext;
- features->conditionalRendering = pdevice->info.verx10 >= 75;
- features->inheritedConditionalRendering = pdevice->info.verx10 >= 75;
- break;
- }
-
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_CUSTOM_BORDER_COLOR_FEATURES_EXT: {
- VkPhysicalDeviceCustomBorderColorFeaturesEXT *features =
- (VkPhysicalDeviceCustomBorderColorFeaturesEXT *)ext;
- features->customBorderColors = pdevice->info.ver >= 8;
- features->customBorderColorWithoutFormat = pdevice->info.ver >= 8;
- break;
- }
-
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_DEPTH_CLIP_ENABLE_FEATURES_EXT: {
- VkPhysicalDeviceDepthClipEnableFeaturesEXT *features =
- (VkPhysicalDeviceDepthClipEnableFeaturesEXT *)ext;
- features->depthClipEnable = true;
- break;
- }
-
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FLOAT16_INT8_FEATURES_KHR: {
- VkPhysicalDeviceFloat16Int8FeaturesKHR *features = (void *)ext;
- CORE_FEATURE(1, 2, shaderFloat16);
- CORE_FEATURE(1, 2, shaderInt8);
- break;
- }
-
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FRAGMENT_SHADER_INTERLOCK_FEATURES_EXT: {
- VkPhysicalDeviceFragmentShaderInterlockFeaturesEXT *features =
- (VkPhysicalDeviceFragmentShaderInterlockFeaturesEXT *)ext;
- features->fragmentShaderSampleInterlock = pdevice->info.ver >= 9;
- features->fragmentShaderPixelInterlock = pdevice->info.ver >= 9;
- features->fragmentShaderShadingRateInterlock = false;
- break;
- }
-
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_HOST_QUERY_RESET_FEATURES_EXT: {
- VkPhysicalDeviceHostQueryResetFeaturesEXT *features =
- (VkPhysicalDeviceHostQueryResetFeaturesEXT *)ext;
- CORE_FEATURE(1, 2, hostQueryReset);
- break;
- }
-
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_DESCRIPTOR_INDEXING_FEATURES_EXT: {
- VkPhysicalDeviceDescriptorIndexingFeaturesEXT *features =
- (VkPhysicalDeviceDescriptorIndexingFeaturesEXT *)ext;
- CORE_FEATURE(1, 2, shaderInputAttachmentArrayDynamicIndexing);
- CORE_FEATURE(1, 2, shaderUniformTexelBufferArrayDynamicIndexing);
- CORE_FEATURE(1, 2, shaderStorageTexelBufferArrayDynamicIndexing);
- CORE_FEATURE(1, 2, shaderUniformBufferArrayNonUniformIndexing);
- CORE_FEATURE(1, 2, shaderSampledImageArrayNonUniformIndexing);
- CORE_FEATURE(1, 2, shaderStorageBufferArrayNonUniformIndexing);
- CORE_FEATURE(1, 2, shaderStorageImageArrayNonUniformIndexing);
- CORE_FEATURE(1, 2, shaderInputAttachmentArrayNonUniformIndexing);
- CORE_FEATURE(1, 2, shaderUniformTexelBufferArrayNonUniformIndexing);
- CORE_FEATURE(1, 2, shaderStorageTexelBufferArrayNonUniformIndexing);
- CORE_FEATURE(1, 2, descriptorBindingUniformBufferUpdateAfterBind);
- CORE_FEATURE(1, 2, descriptorBindingSampledImageUpdateAfterBind);
- CORE_FEATURE(1, 2, descriptorBindingStorageImageUpdateAfterBind);
- CORE_FEATURE(1, 2, descriptorBindingStorageBufferUpdateAfterBind);
- CORE_FEATURE(1, 2, descriptorBindingUniformTexelBufferUpdateAfterBind);
- CORE_FEATURE(1, 2, descriptorBindingStorageTexelBufferUpdateAfterBind);
- CORE_FEATURE(1, 2, descriptorBindingUpdateUnusedWhilePending);
- CORE_FEATURE(1, 2, descriptorBindingPartiallyBound);
- CORE_FEATURE(1, 2, descriptorBindingVariableDescriptorCount);
- CORE_FEATURE(1, 2, runtimeDescriptorArray);
- break;
- }
-
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FRAGMENT_SHADING_RATE_FEATURES_KHR: {
- VkPhysicalDeviceFragmentShadingRateFeaturesKHR *features =
- (VkPhysicalDeviceFragmentShadingRateFeaturesKHR *)ext;
- features->attachmentFragmentShadingRate = false;
- features->pipelineFragmentShadingRate = true;
- features->primitiveFragmentShadingRate = false;
- break;
- }
-
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_IMAGE_ROBUSTNESS_FEATURES_EXT: {
- VkPhysicalDeviceImageRobustnessFeaturesEXT *features =
- (VkPhysicalDeviceImageRobustnessFeaturesEXT *)ext;
- features->robustImageAccess = true;
- break;
- }
-
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_INDEX_TYPE_UINT8_FEATURES_EXT: {
- VkPhysicalDeviceIndexTypeUint8FeaturesEXT *features =
- (VkPhysicalDeviceIndexTypeUint8FeaturesEXT *)ext;
- features->indexTypeUint8 = true;
- break;
- }
-
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_INLINE_UNIFORM_BLOCK_FEATURES_EXT: {
- VkPhysicalDeviceInlineUniformBlockFeaturesEXT *features =
- (VkPhysicalDeviceInlineUniformBlockFeaturesEXT *)ext;
- features->inlineUniformBlock = true;
- features->descriptorBindingInlineUniformBlockUpdateAfterBind = true;
- break;
- }
-
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_LINE_RASTERIZATION_FEATURES_EXT: {
- VkPhysicalDeviceLineRasterizationFeaturesEXT *features =
- (VkPhysicalDeviceLineRasterizationFeaturesEXT *)ext;
- features->rectangularLines = true;
- features->bresenhamLines = true;
- /* Support for Smooth lines with MSAA was removed on gfx11. From the
- * BSpec section "Multisample ModesState" table for "AA Line Support
- * Requirements":
- *
- * GFX10:BUG:######## NUM_MULTISAMPLES == 1
- *
- * Fortunately, this isn't a case most people care about.
- */
- features->smoothLines = pdevice->info.ver < 10;
- features->stippledRectangularLines = false;
- features->stippledBresenhamLines = true;
- features->stippledSmoothLines = false;
- break;
- }
-
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MULTIVIEW_FEATURES: {
- VkPhysicalDeviceMultiviewFeatures *features =
- (VkPhysicalDeviceMultiviewFeatures *)ext;
- CORE_FEATURE(1, 1, multiview);
- CORE_FEATURE(1, 1, multiviewGeometryShader);
- CORE_FEATURE(1, 1, multiviewTessellationShader);
- break;
- }
-
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_IMAGELESS_FRAMEBUFFER_FEATURES_KHR: {
- VkPhysicalDeviceImagelessFramebufferFeaturesKHR *features =
- (VkPhysicalDeviceImagelessFramebufferFeaturesKHR *)ext;
- CORE_FEATURE(1, 2, imagelessFramebuffer);
- break;
- }
-
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PERFORMANCE_QUERY_FEATURES_KHR: {
- VkPhysicalDevicePerformanceQueryFeaturesKHR *feature =
- (VkPhysicalDevicePerformanceQueryFeaturesKHR *)ext;
- feature->performanceCounterQueryPools = true;
- /* HW only supports a single configuration at a time. */
- feature->performanceCounterMultipleQueryPools = false;
- break;
- }
-
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PIPELINE_CREATION_CACHE_CONTROL_FEATURES_EXT: {
- VkPhysicalDevicePipelineCreationCacheControlFeaturesEXT *features =
- (VkPhysicalDevicePipelineCreationCacheControlFeaturesEXT *)ext;
- features->pipelineCreationCacheControl = true;
- break;
- }
-
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PIPELINE_EXECUTABLE_PROPERTIES_FEATURES_KHR: {
- VkPhysicalDevicePipelineExecutablePropertiesFeaturesKHR *features =
- (VkPhysicalDevicePipelineExecutablePropertiesFeaturesKHR *)ext;
- features->pipelineExecutableInfo = true;
- break;
- }
-
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PRIVATE_DATA_FEATURES_EXT: {
- VkPhysicalDevicePrivateDataFeaturesEXT *features = (void *)ext;
- features->privateData = true;
- break;
- }
-
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROTECTED_MEMORY_FEATURES: {
- VkPhysicalDeviceProtectedMemoryFeatures *features = (void *)ext;
- CORE_FEATURE(1, 1, protectedMemory);
- break;
- }
-
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROVOKING_VERTEX_FEATURES_EXT: {
- VkPhysicalDeviceProvokingVertexFeaturesEXT *features =
- (VkPhysicalDeviceProvokingVertexFeaturesEXT *)ext;
- features->provokingVertexLast = true;
- features->transformFeedbackPreservesProvokingVertex = true;
- break;
- }
-
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_ROBUSTNESS_2_FEATURES_EXT: {
- VkPhysicalDeviceRobustness2FeaturesEXT *features = (void *)ext;
- features->robustBufferAccess2 = true;
- features->robustImageAccess2 = true;
- features->nullDescriptor = true;
- break;
- }
-
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SAMPLER_YCBCR_CONVERSION_FEATURES: {
- VkPhysicalDeviceSamplerYcbcrConversionFeatures *features =
- (VkPhysicalDeviceSamplerYcbcrConversionFeatures *) ext;
- CORE_FEATURE(1, 1, samplerYcbcrConversion);
- break;
- }
-
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SCALAR_BLOCK_LAYOUT_FEATURES_EXT: {
- VkPhysicalDeviceScalarBlockLayoutFeaturesEXT *features =
- (VkPhysicalDeviceScalarBlockLayoutFeaturesEXT *)ext;
- CORE_FEATURE(1, 2, scalarBlockLayout);
- break;
- }
-
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SEPARATE_DEPTH_STENCIL_LAYOUTS_FEATURES_KHR: {
- VkPhysicalDeviceSeparateDepthStencilLayoutsFeaturesKHR *features =
- (VkPhysicalDeviceSeparateDepthStencilLayoutsFeaturesKHR *)ext;
- CORE_FEATURE(1, 2, separateDepthStencilLayouts);
- break;
- }
-
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_ATOMIC_FLOAT_FEATURES_EXT: {
- VkPhysicalDeviceShaderAtomicFloatFeaturesEXT *features = (void *)ext;
- features->shaderBufferFloat32Atomics = true;
- features->shaderBufferFloat32AtomicAdd = pdevice->info.has_lsc;
- features->shaderBufferFloat64Atomics = pdevice->info.has_lsc;
- features->shaderBufferFloat64AtomicAdd = false;
- features->shaderSharedFloat32Atomics = true;
- features->shaderSharedFloat32AtomicAdd = false;
- features->shaderSharedFloat64Atomics = false;
- features->shaderSharedFloat64AtomicAdd = false;
- features->shaderImageFloat32Atomics = true;
- features->shaderImageFloat32AtomicAdd = false;
- features->sparseImageFloat32Atomics = false;
- features->sparseImageFloat32AtomicAdd = false;
- break;
- }
-
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_ATOMIC_FLOAT_2_FEATURES_EXT: {
- VkPhysicalDeviceShaderAtomicFloat2FeaturesEXT *features = (void *)ext;
- features->shaderBufferFloat16Atomics = false;
- features->shaderBufferFloat16AtomicAdd = false;
- features->shaderBufferFloat16AtomicMinMax = false;
- features->shaderBufferFloat32AtomicMinMax = pdevice->info.ver >= 9;
- features->shaderBufferFloat64AtomicMinMax = pdevice->info.has_lsc;
- features->shaderSharedFloat16Atomics = false;
- features->shaderSharedFloat16AtomicAdd = false;
- features->shaderSharedFloat16AtomicMinMax = false;
- features->shaderSharedFloat32AtomicMinMax = pdevice->info.ver >= 9;
- features->shaderSharedFloat64AtomicMinMax = false;
- features->shaderImageFloat32AtomicMinMax = false;
- features->sparseImageFloat32AtomicMinMax = false;
- break;
- }
-
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_ATOMIC_INT64_FEATURES_KHR: {
- VkPhysicalDeviceShaderAtomicInt64FeaturesKHR *features = (void *)ext;
- CORE_FEATURE(1, 2, shaderBufferInt64Atomics);
- CORE_FEATURE(1, 2, shaderSharedInt64Atomics);
- break;
- }
-
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_DEMOTE_TO_HELPER_INVOCATION_FEATURES_EXT: {
- VkPhysicalDeviceShaderDemoteToHelperInvocationFeaturesEXT *features = (void *)ext;
- features->shaderDemoteToHelperInvocation = true;
- break;
- }
-
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_CLOCK_FEATURES_KHR: {
- VkPhysicalDeviceShaderClockFeaturesKHR *features =
- (VkPhysicalDeviceShaderClockFeaturesKHR *)ext;
- features->shaderSubgroupClock = true;
- features->shaderDeviceClock = false;
- break;
- }
-
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_DRAW_PARAMETERS_FEATURES: {
- VkPhysicalDeviceShaderDrawParametersFeatures *features = (void *)ext;
- CORE_FEATURE(1, 1, shaderDrawParameters);
- break;
- }
-
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_INTEGER_FUNCTIONS_2_FEATURES_INTEL: {
- VkPhysicalDeviceShaderIntegerFunctions2FeaturesINTEL *features =
- (VkPhysicalDeviceShaderIntegerFunctions2FeaturesINTEL *)ext;
- features->shaderIntegerFunctions2 = true;
- break;
- }
-
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_INTEGER_DOT_PRODUCT_FEATURES_KHR: {
- VkPhysicalDeviceShaderIntegerDotProductFeaturesKHR *features =
- (VkPhysicalDeviceShaderIntegerDotProductFeaturesKHR *)ext;
- features->shaderIntegerDotProduct = true;
- break;
- };
-
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_SUBGROUP_EXTENDED_TYPES_FEATURES_KHR: {
- VkPhysicalDeviceShaderSubgroupExtendedTypesFeaturesKHR *features =
- (VkPhysicalDeviceShaderSubgroupExtendedTypesFeaturesKHR *)ext;
- CORE_FEATURE(1, 2, shaderSubgroupExtendedTypes);
- break;
- }
-
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_SUBGROUP_UNIFORM_CONTROL_FLOW_FEATURES_KHR: {
- VkPhysicalDeviceShaderSubgroupUniformControlFlowFeaturesKHR *features =
- (VkPhysicalDeviceShaderSubgroupUniformControlFlowFeaturesKHR *)ext;
- features->shaderSubgroupUniformControlFlow = true;
- break;
- }
-
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_TERMINATE_INVOCATION_FEATURES_KHR: {
- VkPhysicalDeviceShaderTerminateInvocationFeaturesKHR *features =
- (VkPhysicalDeviceShaderTerminateInvocationFeaturesKHR *)ext;
- features->shaderTerminateInvocation = true;
- break;
- }
-
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SUBGROUP_SIZE_CONTROL_FEATURES_EXT: {
- VkPhysicalDeviceSubgroupSizeControlFeaturesEXT *features =
- (VkPhysicalDeviceSubgroupSizeControlFeaturesEXT *)ext;
- features->subgroupSizeControl = true;
- features->computeFullSubgroups = true;
- break;
- }
-
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_TEXEL_BUFFER_ALIGNMENT_FEATURES_EXT: {
- VkPhysicalDeviceTexelBufferAlignmentFeaturesEXT *features =
- (VkPhysicalDeviceTexelBufferAlignmentFeaturesEXT *)ext;
- features->texelBufferAlignment = true;
- break;
- }
-
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_TIMELINE_SEMAPHORE_FEATURES_KHR: {
- VkPhysicalDeviceTimelineSemaphoreFeaturesKHR *features =
- (VkPhysicalDeviceTimelineSemaphoreFeaturesKHR *) ext;
- CORE_FEATURE(1, 2, timelineSemaphore);
- break;
- }
-
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VARIABLE_POINTERS_FEATURES: {
- VkPhysicalDeviceVariablePointersFeatures *features = (void *)ext;
- CORE_FEATURE(1, 1, variablePointersStorageBuffer);
- CORE_FEATURE(1, 1, variablePointers);
- break;
- }
-
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_TRANSFORM_FEEDBACK_FEATURES_EXT: {
- VkPhysicalDeviceTransformFeedbackFeaturesEXT *features =
- (VkPhysicalDeviceTransformFeedbackFeaturesEXT *)ext;
- features->transformFeedback = true;
- features->geometryStreams = true;
- break;
- }
-
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_UNIFORM_BUFFER_STANDARD_LAYOUT_FEATURES_KHR: {
- VkPhysicalDeviceUniformBufferStandardLayoutFeaturesKHR *features =
- (VkPhysicalDeviceUniformBufferStandardLayoutFeaturesKHR *)ext;
- CORE_FEATURE(1, 2, uniformBufferStandardLayout);
- break;
- }
-
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VERTEX_ATTRIBUTE_DIVISOR_FEATURES_EXT: {
- VkPhysicalDeviceVertexAttributeDivisorFeaturesEXT *features =
- (VkPhysicalDeviceVertexAttributeDivisorFeaturesEXT *)ext;
- features->vertexAttributeInstanceRateDivisor = true;
- features->vertexAttributeInstanceRateZeroDivisor = true;
- break;
- }
-
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_1_FEATURES:
- anv_get_physical_device_features_1_1(pdevice, (void *)ext);
- break;
-
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_2_FEATURES:
- anv_get_physical_device_features_1_2(pdevice, (void *)ext);
- break;
-
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_MEMORY_MODEL_FEATURES_KHR: {
- VkPhysicalDeviceVulkanMemoryModelFeaturesKHR *features = (void *)ext;
- CORE_FEATURE(1, 2, vulkanMemoryModel);
- CORE_FEATURE(1, 2, vulkanMemoryModelDeviceScope);
- CORE_FEATURE(1, 2, vulkanMemoryModelAvailabilityVisibilityChains);
- break;
- }
-
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_WORKGROUP_MEMORY_EXPLICIT_LAYOUT_FEATURES_KHR: {
- VkPhysicalDeviceWorkgroupMemoryExplicitLayoutFeaturesKHR *features =
- (VkPhysicalDeviceWorkgroupMemoryExplicitLayoutFeaturesKHR *)ext;
- features->workgroupMemoryExplicitLayout = true;
- features->workgroupMemoryExplicitLayoutScalarBlockLayout = true;
- features->workgroupMemoryExplicitLayout8BitAccess = true;
- features->workgroupMemoryExplicitLayout16BitAccess = true;
- break;
- }
-
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_YCBCR_IMAGE_ARRAYS_FEATURES_EXT: {
- VkPhysicalDeviceYcbcrImageArraysFeaturesEXT *features =
- (VkPhysicalDeviceYcbcrImageArraysFeaturesEXT *)ext;
- features->ycbcrImageArrays = true;
- break;
- }
-
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_EXTENDED_DYNAMIC_STATE_FEATURES_EXT: {
- VkPhysicalDeviceExtendedDynamicStateFeaturesEXT *features =
- (VkPhysicalDeviceExtendedDynamicStateFeaturesEXT *)ext;
- features->extendedDynamicState = true;
- break;
- }
-
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_EXTENDED_DYNAMIC_STATE_2_FEATURES_EXT: {
- VkPhysicalDeviceExtendedDynamicState2FeaturesEXT *features =
- (VkPhysicalDeviceExtendedDynamicState2FeaturesEXT *)ext;
- features->extendedDynamicState2 = true;
- features->extendedDynamicState2LogicOp = true;
- features->extendedDynamicState2PatchControlPoints = false;
- break;
- }
-
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_ZERO_INITIALIZE_WORKGROUP_MEMORY_FEATURES_KHR: {
- VkPhysicalDeviceZeroInitializeWorkgroupMemoryFeaturesKHR *features =
- (VkPhysicalDeviceZeroInitializeWorkgroupMemoryFeaturesKHR *)ext;
- features->shaderZeroInitializeWorkgroupMemory = true;
- break;
- }
-
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MULTI_DRAW_FEATURES_EXT: {
- VkPhysicalDeviceMultiDrawFeaturesEXT *features = (VkPhysicalDeviceMultiDrawFeaturesEXT *)ext;
- features->multiDraw = true;
- break;
- }
-
- default:
- anv_debug_ignored_stype(ext->sType);
- break;
- }
- }
-
-#undef CORE_FEATURE
-}
-
-#define MAX_PER_STAGE_DESCRIPTOR_UNIFORM_BUFFERS 64
-
-#define MAX_PER_STAGE_DESCRIPTOR_INPUT_ATTACHMENTS 64
-#define MAX_DESCRIPTOR_SET_INPUT_ATTACHMENTS 256
-
-#define MAX_CUSTOM_BORDER_COLORS 4096
-
-void anv_GetPhysicalDeviceProperties(
- VkPhysicalDevice physicalDevice,
- VkPhysicalDeviceProperties* pProperties)
-{
- ANV_FROM_HANDLE(anv_physical_device, pdevice, physicalDevice);
- const struct intel_device_info *devinfo = &pdevice->info;
-
- /* See assertions made when programming the buffer surface state. */
- const uint32_t max_raw_buffer_sz = devinfo->ver >= 7 ?
- (1ul << 30) : (1ul << 27);
-
- const uint32_t max_ssbos = pdevice->has_a64_buffer_access ? UINT16_MAX : 64;
- const uint32_t max_textures =
- pdevice->has_bindless_images ? UINT16_MAX : 128;
- const uint32_t max_samplers =
- pdevice->has_bindless_samplers ? UINT16_MAX :
- (devinfo->verx10 >= 75) ? 128 : 16;
- const uint32_t max_images =
- pdevice->has_bindless_images ? UINT16_MAX : MAX_IMAGES;
-
- /* If we can use bindless for everything, claim a high per-stage limit,
- * otherwise use the binding table size, minus the slots reserved for
- * render targets and one slot for the descriptor buffer. */
- const uint32_t max_per_stage =
- pdevice->has_bindless_images && pdevice->has_a64_buffer_access
- ? UINT32_MAX : MAX_BINDING_TABLE_SIZE - MAX_RTS - 1;
-
- const uint32_t max_workgroup_size = 32 * devinfo->max_cs_workgroup_threads;
-
- VkSampleCountFlags sample_counts =
- isl_device_get_sample_counts(&pdevice->isl_dev);
-
-
- VkPhysicalDeviceLimits limits = {
- .maxImageDimension1D = (1 << 14),
- .maxImageDimension2D = (1 << 14),
- .maxImageDimension3D = (1 << 11),
- .maxImageDimensionCube = (1 << 14),
- .maxImageArrayLayers = (1 << 11),
- .maxTexelBufferElements = 128 * 1024 * 1024,
- .maxUniformBufferRange = (1ul << 27),
- .maxStorageBufferRange = max_raw_buffer_sz,
- .maxPushConstantsSize = MAX_PUSH_CONSTANTS_SIZE,
- .maxMemoryAllocationCount = UINT32_MAX,
- .maxSamplerAllocationCount = 64 * 1024,
- .bufferImageGranularity = 64, /* A cache line */
- .sparseAddressSpaceSize = 0,
- .maxBoundDescriptorSets = MAX_SETS,
- .maxPerStageDescriptorSamplers = max_samplers,
- .maxPerStageDescriptorUniformBuffers = MAX_PER_STAGE_DESCRIPTOR_UNIFORM_BUFFERS,
- .maxPerStageDescriptorStorageBuffers = max_ssbos,
- .maxPerStageDescriptorSampledImages = max_textures,
- .maxPerStageDescriptorStorageImages = max_images,
- .maxPerStageDescriptorInputAttachments = MAX_PER_STAGE_DESCRIPTOR_INPUT_ATTACHMENTS,
- .maxPerStageResources = max_per_stage,
- .maxDescriptorSetSamplers = 6 * max_samplers, /* number of stages * maxPerStageDescriptorSamplers */
- .maxDescriptorSetUniformBuffers = 6 * MAX_PER_STAGE_DESCRIPTOR_UNIFORM_BUFFERS, /* number of stages * maxPerStageDescriptorUniformBuffers */
- .maxDescriptorSetUniformBuffersDynamic = MAX_DYNAMIC_BUFFERS / 2,
- .maxDescriptorSetStorageBuffers = 6 * max_ssbos, /* number of stages * maxPerStageDescriptorStorageBuffers */
- .maxDescriptorSetStorageBuffersDynamic = MAX_DYNAMIC_BUFFERS / 2,
- .maxDescriptorSetSampledImages = 6 * max_textures, /* number of stages * maxPerStageDescriptorSampledImages */
- .maxDescriptorSetStorageImages = 6 * max_images, /* number of stages * maxPerStageDescriptorStorageImages */
- .maxDescriptorSetInputAttachments = MAX_DESCRIPTOR_SET_INPUT_ATTACHMENTS,
- .maxVertexInputAttributes = MAX_VBS,
- .maxVertexInputBindings = MAX_VBS,
- .maxVertexInputAttributeOffset = 2047,
- .maxVertexInputBindingStride = 2048,
- .maxVertexOutputComponents = 128,
- .maxTessellationGenerationLevel = 64,
- .maxTessellationPatchSize = 32,
- .maxTessellationControlPerVertexInputComponents = 128,
- .maxTessellationControlPerVertexOutputComponents = 128,
- .maxTessellationControlPerPatchOutputComponents = 128,
- .maxTessellationControlTotalOutputComponents = 2048,
- .maxTessellationEvaluationInputComponents = 128,
- .maxTessellationEvaluationOutputComponents = 128,
- .maxGeometryShaderInvocations = 32,
- .maxGeometryInputComponents = devinfo->ver >= 8 ? 128 : 64,
- .maxGeometryOutputComponents = 128,
- .maxGeometryOutputVertices = 256,
- .maxGeometryTotalOutputComponents = 1024,
- .maxFragmentInputComponents = 116, /* 128 components - (PSIZ, CLIP_DIST0, CLIP_DIST1) */
- .maxFragmentOutputAttachments = 8,
- .maxFragmentDualSrcAttachments = 1,
- .maxFragmentCombinedOutputResources = 8,
- .maxComputeSharedMemorySize = 64 * 1024,
- .maxComputeWorkGroupCount = { 65535, 65535, 65535 },
- .maxComputeWorkGroupInvocations = max_workgroup_size,
- .maxComputeWorkGroupSize = {
- max_workgroup_size,
- max_workgroup_size,
- max_workgroup_size,
- },
- .subPixelPrecisionBits = 8,
- .subTexelPrecisionBits = 8,
- .mipmapPrecisionBits = 8,
- .maxDrawIndexedIndexValue = UINT32_MAX,
- .maxDrawIndirectCount = UINT32_MAX,
- .maxSamplerLodBias = 16,
- .maxSamplerAnisotropy = 16,
- .maxViewports = MAX_VIEWPORTS,
- .maxViewportDimensions = { (1 << 14), (1 << 14) },
- .viewportBoundsRange = { INT16_MIN, INT16_MAX },
- .viewportSubPixelBits = 13, /* We take a float? */
- .minMemoryMapAlignment = 4096, /* A page */
- /* The dataport requires texel alignment so we need to assume a worst
- * case of R32G32B32A32 which is 16 bytes.
- */
- .minTexelBufferOffsetAlignment = 16,
- .minUniformBufferOffsetAlignment = ANV_UBO_ALIGNMENT,
- .minStorageBufferOffsetAlignment = ANV_SSBO_ALIGNMENT,
- .minTexelOffset = -8,
- .maxTexelOffset = 7,
- .minTexelGatherOffset = -32,
- .maxTexelGatherOffset = 31,
- .minInterpolationOffset = -0.5,
- .maxInterpolationOffset = 0.4375,
- .subPixelInterpolationOffsetBits = 4,
- .maxFramebufferWidth = (1 << 14),
- .maxFramebufferHeight = (1 << 14),
- .maxFramebufferLayers = (1 << 11),
- .framebufferColorSampleCounts = sample_counts,
- .framebufferDepthSampleCounts = sample_counts,
- .framebufferStencilSampleCounts = sample_counts,
- .framebufferNoAttachmentsSampleCounts = sample_counts,
- .maxColorAttachments = MAX_RTS,
- .sampledImageColorSampleCounts = sample_counts,
- .sampledImageIntegerSampleCounts = sample_counts,
- .sampledImageDepthSampleCounts = sample_counts,
- .sampledImageStencilSampleCounts = sample_counts,
- .storageImageSampleCounts = VK_SAMPLE_COUNT_1_BIT,
- .maxSampleMaskWords = 1,
- .timestampComputeAndGraphics = true,
- .timestampPeriod = 1000000000.0 / devinfo->timestamp_frequency,
- .maxClipDistances = 8,
- .maxCullDistances = 8,
- .maxCombinedClipAndCullDistances = 8,
- .discreteQueuePriorities = 2,
- .pointSizeRange = { 0.125, 255.875 },
- /* While SKL and up support much wider lines than we are setting here,
- * in practice we run into conformance issues if we go past this limit.
- * Since the Windows driver does the same, it's probably fair to assume
- * that no one needs more than this.
- */
- .lineWidthRange = { 0.0, 7.9921875 },
- .pointSizeGranularity = (1.0 / 8.0),
- .lineWidthGranularity = (1.0 / 128.0),
- .strictLines = false,
- .standardSampleLocations = true,
- .optimalBufferCopyOffsetAlignment = 128,
- .optimalBufferCopyRowPitchAlignment = 128,
- .nonCoherentAtomSize = 64,
- };
-
- *pProperties = (VkPhysicalDeviceProperties) {
- .apiVersion = ANV_API_VERSION,
- .driverVersion = vk_get_driver_version(),
- .vendorID = 0x8086,
- .deviceID = pdevice->info.chipset_id,
- .deviceType = pdevice->info.has_local_mem ?
- VK_PHYSICAL_DEVICE_TYPE_DISCRETE_GPU :
- VK_PHYSICAL_DEVICE_TYPE_INTEGRATED_GPU,
- .limits = limits,
- .sparseProperties = {0}, /* Broadwell doesn't do sparse. */
- };
-
- snprintf(pProperties->deviceName, sizeof(pProperties->deviceName),
- "%s", pdevice->info.name);
- memcpy(pProperties->pipelineCacheUUID,
- pdevice->pipeline_cache_uuid, VK_UUID_SIZE);
-}
-
-static void
-anv_get_physical_device_properties_1_1(struct anv_physical_device *pdevice,
- VkPhysicalDeviceVulkan11Properties *p)
-{
- assert(p->sType == VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_1_PROPERTIES);
-
- memcpy(p->deviceUUID, pdevice->device_uuid, VK_UUID_SIZE);
- memcpy(p->driverUUID, pdevice->driver_uuid, VK_UUID_SIZE);
- memset(p->deviceLUID, 0, VK_LUID_SIZE);
- p->deviceNodeMask = 0;
- p->deviceLUIDValid = false;
-
- p->subgroupSize = BRW_SUBGROUP_SIZE;
- VkShaderStageFlags scalar_stages = 0;
- for (unsigned stage = 0; stage < MESA_SHADER_STAGES; stage++) {
- if (pdevice->compiler->scalar_stage[stage])
- scalar_stages |= mesa_to_vk_shader_stage(stage);
- }
- if (pdevice->vk.supported_extensions.KHR_ray_tracing_pipeline) {
- scalar_stages |= MESA_SHADER_RAYGEN |
- MESA_SHADER_ANY_HIT |
- MESA_SHADER_CLOSEST_HIT |
- MESA_SHADER_MISS |
- MESA_SHADER_INTERSECTION |
- MESA_SHADER_CALLABLE;
- }
- p->subgroupSupportedStages = scalar_stages;
- p->subgroupSupportedOperations = VK_SUBGROUP_FEATURE_BASIC_BIT |
- VK_SUBGROUP_FEATURE_VOTE_BIT |
- VK_SUBGROUP_FEATURE_BALLOT_BIT |
- VK_SUBGROUP_FEATURE_SHUFFLE_BIT |
- VK_SUBGROUP_FEATURE_SHUFFLE_RELATIVE_BIT |
- VK_SUBGROUP_FEATURE_QUAD_BIT;
- if (pdevice->info.ver >= 8) {
- /* TODO: There's no technical reason why these can't be made to
-       * work on gfx7 but they don't at the moment, so it's better to leave
-       * the feature disabled than enabled and broken.
- */
- p->subgroupSupportedOperations |= VK_SUBGROUP_FEATURE_ARITHMETIC_BIT |
- VK_SUBGROUP_FEATURE_CLUSTERED_BIT;
- }
- p->subgroupQuadOperationsInAllStages = pdevice->info.ver >= 8;
-
- p->pointClippingBehavior = VK_POINT_CLIPPING_BEHAVIOR_USER_CLIP_PLANES_ONLY;
- p->maxMultiviewViewCount = 16;
- p->maxMultiviewInstanceIndex = UINT32_MAX / 16;
- p->protectedNoFault = false;
- /* This value doesn't matter for us today as our per-stage descriptors are
- * the real limit.
- */
- p->maxPerSetDescriptors = 1024;
- p->maxMemoryAllocationSize = MAX_MEMORY_ALLOCATION_SIZE;
-}
-
-static void
-anv_get_physical_device_properties_1_2(struct anv_physical_device *pdevice,
- VkPhysicalDeviceVulkan12Properties *p)
-{
- assert(p->sType == VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_2_PROPERTIES);
-
- p->driverID = VK_DRIVER_ID_INTEL_OPEN_SOURCE_MESA_KHR;
- memset(p->driverName, 0, sizeof(p->driverName));
- snprintf(p->driverName, VK_MAX_DRIVER_NAME_SIZE_KHR,
- "Intel open-source Mesa driver");
- memset(p->driverInfo, 0, sizeof(p->driverInfo));
- snprintf(p->driverInfo, VK_MAX_DRIVER_INFO_SIZE_KHR,
- "Mesa " PACKAGE_VERSION MESA_GIT_SHA1);
- p->conformanceVersion = (VkConformanceVersionKHR) {
- .major = 1,
- .minor = 2,
- .subminor = 0,
- .patch = 0,
- };
-
- p->denormBehaviorIndependence =
- VK_SHADER_FLOAT_CONTROLS_INDEPENDENCE_ALL_KHR;
- p->roundingModeIndependence =
- VK_SHADER_FLOAT_CONTROLS_INDEPENDENCE_NONE_KHR;
-
-   /* Broadwell does not support HF denorms and there are restrictions on
-    * other gens. According to Kabylake's PRM:
- *
- * "math - Extended Math Function
- * [...]
- * Restriction : Half-float denorms are always retained."
- */
- p->shaderDenormFlushToZeroFloat16 = false;
- p->shaderDenormPreserveFloat16 = pdevice->info.ver > 8;
- p->shaderRoundingModeRTEFloat16 = true;
- p->shaderRoundingModeRTZFloat16 = true;
- p->shaderSignedZeroInfNanPreserveFloat16 = true;
-
- p->shaderDenormFlushToZeroFloat32 = true;
- p->shaderDenormPreserveFloat32 = true;
- p->shaderRoundingModeRTEFloat32 = true;
- p->shaderRoundingModeRTZFloat32 = true;
- p->shaderSignedZeroInfNanPreserveFloat32 = true;
-
- p->shaderDenormFlushToZeroFloat64 = true;
- p->shaderDenormPreserveFloat64 = true;
- p->shaderRoundingModeRTEFloat64 = true;
- p->shaderRoundingModeRTZFloat64 = true;
- p->shaderSignedZeroInfNanPreserveFloat64 = true;
-
- /* It's a bit hard to exactly map our implementation to the limits
- * described by Vulkan. The bindless surface handle in the extended
- * message descriptors is 20 bits and it's an index into the table of
-    * RENDER_SURFACE_STATE structs that starts at the bindless surface base
-    * address. This means that we can have at most 1M surface states
- * allocated at any given time. Since most image views take two
- * descriptors, this means we have a limit of about 500K image views.
- *
-    * However, since we allocate surface states at vkCreateImageView time,
-    * our limit is actually something on the order of 500K image views
-    * allocated at any time. The actual limit described by Vulkan, on
- * the other hand, is a limit of how many you can have in a descriptor set.
- * Assuming anyone using 1M descriptors will be using the same image view
- * twice a bunch of times (or a bunch of null descriptors), we can safely
- * advertise a larger limit here.
- */
- const unsigned max_bindless_views = 1 << 20;
- p->maxUpdateAfterBindDescriptorsInAllPools = max_bindless_views;
- p->shaderUniformBufferArrayNonUniformIndexingNative = false;
- p->shaderSampledImageArrayNonUniformIndexingNative = false;
- p->shaderStorageBufferArrayNonUniformIndexingNative = true;
- p->shaderStorageImageArrayNonUniformIndexingNative = false;
- p->shaderInputAttachmentArrayNonUniformIndexingNative = false;
- p->robustBufferAccessUpdateAfterBind = true;
- p->quadDivergentImplicitLod = false;
- p->maxPerStageDescriptorUpdateAfterBindSamplers = max_bindless_views;
- p->maxPerStageDescriptorUpdateAfterBindUniformBuffers = MAX_PER_STAGE_DESCRIPTOR_UNIFORM_BUFFERS;
- p->maxPerStageDescriptorUpdateAfterBindStorageBuffers = UINT32_MAX;
- p->maxPerStageDescriptorUpdateAfterBindSampledImages = max_bindless_views;
- p->maxPerStageDescriptorUpdateAfterBindStorageImages = max_bindless_views;
- p->maxPerStageDescriptorUpdateAfterBindInputAttachments = MAX_PER_STAGE_DESCRIPTOR_INPUT_ATTACHMENTS;
- p->maxPerStageUpdateAfterBindResources = UINT32_MAX;
- p->maxDescriptorSetUpdateAfterBindSamplers = max_bindless_views;
- p->maxDescriptorSetUpdateAfterBindUniformBuffers = 6 * MAX_PER_STAGE_DESCRIPTOR_UNIFORM_BUFFERS;
- p->maxDescriptorSetUpdateAfterBindUniformBuffersDynamic = MAX_DYNAMIC_BUFFERS / 2;
- p->maxDescriptorSetUpdateAfterBindStorageBuffers = UINT32_MAX;
- p->maxDescriptorSetUpdateAfterBindStorageBuffersDynamic = MAX_DYNAMIC_BUFFERS / 2;
- p->maxDescriptorSetUpdateAfterBindSampledImages = max_bindless_views;
- p->maxDescriptorSetUpdateAfterBindStorageImages = max_bindless_views;
- p->maxDescriptorSetUpdateAfterBindInputAttachments = MAX_DESCRIPTOR_SET_INPUT_ATTACHMENTS;
-
- /* We support all of the depth resolve modes */
- p->supportedDepthResolveModes = VK_RESOLVE_MODE_SAMPLE_ZERO_BIT_KHR |
- VK_RESOLVE_MODE_AVERAGE_BIT_KHR |
- VK_RESOLVE_MODE_MIN_BIT_KHR |
- VK_RESOLVE_MODE_MAX_BIT_KHR;
- /* Average doesn't make sense for stencil so we don't support that */
- p->supportedStencilResolveModes = VK_RESOLVE_MODE_SAMPLE_ZERO_BIT_KHR;
- if (pdevice->info.ver >= 8) {
- /* The advanced stencil resolve modes currently require stencil
- * sampling be supported by the hardware.
- */
- p->supportedStencilResolveModes |= VK_RESOLVE_MODE_MIN_BIT_KHR |
- VK_RESOLVE_MODE_MAX_BIT_KHR;
- }
- p->independentResolveNone = true;
- p->independentResolve = true;
-
- p->filterMinmaxSingleComponentFormats = pdevice->info.ver >= 9;
- p->filterMinmaxImageComponentMapping = pdevice->info.ver >= 9;
-
- p->maxTimelineSemaphoreValueDifference = UINT64_MAX;
-
- p->framebufferIntegerColorSampleCounts =
- isl_device_get_sample_counts(&pdevice->isl_dev);
-}
-
void anv_GetPhysicalDeviceProperties2(
VkPhysicalDevice physicalDevice,
VkPhysicalDeviceProperties2* pProperties)
{
- ANV_FROM_HANDLE(anv_physical_device, pdevice, physicalDevice);
-
- anv_GetPhysicalDeviceProperties(physicalDevice, &pProperties->properties);
-
- VkPhysicalDeviceVulkan11Properties core_1_1 = {
- .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_1_PROPERTIES,
- };
- anv_get_physical_device_properties_1_1(pdevice, &core_1_1);
-
- VkPhysicalDeviceVulkan12Properties core_1_2 = {
- .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_2_PROPERTIES,
- };
- anv_get_physical_device_properties_1_2(pdevice, &core_1_2);
-
-#define CORE_RENAMED_PROPERTY(major, minor, ext_property, core_property) \
- memcpy(&properties->ext_property, &core_##major##_##minor.core_property, \
- sizeof(core_##major##_##minor.core_property))
-
-#define CORE_PROPERTY(major, minor, property) \
- CORE_RENAMED_PROPERTY(major, minor, property, property)
+ vk_common_GetPhysicalDeviceProperties2(physicalDevice, pProperties);
+   /* Unfortunately the runtime doesn't handle the ANDROID extensions. */
vk_foreach_struct(ext, pProperties->pNext) {
switch (ext->sType) {
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_ACCELERATION_STRUCTURE_PROPERTIES_KHR: {
- VkPhysicalDeviceAccelerationStructurePropertiesKHR *props = (void *)ext;
- props->maxGeometryCount = (1u << 24) - 1;
- props->maxInstanceCount = (1u << 24) - 1;
- props->maxPrimitiveCount = (1u << 29) - 1;
- props->maxPerStageDescriptorAccelerationStructures = UINT16_MAX;
- props->maxPerStageDescriptorUpdateAfterBindAccelerationStructures = UINT16_MAX;
- props->maxDescriptorSetAccelerationStructures = UINT16_MAX;
- props->maxDescriptorSetUpdateAfterBindAccelerationStructures = UINT16_MAX;
- props->minAccelerationStructureScratchOffsetAlignment = 64;
- break;
- }
-
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_CONSERVATIVE_RASTERIZATION_PROPERTIES_EXT: {
- /* TODO: Real limits */
- VkPhysicalDeviceConservativeRasterizationPropertiesEXT *properties =
- (VkPhysicalDeviceConservativeRasterizationPropertiesEXT *)ext;
- /* There's nothing in the public docs about this value as far as I
- * can tell. However, this is the value the Windows driver reports
- * and there's a comment on a rejected HW feature in the internal
- * docs that says:
- *
- * "This is similar to conservative rasterization, except the
- * primitive area is not extended by 1/512 and..."
- *
- * That's a bit of an obtuse reference but it's the best we've got
- * for now.
- */
- properties->primitiveOverestimationSize = 1.0f / 512.0f;
- properties->maxExtraPrimitiveOverestimationSize = 0.0f;
- properties->extraPrimitiveOverestimationSizeGranularity = 0.0f;
- properties->primitiveUnderestimation = false;
- properties->conservativePointAndLineRasterization = false;
- properties->degenerateTrianglesRasterized = true;
- properties->degenerateLinesRasterized = false;
- properties->fullyCoveredFragmentShaderInputVariable = false;
- properties->conservativeRasterizationPostDepthCoverage = true;
- break;
- }
-
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_CUSTOM_BORDER_COLOR_PROPERTIES_EXT: {
- VkPhysicalDeviceCustomBorderColorPropertiesEXT *properties =
- (VkPhysicalDeviceCustomBorderColorPropertiesEXT *)ext;
- properties->maxCustomBorderColorSamplers = MAX_CUSTOM_BORDER_COLORS;
- break;
- }
-
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_DEPTH_STENCIL_RESOLVE_PROPERTIES_KHR: {
- VkPhysicalDeviceDepthStencilResolvePropertiesKHR *properties =
- (VkPhysicalDeviceDepthStencilResolvePropertiesKHR *)ext;
- CORE_PROPERTY(1, 2, supportedDepthResolveModes);
- CORE_PROPERTY(1, 2, supportedStencilResolveModes);
- CORE_PROPERTY(1, 2, independentResolveNone);
- CORE_PROPERTY(1, 2, independentResolve);
- break;
- }
-
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_DESCRIPTOR_INDEXING_PROPERTIES_EXT: {
- VkPhysicalDeviceDescriptorIndexingPropertiesEXT *properties =
- (VkPhysicalDeviceDescriptorIndexingPropertiesEXT *)ext;
- CORE_PROPERTY(1, 2, maxUpdateAfterBindDescriptorsInAllPools);
- CORE_PROPERTY(1, 2, shaderUniformBufferArrayNonUniformIndexingNative);
- CORE_PROPERTY(1, 2, shaderSampledImageArrayNonUniformIndexingNative);
- CORE_PROPERTY(1, 2, shaderStorageBufferArrayNonUniformIndexingNative);
- CORE_PROPERTY(1, 2, shaderStorageImageArrayNonUniformIndexingNative);
- CORE_PROPERTY(1, 2, shaderInputAttachmentArrayNonUniformIndexingNative);
- CORE_PROPERTY(1, 2, robustBufferAccessUpdateAfterBind);
- CORE_PROPERTY(1, 2, quadDivergentImplicitLod);
- CORE_PROPERTY(1, 2, maxPerStageDescriptorUpdateAfterBindSamplers);
- CORE_PROPERTY(1, 2, maxPerStageDescriptorUpdateAfterBindUniformBuffers);
- CORE_PROPERTY(1, 2, maxPerStageDescriptorUpdateAfterBindStorageBuffers);
- CORE_PROPERTY(1, 2, maxPerStageDescriptorUpdateAfterBindSampledImages);
- CORE_PROPERTY(1, 2, maxPerStageDescriptorUpdateAfterBindStorageImages);
- CORE_PROPERTY(1, 2, maxPerStageDescriptorUpdateAfterBindInputAttachments);
- CORE_PROPERTY(1, 2, maxPerStageUpdateAfterBindResources);
- CORE_PROPERTY(1, 2, maxDescriptorSetUpdateAfterBindSamplers);
- CORE_PROPERTY(1, 2, maxDescriptorSetUpdateAfterBindUniformBuffers);
- CORE_PROPERTY(1, 2, maxDescriptorSetUpdateAfterBindUniformBuffersDynamic);
- CORE_PROPERTY(1, 2, maxDescriptorSetUpdateAfterBindStorageBuffers);
- CORE_PROPERTY(1, 2, maxDescriptorSetUpdateAfterBindStorageBuffersDynamic);
- CORE_PROPERTY(1, 2, maxDescriptorSetUpdateAfterBindSampledImages);
- CORE_PROPERTY(1, 2, maxDescriptorSetUpdateAfterBindStorageImages);
- CORE_PROPERTY(1, 2, maxDescriptorSetUpdateAfterBindInputAttachments);
- break;
- }
-
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FRAGMENT_SHADING_RATE_PROPERTIES_KHR: {
- VkPhysicalDeviceFragmentShadingRatePropertiesKHR *props =
- (VkPhysicalDeviceFragmentShadingRatePropertiesKHR *)ext;
- /* Those must be 0 if attachmentFragmentShadingRate is not
- * supported.
- */
- props->minFragmentShadingRateAttachmentTexelSize = (VkExtent2D) { 0, 0 };
- props->maxFragmentShadingRateAttachmentTexelSize = (VkExtent2D) { 0, 0 };
- props->maxFragmentShadingRateAttachmentTexelSizeAspectRatio = 0;
-
- props->primitiveFragmentShadingRateWithMultipleViewports = false;
- props->layeredShadingRateAttachments = false;
- props->fragmentShadingRateNonTrivialCombinerOps = false;
- props->maxFragmentSize = (VkExtent2D) { 4, 4 };
- props->maxFragmentSizeAspectRatio = 4;
- props->maxFragmentShadingRateCoverageSamples = 4 * 4 * 16;
- props->maxFragmentShadingRateRasterizationSamples = VK_SAMPLE_COUNT_16_BIT;
- props->fragmentShadingRateWithShaderDepthStencilWrites = false;
- props->fragmentShadingRateWithSampleMask = true;
- props->fragmentShadingRateWithShaderSampleMask = false;
- props->fragmentShadingRateWithConservativeRasterization = true;
- props->fragmentShadingRateWithFragmentShaderInterlock = true;
- props->fragmentShadingRateWithCustomSampleLocations = true;
- props->fragmentShadingRateStrictMultiplyCombiner = false;
- break;
- }
-
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_DRIVER_PROPERTIES_KHR: {
- VkPhysicalDeviceDriverPropertiesKHR *properties =
- (VkPhysicalDeviceDriverPropertiesKHR *) ext;
- CORE_PROPERTY(1, 2, driverID);
- CORE_PROPERTY(1, 2, driverName);
- CORE_PROPERTY(1, 2, driverInfo);
- CORE_PROPERTY(1, 2, conformanceVersion);
- break;
- }
-
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_DRM_PROPERTIES_EXT: {
- VkPhysicalDeviceDrmPropertiesEXT *props =
- (VkPhysicalDeviceDrmPropertiesEXT *)ext;
-
- props->hasPrimary = pdevice->has_master;
- props->primaryMajor = pdevice->master_major;
- props->primaryMinor = pdevice->master_minor;
-
- props->hasRender = pdevice->has_local;
- props->renderMajor = pdevice->local_major;
- props->renderMinor = pdevice->local_minor;
-
- break;
- }
-
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_EXTERNAL_MEMORY_HOST_PROPERTIES_EXT: {
- VkPhysicalDeviceExternalMemoryHostPropertiesEXT *props =
- (VkPhysicalDeviceExternalMemoryHostPropertiesEXT *) ext;
- /* Userptr needs page aligned memory. */
- props->minImportedHostPointerAlignment = 4096;
- break;
- }
-
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_ID_PROPERTIES: {
- VkPhysicalDeviceIDProperties *properties =
- (VkPhysicalDeviceIDProperties *)ext;
- CORE_PROPERTY(1, 1, deviceUUID);
- CORE_PROPERTY(1, 1, driverUUID);
- CORE_PROPERTY(1, 1, deviceLUID);
- CORE_PROPERTY(1, 1, deviceLUIDValid);
- break;
- }
-
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_INLINE_UNIFORM_BLOCK_PROPERTIES_EXT: {
- VkPhysicalDeviceInlineUniformBlockPropertiesEXT *props =
- (VkPhysicalDeviceInlineUniformBlockPropertiesEXT *)ext;
- props->maxInlineUniformBlockSize = MAX_INLINE_UNIFORM_BLOCK_SIZE;
- props->maxPerStageDescriptorInlineUniformBlocks =
- MAX_INLINE_UNIFORM_BLOCK_DESCRIPTORS;
- props->maxPerStageDescriptorUpdateAfterBindInlineUniformBlocks =
- MAX_INLINE_UNIFORM_BLOCK_DESCRIPTORS;
- props->maxDescriptorSetInlineUniformBlocks =
- MAX_INLINE_UNIFORM_BLOCK_DESCRIPTORS;
- props->maxDescriptorSetUpdateAfterBindInlineUniformBlocks =
- MAX_INLINE_UNIFORM_BLOCK_DESCRIPTORS;
- break;
- }
-
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_LINE_RASTERIZATION_PROPERTIES_EXT: {
- VkPhysicalDeviceLineRasterizationPropertiesEXT *props =
- (VkPhysicalDeviceLineRasterizationPropertiesEXT *)ext;
- /* In the Skylake PRM Vol. 7, subsection titled "GIQ (Diamond)
- * Sampling Rules - Legacy Mode", it says the following:
- *
- * "Note that the device divides a pixel into a 16x16 array of
- * subpixels, referenced by their upper left corners."
- *
- * This is the only known reference in the PRMs to the subpixel
- * precision of line rasterization and a "16x16 array of subpixels"
- * implies 4 subpixel precision bits. Empirical testing has shown
- * that 4 subpixel precision bits applies to all line rasterization
- * types.
- */
- props->lineSubPixelPrecisionBits = 4;
- break;
- }
-
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MAINTENANCE_3_PROPERTIES: {
- VkPhysicalDeviceMaintenance3Properties *properties =
- (VkPhysicalDeviceMaintenance3Properties *)ext;
- /* This value doesn't matter for us today as our per-stage
- * descriptors are the real limit.
- */
- CORE_PROPERTY(1, 1, maxPerSetDescriptors);
- CORE_PROPERTY(1, 1, maxMemoryAllocationSize);
- break;
- }
-
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MULTIVIEW_PROPERTIES: {
- VkPhysicalDeviceMultiviewProperties *properties =
- (VkPhysicalDeviceMultiviewProperties *)ext;
- CORE_PROPERTY(1, 1, maxMultiviewViewCount);
- CORE_PROPERTY(1, 1, maxMultiviewInstanceIndex);
- break;
- }
-
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PCI_BUS_INFO_PROPERTIES_EXT: {
- VkPhysicalDevicePCIBusInfoPropertiesEXT *properties =
- (VkPhysicalDevicePCIBusInfoPropertiesEXT *)ext;
- properties->pciDomain = pdevice->pci_info.domain;
- properties->pciBus = pdevice->pci_info.bus;
- properties->pciDevice = pdevice->pci_info.device;
- properties->pciFunction = pdevice->pci_info.function;
- break;
- }
-
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PERFORMANCE_QUERY_PROPERTIES_KHR: {
- VkPhysicalDevicePerformanceQueryPropertiesKHR *properties =
- (VkPhysicalDevicePerformanceQueryPropertiesKHR *)ext;
- /* We could support this by spawning a shader to do the equation
- * normalization.
- */
- properties->allowCommandBufferQueryCopies = false;
- break;
- }
-
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_POINT_CLIPPING_PROPERTIES: {
- VkPhysicalDevicePointClippingProperties *properties =
- (VkPhysicalDevicePointClippingProperties *) ext;
- CORE_PROPERTY(1, 1, pointClippingBehavior);
- break;
- }
-
+#if DETECT_OS_ANDROID
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wswitch"
case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PRESENTATION_PROPERTIES_ANDROID: {
VkPhysicalDevicePresentationPropertiesANDROID *props =
(VkPhysicalDevicePresentationPropertiesANDROID *)ext;
- props->sharedImage = VK_FALSE;
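+         /* Advertise sharedImage only when gralloc reports a usage flag for
+          * front-buffer rendering.
+          */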
+ uint64_t front_rendering_usage = 0;
+ struct u_gralloc *gralloc = u_gralloc_create(U_GRALLOC_TYPE_AUTO);
+ if (gralloc != NULL) {
+ u_gralloc_get_front_rendering_usage(gralloc, &front_rendering_usage);
+ u_gralloc_destroy(&gralloc);
+ }
+ props->sharedImage = front_rendering_usage ? VK_TRUE : VK_FALSE;
break;
}
#pragma GCC diagnostic pop
-
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROTECTED_MEMORY_PROPERTIES: {
- VkPhysicalDeviceProtectedMemoryProperties *properties =
- (VkPhysicalDeviceProtectedMemoryProperties *)ext;
- CORE_PROPERTY(1, 1, protectedNoFault);
- break;
- }
-
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROVOKING_VERTEX_PROPERTIES_EXT: {
- VkPhysicalDeviceProvokingVertexPropertiesEXT *properties =
- (VkPhysicalDeviceProvokingVertexPropertiesEXT *)ext;
- properties->provokingVertexModePerPipeline = true;
- properties->transformFeedbackPreservesTriangleFanProvokingVertex = false;
- break;
- }
-
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PUSH_DESCRIPTOR_PROPERTIES_KHR: {
- VkPhysicalDevicePushDescriptorPropertiesKHR *properties =
- (VkPhysicalDevicePushDescriptorPropertiesKHR *) ext;
- properties->maxPushDescriptors = MAX_PUSH_DESCRIPTORS;
- break;
- }
-
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_ROBUSTNESS_2_PROPERTIES_EXT: {
- VkPhysicalDeviceRobustness2PropertiesEXT *properties = (void *)ext;
- properties->robustStorageBufferAccessSizeAlignment =
- ANV_SSBO_BOUNDS_CHECK_ALIGNMENT;
- properties->robustUniformBufferAccessSizeAlignment =
- ANV_UBO_ALIGNMENT;
- break;
- }
-
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SAMPLER_FILTER_MINMAX_PROPERTIES_EXT: {
- VkPhysicalDeviceSamplerFilterMinmaxPropertiesEXT *properties =
- (VkPhysicalDeviceSamplerFilterMinmaxPropertiesEXT *)ext;
- CORE_PROPERTY(1, 2, filterMinmaxImageComponentMapping);
- CORE_PROPERTY(1, 2, filterMinmaxSingleComponentFormats);
- break;
- }
-
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_INTEGER_DOT_PRODUCT_PROPERTIES_KHR: {
- VkPhysicalDeviceShaderIntegerDotProductPropertiesKHR *props =
- (VkPhysicalDeviceShaderIntegerDotProductPropertiesKHR *)ext;
-
- props->integerDotProduct8BitUnsignedAccelerated = false;
- props->integerDotProduct8BitSignedAccelerated = false;
- props->integerDotProduct8BitMixedSignednessAccelerated = false;
- props->integerDotProduct4x8BitPackedUnsignedAccelerated = pdevice->info.ver >= 12;
- props->integerDotProduct4x8BitPackedSignedAccelerated = pdevice->info.ver >= 12;
- props->integerDotProduct4x8BitPackedMixedSignednessAccelerated = pdevice->info.ver >= 12;
- props->integerDotProduct16BitUnsignedAccelerated = false;
- props->integerDotProduct16BitSignedAccelerated = false;
- props->integerDotProduct16BitMixedSignednessAccelerated = false;
- props->integerDotProduct32BitUnsignedAccelerated = false;
- props->integerDotProduct32BitSignedAccelerated = false;
- props->integerDotProduct32BitMixedSignednessAccelerated = false;
- props->integerDotProduct64BitUnsignedAccelerated = false;
- props->integerDotProduct64BitSignedAccelerated = false;
- props->integerDotProduct64BitMixedSignednessAccelerated = false;
- props->integerDotProductAccumulatingSaturating8BitUnsignedAccelerated = false;
- props->integerDotProductAccumulatingSaturating8BitSignedAccelerated = false;
- props->integerDotProductAccumulatingSaturating8BitMixedSignednessAccelerated = false;
- props->integerDotProductAccumulatingSaturating4x8BitPackedUnsignedAccelerated = pdevice->info.ver >= 12;
- props->integerDotProductAccumulatingSaturating4x8BitPackedSignedAccelerated = pdevice->info.ver >= 12;
- props->integerDotProductAccumulatingSaturating4x8BitPackedMixedSignednessAccelerated = pdevice->info.ver >= 12;
- props->integerDotProductAccumulatingSaturating16BitUnsignedAccelerated = false;
- props->integerDotProductAccumulatingSaturating16BitSignedAccelerated = false;
- props->integerDotProductAccumulatingSaturating16BitMixedSignednessAccelerated = false;
- props->integerDotProductAccumulatingSaturating32BitUnsignedAccelerated = false;
- props->integerDotProductAccumulatingSaturating32BitSignedAccelerated = false;
- props->integerDotProductAccumulatingSaturating32BitMixedSignednessAccelerated = false;
- props->integerDotProductAccumulatingSaturating64BitUnsignedAccelerated = false;
- props->integerDotProductAccumulatingSaturating64BitSignedAccelerated = false;
- props->integerDotProductAccumulatingSaturating64BitMixedSignednessAccelerated = false;
-
- break;
- }
-
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SUBGROUP_PROPERTIES: {
- VkPhysicalDeviceSubgroupProperties *properties = (void *)ext;
- CORE_PROPERTY(1, 1, subgroupSize);
- CORE_RENAMED_PROPERTY(1, 1, supportedStages,
- subgroupSupportedStages);
- CORE_RENAMED_PROPERTY(1, 1, supportedOperations,
- subgroupSupportedOperations);
- CORE_RENAMED_PROPERTY(1, 1, quadOperationsInAllStages,
- subgroupQuadOperationsInAllStages);
- break;
- }
-
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SUBGROUP_SIZE_CONTROL_PROPERTIES_EXT: {
- VkPhysicalDeviceSubgroupSizeControlPropertiesEXT *props =
- (VkPhysicalDeviceSubgroupSizeControlPropertiesEXT *)ext;
- STATIC_ASSERT(8 <= BRW_SUBGROUP_SIZE && BRW_SUBGROUP_SIZE <= 32);
- props->minSubgroupSize = 8;
- props->maxSubgroupSize = 32;
- props->maxComputeWorkgroupSubgroups = pdevice->info.max_cs_workgroup_threads;
- props->requiredSubgroupSizeStages = VK_SHADER_STAGE_COMPUTE_BIT;
- break;
- }
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FLOAT_CONTROLS_PROPERTIES_KHR : {
- VkPhysicalDeviceFloatControlsPropertiesKHR *properties = (void *)ext;
- CORE_PROPERTY(1, 2, denormBehaviorIndependence);
- CORE_PROPERTY(1, 2, roundingModeIndependence);
- CORE_PROPERTY(1, 2, shaderDenormFlushToZeroFloat16);
- CORE_PROPERTY(1, 2, shaderDenormPreserveFloat16);
- CORE_PROPERTY(1, 2, shaderRoundingModeRTEFloat16);
- CORE_PROPERTY(1, 2, shaderRoundingModeRTZFloat16);
- CORE_PROPERTY(1, 2, shaderSignedZeroInfNanPreserveFloat16);
- CORE_PROPERTY(1, 2, shaderDenormFlushToZeroFloat32);
- CORE_PROPERTY(1, 2, shaderDenormPreserveFloat32);
- CORE_PROPERTY(1, 2, shaderRoundingModeRTEFloat32);
- CORE_PROPERTY(1, 2, shaderRoundingModeRTZFloat32);
- CORE_PROPERTY(1, 2, shaderSignedZeroInfNanPreserveFloat32);
- CORE_PROPERTY(1, 2, shaderDenormFlushToZeroFloat64);
- CORE_PROPERTY(1, 2, shaderDenormPreserveFloat64);
- CORE_PROPERTY(1, 2, shaderRoundingModeRTEFloat64);
- CORE_PROPERTY(1, 2, shaderRoundingModeRTZFloat64);
- CORE_PROPERTY(1, 2, shaderSignedZeroInfNanPreserveFloat64);
- break;
- }
-
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SAMPLE_LOCATIONS_PROPERTIES_EXT: {
- VkPhysicalDeviceSampleLocationsPropertiesEXT *props =
- (VkPhysicalDeviceSampleLocationsPropertiesEXT *)ext;
-
- props->sampleLocationSampleCounts =
- isl_device_get_sample_counts(&pdevice->isl_dev);
-
- /* See also anv_GetPhysicalDeviceMultisamplePropertiesEXT */
- props->maxSampleLocationGridSize.width = 1;
- props->maxSampleLocationGridSize.height = 1;
-
- props->sampleLocationCoordinateRange[0] = 0;
- props->sampleLocationCoordinateRange[1] = 0.9375;
- props->sampleLocationSubPixelBits = 4;
-
- props->variableSampleLocations = true;
- break;
- }
-
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_TEXEL_BUFFER_ALIGNMENT_PROPERTIES_EXT: {
- VkPhysicalDeviceTexelBufferAlignmentPropertiesEXT *props =
- (VkPhysicalDeviceTexelBufferAlignmentPropertiesEXT *)ext;
-
- /* From the SKL PRM Vol. 2d, docs for RENDER_SURFACE_STATE::Surface
- * Base Address:
- *
- * "For SURFTYPE_BUFFER non-rendertarget surfaces, this field
- * specifies the base address of the first element of the surface,
- * computed in software by adding the surface base address to the
- * byte offset of the element in the buffer. The base address must
- * be aligned to element size."
- *
- * The typed dataport messages require that things be texel aligned.
- * Otherwise, we may just load/store the wrong data or, in the worst
- * case, there may be hangs.
- */
- props->storageTexelBufferOffsetAlignmentBytes = 16;
- props->storageTexelBufferOffsetSingleTexelAlignment = true;
-
- /* The sampler, however, is much more forgiving and it can handle
- * arbitrary byte alignment for linear and buffer surfaces. It's
- * hard to find a good PRM citation for this but years of empirical
- * experience demonstrate that this is true.
- */
- props->uniformTexelBufferOffsetAlignmentBytes = 1;
- props->uniformTexelBufferOffsetSingleTexelAlignment = false;
- break;
- }
-
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_TIMELINE_SEMAPHORE_PROPERTIES_KHR: {
- VkPhysicalDeviceTimelineSemaphorePropertiesKHR *properties =
- (VkPhysicalDeviceTimelineSemaphorePropertiesKHR *) ext;
- CORE_PROPERTY(1, 2, maxTimelineSemaphoreValueDifference);
- break;
- }
-
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_TRANSFORM_FEEDBACK_PROPERTIES_EXT: {
- VkPhysicalDeviceTransformFeedbackPropertiesEXT *props =
- (VkPhysicalDeviceTransformFeedbackPropertiesEXT *)ext;
-
- props->maxTransformFeedbackStreams = MAX_XFB_STREAMS;
- props->maxTransformFeedbackBuffers = MAX_XFB_BUFFERS;
- props->maxTransformFeedbackBufferSize = (1ull << 32);
- props->maxTransformFeedbackStreamDataSize = 128 * 4;
- props->maxTransformFeedbackBufferDataSize = 128 * 4;
- props->maxTransformFeedbackBufferDataStride = 2048;
- props->transformFeedbackQueries = true;
- props->transformFeedbackStreamsLinesTriangles = false;
- props->transformFeedbackRasterizationStreamSelect = false;
- /* This requires MI_MATH */
- props->transformFeedbackDraw = pdevice->info.verx10 >= 75;
- break;
- }
-
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VERTEX_ATTRIBUTE_DIVISOR_PROPERTIES_EXT: {
- VkPhysicalDeviceVertexAttributeDivisorPropertiesEXT *props =
- (VkPhysicalDeviceVertexAttributeDivisorPropertiesEXT *)ext;
- /* We have to restrict this a bit for multiview */
- props->maxVertexAttribDivisor = UINT32_MAX / 16;
- break;
- }
-
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MULTI_DRAW_PROPERTIES_EXT: {
- VkPhysicalDeviceMultiDrawPropertiesEXT *props = (VkPhysicalDeviceMultiDrawPropertiesEXT *)ext;
- props->maxMultiDrawCount = 2048;
- break;
- }
-
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_1_PROPERTIES:
- anv_get_physical_device_properties_1_1(pdevice, (void *)ext);
- break;
-
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_2_PROPERTIES:
- anv_get_physical_device_properties_1_2(pdevice, (void *)ext);
- break;
+#endif
default:
- anv_debug_ignored_stype(ext->sType);
break;
}
}
-
-#undef CORE_RENAMED_PROPERTY
-#undef CORE_PROPERTY
}
static const VkQueueFamilyProperties
-anv_queue_family_properties_template = {
- .timestampValidBits = 36, /* XXX: Real value here */
- .minImageTransferGranularity = { 1, 1, 1 },
-};
+get_anv_queue_family_properties_template(const struct anv_physical_device *device)
+{
-void anv_GetPhysicalDeviceQueueFamilyProperties(
- VkPhysicalDevice physicalDevice,
- uint32_t* pCount,
- VkQueueFamilyProperties* pQueueFamilyProperties)
+ /*
+ * For Xe2+:
+    * Bspec 60411: Timestamp register can hold a 64-bit value.
+    *
+    * Platforms < Xe2:
+    * Bspec 46111: Timestamp register can hold only a 36-bit
+    * value.
+ */
+ const VkQueueFamilyProperties anv_queue_family_properties_template =
+ {
+ .timestampValidBits = device->info.ver >= 20 ? 64 : 36,
+ .minImageTransferGranularity = { 1, 1, 1 },
+ };
+
+ return anv_queue_family_properties_template;
+}
+
+static VkQueueFamilyProperties
+anv_device_physical_get_queue_properties(const struct anv_physical_device *device,
+ uint32_t family_index)
{
- ANV_FROM_HANDLE(anv_physical_device, pdevice, physicalDevice);
- VK_OUTARRAY_MAKE(out, pQueueFamilyProperties, pCount);
+ const struct anv_queue_family *family = &device->queue.families[family_index];
+ VkQueueFamilyProperties properties =
+ get_anv_queue_family_properties_template(device);
- for (uint32_t i = 0; i < pdevice->queue.family_count; i++) {
- struct anv_queue_family *queue_family = &pdevice->queue.families[i];
- vk_outarray_append(&out, p) {
- *p = anv_queue_family_properties_template;
- p->queueFlags = queue_family->queueFlags;
- p->queueCount = queue_family->queueCount;
- }
- }
+ properties.queueFlags = family->queueFlags;
+ properties.queueCount = family->queueCount;
+ return properties;
}
void anv_GetPhysicalDeviceQueueFamilyProperties2(
@@ -2802,17 +2787,57 @@ void anv_GetPhysicalDeviceQueueFamilyProperties2(
VkQueueFamilyProperties2* pQueueFamilyProperties)
{
ANV_FROM_HANDLE(anv_physical_device, pdevice, physicalDevice);
- VK_OUTARRAY_MAKE(out, pQueueFamilyProperties, pQueueFamilyPropertyCount);
+ VK_OUTARRAY_MAKE_TYPED(VkQueueFamilyProperties2, out,
+ pQueueFamilyProperties, pQueueFamilyPropertyCount);
for (uint32_t i = 0; i < pdevice->queue.family_count; i++) {
struct anv_queue_family *queue_family = &pdevice->queue.families[i];
- vk_outarray_append(&out, p) {
- p->queueFamilyProperties = anv_queue_family_properties_template;
- p->queueFamilyProperties.queueFlags = queue_family->queueFlags;
- p->queueFamilyProperties.queueCount = queue_family->queueCount;
-
- vk_foreach_struct(s, p->pNext) {
- anv_debug_ignored_stype(s->sType);
+ vk_outarray_append_typed(VkQueueFamilyProperties2, &out, p) {
+ p->queueFamilyProperties =
+ anv_device_physical_get_queue_properties(pdevice, i);
+
+ vk_foreach_struct(ext, p->pNext) {
+ switch (ext->sType) {
+ case VK_STRUCTURE_TYPE_QUEUE_FAMILY_GLOBAL_PRIORITY_PROPERTIES_KHR: {
+ VkQueueFamilyGlobalPriorityPropertiesKHR *properties =
+ (VkQueueFamilyGlobalPriorityPropertiesKHR *)ext;
+
+ /* Deliberately sorted low to high */
+ VkQueueGlobalPriorityKHR all_priorities[] = {
+ VK_QUEUE_GLOBAL_PRIORITY_LOW_KHR,
+ VK_QUEUE_GLOBAL_PRIORITY_MEDIUM_KHR,
+ VK_QUEUE_GLOBAL_PRIORITY_HIGH_KHR,
+ VK_QUEUE_GLOBAL_PRIORITY_REALTIME_KHR,
+ };
+
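+               /* Advertise every priority up to the maximum context priority
+                * the kernel grants us; the sorted order lets the loop stop at
+                * the first unsupported level.
+                */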
+ uint32_t count = 0;
+ for (unsigned i = 0; i < ARRAY_SIZE(all_priorities); i++) {
+ if (all_priorities[i] > pdevice->max_context_priority)
+ break;
+
+ properties->priorities[count++] = all_priorities[i];
+ }
+ properties->priorityCount = count;
+ break;
+ }
+ case VK_STRUCTURE_TYPE_QUEUE_FAMILY_QUERY_RESULT_STATUS_PROPERTIES_KHR: {
+ VkQueueFamilyQueryResultStatusPropertiesKHR *prop =
+ (VkQueueFamilyQueryResultStatusPropertiesKHR *)ext;
+ prop->queryResultStatusSupport = VK_TRUE;
+ break;
+ }
+ case VK_STRUCTURE_TYPE_QUEUE_FAMILY_VIDEO_PROPERTIES_KHR: {
+ VkQueueFamilyVideoPropertiesKHR *prop =
+ (VkQueueFamilyVideoPropertiesKHR *)ext;
+ if (queue_family->queueFlags & VK_QUEUE_VIDEO_DECODE_BIT_KHR) {
+ prop->videoCodecOperations = VK_VIDEO_CODEC_OPERATION_DECODE_H264_BIT_KHR |
+ VK_VIDEO_CODEC_OPERATION_DECODE_H265_BIT_KHR;
+ }
+ break;
+ }
+ default:
+ anv_debug_ignored_stype(ext->sType);
+ }
}
}
}
@@ -2847,6 +2872,9 @@ anv_get_memory_budget(VkPhysicalDevice physicalDevice,
{
ANV_FROM_HANDLE(anv_physical_device, device, physicalDevice);
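+   /* Skip the budget calculation entirely when EXT_memory_budget is not
+    * supported.
+    */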
+ if (!device->vk.supported_extensions.EXT_memory_budget)
+ return;
+
anv_update_meminfo(device, device->local_fd);
VkDeviceSize total_sys_heaps_size = 0, total_vram_heaps_size = 0;
@@ -2866,10 +2894,14 @@ anv_get_memory_budget(VkPhysicalDevice physicalDevice,
if (device->memory.heaps[i].is_local_mem) {
total_heaps_size = total_vram_heaps_size;
- mem_available = device->vram.available;
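+         /* When a non-mappable VRAM region exists, heap 0 reports its
+          * availability; otherwise report the mappable region.
+          */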
+ if (device->vram_non_mappable.size > 0 && i == 0) {
+ mem_available = device->vram_non_mappable.available;
+ } else {
+ mem_available = device->vram_mappable.available;
+ }
} else {
total_heaps_size = total_sys_heaps_size;
- mem_available = device->sys.available;
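+         /* Never report more available memory than the system heaps
+          * advertise in total.
+          */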
+ mem_available = MIN2(device->sys.available, total_heaps_size);
}
double heap_proportion = (double) heap_size / total_heaps_size;
@@ -2926,21 +2958,6 @@ void anv_GetPhysicalDeviceMemoryProperties2(
}
}
-void
-anv_GetDeviceGroupPeerMemoryFeatures(
- VkDevice device,
- uint32_t heapIndex,
- uint32_t localDeviceIndex,
- uint32_t remoteDeviceIndex,
- VkPeerMemoryFeatureFlags* pPeerMemoryFeatures)
-{
- assert(localDeviceIndex == 0 && remoteDeviceIndex == 0);
- *pPeerMemoryFeatures = VK_PEER_MEMORY_FEATURE_COPY_SRC_BIT |
- VK_PEER_MEMORY_FEATURE_COPY_DST_BIT |
- VK_PEER_MEMORY_FEATURE_GENERIC_SRC_BIT |
- VK_PEER_MEMORY_FEATURE_GENERIC_DST_BIT;
-}
-
PFN_vkVoidFunction anv_GetInstanceProcAddr(
VkInstance _instance,
const char* pName)
@@ -2957,71 +2974,29 @@ PFN_vkVoidFunction anv_GetInstanceProcAddr(
PUBLIC
VKAPI_ATTR PFN_vkVoidFunction VKAPI_CALL vk_icdGetInstanceProcAddr(
VkInstance instance,
- const char* pName);
-
-PUBLIC
-VKAPI_ATTR PFN_vkVoidFunction VKAPI_CALL vk_icdGetInstanceProcAddr(
- VkInstance instance,
const char* pName)
{
return anv_GetInstanceProcAddr(instance, pName);
}
-/* With version 4+ of the loader interface the ICD should expose
- * vk_icdGetPhysicalDeviceProcAddr()
- */
-PUBLIC
-VKAPI_ATTR PFN_vkVoidFunction VKAPI_CALL vk_icdGetPhysicalDeviceProcAddr(
- VkInstance _instance,
- const char* pName);
-
-PFN_vkVoidFunction vk_icdGetPhysicalDeviceProcAddr(
- VkInstance _instance,
- const char* pName)
-{
- ANV_FROM_HANDLE(anv_instance, instance, _instance);
- return vk_instance_get_physical_device_proc_addr(&instance->vk, pName);
-}
-
-static struct anv_state
-anv_state_pool_emit_data(struct anv_state_pool *pool, size_t size, size_t align, const void *p)
-{
- struct anv_state state;
-
- state = anv_state_pool_alloc(pool, size, align);
- memcpy(state.map, p, size);
-
- return state;
-}
-
static void
anv_device_init_border_colors(struct anv_device *device)
{
- if (device->info.is_haswell) {
- static const struct hsw_border_color border_colors[] = {
- [VK_BORDER_COLOR_FLOAT_TRANSPARENT_BLACK] = { .float32 = { 0.0, 0.0, 0.0, 0.0 } },
- [VK_BORDER_COLOR_FLOAT_OPAQUE_BLACK] = { .float32 = { 0.0, 0.0, 0.0, 1.0 } },
- [VK_BORDER_COLOR_FLOAT_OPAQUE_WHITE] = { .float32 = { 1.0, 1.0, 1.0, 1.0 } },
- [VK_BORDER_COLOR_INT_TRANSPARENT_BLACK] = { .uint32 = { 0, 0, 0, 0 } },
- [VK_BORDER_COLOR_INT_OPAQUE_BLACK] = { .uint32 = { 0, 0, 0, 1 } },
- [VK_BORDER_COLOR_INT_OPAQUE_WHITE] = { .uint32 = { 1, 1, 1, 1 } },
- };
-
- device->border_colors =
- anv_state_pool_emit_data(&device->dynamic_state_pool,
- sizeof(border_colors), 512, border_colors);
- } else {
- static const struct gfx8_border_color border_colors[] = {
- [VK_BORDER_COLOR_FLOAT_TRANSPARENT_BLACK] = { .float32 = { 0.0, 0.0, 0.0, 0.0 } },
- [VK_BORDER_COLOR_FLOAT_OPAQUE_BLACK] = { .float32 = { 0.0, 0.0, 0.0, 1.0 } },
- [VK_BORDER_COLOR_FLOAT_OPAQUE_WHITE] = { .float32 = { 1.0, 1.0, 1.0, 1.0 } },
- [VK_BORDER_COLOR_INT_TRANSPARENT_BLACK] = { .uint32 = { 0, 0, 0, 0 } },
- [VK_BORDER_COLOR_INT_OPAQUE_BLACK] = { .uint32 = { 0, 0, 0, 1 } },
- [VK_BORDER_COLOR_INT_OPAQUE_WHITE] = { .uint32 = { 1, 1, 1, 1 } },
- };
+ static const struct gfx8_border_color border_colors[] = {
+ [VK_BORDER_COLOR_FLOAT_TRANSPARENT_BLACK] = { .float32 = { 0.0, 0.0, 0.0, 0.0 } },
+ [VK_BORDER_COLOR_FLOAT_OPAQUE_BLACK] = { .float32 = { 0.0, 0.0, 0.0, 1.0 } },
+ [VK_BORDER_COLOR_FLOAT_OPAQUE_WHITE] = { .float32 = { 1.0, 1.0, 1.0, 1.0 } },
+ [VK_BORDER_COLOR_INT_TRANSPARENT_BLACK] = { .uint32 = { 0, 0, 0, 0 } },
+ [VK_BORDER_COLOR_INT_OPAQUE_BLACK] = { .uint32 = { 0, 0, 0, 1 } },
+ [VK_BORDER_COLOR_INT_OPAQUE_WHITE] = { .uint32 = { 1, 1, 1, 1 } },
+ };
- device->border_colors =
- anv_state_pool_emit_data(&device->dynamic_state_pool,
+ device->border_colors =
+ anv_state_pool_emit_data(&device->dynamic_state_pool,
+ sizeof(border_colors), 64, border_colors);
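+   /* With EXT_descriptor_buffer enabled, emit a second copy of the border
+    * colors into the descriptor-buffer dynamic state pool.
+    */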
+ if (device->vk.enabled_extensions.EXT_descriptor_buffer) {
+ device->border_colors_db =
+ anv_state_pool_emit_data(&device->dynamic_state_db_pool,
sizeof(border_colors), 64, border_colors);
}
}
@@ -3030,7 +3005,9 @@ static VkResult
anv_device_init_trivial_batch(struct anv_device *device)
{
VkResult result = anv_device_alloc_bo(device, "trivial-batch", 4096,
- ANV_BO_ALLOC_MAPPED,
+ ANV_BO_ALLOC_MAPPED |
+ ANV_BO_ALLOC_HOST_COHERENT |
+ ANV_BO_ALLOC_INTERNAL,
0 /* explicit_address */,
&device->trivial_batch_bo);
if (result != VK_SUCCESS)
@@ -3045,29 +3022,9 @@ anv_device_init_trivial_batch(struct anv_device *device)
anv_batch_emit(&batch, GFX7_MI_BATCH_BUFFER_END, bbe);
anv_batch_emit(&batch, GFX7_MI_NOOP, noop);
- if (!device->info.has_llc)
- intel_clflush_range(batch.start, batch.next - batch.start);
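+   /* No explicit flush is needed here: the BO is allocated host-coherent. */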
-
return VK_SUCCESS;
}
-static int
-vk_priority_to_gen(int priority)
-{
- switch (priority) {
- case VK_QUEUE_GLOBAL_PRIORITY_LOW_EXT:
- return INTEL_CONTEXT_LOW_PRIORITY;
- case VK_QUEUE_GLOBAL_PRIORITY_MEDIUM_EXT:
- return INTEL_CONTEXT_MEDIUM_PRIORITY;
- case VK_QUEUE_GLOBAL_PRIORITY_HIGH_EXT:
- return INTEL_CONTEXT_HIGH_PRIORITY;
- case VK_QUEUE_GLOBAL_PRIORITY_REALTIME_EXT:
- return INTEL_CONTEXT_REALTIME_PRIORITY;
- default:
- unreachable("Invalid priority");
- }
-}
-
static bool
get_bo_from_pool(struct intel_batch_decode_bo *ret,
struct anv_block_pool *pool,
@@ -3098,29 +3055,62 @@ decode_get_bo(void *v_batch, bool ppgtt, uint64_t address)
if (get_bo_from_pool(&ret_bo, &device->dynamic_state_pool.block_pool, address))
return ret_bo;
+ if (device->vk.enabled_extensions.EXT_descriptor_buffer &&
+ get_bo_from_pool(&ret_bo, &device->dynamic_state_db_pool.block_pool, address))
+ return ret_bo;
if (get_bo_from_pool(&ret_bo, &device->instruction_state_pool.block_pool, address))
return ret_bo;
if (get_bo_from_pool(&ret_bo, &device->binding_table_pool.block_pool, address))
return ret_bo;
- if (get_bo_from_pool(&ret_bo, &device->surface_state_pool.block_pool, address))
+ if (get_bo_from_pool(&ret_bo, &device->scratch_surface_state_pool.block_pool, address))
+ return ret_bo;
+ if (device->physical->indirect_descriptors &&
+ get_bo_from_pool(&ret_bo, &device->bindless_surface_state_pool.block_pool, address))
+ return ret_bo;
+ if (get_bo_from_pool(&ret_bo, &device->internal_surface_state_pool.block_pool, address))
+ return ret_bo;
+ if (device->physical->indirect_descriptors &&
+ get_bo_from_pool(&ret_bo, &device->indirect_push_descriptor_pool.block_pool, address))
+ return ret_bo;
+ if (device->info->has_aux_map &&
+ get_bo_from_pool(&ret_bo, &device->aux_tt_pool.block_pool, address))
return ret_bo;
if (!device->cmd_buffer_being_decoded)
return (struct intel_batch_decode_bo) { };
- struct anv_batch_bo **bo;
-
- u_vector_foreach(bo, &device->cmd_buffer_being_decoded->seen_bbos) {
+ struct anv_batch_bo **bbo;
+ u_vector_foreach(bbo, &device->cmd_buffer_being_decoded->seen_bbos) {
/* The decoder zeroes out the top 16 bits, so we need to as well */
- uint64_t bo_address = (*bo)->bo->offset & (~0ull >> 16);
+ uint64_t bo_address = (*bbo)->bo->offset & (~0ull >> 16);
- if (address >= bo_address && address < bo_address + (*bo)->bo->size) {
+ if (address >= bo_address && address < bo_address + (*bbo)->bo->size) {
return (struct intel_batch_decode_bo) {
.addr = bo_address,
- .size = (*bo)->bo->size,
- .map = (*bo)->bo->map,
+ .size = (*bbo)->bo->size,
+ .map = (*bbo)->bo->map,
};
}
+
+ uint32_t dep_words = (*bbo)->relocs.dep_words;
+ BITSET_WORD *deps = (*bbo)->relocs.deps;
+ for (uint32_t w = 0; w < dep_words; w++) {
+ BITSET_WORD mask = deps[w];
+ while (mask) {
+ int i = u_bit_scan(&mask);
+ uint32_t gem_handle = w * BITSET_WORDBITS + i;
+ struct anv_bo *bo = anv_device_lookup_bo(device, gem_handle);
+ assert(bo->refcount > 0);
+ bo_address = bo->offset & (~0ull >> 16);
+ if (address >= bo_address && address < bo_address + bo->size) {
+ return (struct intel_batch_decode_bo) {
+ .addr = bo_address,
+ .size = bo->size,
+ .map = bo->map,
+ };
+ }
+ }
+ }
}
return (struct intel_batch_decode_bo) { };
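decode_get_bo() compares the decoder's addresses against BO offsets with the upper 16 bits masked off, since the GPU works with 48-bit virtual addresses. A small self-contained sketch of that containment test (the function name is illustrative):

#include <stdbool.h>
#include <stdint.h>

/* Sketch of the range check used above: both the queried address and the
 * BO offset are reduced to their low 48 bits before comparing.
 */
static bool
example_bo_contains_address(uint64_t bo_offset, uint64_t bo_size,
                            uint64_t address)
{
   const uint64_t mask_48b = ~0ull >> 16;
   const uint64_t bo_address = bo_offset & mask_48b;
   address &= mask_48b;
   return address >= bo_address && address < bo_address + bo_size;
}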
@@ -3139,10 +3129,8 @@ intel_aux_map_buffer_alloc(void *driver_ctx, uint32_t size)
return NULL;
struct anv_device *device = (struct anv_device*)driver_ctx;
- assert(device->physical->supports_48bit_addresses &&
- device->physical->use_softpin);
- struct anv_state_pool *pool = &device->dynamic_state_pool;
+ struct anv_state_pool *pool = &device->aux_tt_pool;
buf->state = anv_state_pool_alloc(pool, size, size);
buf->base.gpu = pool->block_pool.bo->offset + buf->state.offset;
@@ -3157,7 +3145,7 @@ intel_aux_map_buffer_free(void *driver_ctx, struct intel_buffer *buffer)
{
struct intel_aux_map_buffer *buf = (struct intel_aux_map_buffer*)buffer;
struct anv_device *device = (struct anv_device*)driver_ctx;
- struct anv_state_pool *pool = &device->dynamic_state_pool;
+ struct anv_state_pool *pool = &device->aux_tt_pool;
anv_state_pool_free(pool, buf->state);
free(buf);
}
@@ -3168,22 +3156,93 @@ static struct intel_mapped_pinned_buffer_alloc aux_map_allocator = {
};
static VkResult
-check_physical_device_features(VkPhysicalDevice physicalDevice,
- const VkPhysicalDeviceFeatures *features)
+anv_device_setup_context_or_vm(struct anv_device *device,
+ const VkDeviceCreateInfo *pCreateInfo,
+ const uint32_t num_queues)
{
- VkPhysicalDeviceFeatures supported_features;
- anv_GetPhysicalDeviceFeatures(physicalDevice, &supported_features);
- VkBool32 *supported_feature = (VkBool32 *)&supported_features;
- VkBool32 *enabled_feature = (VkBool32 *)features;
- unsigned num_features = sizeof(VkPhysicalDeviceFeatures) / sizeof(VkBool32);
- for (uint32_t i = 0; i < num_features; i++) {
- if (enabled_feature[i] && !supported_feature[i])
- return vk_error(VK_ERROR_FEATURE_NOT_PRESENT);
+ switch (device->info->kmd_type) {
+ case INTEL_KMD_TYPE_I915:
+ return anv_i915_device_setup_context(device, pCreateInfo, num_queues);
+ case INTEL_KMD_TYPE_XE:
+ return anv_xe_device_setup_vm(device);
+ default:
+ unreachable("Missing");
+ return VK_ERROR_UNKNOWN;
}
+}
+
+static bool
+anv_device_destroy_context_or_vm(struct anv_device *device)
+{
+ switch (device->info->kmd_type) {
+ case INTEL_KMD_TYPE_I915:
+ if (device->physical->has_vm_control)
+ return anv_i915_device_destroy_vm(device);
+ else
+ return intel_gem_destroy_context(device->fd, device->context_id);
+ case INTEL_KMD_TYPE_XE:
+ return anv_xe_device_destroy_vm(device);
+ default:
+ unreachable("Missing");
+ return false;
+ }
+}
+
+static VkResult
+anv_device_init_trtt(struct anv_device *device)
+{
+ struct anv_trtt *trtt = &device->trtt;
+
+ if (pthread_mutex_init(&trtt->mutex, NULL) != 0)
+ return vk_error(device, VK_ERROR_INITIALIZATION_FAILED);
+
+ list_inithead(&trtt->in_flight_batches);
return VK_SUCCESS;
}
+static void
+anv_device_finish_trtt(struct anv_device *device)
+{
+ struct anv_trtt *trtt = &device->trtt;
+
+ if (trtt->timeline_val > 0) {
+ struct drm_syncobj_timeline_wait wait = {
+ .handles = (uintptr_t)&trtt->timeline_handle,
+ .points = (uintptr_t)&trtt->timeline_val,
+ .timeout_nsec = INT64_MAX,
+ .count_handles = 1,
+ .flags = DRM_SYNCOBJ_WAIT_FLAGS_WAIT_ALL,
+ .first_signaled = false,
+ };
+ if (intel_ioctl(device->fd, DRM_IOCTL_SYNCOBJ_TIMELINE_WAIT, &wait))
+ fprintf(stderr, "TR-TT syncobj wait failed!\n");
+
+ list_for_each_entry_safe(struct anv_trtt_batch_bo, trtt_bbo,
+ &trtt->in_flight_batches, link)
+ anv_trtt_batch_bo_free(device, trtt_bbo);
+
+ }
+
+ if (trtt->timeline_handle > 0) {
+ struct drm_syncobj_destroy destroy = {
+ .handle = trtt->timeline_handle,
+ };
+ if (intel_ioctl(device->fd, DRM_IOCTL_SYNCOBJ_DESTROY, &destroy))
+ fprintf(stderr, "TR-TT syncobj destroy failed!\n");
+ }
+
+ pthread_mutex_destroy(&trtt->mutex);
+
+ vk_free(&device->vk.alloc, trtt->l3_mirror);
+ vk_free(&device->vk.alloc, trtt->l2_mirror);
+
+ for (int i = 0; i < trtt->num_page_table_bos; i++)
+ anv_device_release_bo(device, trtt->page_table_bos[i]);
+
+ vk_free(&device->vk.alloc, trtt->page_table_bos);
+}
+
VkResult anv_CreateDevice(
VkPhysicalDevice physicalDevice,
const VkDeviceCreateInfo* pCreateInfo,
@@ -3196,148 +3255,125 @@ VkResult anv_CreateDevice(
assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO);
- /* Check enabled features */
- bool robust_buffer_access = false;
- if (pCreateInfo->pEnabledFeatures) {
- result = check_physical_device_features(physicalDevice,
- pCreateInfo->pEnabledFeatures);
- if (result != VK_SUCCESS)
- return result;
-
- if (pCreateInfo->pEnabledFeatures->robustBufferAccess)
- robust_buffer_access = true;
- }
-
- vk_foreach_struct_const(ext, pCreateInfo->pNext) {
- switch (ext->sType) {
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2: {
- const VkPhysicalDeviceFeatures2 *features = (const void *)ext;
- result = check_physical_device_features(physicalDevice,
- &features->features);
- if (result != VK_SUCCESS)
- return result;
-
- if (features->features.robustBufferAccess)
- robust_buffer_access = true;
- break;
- }
-
- default:
- /* Don't warn */
- break;
- }
- }
-
/* Check requested queues and fail if we are requested to create any
* queues with flags we don't support.
*/
assert(pCreateInfo->queueCreateInfoCount > 0);
for (uint32_t i = 0; i < pCreateInfo->queueCreateInfoCount; i++) {
- if (pCreateInfo->pQueueCreateInfos[i].flags != 0)
- return vk_error(VK_ERROR_INITIALIZATION_FAILED);
+ if (pCreateInfo->pQueueCreateInfos[i].flags & ~VK_DEVICE_QUEUE_CREATE_PROTECTED_BIT)
+ return vk_error(physical_device, VK_ERROR_INITIALIZATION_FAILED);
}
- /* Check if client specified queue priority. */
- const VkDeviceQueueGlobalPriorityCreateInfoEXT *queue_priority =
- vk_find_struct_const(pCreateInfo->pQueueCreateInfos[0].pNext,
- DEVICE_QUEUE_GLOBAL_PRIORITY_CREATE_INFO_EXT);
-
- VkQueueGlobalPriorityEXT priority =
- queue_priority ? queue_priority->globalPriority :
- VK_QUEUE_GLOBAL_PRIORITY_MEDIUM_EXT;
-
- device = vk_alloc2(&physical_device->instance->vk.alloc, pAllocator,
+ device = vk_zalloc2(&physical_device->instance->vk.alloc, pAllocator,
sizeof(*device), 8,
VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
if (!device)
- return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+ return vk_error(physical_device, VK_ERROR_OUT_OF_HOST_MEMORY);
struct vk_device_dispatch_table dispatch_table;
+
+ bool override_initial_entrypoints = true;
+ if (physical_device->instance->vk.app_info.app_name &&
+ !strcmp(physical_device->instance->vk.app_info.app_name, "HITMAN3.exe")) {
+ vk_device_dispatch_table_from_entrypoints(&dispatch_table,
+ &anv_hitman3_device_entrypoints,
+ true);
+ override_initial_entrypoints = false;
+ }
+ if (physical_device->info.ver < 12 &&
+ physical_device->instance->vk.app_info.app_name &&
+ !strcmp(physical_device->instance->vk.app_info.app_name, "DOOM 64")) {
+ vk_device_dispatch_table_from_entrypoints(&dispatch_table,
+ &anv_doom64_device_entrypoints,
+ true);
+ override_initial_entrypoints = false;
+ }
+#if DETECT_OS_ANDROID
+ vk_device_dispatch_table_from_entrypoints(&dispatch_table,
+ &anv_android_device_entrypoints,
+ true);
+ override_initial_entrypoints = false;
+#endif
+ if (physical_device->instance->vk.trace_mode & VK_TRACE_MODE_RMV) {
+ vk_device_dispatch_table_from_entrypoints(&dispatch_table,
+ &anv_rmv_device_entrypoints,
+ true);
+ override_initial_entrypoints = false;
+ }
vk_device_dispatch_table_from_entrypoints(&dispatch_table,
- anv_genX(&physical_device->info, device_entrypoints), true);
+ anv_genX(&physical_device->info, device_entrypoints),
+ override_initial_entrypoints);
vk_device_dispatch_table_from_entrypoints(&dispatch_table,
&anv_device_entrypoints, false);
+ vk_device_dispatch_table_from_entrypoints(&dispatch_table,
+ &wsi_device_entrypoints, false);
+
result = vk_device_init(&device->vk, &physical_device->vk,
&dispatch_table, pCreateInfo, pAllocator);
- if (result != VK_SUCCESS) {
- vk_error(result);
+ if (result != VK_SUCCESS)
goto fail_alloc;
- }
- if (INTEL_DEBUG & DEBUG_BATCH) {
- const unsigned decode_flags =
- INTEL_BATCH_DECODE_FULL |
- ((INTEL_DEBUG & DEBUG_COLOR) ? INTEL_BATCH_DECODE_IN_COLOR : 0) |
- INTEL_BATCH_DECODE_OFFSETS |
- INTEL_BATCH_DECODE_FLOATS;
+ if (INTEL_DEBUG(DEBUG_BATCH | DEBUG_BATCH_STATS)) {
+ for (unsigned i = 0; i < physical_device->queue.family_count; i++) {
+ struct intel_batch_decode_ctx *decoder = &device->decoder[i];
+
+ const unsigned decode_flags = INTEL_BATCH_DECODE_DEFAULT_FLAGS;
+
+ intel_batch_decode_ctx_init_brw(decoder,
+ &physical_device->compiler->isa,
+ &physical_device->info,
+ stderr, decode_flags, NULL,
+ decode_get_bo, NULL, device);
+ intel_batch_stats_reset(decoder);
- intel_batch_decode_ctx_init(&device->decoder_ctx,
- &physical_device->info,
- stderr, decode_flags, NULL,
- decode_get_bo, NULL, device);
+ decoder->engine = physical_device->queue.families[i].engine_class;
+ decoder->dynamic_base = physical_device->va.dynamic_state_pool.addr;
+ decoder->surface_base = physical_device->va.internal_surface_state_pool.addr;
+ decoder->instruction_base = physical_device->va.instruction_state_pool.addr;
+ }
}
- device->physical = physical_device;
- device->_lost = false;
+ anv_device_set_physical(device, physical_device);
+ device->kmd_backend = anv_kmd_backend_get(device->info->kmd_type);
/* XXX(chadv): Can we dup() physicalDevice->fd here? */
device->fd = open(physical_device->path, O_RDWR | O_CLOEXEC);
if (device->fd == -1) {
- result = vk_error(VK_ERROR_INITIALIZATION_FAILED);
+ result = vk_error(device, VK_ERROR_INITIALIZATION_FAILED);
goto fail_device;
}
+ switch (device->info->kmd_type) {
+ case INTEL_KMD_TYPE_I915:
+ device->vk.check_status = anv_i915_device_check_status;
+ break;
+ case INTEL_KMD_TYPE_XE:
+ device->vk.check_status = anv_xe_device_check_status;
+ break;
+ default:
+ unreachable("Missing");
+ }
+
+ device->vk.command_buffer_ops = &anv_cmd_buffer_ops;
+   if (physical_device->info.kmd_type == INTEL_KMD_TYPE_I915)
+      device->vk.create_sync_for_memory = anv_create_sync_for_memory;

+ vk_device_set_drm_fd(&device->vk, device->fd);
+
uint32_t num_queues = 0;
for (uint32_t i = 0; i < pCreateInfo->queueCreateInfoCount; i++)
num_queues += pCreateInfo->pQueueCreateInfos[i].queueCount;
- if (device->physical->engine_info) {
- /* The kernel API supports at most 64 engines */
- assert(num_queues <= 64);
- uint16_t engine_classes[64];
- int engine_count = 0;
- for (uint32_t i = 0; i < pCreateInfo->queueCreateInfoCount; i++) {
- const VkDeviceQueueCreateInfo *queueCreateInfo =
- &pCreateInfo->pQueueCreateInfos[i];
-
- assert(queueCreateInfo->queueFamilyIndex <
- physical_device->queue.family_count);
- struct anv_queue_family *queue_family =
- &physical_device->queue.families[queueCreateInfo->queueFamilyIndex];
-
- for (uint32_t j = 0; j < queueCreateInfo->queueCount; j++)
- engine_classes[engine_count++] = queue_family->engine_class;
- }
- device->context_id =
- anv_gem_create_context_engines(device,
- physical_device->engine_info,
- engine_count, engine_classes);
- } else {
- assert(num_queues == 1);
- device->context_id = anv_gem_create_context(device);
- }
- if (device->context_id == -1) {
- result = vk_error(VK_ERROR_INITIALIZATION_FAILED);
+ result = anv_device_setup_context_or_vm(device, pCreateInfo, num_queues);
+ if (result != VK_SUCCESS)
goto fail_fd;
- }
-
- /* Here we tell the kernel not to attempt to recover our context but
- * immediately (on the next batchbuffer submission) report that the
- * context is lost, and we will do the recovery ourselves. In the case
- * of Vulkan, recovery means throwing VK_ERROR_DEVICE_LOST and letting
- * the client clean up the pieces.
- */
- anv_gem_set_context_param(device->fd, device->context_id,
- I915_CONTEXT_PARAM_RECOVERABLE, false);
-
- device->has_thread_submit = physical_device->has_thread_submit;
device->queues =
vk_zalloc(&device->vk.alloc, num_queues * sizeof(*device->queues), 8,
VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
if (device->queues == NULL) {
- result = vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+ result = vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
goto fail_context_id;
}
@@ -3347,15 +3383,8 @@ VkResult anv_CreateDevice(
&pCreateInfo->pQueueCreateInfos[i];
for (uint32_t j = 0; j < queueCreateInfo->queueCount; j++) {
- /* When using legacy contexts, we use I915_EXEC_RENDER but, with
- * engine-based contexts, the bottom 6 bits of exec_flags are used
- * for the engine ID.
- */
- uint32_t exec_flags = device->physical->engine_info ?
- device->queue_count : I915_EXEC_RENDER;
-
result = anv_queue_init(device, &device->queues[device->queue_count],
- exec_flags, queueCreateInfo);
+ queueCreateInfo, j);
if (result != VK_SUCCESS)
goto fail_queues;
@@ -3363,149 +3392,281 @@ VkResult anv_CreateDevice(
}
}
- if (physical_device->use_softpin) {
- if (pthread_mutex_init(&device->vma_mutex, NULL) != 0) {
- result = vk_error(VK_ERROR_INITIALIZATION_FAILED);
- goto fail_queues;
- }
+ if (pthread_mutex_init(&device->vma_mutex, NULL) != 0) {
+ result = vk_error(device, VK_ERROR_INITIALIZATION_FAILED);
+ goto fail_queues;
+ }
- /* keep the page with address zero out of the allocator */
- util_vma_heap_init(&device->vma_lo,
- LOW_HEAP_MIN_ADDRESS, LOW_HEAP_SIZE);
+ /* keep the page with address zero out of the allocator */
+ util_vma_heap_init(&device->vma_lo,
+ device->physical->va.low_heap.addr,
+ device->physical->va.low_heap.size);
- util_vma_heap_init(&device->vma_cva, CLIENT_VISIBLE_HEAP_MIN_ADDRESS,
- CLIENT_VISIBLE_HEAP_SIZE);
+ util_vma_heap_init(&device->vma_hi,
+ device->physical->va.high_heap.addr,
+ device->physical->va.high_heap.size);
- /* Leave the last 4GiB out of the high vma range, so that no state
- * base address + size can overflow 48 bits. For more information see
- * the comment about Wa32bitGeneralStateOffset in anv_allocator.c
- */
- util_vma_heap_init(&device->vma_hi, HIGH_HEAP_MIN_ADDRESS,
- physical_device->gtt_size - (1ull << 32) -
- HIGH_HEAP_MIN_ADDRESS);
+ if (device->physical->indirect_descriptors) {
+ util_vma_heap_init(&device->vma_desc,
+ device->physical->va.indirect_descriptor_pool.addr,
+ device->physical->va.indirect_descriptor_pool.size);
+ } else {
+ util_vma_heap_init(&device->vma_desc,
+ device->physical->va.bindless_surface_state_pool.addr,
+ device->physical->va.bindless_surface_state_pool.size);
}
- list_inithead(&device->memory_objects);
-
- /* As per spec, the driver implementation may deny requests to acquire
- * a priority above the default priority (MEDIUM) if the caller does not
- * have sufficient privileges. In this scenario VK_ERROR_NOT_PERMITTED_EXT
- * is returned.
+   /* Always initialized because the memory types point to this and they
+ * are on the physical device.
*/
- if (physical_device->has_context_priority) {
- int err = anv_gem_set_context_param(device->fd, device->context_id,
- I915_CONTEXT_PARAM_PRIORITY,
- vk_priority_to_gen(priority));
- if (err != 0 && priority > VK_QUEUE_GLOBAL_PRIORITY_MEDIUM_EXT) {
- result = vk_error(VK_ERROR_NOT_PERMITTED_EXT);
- goto fail_vmas;
- }
- }
-
- device->info = physical_device->info;
- device->isl_dev = physical_device->isl_dev;
+ util_vma_heap_init(&device->vma_desc_buf,
+ device->physical->va.descriptor_buffer_pool.addr,
+ device->physical->va.descriptor_buffer_pool.size);
- /* On Broadwell and later, we can use batch chaining to more efficiently
- * implement growing command buffers. Prior to Haswell, the kernel
- * command parser gets in the way and we have to fall back to growing
- * the batch.
- */
- device->can_chain_batches = device->info.ver >= 8;
+ util_vma_heap_init(&device->vma_samplers,
+ device->physical->va.sampler_state_pool.addr,
+ device->physical->va.sampler_state_pool.size);
+ util_vma_heap_init(&device->vma_trtt,
+ device->physical->va.trtt.addr,
+ device->physical->va.trtt.size);
- device->robust_buffer_access = robust_buffer_access;
+ list_inithead(&device->memory_objects);
+ list_inithead(&device->image_private_objects);
if (pthread_mutex_init(&device->mutex, NULL) != 0) {
- result = vk_error(VK_ERROR_INITIALIZATION_FAILED);
- goto fail_queues;
+ result = vk_error(device, VK_ERROR_INITIALIZATION_FAILED);
+ goto fail_vmas;
}
pthread_condattr_t condattr;
if (pthread_condattr_init(&condattr) != 0) {
- result = vk_error(VK_ERROR_INITIALIZATION_FAILED);
+ result = vk_error(device, VK_ERROR_INITIALIZATION_FAILED);
goto fail_mutex;
}
if (pthread_condattr_setclock(&condattr, CLOCK_MONOTONIC) != 0) {
pthread_condattr_destroy(&condattr);
- result = vk_error(VK_ERROR_INITIALIZATION_FAILED);
+ result = vk_error(device, VK_ERROR_INITIALIZATION_FAILED);
goto fail_mutex;
}
if (pthread_cond_init(&device->queue_submit, &condattr) != 0) {
pthread_condattr_destroy(&condattr);
- result = vk_error(VK_ERROR_INITIALIZATION_FAILED);
+ result = vk_error(device, VK_ERROR_INITIALIZATION_FAILED);
goto fail_mutex;
}
pthread_condattr_destroy(&condattr);
- result = anv_bo_cache_init(&device->bo_cache);
+ if (physical_device->instance->vk.trace_mode & VK_TRACE_MODE_RMV)
+ anv_memory_trace_init(device);
+
+ result = anv_bo_cache_init(&device->bo_cache, device);
if (result != VK_SUCCESS)
goto fail_queue_cond;
- anv_bo_pool_init(&device->batch_bo_pool, device, "batch");
+ anv_bo_pool_init(&device->batch_bo_pool, device, "batch",
+ ANV_BO_ALLOC_MAPPED |
+ ANV_BO_ALLOC_HOST_CACHED_COHERENT |
+ ANV_BO_ALLOC_CAPTURE);
+ if (device->vk.enabled_extensions.KHR_acceleration_structure) {
+ anv_bo_pool_init(&device->bvh_bo_pool, device, "bvh build",
+ 0 /* alloc_flags */);
+ }
/* Because scratch is also relative to General State Base Address, we leave
* the base address 0 and start the pool memory at an offset. This way we
* get the correct offsets in the anv_states that get allocated from it.
*/
result = anv_state_pool_init(&device->general_state_pool, device,
- "general pool",
- 0, GENERAL_STATE_POOL_MIN_ADDRESS, 16384);
+ &(struct anv_state_pool_params) {
+ .name = "general pool",
+ .base_address = 0,
+ .start_offset = device->physical->va.general_state_pool.addr,
+ .block_size = 16384,
+ .max_size = device->physical->va.general_state_pool.size
+ });
if (result != VK_SUCCESS)
goto fail_batch_bo_pool;
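Because the pool passes base_address = 0 and puts its VA in start_offset, the offsets stored in anv_state values are already valid relative to a zero General State Base Address. A hedged sketch of that arithmetic with made-up numbers:

#include <stdint.h>

/* Illustrative arithmetic only: with a hypothetical pool VA of
 * 0x100000000000 and an allocation 4 KiB into the pool, state.offset comes
 * back as 0x100000001000, which is already the final GPU address because
 * General State Base Address is programmed to 0.
 */
static uint64_t
example_general_state_address(uint64_t pool_va, uint64_t offset_in_pool)
{
   const uint64_t general_state_base_address = 0;
   return general_state_base_address + pool_va + offset_in_pool;
}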
result = anv_state_pool_init(&device->dynamic_state_pool, device,
- "dynamic pool",
- DYNAMIC_STATE_POOL_MIN_ADDRESS, 0, 16384);
+ &(struct anv_state_pool_params) {
+ .name = "dynamic pool",
+ .base_address = device->physical->va.dynamic_state_pool.addr,
+ .block_size = 16384,
+ .max_size = device->physical->va.dynamic_state_pool.size,
+ });
if (result != VK_SUCCESS)
goto fail_general_state_pool;
- if (device->info.ver >= 8) {
- /* The border color pointer is limited to 24 bits, so we need to make
- * sure that any such color used at any point in the program doesn't
- * exceed that limit.
- * We achieve that by reserving all the custom border colors we support
- * right off the bat, so they are close to the base address.
- */
- anv_state_reserved_pool_init(&device->custom_border_colors,
- &device->dynamic_state_pool,
- MAX_CUSTOM_BORDER_COLORS,
- sizeof(struct gfx8_border_color), 64);
+ if (device->vk.enabled_extensions.EXT_descriptor_buffer) {
+ result = anv_state_pool_init(&device->dynamic_state_db_pool, device,
+ &(struct anv_state_pool_params) {
+ .name = "dynamic pool (db)",
+ .base_address = device->physical->va.dynamic_state_db_pool.addr,
+ .block_size = 16384,
+ .max_size = device->physical->va.dynamic_state_db_pool.size,
+ });
+ if (result != VK_SUCCESS)
+ goto fail_dynamic_state_pool;
+ }
+
+ /* The border color pointer is limited to 24 bits, so we need to make
+ * sure that any such color used at any point in the program doesn't
+ * exceed that limit.
+ * We achieve that by reserving all the custom border colors we support
+ * right off the bat, so they are close to the base address.
+ */
+ anv_state_reserved_pool_init(&device->custom_border_colors,
+ &device->dynamic_state_pool,
+ MAX_CUSTOM_BORDER_COLORS,
+ sizeof(struct gfx8_border_color), 64);
+ if (device->vk.enabled_extensions.EXT_descriptor_buffer) {
+ result = anv_state_reserved_array_pool_init(&device->custom_border_colors_db,
+ &device->dynamic_state_db_pool,
+ MAX_CUSTOM_BORDER_COLORS,
+ sizeof(struct gfx8_border_color), 64);
+ if (result != VK_SUCCESS)
+ goto fail_dynamic_state_db_pool;
}
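The reservation works because the entire custom border color block stays far below what a 24-bit pointer can address. A hedged back-of-the-envelope check, assuming a count on the order of 4096 entries (the real limit is MAX_CUSTOM_BORDER_COLORS in anv_private.h):

/* Worked arithmetic with a hypothetical count: 4096 entries * 64 bytes each
 * = 256 KiB, comfortably below the 16 MiB (1 << 24) reachable through a
 * 24-bit border color pointer.
 */
_Static_assert(4096 * 64 < (1u << 24),
               "reserved border colors must stay within 24-bit addressing");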
result = anv_state_pool_init(&device->instruction_state_pool, device,
- "instruction pool",
- INSTRUCTION_STATE_POOL_MIN_ADDRESS, 0, 16384);
+ &(struct anv_state_pool_params) {
+ .name = "instruction pool",
+ .base_address = device->physical->va.instruction_state_pool.addr,
+ .block_size = 16384,
+ .max_size = device->physical->va.instruction_state_pool.size,
+ });
if (result != VK_SUCCESS)
- goto fail_dynamic_state_pool;
+ goto fail_reserved_array_pool;
- result = anv_state_pool_init(&device->surface_state_pool, device,
- "surface state pool",
- SURFACE_STATE_POOL_MIN_ADDRESS, 0, 4096);
+ if (device->info->verx10 >= 125) {
+ /* Put the scratch surface states at the beginning of the internal
+ * surface state pool.
+ */
+ result = anv_state_pool_init(&device->scratch_surface_state_pool, device,
+ &(struct anv_state_pool_params) {
+ .name = "scratch surface state pool",
+ .base_address = device->physical->va.scratch_surface_state_pool.addr,
+ .block_size = 4096,
+ .max_size = device->physical->va.scratch_surface_state_pool.size,
+ });
+ if (result != VK_SUCCESS)
+ goto fail_instruction_state_pool;
+
+ result = anv_state_pool_init(&device->internal_surface_state_pool, device,
+ &(struct anv_state_pool_params) {
+ .name = "internal surface state pool",
+ .base_address = device->physical->va.internal_surface_state_pool.addr,
+ .start_offset = device->physical->va.scratch_surface_state_pool.size,
+ .block_size = 4096,
+ .max_size = device->physical->va.internal_surface_state_pool.size,
+ });
+ } else {
+ result = anv_state_pool_init(&device->internal_surface_state_pool, device,
+ &(struct anv_state_pool_params) {
+ .name = "internal surface state pool",
+ .base_address = device->physical->va.internal_surface_state_pool.addr,
+ .block_size = 4096,
+ .max_size = device->physical->va.internal_surface_state_pool.size,
+ });
+ }
if (result != VK_SUCCESS)
- goto fail_instruction_state_pool;
+ goto fail_scratch_surface_state_pool;
+
+ if (device->physical->indirect_descriptors) {
+ result = anv_state_pool_init(&device->bindless_surface_state_pool, device,
+ &(struct anv_state_pool_params) {
+ .name = "bindless surface state pool",
+ .base_address = device->physical->va.bindless_surface_state_pool.addr,
+ .block_size = 4096,
+ .max_size = device->physical->va.bindless_surface_state_pool.size,
+ });
+ if (result != VK_SUCCESS)
+ goto fail_internal_surface_state_pool;
+ }
- if (physical_device->use_softpin) {
- int64_t bt_pool_offset = (int64_t)BINDING_TABLE_POOL_MIN_ADDRESS -
- (int64_t)SURFACE_STATE_POOL_MIN_ADDRESS;
+ if (device->info->verx10 >= 125) {
+ /* We're using 3DSTATE_BINDING_TABLE_POOL_ALLOC to give the binding
+ * table its own base address separately from surface state base.
+ */
+ result = anv_state_pool_init(&device->binding_table_pool, device,
+ &(struct anv_state_pool_params) {
+ .name = "binding table pool",
+ .base_address = device->physical->va.binding_table_pool.addr,
+ .block_size = BINDING_TABLE_POOL_BLOCK_SIZE,
+ .max_size = device->physical->va.binding_table_pool.size,
+ });
+ } else {
+ /* The binding table should be in front of the surface states in virtual
+       * address space so that all surface states can be expressed as relative
+ * offsets from the binding table location.
+ */
+ assert(device->physical->va.binding_table_pool.addr <
+ device->physical->va.internal_surface_state_pool.addr);
+ int64_t bt_pool_offset = (int64_t)device->physical->va.binding_table_pool.addr -
+ (int64_t)device->physical->va.internal_surface_state_pool.addr;
assert(INT32_MIN < bt_pool_offset && bt_pool_offset < 0);
result = anv_state_pool_init(&device->binding_table_pool, device,
- "binding table pool",
- SURFACE_STATE_POOL_MIN_ADDRESS,
- bt_pool_offset, 4096);
+ &(struct anv_state_pool_params) {
+ .name = "binding table pool",
+ .base_address = device->physical->va.internal_surface_state_pool.addr,
+ .start_offset = bt_pool_offset,
+ .block_size = BINDING_TABLE_POOL_BLOCK_SIZE,
+ .max_size = device->physical->va.internal_surface_state_pool.size,
+ });
+ }
+ if (result != VK_SUCCESS)
+ goto fail_bindless_surface_state_pool;
+
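On pre-Gfx12.5 parts the binding table pool reuses the surface state pool's base address with a negative start_offset. A worked example of that computation with made-up virtual addresses (real values come from device->physical->va):

#include <assert.h>
#include <stdint.h>

/* Illustrative only: a binding table pool at 0xc000000000 paired with a
 * surface state pool at 0xc040000000 gives
 *
 *    bt_pool_offset = 0xc000000000 - 0xc040000000 = -0x40000000
 *
 * which is negative (the binding table pool sits below the surface state
 * pool) and well above INT32_MIN, satisfying the assert in the code above.
 */
static void
example_bt_pool_offset(void)
{
   const int64_t bt_pool_offset =
      (int64_t)0xc000000000ull - (int64_t)0xc040000000ull;
   assert(INT32_MIN < bt_pool_offset && bt_pool_offset < 0);
}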
+ if (device->physical->indirect_descriptors) {
+ result = anv_state_pool_init(&device->indirect_push_descriptor_pool, device,
+ &(struct anv_state_pool_params) {
+ .name = "indirect push descriptor pool",
+ .base_address = device->physical->va.indirect_push_descriptor_pool.addr,
+ .block_size = 4096,
+ .max_size = device->physical->va.indirect_push_descriptor_pool.size,
+ });
if (result != VK_SUCCESS)
- goto fail_surface_state_pool;
+ goto fail_binding_table_pool;
}
- if (device->info.has_aux_map) {
+ if (device->vk.enabled_extensions.EXT_descriptor_buffer &&
+ device->info->verx10 >= 125) {
+      /* On Gfx12.5+, because of the bindless stages (Mesh, Task, RT), the only
+       * way we can wire up push descriptors is through the bindless heap. This
+       * state pool is a 1GB carve-out of the 4GB HW heap.
+ */
+ result = anv_state_pool_init(&device->push_descriptor_buffer_pool, device,
+ &(struct anv_state_pool_params) {
+ .name = "push descriptor buffer state pool",
+ .base_address = device->physical->va.push_descriptor_buffer_pool.addr,
+ .block_size = 4096,
+ .max_size = device->physical->va.push_descriptor_buffer_pool.size,
+ });
+ if (result != VK_SUCCESS)
+ goto fail_indirect_push_descriptor_pool;
+ }
+
+ if (device->info->has_aux_map) {
+ result = anv_state_pool_init(&device->aux_tt_pool, device,
+ &(struct anv_state_pool_params) {
+ .name = "aux-tt pool",
+ .base_address = device->physical->va.aux_tt_pool.addr,
+ .block_size = 16384,
+ .max_size = device->physical->va.aux_tt_pool.size,
+ });
+ if (result != VK_SUCCESS)
+ goto fail_push_descriptor_buffer_pool;
+
device->aux_map_ctx = intel_aux_map_init(device, &aux_map_allocator,
&physical_device->info);
if (!device->aux_map_ctx)
- goto fail_binding_table_pool;
+ goto fail_aux_tt_pool;
}
- result = anv_device_alloc_bo(device, "workaround", 4096,
+ result = anv_device_alloc_bo(device, "workaround", 8192,
ANV_BO_ALLOC_CAPTURE |
+ ANV_BO_ALLOC_HOST_COHERENT |
ANV_BO_ALLOC_MAPPED |
- ANV_BO_ALLOC_LOCAL_MEM,
+ ANV_BO_ALLOC_INTERNAL,
0 /* explicit_address */,
&device->workaround_bo);
if (result != VK_SUCCESS)
@@ -3513,85 +3674,296 @@ VkResult anv_CreateDevice(
device->workaround_address = (struct anv_address) {
.bo = device->workaround_bo,
- .offset = align_u32(
- intel_debug_write_identifiers(device->workaround_bo->map,
- device->workaround_bo->size,
- "Anv") + 8, 8),
+ .offset = align(intel_debug_write_identifiers(device->workaround_bo->map,
+ device->workaround_bo->size,
+ "Anv"), 32),
};
+ device->workarounds.doom64_images = NULL;
+
+ device->rt_uuid_addr = anv_address_add(device->workaround_address, 8);
+ memcpy(device->rt_uuid_addr.bo->map + device->rt_uuid_addr.offset,
+ physical_device->rt_uuid,
+ sizeof(physical_device->rt_uuid));
+
device->debug_frame_desc =
intel_debug_get_identifier_block(device->workaround_bo->map,
device->workaround_bo->size,
INTEL_DEBUG_BLOCK_TYPE_FRAME);
+ if (device->vk.enabled_extensions.KHR_ray_query) {
+ uint32_t ray_queries_size =
+ align(brw_rt_ray_queries_hw_stacks_size(device->info), 4096);
+
+ result = anv_device_alloc_bo(device, "ray queries",
+ ray_queries_size,
+ ANV_BO_ALLOC_INTERNAL,
+ 0 /* explicit_address */,
+ &device->ray_query_bo);
+ if (result != VK_SUCCESS)
+ goto fail_workaround_bo;
+ }
+
result = anv_device_init_trivial_batch(device);
if (result != VK_SUCCESS)
- goto fail_workaround_bo;
+ goto fail_ray_query_bo;
- /* Allocate a null surface state at surface state offset 0. This makes
- * NULL descriptor handling trivial because we can just memset structures
- * to zero and they have a valid descriptor.
+ /* Emit the CPS states before running the initialization batch as those
+ * structures are referenced.
*/
- device->null_surface_state =
- anv_state_pool_alloc(&device->surface_state_pool,
- device->isl_dev.ss.size,
- device->isl_dev.ss.align);
- isl_null_fill_state(&device->isl_dev, device->null_surface_state.map,
+ if (device->info->ver >= 12) {
+      uint32_t n_cps_states = 3 * 3; /* All combinations of X by Y CP sizes (1, 2, 4) */
+
+ if (device->info->has_coarse_pixel_primitive_and_cb)
+         n_cps_states *= 5 * 5; /* 5 combiner modes for each of the 2 combiner operations */
+
+ n_cps_states += 1; /* Disable CPS */
+
+      /* Each of the combinations must be replicated on all viewports */
+ n_cps_states *= MAX_VIEWPORTS;
+
+ device->cps_states =
+ anv_state_pool_alloc(&device->dynamic_state_pool,
+ n_cps_states * CPS_STATE_length(device->info) * 4,
+ 32);
+ if (device->cps_states.map == NULL)
+ goto fail_trivial_batch;
+
+ anv_genX(device->info, init_cps_device_state)(device);
+
+ if (device->vk.enabled_extensions.EXT_descriptor_buffer) {
+ device->cps_states_db =
+ anv_state_pool_alloc(&device->dynamic_state_db_pool,
+ device->cps_states.alloc_size, 32);
+ if (device->cps_states_db.map == NULL)
+ goto fail_trivial_batch;
+
+ memcpy(device->cps_states_db.map, device->cps_states.map,
+ device->cps_states.alloc_size);
+ }
+ }
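A worked count for the CPS state allocation above, assuming MAX_VIEWPORTS is 16 (the driver's compile-time viewport limit):

/* Worked arithmetic (assuming MAX_VIEWPORTS == 16):
 *
 *    without coarse-pixel combiners: (3 * 3 + 1) * 16         = 160 states
 *    with coarse-pixel combiners:    (3 * 3 * 5 * 5 + 1) * 16 = 3616 states
 *
 * Each state takes CPS_STATE_length(device->info) * 4 bytes in the dynamic
 * state pool (mirrored into the descriptor-buffer pool when
 * EXT_descriptor_buffer is enabled).
 */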
+
+ if (device->physical->indirect_descriptors) {
+ /* Allocate a null surface state at surface state offset 0. This makes
+ * NULL descriptor handling trivial because we can just memset
+ * structures to zero and they have a valid descriptor.
+ */
+ device->null_surface_state =
+ anv_state_pool_alloc(&device->bindless_surface_state_pool,
+ device->isl_dev.ss.size,
+ device->isl_dev.ss.align);
+ isl_null_fill_state(&device->isl_dev, device->null_surface_state.map,
+ .size = isl_extent3d(1, 1, 1) /* This shouldn't matter */);
+ assert(device->null_surface_state.offset == 0);
+ } else {
+      /* When using direct descriptors, the descriptors themselves can hold
+       * the null surface state directly. We still need a null surface for
+       * the binding table entries though, but this one can live anywhere in
+       * the internal surface state pool.
+ */
+ device->null_surface_state =
+ anv_state_pool_alloc(&device->internal_surface_state_pool,
+ device->isl_dev.ss.size,
+ device->isl_dev.ss.align);
+ isl_null_fill_state(&device->isl_dev, device->null_surface_state.map,
+ .size = isl_extent3d(1, 1, 1) /* This shouldn't matter */);
+ }
+
+ isl_null_fill_state(&device->isl_dev, &device->host_null_surface_state,
.size = isl_extent3d(1, 1, 1) /* This shouldn't matter */);
- assert(device->null_surface_state.offset == 0);
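With indirect descriptors, the null surface state deliberately lands at offset 0 of the bindless pool, so a zero-filled descriptor already references a valid null surface. A hedged sketch with a hypothetical descriptor layout (not the driver's real one):

#include <stdint.h>
#include <string.h>

/* Hypothetical layout, for illustration only: because the null surface sits
 * at offset 0, memset()-ing a descriptor to zero leaves its surface state
 * offset pointing at the null surface, which is exactly what a NULL
 * descriptor should do.
 */
struct example_indirect_descriptor {
   uint32_t surface_state_offset; /* offset into the bindless surface pool */
   uint32_t sampler_state_offset;
};

static void
example_write_null_descriptor(struct example_indirect_descriptor *desc)
{
   memset(desc, 0, sizeof(*desc)); /* surface_state_offset == 0 == null surface */
}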
anv_scratch_pool_init(device, &device->scratch_pool);
/* TODO(RT): Do we want some sort of data structure for this? */
memset(device->rt_scratch_bos, 0, sizeof(device->rt_scratch_bos));
- result = anv_genX(&device->info, init_device_state)(device);
+ if (ANV_SUPPORT_RT && device->info->has_ray_tracing) {
+ /* The docs say to always allocate 128KB per DSS */
+ const uint32_t btd_fifo_bo_size =
+ 128 * 1024 * intel_device_info_dual_subslice_id_bound(device->info);
+ result = anv_device_alloc_bo(device,
+ "rt-btd-fifo",
+ btd_fifo_bo_size,
+ ANV_BO_ALLOC_INTERNAL,
+ 0 /* explicit_address */,
+ &device->btd_fifo_bo);
+ if (result != VK_SUCCESS)
+ goto fail_trivial_batch_bo_and_scratch_pool;
+ }
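A worked size for the BTD FIFO allocation above, with a hypothetical topology:

/* Worked arithmetic with a made-up dual-subslice count: if
 * intel_device_info_dual_subslice_id_bound() returns 32, then
 *
 *    btd_fifo_bo_size = 128 KiB * 32 = 4 MiB
 */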
+
+ result = anv_device_init_trtt(device);
if (result != VK_SUCCESS)
- goto fail_trivial_batch_bo_and_scratch_pool;
+ goto fail_btd_fifo_bo;
- anv_pipeline_cache_init(&device->default_pipeline_cache, device,
- true /* cache_enabled */, false /* external_sync */);
+ result = anv_genX(device->info, init_device_state)(device);
+ if (result != VK_SUCCESS)
+ goto fail_trtt;
+
+ struct vk_pipeline_cache_create_info pcc_info = { };
+ device->default_pipeline_cache =
+ vk_pipeline_cache_create(&device->vk, &pcc_info, NULL);
+ if (!device->default_pipeline_cache) {
+ result = vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
+ goto fail_trtt;
+ }
+
+   /* Internal shaders need their own pipeline cache because, unlike the rest
+    * of ANV, BLORP won't work at all without a cache: it relies on the cache
+    * to keep its shaders resident while it runs. Therefore, we need a special
+    * cache just for BLORP/RT that's forced to always be enabled.
+ */
+ pcc_info.force_enable = true;
+ device->internal_cache =
+ vk_pipeline_cache_create(&device->vk, &pcc_info, NULL);
+ if (device->internal_cache == NULL) {
+ result = vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
+ goto fail_default_pipeline_cache;
+ }
+
+   /* The device (currently ICL/TGL) does not have float64 support. */
+ if (!device->info->has_64bit_float &&
+ device->physical->instance->fp64_workaround_enabled)
+ anv_load_fp64_shader(device);
result = anv_device_init_rt_shaders(device);
- if (result != VK_SUCCESS)
- goto fail_rt_trampoline;
+ if (result != VK_SUCCESS) {
+ result = vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
+ goto fail_internal_cache;
+ }
+
+#if DETECT_OS_ANDROID
+ device->u_gralloc = u_gralloc_create(U_GRALLOC_TYPE_AUTO);
+#endif
+
+ device->robust_buffer_access =
+ device->vk.enabled_features.robustBufferAccess ||
+ device->vk.enabled_features.nullDescriptor;
+
+ device->breakpoint = anv_state_pool_alloc(&device->dynamic_state_pool, 4,
+ 4);
+ p_atomic_set(&device->draw_call_count, 0);
+
+ /* Create a separate command pool for companion RCS command buffer. */
+ if (device->info->verx10 >= 125) {
+ VkCommandPoolCreateInfo pool_info = {
+ .sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO,
+ .queueFamilyIndex =
+ anv_get_first_render_queue_index(device->physical),
+ };
+
+ result = vk_common_CreateCommandPool(anv_device_to_handle(device),
+ &pool_info, NULL,
+ &device->companion_rcs_cmd_pool);
+ if (result != VK_SUCCESS) {
+ goto fail_internal_cache;
+ }
+ }
anv_device_init_blorp(device);
anv_device_init_border_colors(device);
+ anv_device_init_internal_kernels(device);
+
+ anv_device_init_astc_emu(device);
+
anv_device_perf_init(device);
+ anv_device_utrace_init(device);
+
+ anv_device_init_embedded_samplers(device);
+
+ BITSET_ONES(device->gfx_dirty_state);
+ BITSET_CLEAR(device->gfx_dirty_state, ANV_GFX_STATE_INDEX_BUFFER);
+ BITSET_CLEAR(device->gfx_dirty_state, ANV_GFX_STATE_SO_DECL_LIST);
+ if (device->info->ver < 11)
+ BITSET_CLEAR(device->gfx_dirty_state, ANV_GFX_STATE_VF_SGVS_2);
+ if (device->info->ver < 12) {
+ BITSET_CLEAR(device->gfx_dirty_state, ANV_GFX_STATE_PRIMITIVE_REPLICATION);
+ BITSET_CLEAR(device->gfx_dirty_state, ANV_GFX_STATE_DEPTH_BOUNDS);
+ }
+ if (!device->vk.enabled_extensions.EXT_sample_locations)
+ BITSET_CLEAR(device->gfx_dirty_state, ANV_GFX_STATE_SAMPLE_PATTERN);
+ if (!device->vk.enabled_extensions.KHR_fragment_shading_rate)
+ BITSET_CLEAR(device->gfx_dirty_state, ANV_GFX_STATE_CPS);
+ if (!device->vk.enabled_extensions.EXT_mesh_shader) {
+ BITSET_CLEAR(device->gfx_dirty_state, ANV_GFX_STATE_SBE_MESH);
+ BITSET_CLEAR(device->gfx_dirty_state, ANV_GFX_STATE_CLIP_MESH);
+ BITSET_CLEAR(device->gfx_dirty_state, ANV_GFX_STATE_MESH_CONTROL);
+ BITSET_CLEAR(device->gfx_dirty_state, ANV_GFX_STATE_MESH_SHADER);
+ BITSET_CLEAR(device->gfx_dirty_state, ANV_GFX_STATE_MESH_DISTRIB);
+ BITSET_CLEAR(device->gfx_dirty_state, ANV_GFX_STATE_TASK_CONTROL);
+ BITSET_CLEAR(device->gfx_dirty_state, ANV_GFX_STATE_TASK_SHADER);
+ BITSET_CLEAR(device->gfx_dirty_state, ANV_GFX_STATE_TASK_REDISTRIB);
+ }
+ if (!intel_needs_workaround(device->info, 18019816803))
+ BITSET_CLEAR(device->gfx_dirty_state, ANV_GFX_STATE_WA_18019816803);
+ if (device->info->ver > 9)
+ BITSET_CLEAR(device->gfx_dirty_state, ANV_GFX_STATE_PMA_FIX);
+
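The dirty-state bitset starts with every bit set and then has the states this device can never emit cleared once at creation, so later emission code only tests bits. A minimal sketch of the pattern with a plain 64-bit mask (bit indices are hypothetical; the real code uses Mesa's BITSET macros and the ANV_GFX_STATE_* enum):

#include <stdbool.h>
#include <stdint.h>

/* Illustrative trimming pattern: clear bits for states the device or the
 * enabled extensions can never require, so the emit loop skips them cheaply.
 */
static uint64_t
example_init_dirty_mask(bool has_mesh, bool has_sample_locations)
{
   uint64_t dirty = ~0ull; /* everything dirty by default */
   if (!has_mesh)
      dirty &= ~(1ull << 5 /* hypothetical MESH_SHADER bit */);
   if (!has_sample_locations)
      dirty &= ~(1ull << 9 /* hypothetical SAMPLE_PATTERN bit */);
   return dirty;
}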
*pDevice = anv_device_to_handle(device);
return VK_SUCCESS;
- fail_rt_trampoline:
- anv_pipeline_cache_finish(&device->default_pipeline_cache);
+ fail_internal_cache:
+ vk_pipeline_cache_destroy(device->internal_cache, NULL);
+ fail_default_pipeline_cache:
+ vk_pipeline_cache_destroy(device->default_pipeline_cache, NULL);
+ fail_trtt:
+ anv_device_finish_trtt(device);
+ fail_btd_fifo_bo:
+ if (ANV_SUPPORT_RT && device->info->has_ray_tracing)
+ anv_device_release_bo(device, device->btd_fifo_bo);
fail_trivial_batch_bo_and_scratch_pool:
anv_scratch_pool_finish(device, &device->scratch_pool);
+ fail_trivial_batch:
anv_device_release_bo(device, device->trivial_batch_bo);
+ fail_ray_query_bo:
+ if (device->ray_query_bo)
+ anv_device_release_bo(device, device->ray_query_bo);
fail_workaround_bo:
anv_device_release_bo(device, device->workaround_bo);
fail_surface_aux_map_pool:
- if (device->info.has_aux_map) {
+ if (device->info->has_aux_map) {
intel_aux_map_finish(device->aux_map_ctx);
device->aux_map_ctx = NULL;
}
+ fail_aux_tt_pool:
+ if (device->info->has_aux_map)
+ anv_state_pool_finish(&device->aux_tt_pool);
+ fail_push_descriptor_buffer_pool:
+ if (device->vk.enabled_extensions.EXT_descriptor_buffer &&
+ device->info->verx10 >= 125)
+ anv_state_pool_finish(&device->push_descriptor_buffer_pool);
+ fail_indirect_push_descriptor_pool:
+ if (device->physical->indirect_descriptors)
+ anv_state_pool_finish(&device->indirect_push_descriptor_pool);
fail_binding_table_pool:
- if (physical_device->use_softpin)
- anv_state_pool_finish(&device->binding_table_pool);
- fail_surface_state_pool:
- anv_state_pool_finish(&device->surface_state_pool);
+ anv_state_pool_finish(&device->binding_table_pool);
+ fail_bindless_surface_state_pool:
+ if (device->physical->indirect_descriptors)
+ anv_state_pool_finish(&device->bindless_surface_state_pool);
+ fail_internal_surface_state_pool:
+ anv_state_pool_finish(&device->internal_surface_state_pool);
+ fail_scratch_surface_state_pool:
+ if (device->info->verx10 >= 125)
+ anv_state_pool_finish(&device->scratch_surface_state_pool);
fail_instruction_state_pool:
anv_state_pool_finish(&device->instruction_state_pool);
+ fail_reserved_array_pool:
+ if (device->vk.enabled_extensions.EXT_descriptor_buffer)
+ anv_state_reserved_array_pool_finish(&device->custom_border_colors_db);
+ fail_dynamic_state_db_pool:
+ anv_state_reserved_pool_finish(&device->custom_border_colors);
+ if (device->vk.enabled_extensions.EXT_descriptor_buffer)
+ anv_state_pool_finish(&device->dynamic_state_db_pool);
fail_dynamic_state_pool:
- if (device->info.ver >= 8)
- anv_state_reserved_pool_finish(&device->custom_border_colors);
anv_state_pool_finish(&device->dynamic_state_pool);
fail_general_state_pool:
anv_state_pool_finish(&device->general_state_pool);
fail_batch_bo_pool:
+ if (device->vk.enabled_extensions.KHR_acceleration_structure)
+ anv_bo_pool_finish(&device->bvh_bo_pool);
anv_bo_pool_finish(&device->batch_bo_pool);
anv_bo_cache_finish(&device->bo_cache);
fail_queue_cond:
@@ -3599,17 +3971,19 @@ VkResult anv_CreateDevice(
fail_mutex:
pthread_mutex_destroy(&device->mutex);
fail_vmas:
- if (physical_device->use_softpin) {
- util_vma_heap_finish(&device->vma_hi);
- util_vma_heap_finish(&device->vma_cva);
- util_vma_heap_finish(&device->vma_lo);
- }
+ util_vma_heap_finish(&device->vma_trtt);
+ util_vma_heap_finish(&device->vma_samplers);
+ util_vma_heap_finish(&device->vma_desc_buf);
+ util_vma_heap_finish(&device->vma_desc);
+ util_vma_heap_finish(&device->vma_hi);
+ util_vma_heap_finish(&device->vma_lo);
+ pthread_mutex_destroy(&device->vma_mutex);
fail_queues:
for (uint32_t i = 0; i < device->queue_count; i++)
anv_queue_finish(&device->queues[i]);
vk_free(&device->vk.alloc, device->queues);
fail_context_id:
- anv_gem_destroy_context(device, device->context_id);
+ anv_device_destroy_context_or_vm(device);
fail_fd:
close(device->fd);
fail_device:
@@ -3629,20 +4003,58 @@ void anv_DestroyDevice(
if (!device)
return;
+#if DETECT_OS_ANDROID
+ u_gralloc_destroy(&device->u_gralloc);
+#endif
+
+ anv_memory_trace_finish(device);
+
+ struct anv_physical_device *pdevice = device->physical;
+
+ for (uint32_t i = 0; i < device->queue_count; i++)
+ anv_queue_finish(&device->queues[i]);
+ vk_free(&device->vk.alloc, device->queues);
+
+ anv_device_utrace_finish(device);
+
anv_device_finish_blorp(device);
anv_device_finish_rt_shaders(device);
- anv_pipeline_cache_finish(&device->default_pipeline_cache);
+ anv_device_finish_astc_emu(device);
+
+ anv_device_finish_internal_kernels(device);
+
+ vk_pipeline_cache_destroy(device->internal_cache, NULL);
+ vk_pipeline_cache_destroy(device->default_pipeline_cache, NULL);
+
+ anv_device_finish_embedded_samplers(device);
+
+ anv_device_finish_trtt(device);
+
+ if (ANV_SUPPORT_RT && device->info->has_ray_tracing)
+ anv_device_release_bo(device, device->btd_fifo_bo);
+
+ if (device->info->verx10 >= 125) {
+ vk_common_DestroyCommandPool(anv_device_to_handle(device),
+ device->companion_rcs_cmd_pool, NULL);
+ }
#ifdef HAVE_VALGRIND
/* We only need to free these to prevent valgrind errors. The backing
* BO will go away in a couple of lines so we don't actually leak.
*/
- if (device->info.ver >= 8)
- anv_state_reserved_pool_finish(&device->custom_border_colors);
+ anv_state_reserved_pool_finish(&device->custom_border_colors);
anv_state_pool_free(&device->dynamic_state_pool, device->border_colors);
anv_state_pool_free(&device->dynamic_state_pool, device->slice_hash);
+ anv_state_pool_free(&device->dynamic_state_pool, device->cps_states);
+ anv_state_pool_free(&device->dynamic_state_pool, device->breakpoint);
+ if (device->vk.enabled_extensions.EXT_descriptor_buffer) {
+ anv_state_pool_free(&device->dynamic_state_db_pool, device->cps_states_db);
+ anv_state_pool_free(&device->dynamic_state_db_pool, device->slice_hash_db);
+ anv_state_pool_free(&device->dynamic_state_db_pool, device->border_colors_db);
+ anv_state_reserved_array_pool_finish(&device->custom_border_colors_db);
+ }
#endif
for (unsigned i = 0; i < ARRAY_SIZE(device->rt_scratch_bos); i++) {
@@ -3652,42 +4064,66 @@ void anv_DestroyDevice(
anv_scratch_pool_finish(device, &device->scratch_pool);
+ if (device->vk.enabled_extensions.KHR_ray_query) {
+ for (unsigned i = 0; i < ARRAY_SIZE(device->ray_query_shadow_bos); i++) {
+ if (device->ray_query_shadow_bos[i] != NULL)
+ anv_device_release_bo(device, device->ray_query_shadow_bos[i]);
+ }
+ anv_device_release_bo(device, device->ray_query_bo);
+ }
anv_device_release_bo(device, device->workaround_bo);
anv_device_release_bo(device, device->trivial_batch_bo);
- if (device->info.has_aux_map) {
+ if (device->info->has_aux_map) {
intel_aux_map_finish(device->aux_map_ctx);
device->aux_map_ctx = NULL;
+ anv_state_pool_finish(&device->aux_tt_pool);
}
-
- if (device->physical->use_softpin)
- anv_state_pool_finish(&device->binding_table_pool);
- anv_state_pool_finish(&device->surface_state_pool);
+ if (device->vk.enabled_extensions.EXT_descriptor_buffer &&
+ device->info->verx10 >= 125)
+ anv_state_pool_finish(&device->push_descriptor_buffer_pool);
+ if (device->physical->indirect_descriptors)
+ anv_state_pool_finish(&device->indirect_push_descriptor_pool);
+ anv_state_pool_finish(&device->binding_table_pool);
+ if (device->info->verx10 >= 125)
+ anv_state_pool_finish(&device->scratch_surface_state_pool);
+ anv_state_pool_finish(&device->internal_surface_state_pool);
+ if (device->physical->indirect_descriptors)
+ anv_state_pool_finish(&device->bindless_surface_state_pool);
anv_state_pool_finish(&device->instruction_state_pool);
+ if (device->vk.enabled_extensions.EXT_descriptor_buffer)
+ anv_state_pool_finish(&device->dynamic_state_db_pool);
anv_state_pool_finish(&device->dynamic_state_pool);
anv_state_pool_finish(&device->general_state_pool);
+ if (device->vk.enabled_extensions.KHR_acceleration_structure)
+ anv_bo_pool_finish(&device->bvh_bo_pool);
anv_bo_pool_finish(&device->batch_bo_pool);
anv_bo_cache_finish(&device->bo_cache);
- if (device->physical->use_softpin) {
- util_vma_heap_finish(&device->vma_hi);
- util_vma_heap_finish(&device->vma_cva);
- util_vma_heap_finish(&device->vma_lo);
- }
+ util_vma_heap_finish(&device->vma_trtt);
+ util_vma_heap_finish(&device->vma_samplers);
+ util_vma_heap_finish(&device->vma_desc_buf);
+ util_vma_heap_finish(&device->vma_desc);
+ util_vma_heap_finish(&device->vma_hi);
+ util_vma_heap_finish(&device->vma_lo);
+ pthread_mutex_destroy(&device->vma_mutex);
pthread_cond_destroy(&device->queue_submit);
pthread_mutex_destroy(&device->mutex);
- for (uint32_t i = 0; i < device->queue_count; i++)
- anv_queue_finish(&device->queues[i]);
- vk_free(&device->vk.alloc, device->queues);
+ ralloc_free(device->fp64_nir);
- anv_gem_destroy_context(device, device->context_id);
+ anv_device_destroy_context_or_vm(device);
- if (INTEL_DEBUG & DEBUG_BATCH)
- intel_batch_decode_ctx_finish(&device->decoder_ctx);
+ if (INTEL_DEBUG(DEBUG_BATCH | DEBUG_BATCH_STATS)) {
+ for (unsigned i = 0; i < pdevice->queue.family_count; i++) {
+ if (INTEL_DEBUG(DEBUG_BATCH_STATS))
+ intel_batch_print_stats(&device->decoder[i]);
+ intel_batch_decode_ctx_finish(&device->decoder[i]);
+ }
+ }
close(device->fd);
@@ -3705,161 +4141,7 @@ VkResult anv_EnumerateInstanceLayerProperties(
}
/* None supported at this time */
- return vk_error(VK_ERROR_LAYER_NOT_PRESENT);
-}
-
-void anv_GetDeviceQueue2(
- VkDevice _device,
- const VkDeviceQueueInfo2* pQueueInfo,
- VkQueue* pQueue)
-{
- ANV_FROM_HANDLE(anv_device, device, _device);
- struct anv_physical_device *pdevice = device->physical;
-
- assert(pQueueInfo->queueFamilyIndex < pdevice->queue.family_count);
- struct anv_queue_family *queue_family =
- &pdevice->queue.families[pQueueInfo->queueFamilyIndex];
-
- int idx_in_family = 0;
- struct anv_queue *queue = NULL;
- for (uint32_t i = 0; i < device->queue_count; i++) {
- if (device->queues[i].family != queue_family)
- continue;
-
- if (idx_in_family == pQueueInfo->queueIndex) {
- queue = &device->queues[i];
- break;
- }
-
- idx_in_family++;
- }
- assert(queue != NULL);
-
- if (queue && queue->flags == pQueueInfo->flags)
- *pQueue = anv_queue_to_handle(queue);
- else
- *pQueue = NULL;
-}
-
-void
-_anv_device_report_lost(struct anv_device *device)
-{
- assert(p_atomic_read(&device->_lost) > 0);
-
- device->lost_reported = true;
-
- for (uint32_t i = 0; i < device->queue_count; i++) {
- struct anv_queue *queue = &device->queues[i];
- if (queue->lost) {
- __vk_errorf(device->physical->instance, &device->vk.base,
- VK_ERROR_DEVICE_LOST,
- queue->error_file, queue->error_line,
- "%s", queue->error_msg);
- }
- }
-}
-
-VkResult
-_anv_device_set_lost(struct anv_device *device,
- const char *file, int line,
- const char *msg, ...)
-{
- VkResult err;
- va_list ap;
-
- if (p_atomic_read(&device->_lost) > 0)
- return VK_ERROR_DEVICE_LOST;
-
- p_atomic_inc(&device->_lost);
- device->lost_reported = true;
-
- va_start(ap, msg);
- err = __vk_errorv(device->physical->instance, &device->vk.base,
- VK_ERROR_DEVICE_LOST, file, line, msg, ap);
- va_end(ap);
-
- if (env_var_as_boolean("ANV_ABORT_ON_DEVICE_LOSS", false))
- abort();
-
- return err;
-}
-
-VkResult
-_anv_queue_set_lost(struct anv_queue *queue,
- const char *file, int line,
- const char *msg, ...)
-{
- va_list ap;
-
- if (queue->lost)
- return VK_ERROR_DEVICE_LOST;
-
- queue->lost = true;
-
- queue->error_file = file;
- queue->error_line = line;
- va_start(ap, msg);
- vsnprintf(queue->error_msg, sizeof(queue->error_msg),
- msg, ap);
- va_end(ap);
-
- p_atomic_inc(&queue->device->_lost);
-
- if (env_var_as_boolean("ANV_ABORT_ON_DEVICE_LOSS", false))
- abort();
-
- return VK_ERROR_DEVICE_LOST;
-}
-
-VkResult
-anv_device_query_status(struct anv_device *device)
-{
- /* This isn't likely as most of the callers of this function already check
- * for it. However, it doesn't hurt to check and it potentially lets us
- * avoid an ioctl.
- */
- if (anv_device_is_lost(device))
- return VK_ERROR_DEVICE_LOST;
-
- uint32_t active, pending;
- int ret = anv_gem_context_get_reset_stats(device->fd, device->context_id,
- &active, &pending);
- if (ret == -1) {
- /* We don't know the real error. */
- return anv_device_set_lost(device, "get_reset_stats failed: %m");
- }
-
- if (active) {
- return anv_device_set_lost(device, "GPU hung on one of our command buffers");
- } else if (pending) {
- return anv_device_set_lost(device, "GPU hung with commands in-flight");
- }
-
- return VK_SUCCESS;
-}
-
-VkResult
-anv_device_bo_busy(struct anv_device *device, struct anv_bo *bo)
-{
- /* Note: This only returns whether or not the BO is in use by an i915 GPU.
- * Other usages of the BO (such as on different hardware) will not be
- * flagged as "busy" by this ioctl. Use with care.
- */
- int ret = anv_gem_busy(device, bo->gem_handle);
- if (ret == 1) {
- return VK_NOT_READY;
- } else if (ret == -1) {
- /* We don't know the real error. */
- return anv_device_set_lost(device, "gem wait failed: %m");
- }
-
- /* Query for device status after the busy call. If the BO we're checking
- * got caught in a GPU hang we don't want to return VK_SUCCESS to the
- * client because it clearly doesn't have valid data. Yes, this most
- * likely means an ioctl, but we just did an ioctl to query the busy status
- * so it's no great loss.
- */
- return anv_device_query_status(device);
+ return vk_error(NULL, VK_ERROR_LAYER_NOT_PRESENT);
}
VkResult
@@ -3871,52 +4153,60 @@ anv_device_wait(struct anv_device *device, struct anv_bo *bo,
return VK_TIMEOUT;
} else if (ret == -1) {
/* We don't know the real error. */
- return anv_device_set_lost(device, "gem wait failed: %m");
+ return vk_device_set_lost(&device->vk, "gem wait failed: %m");
+ } else {
+ return VK_SUCCESS;
}
-
- /* Query for device status after the wait. If the BO we're waiting on got
- * caught in a GPU hang we don't want to return VK_SUCCESS to the client
- * because it clearly doesn't have valid data. Yes, this most likely means
- * an ioctl, but we just did an ioctl to wait so it's no great loss.
- */
- return anv_device_query_status(device);
}
-VkResult anv_DeviceWaitIdle(
- VkDevice _device)
+static struct util_vma_heap *
+anv_vma_heap_for_flags(struct anv_device *device,
+ enum anv_bo_alloc_flags alloc_flags)
{
- ANV_FROM_HANDLE(anv_device, device, _device);
+ if (alloc_flags & ANV_BO_ALLOC_TRTT)
+ return &device->vma_trtt;
- if (anv_device_is_lost(device))
- return VK_ERROR_DEVICE_LOST;
+ if (alloc_flags & ANV_BO_ALLOC_DESCRIPTOR_BUFFER_POOL)
+ return &device->vma_desc_buf;
- for (uint32_t i = 0; i < device->queue_count; i++) {
- VkResult res = anv_queue_submit_simple_batch(&device->queues[i], NULL);
- if (res != VK_SUCCESS)
- return res;
- }
+ if (alloc_flags & ANV_BO_ALLOC_32BIT_ADDRESS)
+ return &device->vma_lo;
- return VK_SUCCESS;
+ if (alloc_flags & ANV_BO_ALLOC_DESCRIPTOR_POOL)
+ return &device->vma_desc;
+
+ if (alloc_flags & ANV_BO_ALLOC_SAMPLER_POOL)
+ return &device->vma_samplers;
+
+ return &device->vma_hi;
}
uint64_t
anv_vma_alloc(struct anv_device *device,
uint64_t size, uint64_t align,
enum anv_bo_alloc_flags alloc_flags,
- uint64_t client_address)
+ uint64_t client_address,
+ struct util_vma_heap **out_vma_heap)
{
pthread_mutex_lock(&device->vma_mutex);
uint64_t addr = 0;
+ *out_vma_heap = anv_vma_heap_for_flags(device, alloc_flags);
if (alloc_flags & ANV_BO_ALLOC_CLIENT_VISIBLE_ADDRESS) {
+ assert(*out_vma_heap == &device->vma_hi ||
+ *out_vma_heap == &device->vma_desc_buf ||
+ *out_vma_heap == &device->vma_trtt);
+
if (client_address) {
- if (util_vma_heap_alloc_addr(&device->vma_cva,
+ if (util_vma_heap_alloc_addr(*out_vma_heap,
client_address, size)) {
addr = client_address;
}
} else {
- addr = util_vma_heap_alloc(&device->vma_cva, size, align);
+ (*out_vma_heap)->alloc_high = false;
+ addr = util_vma_heap_alloc(*out_vma_heap, size, align);
+ (*out_vma_heap)->alloc_high = true;
}
/* We don't want to fall back to other heaps */
goto done;
@@ -3924,11 +4214,7 @@ anv_vma_alloc(struct anv_device *device,
assert(client_address == 0);
- if (!(alloc_flags & ANV_BO_ALLOC_32BIT_ADDRESS))
- addr = util_vma_heap_alloc(&device->vma_hi, size, align);
-
- if (addr == 0)
- addr = util_vma_heap_alloc(&device->vma_lo, size, align);
+ addr = util_vma_heap_alloc(*out_vma_heap, size, align);
done:
pthread_mutex_unlock(&device->vma_mutex);
@@ -3939,22 +4225,21 @@ done:
void
anv_vma_free(struct anv_device *device,
+ struct util_vma_heap *vma_heap,
uint64_t address, uint64_t size)
{
+ assert(vma_heap == &device->vma_lo ||
+ vma_heap == &device->vma_hi ||
+ vma_heap == &device->vma_desc ||
+ vma_heap == &device->vma_desc_buf ||
+ vma_heap == &device->vma_samplers ||
+ vma_heap == &device->vma_trtt);
+
const uint64_t addr_48b = intel_48b_address(address);
pthread_mutex_lock(&device->vma_mutex);
- if (addr_48b >= LOW_HEAP_MIN_ADDRESS &&
- addr_48b <= LOW_HEAP_MAX_ADDRESS) {
- util_vma_heap_free(&device->vma_lo, addr_48b, size);
- } else if (addr_48b >= CLIENT_VISIBLE_HEAP_MIN_ADDRESS &&
- addr_48b <= CLIENT_VISIBLE_HEAP_MAX_ADDRESS) {
- util_vma_heap_free(&device->vma_cva, addr_48b, size);
- } else {
- assert(addr_48b >= HIGH_HEAP_MIN_ADDRESS);
- util_vma_heap_free(&device->vma_hi, addr_48b, size);
- }
+ util_vma_heap_free(vma_heap, addr_48b, size);
pthread_mutex_unlock(&device->vma_mutex);
}
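anv_vma_alloc() now reports which heap the address came from, and anv_vma_free() takes that same heap back instead of re-deriving it from the address range. A minimal usage sketch (flag choice and sizes are illustrative):

/* Minimal usage sketch of the updated VMA API. */
static void
example_vma_roundtrip(struct anv_device *device)
{
   struct util_vma_heap *heap = NULL;
   uint64_t addr = anv_vma_alloc(device, 4096 /* size */, 4096 /* align */,
                                 ANV_BO_ALLOC_32BIT_ADDRESS,
                                 0 /* client_address */, &heap);
   if (addr == 0)
      return; /* allocation failed */

   /* ... bind a BO at addr ... */

   anv_vma_free(device, heap, addr, 4096);
}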
@@ -3972,125 +4257,165 @@ VkResult anv_AllocateMemory(
assert(pAllocateInfo->sType == VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO);
- /* The Vulkan 1.0.33 spec says "allocationSize must be greater than 0". */
- assert(pAllocateInfo->allocationSize > 0);
-
VkDeviceSize aligned_alloc_size =
- align_u64(pAllocateInfo->allocationSize, 4096);
-
- if (aligned_alloc_size > MAX_MEMORY_ALLOCATION_SIZE)
- return vk_error(VK_ERROR_OUT_OF_DEVICE_MEMORY);
+ align64(pAllocateInfo->allocationSize, 4096);
assert(pAllocateInfo->memoryTypeIndex < pdevice->memory.type_count);
- struct anv_memory_type *mem_type =
+ const struct anv_memory_type *mem_type =
&pdevice->memory.types[pAllocateInfo->memoryTypeIndex];
assert(mem_type->heapIndex < pdevice->memory.heap_count);
struct anv_memory_heap *mem_heap =
&pdevice->memory.heaps[mem_type->heapIndex];
+ if (aligned_alloc_size > mem_heap->size)
+ return vk_error(device, VK_ERROR_OUT_OF_DEVICE_MEMORY);
+
uint64_t mem_heap_used = p_atomic_read(&mem_heap->used);
if (mem_heap_used + aligned_alloc_size > mem_heap->size)
- return vk_error(VK_ERROR_OUT_OF_DEVICE_MEMORY);
+ return vk_error(device, VK_ERROR_OUT_OF_DEVICE_MEMORY);
- mem = vk_object_alloc(&device->vk, pAllocator, sizeof(*mem),
- VK_OBJECT_TYPE_DEVICE_MEMORY);
+ mem = vk_device_memory_create(&device->vk, pAllocateInfo,
+ pAllocator, sizeof(*mem));
if (mem == NULL)
- return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+ return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
mem->type = mem_type;
mem->map = NULL;
mem->map_size = 0;
- mem->ahw = NULL;
- mem->host_ptr = NULL;
+ mem->map_delta = 0;
enum anv_bo_alloc_flags alloc_flags = 0;
- const VkExportMemoryAllocateInfo *export_info = NULL;
- const VkImportAndroidHardwareBufferInfoANDROID *ahw_import_info = NULL;
const VkImportMemoryFdInfoKHR *fd_info = NULL;
- const VkImportMemoryHostPointerInfoEXT *host_ptr_info = NULL;
const VkMemoryDedicatedAllocateInfo *dedicated_info = NULL;
- VkMemoryAllocateFlags vk_flags = 0;
+ const struct wsi_memory_allocate_info *wsi_info = NULL;
uint64_t client_address = 0;
vk_foreach_struct_const(ext, pAllocateInfo->pNext) {
- switch (ext->sType) {
+ /* VK_STRUCTURE_TYPE_WSI_MEMORY_ALLOCATE_INFO_MESA isn't a real enum
+ * value, so cast to avoid a compiler warning.
+ */
+ switch ((uint32_t)ext->sType) {
case VK_STRUCTURE_TYPE_EXPORT_MEMORY_ALLOCATE_INFO:
- export_info = (void *)ext;
- break;
-
case VK_STRUCTURE_TYPE_IMPORT_ANDROID_HARDWARE_BUFFER_INFO_ANDROID:
- ahw_import_info = (void *)ext;
+ case VK_STRUCTURE_TYPE_IMPORT_MEMORY_HOST_POINTER_INFO_EXT:
+ case VK_STRUCTURE_TYPE_IMPORT_MEMORY_WIN32_HANDLE_INFO_KHR:
+ case VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_FLAGS_INFO:
+ /* handled by vk_device_memory_create */
break;
case VK_STRUCTURE_TYPE_IMPORT_MEMORY_FD_INFO_KHR:
fd_info = (void *)ext;
break;
- case VK_STRUCTURE_TYPE_IMPORT_MEMORY_HOST_POINTER_INFO_EXT:
- host_ptr_info = (void *)ext;
- break;
-
- case VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_FLAGS_INFO: {
- const VkMemoryAllocateFlagsInfo *flags_info = (void *)ext;
- vk_flags = flags_info->flags;
- break;
- }
-
case VK_STRUCTURE_TYPE_MEMORY_DEDICATED_ALLOCATE_INFO:
dedicated_info = (void *)ext;
break;
- case VK_STRUCTURE_TYPE_MEMORY_OPAQUE_CAPTURE_ADDRESS_ALLOCATE_INFO_KHR: {
- const VkMemoryOpaqueCaptureAddressAllocateInfoKHR *addr_info =
- (const VkMemoryOpaqueCaptureAddressAllocateInfoKHR *)ext;
+ case VK_STRUCTURE_TYPE_MEMORY_OPAQUE_CAPTURE_ADDRESS_ALLOCATE_INFO: {
+ const VkMemoryOpaqueCaptureAddressAllocateInfo *addr_info =
+ (const VkMemoryOpaqueCaptureAddressAllocateInfo *)ext;
client_address = addr_info->opaqueCaptureAddress;
break;
}
+ case VK_STRUCTURE_TYPE_WSI_MEMORY_ALLOCATE_INFO_MESA:
+ wsi_info = (void *)ext;
+ break;
+
default:
anv_debug_ignored_stype(ext->sType);
break;
}
}
- /* By default, we want all VkDeviceMemory objects to support CCS */
- if (device->physical->has_implicit_ccs)
- alloc_flags |= ANV_BO_ALLOC_IMPLICIT_CCS;
+ /* If i915 reported mappable/non_mappable vram regions and the
+ * application wants lmem mappable, then we need to use the
+ * I915_GEM_CREATE_EXT_FLAG_NEEDS_CPU_ACCESS flag to create our BO.
+ */
+ if (pdevice->vram_mappable.size > 0 &&
+ pdevice->vram_non_mappable.size > 0 &&
+ (mem_type->propertyFlags & VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT) &&
+ (mem_type->propertyFlags & VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT))
+ alloc_flags |= ANV_BO_ALLOC_LOCAL_MEM_CPU_VISIBLE;
+
+ if (!mem_heap->is_local_mem)
+ alloc_flags |= ANV_BO_ALLOC_NO_LOCAL_MEM;
- if (vk_flags & VK_MEMORY_ALLOCATE_DEVICE_ADDRESS_BIT_KHR)
+ if (mem->vk.alloc_flags & VK_MEMORY_ALLOCATE_DEVICE_ADDRESS_BIT)
alloc_flags |= ANV_BO_ALLOC_CLIENT_VISIBLE_ADDRESS;
- if ((export_info && export_info->handleTypes) ||
- (fd_info && fd_info->handleType) ||
- (host_ptr_info && host_ptr_info->handleType)) {
- /* Anything imported or exported is EXTERNAL */
- alloc_flags |= ANV_BO_ALLOC_EXTERNAL;
+ if (mem_type->propertyFlags & VK_MEMORY_PROPERTY_PROTECTED_BIT)
+ alloc_flags |= ANV_BO_ALLOC_PROTECTED;
- /* We can't have implicit CCS on external memory with an AUX-table.
- * Doing so would require us to sync the aux tables across processes
- * which is impractical.
- */
- if (device->info.has_aux_map)
- alloc_flags &= ~ANV_BO_ALLOC_IMPLICIT_CCS;
- }
+ /* For now, always allocate AUX-TT aligned memory, regardless of dedicated
+ * allocations. An application can, for example, suballocate a large
+ * VkDeviceMemory and try to bind an image created with a CCS modifier. In
+ * that case we cannot disable CCS if the alignment doesn't meet the AUX-TT
+ * requirements, so we need to ensure both the VkDeviceMemory and the
+ * alignment reported through vkGetImageMemoryRequirements() meet the
+ * AUX-TT requirement.
+ *
+ * TODO: when we enable EXT_descriptor_buffer, we'll be able to drop the
+ * AUX-TT alignment for that type of allocation.
+ */
+ if (device->info->has_aux_map)
+ alloc_flags |= ANV_BO_ALLOC_AUX_TT_ALIGNED;
- /* Check if we need to support Android HW buffer export. If so,
- * create AHardwareBuffer and import memory from it.
+ /* If the allocation is neither dedicated nor a host pointer, allocate
+ * additional CCS space.
+ *
+ * TODO: If we ever ship VK_EXT_descriptor_buffer (ahahah... :() we could
+ * drop this flag in the descriptor buffer case as we don't need any
+ * compression there.
+ *
+ * TODO: We could also create new memory types for allocations that don't
+ * need any compression.
*/
- bool android_export = false;
- if (export_info && export_info->handleTypes &
- VK_EXTERNAL_MEMORY_HANDLE_TYPE_ANDROID_HARDWARE_BUFFER_BIT_ANDROID)
- android_export = true;
+ if (device->physical->alloc_aux_tt_mem &&
+ dedicated_info == NULL &&
+ mem->vk.host_ptr == NULL)
+ alloc_flags |= ANV_BO_ALLOC_AUX_CCS;
+
+ /* TODO: Android, ChromeOS and other applications may need another way to
+ * allocate buffers that can be scanned out to a display, but it should be
+ * pretty easy to catch those as the Xe KMD driver will print warnings in
+ * dmesg when scanning out buffers allocated without the proper flag set.
+ */
+ if (wsi_info)
+ alloc_flags |= ANV_BO_ALLOC_SCANOUT;
- if (ahw_import_info) {
- result = anv_import_ahw_memory(_device, mem, ahw_import_info);
- if (result != VK_SUCCESS)
- goto fail;
+ /* Anything imported or exported is EXTERNAL */
+ if (mem->vk.export_handle_types || mem->vk.import_handle_type) {
+ alloc_flags |= ANV_BO_ALLOC_EXTERNAL;
- goto success;
- } else if (android_export) {
- result = anv_create_ahw_memory(_device, mem, pAllocateInfo);
+ /* WSI has its own way of synchronizing with the compositor */
+ if (pdevice->instance->external_memory_implicit_sync &&
+ !wsi_info && dedicated_info &&
+ dedicated_info->image != VK_NULL_HANDLE) {
+ ANV_FROM_HANDLE(anv_image, image, dedicated_info->image);
+
+ /* Apply implicit sync to be compatible with clients relying on
+ * implicit fencing. This matches the behavior in iris i915_batch
+ * submit. An example client is VA-API (iHD), so only the dedicated
+ * image scenario has to be covered.
+ */
+ alloc_flags |= ANV_BO_ALLOC_IMPLICIT_SYNC;
+
+ /* For color attachments, apply IMPLICIT_WRITE so a client on the
+ * consumer side relying on implicit fencing has a fence to wait on
+ * for render completion.
+ */
+ if (image->vk.usage & VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT)
+ alloc_flags |= ANV_BO_ALLOC_IMPLICIT_WRITE;
+ }
+ }
+
+ if (mem_type->descriptor_buffer)
+ alloc_flags |= ANV_BO_ALLOC_DESCRIPTOR_BUFFER_POOL;
+
+ if (mem->vk.ahardware_buffer) {
+ result = anv_import_ahw_memory(_device, mem);
if (result != VK_SUCCESS)
goto fail;
@@ -4121,8 +4446,7 @@ VkResult anv_AllocateMemory(
* this sort of attack but only if it can trust the buffer size.
*/
if (mem->bo->size < aligned_alloc_size) {
- result = vk_errorf(device, &device->vk.base,
- VK_ERROR_INVALID_EXTERNAL_HANDLE,
+ result = vk_errorf(device, VK_ERROR_INVALID_EXTERNAL_HANDLE,
"aligned allocationSize too large for "
"VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT: "
"%"PRIu64"B > %"PRIu64"B",
@@ -4144,34 +4468,39 @@ VkResult anv_AllocateMemory(
goto success;
}
- if (host_ptr_info && host_ptr_info->handleType) {
- if (host_ptr_info->handleType ==
+ if (mem->vk.host_ptr) {
+ if (mem->vk.import_handle_type ==
VK_EXTERNAL_MEMORY_HANDLE_TYPE_HOST_MAPPED_FOREIGN_MEMORY_BIT_EXT) {
- result = vk_error(VK_ERROR_INVALID_EXTERNAL_HANDLE);
+ result = vk_error(device, VK_ERROR_INVALID_EXTERNAL_HANDLE);
goto fail;
}
- assert(host_ptr_info->handleType ==
+ assert(mem->vk.import_handle_type ==
VK_EXTERNAL_MEMORY_HANDLE_TYPE_HOST_ALLOCATION_BIT_EXT);
result = anv_device_import_bo_from_host_ptr(device,
- host_ptr_info->pHostPointer,
- pAllocateInfo->allocationSize,
+ mem->vk.host_ptr,
+ mem->vk.size,
alloc_flags,
client_address,
&mem->bo);
if (result != VK_SUCCESS)
goto fail;
- mem->host_ptr = host_ptr_info->pHostPointer;
goto success;
}
- /* Set ALLOC_LOCAL_MEM flag if heap has device local bit set and requested
- * memory property flag has DEVICE_LOCAL_BIT set.
- */
- if (mem_type->propertyFlags & VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT)
- alloc_flags |= ANV_BO_ALLOC_LOCAL_MEM;
+ if (alloc_flags & (ANV_BO_ALLOC_EXTERNAL | ANV_BO_ALLOC_SCANOUT)) {
+ alloc_flags |= ANV_BO_ALLOC_HOST_COHERENT;
+ } else if (mem_type->propertyFlags & VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT) {
+ if (mem_type->propertyFlags & VK_MEMORY_PROPERTY_HOST_COHERENT_BIT)
+ alloc_flags |= ANV_BO_ALLOC_HOST_COHERENT;
+ if (mem_type->propertyFlags & VK_MEMORY_PROPERTY_HOST_CACHED_BIT)
+ alloc_flags |= ANV_BO_ALLOC_HOST_CACHED;
+ } else {
+ /* Some host mode is required so that a valid PAT index gets set */
+ alloc_flags |= ANV_BO_ALLOC_HOST_COHERENT;
+ }
/* Regular allocate (not importing memory). */
@@ -4187,16 +4516,12 @@ VkResult anv_AllocateMemory(
* the BO. In this case, we have a dedicated allocation.
*/
if (image->vk.wsi_legacy_scanout) {
- const uint32_t i915_tiling =
- isl_tiling_to_i915_tiling(image->planes[0].primary_surface.isl.tiling);
- int ret = anv_gem_set_tiling(device, mem->bo->gem_handle,
- image->planes[0].primary_surface.isl.row_pitch_B,
- i915_tiling);
- if (ret) {
+ const struct isl_surf *surf = &image->planes[0].primary_surface.isl;
+ result = anv_device_set_bo_tiling(device, mem->bo,
+ surf->row_pitch_B,
+ surf->tiling);
+ if (result != VK_SUCCESS) {
anv_device_release_bo(device, mem->bo);
- result = vk_errorf(device, &device->vk.base,
- VK_ERROR_OUT_OF_DEVICE_MEMORY,
- "failed to set BO tiling: %m");
goto fail;
}
}
@@ -4207,8 +4532,7 @@ VkResult anv_AllocateMemory(
if (mem_heap_used > mem_heap->size) {
p_atomic_add(&mem_heap->used, -mem->bo->size);
anv_device_release_bo(device, mem->bo);
- result = vk_errorf(device, &device->vk.base,
- VK_ERROR_OUT_OF_DEVICE_MEMORY,
+ result = vk_errorf(device, VK_ERROR_OUT_OF_DEVICE_MEMORY,
"Out of heap memory");
goto fail;
}
@@ -4217,12 +4541,14 @@ VkResult anv_AllocateMemory(
list_addtail(&mem->link, &device->memory_objects);
pthread_mutex_unlock(&device->mutex);
+ ANV_RMV(heap_create, device, mem, false, 0);
+
*pMem = anv_device_memory_to_handle(mem);
return VK_SUCCESS;
fail:
- vk_object_free(&device->vk, pAllocator, mem);
+ vk_device_memory_destroy(&device->vk, pAllocator, &mem->vk);
return result;
}
@@ -4266,7 +4592,7 @@ VkResult anv_GetMemoryFdPropertiesKHR(
*
* So opaque handle types fall into the default "unsupported" case.
*/
- return vk_error(VK_ERROR_INVALID_EXTERNAL_HANDLE);
+ return vk_error(device, VK_ERROR_INVALID_EXTERNAL_HANDLE);
}
}
@@ -4309,71 +4635,84 @@ void anv_FreeMemory(
list_del(&mem->link);
pthread_mutex_unlock(&device->mutex);
- if (mem->map)
- anv_UnmapMemory(_device, _mem);
+ if (mem->map) {
+ const VkMemoryUnmapInfoKHR unmap = {
+ .sType = VK_STRUCTURE_TYPE_MEMORY_UNMAP_INFO_KHR,
+ .memory = _mem,
+ };
+ anv_UnmapMemory2KHR(_device, &unmap);
+ }
p_atomic_add(&device->physical->memory.heaps[mem->type->heapIndex].used,
-mem->bo->size);
anv_device_release_bo(device, mem->bo);
-#if defined(ANDROID) && ANDROID_API_LEVEL >= 26
- if (mem->ahw)
- AHardwareBuffer_release(mem->ahw);
-#endif
+ ANV_RMV(resource_destroy, device, mem);
- vk_object_free(&device->vk, pAllocator, mem);
+ vk_device_memory_destroy(&device->vk, pAllocator, &mem->vk);
}
-VkResult anv_MapMemory(
+VkResult anv_MapMemory2KHR(
VkDevice _device,
- VkDeviceMemory _memory,
- VkDeviceSize offset,
- VkDeviceSize size,
- VkMemoryMapFlags flags,
+ const VkMemoryMapInfoKHR* pMemoryMapInfo,
void** ppData)
{
ANV_FROM_HANDLE(anv_device, device, _device);
- ANV_FROM_HANDLE(anv_device_memory, mem, _memory);
+ ANV_FROM_HANDLE(anv_device_memory, mem, pMemoryMapInfo->memory);
if (mem == NULL) {
*ppData = NULL;
return VK_SUCCESS;
}
- if (mem->host_ptr) {
- *ppData = mem->host_ptr + offset;
+ if (mem->vk.host_ptr) {
+ *ppData = mem->vk.host_ptr + pMemoryMapInfo->offset;
return VK_SUCCESS;
}
- if (size == VK_WHOLE_SIZE)
- size = mem->bo->size - offset;
-
/* From the Vulkan spec version 1.0.32 docs for MapMemory:
*
- * * If size is not equal to VK_WHOLE_SIZE, size must be greater than 0
- * assert(size != 0);
- * * If size is not equal to VK_WHOLE_SIZE, size must be less than or
- * equal to the size of the memory minus offset
+ * * memory must have been created with a memory type that reports
+ * VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT
*/
- assert(size > 0);
- assert(offset + size <= mem->bo->size);
+ if (!(mem->type->propertyFlags & VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT)) {
+ return vk_errorf(device, VK_ERROR_MEMORY_MAP_FAILED,
+ "Memory object not mappable.");
+ }
- /* FIXME: Is this supposed to be thread safe? Since vkUnmapMemory() only
- * takes a VkDeviceMemory pointer, it seems like only one map of the memory
- * at a time is valid. We could just mmap up front and return an offset
- * pointer here, but that may exhaust virtual memory on 32 bit
- * userspace. */
+ assert(pMemoryMapInfo->size > 0);
+ const VkDeviceSize offset = pMemoryMapInfo->offset;
+ const VkDeviceSize size =
+ vk_device_memory_range(&mem->vk, pMemoryMapInfo->offset,
+ pMemoryMapInfo->size);
- uint32_t gem_flags = 0;
+ if (size != (size_t)size) {
+ return vk_errorf(device, VK_ERROR_MEMORY_MAP_FAILED,
+ "requested size 0x%"PRIx64" does not fit in %u bits",
+ size, (unsigned)(sizeof(size_t) * 8));
+ }
+
+ /* From the Vulkan 1.2.194 spec:
+ *
+ * "memory must not be currently host mapped"
+ */
+ if (mem->map != NULL) {
+ return vk_errorf(device, VK_ERROR_MEMORY_MAP_FAILED,
+ "Memory object already mapped.");
+ }
- if (!device->info.has_llc &&
- (mem->type->propertyFlags & VK_MEMORY_PROPERTY_HOST_COHERENT_BIT))
- gem_flags |= I915_MMAP_WC;
+ void *placed_addr = NULL;
+ if (pMemoryMapInfo->flags & VK_MEMORY_MAP_PLACED_BIT_EXT) {
+ const VkMemoryMapPlacedInfoEXT *placed_info =
+ vk_find_struct_const(pMemoryMapInfo->pNext, MEMORY_MAP_PLACED_INFO_EXT);
+ assert(placed_info != NULL);
+ placed_addr = placed_info->pPlacedAddress;
+ }
/* GEM will fail to map if the offset isn't 4k-aligned. Round down. */
uint64_t map_offset;
- if (!device->physical->has_mmap_offset)
+ if (!device->physical->info.has_mmap_offset)
map_offset = offset & ~4095ull;
else
map_offset = 0;
@@ -4381,53 +4720,43 @@ VkResult anv_MapMemory(
uint64_t map_size = (offset + size) - map_offset;
/* Let's map whole pages */
- map_size = align_u64(map_size, 4096);
+ map_size = align64(map_size, 4096);
- void *map = anv_gem_mmap(device, mem->bo->gem_handle,
- map_offset, map_size, gem_flags);
- if (map == MAP_FAILED)
- return vk_error(VK_ERROR_MEMORY_MAP_FAILED);
+ void *map;
+ VkResult result = anv_device_map_bo(device, mem->bo, map_offset,
+ map_size, placed_addr, &map);
+ if (result != VK_SUCCESS)
+ return result;
mem->map = map;
mem->map_size = map_size;
-
- *ppData = mem->map + (offset - map_offset);
+ mem->map_delta = (offset - map_offset);
+ *ppData = mem->map + mem->map_delta;
return VK_SUCCESS;
}
-void anv_UnmapMemory(
+VkResult anv_UnmapMemory2KHR(
VkDevice _device,
- VkDeviceMemory _memory)
+ const VkMemoryUnmapInfoKHR* pMemoryUnmapInfo)
{
ANV_FROM_HANDLE(anv_device, device, _device);
- ANV_FROM_HANDLE(anv_device_memory, mem, _memory);
+ ANV_FROM_HANDLE(anv_device_memory, mem, pMemoryUnmapInfo->memory);
- if (mem == NULL || mem->host_ptr)
- return;
+ if (mem == NULL || mem->vk.host_ptr)
+ return VK_SUCCESS;
- anv_gem_munmap(device, mem->map, mem->map_size);
+ VkResult result =
+ anv_device_unmap_bo(device, mem->bo, mem->map, mem->map_size,
+ pMemoryUnmapInfo->flags & VK_MEMORY_UNMAP_RESERVE_BIT_EXT);
+ if (result != VK_SUCCESS)
+ return result;
mem->map = NULL;
mem->map_size = 0;
-}
-
-static void
-clflush_mapped_ranges(struct anv_device *device,
- uint32_t count,
- const VkMappedMemoryRange *ranges)
-{
- for (uint32_t i = 0; i < count; i++) {
- ANV_FROM_HANDLE(anv_device_memory, mem, ranges[i].memory);
- if (ranges[i].offset >= mem->map_size)
- continue;
+ mem->map_delta = 0;
- if (mem->type->propertyFlags & VK_MEMORY_PROPERTY_HOST_COHERENT_BIT)
- continue;
-
- intel_clflush_range(mem->map + ranges[i].offset,
- MIN2(ranges[i].size, mem->map_size - ranges[i].offset));
- }
+ return VK_SUCCESS;
}
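
A short application-side sketch, assuming VK_KHR_map_memory2 is enabled, of the entry points this change switches to; the names follow the Vulkan spec, not this patch.

#include <vulkan/vulkan.h>

static void
example_map_unmap(VkDevice device, VkDeviceMemory memory)
{
   void *data = NULL;
   const VkMemoryMapInfoKHR map_info = {
      .sType = VK_STRUCTURE_TYPE_MEMORY_MAP_INFO_KHR,
      .memory = memory,
      .offset = 0,
      .size = VK_WHOLE_SIZE, /* resolved by the driver via vk_device_memory_range() */
   };
   if (vkMapMemory2KHR(device, &map_info, &data) != VK_SUCCESS)
      return;

   const VkMemoryUnmapInfoKHR unmap_info = {
      .sType = VK_STRUCTURE_TYPE_MEMORY_UNMAP_INFO_KHR,
      .memory = memory,
   };
   vkUnmapMemory2KHR(device, &unmap_info);
}
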
VkResult anv_FlushMappedMemoryRanges(
@@ -4435,16 +4764,29 @@ VkResult anv_FlushMappedMemoryRanges(
uint32_t memoryRangeCount,
const VkMappedMemoryRange* pMemoryRanges)
{
+#ifdef SUPPORT_INTEL_INTEGRATED_GPUS
ANV_FROM_HANDLE(anv_device, device, _device);
- if (!device->physical->memory.need_clflush)
+ if (!device->physical->memory.need_flush)
return VK_SUCCESS;
/* Make sure the writes we're flushing have landed. */
__builtin_ia32_mfence();
- clflush_mapped_ranges(device, memoryRangeCount, pMemoryRanges);
+ for (uint32_t i = 0; i < memoryRangeCount; i++) {
+ ANV_FROM_HANDLE(anv_device_memory, mem, pMemoryRanges[i].memory);
+ if (mem->type->propertyFlags & VK_MEMORY_PROPERTY_HOST_COHERENT_BIT)
+ continue;
+
+ uint64_t map_offset = pMemoryRanges[i].offset + mem->map_delta;
+ if (map_offset >= mem->map_size)
+ continue;
+ intel_flush_range(mem->map + map_offset,
+ MIN2(pMemoryRanges[i].size,
+ mem->map_size - map_offset));
+ }
+#endif
return VK_SUCCESS;
}
@@ -4453,73 +4795,32 @@ VkResult anv_InvalidateMappedMemoryRanges(
uint32_t memoryRangeCount,
const VkMappedMemoryRange* pMemoryRanges)
{
+#ifdef SUPPORT_INTEL_INTEGRATED_GPUS
ANV_FROM_HANDLE(anv_device, device, _device);
- if (!device->physical->memory.need_clflush)
+ if (!device->physical->memory.need_flush)
return VK_SUCCESS;
- clflush_mapped_ranges(device, memoryRangeCount, pMemoryRanges);
+ for (uint32_t i = 0; i < memoryRangeCount; i++) {
+ ANV_FROM_HANDLE(anv_device_memory, mem, pMemoryRanges[i].memory);
+ if (mem->type->propertyFlags & VK_MEMORY_PROPERTY_HOST_COHERENT_BIT)
+ continue;
+
+ uint64_t map_offset = pMemoryRanges[i].offset + mem->map_delta;
+ if (map_offset >= mem->map_size)
+ continue;
+
+ intel_invalidate_range(mem->map + map_offset,
+ MIN2(pMemoryRanges[i].size,
+ mem->map_size - map_offset));
+ }
/* Make sure no reads get moved up above the invalidate. */
__builtin_ia32_mfence();
-
+#endif
return VK_SUCCESS;
}
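
A worked example, with illustrative numbers only, of the map_delta bookkeeping that the flush and invalidate paths above now rely on:

/* Illustrative numbers only:
 *
 *   vkMapMemory2KHR(offset = 0x1100)
 *     -> map_offset = 0x1100 & ~4095 = 0x1000   (GEM mapping rounded down)
 *     -> mem->map_delta = 0x1100 - 0x1000 = 0x100
 *
 *   vkFlushMappedMemoryRanges(range.offset = 0x200)
 *     -> flushed at mem->map + 0x200 + 0x100 = mem->map + 0x300
 */
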
-void anv_GetBufferMemoryRequirements2(
- VkDevice _device,
- const VkBufferMemoryRequirementsInfo2* pInfo,
- VkMemoryRequirements2* pMemoryRequirements)
-{
- ANV_FROM_HANDLE(anv_device, device, _device);
- ANV_FROM_HANDLE(anv_buffer, buffer, pInfo->buffer);
-
- /* The Vulkan spec (git aaed022) says:
- *
- * memoryTypeBits is a bitfield and contains one bit set for every
- * supported memory type for the resource. The bit `1<<i` is set if and
- * only if the memory type `i` in the VkPhysicalDeviceMemoryProperties
- * structure for the physical device is supported.
- */
- uint32_t memory_types = (1ull << device->physical->memory.type_count) - 1;
-
- /* Base alignment requirement of a cache line */
- uint32_t alignment = 16;
-
- if (buffer->usage & VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT)
- alignment = MAX2(alignment, ANV_UBO_ALIGNMENT);
-
- pMemoryRequirements->memoryRequirements.size = buffer->size;
- pMemoryRequirements->memoryRequirements.alignment = alignment;
-
- /* Storage and Uniform buffers should have their size aligned to
- * 32-bits to avoid boundary checks when last DWord is not complete.
- * This would ensure that not internal padding would be needed for
- * 16-bit types.
- */
- if (device->robust_buffer_access &&
- (buffer->usage & VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT ||
- buffer->usage & VK_BUFFER_USAGE_STORAGE_BUFFER_BIT))
- pMemoryRequirements->memoryRequirements.size = align_u64(buffer->size, 4);
-
- pMemoryRequirements->memoryRequirements.memoryTypeBits = memory_types;
-
- vk_foreach_struct(ext, pMemoryRequirements->pNext) {
- switch (ext->sType) {
- case VK_STRUCTURE_TYPE_MEMORY_DEDICATED_REQUIREMENTS: {
- VkMemoryDedicatedRequirements *requirements = (void *)ext;
- requirements->prefersDedicatedAllocation = false;
- requirements->requiresDedicatedAllocation = false;
- break;
- }
-
- default:
- anv_debug_ignored_stype(ext->sType);
- break;
- }
- }
-}
-
void anv_GetDeviceMemoryCommitment(
VkDevice device,
VkDeviceMemory memory,
@@ -4529,16 +4830,21 @@ void anv_GetDeviceMemoryCommitment(
}
static void
-anv_bind_buffer_memory(const VkBindBufferMemoryInfo *pBindInfo)
+anv_bind_buffer_memory(struct anv_device *device,
+ const VkBindBufferMemoryInfo *pBindInfo)
{
ANV_FROM_HANDLE(anv_device_memory, mem, pBindInfo->memory);
ANV_FROM_HANDLE(anv_buffer, buffer, pBindInfo->buffer);
assert(pBindInfo->sType == VK_STRUCTURE_TYPE_BIND_BUFFER_MEMORY_INFO);
+ assert(!anv_buffer_is_sparse(buffer));
+
+ const VkBindMemoryStatusKHR *bind_status =
+ vk_find_struct_const(pBindInfo->pNext, BIND_MEMORY_STATUS_KHR);
if (mem) {
- assert(pBindInfo->memoryOffset < mem->bo->size);
- assert(mem->bo->size - pBindInfo->memoryOffset >= buffer->size);
+ assert(pBindInfo->memoryOffset < mem->vk.size);
+ assert(mem->vk.size - pBindInfo->memoryOffset >= buffer->vk.size);
buffer->address = (struct anv_address) {
.bo = mem->bo,
.offset = pBindInfo->memoryOffset,
@@ -4546,32 +4852,26 @@ anv_bind_buffer_memory(const VkBindBufferMemoryInfo *pBindInfo)
} else {
buffer->address = ANV_NULL_ADDRESS;
}
+
+ ANV_RMV(buffer_bind, device, buffer);
+
+ if (bind_status)
+ *bind_status->pResult = VK_SUCCESS;
}
VkResult anv_BindBufferMemory2(
- VkDevice device,
+ VkDevice _device,
uint32_t bindInfoCount,
const VkBindBufferMemoryInfo* pBindInfos)
{
+ ANV_FROM_HANDLE(anv_device, device, _device);
+
for (uint32_t i = 0; i < bindInfoCount; i++)
- anv_bind_buffer_memory(&pBindInfos[i]);
+ anv_bind_buffer_memory(device, &pBindInfos[i]);
return VK_SUCCESS;
}
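
A hypothetical application-side sketch (assuming VK_KHR_maintenance6 and <vulkan/vulkan.h>) of the VkBindMemoryStatusKHR handling added above:

#include <vulkan/vulkan.h>

static VkResult
example_bind_with_status(VkDevice device, VkBuffer buffer, VkDeviceMemory memory)
{
   VkResult bind_status = VK_ERROR_UNKNOWN;
   const VkBindMemoryStatusKHR status = {
      .sType = VK_STRUCTURE_TYPE_BIND_MEMORY_STATUS_KHR,
      .pResult = &bind_status,
   };
   const VkBindBufferMemoryInfo bind = {
      .sType = VK_STRUCTURE_TYPE_BIND_BUFFER_MEMORY_INFO,
      .pNext = &status,
      .buffer = buffer,
      .memory = memory,
      .memoryOffset = 0,
   };
   VkResult res = vkBindBufferMemory2(device, 1, &bind);
   /* anv writes *pResult = VK_SUCCESS per bind; the call itself can still fail. */
   return res == VK_SUCCESS ? bind_status : res;
}
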
-VkResult anv_QueueBindSparse(
- VkQueue _queue,
- uint32_t bindInfoCount,
- const VkBindSparseInfo* pBindInfo,
- VkFence fence)
-{
- ANV_FROM_HANDLE(anv_queue, queue, _queue);
- if (anv_device_is_lost(queue->device))
- return VK_ERROR_DEVICE_LOST;
-
- return vk_error(VK_ERROR_FEATURE_NOT_PRESENT);
-}
-
// Event functions
VkResult anv_CreateEvent(
@@ -4588,12 +4888,14 @@ VkResult anv_CreateEvent(
event = vk_object_alloc(&device->vk, pAllocator, sizeof(*event),
VK_OBJECT_TYPE_EVENT);
if (event == NULL)
- return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+ return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
event->state = anv_state_pool_alloc(&device->dynamic_state_pool,
sizeof(uint64_t), 8);
*(uint64_t *)event->state.map = VK_EVENT_RESET;
+ ANV_RMV(event_create, device, event, pCreateInfo->flags, false);
+
*pEvent = anv_event_to_handle(event);
return VK_SUCCESS;
@@ -4610,6 +4912,8 @@ void anv_DestroyEvent(
if (!event)
return;
+ ANV_RMV(resource_destroy, device, event);
+
anv_state_pool_free(&device->dynamic_state_pool, event->state);
vk_object_free(&device->vk, pAllocator, event);
@@ -4622,7 +4926,7 @@ VkResult anv_GetEventStatus(
ANV_FROM_HANDLE(anv_device, device, _device);
ANV_FROM_HANDLE(anv_event, event, _event);
- if (anv_device_is_lost(device))
+ if (vk_device_is_lost(&device->vk))
return VK_ERROR_DEVICE_LOST;
return *(uint64_t *)event->state.map;
@@ -4652,6 +4956,105 @@ VkResult anv_ResetEvent(
// Buffer functions
+static void
+anv_get_buffer_memory_requirements(struct anv_device *device,
+ VkBufferCreateFlags flags,
+ VkDeviceSize size,
+ VkBufferUsageFlags usage,
+ bool is_sparse,
+ VkMemoryRequirements2* pMemoryRequirements)
+{
+ /* The Vulkan spec (git aaed022) says:
+ *
+ * memoryTypeBits is a bitfield and contains one bit set for every
+ * supported memory type for the resource. The bit `1<<i` is set if and
+ * only if the memory type `i` in the VkPhysicalDeviceMemoryProperties
+ * structure for the physical device is supported.
+ *
+ * We have special memory types for descriptor buffers.
+ */
+ uint32_t memory_types =
+ (flags & VK_BUFFER_CREATE_PROTECTED_BIT) ?
+ device->physical->memory.protected_mem_types :
+ ((usage & (VK_BUFFER_USAGE_RESOURCE_DESCRIPTOR_BUFFER_BIT_EXT |
+ VK_BUFFER_USAGE_SAMPLER_DESCRIPTOR_BUFFER_BIT_EXT)) ?
+ device->physical->memory.desc_buffer_mem_types :
+ device->physical->memory.default_buffer_mem_types);
+
+ /* The GPU appears to write back to main memory in cachelines. Writes to
+ * one buffer should not clobber writes to another buffer, so make sure
+ * they land in different cachelines.
+ */
+ uint32_t alignment = 64;
+
+ /* From the spec, section "Sparse Buffer and Fully-Resident Image Block
+ * Size":
+ * "The sparse block size in bytes for sparse buffers and fully-resident
+ * images is reported as VkMemoryRequirements::alignment. alignment
+ * represents both the memory alignment requirement and the binding
+ * granularity (in bytes) for sparse resources."
+ */
+ if (is_sparse) {
+ alignment = ANV_SPARSE_BLOCK_SIZE;
+ size = align64(size, alignment);
+ }
+
+ pMemoryRequirements->memoryRequirements.size = size;
+ pMemoryRequirements->memoryRequirements.alignment = alignment;
+
+ /* Storage and Uniform buffers should have their size aligned to
+ * 32 bits to avoid boundary checks when the last DWord is not complete.
+ * This ensures that no internal padding is needed for 16-bit types.
+ */
+ if (device->robust_buffer_access &&
+ (usage & VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT ||
+ usage & VK_BUFFER_USAGE_STORAGE_BUFFER_BIT))
+ pMemoryRequirements->memoryRequirements.size = align64(size, 4);
+
+ pMemoryRequirements->memoryRequirements.memoryTypeBits = memory_types;
+
+ vk_foreach_struct(ext, pMemoryRequirements->pNext) {
+ switch (ext->sType) {
+ case VK_STRUCTURE_TYPE_MEMORY_DEDICATED_REQUIREMENTS: {
+ VkMemoryDedicatedRequirements *requirements = (void *)ext;
+ requirements->prefersDedicatedAllocation = false;
+ requirements->requiresDedicatedAllocation = false;
+ break;
+ }
+
+ default:
+ anv_debug_ignored_stype(ext->sType);
+ break;
+ }
+ }
+}
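
A worked example of the sparse branch above, assuming ANV_SPARSE_BLOCK_SIZE is the usual 64 KiB sparse granularity (illustrative only):

/* Illustrative only, assuming ANV_SPARSE_BLOCK_SIZE == 64 * 1024:
 *
 *   requested size     = 100 KiB = 102400 B
 *   reported alignment = 65536 B                           (also the binding granularity)
 *   reported size      = align64(102400, 65536) = 131072 B (two sparse blocks)
 */
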
+
+void anv_GetDeviceBufferMemoryRequirements(
+ VkDevice _device,
+ const VkDeviceBufferMemoryRequirements* pInfo,
+ VkMemoryRequirements2* pMemoryRequirements)
+{
+ ANV_FROM_HANDLE(anv_device, device, _device);
+ const bool is_sparse =
+ pInfo->pCreateInfo->flags & VK_BUFFER_CREATE_SPARSE_BINDING_BIT;
+
+ if ((device->physical->sparse_type == ANV_SPARSE_TYPE_NOT_SUPPORTED) &&
+ INTEL_DEBUG(DEBUG_SPARSE) &&
+ pInfo->pCreateInfo->flags & (VK_BUFFER_CREATE_SPARSE_BINDING_BIT |
+ VK_BUFFER_CREATE_SPARSE_RESIDENCY_BIT |
+ VK_BUFFER_CREATE_SPARSE_ALIASED_BIT))
+ fprintf(stderr, "=== %s %s:%d flags:0x%08x\n", __func__, __FILE__,
+ __LINE__, pInfo->pCreateInfo->flags);
+
+ anv_get_buffer_memory_requirements(device,
+ pInfo->pCreateInfo->flags,
+ pInfo->pCreateInfo->size,
+ pInfo->pCreateInfo->usage,
+ is_sparse,
+ pMemoryRequirements);
+}
+
VkResult anv_CreateBuffer(
VkDevice _device,
const VkBufferCreateInfo* pCreateInfo,
@@ -4661,25 +5064,62 @@ VkResult anv_CreateBuffer(
ANV_FROM_HANDLE(anv_device, device, _device);
struct anv_buffer *buffer;
+ if ((device->physical->sparse_type == ANV_SPARSE_TYPE_NOT_SUPPORTED) &&
+ INTEL_DEBUG(DEBUG_SPARSE) &&
+ pCreateInfo->flags & (VK_BUFFER_CREATE_SPARSE_BINDING_BIT |
+ VK_BUFFER_CREATE_SPARSE_RESIDENCY_BIT |
+ VK_BUFFER_CREATE_SPARSE_ALIASED_BIT))
+ fprintf(stderr, "=== %s %s:%d flags:0x%08x\n", __func__, __FILE__,
+ __LINE__, pCreateInfo->flags);
+
/* Don't allow creating buffers bigger than our address space. The real
* issue here is that we may align up the buffer size and we don't want
* doing so to cause roll-over. However, no one has any business
* allocating a buffer larger than our GTT size.
*/
if (pCreateInfo->size > device->physical->gtt_size)
- return vk_error(VK_ERROR_OUT_OF_DEVICE_MEMORY);
+ return vk_error(device, VK_ERROR_OUT_OF_DEVICE_MEMORY);
- assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO);
-
- buffer = vk_object_alloc(&device->vk, pAllocator, sizeof(*buffer),
- VK_OBJECT_TYPE_BUFFER);
+ buffer = vk_buffer_create(&device->vk, pCreateInfo,
+ pAllocator, sizeof(*buffer));
if (buffer == NULL)
- return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+ return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
- buffer->create_flags = pCreateInfo->flags;
- buffer->size = pCreateInfo->size;
- buffer->usage = pCreateInfo->usage;
buffer->address = ANV_NULL_ADDRESS;
+ if (anv_buffer_is_sparse(buffer)) {
+ enum anv_bo_alloc_flags alloc_flags = 0;
+ uint64_t client_address = 0;
+
+ if (buffer->vk.create_flags & VK_BUFFER_CREATE_DEVICE_ADDRESS_CAPTURE_REPLAY_BIT) {
+ alloc_flags = ANV_BO_ALLOC_CLIENT_VISIBLE_ADDRESS;
+ const VkBufferOpaqueCaptureAddressCreateInfo *opaque_addr_info =
+ vk_find_struct_const(pCreateInfo->pNext,
+ BUFFER_OPAQUE_CAPTURE_ADDRESS_CREATE_INFO);
+ if (opaque_addr_info)
+ client_address = opaque_addr_info->opaqueCaptureAddress;
+ }
+
+ if (buffer->vk.create_flags & VK_BUFFER_CREATE_DESCRIPTOR_BUFFER_CAPTURE_REPLAY_BIT_EXT) {
+ alloc_flags = ANV_BO_ALLOC_CLIENT_VISIBLE_ADDRESS;
+
+ const VkOpaqueCaptureDescriptorDataCreateInfoEXT *opaque_info =
+ vk_find_struct_const(pCreateInfo->pNext,
+ OPAQUE_CAPTURE_DESCRIPTOR_DATA_CREATE_INFO_EXT);
+ if (opaque_info)
+ client_address = *((const uint64_t *)opaque_info->opaqueCaptureDescriptorData);
+ }
+
+ VkResult result = anv_init_sparse_bindings(device, buffer->vk.size,
+ &buffer->sparse_data,
+ alloc_flags, client_address,
+ &buffer->address);
+ if (result != VK_SUCCESS) {
+ vk_buffer_destroy(&device->vk, pAllocator, &buffer->vk);
+ return result;
+ }
+ }
+
+ ANV_RMV(buffer_create, device, false, buffer);
*pBuffer = anv_buffer_to_handle(buffer);
@@ -4697,57 +5137,98 @@ void anv_DestroyBuffer(
if (!buffer)
return;
- vk_object_free(&device->vk, pAllocator, buffer);
+ ANV_RMV(buffer_destroy, device, buffer);
+
+ if (anv_buffer_is_sparse(buffer)) {
+ assert(buffer->address.offset == buffer->sparse_data.address);
+ anv_free_sparse_bindings(device, &buffer->sparse_data);
+ }
+
+ vk_buffer_destroy(&device->vk, pAllocator, &buffer->vk);
}
VkDeviceAddress anv_GetBufferDeviceAddress(
VkDevice device,
- const VkBufferDeviceAddressInfoKHR* pInfo)
+ const VkBufferDeviceAddressInfo* pInfo)
{
ANV_FROM_HANDLE(anv_buffer, buffer, pInfo->buffer);
assert(!anv_address_is_null(buffer->address));
- assert(buffer->address.bo->flags & EXEC_OBJECT_PINNED);
return anv_address_physical(buffer->address);
}
uint64_t anv_GetBufferOpaqueCaptureAddress(
VkDevice device,
- const VkBufferDeviceAddressInfoKHR* pInfo)
+ const VkBufferDeviceAddressInfo* pInfo)
{
- return 0;
+ ANV_FROM_HANDLE(anv_buffer, buffer, pInfo->buffer);
+
+ return anv_address_physical(buffer->address);
+}
+
+VkResult anv_GetBufferOpaqueCaptureDescriptorDataEXT(
+ VkDevice device,
+ const VkBufferCaptureDescriptorDataInfoEXT* pInfo,
+ void* pData)
+{
+ ANV_FROM_HANDLE(anv_buffer, buffer, pInfo->buffer);
+
+ *((uint64_t *)pData) = anv_address_physical(buffer->address);
+
+ return VK_SUCCESS;
}
uint64_t anv_GetDeviceMemoryOpaqueCaptureAddress(
VkDevice device,
- const VkDeviceMemoryOpaqueCaptureAddressInfoKHR* pInfo)
+ const VkDeviceMemoryOpaqueCaptureAddressInfo* pInfo)
{
ANV_FROM_HANDLE(anv_device_memory, memory, pInfo->memory);
- assert(memory->bo->flags & EXEC_OBJECT_PINNED);
- assert(memory->bo->has_client_visible_address);
+ assert(memory->bo->alloc_flags & ANV_BO_ALLOC_CLIENT_VISIBLE_ADDRESS);
return intel_48b_address(memory->bo->offset);
}
void
-anv_fill_buffer_surface_state(struct anv_device *device, struct anv_state state,
+anv_fill_buffer_surface_state(struct anv_device *device,
+ void *surface_state_ptr,
enum isl_format format,
+ struct isl_swizzle swizzle,
isl_surf_usage_flags_t usage,
struct anv_address address,
uint32_t range, uint32_t stride)
{
- isl_buffer_fill_state(&device->isl_dev, state.map,
+ isl_buffer_fill_state(&device->isl_dev, surface_state_ptr,
.address = anv_address_physical(address),
.mocs = isl_mocs(&device->isl_dev, usage,
- address.bo && address.bo->is_external),
+ address.bo && anv_bo_is_external(address.bo)),
.size_B = range,
.format = format,
- .swizzle = ISL_SWIZZLE_IDENTITY,
+ .swizzle = swizzle,
.stride_B = stride);
}
+VkResult anv_GetSamplerOpaqueCaptureDescriptorDataEXT(
+ VkDevice _device,
+ const VkSamplerCaptureDescriptorDataInfoEXT* pInfo,
+ void* pData)
+{
+ ANV_FROM_HANDLE(anv_device, device, _device);
+ ANV_FROM_HANDLE(anv_sampler, sampler, pInfo->sampler);
+
+ if (sampler->custom_border_color_db.alloc_size != 0) {
+ *((uint32_t *)pData) =
+ anv_state_reserved_array_pool_state_index(
+ &device->custom_border_colors_db,
+ sampler->custom_border_color_db);
+ } else {
+ *((uint32_t *)pData) = 0;
+ }
+
+ return VK_SUCCESS;
+}
+
void anv_DestroySampler(
VkDevice _device,
VkSampler _sampler,
@@ -4768,85 +5249,32 @@ void anv_DestroySampler(
anv_state_reserved_pool_free(&device->custom_border_colors,
sampler->custom_border_color);
}
-
- vk_object_free(&device->vk, pAllocator, sampler);
-}
-
-VkResult anv_CreateFramebuffer(
- VkDevice _device,
- const VkFramebufferCreateInfo* pCreateInfo,
- const VkAllocationCallbacks* pAllocator,
- VkFramebuffer* pFramebuffer)
-{
- ANV_FROM_HANDLE(anv_device, device, _device);
- struct anv_framebuffer *framebuffer;
-
- assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_FRAMEBUFFER_CREATE_INFO);
-
- size_t size = sizeof(*framebuffer);
-
- /* VK_KHR_imageless_framebuffer extension says:
- *
- * If flags includes VK_FRAMEBUFFER_CREATE_IMAGELESS_BIT_KHR,
- * parameter pAttachments is ignored.
- */
- if (!(pCreateInfo->flags & VK_FRAMEBUFFER_CREATE_IMAGELESS_BIT_KHR))
- size += sizeof(struct anv_image_view *) * pCreateInfo->attachmentCount;
-
- framebuffer = vk_object_alloc(&device->vk, pAllocator, size,
- VK_OBJECT_TYPE_FRAMEBUFFER);
- if (framebuffer == NULL)
- return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
-
- framebuffer->width = pCreateInfo->width;
- framebuffer->height = pCreateInfo->height;
- framebuffer->layers = pCreateInfo->layers;
-
- if (!(pCreateInfo->flags & VK_FRAMEBUFFER_CREATE_IMAGELESS_BIT_KHR)) {
- for (uint32_t i = 0; i < pCreateInfo->attachmentCount; i++) {
- ANV_FROM_HANDLE(anv_image_view, iview, pCreateInfo->pAttachments[i]);
- framebuffer->attachments[i] = iview;
- }
- framebuffer->attachment_count = pCreateInfo->attachmentCount;
+ if (sampler->custom_border_color_db.map) {
+ anv_state_reserved_array_pool_free(&device->custom_border_colors_db,
+ sampler->custom_border_color_db);
}
- *pFramebuffer = anv_framebuffer_to_handle(framebuffer);
-
- return VK_SUCCESS;
-}
-
-void anv_DestroyFramebuffer(
- VkDevice _device,
- VkFramebuffer _fb,
- const VkAllocationCallbacks* pAllocator)
-{
- ANV_FROM_HANDLE(anv_device, device, _device);
- ANV_FROM_HANDLE(anv_framebuffer, fb, _fb);
-
- if (!fb)
- return;
-
- vk_object_free(&device->vk, pAllocator, fb);
+ vk_sampler_destroy(&device->vk, pAllocator, &sampler->vk);
}
-static const VkTimeDomainEXT anv_time_domains[] = {
- VK_TIME_DOMAIN_DEVICE_EXT,
- VK_TIME_DOMAIN_CLOCK_MONOTONIC_EXT,
+static const VkTimeDomainKHR anv_time_domains[] = {
+ VK_TIME_DOMAIN_DEVICE_KHR,
+ VK_TIME_DOMAIN_CLOCK_MONOTONIC_KHR,
#ifdef CLOCK_MONOTONIC_RAW
- VK_TIME_DOMAIN_CLOCK_MONOTONIC_RAW_EXT,
+ VK_TIME_DOMAIN_CLOCK_MONOTONIC_RAW_KHR,
#endif
};
-VkResult anv_GetPhysicalDeviceCalibrateableTimeDomainsEXT(
+VkResult anv_GetPhysicalDeviceCalibrateableTimeDomainsKHR(
VkPhysicalDevice physicalDevice,
uint32_t *pTimeDomainCount,
- VkTimeDomainEXT *pTimeDomains)
+ VkTimeDomainKHR *pTimeDomains)
{
int d;
- VK_OUTARRAY_MAKE(out, pTimeDomains, pTimeDomainCount);
+ VK_OUTARRAY_MAKE_TYPED(VkTimeDomainKHR, out, pTimeDomains, pTimeDomainCount);
for (d = 0; d < ARRAY_SIZE(anv_time_domains); d++) {
- vk_outarray_append(&out, i) {
+ vk_outarray_append_typed(VkTimeDomainKHR, &out, i) {
*i = anv_time_domains[d];
}
}
@@ -4854,63 +5282,146 @@ VkResult anv_GetPhysicalDeviceCalibrateableTimeDomainsEXT(
return vk_outarray_status(&out);
}
-static uint64_t
-anv_clock_gettime(clockid_t clock_id)
+static inline clockid_t
+anv_get_default_cpu_clock_id(void)
{
- struct timespec current;
- int ret;
+#ifdef CLOCK_MONOTONIC_RAW
+ return CLOCK_MONOTONIC_RAW;
+#else
+ return CLOCK_MONOTONIC;
+#endif
+}
- ret = clock_gettime(clock_id, &current);
+static inline clockid_t
+vk_time_domain_to_clockid(VkTimeDomainKHR domain)
+{
+ switch (domain) {
#ifdef CLOCK_MONOTONIC_RAW
- if (ret < 0 && clock_id == CLOCK_MONOTONIC_RAW)
- ret = clock_gettime(CLOCK_MONOTONIC, &current);
+ case VK_TIME_DOMAIN_CLOCK_MONOTONIC_RAW_KHR:
+ return CLOCK_MONOTONIC_RAW;
#endif
- if (ret < 0)
- return 0;
+ case VK_TIME_DOMAIN_CLOCK_MONOTONIC_KHR:
+ return CLOCK_MONOTONIC;
+ default:
+ unreachable("Missing");
+ return CLOCK_MONOTONIC;
+ }
+}
- return (uint64_t) current.tv_sec * 1000000000ULL + current.tv_nsec;
+static inline bool
+is_cpu_time_domain(VkTimeDomainKHR domain)
+{
+ return domain == VK_TIME_DOMAIN_CLOCK_MONOTONIC_KHR ||
+ domain == VK_TIME_DOMAIN_CLOCK_MONOTONIC_RAW_KHR;
}
-VkResult anv_GetCalibratedTimestampsEXT(
+static inline bool
+is_gpu_time_domain(VkTimeDomainKHR domain)
+{
+ return domain == VK_TIME_DOMAIN_DEVICE_KHR;
+}
+
+VkResult anv_GetCalibratedTimestampsKHR(
VkDevice _device,
uint32_t timestampCount,
- const VkCalibratedTimestampInfoEXT *pTimestampInfos,
+ const VkCalibratedTimestampInfoKHR *pTimestampInfos,
uint64_t *pTimestamps,
uint64_t *pMaxDeviation)
{
ANV_FROM_HANDLE(anv_device, device, _device);
- uint64_t timestamp_frequency = device->info.timestamp_frequency;
- int ret;
- int d;
+ const uint64_t timestamp_frequency = device->info->timestamp_frequency;
+ const uint64_t device_period = DIV_ROUND_UP(1000000000, timestamp_frequency);
+ uint32_t d, increment;
uint64_t begin, end;
uint64_t max_clock_period = 0;
+ const enum intel_kmd_type kmd_type = device->physical->info.kmd_type;
+ const bool has_correlate_timestamp = kmd_type == INTEL_KMD_TYPE_XE;
+ clockid_t cpu_clock_id = -1;
+
+ begin = end = vk_clock_gettime(anv_get_default_cpu_clock_id());
+
+ for (d = 0, increment = 1; d < timestampCount; d += increment) {
+ const VkTimeDomainKHR current = pTimestampInfos[d].timeDomain;
+ /* If we have a request pattern like this:
+ * - domain0 = VK_TIME_DOMAIN_CLOCK_MONOTONIC_KHR or VK_TIME_DOMAIN_CLOCK_MONOTONIC_RAW_KHR
+ * - domain1 = VK_TIME_DOMAIN_DEVICE_KHR
+ * - domain2 = domain0 (optional)
+ *
+ * We can combine all of those into a single ioctl for maximum accuracy.
+ */
+ if (has_correlate_timestamp && (d + 1) < timestampCount) {
+ const VkTimeDomainKHR next = pTimestampInfos[d + 1].timeDomain;
+
+ if ((is_cpu_time_domain(current) && is_gpu_time_domain(next)) ||
+ (is_gpu_time_domain(current) && is_cpu_time_domain(next))) {
+ /* We'll consume at least 2 elements. */
+ increment = 2;
+
+ if (is_cpu_time_domain(current))
+ cpu_clock_id = vk_time_domain_to_clockid(current);
+ else
+ cpu_clock_id = vk_time_domain_to_clockid(next);
+
+ uint64_t cpu_timestamp, gpu_timestamp, cpu_delta_timestamp, cpu_end_timestamp;
+ if (!intel_gem_read_correlate_cpu_gpu_timestamp(device->fd,
+ kmd_type,
+ INTEL_ENGINE_CLASS_RENDER,
+ 0 /* engine_instance */,
+ cpu_clock_id,
+ &cpu_timestamp,
+ &gpu_timestamp,
+ &cpu_delta_timestamp))
+ return vk_device_set_lost(&device->vk, "Failed to read correlate timestamp %m");
+
+ cpu_end_timestamp = cpu_timestamp + cpu_delta_timestamp;
+ if (is_cpu_time_domain(current)) {
+ pTimestamps[d] = cpu_timestamp;
+ pTimestamps[d + 1] = gpu_timestamp;
+ } else {
+ pTimestamps[d] = gpu_timestamp;
+ pTimestamps[d + 1] = cpu_end_timestamp;
+ }
+ max_clock_period = MAX2(max_clock_period, device_period);
+
+ /* If we can consume a third element */
+ if ((d + 2) < timestampCount &&
+ is_cpu_time_domain(current) &&
+ current == pTimestampInfos[d + 2].timeDomain) {
+ pTimestamps[d + 2] = cpu_end_timestamp;
+ increment++;
+ }
+
+ /* If we're the first element, we can replace begin */
+ if (d == 0 && cpu_clock_id == anv_get_default_cpu_clock_id())
+ begin = cpu_timestamp;
+
+ /* If we're in the same clock domain as begin/end, we can set the end. */
+ if (cpu_clock_id == anv_get_default_cpu_clock_id())
+ end = cpu_end_timestamp;
-#ifdef CLOCK_MONOTONIC_RAW
- begin = anv_clock_gettime(CLOCK_MONOTONIC_RAW);
-#else
- begin = anv_clock_gettime(CLOCK_MONOTONIC);
-#endif
-
- for (d = 0; d < timestampCount; d++) {
- switch (pTimestampInfos[d].timeDomain) {
- case VK_TIME_DOMAIN_DEVICE_EXT:
- ret = anv_gem_reg_read(device->fd, TIMESTAMP | I915_REG_READ_8B_WA,
- &pTimestamps[d]);
+ continue;
+ }
+ }
- if (ret != 0) {
- return anv_device_set_lost(device, "Failed to read the TIMESTAMP "
- "register: %m");
+ /* Fall back to the regular method */
+ increment = 1;
+ switch (current) {
+ case VK_TIME_DOMAIN_DEVICE_KHR:
+ if (!intel_gem_read_render_timestamp(device->fd,
+ device->info->kmd_type,
+ &pTimestamps[d])) {
+ return vk_device_set_lost(&device->vk, "Failed to read the "
+ "TIMESTAMP register: %m");
}
- uint64_t device_period = DIV_ROUND_UP(1000000000, timestamp_frequency);
max_clock_period = MAX2(max_clock_period, device_period);
break;
- case VK_TIME_DOMAIN_CLOCK_MONOTONIC_EXT:
- pTimestamps[d] = anv_clock_gettime(CLOCK_MONOTONIC);
+ case VK_TIME_DOMAIN_CLOCK_MONOTONIC_KHR:
+ pTimestamps[d] = vk_clock_gettime(CLOCK_MONOTONIC);
max_clock_period = MAX2(max_clock_period, 1);
break;
#ifdef CLOCK_MONOTONIC_RAW
- case VK_TIME_DOMAIN_CLOCK_MONOTONIC_RAW_EXT:
+ case VK_TIME_DOMAIN_CLOCK_MONOTONIC_RAW_KHR:
pTimestamps[d] = begin;
break;
#endif
@@ -4920,50 +5431,13 @@ VkResult anv_GetCalibratedTimestampsEXT(
}
}
-#ifdef CLOCK_MONOTONIC_RAW
- end = anv_clock_gettime(CLOCK_MONOTONIC_RAW);
-#else
- end = anv_clock_gettime(CLOCK_MONOTONIC);
-#endif
+ /* If the last timestamp was not read with the correlate-timestamp method, or
+ * if it was but the last CPU clock is not the default one, read the time again.
+ */
+ if (increment == 1 || cpu_clock_id != anv_get_default_cpu_clock_id())
+ end = vk_clock_gettime(anv_get_default_cpu_clock_id());
- /*
- * The maximum deviation is the sum of the interval over which we
- * perform the sampling and the maximum period of any sampled
- * clock. That's because the maximum skew between any two sampled
- * clock edges is when the sampled clock with the largest period is
- * sampled at the end of that period but right at the beginning of the
- * sampling interval and some other clock is sampled right at the
- * begining of its sampling period and right at the end of the
- * sampling interval. Let's assume the GPU has the longest clock
- * period and that the application is sampling GPU and monotonic:
- *
- * s e
- * w x y z 0 1 2 3 4 5 6 7 8 9 a b c d e f
- * Raw -_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-
- *
- * g
- * 0 1 2 3
- * GPU -----_____-----_____-----_____-----_____
- *
- * m
- * x y z 0 1 2 3 4 5 6 7 8 9 a b c
- * Monotonic -_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-
- *
- * Interval <----------------->
- * Deviation <-------------------------->
- *
- * s = read(raw) 2
- * g = read(GPU) 1
- * m = read(monotonic) 2
- * e = read(raw) b
- *
- * We round the sample interval up by one tick to cover sampling error
- * in the interval clock
- */
-
- uint64_t sample_interval = end - begin + 1;
-
- *pMaxDeviation = sample_interval + max_clock_period;
+ *pMaxDeviation = vk_time_max_deviation(begin, end, max_clock_period);
return VK_SUCCESS;
}
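
A hypothetical application-side request pattern (assuming VK_KHR_calibrated_timestamps and <vulkan/vulkan.h>) that the combined-ioctl path above is optimized for:

#include <vulkan/vulkan.h>

static void
example_calibrated_timestamps(VkDevice device)
{
   const VkCalibratedTimestampInfoKHR infos[3] = {
      { .sType = VK_STRUCTURE_TYPE_CALIBRATED_TIMESTAMP_INFO_KHR,
        .timeDomain = VK_TIME_DOMAIN_CLOCK_MONOTONIC_KHR },
      { .sType = VK_STRUCTURE_TYPE_CALIBRATED_TIMESTAMP_INFO_KHR,
        .timeDomain = VK_TIME_DOMAIN_DEVICE_KHR },
      { .sType = VK_STRUCTURE_TYPE_CALIBRATED_TIMESTAMP_INFO_KHR,
        .timeDomain = VK_TIME_DOMAIN_CLOCK_MONOTONIC_KHR },
   };
   uint64_t timestamps[3], max_deviation;

   /* On Xe kernels, all three entries are served from a single correlate ioctl. */
   vkGetCalibratedTimestampsKHR(device, 3, infos, timestamps, &max_deviation);
}
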
@@ -4992,86 +5466,198 @@ void anv_GetPhysicalDeviceMultisamplePropertiesEXT(
anv_debug_ignored_stype(ext->sType);
}
-/* vk_icd.h does not declare this function, so we declare it here to
- * suppress Wmissing-prototypes.
- */
-PUBLIC VKAPI_ATTR VkResult VKAPI_CALL
-vk_icdNegotiateLoaderICDInterfaceVersion(uint32_t* pSupportedVersion);
-
-PUBLIC VKAPI_ATTR VkResult VKAPI_CALL
-vk_icdNegotiateLoaderICDInterfaceVersion(uint32_t* pSupportedVersion)
-{
- /* For the full details on loader interface versioning, see
- * <https://github.com/KhronosGroup/Vulkan-LoaderAndValidationLayers/blob/master/loader/LoaderAndLayerInterface.md>.
- * What follows is a condensed summary, to help you navigate the large and
- * confusing official doc.
- *
- * - Loader interface v0 is incompatible with later versions. We don't
- * support it.
- *
- * - In loader interface v1:
- * - The first ICD entrypoint called by the loader is
- * vk_icdGetInstanceProcAddr(). The ICD must statically expose this
- * entrypoint.
- * - The ICD must statically expose no other Vulkan symbol unless it is
- * linked with -Bsymbolic.
- * - Each dispatchable Vulkan handle created by the ICD must be
- * a pointer to a struct whose first member is VK_LOADER_DATA. The
- * ICD must initialize VK_LOADER_DATA.loadMagic to ICD_LOADER_MAGIC.
- * - The loader implements vkCreate{PLATFORM}SurfaceKHR() and
- * vkDestroySurfaceKHR(). The ICD must be capable of working with
- * such loader-managed surfaces.
- *
- * - Loader interface v2 differs from v1 in:
- * - The first ICD entrypoint called by the loader is
- * vk_icdNegotiateLoaderICDInterfaceVersion(). The ICD must
- * statically expose this entrypoint.
- *
- * - Loader interface v3 differs from v2 in:
- * - The ICD must implement vkCreate{PLATFORM}SurfaceKHR(),
- * vkDestroySurfaceKHR(), and other API which uses VKSurfaceKHR,
- * because the loader no longer does so.
- *
- * - Loader interface v4 differs from v3 in:
- * - The ICD must implement vk_icdGetPhysicalDeviceProcAddr().
- */
- *pSupportedVersion = MIN2(*pSupportedVersion, 4u);
- return VK_SUCCESS;
-}
-
VkResult anv_GetPhysicalDeviceFragmentShadingRatesKHR(
VkPhysicalDevice physicalDevice,
uint32_t* pFragmentShadingRateCount,
VkPhysicalDeviceFragmentShadingRateKHR* pFragmentShadingRates)
{
ANV_FROM_HANDLE(anv_physical_device, physical_device, physicalDevice);
- VK_OUTARRAY_MAKE(out, pFragmentShadingRates, pFragmentShadingRateCount);
-
-#define append_rate(_samples, _width, _height) \
- do { \
- vk_outarray_append(&out, __r) { \
- __r->sampleCounts = _samples; \
- __r->fragmentSize = (VkExtent2D) { \
- .width = _width, \
- .height = _height, \
- }; \
- } \
+ VK_OUTARRAY_MAKE_TYPED(VkPhysicalDeviceFragmentShadingRateKHR, out,
+ pFragmentShadingRates, pFragmentShadingRateCount);
+
+#define append_rate(_samples, _width, _height) \
+ do { \
+ vk_outarray_append_typed(VkPhysicalDeviceFragmentShadingRateKHR, &out, __r) { \
+ __r->sampleCounts = _samples; \
+ __r->fragmentSize = (VkExtent2D) { \
+ .width = _width, \
+ .height = _height, \
+ }; \
+ } \
} while (0)
VkSampleCountFlags sample_counts =
isl_device_get_sample_counts(&physical_device->isl_dev);
+ /* BSpec 47003: There are a number of restrictions on the sample count
+ * based on the coarse pixel size.
+ */
+ static const VkSampleCountFlags cp_size_sample_limits[] = {
+ [1] = ISL_SAMPLE_COUNT_16_BIT | ISL_SAMPLE_COUNT_8_BIT |
+ ISL_SAMPLE_COUNT_4_BIT | ISL_SAMPLE_COUNT_2_BIT | ISL_SAMPLE_COUNT_1_BIT,
+ [2] = ISL_SAMPLE_COUNT_4_BIT | ISL_SAMPLE_COUNT_2_BIT | ISL_SAMPLE_COUNT_1_BIT,
+ [4] = ISL_SAMPLE_COUNT_4_BIT | ISL_SAMPLE_COUNT_2_BIT | ISL_SAMPLE_COUNT_1_BIT,
+ [8] = ISL_SAMPLE_COUNT_2_BIT | ISL_SAMPLE_COUNT_1_BIT,
+ [16] = ISL_SAMPLE_COUNT_1_BIT,
+ };
+
for (uint32_t x = 4; x >= 1; x /= 2) {
for (uint32_t y = 4; y >= 1; y /= 2) {
- /* For size {1, 1}, the sample count must be ~0 */
- if (x == 1 && y == 1)
- append_rate(~0, x, y);
- else
- append_rate(sample_counts, x, y);
- }
+ if (physical_device->info.has_coarse_pixel_primitive_and_cb) {
+ /* BSpec 47003:
+ * "CPsize 1x4 and 4x1 are not supported"
+ */
+ if ((x == 1 && y == 4) || (x == 4 && y == 1))
+ continue;
+
+ /* For size {1, 1}, the sample count must be ~0
+ *
+ * 4x2 is also a special case.
+ */
+ if (x == 1 && y == 1)
+ append_rate(~0, x, y);
+ else if (x == 4 && y == 2)
+ append_rate(ISL_SAMPLE_COUNT_1_BIT, x, y);
+ else
+ append_rate(cp_size_sample_limits[x * y], x, y);
+ } else {
+ /* For size {1, 1}, the sample count must be ~0 */
+ if (x == 1 && y == 1)
+ append_rate(~0, x, y);
+ else
+ append_rate(sample_counts, x, y);
+ }
+ }
}
#undef append_rate
return vk_outarray_status(&out);
}
+
+const struct intel_device_info_pat_entry *
+anv_device_get_pat_entry(struct anv_device *device,
+ enum anv_bo_alloc_flags alloc_flags)
+{
+ if (alloc_flags & ANV_BO_ALLOC_IMPORTED)
+ return &device->info->pat.cached_coherent;
+
+ /* PAT indices have no actual effect on DG2 and DG1: smem caches will always
+ * be snooped by the GPU and lmem will always be WC.
+ * This might change on future discrete platforms.
+ */
+ if (anv_physical_device_has_vram(device->physical)) {
+ if (alloc_flags & ANV_BO_ALLOC_NO_LOCAL_MEM)
+ return &device->info->pat.cached_coherent;
+ return &device->info->pat.writecombining;
+ }
+
+ if ((alloc_flags & (ANV_BO_ALLOC_HOST_CACHED_COHERENT)) == ANV_BO_ALLOC_HOST_CACHED_COHERENT)
+ return &device->info->pat.cached_coherent;
+ else if (alloc_flags & (ANV_BO_ALLOC_EXTERNAL | ANV_BO_ALLOC_SCANOUT))
+ return &device->info->pat.scanout;
+ else if (alloc_flags & ANV_BO_ALLOC_HOST_CACHED)
+ return &device->info->pat.writeback_incoherent;
+ else
+ return &device->info->pat.writecombining;
+}
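
For readability, the selection above for integrated GPUs (no vram) collapses to the decision table below; this only restates the branches in order, assuming ANV_BO_ALLOC_HOST_CACHED_COHERENT is the combination of the CACHED and COHERENT bits.

/* Integrated GPUs (no vram), checked in this order:
 *   IMPORTED                                 -> pat.cached_coherent
 *   HOST_CACHED and HOST_COHERENT (both set) -> pat.cached_coherent
 *   EXTERNAL or SCANOUT                      -> pat.scanout
 *   HOST_CACHED only                         -> pat.writeback_incoherent
 *   otherwise                                -> pat.writecombining
 */
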
+
+static VkComponentTypeKHR
+convert_component_type(enum intel_cooperative_matrix_component_type t)
+{
+ switch (t) {
+ case INTEL_CMAT_FLOAT16: return VK_COMPONENT_TYPE_FLOAT16_KHR;
+ case INTEL_CMAT_FLOAT32: return VK_COMPONENT_TYPE_FLOAT32_KHR;
+ case INTEL_CMAT_SINT32: return VK_COMPONENT_TYPE_SINT32_KHR;
+ case INTEL_CMAT_SINT8: return VK_COMPONENT_TYPE_SINT8_KHR;
+ case INTEL_CMAT_UINT32: return VK_COMPONENT_TYPE_UINT32_KHR;
+ case INTEL_CMAT_UINT8: return VK_COMPONENT_TYPE_UINT8_KHR;
+ }
+ unreachable("invalid cooperative matrix component type in configuration");
+}
+
+static VkScopeKHR
+convert_scope(enum intel_cmat_scope scope)
+{
+ switch (scope) {
+ case INTEL_CMAT_SCOPE_SUBGROUP: return VK_SCOPE_SUBGROUP_KHR;
+ default:
+ unreachable("invalid cooperative matrix scope in configuration");
+ }
+}
+
+VkResult anv_GetPhysicalDeviceCooperativeMatrixPropertiesKHR(
+ VkPhysicalDevice physicalDevice,
+ uint32_t* pPropertyCount,
+ VkCooperativeMatrixPropertiesKHR* pProperties)
+{
+ ANV_FROM_HANDLE(anv_physical_device, pdevice, physicalDevice);
+ const struct intel_device_info *devinfo = &pdevice->info;
+
+ assert(anv_has_cooperative_matrix(pdevice));
+
+ VK_OUTARRAY_MAKE_TYPED(VkCooperativeMatrixPropertiesKHR, out, pProperties, pPropertyCount);
+
+ for (int i = 0; i < ARRAY_SIZE(devinfo->cooperative_matrix_configurations); i++) {
+ const struct intel_cooperative_matrix_configuration *cfg =
+ &devinfo->cooperative_matrix_configurations[i];
+
+ if (cfg->scope == INTEL_CMAT_SCOPE_NONE)
+ break;
+
+ vk_outarray_append_typed(VkCooperativeMatrixPropertiesKHR, &out, prop) {
+ prop->sType = VK_STRUCTURE_TYPE_COOPERATIVE_MATRIX_PROPERTIES_KHR;
+
+ prop->MSize = cfg->m;
+ prop->NSize = cfg->n;
+ prop->KSize = cfg->k;
+
+ prop->AType = convert_component_type(cfg->a);
+ prop->BType = convert_component_type(cfg->b);
+ prop->CType = convert_component_type(cfg->c);
+ prop->ResultType = convert_component_type(cfg->result);
+
+ prop->saturatingAccumulation = VK_FALSE;
+ prop->scope = convert_scope(cfg->scope);
+ }
+
+ /* VUID-RuntimeSpirv-saturatingAccumulation-08983 says:
+ *
+ * For OpCooperativeMatrixMulAddKHR, the SaturatingAccumulation
+ * cooperative matrix operand must be present if and only if
+ * VkCooperativeMatrixPropertiesKHR::saturatingAccumulation is
+ * VK_TRUE.
+ *
+ * As a result, we have to advertise integer configs both with and
+ * without this flag set.
+ *
+ * The DPAS instruction does not support the .sat modifier, so only
+ * advertise the configurations when the DPAS would be lowered.
+ *
+ * FINISHME: It should be possible to do better than full lowering on
+ * platforms that support DPAS. Emit a DPAS with a NULL accumulator
+ * argument, then perform the correct sequence of saturating add
+ * instructions.
+ */
+ if (cfg->a != INTEL_CMAT_FLOAT16 &&
+ (devinfo->verx10 < 125 || debug_get_bool_option("INTEL_LOWER_DPAS", false))) {
+ vk_outarray_append_typed(VkCooperativeMatrixPropertiesKHR, &out, prop) {
+ prop->sType = VK_STRUCTURE_TYPE_COOPERATIVE_MATRIX_PROPERTIES_KHR;
+
+ prop->MSize = cfg->m;
+ prop->NSize = cfg->n;
+ prop->KSize = cfg->k;
+
+ prop->AType = convert_component_type(cfg->a);
+ prop->BType = convert_component_type(cfg->b);
+ prop->CType = convert_component_type(cfg->c);
+ prop->ResultType = convert_component_type(cfg->result);
+
+ prop->saturatingAccumulation = VK_TRUE;
+ prop->scope = convert_scope(cfg->scope);
+ }
+ }
+ }
+
+ return vk_outarray_status(&out);
+}
diff --git a/src/intel/vulkan/anv_formats.c b/src/intel/vulkan/anv_formats.c
index 1713b446825..91cd9a9e7a9 100644
--- a/src/intel/vulkan/anv_formats.c
+++ b/src/intel/vulkan/anv_formats.c
@@ -23,6 +23,8 @@
#include "anv_private.h"
#include "drm-uapi/drm_fourcc.h"
+#include "vk_android.h"
+#include "vk_enum_defines.h"
#include "vk_enum_to_str.h"
#include "vk_format.h"
#include "vk_util.h"
@@ -50,7 +52,6 @@
[VK_ENUM_OFFSET(__vk_fmt)] = { \
.planes = { \
{ .isl_format = __hw_fmt, .swizzle = __swizzle, \
- .denominator_scales = { 1, 1, }, \
.aspect = VK_IMAGE_ASPECT_COLOR_BIT, \
}, \
}, \
@@ -65,7 +66,6 @@
[VK_ENUM_OFFSET(__vk_fmt)] = { \
.planes = { \
{ .isl_format = __hw_fmt, .swizzle = RGBA, \
- .denominator_scales = { 1, 1, }, \
.aspect = VK_IMAGE_ASPECT_DEPTH_BIT, \
}, \
}, \
@@ -77,7 +77,6 @@
[VK_ENUM_OFFSET(__vk_fmt)] = { \
.planes = { \
{ .isl_format = __hw_fmt, .swizzle = RGBA, \
- .denominator_scales = { 1, 1, }, \
.aspect = VK_IMAGE_ASPECT_STENCIL_BIT, \
}, \
}, \
@@ -89,11 +88,9 @@
[VK_ENUM_OFFSET(__vk_fmt)] = { \
.planes = { \
{ .isl_format = __fmt1, .swizzle = RGBA, \
- .denominator_scales = { 1, 1, }, \
.aspect = VK_IMAGE_ASPECT_DEPTH_BIT, \
}, \
{ .isl_format = __fmt2, .swizzle = RGBA, \
- .denominator_scales = { 1, 1, }, \
.aspect = VK_IMAGE_ASPECT_STENCIL_BIT, \
}, \
}, \
@@ -109,32 +106,21 @@
.vk_format = VK_FORMAT_UNDEFINED, \
}
-#define y_plane(__plane, __hw_fmt, __swizzle, __ycbcr_swizzle, dhs, dvs) \
+#define ycbcr_plane(__plane, __hw_fmt, __swizzle) \
{ .isl_format = __hw_fmt, \
.swizzle = __swizzle, \
- .ycbcr_swizzle = __ycbcr_swizzle, \
- .denominator_scales = { dhs, dvs, }, \
- .has_chroma = false, \
- .aspect = VK_IMAGE_ASPECT_PLANE_0_BIT, /* Y plane is always plane 0 */ \
- }
-
-#define chroma_plane(__plane, __hw_fmt, __swizzle, __ycbcr_swizzle, dhs, dvs) \
- { .isl_format = __hw_fmt, \
- .swizzle = __swizzle, \
- .ycbcr_swizzle = __ycbcr_swizzle, \
- .denominator_scales = { dhs, dvs, }, \
- .has_chroma = true, \
.aspect = VK_IMAGE_ASPECT_PLANE_ ## __plane ## _BIT, \
}
-#define ycbcr_fmt(__vk_fmt, __n_planes, ...) \
+#define ycbcr_fmt(__vk_fmt, __n_planes, __can_ycbcr, __can_video, ...) \
[VK_ENUM_OFFSET(__vk_fmt)] = { \
.planes = { \
__VA_ARGS__, \
}, \
.vk_format = __vk_fmt, \
.n_planes = __n_planes, \
- .can_ycbcr = true, \
+ .can_ycbcr = __can_ycbcr, \
+ .can_video = __can_video, \
}
/* HINT: For array formats, the ISL name should match the VK name. For
@@ -148,9 +134,9 @@ static const struct anv_format main_formats[] = {
fmt1(VK_FORMAT_R4G4B4A4_UNORM_PACK16, ISL_FORMAT_A4B4G4R4_UNORM),
swiz_fmt1(VK_FORMAT_B4G4R4A4_UNORM_PACK16, ISL_FORMAT_A4B4G4R4_UNORM, BGRA),
fmt1(VK_FORMAT_R5G6B5_UNORM_PACK16, ISL_FORMAT_B5G6R5_UNORM),
- fmt_unsupported(VK_FORMAT_B5G6R5_UNORM_PACK16),
+ swiz_fmt1(VK_FORMAT_B5G6R5_UNORM_PACK16, ISL_FORMAT_B5G6R5_UNORM, BGRA),
fmt1(VK_FORMAT_R5G5B5A1_UNORM_PACK16, ISL_FORMAT_A1B5G5R5_UNORM),
- fmt_unsupported(VK_FORMAT_B5G5R5A1_UNORM_PACK16),
+ swiz_fmt1(VK_FORMAT_B5G5R5A1_UNORM_PACK16, ISL_FORMAT_A1B5G5R5_UNORM, BGRA),
fmt1(VK_FORMAT_A1R5G5B5_UNORM_PACK16, ISL_FORMAT_B5G5R5A1_UNORM),
fmt1(VK_FORMAT_R8_UNORM, ISL_FORMAT_R8_UNORM),
fmt1(VK_FORMAT_R8_SNORM, ISL_FORMAT_R8_SNORM),
@@ -334,33 +320,33 @@ static const struct anv_format main_formats[] = {
};
static const struct anv_format _4444_formats[] = {
- fmt1(VK_FORMAT_A4R4G4B4_UNORM_PACK16_EXT, ISL_FORMAT_B4G4R4A4_UNORM),
- fmt_unsupported(VK_FORMAT_A4B4G4R4_UNORM_PACK16_EXT),
+ fmt1(VK_FORMAT_A4R4G4B4_UNORM_PACK16, ISL_FORMAT_B4G4R4A4_UNORM),
+ fmt_unsupported(VK_FORMAT_A4B4G4R4_UNORM_PACK16),
};
static const struct anv_format ycbcr_formats[] = {
- ycbcr_fmt(VK_FORMAT_G8B8G8R8_422_UNORM, 1,
- y_plane(0, ISL_FORMAT_YCRCB_SWAPUV, RGBA, _ISL_SWIZZLE(BLUE, GREEN, RED, ZERO), 1, 1)),
- ycbcr_fmt(VK_FORMAT_B8G8R8G8_422_UNORM, 1,
- y_plane(0, ISL_FORMAT_YCRCB_SWAPUVY, RGBA, _ISL_SWIZZLE(BLUE, GREEN, RED, ZERO), 1, 1)),
- ycbcr_fmt(VK_FORMAT_G8_B8_R8_3PLANE_420_UNORM, 3,
- y_plane(0, ISL_FORMAT_R8_UNORM, RGBA, _ISL_SWIZZLE(GREEN, ZERO, ZERO, ZERO), 1, 1),
- chroma_plane(1, ISL_FORMAT_R8_UNORM, RGBA, _ISL_SWIZZLE(BLUE, ZERO, ZERO, ZERO), 2, 2),
- chroma_plane(2, ISL_FORMAT_R8_UNORM, RGBA, _ISL_SWIZZLE(RED, ZERO, ZERO, ZERO), 2, 2)),
- ycbcr_fmt(VK_FORMAT_G8_B8R8_2PLANE_420_UNORM, 2,
- y_plane(0, ISL_FORMAT_R8_UNORM, RGBA, _ISL_SWIZZLE(GREEN, ZERO, ZERO, ZERO), 1, 1),
- chroma_plane(1, ISL_FORMAT_R8G8_UNORM, RGBA, _ISL_SWIZZLE(BLUE, RED, ZERO, ZERO), 2, 2)),
- ycbcr_fmt(VK_FORMAT_G8_B8_R8_3PLANE_422_UNORM, 3,
- y_plane(0, ISL_FORMAT_R8_UNORM, RGBA, _ISL_SWIZZLE(GREEN, ZERO, ZERO, ZERO), 1, 1),
- chroma_plane(1, ISL_FORMAT_R8_UNORM, RGBA, _ISL_SWIZZLE(BLUE, ZERO, ZERO, ZERO), 2, 1),
- chroma_plane(2, ISL_FORMAT_R8_UNORM, RGBA, _ISL_SWIZZLE(RED, ZERO, ZERO, ZERO), 2, 1)),
- ycbcr_fmt(VK_FORMAT_G8_B8R8_2PLANE_422_UNORM, 2,
- y_plane(0, ISL_FORMAT_R8_UNORM, RGBA, _ISL_SWIZZLE(GREEN, ZERO, ZERO, ZERO), 1, 1),
- chroma_plane(1, ISL_FORMAT_R8G8_UNORM, RGBA, _ISL_SWIZZLE(BLUE, RED, ZERO, ZERO), 2, 1)),
- ycbcr_fmt(VK_FORMAT_G8_B8_R8_3PLANE_444_UNORM, 3,
- y_plane(0, ISL_FORMAT_R8_UNORM, RGBA, _ISL_SWIZZLE(GREEN, ZERO, ZERO, ZERO), 1, 1),
- chroma_plane(1, ISL_FORMAT_R8_UNORM, RGBA, _ISL_SWIZZLE(BLUE, ZERO, ZERO, ZERO), 1, 1),
- chroma_plane(2, ISL_FORMAT_R8_UNORM, RGBA, _ISL_SWIZZLE(RED, ZERO, ZERO, ZERO), 1, 1)),
+ ycbcr_fmt(VK_FORMAT_G8B8G8R8_422_UNORM, 1, true, false,
+ ycbcr_plane(0, ISL_FORMAT_YCRCB_NORMAL, RGBA)),
+ ycbcr_fmt(VK_FORMAT_B8G8R8G8_422_UNORM, 1, true, false,
+ ycbcr_plane(0, ISL_FORMAT_YCRCB_SWAPY, RGBA)),
+ ycbcr_fmt(VK_FORMAT_G8_B8_R8_3PLANE_420_UNORM, 3, true, false,
+ ycbcr_plane(0, ISL_FORMAT_R8_UNORM, RGBA),
+ ycbcr_plane(1, ISL_FORMAT_R8_UNORM, RGBA),
+ ycbcr_plane(2, ISL_FORMAT_R8_UNORM, RGBA)),
+ ycbcr_fmt(VK_FORMAT_G8_B8R8_2PLANE_420_UNORM, 2, true, true,
+ ycbcr_plane(0, ISL_FORMAT_R8_UNORM, RGBA),
+ ycbcr_plane(1, ISL_FORMAT_R8G8_UNORM, RGBA)),
+ ycbcr_fmt(VK_FORMAT_G8_B8_R8_3PLANE_422_UNORM, 3, true, false,
+ ycbcr_plane(0, ISL_FORMAT_R8_UNORM, RGBA),
+ ycbcr_plane(1, ISL_FORMAT_R8_UNORM, RGBA),
+ ycbcr_plane(2, ISL_FORMAT_R8_UNORM, RGBA)),
+ ycbcr_fmt(VK_FORMAT_G8_B8R8_2PLANE_422_UNORM, 2, true, false,
+ ycbcr_plane(0, ISL_FORMAT_R8_UNORM, RGBA),
+ ycbcr_plane(1, ISL_FORMAT_R8G8_UNORM, RGBA)),
+ ycbcr_fmt(VK_FORMAT_G8_B8_R8_3PLANE_444_UNORM, 3, true, false,
+ ycbcr_plane(0, ISL_FORMAT_R8_UNORM, RGBA),
+ ycbcr_plane(1, ISL_FORMAT_R8_UNORM, RGBA),
+ ycbcr_plane(2, ISL_FORMAT_R8_UNORM, RGBA)),
fmt_unsupported(VK_FORMAT_R10X6_UNORM_PACK16),
fmt_unsupported(VK_FORMAT_R10X6G10X6_UNORM_2PACK16),
@@ -368,7 +354,9 @@ static const struct anv_format ycbcr_formats[] = {
fmt_unsupported(VK_FORMAT_G10X6B10X6G10X6R10X6_422_UNORM_4PACK16),
fmt_unsupported(VK_FORMAT_B10X6G10X6R10X6G10X6_422_UNORM_4PACK16),
fmt_unsupported(VK_FORMAT_G10X6_B10X6_R10X6_3PLANE_420_UNORM_3PACK16),
- fmt_unsupported(VK_FORMAT_G10X6_B10X6R10X6_2PLANE_420_UNORM_3PACK16),
+ ycbcr_fmt(VK_FORMAT_G10X6_B10X6R10X6_2PLANE_420_UNORM_3PACK16, 2, false, true,
+ ycbcr_plane(0, ISL_FORMAT_R16_UNORM, RGBA),
+ ycbcr_plane(1, ISL_FORMAT_R16G16_UNORM, RGBA)),
fmt_unsupported(VK_FORMAT_G10X6_B10X6_R10X6_3PLANE_422_UNORM_3PACK16),
fmt_unsupported(VK_FORMAT_G10X6_B10X6R10X6_2PLANE_422_UNORM_3PACK16),
fmt_unsupported(VK_FORMAT_G10X6_B10X6_R10X6_3PLANE_444_UNORM_3PACK16),
@@ -388,24 +376,29 @@ static const struct anv_format ycbcr_formats[] = {
fmt_unsupported(VK_FORMAT_G16B16G16R16_422_UNORM),
fmt_unsupported(VK_FORMAT_B16G16R16G16_422_UNORM),
- ycbcr_fmt(VK_FORMAT_G16_B16_R16_3PLANE_420_UNORM, 3,
- y_plane(0, ISL_FORMAT_R16_UNORM, RGBA, _ISL_SWIZZLE(GREEN, ZERO, ZERO, ZERO), 1, 1),
- chroma_plane(1, ISL_FORMAT_R16_UNORM, RGBA, _ISL_SWIZZLE(BLUE, ZERO, ZERO, ZERO), 2, 2),
- chroma_plane(2, ISL_FORMAT_R16_UNORM, RGBA, _ISL_SWIZZLE(RED, ZERO, ZERO, ZERO), 2, 2)),
- ycbcr_fmt(VK_FORMAT_G16_B16R16_2PLANE_420_UNORM, 2,
- y_plane(0, ISL_FORMAT_R16_UNORM, RGBA, _ISL_SWIZZLE(GREEN, ZERO, ZERO, ZERO), 1, 1),
- chroma_plane(1, ISL_FORMAT_R16G16_UNORM, RGBA, _ISL_SWIZZLE(BLUE, RED, ZERO, ZERO), 2, 2)),
- ycbcr_fmt(VK_FORMAT_G16_B16_R16_3PLANE_422_UNORM, 3,
- y_plane(0, ISL_FORMAT_R16_UNORM, RGBA, _ISL_SWIZZLE(GREEN, ZERO, ZERO, ZERO), 1, 1),
- chroma_plane(1, ISL_FORMAT_R16_UNORM, RGBA, _ISL_SWIZZLE(BLUE, ZERO, ZERO, ZERO), 2, 1),
- chroma_plane(2, ISL_FORMAT_R16_UNORM, RGBA, _ISL_SWIZZLE(RED, ZERO, ZERO, ZERO), 2, 1)),
- ycbcr_fmt(VK_FORMAT_G16_B16R16_2PLANE_422_UNORM, 2,
- y_plane(0, ISL_FORMAT_R16_UNORM, RGBA, _ISL_SWIZZLE(GREEN, ZERO, ZERO, ZERO), 1, 1),
- chroma_plane(1, ISL_FORMAT_R16G16_UNORM, RGBA, _ISL_SWIZZLE(BLUE, RED, ZERO, ZERO), 2, 1)),
- ycbcr_fmt(VK_FORMAT_G16_B16_R16_3PLANE_444_UNORM, 3,
- y_plane(0, ISL_FORMAT_R16_UNORM, RGBA, _ISL_SWIZZLE(GREEN, ZERO, ZERO, ZERO), 1, 1),
- chroma_plane(1, ISL_FORMAT_R16_UNORM, RGBA, _ISL_SWIZZLE(BLUE, ZERO, ZERO, ZERO), 1, 1),
- chroma_plane(2, ISL_FORMAT_R16_UNORM, RGBA, _ISL_SWIZZLE(RED, ZERO, ZERO, ZERO), 1, 1)),
+ ycbcr_fmt(VK_FORMAT_G16_B16_R16_3PLANE_420_UNORM, 3, true, false,
+ ycbcr_plane(0, ISL_FORMAT_R16_UNORM, RGBA),
+ ycbcr_plane(1, ISL_FORMAT_R16_UNORM, RGBA),
+ ycbcr_plane(2, ISL_FORMAT_R16_UNORM, RGBA)),
+ ycbcr_fmt(VK_FORMAT_G16_B16R16_2PLANE_420_UNORM, 2, true, false,
+ ycbcr_plane(0, ISL_FORMAT_R16_UNORM, RGBA),
+ ycbcr_plane(1, ISL_FORMAT_R16G16_UNORM, RGBA)),
+ ycbcr_fmt(VK_FORMAT_G16_B16_R16_3PLANE_422_UNORM, 3, true, false,
+ ycbcr_plane(0, ISL_FORMAT_R16_UNORM, RGBA),
+ ycbcr_plane(1, ISL_FORMAT_R16_UNORM, RGBA),
+ ycbcr_plane(2, ISL_FORMAT_R16_UNORM, RGBA)),
+ ycbcr_fmt(VK_FORMAT_G16_B16R16_2PLANE_422_UNORM, 2, true, false,
+ ycbcr_plane(0, ISL_FORMAT_R16_UNORM, RGBA),
+ ycbcr_plane(1, ISL_FORMAT_R16G16_UNORM, RGBA)),
+ ycbcr_fmt(VK_FORMAT_G16_B16_R16_3PLANE_444_UNORM, 3, true, false,
+ ycbcr_plane(0, ISL_FORMAT_R16_UNORM, RGBA),
+ ycbcr_plane(1, ISL_FORMAT_R16_UNORM, RGBA),
+ ycbcr_plane(2, ISL_FORMAT_R16_UNORM, RGBA)),
+};
+
+static const struct anv_format maintenance5_formats[] = {
+ fmt1(VK_FORMAT_A8_UNORM_KHR, ISL_FORMAT_A8_UNORM),
+ swiz_fmt1(VK_FORMAT_A1B5G5R5_UNORM_PACK16_KHR, ISL_FORMAT_B5G5R5A1_UNORM, BGRA)
};
#undef _fmt
@@ -423,6 +416,8 @@ static const struct {
.n_formats = ARRAY_SIZE(_4444_formats), },
[_VK_KHR_sampler_ycbcr_conversion_number] = { .formats = ycbcr_formats,
.n_formats = ARRAY_SIZE(ycbcr_formats), },
+ [_VK_KHR_maintenance5_number] = { .formats = maintenance5_formats,
+ .n_formats = ARRAY_SIZE(maintenance5_formats), },
};
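/* Rough sketch of how these per-extension sub-tables are consumed (helper
 * names approximate, not taken from this patch): the VkFormat is split into
 * an extension number and an enum offset, which index the outer table and
 * the matching formats[] array respectively, e.g.:
 *
 *    uint32_t ext_number  = VK_ENUM_EXTENSION(vk_format);
 *    uint32_t enum_offset = VK_ENUM_OFFSET(vk_format);
 *
 *    if (ext_number >= ARRAY_SIZE(anv_formats) ||
 *        enum_offset >= anv_formats[ext_number].n_formats)
 *       return NULL;
 *
 *    return &anv_formats[ext_number].formats[enum_offset];
 *
 * so VK_FORMAT_A8_UNORM_KHR resolves through the new
 * _VK_KHR_maintenance5_number entry added above.
 */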
const struct anv_format *
@@ -494,14 +489,6 @@ anv_get_format_plane(const struct intel_device_info *devinfo,
const struct isl_format_layout *isl_layout =
isl_format_get_layout(plane_format.isl_format);
- /* On Ivy Bridge we don't even have enough 24 and 48-bit formats that we
- * can reliably do texture upload with BLORP so just don't claim support
- * for any of them.
- */
- if (devinfo->verx10 == 70 &&
- (isl_layout->bpb == 24 || isl_layout->bpb == 48))
- return unsupported;
-
if (tiling == VK_IMAGE_TILING_OPTIMAL &&
!util_is_power_of_two_or_zero(isl_layout->bpb)) {
      /* Tiled formats *must* be power-of-two because we need to upload
@@ -520,14 +507,6 @@ anv_get_format_plane(const struct intel_device_info *devinfo,
}
}
- /* The B4G4R4A4 format isn't available prior to Broadwell so we have to fall
- * back to a format with a more complex swizzle.
- */
- if (vk_format == VK_FORMAT_B4G4R4A4_UNORM_PACK16 && devinfo->ver < 8) {
- plane_format.isl_format = ISL_FORMAT_B4G4R4A4_UNORM;
- plane_format.swizzle = ISL_SWIZZLE(GREEN, RED, ALPHA, BLUE);
- }
-
return plane_format;
}
@@ -543,14 +522,15 @@ anv_get_format_aspect(const struct intel_device_info *devinfo,
// Format capabilities
-VkFormatFeatureFlags
-anv_get_image_format_features(const struct intel_device_info *devinfo,
- VkFormat vk_format,
- const struct anv_format *anv_format,
- VkImageTiling vk_tiling,
- const struct isl_drm_modifier_info *isl_mod_info)
+VkFormatFeatureFlags2
+anv_get_image_format_features2(const struct anv_physical_device *physical_device,
+ VkFormat vk_format,
+ const struct anv_format *anv_format,
+ VkImageTiling vk_tiling,
+ const struct isl_drm_modifier_info *isl_mod_info)
{
- VkFormatFeatureFlags flags = 0;
+ const struct intel_device_info *devinfo = &physical_device->info;
+ VkFormatFeatureFlags2 flags = 0;
if (anv_format == NULL)
return 0;
@@ -558,6 +538,23 @@ anv_get_image_format_features(const struct intel_device_info *devinfo,
assert((isl_mod_info != NULL) ==
(vk_tiling == VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT));
+ if (anv_is_format_emulated(physical_device, vk_format)) {
+ assert(isl_format_is_compressed(anv_format->planes[0].isl_format));
+
+ /* require optimal tiling so that we can decompress on upload */
+ if (vk_tiling != VK_IMAGE_TILING_OPTIMAL)
+ return 0;
+
+ /* required features for compressed formats */
+ flags |= VK_FORMAT_FEATURE_2_SAMPLED_IMAGE_BIT |
+ VK_FORMAT_FEATURE_2_BLIT_SRC_BIT |
+ VK_FORMAT_FEATURE_2_SAMPLED_IMAGE_FILTER_LINEAR_BIT |
+ VK_FORMAT_FEATURE_2_TRANSFER_SRC_BIT |
+ VK_FORMAT_FEATURE_2_TRANSFER_DST_BIT;
+
+ return flags;
+ }
+
const VkImageAspectFlags aspects = vk_format_aspects(vk_format);
if (aspects & (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT)) {
@@ -565,23 +562,30 @@ anv_get_image_format_features(const struct intel_device_info *devinfo,
vk_tiling == VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT)
return 0;
- flags |= VK_FORMAT_FEATURE_DEPTH_STENCIL_ATTACHMENT_BIT |
- VK_FORMAT_FEATURE_SAMPLED_IMAGE_BIT |
- VK_FORMAT_FEATURE_BLIT_SRC_BIT |
- VK_FORMAT_FEATURE_BLIT_DST_BIT |
- VK_FORMAT_FEATURE_TRANSFER_SRC_BIT |
- VK_FORMAT_FEATURE_TRANSFER_DST_BIT;
-
- if (aspects & VK_IMAGE_ASPECT_DEPTH_BIT)
- flags |= VK_FORMAT_FEATURE_SAMPLED_IMAGE_FILTER_LINEAR_BIT;
-
- if ((aspects & VK_IMAGE_ASPECT_DEPTH_BIT) && devinfo->ver >= 9)
- flags |= VK_FORMAT_FEATURE_SAMPLED_IMAGE_FILTER_MINMAX_BIT_EXT;
+ flags |= VK_FORMAT_FEATURE_2_DEPTH_STENCIL_ATTACHMENT_BIT |
+ VK_FORMAT_FEATURE_2_SAMPLED_IMAGE_BIT |
+ VK_FORMAT_FEATURE_2_BLIT_SRC_BIT |
+ VK_FORMAT_FEATURE_2_BLIT_DST_BIT |
+ VK_FORMAT_FEATURE_2_TRANSFER_SRC_BIT |
+ VK_FORMAT_FEATURE_2_TRANSFER_DST_BIT;
+
+ if (aspects & VK_IMAGE_ASPECT_DEPTH_BIT) {
+ flags |= VK_FORMAT_FEATURE_2_SAMPLED_IMAGE_FILTER_LINEAR_BIT |
+ VK_FORMAT_FEATURE_2_SAMPLED_IMAGE_FILTER_MINMAX_BIT |
+ VK_FORMAT_FEATURE_2_SAMPLED_IMAGE_DEPTH_COMPARISON_BIT;
+ }
return flags;
}
assert(aspects & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV);
+
+ if (physical_device->video_decode_enabled &&
+ anv_format->can_video) {
+ flags |= VK_FORMAT_FEATURE_2_VIDEO_DECODE_OUTPUT_BIT_KHR |
+ VK_FORMAT_FEATURE_2_VIDEO_DECODE_DPB_BIT_KHR;
+ }
+
const struct anv_format_plane plane_format =
anv_get_format_plane(devinfo, vk_format, 0, vk_tiling);
@@ -596,29 +600,26 @@ anv_get_image_format_features(const struct intel_device_info *devinfo,
enum isl_format base_isl_format = base_plane_format.isl_format;
- /* ASTC textures must be in Y-tiled memory, and we reject compressed formats
- * with modifiers.
- */
- if (vk_tiling != VK_IMAGE_TILING_OPTIMAL &&
- isl_format_get_layout(plane_format.isl_format)->txc == ISL_TXC_ASTC)
- return 0;
-
- /* ASTC requires nasty workarounds on BSW so we just disable it for now.
- *
- * TODO: Figure out the ASTC workarounds and re-enable on BSW.
- */
- if (devinfo->ver < 9 &&
- isl_format_get_layout(plane_format.isl_format)->txc == ISL_TXC_ASTC)
- return 0;
-
if (isl_format_supports_sampling(devinfo, plane_format.isl_format)) {
- flags |= VK_FORMAT_FEATURE_SAMPLED_IMAGE_BIT;
- if (devinfo->ver >= 9)
- flags |= VK_FORMAT_FEATURE_SAMPLED_IMAGE_FILTER_MINMAX_BIT_EXT;
+ /* Unlike other surface formats, our sampler requires that the ASTC
+ * format only be used on surfaces in non-linearly-tiled memory.
+ * Thankfully, we can make an exception for linearly-tiled images that
+ * are only used for transfers. blorp_copy will reinterpret any
+ * compressed format to an uncompressed one.
+ *
+ * We handle modifier tilings further down in this function.
+ */
+ if (vk_tiling == VK_IMAGE_TILING_LINEAR &&
+ isl_format_get_layout(plane_format.isl_format)->txc == ISL_TXC_ASTC)
+ return VK_FORMAT_FEATURE_2_TRANSFER_SRC_BIT |
+ VK_FORMAT_FEATURE_2_TRANSFER_DST_BIT;
+
+ flags |= VK_FORMAT_FEATURE_2_SAMPLED_IMAGE_BIT |
+ VK_FORMAT_FEATURE_2_SAMPLED_IMAGE_FILTER_MINMAX_BIT;
if (isl_format_supports_filtering(devinfo, plane_format.isl_format))
- flags |= VK_FORMAT_FEATURE_SAMPLED_IMAGE_FILTER_LINEAR_BIT;
+ flags |= VK_FORMAT_FEATURE_2_SAMPLED_IMAGE_FILTER_LINEAR_BIT;
}
/* We can render to swizzled formats. However, if the alpha channel is
@@ -627,31 +628,50 @@ anv_get_image_format_features(const struct intel_device_info *devinfo,
*/
if (isl_format_supports_rendering(devinfo, plane_format.isl_format) &&
plane_format.swizzle.a == ISL_CHANNEL_SELECT_ALPHA) {
- flags |= VK_FORMAT_FEATURE_COLOR_ATTACHMENT_BIT;
+ flags |= VK_FORMAT_FEATURE_2_COLOR_ATTACHMENT_BIT;
- if (isl_format_supports_alpha_blending(devinfo, plane_format.isl_format))
- flags |= VK_FORMAT_FEATURE_COLOR_ATTACHMENT_BLEND_BIT;
+ /* While we can render to swizzled formats, they don't blend correctly
+ * if there are blend constants involved. The swizzle just remaps the
+ * output of the shader to different channels in the texture. It
+ * doesn't change the interpretation of the constant blend factors in
+ * COLOR_CALC_STATE.
+ */
+ if (isl_format_supports_alpha_blending(devinfo, plane_format.isl_format) &&
+ isl_swizzle_is_identity(plane_format.swizzle))
+ flags |= VK_FORMAT_FEATURE_2_COLOR_ATTACHMENT_BLEND_BIT;
}
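   /* Worked example (illustrative, not part of the patch):
    * VK_FORMAT_B4G4R4A4_UNORM_PACK16 above is exposed through
    * ISL_FORMAT_A4B4G4R4_UNORM with a BGRA swizzle.  The swizzle remaps the
    * shader output channels, but a CONSTANT_COLOR blend factor in
    * COLOR_CALC_STATE is applied to the hardware channels unswizzled, so the
    * constants would no longer line up with the remapped data.  Blending is
    * therefore only advertised for identity swizzles.
    */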
/* Load/store is determined based on base format. This prevents RGB
* formats from showing up as load/store capable.
*/
+ if (isl_format_supports_typed_reads(devinfo, base_isl_format))
+ flags |= VK_FORMAT_FEATURE_2_STORAGE_READ_WITHOUT_FORMAT_BIT;
if (isl_format_supports_typed_writes(devinfo, base_isl_format))
- flags |= VK_FORMAT_FEATURE_STORAGE_IMAGE_BIT;
+ flags |= VK_FORMAT_FEATURE_2_STORAGE_WRITE_WITHOUT_FORMAT_BIT;
+
+ /* Keep this old behavior on VK_FORMAT_FEATURE_2_STORAGE_IMAGE_BIT.
+ * When KHR_format_features2 is enabled, applications should only rely on
+ * it for the list of shader storage extended formats [1]. Before that,
+ * this applies to all VkFormats.
+ *
+ * [1] : https://www.khronos.org/registry/vulkan/specs/1.2-extensions/html/vkspec.html#features-shaderStorageImageExtendedFormats
+ */
+ if (flags & VK_FORMAT_FEATURE_2_STORAGE_WRITE_WITHOUT_FORMAT_BIT)
+ flags |= VK_FORMAT_FEATURE_2_STORAGE_IMAGE_BIT;
if (base_isl_format == ISL_FORMAT_R32_SINT ||
base_isl_format == ISL_FORMAT_R32_UINT ||
base_isl_format == ISL_FORMAT_R32_FLOAT)
- flags |= VK_FORMAT_FEATURE_STORAGE_IMAGE_ATOMIC_BIT;
+ flags |= VK_FORMAT_FEATURE_2_STORAGE_IMAGE_ATOMIC_BIT;
if (flags) {
- flags |= VK_FORMAT_FEATURE_BLIT_SRC_BIT |
- VK_FORMAT_FEATURE_TRANSFER_SRC_BIT |
- VK_FORMAT_FEATURE_TRANSFER_DST_BIT;
+ flags |= VK_FORMAT_FEATURE_2_BLIT_SRC_BIT |
+ VK_FORMAT_FEATURE_2_TRANSFER_SRC_BIT |
+ VK_FORMAT_FEATURE_2_TRANSFER_DST_BIT;
/* Blit destination requires rendering support. */
if (isl_format_supports_rendering(devinfo, plane_format.isl_format))
- flags |= VK_FORMAT_FEATURE_BLIT_DST_BIT;
+ flags |= VK_FORMAT_FEATURE_2_BLIT_DST_BIT;
}
/* XXX: We handle 3-channel formats by switching them out for RGBX or
@@ -665,10 +685,17 @@ anv_get_image_format_features(const struct intel_device_info *devinfo,
base_isl_format != ISL_FORMAT_UNSUPPORTED &&
!util_is_power_of_two_or_zero(isl_format_layouts[base_isl_format].bpb) &&
isl_format_rgb_to_rgbx(base_isl_format) == ISL_FORMAT_UNSUPPORTED) {
- flags &= ~VK_FORMAT_FEATURE_COLOR_ATTACHMENT_BIT;
- flags &= ~VK_FORMAT_FEATURE_BLIT_DST_BIT;
+ flags &= ~VK_FORMAT_FEATURE_2_COLOR_ATTACHMENT_BIT;
+ flags &= ~VK_FORMAT_FEATURE_2_BLIT_DST_BIT;
}
+ const VkFormatFeatureFlags2 disallowed_ycbcr_image_features =
+ VK_FORMAT_FEATURE_2_BLIT_SRC_BIT |
+ VK_FORMAT_FEATURE_2_BLIT_DST_BIT |
+ VK_FORMAT_FEATURE_2_COLOR_ATTACHMENT_BIT |
+ VK_FORMAT_FEATURE_2_COLOR_ATTACHMENT_BLEND_BIT |
+ VK_FORMAT_FEATURE_2_STORAGE_IMAGE_BIT;
+
if (anv_format->can_ycbcr) {
/* The sampler doesn't have support for mid point when it handles YUV on
* its own.
@@ -678,35 +705,34 @@ anv_get_image_format_features(const struct intel_device_info *devinfo,
       * sampler. The failures show slightly out-of-range values on the
* bottom left of the sampled image.
*/
- flags |= VK_FORMAT_FEATURE_MIDPOINT_CHROMA_SAMPLES_BIT;
+ flags |= VK_FORMAT_FEATURE_2_MIDPOINT_CHROMA_SAMPLES_BIT;
} else {
- flags |= VK_FORMAT_FEATURE_SAMPLED_IMAGE_YCBCR_CONVERSION_LINEAR_FILTER_BIT |
- VK_FORMAT_FEATURE_MIDPOINT_CHROMA_SAMPLES_BIT |
- VK_FORMAT_FEATURE_SAMPLED_IMAGE_YCBCR_CONVERSION_SEPARATE_RECONSTRUCTION_FILTER_BIT;
+ flags |= VK_FORMAT_FEATURE_2_SAMPLED_IMAGE_YCBCR_CONVERSION_LINEAR_FILTER_BIT |
+ VK_FORMAT_FEATURE_2_MIDPOINT_CHROMA_SAMPLES_BIT |
+ VK_FORMAT_FEATURE_2_SAMPLED_IMAGE_YCBCR_CONVERSION_SEPARATE_RECONSTRUCTION_FILTER_BIT;
}
   /* We can support cosited chroma locations when handling planes with our
* own shader snippets.
*/
- for (unsigned p = 0; p < anv_format->n_planes; p++) {
- if (anv_format->planes[p].denominator_scales[0] > 1 ||
- anv_format->planes[p].denominator_scales[1] > 1) {
+ const struct vk_format_ycbcr_info *ycbcr_info =
+ vk_format_get_ycbcr_info(vk_format);
+ assert(anv_format->n_planes == ycbcr_info->n_planes);
+ for (unsigned p = 0; p < ycbcr_info->n_planes; p++) {
+ if (ycbcr_info->planes[p].denominator_scales[0] > 1 ||
+ ycbcr_info->planes[p].denominator_scales[1] > 1) {
flags |= VK_FORMAT_FEATURE_COSITED_CHROMA_SAMPLES_BIT;
break;
}
}
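   /* For example (denominator values assumed from vk_format_get_ycbcr_info()):
    * VK_FORMAT_G8_B8R8_2PLANE_420_UNORM reports scales of {2, 2} on its
    * chroma plane, so it gains the cosited-chroma bit above, while the 444
    * variants keep {1, 1} on every plane and do not.
    */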
if (anv_format->n_planes > 1)
- flags |= VK_FORMAT_FEATURE_DISJOINT_BIT;
-
- const VkFormatFeatureFlags disallowed_ycbcr_image_features =
- VK_FORMAT_FEATURE_BLIT_SRC_BIT |
- VK_FORMAT_FEATURE_BLIT_DST_BIT |
- VK_FORMAT_FEATURE_COLOR_ATTACHMENT_BIT |
- VK_FORMAT_FEATURE_COLOR_ATTACHMENT_BLEND_BIT |
- VK_FORMAT_FEATURE_STORAGE_IMAGE_BIT;
+ flags |= VK_FORMAT_FEATURE_2_DISJOINT_BIT;
flags &= ~disallowed_ycbcr_image_features;
+ } else if (anv_format->can_video) {
+ /* This format is for video decoding. */
+ flags &= ~disallowed_ycbcr_image_features;
}
if (vk_tiling == VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT) {
@@ -755,14 +781,16 @@ anv_get_image_format_features(const struct intel_device_info *devinfo,
if (anv_format->n_planes > 1) {
/* For simplicity, keep DISJOINT disabled for multi-planar format. */
- flags &= ~VK_FORMAT_FEATURE_DISJOINT_BIT;
+ flags &= ~VK_FORMAT_FEATURE_2_DISJOINT_BIT;
/* VK_ANDROID_external_memory_android_hardware_buffer in Virtio-GPU
* Venus driver layers on top of VK_EXT_image_drm_format_modifier of
- * the host Vulkan driver, and VK_FORMAT_G8_B8R8_2PLANE_420_UNORM is
- * required to support camera/media interop in Android.
+ * the host Vulkan driver, and both VK_FORMAT_G8_B8R8_2PLANE_420_UNORM
+       * and VK_FORMAT_G8_B8_R8_3PLANE_420_UNORM are required to support
+ * camera/media interop in Android.
*/
- if (vk_format != VK_FORMAT_G8_B8R8_2PLANE_420_UNORM) {
+ if (vk_format != VK_FORMAT_G8_B8R8_2PLANE_420_UNORM &&
+ vk_format != VK_FORMAT_G8_B8_R8_3PLANE_420_UNORM) {
anv_finishme("support more multi-planar formats with DRM modifiers");
return 0;
}
@@ -771,41 +799,46 @@ anv_get_image_format_features(const struct intel_device_info *devinfo,
* planes and aux planes due to the lack of defined ABI for external
* multi-planar images.
*/
- if (isl_mod_info->aux_usage != ISL_AUX_USAGE_NONE) {
+ if (isl_drm_modifier_has_aux(isl_mod_info->modifier)) {
return 0;
}
}
- if (isl_mod_info->aux_usage == ISL_AUX_USAGE_CCS_E &&
- !isl_format_supports_ccs_e(devinfo, plane_format.isl_format)) {
+ if (isl_drm_modifier_has_aux(isl_mod_info->modifier) &&
+ !anv_format_supports_ccs_e(devinfo, plane_format.isl_format)) {
return 0;
}
- if (isl_mod_info->aux_usage != ISL_AUX_USAGE_NONE) {
+ if (isl_drm_modifier_has_aux(isl_mod_info->modifier)) {
      /* Reject DISJOINT for consistency with the GL driver. In
* eglCreateImage, we require that the dma_buf for the primary surface
* and the dma_buf for its aux surface refer to the same bo.
*/
- flags &= ~VK_FORMAT_FEATURE_DISJOINT_BIT;
+ flags &= ~VK_FORMAT_FEATURE_2_DISJOINT_BIT;
/* When the hardware accesses a storage image, it bypasses the aux
* surface. We could support storage access on images with aux
* modifiers by resolving the aux surface prior to the storage access.
*/
- flags &= ~VK_FORMAT_FEATURE_STORAGE_IMAGE_BIT;
- flags &= ~VK_FORMAT_FEATURE_STORAGE_IMAGE_ATOMIC_BIT;
+ flags &= ~VK_FORMAT_FEATURE_2_STORAGE_IMAGE_BIT;
+ flags &= ~VK_FORMAT_FEATURE_2_STORAGE_IMAGE_ATOMIC_BIT;
}
}
+ if (devinfo->has_coarse_pixel_primitive_and_cb &&
+ vk_format == VK_FORMAT_R8_UINT &&
+ vk_tiling == VK_IMAGE_TILING_OPTIMAL)
+ flags |= VK_FORMAT_FEATURE_FRAGMENT_SHADING_RATE_ATTACHMENT_BIT_KHR;
+
return flags;
}
-static VkFormatFeatureFlags
-get_buffer_format_features(const struct intel_device_info *devinfo,
- VkFormat vk_format,
- const struct anv_format *anv_format)
+static VkFormatFeatureFlags2
+get_buffer_format_features2(const struct intel_device_info *devinfo,
+ VkFormat vk_format,
+ const struct anv_format *anv_format)
{
- VkFormatFeatureFlags flags = 0;
+ VkFormatFeatureFlags2 flags = 0;
if (anv_format == NULL)
return 0;
@@ -818,7 +851,7 @@ get_buffer_format_features(const struct intel_device_info *devinfo,
if (anv_format->n_planes > 1)
return 0;
- if (anv_format->can_ycbcr)
+ if (anv_format->can_ycbcr || anv_format->can_video)
return 0;
if (vk_format_is_depth_or_stencil(vk_format))
@@ -826,16 +859,42 @@ get_buffer_format_features(const struct intel_device_info *devinfo,
if (isl_format_supports_sampling(devinfo, isl_format) &&
!isl_format_is_compressed(isl_format))
- flags |= VK_FORMAT_FEATURE_UNIFORM_TEXEL_BUFFER_BIT;
+ flags |= VK_FORMAT_FEATURE_2_UNIFORM_TEXEL_BUFFER_BIT;
if (isl_format_supports_vertex_fetch(devinfo, isl_format))
- flags |= VK_FORMAT_FEATURE_VERTEX_BUFFER_BIT;
+ flags |= VK_FORMAT_FEATURE_2_VERTEX_BUFFER_BIT;
- if (isl_is_storage_image_format(isl_format))
- flags |= VK_FORMAT_FEATURE_STORAGE_TEXEL_BUFFER_BIT;
+ if (isl_is_storage_image_format(devinfo, isl_format))
+ flags |= VK_FORMAT_FEATURE_2_STORAGE_TEXEL_BUFFER_BIT;
if (isl_format == ISL_FORMAT_R32_SINT || isl_format == ISL_FORMAT_R32_UINT)
- flags |= VK_FORMAT_FEATURE_STORAGE_TEXEL_BUFFER_ATOMIC_BIT;
+ flags |= VK_FORMAT_FEATURE_2_STORAGE_TEXEL_BUFFER_ATOMIC_BIT;
+
+ if (isl_format_supports_typed_reads(devinfo, isl_format))
+ flags |= VK_FORMAT_FEATURE_2_STORAGE_READ_WITHOUT_FORMAT_BIT;
+ if (isl_format_supports_typed_writes(devinfo, isl_format))
+ flags |= VK_FORMAT_FEATURE_2_STORAGE_WRITE_WITHOUT_FORMAT_BIT;
+
+ if (devinfo->has_ray_tracing) {
+ switch (vk_format) {
+ case VK_FORMAT_R32G32_SFLOAT:
+ case VK_FORMAT_R32G32B32_SFLOAT:
+ case VK_FORMAT_R16G16_SFLOAT:
+ case VK_FORMAT_R16G16B16A16_SFLOAT:
+ case VK_FORMAT_R16G16_SNORM:
+ case VK_FORMAT_R16G16B16A16_SNORM:
+ case VK_FORMAT_R16G16B16A16_UNORM:
+ case VK_FORMAT_R16G16_UNORM:
+ case VK_FORMAT_R8G8B8A8_UNORM:
+ case VK_FORMAT_R8G8_UNORM:
+ case VK_FORMAT_R8G8B8A8_SNORM:
+ case VK_FORMAT_R8G8_SNORM:
+ flags |= VK_FORMAT_FEATURE_ACCELERATION_STRUCTURE_VERTEX_BUFFER_BIT_KHR;
+ break;
+ default:
+ break;
+ }
+ }
return flags;
}
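/* Illustrative application-side usage (not part of the patch): the new
 * acceleration-structure vertex-buffer bit can be queried through the 64-bit
 * buffer features by chaining VkFormatProperties3:
 *
 *    VkFormatProperties3 props3 = {
 *       .sType = VK_STRUCTURE_TYPE_FORMAT_PROPERTIES_3,
 *    };
 *    VkFormatProperties2 props2 = {
 *       .sType = VK_STRUCTURE_TYPE_FORMAT_PROPERTIES_2,
 *       .pNext = &props3,
 *    };
 *    vkGetPhysicalDeviceFormatProperties2(pdev, VK_FORMAT_R16G16_SNORM, &props2);
 *
 *    bool as_vertex_ok = props3.bufferFeatures &
 *       VK_FORMAT_FEATURE_2_ACCELERATION_STRUCTURE_VERTEX_BUFFER_BIT_KHR;
 */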
@@ -845,25 +904,27 @@ get_drm_format_modifier_properties_list(const struct anv_physical_device *physic
VkFormat vk_format,
VkDrmFormatModifierPropertiesListEXT *list)
{
- const struct intel_device_info *devinfo = &physical_device->info;
const struct anv_format *anv_format = anv_get_format(vk_format);
- VK_OUTARRAY_MAKE(out, list->pDrmFormatModifierProperties,
- &list->drmFormatModifierCount);
+ VK_OUTARRAY_MAKE_TYPED(VkDrmFormatModifierPropertiesEXT, out,
+ list->pDrmFormatModifierProperties,
+ &list->drmFormatModifierCount);
isl_drm_modifier_info_for_each(isl_mod_info) {
- VkFormatFeatureFlags features =
- anv_get_image_format_features(devinfo, vk_format, anv_format,
- VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT,
- isl_mod_info);
+ VkFormatFeatureFlags2 features2 =
+ anv_get_image_format_features2(physical_device, vk_format, anv_format,
+ VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT,
+ isl_mod_info);
+ VkFormatFeatureFlags features = vk_format_features2_to_features(features2);
if (!features)
continue;
- uint32_t planes = anv_format->n_planes;
- if (isl_mod_info->aux_usage != ISL_AUX_USAGE_NONE)
- ++planes;
+ const uint32_t planes =
+ isl_drm_modifier_get_plane_count(&physical_device->info,
+ isl_mod_info->modifier,
+ anv_format->n_planes);
- vk_outarray_append(&out, out_props) {
+ vk_outarray_append_typed(VkDrmFormatModifierPropertiesEXT, &out, out_props) {
*out_props = (VkDrmFormatModifierPropertiesEXT) {
.drmFormatModifier = isl_mod_info->modifier,
.drmFormatModifierPlaneCount = planes,
@@ -873,43 +934,89 @@ get_drm_format_modifier_properties_list(const struct anv_physical_device *physic
}
}
-void anv_GetPhysicalDeviceFormatProperties(
- VkPhysicalDevice physicalDevice,
- VkFormat vk_format,
- VkFormatProperties* pFormatProperties)
+static void
+get_drm_format_modifier_properties_list_2(const struct anv_physical_device *physical_device,
+ VkFormat vk_format,
+ VkDrmFormatModifierPropertiesList2EXT *list)
{
- ANV_FROM_HANDLE(anv_physical_device, physical_device, physicalDevice);
- const struct intel_device_info *devinfo = &physical_device->info;
const struct anv_format *anv_format = anv_get_format(vk_format);
- *pFormatProperties = (VkFormatProperties) {
- .linearTilingFeatures =
- anv_get_image_format_features(devinfo, vk_format, anv_format,
- VK_IMAGE_TILING_LINEAR, NULL),
- .optimalTilingFeatures =
- anv_get_image_format_features(devinfo, vk_format, anv_format,
- VK_IMAGE_TILING_OPTIMAL, NULL),
- .bufferFeatures =
- get_buffer_format_features(devinfo, vk_format, anv_format),
- };
+ VK_OUTARRAY_MAKE_TYPED(VkDrmFormatModifierProperties2EXT, out,
+ list->pDrmFormatModifierProperties,
+ &list->drmFormatModifierCount);
+
+ isl_drm_modifier_info_for_each(isl_mod_info) {
+ VkFormatFeatureFlags2 features2 =
+ anv_get_image_format_features2(physical_device, vk_format, anv_format,
+ VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT,
+ isl_mod_info);
+ if (!features2)
+ continue;
+
+ const uint32_t planes =
+ isl_drm_modifier_get_plane_count(&physical_device->info,
+ isl_mod_info->modifier,
+ anv_format->n_planes);
+
+ vk_outarray_append_typed(VkDrmFormatModifierProperties2EXT, &out, out_props) {
+ *out_props = (VkDrmFormatModifierProperties2EXT) {
+ .drmFormatModifier = isl_mod_info->modifier,
+ .drmFormatModifierPlaneCount = planes,
+ .drmFormatModifierTilingFeatures = features2,
+ };
+ };
+ }
}
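/* Illustrative application-side usage (not part of the patch): the list is
 * filled with the usual two-call pattern, first for the count, then for the
 * entries:
 *
 *    VkDrmFormatModifierPropertiesList2EXT mod_list = {
 *       .sType = VK_STRUCTURE_TYPE_DRM_FORMAT_MODIFIER_PROPERTIES_LIST_2_EXT,
 *    };
 *    VkFormatProperties2 props = {
 *       .sType = VK_STRUCTURE_TYPE_FORMAT_PROPERTIES_2,
 *       .pNext = &mod_list,
 *    };
 *    vkGetPhysicalDeviceFormatProperties2(pdev, format, &props);
 *
 *    VkDrmFormatModifierProperties2EXT *mods =
 *       calloc(mod_list.drmFormatModifierCount, sizeof(*mods));
 *    mod_list.pDrmFormatModifierProperties = mods;
 *    vkGetPhysicalDeviceFormatProperties2(pdev, format, &props);
 */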
void anv_GetPhysicalDeviceFormatProperties2(
VkPhysicalDevice physicalDevice,
- VkFormat format,
+ VkFormat vk_format,
VkFormatProperties2* pFormatProperties)
{
ANV_FROM_HANDLE(anv_physical_device, physical_device, physicalDevice);
- anv_GetPhysicalDeviceFormatProperties(physicalDevice, format,
- &pFormatProperties->formatProperties);
+ const struct intel_device_info *devinfo = &physical_device->info;
+ const struct anv_format *anv_format = anv_get_format(vk_format);
+
+ assert(pFormatProperties->sType == VK_STRUCTURE_TYPE_FORMAT_PROPERTIES_2);
+
+ VkFormatFeatureFlags2 linear2, optimal2, buffer2;
+ linear2 = anv_get_image_format_features2(physical_device, vk_format,
+ anv_format,
+ VK_IMAGE_TILING_LINEAR, NULL);
+ optimal2 = anv_get_image_format_features2(physical_device, vk_format,
+ anv_format,
+ VK_IMAGE_TILING_OPTIMAL, NULL);
+ buffer2 = get_buffer_format_features2(devinfo, vk_format, anv_format);
+
+ pFormatProperties->formatProperties = (VkFormatProperties) {
+ .linearTilingFeatures = vk_format_features2_to_features(linear2),
+ .optimalTilingFeatures = vk_format_features2_to_features(optimal2),
+ .bufferFeatures = vk_format_features2_to_features(buffer2),
+ };
vk_foreach_struct(ext, pFormatProperties->pNext) {
/* Use unsigned since some cases are not in the VkStructureType enum. */
switch ((unsigned)ext->sType) {
case VK_STRUCTURE_TYPE_DRM_FORMAT_MODIFIER_PROPERTIES_LIST_EXT:
- get_drm_format_modifier_properties_list(physical_device, format,
+ get_drm_format_modifier_properties_list(physical_device, vk_format,
(void *)ext);
break;
+
+ case VK_STRUCTURE_TYPE_DRM_FORMAT_MODIFIER_PROPERTIES_LIST_2_EXT:
+ get_drm_format_modifier_properties_list_2(physical_device, vk_format,
+ (void *)ext);
+ break;
+
+ case VK_STRUCTURE_TYPE_FORMAT_PROPERTIES_3: {
+ VkFormatProperties3 *props = (VkFormatProperties3 *)ext;
+ props->linearTilingFeatures = linear2;
+ props->optimalTilingFeatures = optimal2;
+ props->bufferFeatures = buffer2;
+ break;
+ }
+ case VK_STRUCTURE_TYPE_VIDEO_PROFILE_LIST_INFO_KHR:
+      /* don't have anything to use this for yet */
+ break;
default:
anv_debug_ignored_stype(ext->sType);
break;
@@ -917,61 +1024,347 @@ void anv_GetPhysicalDeviceFormatProperties2(
}
}
+static bool
+anv_format_supports_usage(
+ VkFormatFeatureFlags2 format_feature_flags,
+ VkImageUsageFlags usage_flags)
+{
+ if (usage_flags & VK_IMAGE_USAGE_TRANSFER_SRC_BIT) {
+ if (!(format_feature_flags & (VK_FORMAT_FEATURE_2_TRANSFER_SRC_BIT |
+ VK_FORMAT_FEATURE_2_BLIT_SRC_BIT))) {
+ return false;
+ }
+ }
+
+ if (usage_flags & VK_IMAGE_USAGE_TRANSFER_DST_BIT) {
+ if (!(format_feature_flags & (VK_FORMAT_FEATURE_2_TRANSFER_DST_BIT |
+ VK_FORMAT_FEATURE_2_BLIT_DST_BIT))) {
+ return false;
+ }
+ }
+
+ if (usage_flags & VK_IMAGE_USAGE_SAMPLED_BIT) {
+ if (!(format_feature_flags & VK_FORMAT_FEATURE_2_SAMPLED_IMAGE_BIT)) {
+ return false;
+ }
+ }
+
+ if (usage_flags & VK_IMAGE_USAGE_STORAGE_BIT) {
+ if (!(format_feature_flags & VK_FORMAT_FEATURE_2_STORAGE_IMAGE_BIT)) {
+ return false;
+ }
+ }
+
+ if (usage_flags & VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT) {
+ if (!(format_feature_flags & VK_FORMAT_FEATURE_2_COLOR_ATTACHMENT_BIT)) {
+ return false;
+ }
+ }
+
+ if (usage_flags & VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT) {
+ if (!(format_feature_flags & VK_FORMAT_FEATURE_2_DEPTH_STENCIL_ATTACHMENT_BIT)) {
+ return false;
+ }
+ }
+
+ if (usage_flags & VK_IMAGE_USAGE_TRANSIENT_ATTACHMENT_BIT) {
+ /* Nothing to check. */
+ }
+
+ if (usage_flags & VK_IMAGE_USAGE_INPUT_ATTACHMENT_BIT) {
+ if (!(format_feature_flags & (VK_FORMAT_FEATURE_2_COLOR_ATTACHMENT_BIT |
+ VK_FORMAT_FEATURE_2_DEPTH_STENCIL_ATTACHMENT_BIT))) {
+ return false;
+ }
+ }
+
+ if (usage_flags & VK_IMAGE_USAGE_FRAGMENT_SHADING_RATE_ATTACHMENT_BIT_KHR) {
+ if (!(format_feature_flags & VK_FORMAT_FEATURE_2_FRAGMENT_SHADING_RATE_ATTACHMENT_BIT_KHR)) {
+ return false;
+ }
+ }
+
+ return true;
+}
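/* Example (illustrative): an image requesting only
 * VK_IMAGE_USAGE_SAMPLED_BIT | VK_IMAGE_USAGE_TRANSFER_DST_BIT passes this
 * check as long as the format reports SAMPLED_IMAGE plus either TRANSFER_DST
 * or BLIT_DST; a missing COLOR_ATTACHMENT bit is irrelevant because that
 * usage was not requested.
 */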
+
+static bool
+anv_formats_are_compatible(
+ const struct anv_format *img_fmt, const struct anv_format *img_view_fmt,
+ const struct intel_device_info *devinfo, VkImageTiling tiling,
+ bool allow_texel_compatible)
+{
+ if (img_view_fmt->vk_format == VK_FORMAT_UNDEFINED)
+ return false;
+
+ if (img_fmt == img_view_fmt)
+ return true;
+
+ /* TODO: Handle multi-planar images that can have view of a plane with
+ * possibly different type.
+ */
+ if (img_fmt->n_planes != 1 || img_view_fmt->n_planes != 1)
+ return false;
+
+ const enum isl_format img_isl_fmt =
+ anv_get_format_plane(devinfo, img_fmt->vk_format, 0, tiling).isl_format;
+ const enum isl_format img_view_isl_fmt =
+ anv_get_format_plane(devinfo, img_view_fmt->vk_format, 0, tiling).isl_format;
+ if (img_isl_fmt == ISL_FORMAT_UNSUPPORTED ||
+ img_view_isl_fmt == ISL_FORMAT_UNSUPPORTED)
+ return false;
+
+ const struct isl_format_layout *img_fmt_layout =
+ isl_format_get_layout(img_isl_fmt);
+ const struct isl_format_layout *img_view_fmt_layout =
+ isl_format_get_layout(img_view_isl_fmt);
+
+ /* From the Vulkan 1.3.230 spec "12.5. Image Views"
+ *
+ * "If image was created with the
+ * VK_IMAGE_CREATE_BLOCK_TEXEL_VIEW_COMPATIBLE_BIT flag, format must be
+ * compatible with the image’s format as described above; or must be
+ * an uncompressed format, in which case it must be size-compatible
+ * with the image’s format."
+ */
+ if (allow_texel_compatible &&
+ isl_format_is_compressed(img_isl_fmt) &&
+ !isl_format_is_compressed(img_view_isl_fmt) &&
+ img_fmt_layout->bpb == img_view_fmt_layout->bpb)
+ return true;
+
+ if (isl_format_is_compressed(img_isl_fmt) !=
+ isl_format_is_compressed(img_view_isl_fmt))
+ return false;
+
+ if (!isl_format_is_compressed(img_isl_fmt)) {
+ /* From the Vulkan 1.3.224 spec "43.1.6. Format Compatibility Classes":
+ *
+ * "Uncompressed color formats are compatible with each other if they
+ * occupy the same number of bits per texel block."
+ */
+ return img_fmt_layout->bpb == img_view_fmt_layout->bpb;
+ }
+
+ /* From the Vulkan 1.3.224 spec "43.1.6. Format Compatibility Classes":
+ *
+ * "Compressed color formats are compatible with each other if the only
+ * difference between them is the numerical type of the uncompressed
+ * pixels (e.g. signed vs. unsigned, or SRGB vs. UNORM encoding)."
+ */
+ return img_fmt_layout->txc == img_view_fmt_layout->txc &&
+ isl_formats_have_same_bits_per_channel(img_isl_fmt, img_view_isl_fmt);
+}
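/* Examples (illustrative): VK_FORMAT_R32G32B32A32_SFLOAT and
 * VK_FORMAT_R32G32B32A32_UINT are both 128-bit uncompressed blocks, so they
 * are compatible here.  VK_FORMAT_BC1_RGBA_UNORM_BLOCK and
 * VK_FORMAT_BC1_RGBA_SRGB_BLOCK share a texel compression class and channel
 * widths, so they are compatible too, while BC1 and BC7 are not because
 * their compression classes differ.
 */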
+
+/* Returns a set of feature flags supported by any of the VkFormats listed in
+ * format_list_info or any VkFormat compatible with format.
+ */
+static VkFormatFeatureFlags2
+anv_formats_gather_format_features(
+ const struct anv_physical_device *physical_device,
+ const struct anv_format *format,
+ VkImageTiling tiling,
+ const struct isl_drm_modifier_info *isl_mod_info,
+ const VkImageFormatListCreateInfo *format_list_info,
+ bool allow_texel_compatible)
+{
+ const struct intel_device_info *devinfo = &physical_device->info;
+ VkFormatFeatureFlags2 all_formats_feature_flags = 0;
+
+   /* We need to check that each of the usage bits is allowed for at least
+ * one of the potential formats.
+ */
+ if (!format_list_info || format_list_info->viewFormatCount == 0) {
+ /* If we specify no list of possible formats, we need to assume that
+ * every compatible format is possible and consider the features
+ * supported by each of them.
+ */
+ for (uint32_t fmt_arr_ind = 0;
+ fmt_arr_ind < ARRAY_SIZE(anv_formats);
+ ++fmt_arr_ind) {
+ for (uint32_t fmt_ind = 0;
+ fmt_ind < anv_formats[fmt_arr_ind].n_formats;
+ ++fmt_ind) {
+ const struct anv_format *possible_anv_format =
+ &(anv_formats[fmt_arr_ind].formats[fmt_ind]);
+
+ if (anv_formats_are_compatible(format, possible_anv_format,
+ devinfo, tiling,
+ allow_texel_compatible)) {
+ VkFormatFeatureFlags2 view_format_features =
+ anv_get_image_format_features2(physical_device,
+ possible_anv_format->vk_format,
+ possible_anv_format, tiling,
+ isl_mod_info);
+ all_formats_feature_flags |= view_format_features;
+ }
+ }
+ }
+ } else {
+ /* If we provide the list of possible formats, then check just them. */
+ for (uint32_t i = 0; i < format_list_info->viewFormatCount; ++i) {
+ VkFormat vk_view_format = format_list_info->pViewFormats[i];
+
+ if (vk_view_format == VK_FORMAT_UNDEFINED)
+ continue;
+
+ const struct anv_format *anv_view_format =
+ anv_get_format(vk_view_format);
+ VkFormatFeatureFlags2 view_format_features =
+ anv_get_image_format_features2(physical_device,
+ vk_view_format, anv_view_format,
+ tiling, isl_mod_info);
+ all_formats_feature_flags |= view_format_features;
+ }
+ }
+
+ return all_formats_feature_flags;
+}
+
+/* Supports opaque fd but not dma_buf. */
+static const VkExternalMemoryProperties opaque_fd_only_props = {
+ .externalMemoryFeatures =
+ VK_EXTERNAL_MEMORY_FEATURE_EXPORTABLE_BIT |
+ VK_EXTERNAL_MEMORY_FEATURE_IMPORTABLE_BIT,
+ .exportFromImportedHandleTypes =
+ VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT,
+ .compatibleHandleTypes =
+ VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT,
+};
+
+/* Supports opaque fd and dma_buf. */
+static const VkExternalMemoryProperties opaque_fd_dma_buf_props = {
+ .externalMemoryFeatures =
+ VK_EXTERNAL_MEMORY_FEATURE_EXPORTABLE_BIT |
+ VK_EXTERNAL_MEMORY_FEATURE_IMPORTABLE_BIT,
+ .exportFromImportedHandleTypes =
+ VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT |
+ VK_EXTERNAL_MEMORY_HANDLE_TYPE_DMA_BUF_BIT_EXT,
+ .compatibleHandleTypes =
+ VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT |
+ VK_EXTERNAL_MEMORY_HANDLE_TYPE_DMA_BUF_BIT_EXT,
+};
+
+static const VkExternalMemoryProperties userptr_props = {
+ .externalMemoryFeatures = VK_EXTERNAL_MEMORY_FEATURE_IMPORTABLE_BIT,
+ .exportFromImportedHandleTypes = 0,
+ .compatibleHandleTypes =
+ VK_EXTERNAL_MEMORY_HANDLE_TYPE_HOST_ALLOCATION_BIT_EXT,
+};
+
+static const VkExternalMemoryProperties android_buffer_props = {
+ .externalMemoryFeatures = VK_EXTERNAL_MEMORY_FEATURE_EXPORTABLE_BIT |
+ VK_EXTERNAL_MEMORY_FEATURE_IMPORTABLE_BIT,
+ .exportFromImportedHandleTypes =
+ VK_EXTERNAL_MEMORY_HANDLE_TYPE_ANDROID_HARDWARE_BUFFER_BIT_ANDROID,
+ .compatibleHandleTypes =
+ VK_EXTERNAL_MEMORY_HANDLE_TYPE_ANDROID_HARDWARE_BUFFER_BIT_ANDROID,
+};
+
+
+static const VkExternalMemoryProperties android_image_props = {
+ /* VK_EXTERNAL_MEMORY_FEATURE_EXPORTABLE_BIT will be set dynamically */
+ .externalMemoryFeatures = VK_EXTERNAL_MEMORY_FEATURE_IMPORTABLE_BIT |
+ VK_EXTERNAL_MEMORY_FEATURE_DEDICATED_ONLY_BIT,
+ .exportFromImportedHandleTypes =
+ VK_EXTERNAL_MEMORY_HANDLE_TYPE_ANDROID_HARDWARE_BUFFER_BIT_ANDROID,
+ .compatibleHandleTypes =
+ VK_EXTERNAL_MEMORY_HANDLE_TYPE_ANDROID_HARDWARE_BUFFER_BIT_ANDROID,
+};
+
static VkResult
anv_get_image_format_properties(
struct anv_physical_device *physical_device,
const VkPhysicalDeviceImageFormatInfo2 *info,
- VkImageFormatProperties *pImageFormatProperties,
- VkSamplerYcbcrConversionImageFormatProperties *pYcbcrImageFormatProperties)
+ VkImageFormatProperties2 *props)
{
- VkFormatFeatureFlags format_feature_flags;
+ VkFormatFeatureFlags2 format_feature_flags;
VkExtent3D maxExtent;
uint32_t maxMipLevels;
uint32_t maxArraySize;
VkSampleCountFlags sampleCounts;
- struct anv_instance *instance = physical_device->instance;
const struct intel_device_info *devinfo = &physical_device->info;
const struct anv_format *format = anv_get_format(info->format);
const struct isl_drm_modifier_info *isl_mod_info = NULL;
- const VkImageFormatListCreateInfo *format_list_info =
- vk_find_struct_const(info->pNext, IMAGE_FORMAT_LIST_CREATE_INFO);
+ const VkPhysicalDeviceImageDrmFormatModifierInfoEXT *modifier_info = NULL;
+ const VkImageFormatListCreateInfo *format_list_info = NULL;
+ const VkPhysicalDeviceExternalImageFormatInfo *external_info = NULL;
+ VkExternalImageFormatProperties *external_props = NULL;
+ VkSamplerYcbcrConversionImageFormatProperties *ycbcr_props = NULL;
+ VkAndroidHardwareBufferUsageANDROID *android_usage = NULL;
+ VkTextureLODGatherFormatPropertiesAMD *texture_lod_gather_props = NULL;
+ VkImageCompressionPropertiesEXT *comp_props = NULL;
+ bool from_wsi = false;
+
+ /* Extract input structs */
+ vk_foreach_struct_const(s, info->pNext) {
+ switch ((unsigned)s->sType) {
+ case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_EXTERNAL_IMAGE_FORMAT_INFO:
+ external_info = (const void *) s;
+ break;
+ case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_IMAGE_DRM_FORMAT_MODIFIER_INFO_EXT:
+ modifier_info = (const void *)s;
+ break;
+ case VK_STRUCTURE_TYPE_IMAGE_FORMAT_LIST_CREATE_INFO:
+ format_list_info = (const void *)s;
+ break;
+ case VK_STRUCTURE_TYPE_IMAGE_STENCIL_USAGE_CREATE_INFO:
+ /* Ignore but don't warn */
+ break;
+ case VK_STRUCTURE_TYPE_WSI_IMAGE_CREATE_INFO_MESA:
+ from_wsi = true;
+ break;
+ case VK_STRUCTURE_TYPE_VIDEO_PROFILE_LIST_INFO_KHR:
+ /* Ignore but don't warn */
+ break;
+ case VK_STRUCTURE_TYPE_IMAGE_COMPRESSION_CONTROL_EXT:
+ /* Ignore but don't warn */
+ break;
+ default:
+ anv_debug_ignored_stype(s->sType);
+ break;
+ }
+ }
+
+ /* Extract output structs */
+ vk_foreach_struct(s, props->pNext) {
+ switch (s->sType) {
+ case VK_STRUCTURE_TYPE_EXTERNAL_IMAGE_FORMAT_PROPERTIES:
+ external_props = (void *) s;
+ break;
+ case VK_STRUCTURE_TYPE_SAMPLER_YCBCR_CONVERSION_IMAGE_FORMAT_PROPERTIES:
+ ycbcr_props = (void *) s;
+ break;
+ case VK_STRUCTURE_TYPE_ANDROID_HARDWARE_BUFFER_USAGE_ANDROID:
+ android_usage = (void *) s;
+ break;
+ case VK_STRUCTURE_TYPE_TEXTURE_LOD_GATHER_FORMAT_PROPERTIES_AMD:
+ texture_lod_gather_props = (void *) s;
+ break;
+ case VK_STRUCTURE_TYPE_IMAGE_COMPRESSION_PROPERTIES_EXT:
+ comp_props = (void *) s;
+ break;
+ default:
+ anv_debug_ignored_stype(s->sType);
+ break;
+ }
+ }
if (format == NULL)
goto unsupported;
if (info->tiling == VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT) {
- const VkPhysicalDeviceImageDrmFormatModifierInfoEXT *vk_mod_info =
- vk_find_struct_const(info->pNext, PHYSICAL_DEVICE_IMAGE_DRM_FORMAT_MODIFIER_INFO_EXT);
-
- isl_mod_info = isl_drm_modifier_get_info(vk_mod_info->drmFormatModifier);
+ isl_mod_info = isl_drm_modifier_get_info(modifier_info->drmFormatModifier);
if (isl_mod_info == NULL)
goto unsupported;
- }
- assert(format->vk_format == info->format);
- format_feature_flags = anv_get_image_format_features(devinfo, info->format,
- format, info->tiling,
- isl_mod_info);
-
- /* Remove the VkFormatFeatureFlags that are incompatible with any declared
- * image view format. (Removals are more likely to occur when a DRM format
- * modifier is present).
- */
- if ((info->flags & VK_IMAGE_CREATE_MUTABLE_FORMAT_BIT) && format_list_info) {
- for (uint32_t i = 0; i < format_list_info->viewFormatCount; ++i) {
- VkFormat vk_view_format = format_list_info->pViewFormats[i];
- const struct anv_format *anv_view_format = anv_get_format(vk_view_format);
- VkFormatFeatureFlags view_format_features =
- anv_get_image_format_features(devinfo, vk_view_format,
- anv_view_format,
- info->tiling,
- isl_mod_info);
- format_feature_flags &= view_format_features;
+ /* only allow Y-tiling/Tile4 for video decode. */
+ if (info->usage & VK_IMAGE_USAGE_VIDEO_DECODE_DST_BIT_KHR) {
+ if (isl_mod_info->tiling != ISL_TILING_Y0 && isl_mod_info->tiling != ISL_TILING_4)
+ goto unsupported;
}
}
- if (!format_feature_flags)
- goto unsupported;
+ assert(format->vk_format == info->format);
switch (info->type) {
default:
@@ -999,29 +1392,90 @@ anv_get_image_format_properties(
maxExtent.width = 2048;
maxExtent.height = 2048;
maxExtent.depth = 2048;
- /* Prior to SKL, the mipmaps for 3D surfaces are laid out in a way
- * that make it impossible to represent in the way that
- * VkSubresourceLayout expects. Since we can't tell users how to make
- * sense of them, don't report them as available.
- */
- if (devinfo->ver < 9 && info->tiling == VK_IMAGE_TILING_LINEAR)
- maxMipLevels = 1;
- else
- maxMipLevels = 12; /* log2(maxWidth) + 1 */
+ maxMipLevels = 12; /* log2(maxWidth) + 1 */
maxArraySize = 1;
sampleCounts = VK_SAMPLE_COUNT_1_BIT;
break;
}
+   /* If any of the formats in VkImageFormatListCreateInfo is completely
+ * unsupported, report unsupported.
+ */
+ if ((info->flags & VK_IMAGE_CREATE_MUTABLE_FORMAT_BIT) &&
+ format_list_info != NULL) {
+ for (uint32_t i = 0; i < format_list_info->viewFormatCount; i++) {
+ const struct anv_format *view_format =
+ anv_get_format(format_list_info->pViewFormats[i]);
+ if (view_format == NULL)
+ goto unsupported;
+ }
+ }
+
+ /* From the Vulkan 1.3.218 spec:
+ *
+ * "For images created without VK_IMAGE_CREATE_EXTENDED_USAGE_BIT a usage
+ * bit is valid if it is supported for the format the image is created with.
+ * For images created with VK_IMAGE_CREATE_EXTENDED_USAGE_BIT a usage bit
+ * is valid if it is supported for at least one of the formats
+ * a VkImageView created from the image can have."
+ *
+ * "VK_IMAGE_CREATE_MUTABLE_FORMAT_BIT specifies that the image can be
+ * used to create a VkImageView with a different format from the image."
+ *
+ * So, if both VK_IMAGE_CREATE_EXTENDED_USAGE_BIT and
+ * VK_IMAGE_CREATE_MUTABLE_FORMAT_BIT are set, views can be created with
+ * different usage than the image, so we can't always filter on usage.
+ * There is one exception to this below for storage.
+ */
+ format_feature_flags = anv_get_image_format_features2(physical_device,
+ info->format, format,
+ info->tiling,
+ isl_mod_info);
+
+ if (!anv_format_supports_usage(format_feature_flags, info->usage)) {
+      /* If the image format itself does not support the usage, and we don't
+       * allow view formats to support it, we can't support this usage at all.
+ */
+ if (!(info->flags & VK_IMAGE_CREATE_MUTABLE_FORMAT_BIT) ||
+ !(info->flags & VK_IMAGE_CREATE_EXTENDED_USAGE_BIT))
+ goto unsupported;
+
+ /* We don't want emulated formats to gain unexpected usage (storage in
+       * particular) from their compatible view formats.
+ */
+ if (anv_is_format_emulated(physical_device, info->format))
+ goto unsupported;
+
+ /* From the Vulkan 1.3.224 spec "43.1.6. Format Compatibility Classes":
+ *
+ * "Each depth/stencil format is only compatible with itself."
+ *
+ * So, other formats also can't help.
+ */
+ if (vk_format_is_depth_or_stencil(info->format))
+ goto unsupported;
+
+ /* Gather all possible format feature flags for the formats listed in
+ * the format list or all the compatible formats.
+ */
+ VkFormatFeatureFlags2 all_formats_feature_flags = format_feature_flags |
+ anv_formats_gather_format_features(physical_device, format,
+ info->tiling, isl_mod_info,
+ format_list_info,
+ info->flags & VK_IMAGE_CREATE_BLOCK_TEXEL_VIEW_COMPATIBLE_BIT);
+
+ if (!anv_format_supports_usage(all_formats_feature_flags, info->usage))
+ goto unsupported;
+ }
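   /* Illustrative example (not part of the patch): a VK_FORMAT_R8G8B8A8_SRGB
    * image created with MUTABLE_FORMAT | EXTENDED_USAGE and STORAGE usage can
    * still be accepted here, because a compatible view format such as
    * VK_FORMAT_R8G8B8A8_UNORM contributes
    * VK_FORMAT_FEATURE_2_STORAGE_IMAGE_BIT to all_formats_feature_flags even
    * though the SRGB format itself does not.
    */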
+
if (info->tiling == VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT) {
/* We support modifiers only for "simple" (that is, non-array
* non-mipmapped single-sample) 2D images.
*/
if (info->type != VK_IMAGE_TYPE_2D) {
- vk_errorfi(instance, &physical_device->vk.base,
- VK_ERROR_FORMAT_NOT_SUPPORTED,
- "VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT "
- "requires VK_IMAGE_TYPE_2D");
+ vk_errorf(physical_device, VK_ERROR_FORMAT_NOT_SUPPORTED,
+ "VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT "
+ "requires VK_IMAGE_TYPE_2D");
goto unsupported;
}
@@ -1029,9 +1483,10 @@ anv_get_image_format_properties(
maxMipLevels = 1;
sampleCounts = VK_SAMPLE_COUNT_1_BIT;
- if (isl_mod_info->aux_usage == ISL_AUX_USAGE_CCS_E &&
+ if (isl_drm_modifier_has_aux(isl_mod_info->modifier) &&
!anv_formats_ccs_e_compatible(devinfo, info->flags, info->format,
- info->tiling, format_list_info)) {
+ info->tiling, info->usage,
+ format_list_info)) {
goto unsupported;
}
}
@@ -1049,45 +1504,23 @@ anv_get_image_format_properties(
if (info->tiling == VK_IMAGE_TILING_OPTIMAL &&
info->type == VK_IMAGE_TYPE_2D &&
- (format_feature_flags & (VK_FORMAT_FEATURE_COLOR_ATTACHMENT_BIT |
- VK_FORMAT_FEATURE_DEPTH_STENCIL_ATTACHMENT_BIT)) &&
+ (format_feature_flags & (VK_FORMAT_FEATURE_2_COLOR_ATTACHMENT_BIT |
+ VK_FORMAT_FEATURE_2_DEPTH_STENCIL_ATTACHMENT_BIT)) &&
!(info->flags & VK_IMAGE_CREATE_CUBE_COMPATIBLE_BIT) &&
- !(info->usage & VK_IMAGE_USAGE_STORAGE_BIT)) {
+ !(info->usage & VK_IMAGE_USAGE_STORAGE_BIT) &&
+ isl_format_supports_multisampling(devinfo, format->planes[0].isl_format)) {
sampleCounts = isl_device_get_sample_counts(&physical_device->isl_dev);
}
- if (info->usage & (VK_IMAGE_USAGE_TRANSFER_SRC_BIT |
- VK_IMAGE_USAGE_TRANSFER_DST_BIT)) {
- /* Accept transfers on anything we can sample from or renderer to. */
- if (!(format_feature_flags & (VK_FORMAT_FEATURE_COLOR_ATTACHMENT_BIT |
- VK_FORMAT_FEATURE_DEPTH_STENCIL_ATTACHMENT_BIT |
- VK_FORMAT_FEATURE_SAMPLED_IMAGE_BIT))) {
- goto unsupported;
- }
- }
-
- if (info->usage & VK_IMAGE_USAGE_SAMPLED_BIT) {
- if (!(format_feature_flags & VK_FORMAT_FEATURE_SAMPLED_IMAGE_BIT)) {
- goto unsupported;
- }
- }
-
if (info->usage & VK_IMAGE_USAGE_STORAGE_BIT) {
- if (!(format_feature_flags & VK_FORMAT_FEATURE_STORAGE_IMAGE_BIT)) {
- goto unsupported;
- }
- }
-
- if (info->usage & VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT) {
- if (!(format_feature_flags & VK_FORMAT_FEATURE_COLOR_ATTACHMENT_BIT)) {
- goto unsupported;
- }
- }
-
- if (info->usage & VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT) {
- if (!(format_feature_flags & VK_FORMAT_FEATURE_DEPTH_STENCIL_ATTACHMENT_BIT)) {
+ /* Non-power-of-two formats can never be used as storage images. We
+ * only check plane 0 because there are no YCbCr formats with
+ * non-power-of-two planes.
+ */
+ const struct isl_format_layout *isl_layout =
+ isl_format_get_layout(format->planes[0].isl_format);
+ if (!util_is_power_of_two_or_zero(isl_layout->bpb))
goto unsupported;
- }
}
if (info->flags & VK_IMAGE_CREATE_DISJOINT_BIT) {
@@ -1095,11 +1528,11 @@ anv_get_image_format_properties(
*
* If format is a multi-planar format, and if imageCreateFormatFeatures
* (as defined in Image Creation Limits) does not contain
- * VK_FORMAT_FEATURE_DISJOINT_BIT, then flags must not contain
+ * VK_FORMAT_FEATURE_2_DISJOINT_BIT, then flags must not contain
* VK_IMAGE_CREATE_DISJOINT_BIT.
*/
if (format->n_planes > 1 &&
- !(format_feature_flags & VK_FORMAT_FEATURE_DISJOINT_BIT)) {
+ !(format_feature_flags & VK_FORMAT_FEATURE_2_DISJOINT_BIT)) {
goto unsupported;
}
@@ -1115,7 +1548,7 @@ anv_get_image_format_properties(
}
if (info->tiling == VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT &&
- isl_mod_info->aux_usage != ISL_AUX_USAGE_NONE) {
+ isl_drm_modifier_has_aux(isl_mod_info->modifier)) {
      /* Reject DISJOINT for consistency with the GL driver. In
* eglCreateImage, we require that the dma_buf for the primary surface
* and the dma_buf for its aux surface refer to the same bo.
@@ -1124,7 +1557,7 @@ anv_get_image_format_properties(
}
}
- if (info->flags & VK_IMAGE_CREATE_ALIAS_BIT) {
+ if ((info->flags & VK_IMAGE_CREATE_ALIAS_BIT) && !from_wsi) {
/* Reject aliasing of images with non-linear DRM format modifiers because:
*
* 1. For modifiers with compression, we store aux tracking state in
@@ -1134,6 +1567,9 @@ anv_get_image_format_properties(
* 2. For tiled modifiers without compression, we may attempt to compress
* them behind the scenes, in which case both the aux tracking state
* and the CCS data are bound to ANV_IMAGE_MEMORY_BINDING_PRIVATE.
+ *
+ * 3. For WSI we should ignore ALIAS_BIT because we have the ability to
+ * bind the ANV_MEMORY_BINDING_PRIVATE from the other WSI image.
*/
if (info->tiling == VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT &&
isl_mod_info->modifier != DRM_FORMAT_MOD_LINEAR) {
@@ -1141,30 +1577,20 @@ anv_get_image_format_properties(
}
}
- if (info->usage & VK_IMAGE_USAGE_TRANSIENT_ATTACHMENT_BIT) {
- /* Nothing to check. */
- }
-
- if (info->usage & VK_IMAGE_USAGE_INPUT_ATTACHMENT_BIT) {
- /* Ignore this flag because it was removed from the
- * provisional_I_20150910 header.
- */
- }
+ if ((info->usage & VK_IMAGE_USAGE_FRAGMENT_SHADING_RATE_ATTACHMENT_BIT_KHR) &&
+ !devinfo->has_coarse_pixel_primitive_and_cb)
+ goto unsupported;
/* From the bspec section entitled "Surface Layout and Tiling",
- * pre-gfx9 has a 2 GB limitation of the size in bytes,
- * gfx9 and gfx10 have a 256 GB limitation and gfx11+
- * has a 16 TB limitation.
+ * Gfx9 has a 256 GB limitation and Gfx11+ has a 16 TB limitation.
*/
uint64_t maxResourceSize = 0;
- if (devinfo->ver < 9)
- maxResourceSize = (uint64_t) 1 << 31;
- else if (devinfo->ver < 11)
+ if (devinfo->ver < 11)
maxResourceSize = (uint64_t) 1 << 38;
else
maxResourceSize = (uint64_t) 1 << 44;
- *pImageFormatProperties = (VkImageFormatProperties) {
+ props->imageFormatProperties = (VkImageFormatProperties) {
.maxExtent = maxExtent,
.maxMipLevels = maxMipLevels,
.maxArrayLayers = maxArraySize,
@@ -1176,166 +1602,23 @@ anv_get_image_format_properties(
.maxResourceSize = maxResourceSize,
};
- if (pYcbcrImageFormatProperties) {
- pYcbcrImageFormatProperties->combinedImageSamplerDescriptorCount =
- format->n_planes;
- }
-
- return VK_SUCCESS;
-
-unsupported:
- *pImageFormatProperties = (VkImageFormatProperties) {
- .maxExtent = { 0, 0, 0 },
- .maxMipLevels = 0,
- .maxArrayLayers = 0,
- .sampleCounts = 0,
- .maxResourceSize = 0,
- };
-
- return VK_ERROR_FORMAT_NOT_SUPPORTED;
-}
-
-VkResult anv_GetPhysicalDeviceImageFormatProperties(
- VkPhysicalDevice physicalDevice,
- VkFormat format,
- VkImageType type,
- VkImageTiling tiling,
- VkImageUsageFlags usage,
- VkImageCreateFlags createFlags,
- VkImageFormatProperties* pImageFormatProperties)
-{
- ANV_FROM_HANDLE(anv_physical_device, physical_device, physicalDevice);
-
- const VkPhysicalDeviceImageFormatInfo2 info = {
- .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_IMAGE_FORMAT_INFO_2,
- .pNext = NULL,
- .format = format,
- .type = type,
- .tiling = tiling,
- .usage = usage,
- .flags = createFlags,
- };
-
- return anv_get_image_format_properties(physical_device, &info,
- pImageFormatProperties, NULL);
-}
-
-
-/* Supports opaque fd but not dma_buf. */
-static const VkExternalMemoryProperties opaque_fd_only_props = {
- .externalMemoryFeatures =
- VK_EXTERNAL_MEMORY_FEATURE_EXPORTABLE_BIT |
- VK_EXTERNAL_MEMORY_FEATURE_IMPORTABLE_BIT,
- .exportFromImportedHandleTypes =
- VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT,
- .compatibleHandleTypes =
- VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT,
-};
-
-/* Supports opaque fd and dma_buf. */
-static const VkExternalMemoryProperties opaque_fd_dma_buf_props = {
- .externalMemoryFeatures =
- VK_EXTERNAL_MEMORY_FEATURE_EXPORTABLE_BIT |
- VK_EXTERNAL_MEMORY_FEATURE_IMPORTABLE_BIT,
- .exportFromImportedHandleTypes =
- VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT |
- VK_EXTERNAL_MEMORY_HANDLE_TYPE_DMA_BUF_BIT_EXT,
- .compatibleHandleTypes =
- VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT |
- VK_EXTERNAL_MEMORY_HANDLE_TYPE_DMA_BUF_BIT_EXT,
-};
-
-static const VkExternalMemoryProperties userptr_props = {
- .externalMemoryFeatures = VK_EXTERNAL_MEMORY_FEATURE_IMPORTABLE_BIT,
- .exportFromImportedHandleTypes = 0,
- .compatibleHandleTypes =
- VK_EXTERNAL_MEMORY_HANDLE_TYPE_HOST_ALLOCATION_BIT_EXT,
-};
-
-static const VkExternalMemoryProperties android_buffer_props = {
- .externalMemoryFeatures = VK_EXTERNAL_MEMORY_FEATURE_EXPORTABLE_BIT |
- VK_EXTERNAL_MEMORY_FEATURE_IMPORTABLE_BIT,
- .exportFromImportedHandleTypes =
- VK_EXTERNAL_MEMORY_HANDLE_TYPE_ANDROID_HARDWARE_BUFFER_BIT_ANDROID,
- .compatibleHandleTypes =
- VK_EXTERNAL_MEMORY_HANDLE_TYPE_ANDROID_HARDWARE_BUFFER_BIT_ANDROID,
-};
-
-
-static const VkExternalMemoryProperties android_image_props = {
- .externalMemoryFeatures = VK_EXTERNAL_MEMORY_FEATURE_EXPORTABLE_BIT |
- VK_EXTERNAL_MEMORY_FEATURE_IMPORTABLE_BIT |
- VK_EXTERNAL_MEMORY_FEATURE_DEDICATED_ONLY_BIT,
- .exportFromImportedHandleTypes =
- VK_EXTERNAL_MEMORY_HANDLE_TYPE_ANDROID_HARDWARE_BUFFER_BIT_ANDROID,
- .compatibleHandleTypes =
- VK_EXTERNAL_MEMORY_HANDLE_TYPE_ANDROID_HARDWARE_BUFFER_BIT_ANDROID,
-};
-
-VkResult anv_GetPhysicalDeviceImageFormatProperties2(
- VkPhysicalDevice physicalDevice,
- const VkPhysicalDeviceImageFormatInfo2* base_info,
- VkImageFormatProperties2* base_props)
-{
- ANV_FROM_HANDLE(anv_physical_device, physical_device, physicalDevice);
- struct anv_instance *instance = physical_device->instance;
- const VkPhysicalDeviceExternalImageFormatInfo *external_info = NULL;
- VkExternalImageFormatProperties *external_props = NULL;
- VkSamplerYcbcrConversionImageFormatProperties *ycbcr_props = NULL;
- VkAndroidHardwareBufferUsageANDROID *android_usage = NULL;
- VkResult result;
-
- /* Extract input structs */
- vk_foreach_struct_const(s, base_info->pNext) {
- switch (s->sType) {
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_EXTERNAL_IMAGE_FORMAT_INFO:
- external_info = (const void *) s;
- break;
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_IMAGE_DRM_FORMAT_MODIFIER_INFO_EXT:
- /* anv_get_image_format_properties will handle this */
- break;
- case VK_STRUCTURE_TYPE_IMAGE_STENCIL_USAGE_CREATE_INFO_EXT:
- /* Ignore but don't warn */
- break;
- default:
- anv_debug_ignored_stype(s->sType);
- break;
- }
- }
+ if (ycbcr_props)
+ ycbcr_props->combinedImageSamplerDescriptorCount = format->n_planes;
- /* Extract output structs */
- vk_foreach_struct(s, base_props->pNext) {
- switch (s->sType) {
- case VK_STRUCTURE_TYPE_EXTERNAL_IMAGE_FORMAT_PROPERTIES:
- external_props = (void *) s;
- break;
- case VK_STRUCTURE_TYPE_SAMPLER_YCBCR_CONVERSION_IMAGE_FORMAT_PROPERTIES:
- ycbcr_props = (void *) s;
- break;
- case VK_STRUCTURE_TYPE_ANDROID_HARDWARE_BUFFER_USAGE_ANDROID:
- android_usage = (void *) s;
- break;
- default:
- anv_debug_ignored_stype(s->sType);
- break;
- }
+ if (texture_lod_gather_props) {
+ texture_lod_gather_props->supportsTextureGatherLODBiasAMD =
+ physical_device->info.ver >= 20;
}
- result = anv_get_image_format_properties(physical_device, base_info,
- &base_props->imageFormatProperties, ycbcr_props);
- if (result != VK_SUCCESS)
- goto fail;
-
bool ahw_supported =
physical_device->vk.supported_extensions.ANDROID_external_memory_android_hardware_buffer;
if (ahw_supported && android_usage) {
android_usage->androidHardwareBufferUsage =
- anv_ahw_usage_from_vk_usage(base_info->flags,
- base_info->usage);
+ vk_image_usage_to_ahb_usage(info->flags, info->usage);
/* Limit maxArrayLayers to 1 for AHardwareBuffer based images for now. */
- base_props->imageFormatProperties.maxArrayLayers = 1;
+ props->imageFormatProperties.maxArrayLayers = 1;
}
/* From the Vulkan 1.0.42 spec:
@@ -1350,7 +1633,7 @@ VkResult anv_GetPhysicalDeviceImageFormatProperties2(
*/
bool tiling_has_explicit_layout;
- switch (base_info->tiling) {
+ switch (info->tiling) {
default:
unreachable("bad VkImageTiling");
case VK_IMAGE_TILING_LINEAR:
@@ -1379,12 +1662,12 @@ VkResult anv_GetPhysicalDeviceImageFormatProperties2(
* method exists, then we reject image creation here.
*
* If the memory handle requires matching
- * VkPhysicalDeviceIDPropertiesKHR::driverUUID and ::deviceUUID, then the
+ * VkPhysicalDeviceIDProperties::driverUUID and ::deviceUUID, then the
* match-requirement guarantees that all users of the image agree on the
* image's memory layout.
*
* If the memory handle does not require matching
- * VkPhysicalDeviceIDPropertiesKHR::driverUUID nor ::deviceUUID, then we
+ * VkPhysicalDeviceIDProperties::driverUUID nor ::deviceUUID, then we
* require that the app and driver be able to explicitly communicate to
* each other the image's memory layout.
*
@@ -1414,12 +1697,11 @@ VkResult anv_GetPhysicalDeviceImageFormatProperties2(
* and therefore requires explicit memory layout.
*/
if (!tiling_has_explicit_layout) {
- result = vk_errorfi(instance, &physical_device->vk.base,
- VK_ERROR_FORMAT_NOT_SUPPORTED,
- "VK_EXTERNAL_MEMORY_HANDLE_TYPE_DMA_BUF_BIT_EXT "
- "requires VK_IMAGE_TILING_LINEAR or "
- "VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT");
- goto fail;
+ vk_errorf(physical_device, VK_ERROR_FORMAT_NOT_SUPPORTED,
+ "VK_EXTERNAL_MEMORY_HANDLE_TYPE_DMA_BUF_BIT_EXT "
+ "requires VK_IMAGE_TILING_LINEAR or "
+ "VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT");
+ goto unsupported;
}
/* With an explicit memory layout, we don't care which type of fd
@@ -1434,12 +1716,11 @@ VkResult anv_GetPhysicalDeviceImageFormatProperties2(
* and therefore requires explicit memory layout.
*/
if (!tiling_has_explicit_layout) {
- result = vk_errorfi(instance, &physical_device->vk.base,
- VK_ERROR_FORMAT_NOT_SUPPORTED,
- "VK_EXTERNAL_MEMORY_HANDLE_TYPE_HOST_ALLOCATION_BIT_EXT "
- "requires VK_IMAGE_TILING_LINEAR or "
- "VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT");
- goto fail;
+ vk_errorf(physical_device, VK_ERROR_FORMAT_NOT_SUPPORTED,
+ "VK_EXTERNAL_MEMORY_HANDLE_TYPE_HOST_ALLOCATION_BIT_EXT "
+ "requires VK_IMAGE_TILING_LINEAR or "
+ "VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT");
+ goto unsupported;
}
if (external_props)
@@ -1451,8 +1732,14 @@ VkResult anv_GetPhysicalDeviceImageFormatProperties2(
* requires support for VK_IMAGE_TILING_OPTIMAL. Android systems
* communicate the image's memory layout through backdoor channels.
*/
- if (ahw_supported && external_props) {
- external_props->externalMemoryProperties = android_image_props;
+ if (ahw_supported) {
+ if (external_props) {
+ external_props->externalMemoryProperties = android_image_props;
+ if (anv_ahb_format_for_vk_format(info->format)) {
+ external_props->externalMemoryProperties.externalMemoryFeatures |=
+ VK_EXTERNAL_MEMORY_FEATURE_EXPORTABLE_BIT;
+ }
+ }
break;
}
FALLTHROUGH; /* If ahw not supported */
@@ -1464,43 +1751,56 @@ VkResult anv_GetPhysicalDeviceImageFormatProperties2(
* vkGetPhysicalDeviceImageFormatProperties2 returns
* VK_ERROR_FORMAT_NOT_SUPPORTED.
*/
- result = vk_errorfi(instance, &physical_device->vk.base,
- VK_ERROR_FORMAT_NOT_SUPPORTED,
- "unsupported VkExternalMemoryTypeFlagBits 0x%x",
- external_info->handleType);
- goto fail;
+ vk_errorf(physical_device, VK_ERROR_FORMAT_NOT_SUPPORTED,
+ "unsupported VkExternalMemoryTypeFlagBits 0x%x",
+ external_info->handleType);
+ goto unsupported;
}
}
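+   /* VK_EXT_image_compression_control: lossless CCS compression is reported
+    * as the default whenever the format/usage combination supports it;
+    * fixed-rate compression is never advertised.
+    */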
+ if (comp_props) {
+ bool ccs_supported =
+ anv_formats_ccs_e_compatible(devinfo, info->flags, info->format,
+ info->tiling, info->usage,
+ format_list_info);
+ comp_props->imageCompressionFixedRateFlags =
+ VK_IMAGE_COMPRESSION_FIXED_RATE_NONE_EXT;
+ comp_props->imageCompressionFlags = ccs_supported ?
+ VK_IMAGE_COMPRESSION_DEFAULT_EXT :
+ VK_IMAGE_COMPRESSION_DISABLED_EXT;
+ }
+
return VK_SUCCESS;
- fail:
- if (result == VK_ERROR_FORMAT_NOT_SUPPORTED) {
- /* From the Vulkan 1.0.42 spec:
- *
- * If the combination of parameters to
- * vkGetPhysicalDeviceImageFormatProperties2 is not supported by
- * the implementation for use in vkCreateImage, then all members of
- * imageFormatProperties will be filled with zero.
- */
- base_props->imageFormatProperties = (VkImageFormatProperties) {};
- }
+unsupported:
+ /* From the Vulkan 1.0.42 spec:
+ *
+ * If the combination of parameters to
+ * vkGetPhysicalDeviceImageFormatProperties2 is not supported by the
+ * implementation for use in vkCreateImage, then all members of
+ * imageFormatProperties will be filled with zero.
+ */
+ props->imageFormatProperties = (VkImageFormatProperties) {
+ .maxExtent = { 0, 0, 0 },
+ .maxMipLevels = 0,
+ .maxArrayLayers = 0,
+ .sampleCounts = 0,
+ .maxResourceSize = 0,
+ };
- return result;
+ return VK_ERROR_FORMAT_NOT_SUPPORTED;
}
-void anv_GetPhysicalDeviceSparseImageFormatProperties(
+VkResult anv_GetPhysicalDeviceImageFormatProperties2(
VkPhysicalDevice physicalDevice,
- VkFormat format,
- VkImageType type,
- uint32_t samples,
- VkImageUsageFlags usage,
- VkImageTiling tiling,
- uint32_t* pNumProperties,
- VkSparseImageFormatProperties* pProperties)
+ const VkPhysicalDeviceImageFormatInfo2* pImageFormatInfo,
+ VkImageFormatProperties2* pImageFormatProperties)
{
- /* Sparse images are not yet supported. */
- *pNumProperties = 0;
+ ANV_FROM_HANDLE(anv_physical_device, physical_device, physicalDevice);
+
+ return anv_get_image_format_properties(physical_device,
+ pImageFormatInfo,
+ pImageFormatProperties);
}
void anv_GetPhysicalDeviceSparseImageFormatProperties2(
@@ -1509,8 +1809,125 @@ void anv_GetPhysicalDeviceSparseImageFormatProperties2(
uint32_t* pPropertyCount,
VkSparseImageFormatProperties2* pProperties)
{
- /* Sparse images are not yet supported. */
- *pPropertyCount = 0;
+ ANV_FROM_HANDLE(anv_physical_device, physical_device, physicalDevice);
+ const struct intel_device_info *devinfo = &physical_device->info;
+ VkImageAspectFlags aspects = vk_format_aspects(pFormatInfo->format);
+ VK_OUTARRAY_MAKE_TYPED(VkSparseImageFormatProperties2, props,
+ pProperties, pPropertyCount);
+
+ if (physical_device->sparse_type == ANV_SPARSE_TYPE_NOT_SUPPORTED) {
+ if (INTEL_DEBUG(DEBUG_SPARSE))
+ fprintf(stderr, "=== [%s:%d] [%s]\n", __FILE__, __LINE__, __func__);
+ return;
+ }
+
+ vk_foreach_struct_const(ext, pFormatInfo->pNext)
+ anv_debug_ignored_stype(ext->sType);
+
+ /* Check if the image is supported at all (regardless of being Sparse). */
+ const VkPhysicalDeviceImageFormatInfo2 img_info = {
+ .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_IMAGE_FORMAT_INFO_2,
+ .pNext = NULL,
+ .format = pFormatInfo->format,
+ .type = pFormatInfo->type,
+ .tiling = pFormatInfo->tiling,
+ .usage = pFormatInfo->usage,
+ .flags = VK_IMAGE_CREATE_SPARSE_BINDING_BIT |
+ VK_IMAGE_CREATE_SPARSE_RESIDENCY_BIT,
+ };
+ VkImageFormatProperties2 img_props = {};
+ if (anv_get_image_format_properties(physical_device,
+ &img_info, &img_props) != VK_SUCCESS)
+ return;
+
+ if (anv_sparse_image_check_support(physical_device,
+ VK_IMAGE_CREATE_SPARSE_BINDING_BIT |
+ VK_IMAGE_CREATE_SPARSE_RESIDENCY_BIT,
+ pFormatInfo->tiling,
+ pFormatInfo->samples,
+ pFormatInfo->type,
+ pFormatInfo->format) != VK_SUCCESS) {
+ return;
+ }
+
+ VkExtent3D ds_granularity = {};
+ VkSparseImageFormatProperties2 *ds_props_ptr = NULL;
+
+ u_foreach_bit(b, aspects) {
+ VkImageAspectFlagBits aspect = 1 << b;
+
+ const uint32_t plane =
+ anv_aspect_to_plane(vk_format_aspects(pFormatInfo->format), aspect);
+ struct anv_format_plane anv_format_plane =
+ anv_get_format_plane(devinfo, pFormatInfo->format, plane,
+ pFormatInfo->tiling);
+ enum isl_format isl_format = anv_format_plane.isl_format;
+ assert(isl_format != ISL_FORMAT_UNSUPPORTED);
+
+ VkImageCreateFlags vk_create_flags =
+ VK_IMAGE_CREATE_SPARSE_BINDING_BIT |
+ VK_IMAGE_CREATE_SPARSE_RESIDENCY_BIT;
+
+ isl_surf_usage_flags_t isl_usage =
+ anv_image_choose_isl_surf_usage(physical_device,
+ vk_create_flags, pFormatInfo->usage,
+ 0, aspect,
+ VK_IMAGE_COMPRESSION_DEFAULT_EXT);
+
+ const enum isl_surf_dim isl_surf_dim =
+ pFormatInfo->type == VK_IMAGE_TYPE_1D ? ISL_SURF_DIM_1D :
+ pFormatInfo->type == VK_IMAGE_TYPE_2D ? ISL_SURF_DIM_2D :
+ ISL_SURF_DIM_3D;
+
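+      /* A 1x1 single-level surface is enough here: the sparse granularity
+       * computed below depends on the format, image type and sample count,
+       * not on the actual extent of the image.
+       */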
+ struct isl_surf isl_surf;
+ bool ok = isl_surf_init(&physical_device->isl_dev, &isl_surf,
+ .dim = isl_surf_dim,
+ .format = isl_format,
+ .width = 1,
+ .height = 1,
+ .depth = 1,
+ .levels = 1,
+ .array_len = 1,
+ .samples = pFormatInfo->samples,
+ .min_alignment_B = 0,
+ .row_pitch_B = 0,
+ .usage = isl_usage,
+ .tiling_flags = ISL_TILING_ANY_MASK);
+ if (!ok) {
+ /* There's no way to return an error code! */
+ assert(false);
+ *pPropertyCount = 0;
+ return;
+ }
+
+ VkSparseImageFormatProperties format_props =
+ anv_sparse_calc_image_format_properties(physical_device, aspect,
+ pFormatInfo->type,
+ &isl_surf);
+
+      /* If the depth and stencil aspects have the same granularity, merge
+       * them into a single entry.
+       */
+ if (aspect & (VK_IMAGE_ASPECT_DEPTH_BIT |
+ VK_IMAGE_ASPECT_STENCIL_BIT)) {
+ if (!ds_props_ptr) {
+ ds_granularity = format_props.imageGranularity;
+ } else if (ds_granularity.width ==
+ format_props.imageGranularity.width &&
+ ds_granularity.height ==
+ format_props.imageGranularity.height &&
+ ds_granularity.depth ==
+ format_props.imageGranularity.depth) {
+ ds_props_ptr->properties.aspectMask |= aspect;
+ continue;
+ }
+ }
+
+ vk_outarray_append_typed(VkSparseImageFormatProperties2, &props, p) {
+ p->properties = format_props;
+ if (aspect & (VK_IMAGE_ASPECT_DEPTH_BIT |
+ VK_IMAGE_ASPECT_STENCIL_BIT))
+ ds_props_ptr = p;
+ }
+ }
}
void anv_GetPhysicalDeviceExternalBufferProperties(
@@ -1563,82 +1980,3 @@ void anv_GetPhysicalDeviceExternalBufferProperties(
.compatibleHandleTypes = pExternalBufferInfo->handleType,
};
}
-
-VkResult anv_CreateSamplerYcbcrConversion(
- VkDevice _device,
- const VkSamplerYcbcrConversionCreateInfo* pCreateInfo,
- const VkAllocationCallbacks* pAllocator,
- VkSamplerYcbcrConversion* pYcbcrConversion)
-{
- ANV_FROM_HANDLE(anv_device, device, _device);
- struct anv_ycbcr_conversion *conversion;
-
- /* Search for VkExternalFormatANDROID and resolve the format. */
- struct anv_format *ext_format = NULL;
- const VkExternalFormatANDROID *ext_info =
- vk_find_struct_const(pCreateInfo->pNext, EXTERNAL_FORMAT_ANDROID);
-
- uint64_t format = ext_info ? ext_info->externalFormat : 0;
- if (format) {
- assert(pCreateInfo->format == VK_FORMAT_UNDEFINED);
- ext_format = (struct anv_format *) (uintptr_t) format;
- }
-
- assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_SAMPLER_YCBCR_CONVERSION_CREATE_INFO);
-
- conversion = vk_object_zalloc(&device->vk, pAllocator, sizeof(*conversion),
- VK_OBJECT_TYPE_SAMPLER_YCBCR_CONVERSION);
- if (!conversion)
- return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
-
- conversion->format = anv_get_format(pCreateInfo->format);
- conversion->ycbcr_model = pCreateInfo->ycbcrModel;
- conversion->ycbcr_range = pCreateInfo->ycbcrRange;
-
- /* The Vulkan 1.1.95 spec says "When creating an external format conversion,
- * the value of components if ignored."
- */
- if (!ext_format) {
- conversion->mapping[0] = pCreateInfo->components.r;
- conversion->mapping[1] = pCreateInfo->components.g;
- conversion->mapping[2] = pCreateInfo->components.b;
- conversion->mapping[3] = pCreateInfo->components.a;
- }
-
- conversion->chroma_offsets[0] = pCreateInfo->xChromaOffset;
- conversion->chroma_offsets[1] = pCreateInfo->yChromaOffset;
- conversion->chroma_filter = pCreateInfo->chromaFilter;
-
- /* Setup external format. */
- if (ext_format)
- conversion->format = ext_format;
-
- bool has_chroma_subsampled = false;
- for (uint32_t p = 0; p < conversion->format->n_planes; p++) {
- if (conversion->format->planes[p].has_chroma &&
- (conversion->format->planes[p].denominator_scales[0] > 1 ||
- conversion->format->planes[p].denominator_scales[1] > 1))
- has_chroma_subsampled = true;
- }
- conversion->chroma_reconstruction = has_chroma_subsampled &&
- (conversion->chroma_offsets[0] == VK_CHROMA_LOCATION_COSITED_EVEN ||
- conversion->chroma_offsets[1] == VK_CHROMA_LOCATION_COSITED_EVEN);
-
- *pYcbcrConversion = anv_ycbcr_conversion_to_handle(conversion);
-
- return VK_SUCCESS;
-}
-
-void anv_DestroySamplerYcbcrConversion(
- VkDevice _device,
- VkSamplerYcbcrConversion YcbcrConversion,
- const VkAllocationCallbacks* pAllocator)
-{
- ANV_FROM_HANDLE(anv_device, device, _device);
- ANV_FROM_HANDLE(anv_ycbcr_conversion, conversion, YcbcrConversion);
-
- if (!conversion)
- return;
-
- vk_object_free(&device->vk, pAllocator, conversion);
-}
diff --git a/src/intel/vulkan/anv_gem.c b/src/intel/vulkan/anv_gem.c
index dd4c860a565..e721885cb55 100644
--- a/src/intel/vulkan/anv_gem.c
+++ b/src/intel/vulkan/anv_gem.c
@@ -30,200 +30,9 @@
#include <fcntl.h>
#include "anv_private.h"
-#include "common/intel_defines.h"
#include "common/intel_gem.h"
-#include "drm-uapi/sync_file.h"
-/**
- * Wrapper around DRM_IOCTL_I915_GEM_CREATE.
- *
- * Return gem handle, or 0 on failure. Gem handles are never 0.
- */
-uint32_t
-anv_gem_create(struct anv_device *device, uint64_t size)
-{
- struct drm_i915_gem_create gem_create = {
- .size = size,
- };
-
- int ret = intel_ioctl(device->fd, DRM_IOCTL_I915_GEM_CREATE, &gem_create);
- if (ret != 0) {
- /* FIXME: What do we do if this fails? */
- return 0;
- }
-
- return gem_create.handle;
-}
-
-void
-anv_gem_close(struct anv_device *device, uint32_t gem_handle)
-{
- struct drm_gem_close close = {
- .handle = gem_handle,
- };
-
- intel_ioctl(device->fd, DRM_IOCTL_GEM_CLOSE, &close);
-}
-
-uint32_t
-anv_gem_create_regions(struct anv_device *device, uint64_t anv_bo_size,
- uint32_t num_regions,
- struct drm_i915_gem_memory_class_instance *regions)
-{
- struct drm_i915_gem_create_ext_memory_regions ext_regions = {
- .base = { .name = I915_GEM_CREATE_EXT_MEMORY_REGIONS },
- .num_regions = num_regions,
- .regions = (uintptr_t)regions,
- };
-
- struct drm_i915_gem_create_ext gem_create = {
- .size = anv_bo_size,
- .extensions = (uintptr_t) &ext_regions,
- };
-
- int ret = intel_ioctl(device->fd, DRM_IOCTL_I915_GEM_CREATE_EXT,
- &gem_create);
- if (ret != 0) {
- return 0;
- }
-
- return gem_create.handle;
-}
-
-/**
- * Wrapper around DRM_IOCTL_I915_GEM_MMAP. Returns MAP_FAILED on error.
- */
-static void*
-anv_gem_mmap_offset(struct anv_device *device, uint32_t gem_handle,
- uint64_t offset, uint64_t size, uint32_t flags)
-{
- struct drm_i915_gem_mmap_offset gem_mmap = {
- .handle = gem_handle,
- .flags = device->info.has_local_mem ? I915_MMAP_OFFSET_FIXED :
- (flags & I915_MMAP_WC) ? I915_MMAP_OFFSET_WC : I915_MMAP_OFFSET_WB,
- };
- assert(offset == 0);
-
- /* Get the fake offset back */
- int ret = intel_ioctl(device->fd, DRM_IOCTL_I915_GEM_MMAP_OFFSET, &gem_mmap);
- if (ret != 0)
- return MAP_FAILED;
-
- /* And map it */
- void *map = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED,
- device->fd, gem_mmap.offset);
- return map;
-}
-
-static void*
-anv_gem_mmap_legacy(struct anv_device *device, uint32_t gem_handle,
- uint64_t offset, uint64_t size, uint32_t flags)
-{
- assert(!device->info.has_local_mem);
-
- struct drm_i915_gem_mmap gem_mmap = {
- .handle = gem_handle,
- .offset = offset,
- .size = size,
- .flags = flags,
- };
-
- int ret = intel_ioctl(device->fd, DRM_IOCTL_I915_GEM_MMAP, &gem_mmap);
- if (ret != 0)
- return MAP_FAILED;
-
- return (void *)(uintptr_t) gem_mmap.addr_ptr;
-}
-
-/**
- * Wrapper around DRM_IOCTL_I915_GEM_MMAP. Returns MAP_FAILED on error.
- */
-void*
-anv_gem_mmap(struct anv_device *device, uint32_t gem_handle,
- uint64_t offset, uint64_t size, uint32_t flags)
-{
- void *map;
- if (device->physical->has_mmap_offset)
- map = anv_gem_mmap_offset(device, gem_handle, offset, size, flags);
- else
- map = anv_gem_mmap_legacy(device, gem_handle, offset, size, flags);
-
- if (map != MAP_FAILED)
- VG(VALGRIND_MALLOCLIKE_BLOCK(map, size, 0, 1));
-
- return map;
-}
-
-/* This is just a wrapper around munmap, but it also notifies valgrind that
- * this map is no longer valid. Pair this with anv_gem_mmap().
- */
-void
-anv_gem_munmap(struct anv_device *device, void *p, uint64_t size)
-{
- VG(VALGRIND_FREELIKE_BLOCK(p, 0));
- munmap(p, size);
-}
-
-uint32_t
-anv_gem_userptr(struct anv_device *device, void *mem, size_t size)
-{
- struct drm_i915_gem_userptr userptr = {
- .user_ptr = (__u64)((unsigned long) mem),
- .user_size = size,
- .flags = 0,
- };
-
- if (device->physical->has_userptr_probe)
- userptr.flags |= I915_USERPTR_PROBE;
-
- int ret = intel_ioctl(device->fd, DRM_IOCTL_I915_GEM_USERPTR, &userptr);
- if (ret == -1)
- return 0;
-
- return userptr.handle;
-}
-
-int
-anv_gem_set_caching(struct anv_device *device,
- uint32_t gem_handle, uint32_t caching)
-{
- struct drm_i915_gem_caching gem_caching = {
- .handle = gem_handle,
- .caching = caching,
- };
-
- return intel_ioctl(device->fd, DRM_IOCTL_I915_GEM_SET_CACHING, &gem_caching);
-}
-
-int
-anv_gem_set_domain(struct anv_device *device, uint32_t gem_handle,
- uint32_t read_domains, uint32_t write_domain)
-{
- struct drm_i915_gem_set_domain gem_set_domain = {
- .handle = gem_handle,
- .read_domains = read_domains,
- .write_domain = write_domain,
- };
-
- return intel_ioctl(device->fd, DRM_IOCTL_I915_GEM_SET_DOMAIN, &gem_set_domain);
-}
-
-/**
- * Returns 0, 1, or negative to indicate error
- */
-int
-anv_gem_busy(struct anv_device *device, uint32_t gem_handle)
-{
- struct drm_i915_gem_busy busy = {
- .handle = gem_handle,
- };
-
- int ret = intel_ioctl(device->fd, DRM_IOCTL_I915_GEM_BUSY, &busy);
- if (ret < 0)
- return ret;
-
- return busy.busy != 0;
-}
+#include "i915/anv_gem.h"
/**
* On error, \a timeout_ns holds the remaining time.
@@ -231,319 +40,45 @@ anv_gem_busy(struct anv_device *device, uint32_t gem_handle)
int
anv_gem_wait(struct anv_device *device, uint32_t gem_handle, int64_t *timeout_ns)
{
- struct drm_i915_gem_wait wait = {
- .bo_handle = gem_handle,
- .timeout_ns = *timeout_ns,
- .flags = 0,
- };
-
- int ret = intel_ioctl(device->fd, DRM_IOCTL_I915_GEM_WAIT, &wait);
- *timeout_ns = wait.timeout_ns;
-
- return ret;
-}
-
-int
-anv_gem_execbuffer(struct anv_device *device,
- struct drm_i915_gem_execbuffer2 *execbuf)
-{
- if (execbuf->flags & I915_EXEC_FENCE_OUT)
- return intel_ioctl(device->fd, DRM_IOCTL_I915_GEM_EXECBUFFER2_WR, execbuf);
- else
- return intel_ioctl(device->fd, DRM_IOCTL_I915_GEM_EXECBUFFER2, execbuf);
+ switch (device->info->kmd_type) {
+ case INTEL_KMD_TYPE_I915:
+ return anv_i915_gem_wait(device, gem_handle, timeout_ns);
+ case INTEL_KMD_TYPE_XE:
+ return -1;
+ default:
+ unreachable("missing");
+ return -1;
+ }
}
/** Return -1 on error. */
int
anv_gem_get_tiling(struct anv_device *device, uint32_t gem_handle)
{
- struct drm_i915_gem_get_tiling get_tiling = {
- .handle = gem_handle,
- };
-
- /* FIXME: On discrete platforms we don't have DRM_IOCTL_I915_GEM_GET_TILING
- * anymore, so we will need another way to get the tiling. Apparently this
- * is only used in Android code, so we may need some other way to
- * communicate the tiling mode.
- */
- if (intel_ioctl(device->fd, DRM_IOCTL_I915_GEM_GET_TILING, &get_tiling)) {
- assert(!"Failed to get BO tiling");
+ switch (device->info->kmd_type) {
+ case INTEL_KMD_TYPE_I915:
+ return anv_i915_gem_get_tiling(device, gem_handle);
+ case INTEL_KMD_TYPE_XE:
+ return -1;
+ default:
+ unreachable("missing");
return -1;
}
-
- return get_tiling.tiling_mode;
}
int
anv_gem_set_tiling(struct anv_device *device,
uint32_t gem_handle, uint32_t stride, uint32_t tiling)
{
- int ret;
-
- /* On discrete platforms we don't have DRM_IOCTL_I915_GEM_SET_TILING. So
- * nothing needs to be done.
- */
- if (!device->info.has_tiling_uapi)
+ switch (device->info->kmd_type) {
+ case INTEL_KMD_TYPE_I915:
+ return anv_i915_gem_set_tiling(device, gem_handle, stride, tiling);
+ case INTEL_KMD_TYPE_XE:
return 0;
-
- /* set_tiling overwrites the input on the error path, so we have to open
- * code intel_ioctl.
- */
- do {
- struct drm_i915_gem_set_tiling set_tiling = {
- .handle = gem_handle,
- .tiling_mode = tiling,
- .stride = stride,
- };
-
- ret = ioctl(device->fd, DRM_IOCTL_I915_GEM_SET_TILING, &set_tiling);
- } while (ret == -1 && (errno == EINTR || errno == EAGAIN));
-
- return ret;
-}
-
-int
-anv_gem_get_param(int fd, uint32_t param)
-{
- int tmp;
-
- drm_i915_getparam_t gp = {
- .param = param,
- .value = &tmp,
- };
-
- int ret = intel_ioctl(fd, DRM_IOCTL_I915_GETPARAM, &gp);
- if (ret == 0)
- return tmp;
-
- return 0;
-}
-
-uint64_t
-anv_gem_get_drm_cap(int fd, uint32_t capability)
-{
- struct drm_get_cap cap = {
- .capability = capability,
- };
-
- intel_ioctl(fd, DRM_IOCTL_GET_CAP, &cap);
- return cap.value;
-}
-
-bool
-anv_gem_get_bit6_swizzle(int fd, uint32_t tiling)
-{
- struct drm_gem_close close;
- int ret;
-
- struct drm_i915_gem_create gem_create = {
- .size = 4096,
- };
-
- if (intel_ioctl(fd, DRM_IOCTL_I915_GEM_CREATE, &gem_create)) {
- assert(!"Failed to create GEM BO");
- return false;
- }
-
- bool swizzled = false;
-
- /* set_tiling overwrites the input on the error path, so we have to open
- * code intel_ioctl.
- */
- do {
- struct drm_i915_gem_set_tiling set_tiling = {
- .handle = gem_create.handle,
- .tiling_mode = tiling,
- .stride = tiling == I915_TILING_X ? 512 : 128,
- };
-
- ret = ioctl(fd, DRM_IOCTL_I915_GEM_SET_TILING, &set_tiling);
- } while (ret == -1 && (errno == EINTR || errno == EAGAIN));
-
- if (ret != 0) {
- assert(!"Failed to set BO tiling");
- goto close_and_return;
- }
-
- struct drm_i915_gem_get_tiling get_tiling = {
- .handle = gem_create.handle,
- };
-
- if (intel_ioctl(fd, DRM_IOCTL_I915_GEM_GET_TILING, &get_tiling)) {
- assert(!"Failed to get BO tiling");
- goto close_and_return;
- }
-
- swizzled = get_tiling.swizzle_mode != I915_BIT_6_SWIZZLE_NONE;
-
-close_and_return:
-
- memset(&close, 0, sizeof(close));
- close.handle = gem_create.handle;
- intel_ioctl(fd, DRM_IOCTL_GEM_CLOSE, &close);
-
- return swizzled;
-}
-
-bool
-anv_gem_has_context_priority(int fd)
-{
- return !anv_gem_set_context_param(fd, 0, I915_CONTEXT_PARAM_PRIORITY,
- INTEL_CONTEXT_MEDIUM_PRIORITY);
-}
-
-int
-anv_gem_create_context(struct anv_device *device)
-{
- struct drm_i915_gem_context_create create = { 0 };
-
- int ret = intel_ioctl(device->fd, DRM_IOCTL_I915_GEM_CONTEXT_CREATE, &create);
- if (ret == -1)
- return -1;
-
- return create.ctx_id;
-}
-
-int
-anv_gem_create_context_engines(struct anv_device *device,
- const struct drm_i915_query_engine_info *info,
- int num_engines, uint16_t *engine_classes)
-{
- const size_t engine_inst_sz = 2 * sizeof(__u16); /* 1 class, 1 instance */
- const size_t engines_param_size =
- sizeof(__u64) /* extensions */ + num_engines * engine_inst_sz;
-
- void *engines_param = malloc(engines_param_size);
- assert(engines_param);
- *(__u64*)engines_param = 0;
- __u16 *class_inst_ptr = (__u16*)(((__u64*)engines_param) + 1);
-
- /* For each type of drm_i915_gem_engine_class of interest, we keep track of
- * the previous engine instance used.
- */
- int last_engine_idx[] = {
- [I915_ENGINE_CLASS_RENDER] = -1,
- };
-
- int i915_engine_counts[] = {
- [I915_ENGINE_CLASS_RENDER] =
- anv_gem_count_engines(info, I915_ENGINE_CLASS_RENDER),
- };
-
- /* For each queue, we look for the next instance that matches the class we
- * need.
- */
- for (int i = 0; i < num_engines; i++) {
- uint16_t engine_class = engine_classes[i];
- if (i915_engine_counts[engine_class] <= 0) {
- free(engines_param);
- return -1;
- }
-
- /* Run through the engines reported by the kernel looking for the next
- * matching instance. We loop in case we want to create multiple
- * contexts on an engine instance.
- */
- int engine_instance = -1;
- for (int i = 0; i < info->num_engines; i++) {
- int *idx = &last_engine_idx[engine_class];
- if (++(*idx) >= info->num_engines)
- *idx = 0;
- if (info->engines[*idx].engine.engine_class == engine_class) {
- engine_instance = info->engines[*idx].engine.engine_instance;
- break;
- }
- }
- if (engine_instance < 0) {
- free(engines_param);
- return -1;
- }
-
- *class_inst_ptr++ = engine_class;
- *class_inst_ptr++ = engine_instance;
- }
-
- assert((uintptr_t)engines_param + engines_param_size ==
- (uintptr_t)class_inst_ptr);
-
- struct drm_i915_gem_context_create_ext_setparam set_engines = {
- .base = {
- .name = I915_CONTEXT_CREATE_EXT_SETPARAM,
- },
- .param = {
- .param = I915_CONTEXT_PARAM_ENGINES,
- .value = (uintptr_t)engines_param,
- .size = engines_param_size,
- }
- };
- struct drm_i915_gem_context_create_ext create = {
- .flags = I915_CONTEXT_CREATE_FLAGS_USE_EXTENSIONS,
- .extensions = (uintptr_t)&set_engines,
- };
- int ret = intel_ioctl(device->fd, DRM_IOCTL_I915_GEM_CONTEXT_CREATE_EXT, &create);
- free(engines_param);
- if (ret == -1)
- return -1;
-
- return create.ctx_id;
-}
-
-int
-anv_gem_destroy_context(struct anv_device *device, int context)
-{
- struct drm_i915_gem_context_destroy destroy = {
- .ctx_id = context,
- };
-
- return intel_ioctl(device->fd, DRM_IOCTL_I915_GEM_CONTEXT_DESTROY, &destroy);
-}
-
-int
-anv_gem_set_context_param(int fd, int context, uint32_t param, uint64_t value)
-{
- struct drm_i915_gem_context_param p = {
- .ctx_id = context,
- .param = param,
- .value = value,
- };
- int err = 0;
-
- if (intel_ioctl(fd, DRM_IOCTL_I915_GEM_CONTEXT_SETPARAM, &p))
- err = -errno;
- return err;
-}
-
-int
-anv_gem_get_context_param(int fd, int context, uint32_t param, uint64_t *value)
-{
- struct drm_i915_gem_context_param gp = {
- .ctx_id = context,
- .param = param,
- };
-
- int ret = intel_ioctl(fd, DRM_IOCTL_I915_GEM_CONTEXT_GETPARAM, &gp);
- if (ret == -1)
+ default:
+ unreachable("missing");
return -1;
-
- *value = gp.value;
- return 0;
-}
-
-int
-anv_gem_context_get_reset_stats(int fd, int context,
- uint32_t *active, uint32_t *pending)
-{
- struct drm_i915_reset_stats stats = {
- .ctx_id = context,
- };
-
- int ret = intel_ioctl(fd, DRM_IOCTL_I915_GET_RESET_STATS, &stats);
- if (ret == 0) {
- *active = stats.batch_active;
- *pending = stats.batch_pending;
}
-
- return ret;
}
int
@@ -575,220 +110,27 @@ anv_gem_fd_to_handle(struct anv_device *device, int fd)
return args.handle;
}
-int
-anv_gem_reg_read(int fd, uint32_t offset, uint64_t *result)
-{
- struct drm_i915_reg_read args = {
- .offset = offset
- };
-
- int ret = intel_ioctl(fd, DRM_IOCTL_I915_REG_READ, &args);
-
- *result = args.val;
- return ret;
-}
-
-int
-anv_gem_sync_file_merge(struct anv_device *device, int fd1, int fd2)
-{
- struct sync_merge_data args = {
- .name = "anv merge fence",
- .fd2 = fd2,
- .fence = -1,
- };
-
- int ret = intel_ioctl(fd1, SYNC_IOC_MERGE, &args);
- if (ret == -1)
- return -1;
-
- return args.fence;
-}
-
-uint32_t
-anv_gem_syncobj_create(struct anv_device *device, uint32_t flags)
-{
- struct drm_syncobj_create args = {
- .flags = flags,
- };
-
- int ret = intel_ioctl(device->fd, DRM_IOCTL_SYNCOBJ_CREATE, &args);
- if (ret)
- return 0;
-
- return args.handle;
-}
-
-void
-anv_gem_syncobj_destroy(struct anv_device *device, uint32_t handle)
-{
- struct drm_syncobj_destroy args = {
- .handle = handle,
- };
-
- intel_ioctl(device->fd, DRM_IOCTL_SYNCOBJ_DESTROY, &args);
-}
-
-int
-anv_gem_syncobj_handle_to_fd(struct anv_device *device, uint32_t handle)
-{
- struct drm_syncobj_handle args = {
- .handle = handle,
- };
-
- int ret = intel_ioctl(device->fd, DRM_IOCTL_SYNCOBJ_HANDLE_TO_FD, &args);
- if (ret)
- return -1;
-
- return args.fd;
-}
-
-uint32_t
-anv_gem_syncobj_fd_to_handle(struct anv_device *device, int fd)
-{
- struct drm_syncobj_handle args = {
- .fd = fd,
- };
-
- int ret = intel_ioctl(device->fd, DRM_IOCTL_SYNCOBJ_FD_TO_HANDLE, &args);
- if (ret)
- return 0;
-
- return args.handle;
-}
-
-int
-anv_gem_syncobj_export_sync_file(struct anv_device *device, uint32_t handle)
-{
- struct drm_syncobj_handle args = {
- .handle = handle,
- .flags = DRM_SYNCOBJ_HANDLE_TO_FD_FLAGS_EXPORT_SYNC_FILE,
- };
-
- int ret = intel_ioctl(device->fd, DRM_IOCTL_SYNCOBJ_HANDLE_TO_FD, &args);
- if (ret)
- return -1;
-
- return args.fd;
-}
-
-int
-anv_gem_syncobj_import_sync_file(struct anv_device *device,
- uint32_t handle, int fd)
-{
- struct drm_syncobj_handle args = {
- .handle = handle,
- .fd = fd,
- .flags = DRM_SYNCOBJ_FD_TO_HANDLE_FLAGS_IMPORT_SYNC_FILE,
- };
-
- return intel_ioctl(device->fd, DRM_IOCTL_SYNCOBJ_FD_TO_HANDLE, &args);
-}
-
-void
-anv_gem_syncobj_reset(struct anv_device *device, uint32_t handle)
-{
- struct drm_syncobj_array args = {
- .handles = (uint64_t)(uintptr_t)&handle,
- .count_handles = 1,
- };
-
- intel_ioctl(device->fd, DRM_IOCTL_SYNCOBJ_RESET, &args);
-}
-
-bool
-anv_gem_supports_syncobj_wait(int fd)
-{
- return intel_gem_supports_syncobj_wait(fd);
-}
-
-int
-anv_gem_syncobj_wait(struct anv_device *device,
- const uint32_t *handles, uint32_t num_handles,
- int64_t abs_timeout_ns, bool wait_all)
-{
- struct drm_syncobj_wait args = {
- .handles = (uint64_t)(uintptr_t)handles,
- .count_handles = num_handles,
- .timeout_nsec = abs_timeout_ns,
- .flags = DRM_SYNCOBJ_WAIT_FLAGS_WAIT_FOR_SUBMIT,
- };
-
- if (wait_all)
- args.flags |= DRM_SYNCOBJ_WAIT_FLAGS_WAIT_ALL;
-
- return intel_ioctl(device->fd, DRM_IOCTL_SYNCOBJ_WAIT, &args);
-}
-
-int
-anv_gem_syncobj_timeline_wait(struct anv_device *device,
- const uint32_t *handles, const uint64_t *points,
- uint32_t num_items, int64_t abs_timeout_ns,
- bool wait_all, bool wait_materialize)
-{
- assert(device->physical->has_syncobj_wait_available);
-
- struct drm_syncobj_timeline_wait args = {
- .handles = (uint64_t)(uintptr_t)handles,
- .points = (uint64_t)(uintptr_t)points,
- .count_handles = num_items,
- .timeout_nsec = abs_timeout_ns,
- .flags = DRM_SYNCOBJ_WAIT_FLAGS_WAIT_FOR_SUBMIT,
- };
-
- if (wait_all)
- args.flags |= DRM_SYNCOBJ_WAIT_FLAGS_WAIT_ALL;
- if (wait_materialize)
- args.flags |= DRM_SYNCOBJ_WAIT_FLAGS_WAIT_AVAILABLE;
-
- return intel_ioctl(device->fd, DRM_IOCTL_SYNCOBJ_TIMELINE_WAIT, &args);
-}
-
-int
-anv_gem_syncobj_timeline_signal(struct anv_device *device,
- const uint32_t *handles, const uint64_t *points,
- uint32_t num_items)
-{
- assert(device->physical->has_syncobj_wait_available);
-
- struct drm_syncobj_timeline_array args = {
- .handles = (uint64_t)(uintptr_t)handles,
- .points = (uint64_t)(uintptr_t)points,
- .count_handles = num_items,
- };
-
- return intel_ioctl(device->fd, DRM_IOCTL_SYNCOBJ_TIMELINE_SIGNAL, &args);
-}
-
-int
-anv_gem_syncobj_timeline_query(struct anv_device *device,
- const uint32_t *handles, uint64_t *points,
- uint32_t num_items)
-{
- assert(device->physical->has_syncobj_wait_available);
-
- struct drm_syncobj_timeline_array args = {
- .handles = (uint64_t)(uintptr_t)handles,
- .points = (uint64_t)(uintptr_t)points,
- .count_handles = num_items,
- };
-
- return intel_ioctl(device->fd, DRM_IOCTL_SYNCOBJ_QUERY, &args);
-}
-
-struct drm_i915_query_engine_info *
-anv_gem_get_engine_info(int fd)
-{
- return intel_i915_query_alloc(fd, DRM_I915_QUERY_ENGINE_INFO);
+VkResult
+anv_gem_import_bo_alloc_flags_to_bo_flags(struct anv_device *device,
+ struct anv_bo *bo,
+ enum anv_bo_alloc_flags alloc_flags,
+ uint32_t *bo_flags)
+{
+ switch (device->info->kmd_type) {
+ case INTEL_KMD_TYPE_I915:
+ return anv_i915_gem_import_bo_alloc_flags_to_bo_flags(device, bo,
+ alloc_flags,
+ bo_flags);
+ case INTEL_KMD_TYPE_XE:
+ *bo_flags = device->kmd_backend->bo_alloc_flags_to_bo_flags(device, alloc_flags);
+ return VK_SUCCESS;
+ default:
+ unreachable("missing");
+ return VK_ERROR_UNKNOWN;
+ }
}
-int
-anv_gem_count_engines(const struct drm_i915_query_engine_info *info,
- uint16_t engine_class)
+const struct anv_kmd_backend *anv_stub_kmd_backend_get(void)
{
- int count = 0;
- for (int i = 0; i < info->num_engines; i++) {
- if (info->engines[i].engine.engine_class == engine_class)
- count++;
- }
- return count;
+ return NULL;
}
diff --git a/src/intel/vulkan/anv_gem_stubs.c b/src/intel/vulkan/anv_gem_stubs.c
index c552b7c6dc2..48795b431a8 100644
--- a/src/intel/vulkan/anv_gem_stubs.c
+++ b/src/intel/vulkan/anv_gem_stubs.c
@@ -27,8 +27,18 @@
#include "util/anon_file.h"
#include "anv_private.h"
-uint32_t
-anv_gem_create(struct anv_device *device, uint64_t size)
+static void
+stub_gem_close(struct anv_device *device, struct anv_bo *bo)
+{
+ close(bo->gem_handle);
+}
+
+static uint32_t
+stub_gem_create(struct anv_device *device,
+ const struct intel_memory_class_instance **regions,
+ uint16_t num_regions, uint64_t size,
+ enum anv_bo_alloc_flags alloc_flags,
+ uint64_t *actual_size)
{
int fd = os_create_anonymous_file(size, "fake bo");
if (fd == -1)
@@ -36,45 +46,62 @@ anv_gem_create(struct anv_device *device, uint64_t size)
assert(fd != 0);
+ *actual_size = size;
return fd;
}
-void
-anv_gem_close(struct anv_device *device, uint32_t gem_handle)
+static void *
+stub_gem_mmap(struct anv_device *device, struct anv_bo *bo, uint64_t offset,
+ uint64_t size, void *placed_addr)
{
- close(gem_handle);
+ return mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, bo->gem_handle,
+ offset);
}
-uint32_t
-anv_gem_create_regions(struct anv_device *device, uint64_t anv_bo_size,
- uint32_t num_regions,
- struct drm_i915_gem_memory_class_instance *regions)
+static VkResult
+stub_execute_simple_batch(struct anv_queue *queue, struct anv_bo *batch_bo,
+ uint32_t batch_bo_size, bool is_companion_rcs_batch)
{
- return 0;
+ return VK_ERROR_UNKNOWN;
}
-void*
-anv_gem_mmap(struct anv_device *device, uint32_t gem_handle,
- uint64_t offset, uint64_t size, uint32_t flags)
+static VkResult
+stub_execute_trtt_batch(struct anv_sparse_submission *submit,
+ struct anv_trtt_batch_bo *trtt_bbo)
{
- /* Ignore flags, as they're specific to I915_GEM_MMAP. */
- (void) flags;
+ return VK_ERROR_UNKNOWN;
+}
- return mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED,
- gem_handle, offset);
+static VkResult
+stub_queue_exec_locked(struct anv_queue *queue,
+ uint32_t wait_count,
+ const struct vk_sync_wait *waits,
+ uint32_t cmd_buffer_count,
+ struct anv_cmd_buffer **cmd_buffers,
+ uint32_t signal_count,
+ const struct vk_sync_signal *signals,
+ struct anv_query_pool *perf_query_pool,
+ uint32_t perf_query_pass,
+ struct anv_utrace_submit *utrace_submit)
+{
+ return VK_ERROR_UNKNOWN;
}
-/* This is just a wrapper around munmap, but it also notifies valgrind that
- * this map is no longer valid. Pair this with anv_gem_mmap().
- */
-void
-anv_gem_munmap(struct anv_device *device, void *p, uint64_t size)
+static VkResult
+stub_queue_exec_trace(struct anv_queue *queue, struct anv_utrace_submit *submit)
{
- munmap(p, size);
+ return VK_ERROR_UNKNOWN;
}
-uint32_t
-anv_gem_userptr(struct anv_device *device, void *mem, size_t size)
+static uint32_t
+stub_bo_alloc_flags_to_bo_flags(struct anv_device *device,
+ enum anv_bo_alloc_flags alloc_flags)
+{
+ return 0;
+}
+
+static uint32_t
+stub_gem_create_userptr(struct anv_device *device, void *mem, uint64_t size)
{
int fd = os_create_anonymous_file(size, "fake bo");
if (fd == -1)
@@ -86,25 +113,12 @@ anv_gem_userptr(struct anv_device *device, void *mem, size_t size)
}
int
-anv_gem_busy(struct anv_device *device, uint32_t gem_handle)
-{
- return 0;
-}
-
-int
anv_gem_wait(struct anv_device *device, uint32_t gem_handle, int64_t *timeout_ns)
{
return 0;
}
int
-anv_gem_execbuffer(struct anv_device *device,
- struct drm_i915_gem_execbuffer2 *execbuf)
-{
- return 0;
-}
-
-int
anv_gem_set_tiling(struct anv_device *device,
uint32_t gem_handle, uint32_t stride, uint32_t tiling)
{
@@ -118,75 +132,6 @@ anv_gem_get_tiling(struct anv_device *device, uint32_t gem_handle)
}
int
-anv_gem_set_caching(struct anv_device *device, uint32_t gem_handle,
- uint32_t caching)
-{
- return 0;
-}
-
-int
-anv_gem_set_domain(struct anv_device *device, uint32_t gem_handle,
- uint32_t read_domains, uint32_t write_domain)
-{
- return 0;
-}
-
-int
-anv_gem_get_param(int fd, uint32_t param)
-{
- unreachable("Unused");
-}
-
-uint64_t
-anv_gem_get_drm_cap(int fd, uint32_t capability)
-{
- return 0;
-}
-
-bool
-anv_gem_get_bit6_swizzle(int fd, uint32_t tiling)
-{
- unreachable("Unused");
-}
-
-int
-anv_gem_create_context(struct anv_device *device)
-{
- unreachable("Unused");
-}
-
-int
-anv_gem_destroy_context(struct anv_device *device, int context)
-{
- unreachable("Unused");
-}
-
-int
-anv_gem_set_context_param(int fd, int context, uint32_t param, uint64_t value)
-{
- unreachable("Unused");
-}
-
-int
-anv_gem_get_context_param(int fd, int context, uint32_t param, uint64_t *value)
-{
- unreachable("Unused");
-}
-
-bool
-anv_gem_has_context_priority(int fd)
-{
- unreachable("Unused");
-}
-
-int
-anv_gem_context_get_reset_stats(int fd, int context,
- uint32_t *active, uint32_t *pending)
-{
- unreachable("Unused");
-}
-
-int
anv_gem_handle_to_fd(struct anv_device *device, uint32_t gem_handle)
{
unreachable("Unused");
@@ -198,125 +143,43 @@ anv_gem_fd_to_handle(struct anv_device *device, int fd)
unreachable("Unused");
}
-int
-anv_gem_sync_file_merge(struct anv_device *device, int fd1, int fd2)
-{
- unreachable("Unused");
-}
-
-int
-anv_gem_syncobj_export_sync_file(struct anv_device *device, uint32_t handle)
-{
- unreachable("Unused");
-}
-
-int
-anv_gem_syncobj_import_sync_file(struct anv_device *device,
- uint32_t handle, int fd)
-{
- unreachable("Unused");
-}
-
-uint32_t
-anv_gem_syncobj_create(struct anv_device *device, uint32_t flags)
-{
- unreachable("Unused");
-}
-
-void
-anv_gem_syncobj_destroy(struct anv_device *device, uint32_t handle)
-{
- unreachable("Unused");
-}
-
-int
-anv_gem_syncobj_handle_to_fd(struct anv_device *device, uint32_t handle)
-{
- unreachable("Unused");
-}
-
-uint32_t
-anv_gem_syncobj_fd_to_handle(struct anv_device *device, int fd)
-{
- unreachable("Unused");
-}
-
-void
-anv_gem_syncobj_reset(struct anv_device *device, uint32_t handle)
-{
- unreachable("Unused");
-}
-
-bool
-anv_gem_supports_syncobj_wait(int fd)
+VkResult
+anv_gem_import_bo_alloc_flags_to_bo_flags(struct anv_device *device,
+ struct anv_bo *bo,
+ enum anv_bo_alloc_flags alloc_flags,
+ uint32_t *bo_flags)
{
- return false;
+ return VK_SUCCESS;
}
-int
-anv_i915_query(int fd, uint64_t query_id, void *buffer,
- int32_t *buffer_len)
+static VkResult
+stub_vm_bind(struct anv_device *device, struct anv_sparse_submission *submit,
+ enum anv_vm_bind_flags flags)
{
- unreachable("Unused");
+ return VK_SUCCESS;
}
-int
-anv_gem_create_context_engines(struct anv_device *device,
- const struct drm_i915_query_engine_info *info,
- int num_engines,
- uint16_t *engine_classes)
+static VkResult
+stub_vm_bind_bo(struct anv_device *device, struct anv_bo *bo)
{
- unreachable("Unused");
+ return VK_SUCCESS;
}
-struct drm_i915_query_engine_info *
-anv_gem_get_engine_info(int fd)
+const struct anv_kmd_backend *anv_stub_kmd_backend_get(void)
{
- unreachable("Unused");
-}
-
-int
-anv_gem_count_engines(const struct drm_i915_query_engine_info *info,
- uint16_t engine_class)
-{
- unreachable("Unused");
-}
-
-int
-anv_gem_syncobj_wait(struct anv_device *device,
- const uint32_t *handles, uint32_t num_handles,
- int64_t abs_timeout_ns, bool wait_all)
-{
- unreachable("Unused");
-}
-
-int
-anv_gem_reg_read(int fd, uint32_t offset, uint64_t *result)
-{
- unreachable("Unused");
-}
-
-int
-anv_gem_syncobj_timeline_wait(struct anv_device *device,
- const uint32_t *handles, const uint64_t *points,
- uint32_t num_items, int64_t abs_timeout_ns,
- bool wait_all, bool wait_materialize)
-{
- unreachable("Unused");
-}
-
-int
-anv_gem_syncobj_timeline_signal(struct anv_device *device,
- const uint32_t *handles, const uint64_t *points,
- uint32_t num_items)
-{
- unreachable("Unused");
-}
-
-int
-anv_gem_syncobj_timeline_query(struct anv_device *device,
- const uint32_t *handles, uint64_t *points,
- uint32_t num_items)
-{
- unreachable("Unused");
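+   /* Stub backend: every callback is a no-op or returns an error, presumably
+    * for builds and tools that run without a real DRM device.
+    */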
+ static const struct anv_kmd_backend stub_backend = {
+ .gem_create = stub_gem_create,
+ .gem_create_userptr = stub_gem_create_userptr,
+ .gem_close = stub_gem_close,
+ .gem_mmap = stub_gem_mmap,
+ .vm_bind = stub_vm_bind,
+ .vm_bind_bo = stub_vm_bind_bo,
+ .vm_unbind_bo = stub_vm_bind_bo,
+ .execute_simple_batch = stub_execute_simple_batch,
+ .execute_trtt_batch = stub_execute_trtt_batch,
+ .queue_exec_locked = stub_queue_exec_locked,
+ .queue_exec_trace = stub_queue_exec_trace,
+ .bo_alloc_flags_to_bo_flags = stub_bo_alloc_flags_to_bo_flags,
+ };
+ return &stub_backend;
}
diff --git a/src/intel/vulkan/anv_genX.h b/src/intel/vulkan/anv_genX.h
index 025ceff8a95..9370a9dc7a2 100644
--- a/src/intel/vulkan/anv_genX.h
+++ b/src/intel/vulkan/anv_genX.h
@@ -36,6 +36,14 @@
#error This file is included by means other than anv_private.h
#endif
+struct intel_sample_positions;
+struct intel_urb_config;
+struct anv_embedded_sampler;
+struct anv_pipeline_embedded_sampler_binding;
+
+typedef struct nir_builder nir_builder;
+typedef struct nir_shader nir_shader;
+
extern const uint32_t genX(vk_to_intel_cullmode)[];
extern const uint32_t genX(vk_to_intel_front_face)[];
@@ -48,16 +56,36 @@ extern const uint32_t genX(vk_to_intel_stencil_op)[];
extern const uint32_t genX(vk_to_intel_logic_op)[];
+extern const uint32_t genX(vk_to_intel_fillmode)[];
+
void genX(init_physical_device_state)(struct anv_physical_device *device);
VkResult genX(init_device_state)(struct anv_device *device);
+void genX(init_cps_device_state)(struct anv_device *device);
+
+nir_shader *genX(load_libanv_shader)(struct anv_device *device, void *mem_ctx);
+
+uint32_t genX(call_internal_shader)(nir_builder *b,
+ enum anv_internal_kernel_name shader_name);
+
+void
+genX(set_fast_clear_state)(struct anv_cmd_buffer *cmd_buffer,
+ const struct anv_image *image,
+ const enum isl_format format,
+ union isl_color_value clear_color);
+
+void
+genX(load_image_clear_color)(struct anv_cmd_buffer *cmd_buffer,
+ struct anv_state surface_state,
+ const struct anv_image *image);
+
+void genX(cmd_buffer_emit_bt_pool_base_address)(struct anv_cmd_buffer *cmd_buffer);
+
void genX(cmd_buffer_emit_state_base_address)(struct anv_cmd_buffer *cmd_buffer);
void genX(cmd_buffer_apply_pipe_flushes)(struct anv_cmd_buffer *cmd_buffer);
-void genX(cmd_buffer_emit_gfx7_depth_flush)(struct anv_cmd_buffer *cmd_buffer);
-
void genX(cmd_buffer_emit_gfx12_depth_wa)(struct anv_cmd_buffer *cmd_buffer,
const struct isl_surf *surf);
@@ -73,8 +101,46 @@ void genX(cmd_buffer_emit_hashing_mode)(struct anv_cmd_buffer *cmd_buffer,
unsigned width, unsigned height,
unsigned scale);
+void genX(urb_workaround)(struct anv_cmd_buffer *cmd_buffer,
+ const struct intel_urb_config *urb_cfg);
+
void genX(flush_pipeline_select_3d)(struct anv_cmd_buffer *cmd_buffer);
void genX(flush_pipeline_select_gpgpu)(struct anv_cmd_buffer *cmd_buffer);
+void genX(emit_pipeline_select)(struct anv_batch *batch, uint32_t pipeline,
+ const struct anv_device *device);
+
+void genX(apply_task_urb_workaround)(struct anv_cmd_buffer *cmd_buffer);
+
+void genX(emit_vertex_input)(struct anv_batch *batch,
+ uint32_t *vertex_element_dws,
+ struct anv_graphics_pipeline *pipeline,
+ const struct vk_vertex_input_state *vi,
+ bool emit_in_pipeline);
+
+enum anv_pipe_bits
+genX(emit_apply_pipe_flushes)(struct anv_batch *batch,
+ struct anv_device *device,
+ uint32_t current_pipeline,
+ enum anv_pipe_bits bits,
+ enum anv_pipe_bits *emitted_flush_bits);
+void
+genX(invalidate_aux_map)(struct anv_batch *batch,
+ struct anv_device *device,
+ enum intel_engine_class engine_class,
+ enum anv_pipe_bits bits);
+
+
+void genX(emit_so_memcpy_init)(struct anv_memcpy_state *state,
+ struct anv_device *device,
+ struct anv_batch *batch);
+
+void genX(emit_so_memcpy_fini)(struct anv_memcpy_state *state);
+
+void genX(emit_so_memcpy_end)(struct anv_memcpy_state *state);
+
+void genX(emit_so_memcpy)(struct anv_memcpy_state *state,
+ struct anv_address dst, struct anv_address src,
+ uint32_t size);
void genX(emit_l3_config)(struct anv_batch *batch,
const struct anv_device *device,
@@ -83,10 +149,21 @@ void genX(emit_l3_config)(struct anv_batch *batch,
void genX(cmd_buffer_config_l3)(struct anv_cmd_buffer *cmd_buffer,
const struct intel_l3_config *cfg);
-void genX(cmd_buffer_flush_state)(struct anv_cmd_buffer *cmd_buffer);
-void genX(cmd_buffer_flush_dynamic_state)(struct anv_cmd_buffer *cmd_buffer);
+void genX(flush_descriptor_buffers)(struct anv_cmd_buffer *cmd_buffer,
+ struct anv_cmd_pipeline_state *pipe_state);
+
+uint32_t
+genX(cmd_buffer_flush_descriptor_sets)(struct anv_cmd_buffer *cmd_buffer,
+ struct anv_cmd_pipeline_state *pipe_state,
+ const VkShaderStageFlags dirty,
+ struct anv_shader_bin **shaders,
+ uint32_t num_shaders);
+
+void genX(cmd_buffer_flush_gfx_hw_state)(struct anv_cmd_buffer *cmd_buffer);
+
+void genX(cmd_buffer_flush_gfx_runtime_state)(struct anv_cmd_buffer *cmd_buffer);
-void genX(cmd_buffer_flush_compute_state)(struct anv_cmd_buffer *cmd_buffer);
+void genX(cmd_buffer_flush_gfx_hw_state)(struct anv_cmd_buffer *cmd_buffer);
void genX(cmd_buffer_enable_pma_fix)(struct anv_cmd_buffer *cmd_buffer,
bool enable);
@@ -101,46 +178,216 @@ void genX(cmd_buffer_mark_image_written)(struct anv_cmd_buffer *cmd_buffer,
void genX(cmd_emit_conditional_render_predicate)(struct anv_cmd_buffer *cmd_buffer);
+struct anv_address genX(cmd_buffer_ray_query_globals)(struct anv_cmd_buffer *cmd_buffer);
+
+void genX(cmd_buffer_ensure_cfe_state)(struct anv_cmd_buffer *cmd_buffer,
+ uint32_t total_scratch);
+
void
genX(emit_urb_setup)(struct anv_device *device, struct anv_batch *batch,
const struct intel_l3_config *l3_config,
VkShaderStageFlags active_stages,
- const unsigned entry_size[4],
+ const struct intel_urb_config *urb_cfg_in,
+ struct intel_urb_config *urb_cfg_out,
enum intel_urb_deref_block_size *deref_block_size);
-void genX(emit_multisample)(struct anv_batch *batch, uint32_t samples,
- const VkSampleLocationEXT *locations);
-
-void genX(emit_sample_pattern)(struct anv_batch *batch, uint32_t samples,
- const VkSampleLocationEXT *locations);
-
-void genX(emit_shading_rate)(struct anv_batch *batch,
- const struct anv_graphics_pipeline *pipeline,
- struct anv_state cps_states,
- struct anv_dynamic_state *dynamic_state);
+void genX(emit_sample_pattern)(struct anv_batch *batch,
+ const struct vk_sample_locations_state *sl);
void genX(cmd_buffer_so_memcpy)(struct anv_cmd_buffer *cmd_buffer,
struct anv_address dst, struct anv_address src,
uint32_t size);
+void genX(cmd_buffer_dispatch_kernel)(struct anv_cmd_buffer *cmd_buffer,
+ struct anv_kernel *kernel,
+ const uint32_t *global_size, /* NULL for indirect */
+ uint32_t arg_count,
+ const struct anv_kernel_arg *args);
+
+void genX(blorp_init_dynamic_states)(struct blorp_context *context);
+
void genX(blorp_exec)(struct blorp_batch *batch,
const struct blorp_params *params);
+void genX(batch_emit_secondary_call)(struct anv_batch *batch,
+ struct anv_address secondary_addr,
+ struct anv_address secondary_return_addr);
+
+void *genX(batch_emit_return)(struct anv_batch *batch);
+
void genX(cmd_emit_timestamp)(struct anv_batch *batch,
- struct anv_bo *bo,
- uint32_t offset);
+ struct anv_device *device,
+ struct anv_address addr,
+ enum anv_timestamp_capture_type type,
+ void *data);
void
-genX(rasterization_mode)(VkPolygonMode raster_mode,
- VkLineRasterizationModeEXT line_mode,
- float line_width,
- uint32_t *api_mode,
- bool *msaa_rasterization_enable);
+genX(batch_emit_post_3dprimitive_was)(struct anv_batch *batch,
+ const struct anv_device *device,
+ uint32_t primitive_topology,
+ uint32_t vertex_count);
-uint32_t
-genX(ms_rasterization_mode)(struct anv_graphics_pipeline *pipeline,
- VkPolygonMode raster_mode);
+void genX(batch_emit_fast_color_dummy_blit)(struct anv_batch *batch,
+ struct anv_device *device);
VkPolygonMode
-genX(raster_polygon_mode)(struct anv_graphics_pipeline *pipeline,
+genX(raster_polygon_mode)(const struct anv_graphics_pipeline *pipeline,
+ VkPolygonMode polygon_mode,
VkPrimitiveTopology primitive_topology);
+
+void
+genX(graphics_pipeline_emit)(struct anv_graphics_pipeline *pipeline,
+ const struct vk_graphics_pipeline_state *state);
+
+void
+genX(compute_pipeline_emit)(struct anv_compute_pipeline *pipeline);
+
+void
+genX(ray_tracing_pipeline_emit)(struct anv_ray_tracing_pipeline *pipeline);
+
+#define anv_shader_bin_get_bsr(bin, local_arg_offset) ({ \
+ assert((local_arg_offset) % 8 == 0); \
+ const struct brw_bs_prog_data *prog_data = \
+ brw_bs_prog_data_const(bin->prog_data); \
+ assert(prog_data->simd_size == 8 || prog_data->simd_size == 16); \
+ \
+ (struct GENX(BINDLESS_SHADER_RECORD)) { \
+ .OffsetToLocalArguments = (local_arg_offset) / 8, \
+ .BindlessShaderDispatchMode = \
+ prog_data->simd_size == 16 ? RT_SIMD16 : RT_SIMD8, \
+ .KernelStartPointer = bin->kernel.offset, \
+ }; \
+})
+
+void
+genX(batch_set_preemption)(struct anv_batch *batch,
+ const struct intel_device_info *devinfo,
+ uint32_t current_pipeline,
+ bool value);
+
+void
+genX(cmd_buffer_set_preemption)(struct anv_cmd_buffer *cmd_buffer, bool value);
+
+void
+genX(batch_emit_pipe_control)(struct anv_batch *batch,
+ const struct intel_device_info *devinfo,
+ uint32_t current_pipeline,
+ enum anv_pipe_bits bits,
+ const char *reason);
+
+void
+genX(batch_emit_pipe_control_write)(struct anv_batch *batch,
+ const struct intel_device_info *devinfo,
+ uint32_t current_pipeline,
+ uint32_t post_sync_op,
+ struct anv_address address,
+ uint32_t imm_data,
+ enum anv_pipe_bits bits,
+ const char *reason);
+
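+
+/* Convenience wrappers that pass the calling function's name as the 'reason'
+ * string used for pipe control debug annotations.
+ */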
+#define genx_batch_emit_pipe_control(a, b, c, d) \
+genX(batch_emit_pipe_control) (a, b, c, d, __func__)
+
+#define genx_batch_emit_pipe_control_write(a, b, c, d, e, f, g) \
+genX(batch_emit_pipe_control_write) (a, b, c, d, e, f, g, __func__)
+
+void genX(batch_emit_breakpoint)(struct anv_batch *batch,
+ struct anv_device *device,
+ bool emit_before_draw);
+
+static inline void
+genX(emit_breakpoint)(struct anv_batch *batch,
+ struct anv_device *device,
+ bool emit_before_draw)
+{
+ if (INTEL_DEBUG(DEBUG_DRAW_BKP))
+ genX(batch_emit_breakpoint)(batch, device, emit_before_draw);
+}
+
+void
+genX(cmd_buffer_begin_companion)(struct anv_cmd_buffer *buffer,
+ VkCommandBufferLevel level);
+
+struct anv_state
+genX(cmd_buffer_begin_companion_rcs_syncpoint)(struct anv_cmd_buffer *cmd_buffer);
+
+void
+genX(cmd_buffer_end_companion_rcs_syncpoint)(struct anv_cmd_buffer *cmd_buffer,
+ struct anv_state syncpoint);
+
+void
+genX(emit_simple_shader_init)(struct anv_simple_shader *state);
+
+void
+genX(emit_simple_shader_dispatch)(struct anv_simple_shader *state,
+ uint32_t num_threads,
+ struct anv_state push_state);
+
+struct anv_state
+genX(simple_shader_alloc_push)(struct anv_simple_shader *state, uint32_t size);
+
+struct anv_address
+genX(simple_shader_push_state_address)(struct anv_simple_shader *state,
+ struct anv_state push_state);
+
+void
+genX(emit_simple_shader_end)(struct anv_simple_shader *state);
+
+VkResult genX(init_trtt_context_state)(struct anv_queue *queue);
+
+VkResult genX(write_trtt_entries)(struct anv_trtt_submission *submit);
+
+void
+genX(cmd_buffer_emit_push_descriptor_buffer_surface)(struct anv_cmd_buffer *cmd_buffer,
+ struct anv_descriptor_set *set);
+
+void
+genX(cmd_buffer_emit_push_descriptor_surfaces)(struct anv_cmd_buffer *cmd_buffer,
+ struct anv_descriptor_set *set);
+
+static inline VkShaderStageFlags
+genX(cmd_buffer_flush_push_descriptors)(struct anv_cmd_buffer *cmd_buffer,
+ struct anv_cmd_pipeline_state *state,
+ struct anv_pipeline *pipeline)
+{
+ if (!pipeline->use_push_descriptor && !pipeline->use_push_descriptor_buffer)
+ return 0;
+
+ assert(pipeline->layout.push_descriptor_set_index != -1);
+ struct anv_descriptor_set *set =
+ state->descriptors[pipeline->layout.push_descriptor_set_index];
+ assert(set->is_push);
+
+ const VkShaderStageFlags push_buffer_dirty =
+ cmd_buffer->state.push_descriptors_dirty &
+ pipeline->use_push_descriptor_buffer;
+ if (push_buffer_dirty) {
+ if (set->desc_surface_state.map == NULL)
+ genX(cmd_buffer_emit_push_descriptor_buffer_surface)(cmd_buffer, set);
+
+ /* Force the next push descriptor update to allocate a new descriptor set. */
+ state->push_descriptor.set_used_on_gpu = true;
+ }
+
+ const VkShaderStageFlags push_descriptor_dirty =
+ cmd_buffer->state.push_descriptors_dirty & pipeline->use_push_descriptor;
+ if (push_descriptor_dirty) {
+ genX(cmd_buffer_emit_push_descriptor_surfaces)(cmd_buffer, set);
+
+ /* Force the next push descriptor update to allocate a new descriptor set. */
+ state->push_descriptor.set_used_on_gpu = true;
+ }
+
+ /* Clear the dirty stages now that we've generated the surface states for
+ * them.
+ */
+ cmd_buffer->state.push_descriptors_dirty &=
+ ~(push_descriptor_dirty | push_buffer_dirty);
+
+ /* Return the binding table stages that need to be updated */
+ return push_buffer_dirty | push_descriptor_dirty;
+}
+
+void genX(emit_embedded_sampler)(struct anv_device *device,
+ struct anv_embedded_sampler *sampler,
+ struct anv_pipeline_embedded_sampler_binding *binding);
diff --git a/src/intel/vulkan/anv_image.c b/src/intel/vulkan/anv_image.c
index 97062a067cc..7d1c57b806d 100644
--- a/src/intel/vulkan/anv_image.c
+++ b/src/intel/vulkan/anv_image.c
@@ -30,7 +30,8 @@
#include "drm-uapi/drm_fourcc.h"
#include "anv_private.h"
-#include "util/debug.h"
+#include "common/intel_aux_map.h"
+#include "util/u_debug.h"
#include "vk_util.h"
#include "util/u_math.h"
@@ -53,27 +54,35 @@ memory_range_end(struct anv_image_memory_range memory_range)
}
/**
- * Get binding for VkImagePlaneMemoryRequirementsInfo and
- * VkBindImagePlaneMemoryInfo.
+ * Get binding for VkImagePlaneMemoryRequirementsInfo,
+ * VkBindImagePlaneMemoryInfo and VkDeviceImageMemoryRequirements.
*/
-static struct anv_image_binding *
-image_aspect_to_binding(struct anv_image *image, VkImageAspectFlags aspect)
+struct anv_image_binding *
+anv_image_aspect_to_binding(struct anv_image *image,
+ VkImageAspectFlags aspect)
{
- uint32_t plane;
+ uint32_t plane = 0;
assert(image->disjoint);
if (image->vk.tiling == VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT) {
/* Spec requires special aspects for modifier images. */
- assert(aspect >= VK_IMAGE_ASPECT_MEMORY_PLANE_0_BIT_EXT &&
- aspect <= VK_IMAGE_ASPECT_MEMORY_PLANE_3_BIT_EXT);
+ assert(aspect == VK_IMAGE_ASPECT_MEMORY_PLANE_0_BIT_EXT ||
+ aspect == VK_IMAGE_ASPECT_MEMORY_PLANE_1_BIT_EXT ||
+ aspect == VK_IMAGE_ASPECT_MEMORY_PLANE_2_BIT_EXT ||
+ aspect == VK_IMAGE_ASPECT_MEMORY_PLANE_3_BIT_EXT);
/* We don't advertise DISJOINT for modifiers with aux, and therefore we
* don't handle queries of the modifier's "aux plane" here.
*/
assert(!isl_drm_modifier_has_aux(image->vk.drm_format_mod));
- plane = aspect - VK_IMAGE_ASPECT_MEMORY_PLANE_0_BIT_EXT;
+ switch(aspect) {
+ case VK_IMAGE_ASPECT_MEMORY_PLANE_0_BIT_EXT: plane = 0; break;
+ case VK_IMAGE_ASPECT_MEMORY_PLANE_1_BIT_EXT: plane = 1; break;
+ case VK_IMAGE_ASPECT_MEMORY_PLANE_2_BIT_EXT: plane = 2; break;
+ case VK_IMAGE_ASPECT_MEMORY_PLANE_3_BIT_EXT: plane = 3; break;
+ }
} else {
plane = anv_image_aspect_to_plane(image, aspect);
}
@@ -126,51 +135,44 @@ image_binding_grow(const struct anv_device *device,
&image->bindings[binding].memory_range;
if (has_implicit_offset) {
- offset = align_u64(container->offset + container->size, alignment);
+ offset = align64(container->offset + container->size, alignment);
} else {
/* Offset must be validated because it comes from
* VkImageDrmFormatModifierExplicitCreateInfoEXT.
*/
if (unlikely(!anv_is_aligned(offset, alignment))) {
- return vk_errorf(device, &device->vk.base,
+ return vk_errorf(device,
VK_ERROR_INVALID_DRM_FORMAT_MODIFIER_PLANE_LAYOUT_EXT,
"VkImageDrmFormatModifierExplicitCreateInfoEXT::"
"pPlaneLayouts[]::offset is misaligned");
}
-
- /* We require that surfaces be added in memory-order. This simplifies the
- * layout validation required by
- * VkImageDrmFormatModifierExplicitCreateInfoEXT,
- */
- if (unlikely(offset < container->size)) {
- return vk_errorf(device, &device->vk.base,
- VK_ERROR_INVALID_DRM_FORMAT_MODIFIER_PLANE_LAYOUT_EXT,
- "VkImageDrmFormatModifierExplicitCreateInfoEXT::"
- "pPlaneLayouts[]::offset is too small");
- }
}
- if (__builtin_add_overflow(offset, size, &container->size)) {
+   /* Surfaces can be added out of memory order. Track the end of each memory
+ * plane to update the binding size properly.
+ */
+ uint64_t memory_range_end;
+ if (__builtin_add_overflow(offset, size, &memory_range_end)) {
if (has_implicit_offset) {
assert(!"overflow");
- return vk_errorf(device, &device->vk.base,
- VK_ERROR_UNKNOWN,
+ return vk_errorf(device, VK_ERROR_UNKNOWN,
"internal error: overflow in %s", __func__);
} else {
- return vk_errorf(device, &device->vk.base,
+ return vk_errorf(device,
VK_ERROR_INVALID_DRM_FORMAT_MODIFIER_PLANE_LAYOUT_EXT,
"VkImageDrmFormatModifierExplicitCreateInfoEXT::"
"pPlaneLayouts[]::offset is too large");
}
}
+ container->size = MAX2(container->size, memory_range_end);
container->alignment = MAX2(container->alignment, alignment);
*out_range = (struct anv_image_memory_range) {
.binding = binding,
- .offset = offset,
- .size = size,
.alignment = alignment,
+ .size = size,
+ .offset = offset,
};
return VK_SUCCESS;
@@ -200,26 +202,55 @@ memory_range_merge(struct anv_image_memory_range *a,
a->size = MAX2(a->size, b.offset + b.size);
}
-static isl_surf_usage_flags_t
-choose_isl_surf_usage(VkImageCreateFlags vk_create_flags,
- VkImageUsageFlags vk_usage,
- isl_surf_usage_flags_t isl_extra_usage,
- VkImageAspectFlagBits aspect)
+isl_surf_usage_flags_t
+anv_image_choose_isl_surf_usage(struct anv_physical_device *device,
+ VkImageCreateFlags vk_create_flags,
+ VkImageUsageFlags vk_usage,
+ isl_surf_usage_flags_t isl_extra_usage,
+ VkImageAspectFlagBits aspect,
+ VkImageCompressionFlagsEXT comp_flags)
{
isl_surf_usage_flags_t isl_usage = isl_extra_usage;
+   /* On platforms like MTL, we choose to allocate additional CCS memory at
+    * the back of the VkDeviceMemory objects, since different images can share
+    * an AUX-TT PTE (the HW doesn't care about the image format in the PTE).
+    * That means we can always ignore the AUX-TT alignment requirement from an
+    * ISL point of view.
+ */
+ if (device->alloc_aux_tt_mem)
+ isl_usage |= ISL_SURF_USAGE_NO_AUX_TT_ALIGNMENT_BIT;
+
if (vk_usage & VK_IMAGE_USAGE_SAMPLED_BIT)
isl_usage |= ISL_SURF_USAGE_TEXTURE_BIT;
if (vk_usage & VK_IMAGE_USAGE_INPUT_ATTACHMENT_BIT)
isl_usage |= ISL_SURF_USAGE_TEXTURE_BIT;
+ if (vk_usage & VK_IMAGE_USAGE_STORAGE_BIT)
+ isl_usage |= ISL_SURF_USAGE_STORAGE_BIT;
+
if (vk_usage & VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT)
isl_usage |= ISL_SURF_USAGE_RENDER_TARGET_BIT;
+ if (vk_usage & VK_IMAGE_USAGE_FRAGMENT_SHADING_RATE_ATTACHMENT_BIT_KHR)
+ isl_usage |= ISL_SURF_USAGE_CPB_BIT;
+
+ if (vk_create_flags & VK_IMAGE_CREATE_SPARSE_BINDING_BIT)
+ isl_usage |= ISL_SURF_USAGE_SPARSE_BIT |
+ ISL_SURF_USAGE_DISABLE_AUX_BIT;
+
+ if (vk_usage & VK_IMAGE_USAGE_VIDEO_DECODE_DST_BIT_KHR ||
+ vk_usage & VK_IMAGE_USAGE_VIDEO_DECODE_DPB_BIT_KHR)
+ isl_usage |= ISL_SURF_USAGE_VIDEO_DECODE_BIT;
+
if (vk_create_flags & VK_IMAGE_CREATE_CUBE_COMPATIBLE_BIT)
isl_usage |= ISL_SURF_USAGE_CUBE_BIT;
+ if (vk_create_flags & (VK_IMAGE_CREATE_2D_VIEW_COMPATIBLE_BIT_EXT |
+ VK_IMAGE_CREATE_2D_ARRAY_COMPATIBLE_BIT))
+ isl_usage |= ISL_SURF_USAGE_2D_3D_COMPATIBLE_BIT;
+
/* Even if we're only using it for transfer operations, clears to depth and
* stencil images happen as depth and stencil so they need the right ISL
* usage bits or else things will fall apart.
@@ -253,6 +284,9 @@ choose_isl_surf_usage(VkImageCreateFlags vk_create_flags,
isl_usage |= ISL_SURF_USAGE_RENDER_TARGET_BIT;
}
+ if (comp_flags & VK_IMAGE_COMPRESSION_DISABLED_EXT)
+ isl_usage |= ISL_SURF_USAGE_DISABLE_AUX_BIT;
+
return isl_usage;
}
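
As a concrete reading of the mapping above (illustrative only, assuming no extra ISL usage flags and a device without alloc_aux_tt_mem): a cube-compatible image created with SAMPLED and COLOR_ATTACHMENT usage resolves to

   isl_surf_usage_flags_t expected = ISL_SURF_USAGE_TEXTURE_BIT |
                                     ISL_SURF_USAGE_RENDER_TARGET_BIT |
                                     ISL_SURF_USAGE_CUBE_BIT;

and adding VK_IMAGE_CREATE_SPARSE_BINDING_BIT would further OR in ISL_SURF_USAGE_SPARSE_BIT and ISL_SURF_USAGE_DISABLE_AUX_BIT.
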
@@ -319,67 +353,138 @@ add_surface(struct anv_device *device,
&surf->memory_range);
}
+static bool
+can_fast_clear_with_non_zero_color(const struct intel_device_info *devinfo,
+ const struct anv_image *image,
+ uint32_t plane,
+ const VkImageFormatListCreateInfo *fmt_list)
+{
+ /* If we don't have an AUX surface where fast clears apply, we can return
+ * early.
+ */
+ if (!isl_aux_usage_has_fast_clears(image->planes[plane].aux_usage))
+ return false;
+
+ /* On TGL (< C0), if a block of fragment shader outputs matches the surface's
+ * clear color, the HW may convert them to fast-clears (see HSD 1607794140).
+ * This can lead to rendering corruption if not handled properly. We
+ * restrict the clear color to zero to avoid issues that can occur with:
+ * - Texture view rendering (including blorp_copy calls)
+ * - Images with multiple levels or array layers
+ */
+ if (image->planes[plane].aux_usage == ISL_AUX_USAGE_FCV_CCS_E)
+ return false;
+
+ /* Turning on non-zero fast clears for CCS_E causes a performance
+ * regression in games such as F1 22 and RDR2 by introducing additional
+ * partial resolves. Keep non-zero fast clears off until we can fix
+ * performance.
+ */
+ if (image->planes[plane].aux_usage == ISL_AUX_USAGE_CCS_E &&
+ devinfo->ver >= 12)
+ return false;
+
+ /* Non-mutable image: we can fast clear with any color supported by the HW.
+ */
+ if (!(image->vk.create_flags & VK_IMAGE_CREATE_MUTABLE_FORMAT_BIT))
+ return true;
+
+ /* Mutable image with no format list: we have to assume all formats */
+ if (!fmt_list || fmt_list->viewFormatCount == 0)
+ return false;
+
+ enum isl_format img_format = image->planes[plane].primary_surface.isl.format;
+
+ /* Check bit compatibility for clear color components */
+ for (uint32_t i = 0; i < fmt_list->viewFormatCount; i++) {
+ if (fmt_list->pViewFormats[i] == VK_FORMAT_UNDEFINED)
+ continue;
+
+ struct anv_format_plane view_format_plane =
+ anv_get_format_plane(devinfo, fmt_list->pViewFormats[i],
+ plane, image->vk.tiling);
+
+ enum isl_format view_format = view_format_plane.isl_format;
+
+ if (!isl_formats_have_same_bits_per_channel(img_format, view_format))
+ return false;
+ }
+
+ return true;
+}
+
/**
- * Do hardware limitations require the image plane to use a shadow surface?
+ * Return true if the storage image could be used with atomics.
*
- * If hardware limitations force us to use a shadow surface, then the same
- * limitations may also constrain the tiling of the primary surface; therefore
- * paramater @a inout_primary_tiling_flags.
- *
- * If the image plane is a separate stencil plane and if the user provided
- * VkImageStencilUsageCreateInfoEXT, then @a usage must be stencilUsage.
- *
- * @see anv_image::planes[]::shadow_surface
+ * If the image was created with an explicit format, we check it for typed
+ * atomic support. If MUTABLE_FORMAT_BIT is set, then we check the optional
+ * format list, seeing if /any/ of the formats support typed atomics. If no
+ * list is supplied, we fall back to using the bpb, as the application could
+ * make an image view with a format that does use atomics.
*/
static bool
-anv_image_plane_needs_shadow_surface(const struct intel_device_info *devinfo,
- struct anv_format_plane plane_format,
+storage_image_format_supports_atomic(const struct intel_device_info *devinfo,
+ VkImageCreateFlags create_flags,
+ enum isl_format format,
VkImageTiling vk_tiling,
- VkImageUsageFlags vk_plane_usage,
- VkImageCreateFlags vk_create_flags,
- isl_tiling_flags_t *inout_primary_tiling_flags)
-{
- if (devinfo->ver <= 8 &&
- (vk_create_flags & VK_IMAGE_CREATE_BLOCK_TEXEL_VIEW_COMPATIBLE_BIT) &&
- vk_tiling == VK_IMAGE_TILING_OPTIMAL) {
- /* We must fallback to a linear surface because we may not be able to
- * correctly handle the offsets if tiled. (On gfx9,
- * RENDER_SURFACE_STATE::X/Y Offset are sufficient). To prevent garbage
- * performance while texturing, we maintain a tiled shadow surface.
- */
- assert(isl_format_is_compressed(plane_format.isl_format));
+ const VkImageFormatListCreateInfo *fmt_list)
+{
+ if (isl_format_supports_typed_atomics(devinfo, format))
+ return true;
- if (inout_primary_tiling_flags) {
- *inout_primary_tiling_flags = ISL_TILING_LINEAR_BIT;
- }
+ if (!(create_flags & VK_IMAGE_CREATE_MUTABLE_FORMAT_BIT))
+ return false;
- return true;
- }
+ if (fmt_list) {
+ for (uint32_t i = 0; i < fmt_list->viewFormatCount; i++) {
+ if (fmt_list->pViewFormats[i] == VK_FORMAT_UNDEFINED)
+ continue;
- if (devinfo->ver <= 7 &&
- plane_format.aspect == VK_IMAGE_ASPECT_STENCIL_BIT &&
- (vk_plane_usage & VK_IMAGE_USAGE_SAMPLED_BIT)) {
- /* gfx7 can't sample from W-tiled surfaces. */
- return true;
+ enum isl_format view_format =
+ anv_get_isl_format(devinfo, fmt_list->pViewFormats[i],
+ VK_IMAGE_ASPECT_COLOR_BIT, vk_tiling);
+
+ if (isl_format_supports_typed_atomics(devinfo, view_format))
+ return true;
+ }
+
+ return false;
}
- return false;
+ /* No explicit format list. Any 16/32/64bpp format could be used with atomics. */
+ unsigned bpb = isl_format_get_layout(format)->bpb;
+ return bpb == 16 || bpb == 32 || bpb == 64;
}
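
A short illustration of that fallback, reading the checks above (not driver code; the format names are just examples):

   /* VK_FORMAT_R32_SFLOAT, MUTABLE, no format list:
    *    bpb == 32, so a later R32_UINT view could be used for image
    *    atomics -> conservatively report "supports atomics".
    *
    * VK_FORMAT_R32G32B32A32_SFLOAT, MUTABLE, no format list:
    *    bpb == 128, which is not 16/32/64 -> no atomic-capable alias
    *    exists, so the fallback returns false. */
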
-bool
-anv_formats_ccs_e_compatible(const struct intel_device_info *devinfo,
- VkImageCreateFlags create_flags,
- VkFormat vk_format,
- VkImageTiling vk_tiling,
- const VkImageFormatListCreateInfoKHR *fmt_list)
+static enum isl_format
+anv_get_isl_format_with_usage(const struct intel_device_info *devinfo,
+ VkFormat vk_format,
+ VkImageAspectFlagBits vk_aspect,
+ VkImageUsageFlags vk_usage,
+ VkImageTiling vk_tiling)
{
- enum isl_format format =
- anv_get_isl_format(devinfo, vk_format,
- VK_IMAGE_ASPECT_COLOR_BIT, vk_tiling);
+ assert(util_bitcount(vk_usage) == 1);
+ struct anv_format_plane format =
+ anv_get_format_aspect(devinfo, vk_format, vk_aspect,
+ vk_tiling);
+
+ return format.isl_format;
+}
- if (!isl_format_supports_ccs_e(devinfo, format))
+static bool
+formats_ccs_e_compatible(const struct intel_device_info *devinfo,
+ VkImageCreateFlags create_flags,
+ enum isl_format format, VkImageTiling vk_tiling,
+ VkImageUsageFlags vk_usage,
+ const VkImageFormatListCreateInfo *fmt_list)
+{
+ if (!anv_format_supports_ccs_e(devinfo, format))
return false;
+ /* For images created without MUTABLE_FORMAT_BIT set, we know that they will
+ * always be used with the original format. In particular, they will always
+ * be used with a format that supports color compression.
+ */
if (!(create_flags & VK_IMAGE_CREATE_MUTABLE_FORMAT_BIT))
return true;
@@ -387,9 +492,13 @@ anv_formats_ccs_e_compatible(const struct intel_device_info *devinfo,
return false;
for (uint32_t i = 0; i < fmt_list->viewFormatCount; i++) {
+ if (fmt_list->pViewFormats[i] == VK_FORMAT_UNDEFINED)
+ continue;
+
enum isl_format view_format =
- anv_get_isl_format(devinfo, fmt_list->pViewFormats[i],
- VK_IMAGE_ASPECT_COLOR_BIT, vk_tiling);
+ anv_get_isl_format_with_usage(devinfo, fmt_list->pViewFormats[i],
+ VK_IMAGE_ASPECT_COLOR_BIT, vk_usage,
+ vk_tiling);
if (!isl_formats_are_ccs_e_compatible(devinfo, format, view_format))
return false;
@@ -398,6 +507,62 @@ anv_formats_ccs_e_compatible(const struct intel_device_info *devinfo,
return true;
}
+bool
+anv_format_supports_ccs_e(const struct intel_device_info *devinfo,
+ const enum isl_format format)
+{
+ /* CCS_E for YCRCB_NORMAL and YCRCB_SWAP_UV is not currently supported by
+ * ANV so leave it disabled for now.
+ */
+ if (isl_format_is_yuv(format))
+ return false;
+
+ return isl_format_supports_ccs_e(devinfo, format);
+}
+
+bool
+anv_formats_ccs_e_compatible(const struct intel_device_info *devinfo,
+ VkImageCreateFlags create_flags,
+ VkFormat vk_format, VkImageTiling vk_tiling,
+ VkImageUsageFlags vk_usage,
+ const VkImageFormatListCreateInfo *fmt_list)
+{
+ enum isl_format format =
+ anv_get_isl_format_with_usage(devinfo, vk_format,
+ VK_IMAGE_ASPECT_COLOR_BIT,
+ VK_IMAGE_USAGE_SAMPLED_BIT, vk_tiling);
+
+ if (!formats_ccs_e_compatible(devinfo, create_flags, format, vk_tiling,
+ VK_IMAGE_USAGE_SAMPLED_BIT, fmt_list))
+ return false;
+
+ if (vk_usage & VK_IMAGE_USAGE_STORAGE_BIT) {
+ if (devinfo->verx10 < 125)
+ return false;
+
+ enum isl_format lower_format =
+ anv_get_isl_format_with_usage(devinfo, vk_format,
+ VK_IMAGE_ASPECT_COLOR_BIT,
+ VK_IMAGE_USAGE_STORAGE_BIT, vk_tiling);
+
+ if (!isl_formats_are_ccs_e_compatible(devinfo, format, lower_format))
+ return false;
+
+ if (!formats_ccs_e_compatible(devinfo, create_flags, format, vk_tiling,
+ VK_IMAGE_USAGE_STORAGE_BIT, fmt_list))
+ return false;
+
+ /* Disable compression when the surface can potentially be used for atomic
+ * operations.
+ */
+ if (storage_image_format_supports_atomic(devinfo, create_flags, format,
+ vk_tiling, fmt_list))
+ return false;
+ }
+
+ return true;
+}
+
/**
* For color images that have an auxiliary surface, request allocation for an
* additional buffer that mainly stores fast-clear values. Use of this buffer
@@ -455,6 +620,7 @@ anv_formats_ccs_e_compatible(const struct intel_device_info *devinfo,
static VkResult MUST_CHECK
add_aux_state_tracking_buffer(struct anv_device *device,
struct anv_image *image,
+ uint64_t state_offset,
uint32_t plane)
{
assert(image && device);
@@ -462,18 +628,43 @@ add_aux_state_tracking_buffer(struct anv_device *device,
image->vk.aspects & (VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV |
VK_IMAGE_ASPECT_DEPTH_BIT));
- const unsigned clear_color_state_size = device->info.ver >= 10 ?
- device->isl_dev.ss.clear_color_state_size :
- device->isl_dev.ss.clear_value_size;
+ unsigned clear_color_state_size;
+ if (device->info->ver >= 11) {
+ /* When importing an image from another source with a drm modifier that
+ * supports clear color, the clear color values are in a 32-byte struct
+ * defined in drm_fourcc.h. The fast clear type and compression state
+ * are not defined in these drm_fourcc.h, so there won't be memory
+ * allocated for these extra meta data by the source.
+ *
+ * We use the last 2 dwords of the clear color struct's memory to store
+ * the fast clear type and the first compression state, so the driver
+ * doesn't assume the extra size or need another allocation later.
+ *
+ * So far, the 2 stolen dwords are either unused in the clear color struct
+ * or used only for features that aren't enabled. There should be no side
+ * effects on the hardware or on the destinations of images exported by this driver.
+ *
+ * Images with multiple levels or layers are not supported by drm
+ * modifiers, so we don't strictly need this trick there, nor do we run
+ * short of space for multiple compression states. We just apply the
+ * approach to all cases to keep the design unified.
+ *
+ * As a result, the state starts 8 bytes lower than where it should be.
+ */
+ assert(device->isl_dev.ss.clear_color_state_size >= 32);
+ clear_color_state_size = device->isl_dev.ss.clear_color_state_size - 8;
+ } else {
+ clear_color_state_size = device->isl_dev.ss.clear_value_size;
+ }
/* Clear color and fast clear type */
unsigned state_size = clear_color_state_size + 4;
/* We only need to track compression on CCS_E surfaces. */
- if (image->planes[plane].aux_usage == ISL_AUX_USAGE_CCS_E) {
+ if (isl_aux_usage_has_ccs_e(image->planes[plane].aux_usage)) {
if (image->vk.image_type == VK_IMAGE_TYPE_3D) {
for (uint32_t l = 0; l < image->vk.mip_levels; l++)
- state_size += anv_minify(image->vk.extent.depth, l) * 4;
+ state_size += u_minify(image->vk.extent.depth, l) * 4;
} else {
state_size += image->vk.mip_levels * image->vk.array_layers * 4;
}
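
A worked sizing example for the block above (assuming the minimum 32-byte clear color state that the assert allows; the numbers are only illustrative):

   /* Hypothetical Gfx11+ 2D CCS_E image, 10 mip levels, 1 array layer:
    *
    *   clear_color_state_size = 32 - 8          = 24 B
    *   fast clear type dword                    =  4 B
    *   compression state: 10 levels * 1 * 4 B   = 40 B
    *   ----------------------------------------------
    *   state_size                               = 68 B
    *
    * image_binding_grow() then places this range with 4 KiB alignment. */
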
@@ -482,17 +673,53 @@ add_aux_state_tracking_buffer(struct anv_device *device,
enum anv_image_memory_binding binding =
ANV_IMAGE_MEMORY_BINDING_PLANE_0 + plane;
- if (image->vk.drm_format_mod != DRM_FORMAT_MOD_INVALID)
- binding = ANV_IMAGE_MEMORY_BINDING_PRIVATE;
+ /* If an auxiliary surface is used for an externally-shareable image,
+ * we have to hide this from the memory of the image since other
+ * processes with access to the memory may not be aware of it or of
+ * its current state. So put that auxiliary data into a separate
+ * buffer (ANV_IMAGE_MEMORY_BINDING_PRIVATE).
+ *
+ * But when the image is created with a drm modifier that supports
+ * clear color, it will be exported along with main surface.
+ */
+ if (anv_image_is_externally_shared(image)
+ && !isl_drm_modifier_get_info(image->vk.drm_format_mod)->supports_clear_color) {
+ binding = ANV_IMAGE_MEMORY_BINDING_PRIVATE;
+ }
/* We believe that 256B alignment may be sufficient, but we choose 4K due to
* lack of testing. And MI_LOAD/STORE operations require dword-alignment.
*/
return image_binding_grow(device, image, binding,
- ANV_OFFSET_IMPLICIT, state_size, 4096,
+ state_offset, state_size, 4096,
&image->planes[plane].fast_clear_memory_range);
}
+static VkResult MUST_CHECK
+add_compression_control_buffer(struct anv_device *device,
+ struct anv_image *image,
+ uint32_t plane,
+ uint32_t binding,
+ uint64_t offset)
+{
+ assert(device->info->has_aux_map);
+
+ uint64_t ratio = intel_aux_get_main_to_aux_ratio(device->aux_map_ctx);
+ assert(image->planes[plane].primary_surface.isl.size_B % ratio == 0);
+ uint64_t size = image->planes[plane].primary_surface.isl.size_B / ratio;
+
+ /* The diagram in the Bspec section, Memory Compression - Gfx12 (44930),
+ * shows that the CCS is indexed in 256B chunks for TGL, 4K chunks for MTL.
+ * When modifiers are in use, the 4K alignment requirement of the
+ * PLANE_AUX_DIST::Auxiliary Surface Distance field must be considered
+ * (Bspec 50379). Keep things simple and just use 4K.
+ */
+ uint32_t alignment = 4096;
+
+ return image_binding_grow(device, image, binding, offset, size, alignment,
+ &image->planes[plane].compr_ctrl_memory_range);
+}
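
A quick sizing sketch for the helper above; the 256:1 ratio is only an assumption for the example, the real value comes from intel_aux_get_main_to_aux_ratio():

   /*   primary surface size : 64 MiB
    *   main-to-aux ratio     : 256 (assumed for the example)
    *   CCS binding size      : 64 MiB / 256 = 256 KiB
    *   placed with 4 KiB alignment, per the Bspec note above. */
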
+
/**
* The return code indicates whether creation of the VkImage should continue
* or fail, not whether the creation of the aux surface succeeded. If the aux
@@ -507,9 +734,10 @@ add_aux_surface_if_supported(struct anv_device *device,
struct anv_image *image,
uint32_t plane,
struct anv_format_plane plane_format,
- const VkImageFormatListCreateInfoKHR *fmt_list,
+ const VkImageFormatListCreateInfo *fmt_list,
uint64_t offset,
uint32_t stride,
+ uint64_t aux_state_offset,
isl_surf_usage_flags_t isl_extra_usage_flags)
{
VkImageAspectFlags aspect = plane_format.aspect;
@@ -522,6 +750,27 @@ add_aux_surface_if_supported(struct anv_device *device,
if ((isl_extra_usage_flags & ISL_SURF_USAGE_DISABLE_AUX_BIT))
return VK_SUCCESS;
+ /* TODO: consider whether compression with sparse is workable. */
+ if (anv_image_is_sparse(image))
+ return VK_SUCCESS;
+
+ /* If the resource is created with sharing mode CONCURRENT when multiple
+ * queues are supported, we can't support compression since we can't do a
+ * FULL_RESOLVE/PARTIAL_RESOLVE to construct the main surface data without
+ * a barrier.
+ */
+ if (image->vk.sharing_mode == VK_SHARING_MODE_CONCURRENT &&
+ device->queue_count > 1)
+ return VK_SUCCESS;
+
+ uint32_t binding;
+ if (image->vk.drm_format_mod == DRM_FORMAT_MOD_INVALID ||
+ isl_drm_modifier_has_aux(image->vk.drm_format_mod)) {
+ binding = ANV_IMAGE_MEMORY_BINDING_PLANE_0 + plane;
+ } else {
+ binding = ANV_IMAGE_MEMORY_BINDING_PRIVATE;
+ }
+
if (aspect == VK_IMAGE_ASPECT_DEPTH_BIT) {
/* We don't advertise that depth buffers could be used as storage
* images.
@@ -536,24 +785,12 @@ add_aux_surface_if_supported(struct anv_device *device,
return VK_SUCCESS;
}
- if (device->info.ver == 7) {
- anv_perf_warn(device, &image->vk.base, "Implement gfx7 HiZ");
+ /* TODO: Adjust blorp for multi-LOD HiZ surface on Gen8 - Gen9 */
+ if (image->vk.mip_levels > 1 && device->info->ver <= 9) {
+ anv_perf_warn(VK_LOG_OBJS(&image->vk.base), "Enable multi-LOD HiZ");
return VK_SUCCESS;
}
- if (image->vk.mip_levels > 1) {
- anv_perf_warn(device, &image->vk.base, "Enable multi-LOD HiZ");
- return VK_SUCCESS;
- }
-
- if (device->info.ver == 8 && image->vk.samples > 1) {
- anv_perf_warn(device, &image->vk.base, "Enable gfx8 multisampled HiZ");
- return VK_SUCCESS;
- }
-
- if (INTEL_DEBUG & DEBUG_NO_HIZ)
- return VK_SUCCESS;
-
ok = isl_surf_get_hiz_surf(&device->isl_dev,
&image->planes[plane].primary_surface.isl,
&image->planes[plane].aux_surface.isl);
@@ -575,32 +812,45 @@ add_aux_surface_if_supported(struct anv_device *device,
*
* TODO: This is a heuristic trade-off; we haven't tuned it at all.
*/
- assert(device->info.ver >= 12);
+ assert(device->info->ver >= 12);
image->planes[plane].aux_usage = ISL_AUX_USAGE_HIZ_CCS_WT;
} else {
- assert(device->info.ver >= 12);
+ assert(device->info->ver >= 12);
image->planes[plane].aux_usage = ISL_AUX_USAGE_HIZ_CCS;
}
result = add_surface(device, image, &image->planes[plane].aux_surface,
- ANV_IMAGE_MEMORY_BINDING_PLANE_0 + plane,
- ANV_OFFSET_IMPLICIT);
+ binding, ANV_OFFSET_IMPLICIT);
if (result != VK_SUCCESS)
return result;
+ if (anv_image_plane_uses_aux_map(device, image, plane)) {
+ result = add_compression_control_buffer(device, image, plane,
+ binding,
+ ANV_OFFSET_IMPLICIT);
+ if (result != VK_SUCCESS)
+ return result;
+ }
+
if (image->planes[plane].aux_usage == ISL_AUX_USAGE_HIZ_CCS_WT)
- return add_aux_state_tracking_buffer(device, image, plane);
+ return add_aux_state_tracking_buffer(device, image,
+ aux_state_offset,
+ plane);
} else if (aspect == VK_IMAGE_ASPECT_STENCIL_BIT) {
-
- if (INTEL_DEBUG & DEBUG_NO_RBC)
- return VK_SUCCESS;
-
if (!isl_surf_supports_ccs(&device->isl_dev,
&image->planes[plane].primary_surface.isl,
NULL))
return VK_SUCCESS;
image->planes[plane].aux_usage = ISL_AUX_USAGE_STC_CCS;
+
+ if (device->info->has_aux_map) {
+ result = add_compression_control_buffer(device, image, plane,
+ binding,
+ ANV_OFFSET_IMPLICIT);
+ if (result != VK_SUCCESS)
+ return result;
+ }
} else if ((aspect & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV) && image->vk.samples == 1) {
if (image->n_planes != 1) {
/* Multiplanar images seem to hit a sampler bug with CCS and R16G16
@@ -610,7 +860,7 @@ add_aux_surface_if_supported(struct anv_device *device,
return VK_SUCCESS;
}
- if ((image->vk.create_flags & VK_IMAGE_CREATE_ALIAS_BIT)) {
+ if ((image->vk.create_flags & VK_IMAGE_CREATE_ALIAS_BIT) && !image->from_wsi) {
/* The image may alias a plane of a multiplanar image. Above we ban
* CCS on multiplanar images.
*
@@ -622,35 +872,6 @@ add_aux_surface_if_supported(struct anv_device *device,
return VK_SUCCESS;
}
- if (!isl_format_supports_rendering(&device->info,
- plane_format.isl_format)) {
- /* Disable CCS because it is not useful (we can't render to the image
- * with CCS enabled). While it may be technically possible to enable
- * CCS for this case, we currently don't have things hooked up to get
- * it working.
- */
- anv_perf_warn(device, &image->vk.base,
- "This image format doesn't support rendering. "
- "Not allocating an CCS buffer.");
- return VK_SUCCESS;
- }
-
- if (device->info.ver >= 12 && image->vk.array_layers > 1) {
- /* HSD 14010672564: On TGL, if a block of fragment shader outputs
- * match the surface's clear color, the HW may convert them to
- * fast-clears. Anv only does clear color tracking for the first
- * slice unfortunately. Disable CCS until anv gains more clear color
- * tracking abilities.
- */
- anv_perf_warn(device, &image->vk.base,
- "HW may put fast-clear blocks on more slices than SW "
- "currently tracks. Not allocating a CCS buffer.");
- return VK_SUCCESS;
- }
-
- if (INTEL_DEBUG & DEBUG_NO_RBC)
- return VK_SUCCESS;
-
ok = isl_surf_get_ccs_surf(&device->isl_dev,
&image->planes[plane].primary_surface.isl,
NULL,
@@ -660,23 +881,30 @@ add_aux_surface_if_supported(struct anv_device *device,
return VK_SUCCESS;
/* Choose aux usage */
- if (!(image->vk.usage & VK_IMAGE_USAGE_STORAGE_BIT) &&
- anv_formats_ccs_e_compatible(&device->info,
- image->vk.create_flags,
- image->vk.format,
- image->vk.tiling,
- fmt_list)) {
- /* For images created without MUTABLE_FORMAT_BIT set, we know that
- * they will always be used with the original format. In particular,
- * they will always be used with a format that supports color
- * compression. If it's never used as a storage image, then it will
- * only be used through the sampler or the as a render target. This
- * means that it's safe to just leave compression on at all times for
- * these formats.
- */
- image->planes[plane].aux_usage = ISL_AUX_USAGE_CCS_E;
- } else if (device->info.ver >= 12) {
- anv_perf_warn(device, &image->vk.base,
+ if (anv_formats_ccs_e_compatible(device->info, image->vk.create_flags,
+ image->vk.format, image->vk.tiling,
+ image->vk.usage, fmt_list)) {
+ if (intel_needs_workaround(device->info, 1607794140)) {
+ /* FCV is permanently enabled on this HW. */
+ image->planes[plane].aux_usage = ISL_AUX_USAGE_FCV_CCS_E;
+ } else if (device->info->verx10 >= 125 &&
+ !device->physical->disable_fcv) {
+ /* FCV is enabled via 3DSTATE_3D_MODE. We'd expect plain CCS_E to
+ * perform better because it allows for non-zero fast clear colors,
+ * but we've run into regressions in several benchmarks (F1 22 and
+ * RDR2) when trying to enable it. When non-zero clear colors are
+ * enabled, we've observed many partial resolves. We haven't yet
+ * root-caused what layout transitions are causing these resolves,
+ * so in the meantime, we choose to reduce our clear color support.
+ * With only zero clear colors being supported, we might as well
+ * turn on FCV.
+ */
+ image->planes[plane].aux_usage = ISL_AUX_USAGE_FCV_CCS_E;
+ } else {
+ image->planes[plane].aux_usage = ISL_AUX_USAGE_CCS_E;
+ }
+ } else if (device->info->ver >= 12) {
+ anv_perf_warn(VK_LOG_OBJS(&image->vk.base),
"The CCS_D aux mode is not yet handled on "
"Gfx12+. Not allocating a CCS buffer.");
image->planes[plane].aux_surface.isl.size_B = 0;
@@ -685,21 +913,22 @@ add_aux_surface_if_supported(struct anv_device *device,
image->planes[plane].aux_usage = ISL_AUX_USAGE_CCS_D;
}
- if (!device->physical->has_implicit_ccs) {
- enum anv_image_memory_binding binding =
- ANV_IMAGE_MEMORY_BINDING_PLANE_0 + plane;
-
- if (image->vk.drm_format_mod != DRM_FORMAT_MOD_INVALID &&
- !isl_drm_modifier_has_aux(image->vk.drm_format_mod))
- binding = ANV_IMAGE_MEMORY_BINDING_PRIVATE;
-
- result = add_surface(device, image, &image->planes[plane].aux_surface,
- binding, offset);
- if (result != VK_SUCCESS)
- return result;
+ if (device->info->has_flat_ccs) {
+ result = VK_SUCCESS;
+ } else if (device->info->has_aux_map) {
+ result = add_compression_control_buffer(device, image, plane,
+ binding, offset);
+ } else {
+ result = add_surface(device, image,
+ &image->planes[plane].aux_surface, binding,
+ offset);
}
+ if (result != VK_SUCCESS)
+ return result;
- return add_aux_state_tracking_buffer(device, image, plane);
+ return add_aux_state_tracking_buffer(device, image,
+ aux_state_offset,
+ plane);
} else if ((aspect & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV) && image->vk.samples > 1) {
assert(!(image->vk.usage & VK_IMAGE_USAGE_STORAGE_BIT));
ok = isl_surf_get_mcs_surf(&device->isl_dev,
@@ -711,51 +940,45 @@ add_aux_surface_if_supported(struct anv_device *device,
image->planes[plane].aux_usage = ISL_AUX_USAGE_MCS;
result = add_surface(device, image, &image->planes[plane].aux_surface,
- ANV_IMAGE_MEMORY_BINDING_PLANE_0 + plane,
- ANV_OFFSET_IMPLICIT);
+ binding, ANV_OFFSET_IMPLICIT);
if (result != VK_SUCCESS)
return result;
- return add_aux_state_tracking_buffer(device, image, plane);
+ return add_aux_state_tracking_buffer(device, image,
+ aux_state_offset,
+ plane);
}
return VK_SUCCESS;
}
static VkResult
-add_shadow_surface(struct anv_device *device,
- struct anv_image *image,
- uint32_t plane,
- struct anv_format_plane plane_format,
- uint32_t stride,
- VkImageUsageFlags vk_plane_usage)
+add_video_buffers(struct anv_device *device,
+ struct anv_image *image,
+ const struct VkVideoProfileListInfoKHR *profile_list)
{
ASSERTED bool ok;
+ unsigned size = 0;
- ok = isl_surf_init(&device->isl_dev,
- &image->planes[plane].shadow_surface.isl,
- .dim = vk_to_isl_surf_dim[image->vk.image_type],
- .format = plane_format.isl_format,
- .width = image->vk.extent.width,
- .height = image->vk.extent.height,
- .depth = image->vk.extent.depth,
- .levels = image->vk.mip_levels,
- .array_len = image->vk.array_layers,
- .samples = image->vk.samples,
- .min_alignment_B = 0,
- .row_pitch_B = stride,
- .usage = ISL_SURF_USAGE_TEXTURE_BIT |
- (vk_plane_usage & ISL_SURF_USAGE_CUBE_BIT),
- .tiling_flags = ISL_TILING_ANY_MASK);
-
- /* isl_surf_init() will fail only if provided invalid input. Invalid input
- * here is illegal in Vulkan.
- */
- assert(ok);
+ for (unsigned i = 0; i < profile_list->profileCount; i++) {
+ if (profile_list->pProfiles[i].videoCodecOperation == VK_VIDEO_CODEC_OPERATION_DECODE_H264_BIT_KHR) {
+ unsigned w_mb = DIV_ROUND_UP(image->vk.extent.width, ANV_MB_WIDTH);
+ unsigned h_mb = DIV_ROUND_UP(image->vk.extent.height, ANV_MB_HEIGHT);
+ size = w_mb * h_mb * 128;
+ }
+ else if (profile_list->pProfiles[i].videoCodecOperation == VK_VIDEO_CODEC_OPERATION_DECODE_H265_BIT_KHR) {
+ unsigned w_mb = DIV_ROUND_UP(image->vk.extent.width, 32);
+ unsigned h_mb = DIV_ROUND_UP(image->vk.extent.height, 32);
+ size = ALIGN(w_mb * h_mb, 2) << 6;
+ }
+ }
- return add_surface(device, image, &image->planes[plane].shadow_surface,
- ANV_IMAGE_MEMORY_BINDING_PLANE_0 + plane,
- ANV_OFFSET_IMPLICIT);
+ if (size == 0)
+ return VK_SUCCESS;
+
+ ok = image_binding_grow(device, image, ANV_IMAGE_MEMORY_BINDING_PRIVATE,
+ ANV_OFFSET_IMPLICIT, size, 65536, &image->vid_dmv_top_surface);
+ return ok;
}
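
A worked example of the sizing above for a 1920x1080 decode target, assuming ANV_MB_WIDTH and ANV_MB_HEIGHT are the usual 16x16 macroblock dimensions:

   /* H.264: w_mb = DIV_ROUND_UP(1920, 16) = 120
    *        h_mb = DIV_ROUND_UP(1080, 16) =  68
    *        size = 120 * 68 * 128         = 1,044,480 bytes
    *
    * H.265: w_mb = DIV_ROUND_UP(1920, 32) =  60
    *        h_mb = DIV_ROUND_UP(1080, 32) =  34
    *        size = ALIGN(60 * 34, 2) << 6 = 2040 * 64 = 130,560 bytes
    *
    * Either way the buffer lands in the private binding with 64 KiB
    * alignment. */
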
/**
@@ -777,11 +1000,21 @@ add_primary_surface(struct anv_device *device,
struct anv_surface *anv_surf = &image->planes[plane].primary_surface;
bool ok;
+ uint32_t width = image->vk.extent.width;
+ uint32_t height = image->vk.extent.height;
+ const struct vk_format_ycbcr_info *ycbcr_info =
+ vk_format_get_ycbcr_info(image->vk.format);
+ if (ycbcr_info) {
+ assert(plane < ycbcr_info->n_planes);
+ width /= ycbcr_info->planes[plane].denominator_scales[0];
+ height /= ycbcr_info->planes[plane].denominator_scales[1];
+ }
+
ok = isl_surf_init(&device->isl_dev, &anv_surf->isl,
.dim = vk_to_isl_surf_dim[image->vk.image_type],
.format = plane_format.isl_format,
- .width = image->vk.extent.width / plane_format.denominator_scales[0],
- .height = image->vk.extent.height / plane_format.denominator_scales[1],
+ .width = width,
+ .height = height,
.depth = image->vk.extent.depth,
.levels = image->vk.mip_levels,
.array_len = image->vk.array_layers,
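
For orientation, the width/height division above with a standard 4:2:0 two-plane format (e.g. VK_FORMAT_G8_B8R8_2PLANE_420_UNORM) works out as follows:

   /* 1920x1080 image:
    *   plane 0 (luma)   : scales {1, 1} -> 1920 x 1080
    *   plane 1 (chroma) : scales {2, 2} ->  960 x  540 */
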
@@ -810,6 +1043,16 @@ memory_range_is_aligned(struct anv_image_memory_range memory_range)
{
return anv_is_aligned(memory_range.offset, memory_range.alignment);
}
+
+static bool MUST_CHECK
+memory_ranges_equal(struct anv_image_memory_range a,
+ struct anv_image_memory_range b)
+{
+ return a.binding == b.binding &&
+ a.alignment == b.alignment &&
+ a.size == b.size &&
+ a.offset == b.offset;
+}
#endif
struct check_memory_range_params {
@@ -858,7 +1101,7 @@ static void
check_memory_bindings(const struct anv_device *device,
const struct anv_image *image)
{
-#ifdef DEBUG
+#if MESA_DEBUG
/* As we inspect each part of the image, we merge the part's memory range
* into these accumulation ranges.
*/
@@ -878,9 +1121,12 @@ check_memory_bindings(const struct anv_device *device,
: ANV_IMAGE_MEMORY_BINDING_MAIN;
/* Aliasing is incompatible with the private binding because it does not
- * live in a VkDeviceMemory.
+ * live in a VkDeviceMemory. The exceptions are swapchain images and
+ * private bindings used for video motion vector buffers.
*/
assert(!(image->vk.create_flags & VK_IMAGE_CREATE_ALIAS_BIT) ||
+ image->from_wsi ||
+ (plane->primary_surface.isl.usage & ISL_SURF_USAGE_VIDEO_DECODE_BIT) ||
image->bindings[ANV_IMAGE_MEMORY_BINDING_PRIVATE].memory_range.size == 0);
/* Check primary surface */
@@ -888,20 +1134,22 @@ check_memory_bindings(const struct anv_device *device,
.test_surface = &plane->primary_surface,
.expect_binding = primary_binding);
- /* Check shadow surface */
- if (anv_surface_is_valid(&plane->shadow_surface)) {
- check_memory_range(accum_ranges,
- .test_surface = &plane->shadow_surface,
- .expect_binding = primary_binding);
- }
-
/* Check aux_surface */
- if (anv_surface_is_valid(&plane->aux_surface)) {
+ const struct anv_image_memory_range *aux_mem_range =
+ anv_image_get_aux_memory_range(image, p);
+ if (aux_mem_range->size > 0) {
enum anv_image_memory_binding binding = primary_binding;
- if (image->vk.drm_format_mod != DRM_FORMAT_MOD_INVALID &&
- !isl_drm_modifier_has_aux(image->vk.drm_format_mod))
+ /* If an auxiliary surface is used for an externally-shareable image,
+ * we have to hide this from the memory of the image since other
+ * processes with access to the memory may not be aware of it or of
+ * its current state. So put that auxiliary data into a separate
+ * buffer (ANV_IMAGE_MEMORY_BINDING_PRIVATE).
+ */
+ if (anv_image_is_externally_shared(image) &&
+ !isl_drm_modifier_has_aux(image->vk.drm_format_mod)) {
binding = ANV_IMAGE_MEMORY_BINDING_PRIVATE;
+ }
/* Display hardware requires that the aux surface start at
* a higher address than the primary surface. The 3D hardware
@@ -909,7 +1157,7 @@ check_memory_bindings(const struct anv_device *device,
* the image is sent to display.
*/
check_memory_range(accum_ranges,
- .test_surface = &plane->aux_surface,
+ .test_range = aux_mem_range,
.expect_binding = binding);
}
@@ -917,8 +1165,19 @@ check_memory_bindings(const struct anv_device *device,
if (plane->fast_clear_memory_range.size > 0) {
enum anv_image_memory_binding binding = primary_binding;
- if (image->vk.drm_format_mod != DRM_FORMAT_MOD_INVALID)
+ /* If an auxiliary surface is used for an externally-shareable image,
+ * we have to hide this from the memory of the image since other
+ * processes with access to the memory may not be aware of it or of
+ * its current state. So put that auxiliary data into a separate
+ * buffer (ANV_IMAGE_MEMORY_BINDING_PRIVATE).
+ *
+ * But when the image is created with a drm modifier that supports
+ * clear color, it will be exported along with main surface.
+ */
+ if (anv_image_is_externally_shared(image)
+ && !isl_drm_modifier_get_info(image->vk.drm_format_mod)->supports_clear_color) {
binding = ANV_IMAGE_MEMORY_BINDING_PRIVATE;
+ }
/* We believe that 256B alignment may be sufficient, but we choose 4K
* due to lack of testing. And MI_LOAD/STORE operations require
@@ -969,7 +1228,7 @@ check_drm_format_mod(const struct anv_device *device,
isl_drm_modifier_get_info(image->vk.drm_format_mod);
/* Driver must support the modifier. */
- assert(isl_drm_modifier_get_score(&device->info, isl_mod_info->modifier));
+ assert(isl_drm_modifier_get_score(device->info, isl_mod_info->modifier));
/* Enforced by us, not the Vulkan spec. */
assert(image->vk.image_type == VK_IMAGE_TYPE_2D);
@@ -988,9 +1247,8 @@ check_drm_format_mod(const struct anv_device *device,
assert(isl_layout->txc == ISL_TXC_NONE);
assert(isl_layout->colorspace == ISL_COLORSPACE_LINEAR ||
isl_layout->colorspace == ISL_COLORSPACE_SRGB);
- assert(!anv_surface_is_valid(&plane->shadow_surface));
- if (isl_mod_info->aux_usage != ISL_AUX_USAGE_NONE) {
+ if (isl_drm_modifier_has_aux(isl_mod_info->modifier)) {
/* Reject DISJOINT for consistency with the GL driver. */
assert(!image->disjoint);
@@ -998,8 +1256,11 @@ check_drm_format_mod(const struct anv_device *device,
* The inverse, however, does not hold; if the modifier has no aux
* usage, then we may enable a private aux surface.
*/
- if (plane->aux_usage != isl_mod_info->aux_usage) {
- return vk_errorf(device, &image->vk.base, VK_ERROR_UNKNOWN,
+ if ((isl_mod_info->supports_media_compression &&
+ plane->aux_usage != ISL_AUX_USAGE_MC) ||
+ (isl_mod_info->supports_render_compression &&
+ !isl_aux_usage_has_ccs_e(plane->aux_usage))) {
+ return vk_errorf(device, VK_ERROR_UNKNOWN,
"image with modifier unexpectedly has wrong aux "
"usage");
}
@@ -1020,55 +1281,67 @@ add_all_surfaces_implicit_layout(
const VkImageFormatListCreateInfo *format_list_info,
uint32_t stride,
isl_tiling_flags_t isl_tiling_flags,
- const struct anv_image_create_info *create_info)
+ isl_surf_usage_flags_t isl_extra_usage_flags)
{
- assert(create_info);
- const struct intel_device_info *devinfo = &device->info;
- isl_surf_usage_flags_t isl_extra_usage_flags =
- create_info->isl_extra_usage_flags;
+ const struct intel_device_info *devinfo = device->info;
VkResult result;
+ const struct vk_format_ycbcr_info *ycbcr_info =
+ vk_format_get_ycbcr_info(image->vk.format);
+ if (ycbcr_info)
+ assert(ycbcr_info->n_planes == image->n_planes);
+
+ unsigned num_aspects = 0;
+ VkImageAspectFlagBits aspects[3];
u_foreach_bit(b, image->vk.aspects) {
- VkImageAspectFlagBits aspect = 1 << b;
+ assert(num_aspects < 3);
+ aspects[num_aspects++] = 1 << b;
+ }
+ assert(num_aspects == image->n_planes);
+
+ /* The Android hardware buffer YV12 format has the planes ordered as Y-Cr-Cb,
+ * while Vulkan expects VK_FORMAT_G8_B8_R8_3PLANE_420_UNORM to be in Y-Cb-Cr.
+ * Adjust the order we add the ISL surfaces accordingly so the implicit
+ * offset gets calculated correctly.
+ */
+ if (image->from_ahb && image->vk.format == VK_FORMAT_G8_B8_R8_3PLANE_420_UNORM) {
+ assert(num_aspects == 3);
+ assert(aspects[1] == VK_IMAGE_ASPECT_PLANE_1_BIT);
+ assert(aspects[2] == VK_IMAGE_ASPECT_PLANE_2_BIT);
+ aspects[1] = VK_IMAGE_ASPECT_PLANE_2_BIT;
+ aspects[2] = VK_IMAGE_ASPECT_PLANE_1_BIT;
+ }
+
+ for (unsigned i = 0; i < num_aspects; i++) {
+ VkImageAspectFlagBits aspect = aspects[i];
const uint32_t plane = anv_image_aspect_to_plane(image, aspect);
const struct anv_format_plane plane_format =
anv_get_format_plane(devinfo, image->vk.format, plane, image->vk.tiling);
+ enum isl_format isl_fmt = plane_format.isl_format;
+ assert(isl_fmt != ISL_FORMAT_UNSUPPORTED);
+
+ uint32_t plane_stride = stride * isl_format_get_layout(isl_fmt)->bpb / 8;
+ if (ycbcr_info)
+ plane_stride /= ycbcr_info->planes[plane].denominator_scales[0];
+
VkImageUsageFlags vk_usage = vk_image_usage(&image->vk, aspect);
isl_surf_usage_flags_t isl_usage =
- choose_isl_surf_usage(image->vk.create_flags, vk_usage,
- isl_extra_usage_flags, aspect);
-
- /* Must call this before adding any surfaces because it may modify
- * isl_tiling_flags.
- */
- bool needs_shadow =
- anv_image_plane_needs_shadow_surface(devinfo, plane_format,
- image->vk.tiling, vk_usage,
- image->vk.create_flags,
- &isl_tiling_flags);
+ anv_image_choose_isl_surf_usage(device->physical,
+ image->vk.create_flags, vk_usage,
+ isl_extra_usage_flags, aspect,
+ image->vk.compr_flags);
result = add_primary_surface(device, image, plane, plane_format,
- ANV_OFFSET_IMPLICIT, stride,
+ ANV_OFFSET_IMPLICIT, plane_stride,
isl_tiling_flags, isl_usage);
if (result != VK_SUCCESS)
return result;
- if (needs_shadow) {
- result = add_shadow_surface(device, image, plane, plane_format,
- stride, vk_usage);
- if (result != VK_SUCCESS)
- return result;
- }
-
- /* Disable aux if image supports export without modifiers. */
- if (image->vk.external_handle_types != 0 &&
- image->vk.tiling != VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT)
- continue;
-
result = add_aux_surface_if_supported(device, image, plane, plane_format,
format_list_info,
- ANV_OFFSET_IMPLICIT, stride,
+ ANV_OFFSET_IMPLICIT, plane_stride,
+ ANV_OFFSET_IMPLICIT,
isl_extra_usage_flags);
if (result != VK_SUCCESS)
return result;
@@ -1089,56 +1362,54 @@ add_all_surfaces_explicit_layout(
isl_tiling_flags_t isl_tiling_flags,
isl_surf_usage_flags_t isl_extra_usage_flags)
{
- const struct intel_device_info *devinfo = &device->info;
+ const struct intel_device_info *devinfo = device->info;
const uint32_t mod_plane_count = drm_info->drmFormatModifierPlaneCount;
const bool mod_has_aux =
isl_drm_modifier_has_aux(drm_info->drmFormatModifier);
VkResult result;
- /* About valid usage in the Vulkan spec:
- *
- * Unlike vanilla vkCreateImage, which produces undefined behavior on user
- * error, here the spec requires the implementation to return
- * VK_ERROR_INVALID_DRM_FORMAT_MODIFIER_PLANE_LAYOUT_EXT if the app provides
- * a bad plane layout. However, the spec does require
- * drmFormatModifierPlaneCount to be valid.
- *
- * Most validation of plane layout occurs in add_surface().
- */
-
- /* We support a restricted set of images with modifiers.
- *
- * With aux usage,
- * - Format plane count must be 1.
- * - Memory plane count must be 2.
- * Without aux usage,
- * - Each format plane must map to a distint memory plane.
- *
- * For the other cases, currently there is no way to properly map memory
- * planes to format planes and aux planes due to the lack of defined ABI
- * for external multi-planar images.
+ /* Currently there is no way to properly map memory planes to format planes
+ * and aux planes due to the lack of defined ABI for external multi-planar
+ * images.
*/
if (image->n_planes == 1)
assert(image->vk.aspects == VK_IMAGE_ASPECT_COLOR_BIT);
else
assert(!(image->vk.aspects & ~VK_IMAGE_ASPECT_PLANES_BITS_ANV));
- if (mod_has_aux)
- assert(image->n_planes == 1 && mod_plane_count == 2);
- else
+ if (mod_has_aux) {
+ assert(image->n_planes == 1);
+
+ /* About valid usage in the Vulkan spec:
+ *
+ * Unlike vanilla vkCreateImage, which produces undefined behavior on user
+ * error, here the spec requires the implementation to return
+ * VK_ERROR_INVALID_DRM_FORMAT_MODIFIER_PLANE_LAYOUT_EXT if the app provides
+ * a bad plane layout. However, the spec does require
+ * drmFormatModifierPlaneCount to be valid.
+ *
+ * Most validation of plane layout occurs in add_surface().
+ */
+ uint32_t n_mod_planes =
+ isl_drm_modifier_get_plane_count(devinfo,
+ drm_info->drmFormatModifier,
+ image->n_planes);
+ assert(n_mod_planes == mod_plane_count);
+ } else {
assert(image->n_planes == mod_plane_count);
+ }
/* Reject special values in the app-provided plane layouts. */
for (uint32_t i = 0; i < mod_plane_count; ++i) {
if (drm_info->pPlaneLayouts[i].rowPitch == 0) {
- return vk_errorf(device, &device->vk.base,
+ return vk_errorf(device,
VK_ERROR_INVALID_DRM_FORMAT_MODIFIER_PLANE_LAYOUT_EXT,
"VkImageDrmFormatModifierExplicitCreateInfoEXT::"
"pPlaneLayouts[%u]::rowPitch is 0", i);
}
if (drm_info->pPlaneLayouts[i].offset == ANV_OFFSET_IMPLICIT) {
- return vk_errorf(device, &device->vk.base,
+ return vk_errorf(device,
VK_ERROR_INVALID_DRM_FORMAT_MODIFIER_PLANE_LAYOUT_EXT,
"VkImageDrmFormatModifierExplicitCreateInfoEXT::"
"pPlaneLayouts[%u]::offset is %" PRIu64,
@@ -1162,27 +1433,44 @@ add_all_surfaces_explicit_layout(
if (result != VK_SUCCESS)
return result;
- if (!mod_has_aux) {
- /* Even though the modifier does not support aux, try to create
- * a driver-private aux to improve performance.
+ if (mod_has_aux) {
+ const VkSubresourceLayout flat_ccs_layout = {
+ .offset = ANV_OFFSET_IMPLICIT,
+ };
+
+ const VkSubresourceLayout *aux_layout;
+
+ uint64_t aux_state_offset = ANV_OFFSET_IMPLICIT;
+
+ /* We already asserted on image->n_planes == 1 when mod_has_aux is
+ * true above, so the indexes of aux and clear color are just hard-
+ * coded without ambiguity.
*/
- result = add_aux_surface_if_supported(device, image, plane,
- format_plane,
- format_list_info,
- ANV_OFFSET_IMPLICIT, 0,
- isl_extra_usage_flags);
- if (result != VK_SUCCESS)
- return result;
- } else {
- const VkSubresourceLayout *aux_layout = &drm_info->pPlaneLayouts[1];
+ if (devinfo->has_flat_ccs) {
+ aux_layout = &flat_ccs_layout;
+ if (isl_drm_modifier_get_info(
+ drm_info->drmFormatModifier)->supports_clear_color) {
+ aux_state_offset = drm_info->pPlaneLayouts[1].offset;
+ }
+ } else {
+ aux_layout = &drm_info->pPlaneLayouts[1];
+ if (isl_drm_modifier_get_info(
+ drm_info->drmFormatModifier)->supports_clear_color) {
+ aux_state_offset = drm_info->pPlaneLayouts[2].offset;
+ }
+ }
+
result = add_aux_surface_if_supported(device, image, plane,
format_plane,
format_list_info,
aux_layout->offset,
aux_layout->rowPitch,
+ aux_state_offset,
isl_extra_usage_flags);
if (result != VK_SUCCESS)
return result;
+
+ assert(isl_aux_usage_has_ccs(image->planes[plane].aux_usage));
}
}
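
A plane-index sketch of the aux/clear-color selection above; the modifier name is only an example for orientation, the code keys solely off has_flat_ccs and supports_clear_color:

   /* Legacy CCS modifier with clear color
    * (e.g. I915_FORMAT_MOD_Y_TILED_GEN12_RC_CCS_CC):
    *    pPlaneLayouts[0] = main surface
    *    pPlaneLayouts[1] = CCS           -> aux_layout
    *    pPlaneLayouts[2] = clear color   -> aux_state_offset
    *
    * Flat-CCS modifier with clear color:
    *    pPlaneLayouts[0] = main surface
    *    pPlaneLayouts[1] = clear color   -> aux_state_offset
    *    (no CCS plane; the aux offset stays ANV_OFFSET_IMPLICIT) */
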
@@ -1248,33 +1536,107 @@ alloc_private_binding(struct anv_device *device,
return VK_SUCCESS;
}
- return anv_device_alloc_bo(device, "image-binding-private",
- binding->memory_range.size, 0, 0,
- &binding->address.bo);
+ VkResult result = anv_device_alloc_bo(device, "image-binding-private",
+ binding->memory_range.size, 0, 0,
+ &binding->address.bo);
+ if (result == VK_SUCCESS) {
+ pthread_mutex_lock(&device->mutex);
+ list_addtail(&image->link, &device->image_private_objects);
+ pthread_mutex_unlock(&device->mutex);
+ }
+
+ return result;
+}
+
+static void
+anv_image_finish_sparse_bindings(struct anv_image *image)
+{
+ struct anv_device *device =
+ container_of(image->vk.base.device, struct anv_device, vk);
+
+ assert(anv_image_is_sparse(image));
+
+ for (int i = 0; i < ANV_IMAGE_MEMORY_BINDING_END; i++) {
+ struct anv_image_binding *b = &image->bindings[i];
+
+ if (b->sparse_data.size != 0) {
+ assert(b->memory_range.size == b->sparse_data.size);
+ assert(b->address.offset == b->sparse_data.address);
+ anv_free_sparse_bindings(device, &b->sparse_data);
+ }
+ }
+}
+
+static VkResult MUST_CHECK
+anv_image_init_sparse_bindings(struct anv_image *image,
+ const struct anv_image_create_info *create_info)
+{
+ struct anv_device *device =
+ container_of(image->vk.base.device, struct anv_device, vk);
+ VkResult result;
+
+ assert(anv_image_is_sparse(image));
+
+ enum anv_bo_alloc_flags alloc_flags = 0;
+ uint64_t explicit_address = 0;
+ if (image->vk.create_flags & VK_IMAGE_CREATE_DESCRIPTOR_BUFFER_CAPTURE_REPLAY_BIT_EXT) {
+ alloc_flags |= ANV_BO_ALLOC_FIXED_ADDRESS;
+
+ const VkOpaqueCaptureDescriptorDataCreateInfoEXT *opaque_info =
+ vk_find_struct_const(create_info->vk_info->pNext,
+ OPAQUE_CAPTURE_DESCRIPTOR_DATA_CREATE_INFO_EXT);
+ if (opaque_info)
+ explicit_address = *((const uint64_t *)opaque_info->opaqueCaptureDescriptorData);
+ }
+
+ for (int i = 0; i < ANV_IMAGE_MEMORY_BINDING_END; i++) {
+ struct anv_image_binding *b = &image->bindings[i];
+
+ if (b->memory_range.size != 0) {
+ assert(b->sparse_data.size == 0);
+
+ /* From the spec, Custom Sparse Image Block Shapes section:
+ * "... the size in bytes of the custom sparse image block shape
+ * will be reported in VkMemoryRequirements::alignment."
+ *
+ * ISL should have set this for us, so just assert it here.
+ */
+ assert(b->memory_range.alignment == ANV_SPARSE_BLOCK_SIZE);
+ assert(b->memory_range.size % ANV_SPARSE_BLOCK_SIZE == 0);
+
+ result = anv_init_sparse_bindings(device,
+ b->memory_range.size,
+ &b->sparse_data,
+ alloc_flags,
+ explicit_address,
+ &b->address);
+ if (result != VK_SUCCESS) {
+ anv_image_finish_sparse_bindings(image);
+ return result;
+ }
+ }
+ }
+
+ return VK_SUCCESS;
}
VkResult
-anv_image_create(VkDevice _device,
- const struct anv_image_create_info *create_info,
- const VkAllocationCallbacks* alloc,
- VkImage *pImage)
+anv_image_init(struct anv_device *device, struct anv_image *image,
+ const struct anv_image_create_info *create_info)
{
- ANV_FROM_HANDLE(anv_device, device, _device);
const VkImageCreateInfo *pCreateInfo = create_info->vk_info;
const struct VkImageDrmFormatModifierExplicitCreateInfoEXT *mod_explicit_info = NULL;
const struct isl_drm_modifier_info *isl_mod_info = NULL;
- struct anv_image *image = NULL;
VkResult r;
- image = vk_image_create(&device->vk, pCreateInfo, alloc, sizeof(*image));
- if (image == NULL)
- return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+ vk_image_init(&device->vk, &image->vk, pCreateInfo);
image->vk.usage = anv_image_create_usage(pCreateInfo, image->vk.usage);
image->vk.stencil_usage =
anv_image_create_usage(pCreateInfo, image->vk.stencil_usage);
if (pCreateInfo->tiling == VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT) {
+ assert(!image->vk.wsi_legacy_scanout);
mod_explicit_info =
vk_find_struct_const(pCreateInfo->pNext,
IMAGE_DRM_FORMAT_MODIFIER_EXPLICIT_CREATE_INFO_EXT);
@@ -1304,12 +1666,17 @@ anv_image_create(VkDevice _device,
if (image->vk.external_handle_types &
VK_EXTERNAL_MEMORY_HANDLE_TYPE_ANDROID_HARDWARE_BUFFER_BIT_ANDROID) {
image->from_ahb = true;
- *pImage = anv_image_to_handle(image);
+#if DETECT_OS_ANDROID
+ image->vk.ahb_format = anv_ahb_format_for_vk_format(image->vk.format);
+#endif
return VK_SUCCESS;
}
image->n_planes = anv_get_format_planes(image->vk.format);
+ image->from_wsi =
+ vk_find_struct_const(pCreateInfo->pNext, WSI_IMAGE_CREATE_INFO_MESA) != NULL;
+
/* The Vulkan 1.2.165 glossary says:
*
* A disjoint image consists of multiple disjoint planes, and is created
@@ -1318,30 +1685,88 @@ anv_image_create(VkDevice _device,
image->disjoint = image->n_planes > 1 &&
(pCreateInfo->flags & VK_IMAGE_CREATE_DISJOINT_BIT);
+ isl_surf_usage_flags_t isl_extra_usage_flags = create_info->isl_extra_usage_flags;
+ if (anv_is_format_emulated(device->physical, pCreateInfo->format)) {
+ assert(image->n_planes == 1 &&
+ vk_format_is_compressed(image->vk.format));
+ assert(!(image->vk.create_flags & VK_IMAGE_CREATE_SPARSE_RESIDENCY_BIT));
+
+ image->emu_plane_format =
+ anv_get_emulation_format(device->physical, image->vk.format);
+
+ /* For fetching the raw compressed data and storing the decompressed
+ * data
+ */
+ image->vk.create_flags |=
+ VK_IMAGE_CREATE_MUTABLE_FORMAT_BIT |
+ VK_IMAGE_CREATE_BLOCK_TEXEL_VIEW_COMPATIBLE_BIT;
+ if (image->vk.image_type == VK_IMAGE_TYPE_3D)
+ image->vk.create_flags |= VK_IMAGE_CREATE_2D_VIEW_COMPATIBLE_BIT_EXT;
+ image->vk.usage |=
+ VK_IMAGE_USAGE_SAMPLED_BIT | VK_IMAGE_USAGE_STORAGE_BIT;
+
+ /* TODO: enable compression on emulation plane */
+ isl_extra_usage_flags |= ISL_SURF_USAGE_DISABLE_AUX_BIT;
+ }
+
+ /* Disable aux if image supports export without modifiers. */
+ if (image->vk.external_handle_types != 0 &&
+ image->vk.tiling != VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT)
+ isl_extra_usage_flags |= ISL_SURF_USAGE_DISABLE_AUX_BIT;
+
const isl_tiling_flags_t isl_tiling_flags =
- choose_isl_tiling_flags(&device->info, create_info, isl_mod_info,
+ choose_isl_tiling_flags(device->info, create_info, isl_mod_info,
image->vk.wsi_legacy_scanout);
- const VkImageFormatListCreateInfoKHR *fmt_list =
+ const VkImageFormatListCreateInfo *fmt_list =
vk_find_struct_const(pCreateInfo->pNext,
- IMAGE_FORMAT_LIST_CREATE_INFO_KHR);
+ IMAGE_FORMAT_LIST_CREATE_INFO);
if (mod_explicit_info) {
r = add_all_surfaces_explicit_layout(device, image, fmt_list,
mod_explicit_info, isl_tiling_flags,
- create_info->isl_extra_usage_flags);
+ isl_extra_usage_flags);
} else {
- r = add_all_surfaces_implicit_layout(device, image, fmt_list, 0,
+ r = add_all_surfaces_implicit_layout(device, image, fmt_list, create_info->stride,
isl_tiling_flags,
- create_info);
+ isl_extra_usage_flags);
}
if (r != VK_SUCCESS)
goto fail;
- r = alloc_private_binding(device, image, pCreateInfo);
- if (r != VK_SUCCESS)
- goto fail;
+ if (image->emu_plane_format != VK_FORMAT_UNDEFINED) {
+ const struct intel_device_info *devinfo = device->info;
+ const uint32_t plane = image->n_planes;
+ const struct anv_format_plane plane_format = anv_get_format_plane(
+ devinfo, image->emu_plane_format, 0, image->vk.tiling);
+
+ isl_surf_usage_flags_t isl_usage = anv_image_choose_isl_surf_usage(
+ device->physical, image->vk.create_flags, image->vk.usage,
+ isl_extra_usage_flags, VK_IMAGE_ASPECT_COLOR_BIT,
+ image->vk.compr_flags);
+
+ r = add_primary_surface(device, image, plane, plane_format,
+ ANV_OFFSET_IMPLICIT, 0,
+ isl_tiling_flags, isl_usage);
+ if (r != VK_SUCCESS)
+ goto fail;
+ }
+
+ const VkVideoProfileListInfoKHR *video_profile =
+ vk_find_struct_const(pCreateInfo->pNext,
+ VIDEO_PROFILE_LIST_INFO_KHR);
+ if (video_profile) {
+ r = add_video_buffers(device, image, video_profile);
+ if (r != VK_SUCCESS)
+ goto fail;
+ }
+
+ if (!create_info->no_private_binding_alloc) {
+ r = alloc_private_binding(device, image, pCreateInfo);
+ if (r != VK_SUCCESS)
+ goto fail;
+ }
check_memory_bindings(device, image);
@@ -1349,95 +1774,135 @@ anv_image_create(VkDevice _device,
if (r != VK_SUCCESS)
goto fail;
- *pImage = anv_image_to_handle(image);
+ /* Once we have all the bindings, determine whether we can do non 0 fast
+ * clears for each plane.
+ */
+ for (uint32_t p = 0; p < image->n_planes; p++) {
+ image->planes[p].can_non_zero_fast_clear =
+ can_fast_clear_with_non_zero_color(device->info, image, p, fmt_list);
+ }
+
+ if (anv_image_is_sparse(image)) {
+ r = anv_image_init_sparse_bindings(image, create_info);
+ if (r != VK_SUCCESS)
+ goto fail;
+ }
return VK_SUCCESS;
fail:
- vk_image_destroy(&device->vk, alloc, &image->vk);
+ vk_image_finish(&image->vk);
return r;
}
-static struct anv_image *
-anv_swapchain_get_image(VkSwapchainKHR swapchain,
- uint32_t index)
+void
+anv_image_finish(struct anv_image *image)
{
- uint32_t n_images = index + 1;
- VkImage *images = malloc(sizeof(*images) * n_images);
- VkResult result = wsi_common_get_images(swapchain, &n_images, images);
+ struct anv_device *device =
+ container_of(image->vk.base.device, struct anv_device, vk);
- if (result != VK_SUCCESS && result != VK_INCOMPLETE) {
- free(images);
- return NULL;
+ if (anv_image_is_sparse(image))
+ anv_image_finish_sparse_bindings(image);
+
+ /* Unmap a CCS so that if the bound region of the image is rebound to
+ * another image, the AUX tables will be cleared to allow for a new
+ * mapping.
+ */
+ for (int p = 0; p < image->n_planes; ++p) {
+ if (image->planes[p].aux_tt.mapped) {
+ intel_aux_map_del_mapping(device->aux_map_ctx,
+ image->planes[p].aux_tt.addr,
+ image->planes[p].aux_tt.size);
+ }
+ }
+
+ if (image->from_gralloc) {
+ assert(!image->disjoint);
+ assert(image->n_planes == 1);
+ assert(image->planes[0].primary_surface.memory_range.binding ==
+ ANV_IMAGE_MEMORY_BINDING_MAIN);
+ assert(image->bindings[ANV_IMAGE_MEMORY_BINDING_MAIN].address.bo != NULL);
+ anv_device_release_bo(device, image->bindings[ANV_IMAGE_MEMORY_BINDING_MAIN].address.bo);
}
- ANV_FROM_HANDLE(anv_image, image, images[index]);
- free(images);
+ struct anv_bo *private_bo = image->bindings[ANV_IMAGE_MEMORY_BINDING_PRIVATE].address.bo;
+ if (private_bo) {
+ pthread_mutex_lock(&device->mutex);
+ list_del(&image->link);
+ pthread_mutex_unlock(&device->mutex);
+ anv_device_release_bo(device, private_bo);
+ }
- return image;
+ vk_image_finish(&image->vk);
}
-static VkResult
-anv_image_from_swapchain(VkDevice device,
- const VkImageCreateInfo *pCreateInfo,
- const VkImageSwapchainCreateInfoKHR *swapchain_info,
- const VkAllocationCallbacks *pAllocator,
- VkImage *pImage)
+static struct anv_image *
+anv_swapchain_get_image(VkSwapchainKHR swapchain,
+ uint32_t index)
{
- struct anv_image *swapchain_image = anv_swapchain_get_image(swapchain_info->swapchain, 0);
- assert(swapchain_image);
-
- VkImageCreateInfo local_create_info = *pCreateInfo;
- local_create_info.pNext = NULL;
+ VkImage image = wsi_common_get_image(swapchain, index);
+ return anv_image_from_handle(image);
+}
- /* Added by wsi code. */
- local_create_info.usage |= VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT;
+static VkResult
+anv_image_init_from_create_info(struct anv_device *device,
+ struct anv_image *image,
+ const VkImageCreateInfo *pCreateInfo,
+ bool no_private_binding_alloc)
+{
+ if (pCreateInfo->flags & VK_IMAGE_CREATE_SPARSE_RESIDENCY_BIT) {
+ VkResult result =
+ anv_sparse_image_check_support(device->physical,
+ pCreateInfo->flags,
+ pCreateInfo->tiling,
+ pCreateInfo->samples,
+ pCreateInfo->imageType,
+ pCreateInfo->format);
+ if (result != VK_SUCCESS)
+ return result;
+ }
- /* The spec requires TILING_OPTIMAL as input, but the swapchain image may
- * privately use a different tiling. See spec anchor
- * #swapchain-wsi-image-create-info .
- */
- assert(local_create_info.tiling == VK_IMAGE_TILING_OPTIMAL);
- local_create_info.tiling = swapchain_image->vk.tiling;
+ const VkNativeBufferANDROID *gralloc_info =
+ vk_find_struct_const(pCreateInfo->pNext, NATIVE_BUFFER_ANDROID);
+ if (gralloc_info)
+ return anv_image_init_from_gralloc(device, image, pCreateInfo,
+ gralloc_info);
- VkImageDrmFormatModifierListCreateInfoEXT local_modifier_info = {
- .sType = VK_STRUCTURE_TYPE_IMAGE_DRM_FORMAT_MODIFIER_LIST_CREATE_INFO_EXT,
- .drmFormatModifierCount = 1,
- .pDrmFormatModifiers = &swapchain_image->vk.drm_format_mod,
+ struct anv_image_create_info create_info = {
+ .vk_info = pCreateInfo,
+ .no_private_binding_alloc = no_private_binding_alloc,
};
- if (swapchain_image->vk.drm_format_mod != DRM_FORMAT_MOD_INVALID)
- __vk_append_struct(&local_create_info, &local_modifier_info);
-
- assert(swapchain_image->vk.image_type == local_create_info.imageType);
- assert(swapchain_image->vk.format == local_create_info.format);
- assert(swapchain_image->vk.extent.width == local_create_info.extent.width);
- assert(swapchain_image->vk.extent.height == local_create_info.extent.height);
- assert(swapchain_image->vk.extent.depth == local_create_info.extent.depth);
- assert(swapchain_image->vk.array_layers == local_create_info.arrayLayers);
- assert(swapchain_image->vk.samples == local_create_info.samples);
- assert(swapchain_image->vk.tiling == local_create_info.tiling);
- assert(swapchain_image->vk.usage == local_create_info.usage);
+ /* For dmabuf imports, configure the primary surface without support for
+ * compression if the modifier doesn't specify it. This helps to create
+ * VkImages with memory requirements that are compatible with the buffers
+ * apps provide.
+ */
+ const struct VkImageDrmFormatModifierExplicitCreateInfoEXT *mod_explicit_info =
+ vk_find_struct_const(pCreateInfo->pNext,
+ IMAGE_DRM_FORMAT_MODIFIER_EXPLICIT_CREATE_INFO_EXT);
+ if (mod_explicit_info &&
+ !isl_drm_modifier_has_aux(mod_explicit_info->drmFormatModifier))
+ create_info.isl_extra_usage_flags |= ISL_SURF_USAGE_DISABLE_AUX_BIT;
- return anv_image_create(device,
- &(struct anv_image_create_info) {
- .vk_info = &local_create_info,
- },
- pAllocator,
- pImage);
+ return anv_image_init(device, image, &create_info);
}
-VkResult
-anv_CreateImage(VkDevice device,
- const VkImageCreateInfo *pCreateInfo,
- const VkAllocationCallbacks *pAllocator,
- VkImage *pImage)
+VkResult anv_CreateImage(
+ VkDevice _device,
+ const VkImageCreateInfo* pCreateInfo,
+ const VkAllocationCallbacks* pAllocator,
+ VkImage* pImage)
{
- const VkNativeBufferANDROID *gralloc_info =
- vk_find_struct_const(pCreateInfo->pNext, NATIVE_BUFFER_ANDROID);
- if (gralloc_info)
- return anv_image_from_gralloc(device, pCreateInfo, gralloc_info,
- pAllocator, pImage);
+ ANV_FROM_HANDLE(anv_device, device, _device);
+
+ if ((device->physical->sparse_type == ANV_SPARSE_TYPE_NOT_SUPPORTED) &&
+ INTEL_DEBUG(DEBUG_SPARSE) &&
+ pCreateInfo->flags & (VK_IMAGE_CREATE_SPARSE_BINDING_BIT |
+ VK_IMAGE_CREATE_SPARSE_RESIDENCY_BIT |
+ VK_IMAGE_CREATE_SPARSE_ALIASED_BIT))
+ fprintf(stderr, "=== %s %s:%d flags:0x%08x\n", __func__, __FILE__,
+ __LINE__, pCreateInfo->flags);
#ifndef VK_USE_PLATFORM_ANDROID_KHR
/* Ignore swapchain creation info on Android. Since we don't have an
@@ -1446,17 +1911,33 @@ anv_CreateImage(VkDevice device,
*/
const VkImageSwapchainCreateInfoKHR *swapchain_info =
vk_find_struct_const(pCreateInfo->pNext, IMAGE_SWAPCHAIN_CREATE_INFO_KHR);
- if (swapchain_info && swapchain_info->swapchain != VK_NULL_HANDLE)
- return anv_image_from_swapchain(device, pCreateInfo, swapchain_info,
- pAllocator, pImage);
+ if (swapchain_info && swapchain_info->swapchain != VK_NULL_HANDLE) {
+ return wsi_common_create_swapchain_image(&device->physical->wsi_device,
+ pCreateInfo,
+ swapchain_info->swapchain,
+ pImage);
+ }
#endif
- return anv_image_create(device,
- &(struct anv_image_create_info) {
- .vk_info = pCreateInfo,
- },
- pAllocator,
- pImage);
+ struct anv_image *image =
+ vk_object_zalloc(&device->vk, pAllocator, sizeof(*image),
+ VK_OBJECT_TYPE_IMAGE);
+ if (!image)
+ return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
+
+ VkResult result = anv_image_init_from_create_info(device, image,
+ pCreateInfo,
+ false);
+ if (result != VK_SUCCESS) {
+ vk_object_free(&device->vk, pAllocator, image);
+ return result;
+ }
+
+ ANV_RMV(image_create, device, false, image);
+
+ *pImage = anv_image_to_handle(image);
+
+ return result;
}
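
/* For reference, a minimal application-side vkCreateImage call that ends up
 * in anv_CreateImage above. Illustrative sketch only, not driver code; it
 * assumes <vulkan/vulkan.h> and a valid VkDevice.
 */
static VkImage
create_basic_2d_image(VkDevice dev, VkFormat format, uint32_t w, uint32_t h)
{
   const VkImageCreateInfo info = {
      .sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO,
      .imageType = VK_IMAGE_TYPE_2D,
      .format = format,
      .extent = { .width = w, .height = h, .depth = 1 },
      .mipLevels = 1,
      .arrayLayers = 1,
      .samples = VK_SAMPLE_COUNT_1_BIT,
      .tiling = VK_IMAGE_TILING_OPTIMAL,
      .usage = VK_IMAGE_USAGE_SAMPLED_BIT | VK_IMAGE_USAGE_TRANSFER_DST_BIT,
      .sharingMode = VK_SHARING_MODE_EXCLUSIVE,
      .initialLayout = VK_IMAGE_LAYOUT_UNDEFINED,
   };
   VkImage img = VK_NULL_HANDLE;
   if (vkCreateImage(dev, &info, NULL, &img) != VK_SUCCESS)
      return VK_NULL_HANDLE;
   return img;
}
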
void
@@ -1469,20 +1950,12 @@ anv_DestroyImage(VkDevice _device, VkImage _image,
if (!image)
return;
- if (image->from_gralloc) {
- assert(!image->disjoint);
- assert(image->n_planes == 1);
- assert(image->planes[0].primary_surface.memory_range.binding ==
- ANV_IMAGE_MEMORY_BINDING_MAIN);
- assert(image->bindings[ANV_IMAGE_MEMORY_BINDING_MAIN].address.bo != NULL);
- anv_device_release_bo(device, image->bindings[ANV_IMAGE_MEMORY_BINDING_MAIN].address.bo);
- }
+ ANV_RMV(image_destroy, device, image);
- struct anv_bo *private_bo = image->bindings[ANV_IMAGE_MEMORY_BINDING_PRIVATE].address.bo;
- if (private_bo)
- anv_device_release_bo(device, private_bo);
+ assert(&device->vk == image->vk.base.device);
+ anv_image_finish(image);
- vk_image_destroy(&device->vk, pAllocator, &image->vk);
+ vk_free2(&device->vk.alloc, pAllocator, image);
}
/* We are binding AHardwareBuffer. Get a description, resolve the
@@ -1493,54 +1966,21 @@ resolve_ahw_image(struct anv_device *device,
struct anv_image *image,
struct anv_device_memory *mem)
{
-#if defined(ANDROID) && ANDROID_API_LEVEL >= 26
- assert(mem->ahw);
+#if DETECT_OS_ANDROID && ANDROID_API_LEVEL >= 26
+ assert(mem->vk.ahardware_buffer);
AHardwareBuffer_Desc desc;
- AHardwareBuffer_describe(mem->ahw, &desc);
+ AHardwareBuffer_describe(mem->vk.ahardware_buffer, &desc);
VkResult result;
/* Check tiling. */
- int i915_tiling = anv_gem_get_tiling(device, mem->bo->gem_handle);
- VkImageTiling vk_tiling;
- isl_tiling_flags_t isl_tiling_flags = 0;
-
- switch (i915_tiling) {
- case I915_TILING_NONE:
- vk_tiling = VK_IMAGE_TILING_LINEAR;
- isl_tiling_flags = ISL_TILING_LINEAR_BIT;
- break;
- case I915_TILING_X:
- vk_tiling = VK_IMAGE_TILING_OPTIMAL;
- isl_tiling_flags = ISL_TILING_X_BIT;
- break;
- case I915_TILING_Y:
- vk_tiling = VK_IMAGE_TILING_OPTIMAL;
- isl_tiling_flags = ISL_TILING_Y0_BIT;
- break;
- case -1:
- default:
- unreachable("Invalid tiling flags.");
- }
-
- assert(vk_tiling == VK_IMAGE_TILING_LINEAR ||
- vk_tiling == VK_IMAGE_TILING_OPTIMAL);
+ enum isl_tiling tiling;
+ result = anv_device_get_bo_tiling(device, mem->bo, &tiling);
+ assert(result == VK_SUCCESS);
+ isl_tiling_flags_t isl_tiling_flags = (1u << tiling);
/* Check format. */
VkFormat vk_format = vk_format_from_android(desc.format, desc.usage);
- enum isl_format isl_fmt = anv_get_isl_format(&device->info,
- vk_format,
- VK_IMAGE_ASPECT_COLOR_BIT,
- vk_tiling);
- assert(isl_fmt != ISL_FORMAT_UNSUPPORTED);
-
- /* Handle RGB(X)->RGBA fallback. */
- switch (desc.format) {
- case AHARDWAREBUFFER_FORMAT_R8G8B8_UNORM:
- case AHARDWAREBUFFER_FORMAT_R8G8B8X8_UNORM:
- if (isl_format_is_rgb(isl_fmt))
- isl_fmt = isl_format_rgb_to_rgba(isl_fmt);
- break;
- }
+ assert(vk_format != VK_FORMAT_UNDEFINED);
/* Now we are able to fill anv_image fields properly and create
* isl_surface for it.
@@ -1548,70 +1988,41 @@ resolve_ahw_image(struct anv_device *device,
vk_image_set_format(&image->vk, vk_format);
image->n_planes = anv_get_format_planes(image->vk.format);
- uint32_t stride = desc.stride *
- (isl_format_get_layout(isl_fmt)->bpb / 8);
-
- struct anv_image_create_info create_info = {
- .isl_extra_usage_flags = ISL_SURF_USAGE_DISABLE_AUX_BIT,
- };
-
- result = add_all_surfaces_implicit_layout(device, image, NULL, stride,
+ result = add_all_surfaces_implicit_layout(device, image, NULL, desc.stride,
isl_tiling_flags,
- &create_info);
+ ISL_SURF_USAGE_DISABLE_AUX_BIT);
assert(result == VK_SUCCESS);
#endif
}
-void anv_GetImageMemoryRequirements2(
- VkDevice _device,
- const VkImageMemoryRequirementsInfo2* pInfo,
- VkMemoryRequirements2* pMemoryRequirements)
+void
+anv_image_get_memory_requirements(struct anv_device *device,
+ struct anv_image *image,
+ VkImageAspectFlags aspects,
+ VkMemoryRequirements2 *pMemoryRequirements)
{
- ANV_FROM_HANDLE(anv_device, device, _device);
- ANV_FROM_HANDLE(anv_image, image, pInfo->image);
-
- const VkImagePlaneMemoryRequirementsInfo *plane_reqs = NULL;
-
/* The Vulkan spec (git aaed022) says:
*
* memoryTypeBits is a bitfield and contains one bit set for every
* supported memory type for the resource. The bit `1<<i` is set if and
* only if the memory type `i` in the VkPhysicalDeviceMemoryProperties
* structure for the physical device is supported.
- *
- * All types are currently supported for images.
*/
- uint32_t memory_types = (1ull << device->physical->memory.type_count) - 1;
-
- vk_foreach_struct_const(ext, pInfo->pNext) {
- switch (ext->sType) {
- case VK_STRUCTURE_TYPE_IMAGE_PLANE_MEMORY_REQUIREMENTS_INFO: {
- assert(image->disjoint);
- plane_reqs = (const VkImagePlaneMemoryRequirementsInfo *) ext;
- const struct anv_image_binding *binding =
- image_aspect_to_binding(image, plane_reqs->planeAspect);
-
- pMemoryRequirements->memoryRequirements = (VkMemoryRequirements) {
- .size = binding->memory_range.size,
- .alignment = binding->memory_range.alignment,
- .memoryTypeBits = memory_types,
- };
- break;
- }
-
- default:
- anv_debug_ignored_stype(ext->sType);
- break;
- }
- }
+ uint32_t memory_types =
+ (image->vk.create_flags & VK_IMAGE_CREATE_PROTECTED_BIT) ?
+ device->physical->memory.protected_mem_types :
+ device->physical->memory.default_buffer_mem_types;
vk_foreach_struct(ext, pMemoryRequirements->pNext) {
switch (ext->sType) {
case VK_STRUCTURE_TYPE_MEMORY_DEDICATED_REQUIREMENTS: {
VkMemoryDedicatedRequirements *requirements = (void *)ext;
- if (image->vk.wsi_legacy_scanout || image->from_ahb) {
- /* If we need to set the tiling for external consumers, we need a
- * dedicated allocation.
+ if (image->vk.wsi_legacy_scanout ||
+ image->from_ahb ||
+ (isl_drm_modifier_has_aux(image->vk.drm_format_mod) &&
+ anv_image_uses_aux_map(device, image))) {
+ /* If we need to set the tiling for external consumers or the
+ * modifier involves AUX tables, we need a dedicated allocation.
*
* See also anv_AllocateMemory.
*/
@@ -1640,173 +2051,537 @@ void anv_GetImageMemoryRequirements2(
* and only if the image is disjoint (that is, multi-planar format and
* VK_IMAGE_CREATE_DISJOINT_BIT).
*/
- assert(image->disjoint == (plane_reqs != NULL));
+ const struct anv_image_binding *binding;
+ if (image->disjoint) {
+ assert(util_bitcount(aspects) == 1);
+ assert(aspects & image->vk.aspects);
+ binding = anv_image_aspect_to_binding(image, aspects);
+ } else {
+ assert(aspects == image->vk.aspects);
+ binding = &image->bindings[ANV_IMAGE_MEMORY_BINDING_MAIN];
+ }
- if (!image->disjoint) {
- const struct anv_image_binding *binding =
- &image->bindings[ANV_IMAGE_MEMORY_BINDING_MAIN];
+ pMemoryRequirements->memoryRequirements = (VkMemoryRequirements) {
+ .size = binding->memory_range.size,
+ .alignment = binding->memory_range.alignment,
+ .memoryTypeBits = memory_types,
+ };
+}
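
/* For context, the memoryTypeBits value filled in above is consumed by
 * applications with the usual "find a compatible memory type" loop. This is
 * an illustrative caller-side sketch, not anv code; the helper name and the
 * <vulkan/vulkan.h> dependency are assumptions.
 */
static uint32_t
find_memory_type_index(VkPhysicalDevice pdev, uint32_t memory_type_bits,
                       VkMemoryPropertyFlags wanted)
{
   VkPhysicalDeviceMemoryProperties props;
   vkGetPhysicalDeviceMemoryProperties(pdev, &props);
   for (uint32_t i = 0; i < props.memoryTypeCount; i++) {
      /* Type i is acceptable if the resource allows it and it carries all
       * the requested property flags.
       */
      if ((memory_type_bits & (1u << i)) &&
          (props.memoryTypes[i].propertyFlags & wanted) == wanted)
         return i;
   }
   return UINT32_MAX; /* no compatible memory type */
}
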
- pMemoryRequirements->memoryRequirements = (VkMemoryRequirements) {
- .size = binding->memory_range.size,
- .alignment = binding->memory_range.alignment,
- .memoryTypeBits = memory_types,
- };
+void anv_GetImageMemoryRequirements2(
+ VkDevice _device,
+ const VkImageMemoryRequirementsInfo2* pInfo,
+ VkMemoryRequirements2* pMemoryRequirements)
+{
+ ANV_FROM_HANDLE(anv_device, device, _device);
+ ANV_FROM_HANDLE(anv_image, image, pInfo->image);
+
+ VkImageAspectFlags aspects = image->vk.aspects;
+
+ vk_foreach_struct_const(ext, pInfo->pNext) {
+ switch (ext->sType) {
+ case VK_STRUCTURE_TYPE_IMAGE_PLANE_MEMORY_REQUIREMENTS_INFO: {
+ assert(image->disjoint);
+ const VkImagePlaneMemoryRequirementsInfo *plane_reqs =
+ (const VkImagePlaneMemoryRequirementsInfo *) ext;
+ aspects = plane_reqs->planeAspect;
+ break;
+ }
+
+ default:
+ anv_debug_ignored_stype(ext->sType);
+ break;
+ }
}
+
+ anv_image_get_memory_requirements(device, image, aspects,
+ pMemoryRequirements);
}
-void anv_GetImageSparseMemoryRequirements(
- VkDevice device,
- VkImage image,
- uint32_t* pSparseMemoryRequirementCount,
- VkSparseImageMemoryRequirements* pSparseMemoryRequirements)
+void anv_GetDeviceImageMemoryRequirements(
+ VkDevice _device,
+ const VkDeviceImageMemoryRequirements* pInfo,
+ VkMemoryRequirements2* pMemoryRequirements)
{
- *pSparseMemoryRequirementCount = 0;
+ ANV_FROM_HANDLE(anv_device, device, _device);
+ struct anv_image image = { 0 };
+
+ if ((device->physical->sparse_type == ANV_SPARSE_TYPE_NOT_SUPPORTED) &&
+ INTEL_DEBUG(DEBUG_SPARSE) &&
+ pInfo->pCreateInfo->flags & (VK_IMAGE_CREATE_SPARSE_BINDING_BIT |
+ VK_IMAGE_CREATE_SPARSE_RESIDENCY_BIT |
+ VK_IMAGE_CREATE_SPARSE_ALIASED_BIT))
+ fprintf(stderr, "=== %s %s:%d flags:0x%08x\n", __func__, __FILE__,
+ __LINE__, pInfo->pCreateInfo->flags);
+
+ ASSERTED VkResult result =
+ anv_image_init_from_create_info(device, &image, pInfo->pCreateInfo, true);
+ assert(result == VK_SUCCESS);
+
+ VkImageAspectFlags aspects =
+ image.disjoint ? pInfo->planeAspect : image.vk.aspects;
+
+ anv_image_get_memory_requirements(device, &image, aspects,
+ pMemoryRequirements);
+ anv_image_finish(&image);
+}
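
/* Caller-side sketch of the entry point above: querying memory requirements
 * straight from a VkImageCreateInfo, without ever creating a VkImage.
 * Illustrative only; assumes the Vulkan 1.3 core names and a valid VkDevice.
 */
static VkDeviceSize
query_image_size_without_creating(VkDevice dev,
                                  const VkImageCreateInfo *create_info)
{
   const VkDeviceImageMemoryRequirements info = {
      .sType = VK_STRUCTURE_TYPE_DEVICE_IMAGE_MEMORY_REQUIREMENTS,
      .pCreateInfo = create_info,
      /* planeAspect is ignored for non-disjoint, non-modifier images */
   };
   VkMemoryRequirements2 reqs = {
      .sType = VK_STRUCTURE_TYPE_MEMORY_REQUIREMENTS_2,
   };
   vkGetDeviceImageMemoryRequirements(dev, &info, &reqs);
   return reqs.memoryRequirements.size;
}
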
+
+static void
+anv_image_get_sparse_memory_requirements(
+ struct anv_device *device,
+ struct anv_image *image,
+ VkImageAspectFlags aspects,
+ uint32_t *pSparseMemoryRequirementCount,
+ VkSparseImageMemoryRequirements2 *pSparseMemoryRequirements)
+{
+ VK_OUTARRAY_MAKE_TYPED(VkSparseImageMemoryRequirements2, reqs,
+ pSparseMemoryRequirements,
+ pSparseMemoryRequirementCount);
+
+ /* From the spec:
+ * "The sparse image must have been created using the
+ * VK_IMAGE_CREATE_SPARSE_RESIDENCY_BIT flag to retrieve valid sparse
+ * image memory requirements."
+ */
+ if (!(image->vk.create_flags & VK_IMAGE_CREATE_SPARSE_RESIDENCY_BIT))
+ return;
+
+ VkSparseImageMemoryRequirements ds_mem_reqs = {};
+ VkSparseImageMemoryRequirements2 *ds_reqs_ptr = NULL;
+
+ u_foreach_bit(b, aspects) {
+ VkImageAspectFlagBits aspect = 1 << b;
+ const uint32_t plane = anv_image_aspect_to_plane(image, aspect);
+ struct isl_surf *surf = &image->planes[plane].primary_surface.isl;
+
+ VkSparseImageFormatProperties format_props =
+ anv_sparse_calc_image_format_properties(device->physical, aspect,
+ image->vk.image_type, surf);
+
+ uint32_t miptail_first_lod;
+ VkDeviceSize miptail_size, miptail_offset, miptail_stride;
+ anv_sparse_calc_miptail_properties(device, image, aspect,
+ &miptail_first_lod, &miptail_size,
+ &miptail_offset, &miptail_stride);
+
+ VkSparseImageMemoryRequirements mem_reqs = {
+ .formatProperties = format_props,
+ .imageMipTailFirstLod = miptail_first_lod,
+ .imageMipTailSize = miptail_size,
+ .imageMipTailOffset = miptail_offset,
+ .imageMipTailStride = miptail_stride,
+ };
+
+ /* If the depth and stencil requirements are identical, unify them if possible. */
+ if (aspect & (VK_IMAGE_ASPECT_DEPTH_BIT |
+ VK_IMAGE_ASPECT_STENCIL_BIT)) {
+ if (!ds_reqs_ptr) {
+ ds_mem_reqs = mem_reqs;
+ } else if (ds_mem_reqs.formatProperties.imageGranularity.width ==
+ mem_reqs.formatProperties.imageGranularity.width &&
+ ds_mem_reqs.formatProperties.imageGranularity.height ==
+ mem_reqs.formatProperties.imageGranularity.height &&
+ ds_mem_reqs.formatProperties.imageGranularity.depth ==
+ mem_reqs.formatProperties.imageGranularity.depth &&
+ ds_mem_reqs.imageMipTailFirstLod ==
+ mem_reqs.imageMipTailFirstLod &&
+ ds_mem_reqs.imageMipTailSize ==
+ mem_reqs.imageMipTailSize &&
+ ds_mem_reqs.imageMipTailOffset ==
+ mem_reqs.imageMipTailOffset &&
+ ds_mem_reqs.imageMipTailStride ==
+ mem_reqs.imageMipTailStride) {
+ ds_reqs_ptr->memoryRequirements.formatProperties.aspectMask |=
+ aspect;
+ continue;
+ }
+ }
+
+ vk_outarray_append_typed(VkSparseImageMemoryRequirements2, &reqs, r) {
+ r->memoryRequirements = mem_reqs;
+ if (aspect & (VK_IMAGE_ASPECT_DEPTH_BIT |
+ VK_IMAGE_ASPECT_STENCIL_BIT))
+ ds_reqs_ptr = r;
+ }
+ }
}
void anv_GetImageSparseMemoryRequirements2(
- VkDevice device,
+ VkDevice _device,
const VkImageSparseMemoryRequirementsInfo2* pInfo,
uint32_t* pSparseMemoryRequirementCount,
VkSparseImageMemoryRequirements2* pSparseMemoryRequirements)
{
- *pSparseMemoryRequirementCount = 0;
+ ANV_FROM_HANDLE(anv_device, device, _device);
+ ANV_FROM_HANDLE(anv_image, image, pInfo->image);
+
+ if (!anv_sparse_residency_is_enabled(device)) {
+ if ((device->physical->sparse_type == ANV_SPARSE_TYPE_NOT_SUPPORTED) &&
+ INTEL_DEBUG(DEBUG_SPARSE))
+ fprintf(stderr, "=== [%s:%d] [%s]\n", __FILE__, __LINE__, __func__);
+
+ *pSparseMemoryRequirementCount = 0;
+ return;
+ }
+
+ anv_image_get_sparse_memory_requirements(device, image, image->vk.aspects,
+ pSparseMemoryRequirementCount,
+ pSparseMemoryRequirements);
}
-VkResult anv_BindImageMemory2(
+void anv_GetDeviceImageSparseMemoryRequirements(
VkDevice _device,
- uint32_t bindInfoCount,
- const VkBindImageMemoryInfo* pBindInfos)
+ const VkDeviceImageMemoryRequirements* pInfo,
+ uint32_t* pSparseMemoryRequirementCount,
+ VkSparseImageMemoryRequirements2* pSparseMemoryRequirements)
{
ANV_FROM_HANDLE(anv_device, device, _device);
+ struct anv_image image = { 0 };
- for (uint32_t i = 0; i < bindInfoCount; i++) {
- const VkBindImageMemoryInfo *bind_info = &pBindInfos[i];
- ANV_FROM_HANDLE(anv_device_memory, mem, bind_info->memory);
- ANV_FROM_HANDLE(anv_image, image, bind_info->image);
- bool did_bind = false;
-
- /* Resolve will alter the image's aspects, do this first. */
- if (mem && mem->ahw)
- resolve_ahw_image(device, image, mem);
-
- vk_foreach_struct_const(s, bind_info->pNext) {
- switch (s->sType) {
- case VK_STRUCTURE_TYPE_BIND_IMAGE_PLANE_MEMORY_INFO: {
- const VkBindImagePlaneMemoryInfo *plane_info =
- (const VkBindImagePlaneMemoryInfo *) s;
-
- /* Workaround for possible spec bug.
- *
- * Unlike VkImagePlaneMemoryRequirementsInfo, which requires that
- * the image be disjoint (that is, multi-planar format and
- * VK_IMAGE_CREATE_DISJOINT_BIT), VkBindImagePlaneMemoryInfo allows
- * the image to be non-disjoint and requires only that the image
- * have the DISJOINT flag. In this case, regardless of the value of
- * VkImagePlaneMemoryRequirementsInfo::planeAspect, the behavior is
- * the same as if VkImagePlaneMemoryRequirementsInfo were omitted.
- */
- if (!image->disjoint)
- break;
+ if (!anv_sparse_residency_is_enabled(device)) {
+ if ((device->physical->sparse_type == ANV_SPARSE_TYPE_NOT_SUPPORTED) &&
+ INTEL_DEBUG(DEBUG_SPARSE))
+ fprintf(stderr, "=== [%s:%d] [%s]\n", __FILE__, __LINE__, __func__);
- struct anv_image_binding *binding =
- image_aspect_to_binding(image, plane_info->planeAspect);
+ *pSparseMemoryRequirementCount = 0;
+ return;
+ }
- binding->address = (struct anv_address) {
- .bo = mem->bo,
- .offset = bind_info->memoryOffset,
- };
+ /* This function is similar to anv_GetDeviceImageMemoryRequirements in
+ * that it actually creates an image, gets its properties and then
+ * destroys the image.
+ *
+ * We could one day refactor things to allow us to gather the properties
+ * without having to actually create the image, maybe by reworking ISL to
+ * separate creation from parameter computing.
+ */
+ VkResult result =
+ anv_image_init_from_create_info(device, &image, pInfo->pCreateInfo,
+ true /* no_private_binding_alloc */);
+ if (result != VK_SUCCESS) {
+ *pSparseMemoryRequirementCount = 0;
+ return;
+ }
- did_bind = true;
- break;
+ /* The spec says:
+ * "planeAspect is a VkImageAspectFlagBits value specifying the aspect
+ * corresponding to the image plane to query. This parameter is ignored
+ * unless pCreateInfo::tiling is VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT,
+ * or pCreateInfo::flags has VK_IMAGE_CREATE_DISJOINT_BIT set."
+ */
+ VkImageAspectFlags aspects =
+ (pInfo->pCreateInfo->flags & VK_IMAGE_CREATE_DISJOINT_BIT) ||
+ (pInfo->pCreateInfo->tiling == VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT)
+ ? pInfo->planeAspect : image.vk.aspects;
+
+ anv_image_get_sparse_memory_requirements(device, &image, aspects,
+ pSparseMemoryRequirementCount,
+ pSparseMemoryRequirements);
+
+ anv_image_finish(&image);
+}
+
+static bool
+anv_image_map_aux_tt(struct anv_device *device,
+ struct anv_image *image, uint32_t plane)
+{
+ const struct anv_address main_addr = anv_image_address(
+ image, &image->planes[plane].primary_surface.memory_range);
+ struct anv_bo *bo = main_addr.bo;
+ assert(bo != NULL);
+
+ /* If the additional memory padding was added at the end of the BO for CCS
+ * data, map this region at the granularity of the main/CCS pages.
+ *
+ * Otherwise the image should have additional CCS data at the computed
+ * offset.
+ */
+ if (device->physical->alloc_aux_tt_mem &&
+ (bo->alloc_flags & ANV_BO_ALLOC_AUX_CCS)) {
+ uint64_t main_aux_alignment =
+ intel_aux_map_get_alignment(device->aux_map_ctx);
+ assert(bo->offset % main_aux_alignment == 0);
+ const struct anv_address start_addr = (struct anv_address) {
+ .bo = bo,
+ .offset = ROUND_DOWN_TO(main_addr.offset, main_aux_alignment),
+ };
+ const struct anv_address aux_addr = (struct anv_address) {
+ .bo = bo,
+ .offset = bo->ccs_offset +
+ intel_aux_main_to_aux_offset(device->aux_map_ctx,
+ start_addr.offset),
+ };
+ const struct isl_surf *surf = &image->planes[plane].primary_surface.isl;
+ const uint64_t format_bits =
+ intel_aux_map_format_bits_for_isl_surf(surf);
+ /* Make sure to have the mapping cover the entire image from the aux
+ * aligned start.
+ */
+ const uint64_t main_size = align(
+ (main_addr.offset - start_addr.offset) + surf->size_B,
+ main_aux_alignment);
+
+ if (intel_aux_map_add_mapping(device->aux_map_ctx,
+ anv_address_physical(start_addr),
+ anv_address_physical(aux_addr),
+ main_size, format_bits)) {
+ image->planes[plane].aux_tt.mapped = true;
+ image->planes[plane].aux_tt.addr = anv_address_physical(start_addr);
+ image->planes[plane].aux_tt.size = main_size;
+ return true;
+ }
+ } else {
+ if (anv_address_allows_aux_map(device, main_addr)) {
+ const struct anv_address aux_addr =
+ anv_image_address(image,
+ &image->planes[plane].compr_ctrl_memory_range);
+ const struct isl_surf *surf =
+ &image->planes[plane].primary_surface.isl;
+ const uint64_t format_bits =
+ intel_aux_map_format_bits_for_isl_surf(surf);
+ if (intel_aux_map_add_mapping(device->aux_map_ctx,
+ anv_address_physical(main_addr),
+ anv_address_physical(aux_addr),
+ surf->size_B, format_bits)) {
+ image->planes[plane].aux_tt.mapped = true;
+ image->planes[plane].aux_tt.addr = anv_address_physical(main_addr);
+ image->planes[plane].aux_tt.size = surf->size_B;
+ return true;
}
- case VK_STRUCTURE_TYPE_BIND_IMAGE_MEMORY_SWAPCHAIN_INFO_KHR: {
- /* Ignore this struct on Android, we cannot access swapchain
- * structures threre.
- */
-#ifndef VK_USE_PLATFORM_ANDROID_KHR
- const VkBindImageMemorySwapchainInfoKHR *swapchain_info =
- (const VkBindImageMemorySwapchainInfoKHR *) s;
- struct anv_image *swapchain_image =
- anv_swapchain_get_image(swapchain_info->swapchain,
- swapchain_info->imageIndex);
- assert(swapchain_image);
- assert(image->vk.aspects == swapchain_image->vk.aspects);
- assert(mem == NULL);
-
- for (int j = 0; j < ARRAY_SIZE(image->bindings); ++j)
- image->bindings[j].address = swapchain_image->bindings[j].address;
-
- /* We must bump the private binding's bo's refcount because, unlike the other
- * bindings, its lifetime is not application-managed.
- */
- struct anv_bo *private_bo =
- image->bindings[ANV_IMAGE_MEMORY_BINDING_PRIVATE].address.bo;
- if (private_bo)
- anv_bo_ref(private_bo);
+ }
+ }
- did_bind = true;
-#endif
+ return false;
+}
+
+static VkResult
+anv_bind_image_memory(struct anv_device *device,
+ const VkBindImageMemoryInfo *bind_info)
+{
+ ANV_FROM_HANDLE(anv_device_memory, mem, bind_info->memory);
+ ANV_FROM_HANDLE(anv_image, image, bind_info->image);
+ bool did_bind = false;
+
+ const VkBindMemoryStatusKHR *bind_status =
+ vk_find_struct_const(bind_info->pNext, BIND_MEMORY_STATUS_KHR);
+
+ assert(!anv_image_is_sparse(image));
+
+ /* Resolve will alter the image's aspects, do this first. */
+ if (mem && mem->vk.ahardware_buffer)
+ resolve_ahw_image(device, image, mem);
+
+ vk_foreach_struct_const(s, bind_info->pNext) {
+ switch (s->sType) {
+ case VK_STRUCTURE_TYPE_BIND_IMAGE_PLANE_MEMORY_INFO: {
+ const VkBindImagePlaneMemoryInfo *plane_info =
+ (const VkBindImagePlaneMemoryInfo *) s;
+
+ /* Workaround for possible spec bug.
+ *
+ * Unlike VkImagePlaneMemoryRequirementsInfo, which requires that
+ * the image be disjoint (that is, multi-planar format and
+ * VK_IMAGE_CREATE_DISJOINT_BIT), VkBindImagePlaneMemoryInfo allows
+ * the image to be non-disjoint and requires only that the image
+ * have the DISJOINT flag. In this case, regardless of the value of
+ * VkImagePlaneMemoryRequirementsInfo::planeAspect, the behavior is
+ * the same as if VkImagePlaneMemoryRequirementsInfo were omitted.
+ */
+ if (!image->disjoint)
break;
+
+ struct anv_image_binding *binding =
+ anv_image_aspect_to_binding(image, plane_info->planeAspect);
+
+ binding->address = (struct anv_address) {
+ .bo = mem->bo,
+ .offset = bind_info->memoryOffset,
+ };
+
+ ANV_RMV(image_bind, device, image,
+ binding - image->bindings);
+
+ did_bind = true;
+ break;
+ }
+ case VK_STRUCTURE_TYPE_BIND_IMAGE_MEMORY_SWAPCHAIN_INFO_KHR: {
+ /* Ignore this struct on Android, we cannot access swapchain
+ * structures there.
+ */
+#ifndef VK_USE_PLATFORM_ANDROID_KHR
+ const VkBindImageMemorySwapchainInfoKHR *swapchain_info =
+ (const VkBindImageMemorySwapchainInfoKHR *) s;
+ struct anv_image *swapchain_image =
+ anv_swapchain_get_image(swapchain_info->swapchain,
+ swapchain_info->imageIndex);
+ assert(swapchain_image);
+ assert(image->vk.aspects == swapchain_image->vk.aspects);
+ assert(mem == NULL);
+
+ for (int j = 0; j < ARRAY_SIZE(image->bindings); ++j) {
+ assert(memory_ranges_equal(image->bindings[j].memory_range,
+ swapchain_image->bindings[j].memory_range));
+ image->bindings[j].address = swapchain_image->bindings[j].address;
}
+
+ /* We must bump the private binding's bo's refcount because, unlike the other
+ * bindings, its lifetime is not application-managed.
+ */
+ struct anv_bo *private_bo =
+ image->bindings[ANV_IMAGE_MEMORY_BINDING_PRIVATE].address.bo;
+ if (private_bo)
+ anv_bo_ref(private_bo);
+
+ did_bind = true;
+#endif
+ break;
+ }
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wswitch"
- case VK_STRUCTURE_TYPE_NATIVE_BUFFER_ANDROID: {
- const VkNativeBufferANDROID *gralloc_info =
- (const VkNativeBufferANDROID *)s;
- VkResult result = anv_image_bind_from_gralloc(device, image,
- gralloc_info);
- if (result != VK_SUCCESS)
- return result;
- did_bind = true;
- break;
- }
+ case VK_STRUCTURE_TYPE_NATIVE_BUFFER_ANDROID: {
+ const VkNativeBufferANDROID *gralloc_info =
+ (const VkNativeBufferANDROID *)s;
+ VkResult result = anv_image_bind_from_gralloc(device, image,
+ gralloc_info);
+ if (result != VK_SUCCESS)
+ return result;
+ did_bind = true;
+ break;
+ }
#pragma GCC diagnostic pop
- default:
- anv_debug_ignored_stype(s->sType);
- break;
- }
+ default:
+ anv_debug_ignored_stype(s->sType);
+ break;
}
+ }
- if (!did_bind) {
- assert(!image->disjoint);
+ if (!did_bind) {
+ assert(!image->disjoint);
- image->bindings[ANV_IMAGE_MEMORY_BINDING_MAIN].address =
- (struct anv_address) {
- .bo = mem->bo,
- .offset = bind_info->memoryOffset,
- };
+ image->bindings[ANV_IMAGE_MEMORY_BINDING_MAIN].address =
+ (struct anv_address) {
+ .bo = mem->bo,
+ .offset = bind_info->memoryOffset,
+ };
- did_bind = true;
- }
+ ANV_RMV(image_bind, device, image,
+ ANV_IMAGE_MEMORY_BINDING_MAIN);
- /* On platforms that use implicit CCS, if the plane's bo lacks implicit
- * CCS then disable compression on the plane.
+ did_bind = true;
+ }
+
+ /* Now that we have the BO, finalize CCS setup. */
+ for (int p = 0; p < image->n_planes; ++p) {
+ enum anv_image_memory_binding binding =
+ image->planes[p].primary_surface.memory_range.binding;
+ const struct anv_bo *bo =
+ image->bindings[binding].address.bo;
+
+ if (!bo || !isl_aux_usage_has_ccs(image->planes[p].aux_usage))
+ continue;
+
+ /* Do nothing if flat CCS requirements are satisfied.
+ *
+ * Also, assume that imported BOs with a modifier including
+ * CCS live only in local memory. Otherwise the exporter should
+ * have failed the creation of the BO.
*/
- for (int p = 0; p < image->n_planes; ++p) {
- enum anv_image_memory_binding binding =
- image->planes[p].primary_surface.memory_range.binding;
- const struct anv_bo *bo =
- image->bindings[binding].address.bo;
-
- if (bo && !bo->has_implicit_ccs &&
- device->physical->has_implicit_ccs)
- image->planes[p].aux_usage = ISL_AUX_USAGE_NONE;
+ if (device->info->has_flat_ccs &&
+ (anv_bo_is_vram_only(bo) ||
+ (bo->alloc_flags & ANV_BO_ALLOC_IMPORTED)))
+ continue;
+
+ /* If the AUX-TT mapping succeeds, there is nothing else to do. */
+ if (device->info->has_aux_map && anv_image_map_aux_tt(device, image, p))
+ continue;
+
+ /* Do nothing prior to gfx12. There are no special requirements. */
+ if (device->info->ver < 12)
+ continue;
+
+ /* The plane's BO cannot support CCS, disable compression on it. */
+ assert(!isl_drm_modifier_has_aux(image->vk.drm_format_mod));
+
+ anv_perf_warn(VK_LOG_OBJS(&image->vk.base),
+ "BO lacks CCS support. Disabling the CCS aux usage.");
+
+ if (image->planes[p].aux_surface.memory_range.size > 0) {
+ assert(image->planes[p].aux_usage == ISL_AUX_USAGE_HIZ_CCS ||
+ image->planes[p].aux_usage == ISL_AUX_USAGE_HIZ_CCS_WT);
+ image->planes[p].aux_usage = ISL_AUX_USAGE_HIZ;
+ } else {
+ assert(image->planes[p].aux_usage == ISL_AUX_USAGE_CCS_E ||
+ image->planes[p].aux_usage == ISL_AUX_USAGE_FCV_CCS_E ||
+ image->planes[p].aux_usage == ISL_AUX_USAGE_STC_CCS);
+ image->planes[p].aux_usage = ISL_AUX_USAGE_NONE;
}
}
+ if (bind_status)
+ *bind_status->pResult = VK_SUCCESS;
+
return VK_SUCCESS;
}
-void anv_GetImageSubresourceLayout(
- VkDevice device,
- VkImage _image,
- const VkImageSubresource* subresource,
- VkSubresourceLayout* layout)
+VkResult anv_BindImageMemory2(
+ VkDevice _device,
+ uint32_t bindInfoCount,
+ const VkBindImageMemoryInfo* pBindInfos)
{
- ANV_FROM_HANDLE(anv_image, image, _image);
- const struct anv_surface *surface;
+ ANV_FROM_HANDLE(anv_device, device, _device);
+ VkResult result = VK_SUCCESS;
- assert(__builtin_popcount(subresource->aspectMask) == 1);
+ for (uint32_t i = 0; i < bindInfoCount; i++) {
+ VkResult res = anv_bind_image_memory(device, &pBindInfos[i]);
+ if (result == VK_SUCCESS && res != VK_SUCCESS)
+ result = res;
+ }
+
+ return result;
+}
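
/* Caller-side sketch of the VK_STRUCTURE_TYPE_BIND_IMAGE_PLANE_MEMORY_INFO
 * path handled in anv_bind_image_memory above: binding the two planes of a
 * disjoint image to separate allocations. Illustrative only; the handles are
 * assumed to be valid and compatible.
 */
static VkResult
bind_disjoint_planes(VkDevice dev, VkImage img,
                     VkDeviceMemory mem0, VkDeviceMemory mem1)
{
   const VkBindImagePlaneMemoryInfo plane0 = {
      .sType = VK_STRUCTURE_TYPE_BIND_IMAGE_PLANE_MEMORY_INFO,
      .planeAspect = VK_IMAGE_ASPECT_PLANE_0_BIT,
   };
   const VkBindImagePlaneMemoryInfo plane1 = {
      .sType = VK_STRUCTURE_TYPE_BIND_IMAGE_PLANE_MEMORY_INFO,
      .planeAspect = VK_IMAGE_ASPECT_PLANE_1_BIT,
   };
   const VkBindImageMemoryInfo binds[2] = {
      {
         .sType = VK_STRUCTURE_TYPE_BIND_IMAGE_MEMORY_INFO,
         .pNext = &plane0,
         .image = img,
         .memory = mem0,
         .memoryOffset = 0,
      },
      {
         .sType = VK_STRUCTURE_TYPE_BIND_IMAGE_MEMORY_INFO,
         .pNext = &plane1,
         .image = img,
         .memory = mem1,
         .memoryOffset = 0,
      },
   };
   return vkBindImageMemory2(dev, 2, binds);
}
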
+
+static inline void
+get_image_fast_clear_layout(const struct anv_image *image,
+ VkSubresourceLayout *out_layout)
+{
+ /* If the memory binding differs between primary and fast clear
+ * region, then the returned offset will be incorrect.
+ */
+ assert(image->planes[0].fast_clear_memory_range.binding ==
+ image->planes[0].primary_surface.memory_range.binding);
+ out_layout->offset = image->planes[0].fast_clear_memory_range.offset;
+ out_layout->size = image->planes[0].fast_clear_memory_range.size;
+ /* Refer to the comment above add_aux_state_tracking_buffer() for the
+ * design of the fast clear region. It is not a typical isl surface, so we
+ * just pick placeholder values for these pitches, since there are no other
+ * requirements to meet. We have some freedom to do so according to the spec of
+ * VkSubresourceLayout:
+ *
+ * If the image is non-linear, then rowPitch, arrayPitch, and depthPitch
+ * have an implementation-dependent meaning.
+ *
+ * Fast clear is not supported with linear tiling or linear modifiers,
+ * which don't have the fast clear plane, so these values should be safe.
+ */
+ out_layout->arrayPitch = 1;
+ out_layout->depthPitch = 1;
+ /* On TGL and DG2, 64-byte alignment on clear color is required.
+ * This pitch is ignored on MTL. (drm_fourcc.h)
+ */
+ out_layout->rowPitch = 64;
+}
+
+static void
+anv_get_image_subresource_layout(const struct anv_image *image,
+ const VkImageSubresource2KHR *subresource,
+ VkSubresourceLayout2KHR *layout)
+{
+ const struct anv_image_memory_range *mem_range;
+ const struct isl_surf *isl_surf;
+
+ assert(__builtin_popcount(subresource->imageSubresource.aspectMask) == 1);
/* The Vulkan spec requires that aspectMask be
* VK_IMAGE_ASPECT_MEMORY_PLANE_i_BIT_EXT if tiling is
@@ -1822,11 +2597,13 @@ void anv_GetImageSubresourceLayout(
* so it _should_ correctly use VK_IMAGE_ASPECT_MEMORY_PLANE_* in that case.
* But it incorrectly uses VK_IMAGE_ASPECT_PLANE_*, so we have a temporary
* workaround.
+ *
+ * https://gitlab.freedesktop.org/mesa/mesa/-/issues/10176
*/
if (image->vk.tiling == VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT) {
/* TODO(chadv): Drop this workaround when WSI gets fixed. */
uint32_t mem_plane;
- switch (subresource->aspectMask) {
+ switch (subresource->imageSubresource.aspectMask) {
case VK_IMAGE_ASPECT_MEMORY_PLANE_0_BIT_EXT:
case VK_IMAGE_ASPECT_PLANE_0_BIT:
mem_plane = 0;
@@ -1842,46 +2619,136 @@ void anv_GetImageSubresourceLayout(
default:
unreachable("bad VkImageAspectFlags");
}
+ if (isl_drm_modifier_plane_is_clear_color(image->vk.drm_format_mod,
+ mem_plane)) {
+ get_image_fast_clear_layout(image, &layout->subresourceLayout);
- if (mem_plane == 1 && isl_drm_modifier_has_aux(image->vk.drm_format_mod)) {
+ return;
+ } else if (mem_plane == 1 &&
+ isl_drm_modifier_has_aux(image->vk.drm_format_mod)) {
assert(image->n_planes == 1);
/* If the memory binding differs between primary and aux, then the
* returned offset will be incorrect.
*/
- assert(image->planes[0].aux_surface.memory_range.binding ==
+ mem_range = anv_image_get_aux_memory_range(image, 0);
+ assert(mem_range->binding ==
image->planes[0].primary_surface.memory_range.binding);
- surface = &image->planes[0].aux_surface;
+ isl_surf = &image->planes[0].aux_surface.isl;
} else {
assert(mem_plane < image->n_planes);
- surface = &image->planes[mem_plane].primary_surface;
+ mem_range = &image->planes[mem_plane].primary_surface.memory_range;
+ isl_surf = &image->planes[mem_plane].primary_surface.isl;
}
} else {
const uint32_t plane =
- anv_image_aspect_to_plane(image, subresource->aspectMask);
- surface = &image->planes[plane].primary_surface;
+ anv_image_aspect_to_plane(image, subresource->imageSubresource.aspectMask);
+ mem_range = &image->planes[plane].primary_surface.memory_range;
+ isl_surf = &image->planes[plane].primary_surface.isl;
}
- layout->offset = surface->memory_range.offset;
- layout->rowPitch = surface->isl.row_pitch_B;
- layout->depthPitch = isl_surf_get_array_pitch(&surface->isl);
- layout->arrayPitch = isl_surf_get_array_pitch(&surface->isl);
+ layout->subresourceLayout.offset = mem_range->offset;
+ layout->subresourceLayout.rowPitch = isl_surf->row_pitch_B;
+ layout->subresourceLayout.depthPitch = isl_surf_get_array_pitch(isl_surf);
+ layout->subresourceLayout.arrayPitch = isl_surf_get_array_pitch(isl_surf);
- if (subresource->mipLevel > 0 || subresource->arrayLayer > 0) {
- assert(surface->isl.tiling == ISL_TILING_LINEAR);
+ if (subresource->imageSubresource.mipLevel > 0 ||
+ subresource->imageSubresource.arrayLayer > 0) {
+ assert(isl_surf->tiling == ISL_TILING_LINEAR);
uint64_t offset_B;
- isl_surf_get_image_offset_B_tile_sa(&surface->isl,
- subresource->mipLevel,
- subresource->arrayLayer,
+ isl_surf_get_image_offset_B_tile_sa(isl_surf,
+ subresource->imageSubresource.mipLevel,
+ subresource->imageSubresource.arrayLayer,
0 /* logical_z_offset_px */,
&offset_B, NULL, NULL);
- layout->offset += offset_B;
- layout->size = layout->rowPitch * anv_minify(image->vk.extent.height,
- subresource->mipLevel) *
- image->vk.extent.depth;
+ layout->subresourceLayout.offset += offset_B;
+ layout->subresourceLayout.size =
+ layout->subresourceLayout.rowPitch *
+ u_minify(image->vk.extent.height,
+ subresource->imageSubresource.mipLevel) *
+ image->vk.extent.depth;
} else {
- layout->size = surface->memory_range.size;
+ layout->subresourceLayout.size = mem_range->size;
+ }
+
+ VkImageCompressionPropertiesEXT *comp_props =
+ vk_find_struct(layout->pNext, IMAGE_COMPRESSION_PROPERTIES_EXT);
+ if (comp_props) {
+ comp_props->imageCompressionFixedRateFlags =
+ VK_IMAGE_COMPRESSION_FIXED_RATE_NONE_EXT;
+ comp_props->imageCompressionFlags = VK_IMAGE_COMPRESSION_DISABLED_EXT;
+ for (uint32_t p = 0; p < image->n_planes; p++) {
+ if (image->planes[p].aux_usage != ISL_AUX_USAGE_NONE) {
+ comp_props->imageCompressionFlags = VK_IMAGE_COMPRESSION_DEFAULT_EXT;
+ break;
+ }
+ }
+ }
+}
+
+void anv_GetDeviceImageSubresourceLayoutKHR(
+ VkDevice _device,
+ const VkDeviceImageSubresourceInfoKHR* pInfo,
+ VkSubresourceLayout2KHR* pLayout)
+{
+ ANV_FROM_HANDLE(anv_device, device, _device);
+
+ struct anv_image image = { 0 };
+
+ if (anv_image_init_from_create_info(device, &image, pInfo->pCreateInfo,
+ true) != VK_SUCCESS) {
+ pLayout->subresourceLayout = (VkSubresourceLayout) { 0, };
+ return;
+ }
+
+ anv_get_image_subresource_layout(&image, pInfo->pSubresource, pLayout);
+}
+
+void anv_GetImageSubresourceLayout2KHR(
+ VkDevice device,
+ VkImage _image,
+ const VkImageSubresource2KHR* pSubresource,
+ VkSubresourceLayout2KHR* pLayout)
+{
+ ANV_FROM_HANDLE(anv_image, image, _image);
+
+ anv_get_image_subresource_layout(image, pSubresource, pLayout);
+}
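
/* Caller-side sketch of the query above for a DRM-format-modifier image:
 * per the workaround note in anv_get_image_subresource_layout, the spec
 * expects VK_IMAGE_ASPECT_MEMORY_PLANE_*_BIT_EXT aspects for such images.
 * Illustrative only; assumes the VK_KHR_maintenance5 entry point.
 */
static VkSubresourceLayout
query_memory_plane_layout(VkDevice dev, VkImage img,
                          VkImageAspectFlagBits mem_plane_aspect)
{
   const VkImageSubresource2KHR subres = {
      .sType = VK_STRUCTURE_TYPE_IMAGE_SUBRESOURCE_2_KHR,
      .imageSubresource = {
         .aspectMask = mem_plane_aspect, /* e.g. MEMORY_PLANE_0_BIT_EXT */
         .mipLevel = 0,
         .arrayLayer = 0,
      },
   };
   VkSubresourceLayout2KHR layout = {
      .sType = VK_STRUCTURE_TYPE_SUBRESOURCE_LAYOUT_2_KHR,
   };
   vkGetImageSubresourceLayout2KHR(dev, img, &subres, &layout);
   return layout.subresourceLayout;
}
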
+
+static VkImageUsageFlags
+anv_image_flags_filter_for_queue(VkImageUsageFlags usages,
+ VkQueueFlagBits queue_flags)
+{
+ /* Eliminate graphics usages if the queue is not graphics capable */
+ if (!(queue_flags & VK_QUEUE_GRAPHICS_BIT)) {
+ usages &= ~(VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT |
+ VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT |
+ VK_IMAGE_USAGE_TRANSIENT_ATTACHMENT_BIT |
+ VK_IMAGE_USAGE_INPUT_ATTACHMENT_BIT |
+ VK_IMAGE_USAGE_FRAGMENT_DENSITY_MAP_BIT_EXT |
+ VK_IMAGE_USAGE_FRAGMENT_SHADING_RATE_ATTACHMENT_BIT_KHR |
+ VK_IMAGE_USAGE_ATTACHMENT_FEEDBACK_LOOP_BIT_EXT);
}
+
+ /* Eliminate sampling & storage usages if the queue is neither graphics nor
+ * compute capable
+ */
+ if (!(queue_flags & (VK_QUEUE_GRAPHICS_BIT | VK_QUEUE_COMPUTE_BIT))) {
+ usages &= ~(VK_IMAGE_USAGE_SAMPLED_BIT |
+ VK_IMAGE_USAGE_STORAGE_BIT);
+ }
+
+ /* Eliminate transfer usages if the queue is not transfer, compute, or
+ * graphics capable.
+ */
+ if (!(queue_flags & (VK_QUEUE_TRANSFER_BIT |
+ VK_QUEUE_COMPUTE_BIT |
+ VK_QUEUE_GRAPHICS_BIT))) {
+ usages &= ~(VK_IMAGE_USAGE_TRANSFER_SRC_BIT |
+ VK_IMAGE_USAGE_TRANSFER_DST_BIT);
+ }
+
+ return usages;
}
/**
@@ -1900,7 +2767,8 @@ enum isl_aux_state ATTRIBUTE_PURE
anv_layout_to_aux_state(const struct intel_device_info * const devinfo,
const struct anv_image * const image,
const VkImageAspectFlagBits aspect,
- const VkImageLayout layout)
+ const VkImageLayout layout,
+ const VkQueueFlagBits queue_flags)
{
/* Validate the inputs. */
@@ -1947,8 +2815,6 @@ anv_layout_to_aux_state(const struct intel_device_info * const devinfo,
isl_drm_modifier_get_default_aux_state(image->vk.drm_format_mod);
switch (aux_state) {
- default:
- assert(!"unexpected isl_aux_state");
case ISL_AUX_STATE_AUX_INVALID:
/* The modifier does not support compression. But, if we arrived
* here, then we have enabled compression on it anyway, in which case
@@ -1964,8 +2830,12 @@ anv_layout_to_aux_state(const struct intel_device_info * const devinfo,
* pass-through.
*/
return ISL_AUX_STATE_PASS_THROUGH;
+ case ISL_AUX_STATE_COMPRESSED_CLEAR:
+ return ISL_AUX_STATE_COMPRESSED_CLEAR;
case ISL_AUX_STATE_COMPRESSED_NO_CLEAR:
return ISL_AUX_STATE_COMPRESSED_NO_CLEAR;
+ default:
+ unreachable("unexpected isl_aux_state");
}
}
@@ -1976,14 +2846,17 @@ anv_layout_to_aux_state(const struct intel_device_info * const devinfo,
const bool read_only = vk_image_layout_is_read_only(layout, aspect);
const VkImageUsageFlags image_aspect_usage =
- vk_image_usage(&image->vk, aspect);
+ anv_image_flags_filter_for_queue(
+ vk_image_usage(&image->vk, aspect), queue_flags);
const VkImageUsageFlags usage =
vk_image_layout_to_usage_flags(layout, aspect) & image_aspect_usage;
bool aux_supported = true;
bool clear_supported = isl_aux_usage_has_fast_clears(aux_usage);
- if ((usage & VK_IMAGE_USAGE_INPUT_ATTACHMENT_BIT) && !read_only) {
+ if ((usage & (VK_IMAGE_USAGE_INPUT_ATTACHMENT_BIT |
+ VK_IMAGE_USAGE_ATTACHMENT_FEEDBACK_LOOP_BIT_EXT)) &&
+ !read_only) {
/* This image could be used as both an input attachment and a render
* target (depth, stencil, or color) at the same time and this can cause
* corruption.
@@ -1993,17 +2866,12 @@ anv_layout_to_aux_state(const struct intel_device_info * const devinfo,
*
* TODO: Should we be disabling this in more cases?
*/
- if (aspect == VK_IMAGE_ASPECT_DEPTH_BIT) {
+ if (aspect == VK_IMAGE_ASPECT_DEPTH_BIT && devinfo->ver <= 9) {
aux_supported = false;
clear_supported = false;
}
}
- if (usage & VK_IMAGE_USAGE_STORAGE_BIT) {
- aux_supported = false;
- clear_supported = false;
- }
-
if (usage & (VK_IMAGE_USAGE_TRANSFER_SRC_BIT |
VK_IMAGE_USAGE_SAMPLED_BIT |
VK_IMAGE_USAGE_INPUT_ATTACHMENT_BIT)) {
@@ -2034,6 +2902,7 @@ anv_layout_to_aux_state(const struct intel_device_info * const devinfo,
break;
case ISL_AUX_USAGE_CCS_E:
+ case ISL_AUX_USAGE_FCV_CCS_E:
case ISL_AUX_USAGE_STC_CCS:
break;
@@ -2057,7 +2926,8 @@ anv_layout_to_aux_state(const struct intel_device_info * const devinfo,
case ISL_AUX_USAGE_CCS_D:
/* We only support clear in exactly one state */
- if (layout == VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL) {
+ if (layout == VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL ||
+ layout == VK_IMAGE_LAYOUT_ATTACHMENT_OPTIMAL) {
assert(aux_supported);
assert(clear_supported);
return ISL_AUX_STATE_PARTIAL_CLEAR;
@@ -2066,6 +2936,7 @@ anv_layout_to_aux_state(const struct intel_device_info * const devinfo,
}
case ISL_AUX_USAGE_CCS_E:
+ case ISL_AUX_USAGE_FCV_CCS_E:
if (aux_supported) {
assert(clear_supported);
return ISL_AUX_STATE_COMPRESSED_CLEAR;
@@ -2110,7 +2981,8 @@ anv_layout_to_aux_usage(const struct intel_device_info * const devinfo,
const struct anv_image * const image,
const VkImageAspectFlagBits aspect,
const VkImageUsageFlagBits usage,
- const VkImageLayout layout)
+ const VkImageLayout layout,
+ const VkQueueFlagBits queue_flags)
{
const uint32_t plane = anv_image_aspect_to_plane(image, aspect);
@@ -2121,7 +2993,7 @@ anv_layout_to_aux_usage(const struct intel_device_info * const devinfo,
return ISL_AUX_USAGE_NONE;
enum isl_aux_state aux_state =
- anv_layout_to_aux_state(devinfo, image, aspect, layout);
+ anv_layout_to_aux_state(devinfo, image, aspect, layout, queue_flags);
switch (aux_state) {
case ISL_AUX_STATE_CLEAR:
@@ -2176,9 +3048,10 @@ enum anv_fast_clear_type ATTRIBUTE_PURE
anv_layout_to_fast_clear_type(const struct intel_device_info * const devinfo,
const struct anv_image * const image,
const VkImageAspectFlagBits aspect,
- const VkImageLayout layout)
+ const VkImageLayout layout,
+ const VkQueueFlagBits queue_flags)
{
- if (INTEL_DEBUG & DEBUG_NO_FAST_CLEAR)
+ if (INTEL_DEBUG(DEBUG_NO_FAST_CLEAR))
return ANV_FAST_CLEAR_NONE;
const uint32_t plane = anv_image_aspect_to_plane(image, aspect);
@@ -2187,14 +3060,11 @@ anv_layout_to_fast_clear_type(const struct intel_device_info * const devinfo,
if (image->planes[plane].aux_usage == ISL_AUX_USAGE_NONE)
return ANV_FAST_CLEAR_NONE;
- /* We don't support MSAA fast-clears on Ivybridge or Bay Trail because they
- * lack the MI ALU which we need to determine the predicates.
- */
- if (devinfo->verx10 == 70 && image->vk.samples > 1)
- return ANV_FAST_CLEAR_NONE;
-
enum isl_aux_state aux_state =
- anv_layout_to_aux_state(devinfo, image, aspect, layout);
+ anv_layout_to_aux_state(devinfo, image, aspect, layout, queue_flags);
+
+ const VkImageUsageFlags layout_usage =
+ vk_image_layout_to_usage_flags(layout, aspect);
switch (aux_state) {
case ISL_AUX_STATE_CLEAR:
@@ -2204,15 +3074,31 @@ anv_layout_to_fast_clear_type(const struct intel_device_info * const devinfo,
case ISL_AUX_STATE_COMPRESSED_CLEAR:
if (aspect == VK_IMAGE_ASPECT_DEPTH_BIT) {
return ANV_FAST_CLEAR_DEFAULT_VALUE;
- } else if (layout == VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL) {
+ } else if (layout == VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL ||
+ layout == VK_IMAGE_LAYOUT_ATTACHMENT_OPTIMAL) {
+ /* The image might not support non zero fast clears when mutable. */
+ if (!image->planes[plane].can_non_zero_fast_clear)
+ return ANV_FAST_CLEAR_DEFAULT_VALUE;
+
/* When we're in a render pass we have the clear color data from the
* VkRenderPassBeginInfo and we can use arbitrary clear colors. They
* must get partially resolved before we leave the render pass.
*/
return ANV_FAST_CLEAR_ANY;
+ } else if (layout_usage & (VK_IMAGE_USAGE_TRANSFER_SRC_BIT |
+ VK_IMAGE_USAGE_TRANSFER_DST_BIT)) {
+ /* Fast clear with non zero color is not supported during transfer
+ * operations since transfer may do format reinterpretation.
+ */
+ return ANV_FAST_CLEAR_DEFAULT_VALUE;
} else if (image->planes[plane].aux_usage == ISL_AUX_USAGE_MCS ||
- image->planes[plane].aux_usage == ISL_AUX_USAGE_CCS_E) {
+ image->planes[plane].aux_usage == ISL_AUX_USAGE_CCS_E ||
+ image->planes[plane].aux_usage == ISL_AUX_USAGE_FCV_CCS_E) {
if (devinfo->ver >= 11) {
+ /* The image might not support non zero fast clears when mutable. */
+ if (!image->planes[plane].can_non_zero_fast_clear)
+ return ANV_FAST_CLEAR_DEFAULT_VALUE;
+
/* On ICL and later, the sampler hardware uses a copy of the clear
* value that is encoded as a pixel value. Therefore, we can use
* any clear color we like for sampling.
@@ -2241,10 +3127,60 @@ anv_layout_to_fast_clear_type(const struct intel_device_info * const devinfo,
}
+/**
+ * This function determines if the layout & usage of an image can have
+ * untracked aux writes. When we see a transition that matches these criteria,
+ * we need to mark the image as compressed-written so that our predicated
+ * resolves work properly.
+ *
+ * @param devinfo The device information of the Intel GPU.
+ * @param image The image that may contain a collection of buffers.
+ * @param aspect The aspect of the image to be accessed.
+ * @param layout The current layout of the image aspect(s).
+ */
+bool
+anv_layout_has_untracked_aux_writes(const struct intel_device_info * const devinfo,
+ const struct anv_image * const image,
+ const VkImageAspectFlagBits aspect,
+ const VkImageLayout layout,
+ const VkQueueFlagBits queue_flags)
+{
+ const VkImageUsageFlags image_aspect_usage =
+ vk_image_usage(&image->vk, aspect);
+ const VkImageUsageFlags usage =
+ vk_image_layout_to_usage_flags(layout, aspect) & image_aspect_usage;
+
+ /* Storage is the only usage where we write the image not through a
+ * render target but through a descriptor. Since the introduction of
+ * VK_EXT_descriptor_indexing and the update-after-bind feature, it has
+ * become impossible to track writes to images in descriptors at command
+ * buffer build time. So it's
+ * not possible to mark an image as compressed like we do in
+ * genX_cmd_buffer.c(EndRendering) or anv_blorp.c for all transfer
+ * operations.
+ */
+ if (!(usage & VK_IMAGE_USAGE_STORAGE_BIT))
+ return false;
+
+ /* No AUX, no writes to the AUX surface :) */
+ const uint32_t plane = anv_image_aspect_to_plane(image, aspect);
+ const enum isl_aux_usage aux_usage = image->planes[plane].aux_usage;
+ if (aux_usage == ISL_AUX_USAGE_NONE)
+ return false;
+
+ return true;
+}
+
static struct anv_state
-alloc_surface_state(struct anv_device *device)
+maybe_alloc_surface_state(struct anv_device *device,
+ struct anv_state_stream *surface_state_stream)
{
- return anv_state_pool_alloc(&device->surface_state_pool, 64, 64);
+ if (device->physical->indirect_descriptors) {
+ if (surface_state_stream)
+ return anv_state_stream_alloc(surface_state_stream, 64, 64);
+ return anv_state_pool_alloc(&device->bindless_surface_state_pool, 64, 64);
+ } else {
+ return ANV_STATE_NULL;
+ }
}
static enum isl_channel_select
@@ -2272,10 +3208,28 @@ anv_image_fill_surface_state(struct anv_device *device,
enum isl_aux_usage aux_usage,
const union isl_color_value *clear_color,
enum anv_image_view_state_flags flags,
- struct anv_surface_state *state_inout,
- struct brw_image_param *image_param_out)
+ struct anv_surface_state *state_inout)
{
- const uint32_t plane = anv_image_aspect_to_plane(image, aspect);
+ uint32_t plane = anv_image_aspect_to_plane(image, aspect);
+ if (image->emu_plane_format != VK_FORMAT_UNDEFINED) {
+ const uint16_t view_bpb = isl_format_get_layout(view_in->format)->bpb;
+ const uint16_t plane_bpb = isl_format_get_layout(
+ image->planes[plane].primary_surface.isl.format)->bpb;
+
+ /* We should redirect to the hidden plane when the original view format
+ * is compressed or when the view usage is storage. But we don't always
+ * have visibility to the original view format so we also check for size
+ * compatibility.
+ */
+ if (isl_format_is_compressed(view_in->format) ||
+ (view_usage & ISL_SURF_USAGE_STORAGE_BIT) ||
+ view_bpb != plane_bpb) {
+ plane = image->n_planes;
+ assert(isl_format_get_layout(
+ image->planes[plane].primary_surface.isl.format)->bpb ==
+ view_bpb);
+ }
+ }
const struct anv_surface *surface = &image->planes[plane].primary_surface,
*aux_surface = &image->planes[plane].aux_surface;
@@ -2283,42 +3237,14 @@ anv_image_fill_surface_state(struct anv_device *device,
struct isl_view view = *view_in;
view.usage |= view_usage;
- /* For texturing with VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL from a
- * compressed surface with a shadow surface, we use the shadow instead of
- * the primary surface. The shadow surface will be tiled, unlike the main
- * surface, so it should get significantly better performance.
- */
- if (anv_surface_is_valid(&image->planes[plane].shadow_surface) &&
- isl_format_is_compressed(view.format) &&
- (flags & ANV_IMAGE_VIEW_STATE_TEXTURE_OPTIMAL)) {
- assert(isl_format_is_compressed(surface->isl.format));
- assert(surface->isl.tiling == ISL_TILING_LINEAR);
- assert(image->planes[plane].shadow_surface.isl.tiling != ISL_TILING_LINEAR);
- surface = &image->planes[plane].shadow_surface;
- }
-
- /* For texturing from stencil on gfx7, we have to sample from a shadow
- * surface because we don't support W-tiling in the sampler.
- */
- if (anv_surface_is_valid(&image->planes[plane].shadow_surface) &&
- aspect == VK_IMAGE_ASPECT_STENCIL_BIT) {
- assert(device->info.ver == 7);
- assert(view_usage & ISL_SURF_USAGE_TEXTURE_BIT);
- surface = &image->planes[plane].shadow_surface;
- }
-
if (view_usage == ISL_SURF_USAGE_RENDER_TARGET_BIT)
view.swizzle = anv_swizzle_for_render(view.swizzle);
- /* On Ivy Bridge and Bay Trail we do the swizzle in the shader */
- if (device->info.verx10 == 70)
- view.swizzle = ISL_SWIZZLE_IDENTITY;
-
/* If this is a HiZ buffer we can sample from with a programmable clear
* value (SKL+), define the clear value to the optimal constant.
*/
union isl_color_value default_clear_color = { .u32 = { 0, } };
- if (device->info.ver >= 9 && aspect == VK_IMAGE_ASPECT_DEPTH_BIT)
+ if (aspect == VK_IMAGE_ASPECT_DEPTH_BIT)
default_clear_color.f32[0] = ANV_HZ_FC_VAL;
if (!clear_color)
clear_color = &default_clear_color;
@@ -2326,117 +3252,85 @@ anv_image_fill_surface_state(struct anv_device *device,
const struct anv_address address =
anv_image_address(image, &surface->memory_range);
- if (view_usage == ISL_SURF_USAGE_STORAGE_BIT &&
- !(flags & ANV_IMAGE_VIEW_STATE_STORAGE_WRITE_ONLY) &&
- !isl_has_matching_typed_storage_image_format(&device->info,
- view.format)) {
- /* In this case, we are a writeable storage buffer which needs to be
- * lowered to linear. All tiling and offset calculations will be done in
- * the shader.
- */
- assert(aux_usage == ISL_AUX_USAGE_NONE);
- isl_buffer_fill_state(&device->isl_dev, state_inout->state.map,
- .address = anv_address_physical(address),
- .size_B = surface->isl.size_B,
- .format = ISL_FORMAT_RAW,
- .swizzle = ISL_SWIZZLE_IDENTITY,
- .stride_B = 1,
- .mocs = anv_mocs(device, address.bo, view_usage));
- state_inout->address = address,
- state_inout->aux_address = ANV_NULL_ADDRESS;
- state_inout->clear_address = ANV_NULL_ADDRESS;
- } else {
- if (view_usage == ISL_SURF_USAGE_STORAGE_BIT &&
- !(flags & ANV_IMAGE_VIEW_STATE_STORAGE_WRITE_ONLY)) {
- /* Typed surface reads support a very limited subset of the shader
- * image formats. Translate it into the closest format the hardware
- * supports.
- */
- assert(aux_usage == ISL_AUX_USAGE_NONE);
- view.format = isl_lower_storage_image_format(&device->info,
- view.format);
- }
+ void *surface_state_map = state_inout->state_data.data;
- const struct isl_surf *isl_surf = &surface->isl;
+ const struct isl_surf *isl_surf = &surface->isl;
- struct isl_surf tmp_surf;
- uint64_t offset_B = 0;
- uint32_t tile_x_sa = 0, tile_y_sa = 0;
- if (isl_format_is_compressed(surface->isl.format) &&
- !isl_format_is_compressed(view.format)) {
- /* We're creating an uncompressed view of a compressed surface. This
- * is allowed but only for a single level/layer.
- */
- assert(surface->isl.samples == 1);
- assert(view.levels == 1);
- assert(view.array_len == 1);
-
- ASSERTED bool ok =
- isl_surf_get_uncompressed_surf(&device->isl_dev, isl_surf, &view,
- &tmp_surf, &view,
- &offset_B, &tile_x_sa, &tile_y_sa);
- assert(ok);
- isl_surf = &tmp_surf;
-
- if (device->info.ver <= 8) {
- assert(surface->isl.tiling == ISL_TILING_LINEAR);
- assert(tile_x_sa == 0);
- assert(tile_y_sa == 0);
- }
- }
-
- state_inout->address = anv_address_add(address, offset_B);
+ struct isl_surf tmp_surf;
+ uint64_t offset_B = 0;
+ uint32_t tile_x_sa = 0, tile_y_sa = 0;
+ if (isl_format_is_compressed(surface->isl.format) &&
+ !isl_format_is_compressed(view.format)) {
+ /* We're creating an uncompressed view of a compressed surface. This is
+ * allowed but only for a single level/layer.
+ */
+ assert(surface->isl.samples == 1);
+ assert(view.levels == 1);
+
+ ASSERTED bool ok =
+ isl_surf_get_uncompressed_surf(&device->isl_dev, isl_surf, &view,
+ &tmp_surf, &view,
+ &offset_B, &tile_x_sa, &tile_y_sa);
+ assert(ok);
+ isl_surf = &tmp_surf;
+ }
- struct anv_address aux_address = ANV_NULL_ADDRESS;
- if (aux_usage != ISL_AUX_USAGE_NONE)
- aux_address = anv_image_address(image, &aux_surface->memory_range);
- state_inout->aux_address = aux_address;
+ state_inout->address = anv_address_add(address, offset_B);
- struct anv_address clear_address = ANV_NULL_ADDRESS;
- if (device->info.ver >= 10 && isl_aux_usage_has_fast_clears(aux_usage)) {
- clear_address = anv_image_get_clear_color_addr(device, image, aspect);
- }
- state_inout->clear_address = clear_address;
-
- isl_surf_fill_state(&device->isl_dev, state_inout->state.map,
- .surf = isl_surf,
- .view = &view,
- .address = anv_address_physical(state_inout->address),
- .clear_color = *clear_color,
- .aux_surf = &aux_surface->isl,
- .aux_usage = aux_usage,
- .aux_address = anv_address_physical(aux_address),
- .clear_address = anv_address_physical(clear_address),
- .use_clear_address = !anv_address_is_null(clear_address),
- .mocs = anv_mocs(device, state_inout->address.bo,
- view_usage),
- .x_offset_sa = tile_x_sa,
- .y_offset_sa = tile_y_sa);
-
- /* With the exception of gfx8, the bottom 12 bits of the MCS base address
- * are used to store other information. This should be ok, however,
- * because the surface buffer addresses are always 4K page aligned.
- */
- if (!anv_address_is_null(aux_address)) {
- uint32_t *aux_addr_dw = state_inout->state.map +
- device->isl_dev.ss.aux_addr_offset;
- assert((aux_address.offset & 0xfff) == 0);
- state_inout->aux_address.offset |= *aux_addr_dw & 0xfff;
- }
+ struct anv_address aux_address = ANV_NULL_ADDRESS;
+ if (aux_usage != ISL_AUX_USAGE_NONE)
+ aux_address = anv_image_address(image, &aux_surface->memory_range);
+ state_inout->aux_address = aux_address;
- if (device->info.ver >= 10 && clear_address.bo) {
- uint32_t *clear_addr_dw = state_inout->state.map +
- device->isl_dev.ss.clear_color_state_offset;
- assert((clear_address.offset & 0x3f) == 0);
- state_inout->clear_address.offset |= *clear_addr_dw & 0x3f;
- }
+ struct anv_address clear_address = ANV_NULL_ADDRESS;
+ if (device->info->ver >= 10 && isl_aux_usage_has_fast_clears(aux_usage)) {
+ clear_address = anv_image_get_clear_color_addr(device, image, aspect);
+ }
+ state_inout->clear_address = clear_address;
+
+ isl_surf_fill_state(&device->isl_dev, surface_state_map,
+ .surf = isl_surf,
+ .view = &view,
+ .address = anv_address_physical(state_inout->address),
+ .clear_color = *clear_color,
+ .aux_surf = &aux_surface->isl,
+ .aux_usage = aux_usage,
+ .aux_address = anv_address_physical(aux_address),
+ .clear_address = anv_address_physical(clear_address),
+ .use_clear_address = !anv_address_is_null(clear_address),
+ .mocs = anv_mocs(device, state_inout->address.bo,
+ view_usage),
+ .x_offset_sa = tile_x_sa,
+ .y_offset_sa = tile_y_sa,
+ /* Assume robustness with EXT_pipeline_robustness
+ * because it can be toggled per pipeline and we have
+ * no visibility into that here.
+ */
+ .robust_image_access =
+ device->vk.enabled_features.robustImageAccess ||
+ device->vk.enabled_features.robustImageAccess2 ||
+ device->vk.enabled_extensions.EXT_pipeline_robustness);
+
+ /* With the exception of gfx8, the bottom 12 bits of the MCS base address
+ * are used to store other information. This should be ok, however, because
+ * the surface buffer addresses are always 4K page aligned.
+ */
+ if (!anv_address_is_null(aux_address)) {
+ uint32_t *aux_addr_dw = surface_state_map +
+ device->isl_dev.ss.aux_addr_offset;
+ assert((aux_address.offset & 0xfff) == 0);
+ state_inout->aux_address.offset |= *aux_addr_dw & 0xfff;
}
- if (image_param_out) {
- assert(view_usage == ISL_SURF_USAGE_STORAGE_BIT);
- isl_surf_fill_image_param(&device->isl_dev, image_param_out,
- &surface->isl, &view);
+ if (device->info->ver >= 10 && clear_address.bo) {
+ uint32_t *clear_addr_dw = surface_state_map +
+ device->isl_dev.ss.clear_color_state_offset;
+ assert((clear_address.offset & 0x3f) == 0);
+ state_inout->clear_address.offset |= *clear_addr_dw & 0x3f;
}
+
+ if (state_inout->state.map)
+ memcpy(state_inout->state.map, surface_state_map, ANV_SURFACE_STATE_SIZE);
}
static uint32_t
@@ -2446,67 +3340,193 @@ anv_image_aspect_get_planes(VkImageAspectFlags aspect_mask)
return util_bitcount(aspect_mask);
}
-VkResult
-anv_CreateImageView(VkDevice _device,
- const VkImageViewCreateInfo *pCreateInfo,
- const VkAllocationCallbacks *pAllocator,
- VkImageView *pView)
+bool
+anv_can_hiz_clear_ds_view(struct anv_device *device,
+ const struct anv_image_view *iview,
+ VkImageLayout layout,
+ VkImageAspectFlags clear_aspects,
+ float depth_clear_value,
+ VkRect2D render_area,
+ const VkQueueFlagBits queue_flags)
{
- ANV_FROM_HANDLE(anv_device, device, _device);
- ANV_FROM_HANDLE(anv_image, image, pCreateInfo->image);
- struct anv_image_view *iview;
+ if (INTEL_DEBUG(DEBUG_NO_FAST_CLEAR))
+ return false;
- iview = vk_image_view_create(&device->vk, pCreateInfo,
- pAllocator, sizeof(*iview));
- if (iview == NULL)
- return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+ /* If we're just clearing stencil, we can always HiZ clear */
+ if (!(clear_aspects & VK_IMAGE_ASPECT_DEPTH_BIT))
+ return true;
- iview->image = image;
- iview->n_planes = anv_image_aspect_get_planes(iview->vk.aspects);
+ /* We must have depth in order to have HiZ */
+ if (!(iview->image->vk.aspects & VK_IMAGE_ASPECT_DEPTH_BIT))
+ return false;
- /* Check if a conversion info was passed. */
- const struct anv_format *conv_format = NULL;
- const VkSamplerYcbcrConversionInfo *conv_info =
- vk_find_struct_const(pCreateInfo->pNext, SAMPLER_YCBCR_CONVERSION_INFO);
+ const enum isl_aux_usage clear_aux_usage =
+ anv_layout_to_aux_usage(device->info, iview->image,
+ VK_IMAGE_ASPECT_DEPTH_BIT,
+ VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT,
+ layout, queue_flags);
+ if (!blorp_can_hiz_clear_depth(device->info,
+ &iview->image->planes[0].primary_surface.isl,
+ clear_aux_usage,
+ iview->planes[0].isl.base_level,
+ iview->planes[0].isl.base_array_layer,
+ render_area.offset.x,
+ render_area.offset.y,
+ render_area.offset.x +
+ render_area.extent.width,
+ render_area.offset.y +
+ render_area.extent.height))
+ return false;
-#ifdef ANDROID
- /* If image has an external format, the pNext chain must contain an
- * instance of VKSamplerYcbcrConversionInfo with a conversion object
- * created with the same external format as image."
- */
- assert(!image->vk.android_external_format || conv_info);
-#endif
+ if (depth_clear_value != ANV_HZ_FC_VAL)
+ return false;
+
+ /* If we got here, then we can fast clear */
+ return true;
+}
+
+static bool
+isl_color_value_requires_conversion(union isl_color_value color,
+ const struct isl_surf *surf,
+ const struct isl_view *view)
+{
+ if (surf->format == view->format && isl_swizzle_is_identity(view->swizzle))
+ return false;
+
+ uint32_t surf_pack[4] = { 0, 0, 0, 0 };
+ isl_color_value_pack(&color, surf->format, surf_pack);
+
+ uint32_t view_pack[4] = { 0, 0, 0, 0 };
+ union isl_color_value swiz_color =
+ isl_color_value_swizzle_inv(color, view->swizzle);
+ isl_color_value_pack(&swiz_color, view->format, view_pack);
- if (conv_info) {
- ANV_FROM_HANDLE(anv_ycbcr_conversion, conversion, conv_info->conversion);
- conv_format = conversion->format;
+ return memcmp(surf_pack, view_pack, sizeof(surf_pack)) != 0;
+}
+
+bool
+anv_can_fast_clear_color_view(struct anv_device *device,
+ struct anv_image_view *iview,
+ VkImageLayout layout,
+ union isl_color_value clear_color,
+ uint32_t num_layers,
+ VkRect2D render_area,
+ const VkQueueFlagBits queue_flags)
+{
+ if (INTEL_DEBUG(DEBUG_NO_FAST_CLEAR))
+ return false;
+
+ if (iview->planes[0].isl.base_array_layer >=
+ anv_image_aux_layers(iview->image, VK_IMAGE_ASPECT_COLOR_BIT,
+ iview->planes[0].isl.base_level))
+ return false;
+
+ /* Start by getting the fast clear type. We use the first subpass
+ * layout here because we don't want to fast-clear if the first subpass
+ * to use the attachment can't handle fast-clears.
+ */
+ enum anv_fast_clear_type fast_clear_type =
+ anv_layout_to_fast_clear_type(device->info, iview->image,
+ VK_IMAGE_ASPECT_COLOR_BIT,
+ layout, queue_flags);
+ switch (fast_clear_type) {
+ case ANV_FAST_CLEAR_NONE:
+ return false;
+ case ANV_FAST_CLEAR_DEFAULT_VALUE:
+ if (!isl_color_value_is_zero(clear_color, iview->planes[0].isl.format))
+ return false;
+ break;
+ case ANV_FAST_CLEAR_ANY:
+ break;
}
-#ifdef ANDROID
- /* "If image has an external format, format must be VK_FORMAT_UNDEFINED." */
- assert(!image->vk.android_external_format ||
- pCreateInfo->format == VK_FORMAT_UNDEFINED);
-#endif
+ /* Potentially, we could do partial fast-clears but doing so has crazy
+ * alignment restrictions. It's easier to just restrict to full size
+ * fast clears for now.
+ */
+ if (render_area.offset.x != 0 ||
+ render_area.offset.y != 0 ||
+ render_area.extent.width != iview->vk.extent.width ||
+ render_area.extent.height != iview->vk.extent.height)
+ return false;
- /* Format is undefined, this can happen when using external formats. Set
- * view format from the passed conversion info.
+ /* If the clear color is one that would require non-trivial format
+ * conversion on resolve, we don't bother with the fast clear. This
+ * shouldn't be common as most clear colors are 0/1 and the most common
+ * format re-interpretation is for sRGB.
*/
- if (iview->vk.format == VK_FORMAT_UNDEFINED && conv_format)
- iview->vk.format = conv_format->vk_format;
+ if (isl_color_value_requires_conversion(clear_color,
+ &iview->image->planes[0].primary_surface.isl,
+ &iview->planes[0].isl)) {
+ anv_perf_warn(VK_LOG_OBJS(&iview->vk.base),
+ "Cannot fast-clear to colors which would require "
+ "format conversion on resolve");
+ return false;
+ }
+
+ /* We only allow fast clears to the first slice of an image (level 0,
+    * layer 0) and only for the entire slice. This guarantees us that, at
+    * any given time, there is only one clear color on any given image.
+    * At the time of our testing (Jan 17, 2018), there
+ * were no known applications which would benefit from fast-clearing
+ * more than just the first slice.
+ */
+ if (iview->planes[0].isl.base_level > 0 ||
+ iview->planes[0].isl.base_array_layer > 0) {
+ anv_perf_warn(VK_LOG_OBJS(&iview->image->vk.base),
+ "Rendering with multi-lod or multi-layer framebuffer "
+ "with LOAD_OP_LOAD and baseMipLevel > 0 or "
+ "baseArrayLayer > 0. Not fast clearing.");
+ return false;
+ }
+
+ if (num_layers > 1) {
+ anv_perf_warn(VK_LOG_OBJS(&iview->image->vk.base),
+ "Rendering to a multi-layer framebuffer with "
+ "LOAD_OP_CLEAR. Only fast-clearing the first slice");
+ }
+
+ /* Wa_18020603990 - slow clear surfaces up to 256x256, 32bpp. */
+ if (intel_needs_workaround(device->info, 18020603990)) {
+ const struct anv_surface *anv_surf =
+ &iview->image->planes->primary_surface;
+ if (isl_format_get_layout(anv_surf->isl.format)->bpb <= 32 &&
+ anv_surf->isl.logical_level0_px.w <= 256 &&
+ anv_surf->isl.logical_level0_px.h <= 256)
+ return false;
+ }
+
+ return true;
+}
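
A minimal sketch of how a clear path might gate on the helper above; the surrounding call site (function name, fallback comments) is an assumption for illustration, not the actual anv code:

static void
clear_color_attachment(struct anv_cmd_buffer *cmd_buffer,
                       struct anv_image_view *iview,
                       VkImageLayout layout,
                       union isl_color_value clear_color,
                       uint32_t num_layers,
                       VkRect2D render_area,
                       VkQueueFlagBits queue_flags)
{
   struct anv_device *device = cmd_buffer->device;

   if (anv_can_fast_clear_color_view(device, iview, layout, clear_color,
                                     num_layers, render_area, queue_flags)) {
      /* Record the new clear color and emit a fast clear of slice (0, 0);
       * layers beyond the first still need a regular clear.
       */
   } else {
      /* Emit a regular (slow) clear covering all requested layers. */
   }
}
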
+
+void
+anv_image_view_init(struct anv_device *device,
+ struct anv_image_view *iview,
+ const VkImageViewCreateInfo *pCreateInfo,
+ struct anv_state_stream *surface_state_stream)
+{
+ ANV_FROM_HANDLE(anv_image, image, pCreateInfo->image);
+
+ vk_image_view_init(&device->vk, &iview->vk, false, pCreateInfo);
+ iview->image = image;
+ iview->n_planes = anv_image_aspect_get_planes(iview->vk.aspects);
+ iview->use_surface_state_stream = surface_state_stream != NULL;
/* Now go through the underlying image selected planes and map them to
* planes in the image view.
*/
anv_foreach_image_aspect_bit(iaspect_bit, image, iview->vk.aspects) {
- const uint32_t iplane =
- anv_aspect_to_plane(image->vk.aspects, 1UL << iaspect_bit);
const uint32_t vplane =
anv_aspect_to_plane(iview->vk.aspects, 1UL << iaspect_bit);
- struct anv_format_plane format;
- format = anv_get_format_plane(&device->info, iview->vk.format,
- vplane, image->vk.tiling);
- iview->planes[vplane].image_plane = iplane;
+ VkFormat view_format = iview->vk.view_format;
+ if (anv_is_format_emulated(device->physical, view_format)) {
+ assert(image->emu_plane_format != VK_FORMAT_UNDEFINED);
+ view_format =
+ anv_get_emulation_format(device->physical, view_format);
+ }
+ const struct anv_format_plane format = anv_get_format_plane(
+ device->info, view_format, vplane, image->vk.tiling);
iview->planes[vplane].isl = (struct isl_view) {
.format = format.isl_format,
@@ -2514,6 +3534,7 @@ anv_CreateImageView(VkDevice _device,
.levels = iview->vk.level_count,
.base_array_layer = iview->vk.base_array_layer,
.array_len = iview->vk.layer_count,
+ .min_lod_clamp = iview->vk.min_lod,
.swizzle = {
.r = remap_swizzle(iview->vk.swizzle.r, format.swizzle),
.g = remap_swizzle(iview->vk.swizzle.g, format.swizzle),
@@ -2534,73 +3555,114 @@ anv_CreateImageView(VkDevice _device,
iview->planes[vplane].isl.usage = 0;
}
- if (iview->vk.usage & VK_IMAGE_USAGE_SAMPLED_BIT ||
- (iview->vk.usage & VK_IMAGE_USAGE_INPUT_ATTACHMENT_BIT &&
- !(iview->vk.aspects & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV))) {
- iview->planes[vplane].optimal_sampler_surface_state.state = alloc_surface_state(device);
- iview->planes[vplane].general_sampler_surface_state.state = alloc_surface_state(device);
+ if (iview->vk.usage & (VK_IMAGE_USAGE_SAMPLED_BIT |
+ VK_IMAGE_USAGE_INPUT_ATTACHMENT_BIT)) {
+ iview->planes[vplane].optimal_sampler.state =
+ maybe_alloc_surface_state(device, surface_state_stream);
+ iview->planes[vplane].general_sampler.state =
+ maybe_alloc_surface_state(device, surface_state_stream);
enum isl_aux_usage general_aux_usage =
- anv_layout_to_aux_usage(&device->info, image, 1UL << iaspect_bit,
+ anv_layout_to_aux_usage(device->info, image, 1UL << iaspect_bit,
VK_IMAGE_USAGE_SAMPLED_BIT,
- VK_IMAGE_LAYOUT_GENERAL);
+ VK_IMAGE_LAYOUT_GENERAL,
+ VK_QUEUE_GRAPHICS_BIT |
+ VK_QUEUE_COMPUTE_BIT |
+ VK_QUEUE_TRANSFER_BIT);
enum isl_aux_usage optimal_aux_usage =
- anv_layout_to_aux_usage(&device->info, image, 1UL << iaspect_bit,
+ anv_layout_to_aux_usage(device->info, image, 1UL << iaspect_bit,
VK_IMAGE_USAGE_SAMPLED_BIT,
- VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL);
+ VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL,
+ VK_QUEUE_GRAPHICS_BIT |
+ VK_QUEUE_COMPUTE_BIT |
+ VK_QUEUE_TRANSFER_BIT);
anv_image_fill_surface_state(device, image, 1ULL << iaspect_bit,
&iview->planes[vplane].isl,
ISL_SURF_USAGE_TEXTURE_BIT,
optimal_aux_usage, NULL,
ANV_IMAGE_VIEW_STATE_TEXTURE_OPTIMAL,
- &iview->planes[vplane].optimal_sampler_surface_state,
- NULL);
+ &iview->planes[vplane].optimal_sampler);
anv_image_fill_surface_state(device, image, 1ULL << iaspect_bit,
&iview->planes[vplane].isl,
ISL_SURF_USAGE_TEXTURE_BIT,
general_aux_usage, NULL,
0,
- &iview->planes[vplane].general_sampler_surface_state,
- NULL);
+ &iview->planes[vplane].general_sampler);
}
/* NOTE: This one needs to go last since it may stomp isl_view.format */
if (iview->vk.usage & VK_IMAGE_USAGE_STORAGE_BIT) {
- if (isl_is_storage_image_format(format.isl_format)) {
- iview->planes[vplane].storage_surface_state.state =
- alloc_surface_state(device);
-
- anv_image_fill_surface_state(device, image, 1ULL << iaspect_bit,
- &iview->planes[vplane].isl,
- ISL_SURF_USAGE_STORAGE_BIT,
- ISL_AUX_USAGE_NONE, NULL,
- 0,
- &iview->planes[vplane].storage_surface_state,
- &iview->planes[vplane].storage_image_param);
- } else {
- /* In this case, we support the format but, because there's no
- * SPIR-V format specifier corresponding to it, we only support
- * NonReadable (writeonly in GLSL) access. Instead of hanging in
- * these invalid cases, we give them a NULL descriptor.
- */
- assert(isl_format_supports_typed_writes(&device->info,
- format.isl_format));
- iview->planes[vplane].storage_surface_state.state =
- device->null_surface_state;
+ struct isl_view storage_view = iview->planes[vplane].isl;
+ if (iview->vk.view_type == VK_IMAGE_VIEW_TYPE_3D) {
+ storage_view.base_array_layer = iview->vk.storage.z_slice_offset;
+ storage_view.array_len = iview->vk.storage.z_slice_count;
}
- iview->planes[vplane].writeonly_storage_surface_state.state = alloc_surface_state(device);
+ enum isl_aux_usage general_aux_usage =
+ anv_layout_to_aux_usage(device->info, image, 1UL << iaspect_bit,
+ VK_IMAGE_USAGE_STORAGE_BIT,
+ VK_IMAGE_LAYOUT_GENERAL,
+ VK_QUEUE_GRAPHICS_BIT |
+ VK_QUEUE_COMPUTE_BIT |
+ VK_QUEUE_TRANSFER_BIT);
+ iview->planes[vplane].storage.state =
+ maybe_alloc_surface_state(device, surface_state_stream);
+
anv_image_fill_surface_state(device, image, 1ULL << iaspect_bit,
- &iview->planes[vplane].isl,
+ &storage_view,
ISL_SURF_USAGE_STORAGE_BIT,
- ISL_AUX_USAGE_NONE, NULL,
- ANV_IMAGE_VIEW_STATE_STORAGE_WRITE_ONLY,
- &iview->planes[vplane].writeonly_storage_surface_state,
- NULL);
+ general_aux_usage, NULL,
+ 0,
+ &iview->planes[vplane].storage);
}
}
+}
+
+void
+anv_image_view_finish(struct anv_image_view *iview)
+{
+ struct anv_device *device =
+ container_of(iview->vk.base.device, struct anv_device, vk);
+
+ if (!iview->use_surface_state_stream) {
+ for (uint32_t plane = 0; plane < iview->n_planes; plane++) {
+ if (iview->planes[plane].optimal_sampler.state.alloc_size) {
+ anv_state_pool_free(&device->bindless_surface_state_pool,
+ iview->planes[plane].optimal_sampler.state);
+ }
+
+ if (iview->planes[plane].general_sampler.state.alloc_size) {
+ anv_state_pool_free(&device->bindless_surface_state_pool,
+ iview->planes[plane].general_sampler.state);
+ }
+
+ if (iview->planes[plane].storage.state.alloc_size) {
+ anv_state_pool_free(&device->bindless_surface_state_pool,
+ iview->planes[plane].storage.state);
+ }
+ }
+ }
+
+ vk_image_view_finish(&iview->vk);
+}
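
A short sketch of the ownership rule encoded above: surface states allocated from a caller-provided state stream are released together with that stream, so only the NULL-stream path frees them individually. The variables 'stream' and 'view_create_info' are assumptions for illustration:

struct anv_image_view iview;

/* States come from 'stream'; the view does not own them individually. */
anv_image_view_init(device, &iview, &view_create_info, stream);
/* ... use iview for the lifetime of the stream ... */
anv_image_view_finish(&iview);   /* no pool frees: use_surface_state_stream */

/* With a NULL stream, states come from the bindless surface state pool and
 * anv_image_view_finish() returns them to that pool.
 */
anv_image_view_init(device, &iview, &view_create_info, NULL);
anv_image_view_finish(&iview);
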
+
+VkResult
+anv_CreateImageView(VkDevice _device,
+ const VkImageViewCreateInfo *pCreateInfo,
+ const VkAllocationCallbacks *pAllocator,
+ VkImageView *pView)
+{
+ ANV_FROM_HANDLE(anv_device, device, _device);
+ struct anv_image_view *iview;
+
+ iview = vk_zalloc2(&device->vk.alloc, pAllocator, sizeof(*iview), 8,
+ VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
+ if (iview == NULL)
+ return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
+
+ anv_image_view_init(device, iview, pCreateInfo, NULL);
*pView = anv_image_view_to_handle(iview);
@@ -2611,42 +3673,33 @@ void
anv_DestroyImageView(VkDevice _device, VkImageView _iview,
const VkAllocationCallbacks *pAllocator)
{
- ANV_FROM_HANDLE(anv_device, device, _device);
ANV_FROM_HANDLE(anv_image_view, iview, _iview);
if (!iview)
return;
- for (uint32_t plane = 0; plane < iview->n_planes; plane++) {
- /* Check offset instead of alloc_size because this they might be
- * device->null_surface_state which always has offset == 0. We don't
- * own that one so we don't want to accidentally free it.
- */
- if (iview->planes[plane].optimal_sampler_surface_state.state.offset) {
- anv_state_pool_free(&device->surface_state_pool,
- iview->planes[plane].optimal_sampler_surface_state.state);
- }
-
- if (iview->planes[plane].general_sampler_surface_state.state.offset) {
- anv_state_pool_free(&device->surface_state_pool,
- iview->planes[plane].general_sampler_surface_state.state);
- }
-
- if (iview->planes[plane].storage_surface_state.state.offset) {
- anv_state_pool_free(&device->surface_state_pool,
- iview->planes[plane].storage_surface_state.state);
- }
+ anv_image_view_finish(iview);
+ vk_free2(&iview->vk.base.device->alloc, pAllocator, iview);
+}
- if (iview->planes[plane].writeonly_storage_surface_state.state.offset) {
- anv_state_pool_free(&device->surface_state_pool,
- iview->planes[plane].writeonly_storage_surface_state.state);
- }
- }
+static void
+anv_fill_buffer_view_surface_state(struct anv_device *device,
+ struct anv_buffer_state *state,
+ enum isl_format format,
+ struct isl_swizzle swizzle,
+ isl_surf_usage_flags_t usage,
+ struct anv_address address,
+ uint32_t range, uint32_t stride)
+{
+ anv_fill_buffer_surface_state(device,
+ state->state_data.data,
+ format, swizzle, usage,
+ address, range, stride);
- vk_image_view_destroy(&device->vk, pAllocator, &iview->vk);
+ if (state->state.map)
+ memcpy(state->state.map, state->state_data.data, ANV_SURFACE_STATE_SIZE);
}
-
VkResult
anv_CreateBufferView(VkDevice _device,
const VkBufferViewCreateInfo *pCreateInfo,
@@ -2657,61 +3710,49 @@ anv_CreateBufferView(VkDevice _device,
ANV_FROM_HANDLE(anv_buffer, buffer, pCreateInfo->buffer);
struct anv_buffer_view *view;
- view = vk_object_alloc(&device->vk, pAllocator, sizeof(*view),
- VK_OBJECT_TYPE_BUFFER_VIEW);
+ view = vk_buffer_view_create(&device->vk, pCreateInfo,
+ pAllocator, sizeof(*view));
if (!view)
- return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+ return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
+
+ const VkBufferUsageFlags2CreateInfoKHR *view_usage_info =
+ vk_find_struct_const(pCreateInfo->pNext, BUFFER_USAGE_FLAGS_2_CREATE_INFO_KHR);
+ const VkBufferUsageFlags buffer_usage =
+ view_usage_info != NULL ? view_usage_info->usage : buffer->vk.usage;
- /* TODO: Handle the format swizzle? */
+ struct anv_format_plane format;
+ format = anv_get_format_plane(device->info, pCreateInfo->format,
+ 0, VK_IMAGE_TILING_LINEAR);
- view->format = anv_get_isl_format(&device->info, pCreateInfo->format,
- VK_IMAGE_ASPECT_COLOR_BIT,
- VK_IMAGE_TILING_LINEAR);
- const uint32_t format_bs = isl_format_get_layout(view->format)->bpb / 8;
- view->range = anv_buffer_get_range(buffer, pCreateInfo->offset,
- pCreateInfo->range);
- view->range = align_down_npot_u32(view->range, format_bs);
+ const uint32_t format_bs = isl_format_get_layout(format.isl_format)->bpb / 8;
+ const uint32_t align_range =
+ align_down_npot_u32(view->vk.range, format_bs);
view->address = anv_address_add(buffer->address, pCreateInfo->offset);
- if (buffer->usage & VK_BUFFER_USAGE_UNIFORM_TEXEL_BUFFER_BIT) {
- view->surface_state = alloc_surface_state(device);
+ if (buffer_usage & VK_BUFFER_USAGE_UNIFORM_TEXEL_BUFFER_BIT) {
+ view->general.state = maybe_alloc_surface_state(device, NULL);
- anv_fill_buffer_surface_state(device, view->surface_state,
- view->format, ISL_SURF_USAGE_TEXTURE_BIT,
- view->address, view->range, format_bs);
+ anv_fill_buffer_view_surface_state(device,
+ &view->general,
+ format.isl_format,
+ format.swizzle,
+ ISL_SURF_USAGE_TEXTURE_BIT,
+ view->address, align_range, format_bs);
} else {
- view->surface_state = (struct anv_state){ 0 };
- }
-
- if (buffer->usage & VK_BUFFER_USAGE_STORAGE_TEXEL_BUFFER_BIT) {
- view->storage_surface_state = alloc_surface_state(device);
- view->writeonly_storage_surface_state = alloc_surface_state(device);
-
- enum isl_format storage_format =
- isl_has_matching_typed_storage_image_format(&device->info,
- view->format) ?
- isl_lower_storage_image_format(&device->info, view->format) :
- ISL_FORMAT_RAW;
-
- anv_fill_buffer_surface_state(device, view->storage_surface_state,
- storage_format, ISL_SURF_USAGE_STORAGE_BIT,
- view->address, view->range,
- (storage_format == ISL_FORMAT_RAW ? 1 :
- isl_format_get_layout(storage_format)->bpb / 8));
-
- /* Write-only accesses should use the original format. */
- anv_fill_buffer_surface_state(device, view->writeonly_storage_surface_state,
- view->format, ISL_SURF_USAGE_STORAGE_BIT,
- view->address, view->range,
- isl_format_get_layout(view->format)->bpb / 8);
-
- isl_buffer_fill_image_param(&device->isl_dev,
- &view->storage_image_param,
- view->format, view->range);
+ view->general.state = ANV_STATE_NULL;
+ }
+
+ if (buffer_usage & VK_BUFFER_USAGE_STORAGE_TEXEL_BUFFER_BIT) {
+ view->storage.state = maybe_alloc_surface_state(device, NULL);
+
+ anv_fill_buffer_view_surface_state(device,
+ &view->storage,
+ format.isl_format, format.swizzle,
+ ISL_SURF_USAGE_STORAGE_BIT,
+ view->address, align_range, format_bs);
} else {
- view->storage_surface_state = (struct anv_state){ 0 };
- view->writeonly_storage_surface_state = (struct anv_state){ 0 };
+ view->storage.state = ANV_STATE_NULL;
}
*pView = anv_buffer_view_to_handle(view);
@@ -2729,17 +3770,26 @@ anv_DestroyBufferView(VkDevice _device, VkBufferView bufferView,
if (!view)
return;
- if (view->surface_state.alloc_size > 0)
- anv_state_pool_free(&device->surface_state_pool,
- view->surface_state);
+ if (view->general.state.alloc_size > 0) {
+ anv_state_pool_free(&device->bindless_surface_state_pool,
+ view->general.state);
+ }
- if (view->storage_surface_state.alloc_size > 0)
- anv_state_pool_free(&device->surface_state_pool,
- view->storage_surface_state);
+ if (view->storage.state.alloc_size > 0) {
+ anv_state_pool_free(&device->bindless_surface_state_pool,
+ view->storage.state);
+ }
- if (view->writeonly_storage_surface_state.alloc_size > 0)
- anv_state_pool_free(&device->surface_state_pool,
- view->writeonly_storage_surface_state);
+ vk_buffer_view_destroy(&device->vk, pAllocator, &view->vk);
+}
- vk_object_free(&device->vk, pAllocator, view);
+void anv_GetRenderingAreaGranularityKHR(
+ VkDevice _device,
+ const VkRenderingAreaInfoKHR* pRenderingAreaInfo,
+ VkExtent2D* pGranularity)
+{
+ *pGranularity = (VkExtent2D) {
+ .width = 1,
+ .height = 1,
+ };
}
diff --git a/src/intel/vulkan/anv_internal_kernels.c b/src/intel/vulkan/anv_internal_kernels.c
new file mode 100644
index 00000000000..b4496cb51bb
--- /dev/null
+++ b/src/intel/vulkan/anv_internal_kernels.c
@@ -0,0 +1,369 @@
+/*
+ * Copyright © 2022 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "anv_private.h"
+
+#include "compiler/intel_nir.h"
+#include "compiler/brw_compiler.h"
+#include "compiler/brw_nir.h"
+#include "compiler/nir/nir.h"
+#include "compiler/nir/nir_builder.h"
+#include "dev/intel_debug.h"
+#include "intel/compiler/intel_nir.h"
+#include "util/macros.h"
+
+#include "vk_nir.h"
+
+#include "anv_internal_kernels.h"
+
+static bool
+lower_base_workgroup_id(nir_builder *b, nir_intrinsic_instr *intrin,
+ UNUSED void *data)
+{
+ if (intrin->intrinsic != nir_intrinsic_load_base_workgroup_id)
+ return false;
+
+ b->cursor = nir_instr_remove(&intrin->instr);
+ nir_def_rewrite_uses(&intrin->def, nir_imm_zero(b, 3, 32));
+ return true;
+}
+
+static void
+link_libanv(nir_shader *nir, const nir_shader *libanv)
+{
+ nir_link_shader_functions(nir, libanv);
+ NIR_PASS_V(nir, nir_inline_functions);
+ NIR_PASS_V(nir, nir_remove_non_entrypoints);
+ NIR_PASS_V(nir, nir_lower_vars_to_explicit_types, nir_var_function_temp,
+ glsl_get_cl_type_size_align);
+ NIR_PASS_V(nir, nir_opt_deref);
+ NIR_PASS_V(nir, nir_lower_vars_to_ssa);
+ NIR_PASS_V(nir, nir_lower_explicit_io,
+ nir_var_shader_temp | nir_var_function_temp | nir_var_mem_shared |
+ nir_var_mem_global,
+ nir_address_format_62bit_generic);
+}
+
+static struct anv_shader_bin *
+compile_shader(struct anv_device *device,
+ const nir_shader *libanv,
+ enum anv_internal_kernel_name shader_name,
+ gl_shader_stage stage,
+ const char *name,
+ const void *hash_key,
+ uint32_t hash_key_size,
+ uint32_t sends_count_expectation)
+{
+ const nir_shader_compiler_options *nir_options =
+ device->physical->compiler->nir_options[stage];
+
+ nir_builder b = nir_builder_init_simple_shader(stage, nir_options,
+ "%s", name);
+
+ uint32_t uniform_size =
+ anv_genX(device->info, call_internal_shader)(&b, shader_name);
+
+ nir_shader *nir = b.shader;
+
+ link_libanv(nir, libanv);
+
+ NIR_PASS_V(nir, nir_lower_vars_to_ssa);
+ NIR_PASS_V(nir, nir_opt_cse);
+ NIR_PASS_V(nir, nir_opt_gcm, true);
+ NIR_PASS_V(nir, nir_opt_peephole_select, 1, false, false);
+
+ NIR_PASS_V(nir, nir_lower_variable_initializers, ~0);
+
+ NIR_PASS_V(nir, nir_split_var_copies);
+ NIR_PASS_V(nir, nir_split_per_member_structs);
+
+ if (stage == MESA_SHADER_COMPUTE) {
+ nir->info.workgroup_size[0] = 16;
+ nir->info.workgroup_size[1] = 1;
+ nir->info.workgroup_size[2] = 1;
+ }
+
+ struct brw_compiler *compiler = device->physical->compiler;
+ struct brw_nir_compiler_opts opts = {};
+ brw_preprocess_nir(compiler, nir, &opts);
+
+ NIR_PASS_V(nir, nir_propagate_invariant, false);
+
+ if (stage == MESA_SHADER_FRAGMENT) {
+ NIR_PASS_V(nir, nir_lower_input_attachments,
+ &(nir_input_attachment_options) {
+ .use_fragcoord_sysval = true,
+ .use_layer_id_sysval = true,
+ });
+ } else {
+ nir_lower_compute_system_values_options options = {
+ .has_base_workgroup_id = true,
+ .lower_cs_local_id_to_index = true,
+ .lower_workgroup_id_to_index = gl_shader_stage_is_mesh(stage),
+ };
+ NIR_PASS_V(nir, nir_lower_compute_system_values, &options);
+ NIR_PASS_V(nir, nir_shader_intrinsics_pass, lower_base_workgroup_id,
+ nir_metadata_block_index | nir_metadata_dominance, NULL);
+ }
+
+ /* Reset sizes before gathering information */
+ nir->global_mem_size = 0;
+ nir->scratch_size = 0;
+ nir->info.shared_size = 0;
+ nir_shader_gather_info(nir, nir_shader_get_entrypoint(nir));
+
+ NIR_PASS_V(nir, nir_copy_prop);
+ NIR_PASS_V(nir, nir_opt_constant_folding);
+ NIR_PASS_V(nir, nir_opt_dce);
+
+ union brw_any_prog_key key;
+ memset(&key, 0, sizeof(key));
+
+ union brw_any_prog_data prog_data;
+ memset(&prog_data, 0, sizeof(prog_data));
+
+ if (stage == MESA_SHADER_COMPUTE) {
+ NIR_PASS_V(nir, brw_nir_lower_cs_intrinsics,
+ device->info, &prog_data.cs);
+ }
+
+   /* Do vectorizing here. For some reason when trying to do it in the
+    * backend this just isn't working.
+ */
+ nir_load_store_vectorize_options options = {
+ .modes = nir_var_mem_ubo | nir_var_mem_ssbo | nir_var_mem_global,
+ .callback = brw_nir_should_vectorize_mem,
+ .robust_modes = (nir_variable_mode)0,
+ };
+ NIR_PASS_V(nir, nir_opt_load_store_vectorize, &options);
+
+ nir->num_uniforms = uniform_size;
+
+ prog_data.base.nr_params = nir->num_uniforms / 4;
+
+ brw_nir_analyze_ubo_ranges(compiler, nir, prog_data.base.ubo_ranges);
+
+ void *temp_ctx = ralloc_context(NULL);
+
+ const unsigned *program;
+ if (stage == MESA_SHADER_FRAGMENT) {
+ struct brw_compile_stats stats[3];
+ struct brw_compile_fs_params params = {
+ .base = {
+ .nir = nir,
+ .log_data = device,
+ .debug_flag = DEBUG_WM,
+ .stats = stats,
+ .mem_ctx = temp_ctx,
+ },
+ .key = &key.wm,
+ .prog_data = &prog_data.wm,
+ };
+ program = brw_compile_fs(compiler, &params);
+
+ unsigned stat_idx = 0;
+ if (prog_data.wm.dispatch_8) {
+ assert(stats[stat_idx].spills == 0);
+ assert(stats[stat_idx].fills == 0);
+ assert(stats[stat_idx].sends == sends_count_expectation);
+ stat_idx++;
+ }
+ if (prog_data.wm.dispatch_16) {
+ assert(stats[stat_idx].spills == 0);
+ assert(stats[stat_idx].fills == 0);
+ assert(stats[stat_idx].sends == sends_count_expectation);
+ stat_idx++;
+ }
+ if (prog_data.wm.dispatch_32) {
+ assert(stats[stat_idx].spills == 0);
+ assert(stats[stat_idx].fills == 0);
+ assert(stats[stat_idx].sends == sends_count_expectation * 2);
+ stat_idx++;
+ }
+ } else {
+ struct brw_compile_stats stats;
+ struct brw_compile_cs_params params = {
+ .base = {
+ .nir = nir,
+ .stats = &stats,
+ .log_data = device,
+ .debug_flag = DEBUG_CS,
+ .mem_ctx = temp_ctx,
+ },
+ .key = &key.cs,
+ .prog_data = &prog_data.cs,
+ };
+ program = brw_compile_cs(compiler, &params);
+
+ assert(stats.spills == 0);
+ assert(stats.fills == 0);
+ assert(stats.sends == sends_count_expectation);
+ }
+
+ assert(prog_data.base.total_scratch == 0);
+
+ struct anv_pipeline_bind_map empty_bind_map = {};
+ struct anv_push_descriptor_info empty_push_desc_info = {};
+ struct anv_shader_upload_params upload_params = {
+ .stage = nir->info.stage,
+ .key_data = hash_key,
+ .key_size = hash_key_size,
+ .kernel_data = program,
+ .kernel_size = prog_data.base.program_size,
+ .prog_data = &prog_data.base,
+ .prog_data_size = sizeof(prog_data),
+ .bind_map = &empty_bind_map,
+ .push_desc_info = &empty_push_desc_info,
+ };
+
+ struct anv_shader_bin *kernel =
+ anv_device_upload_kernel(device, device->internal_cache, &upload_params);
+
+ ralloc_free(temp_ctx);
+ ralloc_free(nir);
+
+ return kernel;
+}
+
+VkResult
+anv_device_get_internal_shader(struct anv_device *device,
+ enum anv_internal_kernel_name name,
+ struct anv_shader_bin **out_bin)
+{
+ const struct {
+ struct {
+ char name[40];
+ } key;
+
+ gl_shader_stage stage;
+
+ uint32_t send_count;
+ } internal_kernels[] = {
+ [ANV_INTERNAL_KERNEL_GENERATED_DRAWS] = {
+ .key = {
+ .name = "anv-generated-indirect-draws",
+ },
+ .stage = MESA_SHADER_FRAGMENT,
+ .send_count = (device->info->ver == 9 ?
+ /* 1 load +
+ * 4 stores +
+ * 2 * (2 loads + 2 stores) +
+ * 3 stores
+ */
+ 16 :
+ /* 1 load +
+ * 2 * (2 loads + 3 stores) +
+ * 3 stores
+ */
+ 14),
+ },
+ [ANV_INTERNAL_KERNEL_COPY_QUERY_RESULTS_COMPUTE] = {
+ .key = {
+ .name = "anv-copy-query-compute",
+ },
+ .stage = MESA_SHADER_COMPUTE,
+ .send_count = device->info->verx10 >= 125 ?
+ 9 /* 4 loads + 4 stores + 1 EOT */ :
+ 8 /* 3 loads + 4 stores + 1 EOT */,
+ },
+ [ANV_INTERNAL_KERNEL_COPY_QUERY_RESULTS_FRAGMENT] = {
+ .key = {
+ .name = "anv-copy-query-fragment",
+ },
+ .stage = MESA_SHADER_FRAGMENT,
+ .send_count = 8 /* 3 loads + 4 stores + 1 EOT */,
+ },
+ [ANV_INTERNAL_KERNEL_MEMCPY_COMPUTE] = {
+ .key = {
+ .name = "anv-memcpy-compute",
+ },
+ .stage = MESA_SHADER_COMPUTE,
+ .send_count = device->info->verx10 >= 125 ?
+ 10 /* 5 loads (1 pull constants) + 4 stores + 1 EOT */ :
+ 9 /* 4 loads + 4 stores + 1 EOT */,
+ },
+ };
+
+ struct anv_shader_bin *bin =
+ p_atomic_read(&device->internal_kernels[name]);
+ if (bin != NULL) {
+ *out_bin = bin;
+ return VK_SUCCESS;
+ }
+
+ bin =
+ anv_device_search_for_kernel(device,
+ device->internal_cache,
+ &internal_kernels[name].key,
+ sizeof(internal_kernels[name].key),
+ NULL);
+ if (bin != NULL) {
+ p_atomic_set(&device->internal_kernels[name], bin);
+ *out_bin = bin;
+ return VK_SUCCESS;
+ }
+
+ void *mem_ctx = ralloc_context(NULL);
+
+ nir_shader *libanv_shaders =
+ anv_genX(device->info, load_libanv_shader)(device, mem_ctx);
+
+ bin = compile_shader(device,
+ libanv_shaders,
+ name,
+ internal_kernels[name].stage,
+ internal_kernels[name].key.name,
+ &internal_kernels[name].key,
+ sizeof(internal_kernels[name].key),
+ internal_kernels[name].send_count);
+ if (bin == NULL)
+ return vk_errorf(device, VK_ERROR_OUT_OF_HOST_MEMORY,
+                       "Unable to compile internal kernel");
+
+ /* The cache already has a reference and it's not going anywhere so
+ * there is no need to hold a second reference.
+ */
+ anv_shader_bin_unref(device, bin);
+
+ p_atomic_set(&device->internal_kernels[name], bin);
+
+ *out_bin = bin;
+ return VK_SUCCESS;
+}
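
A hedged usage sketch of the lookup above: a caller fetches (and lazily compiles) an internal kernel before emitting work that uses it. The error handling shown is illustrative:

struct anv_shader_bin *bin;
VkResult result =
   anv_device_get_internal_shader(device,
                                  ANV_INTERNAL_KERNEL_MEMCPY_COMPUTE,
                                  &bin);
if (result != VK_SUCCESS)
   return result;
/* 'bin' stays valid without taking an extra reference: the internal cache
 * and device->internal_kernels[] keep it alive for the device's lifetime.
 */
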
+
+VkResult
+anv_device_init_internal_kernels(struct anv_device *device)
+{
+ const struct intel_l3_weights w =
+ intel_get_default_l3_weights(device->info,
+ true /* wants_dc_cache */,
+ false /* needs_slm */);
+ device->internal_kernels_l3_config = intel_get_l3_config(device->info, w);
+
+ return VK_SUCCESS;
+}
+
+void
+anv_device_finish_internal_kernels(struct anv_device *device)
+{
+}
diff --git a/src/intel/vulkan/anv_internal_kernels.h b/src/intel/vulkan/anv_internal_kernels.h
new file mode 100644
index 00000000000..d0e325add2a
--- /dev/null
+++ b/src/intel/vulkan/anv_internal_kernels.h
@@ -0,0 +1,131 @@
+/*
+ * Copyright © 2022 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef ANV_GENERATED_INDIRECT_DRAWS_H
+#define ANV_GENERATED_INDIRECT_DRAWS_H
+
+#include "libintel_shaders.h"
+
+struct PACKED anv_gen_indirect_params {
+ /* Draw ID buffer address (only used on Gfx9) */
+ uint64_t draw_id_addr;
+
+ /* Indirect data buffer address (only used on Gfx9) */
+ uint64_t indirect_data_addr;
+
+ /* Stride between each elements of the indirect data buffer */
+ uint32_t indirect_data_stride;
+
+ uint32_t flags; /* 0-7: bits, 8-15: mocs, 16-23: cmd_dws */
+
+ /* Base number of the draw ID, it is added to the index computed from the
+ * gl_FragCoord
+ */
+ uint32_t draw_base;
+
+ /* Maximum number of draws (equals to draw_count for indirect draws without
+ * an indirect count)
+ */
+ uint32_t max_draw_count;
+
+ /* Number of draws to generate in the ring buffer (only useful in ring
+ * buffer mode)
+ */
+ uint32_t ring_count;
+
+ /* Instance multiplier for multi view */
+ uint32_t instance_multiplier;
+
+ /* Address where to jump at to generate further draws (used with ring mode)
+ */
+ uint64_t gen_addr;
+
+ /* Address where to jump at after the generated draw (only used with
+ * indirect draw count variants)
+ */
+ uint64_t end_addr;
+
+ /* Destination of the generated draw commands */
+ uint64_t generated_cmds_addr;
+
+ /* Draw count address (points to the draw_count field in cases) */
+ uint64_t draw_count_addr;
+
+ /* Draw count value for non count variants of draw indirect commands */
+ uint32_t draw_count;
+
+ /* CPU side pointer to the previous item when number of draws has to be
+ * split into smaller chunks, see while loop in
+ * genX(cmd_buffer_emit_indirect_generated_draws)
+ */
+ struct anv_gen_indirect_params *prev;
+};
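
The flags field above packs three byte-wide groups (flag bits, MOCS, command dword count). A purely illustrative packing helper following that layout; these functions are not part of anv:

/* Hypothetical helpers matching "0-7: bits, 8-15: mocs, 16-23: cmd_dws". */
static inline uint32_t
pack_gen_indirect_flags(uint8_t flag_bits, uint8_t mocs, uint8_t cmd_dws)
{
   return (uint32_t)flag_bits |
          ((uint32_t)mocs << 8) |
          ((uint32_t)cmd_dws << 16);
}

static inline uint8_t
unpack_gen_indirect_mocs(uint32_t flags)
{
   return (flags >> 8) & 0xff;
}
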
+
+struct PACKED anv_query_copy_params {
+ /* ANV_COPY_QUERY_FLAG_* flags */
+ uint32_t flags;
+
+ /* Number of queries to copy */
+ uint32_t num_queries;
+
+ /* Number of items to write back in the results per query */
+ uint32_t num_items;
+
+ /* First query to copy result from */
+ uint32_t query_base;
+
+ /* Query stride in bytes */
+ uint32_t query_stride;
+
+ /* Offset at which the data should be read from */
+ uint32_t query_data_offset;
+
+ /* Stride of destination writes */
+ uint32_t destination_stride;
+
+ /* We need to be 64 bit aligned, or 32 bit builds get
+ * very unhappy.
+ */
+ uint32_t padding;
+
+ /* Address of the query pool */
+ uint64_t query_data_addr;
+
+ /* Destination address of the results */
+ uint64_t destination_addr;
+};
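
A sketch of how a copy kernel plausibly derives per-query source and destination addresses from these parameters, written as host C for clarity; this is not the actual libanv kernel code:

/* Hypothetical addressing for query index i (0 <= i < num_queries). */
static void
copy_one_query_addresses(const struct anv_query_copy_params *params,
                         uint32_t i, uint64_t *src, uint64_t *dst)
{
   *src = params->query_data_addr +
          (uint64_t)(params->query_base + i) * params->query_stride +
          params->query_data_offset;
   *dst = params->destination_addr +
          (uint64_t)i * params->destination_stride;
   /* num_items values are then read from *src and written to *dst,
    * honoring the ANV_COPY_QUERY_FLAG_* bits in params->flags.
    */
}
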
+
+struct PACKED anv_memcpy_params {
+   /* Number of dwords to copy */
+ uint32_t num_dwords;
+
+ uint32_t pad;
+
+ /* Source address of the copy */
+ uint64_t src_addr;
+
+ /* Destination address of the copy */
+ uint64_t dst_addr;
+};
+
+#endif /* ANV_GENERATED_INDIRECT_DRAWS_H */
diff --git a/src/intel/vulkan/anv_kmd_backend.c b/src/intel/vulkan/anv_kmd_backend.c
new file mode 100644
index 00000000000..8ce882bba26
--- /dev/null
+++ b/src/intel/vulkan/anv_kmd_backend.c
@@ -0,0 +1,42 @@
+/*
+ * Copyright © 2023 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <stdlib.h>
+
+#include "anv_kmd_backend.h"
+#include "anv_private.h"
+
+const struct anv_kmd_backend *
+anv_kmd_backend_get(enum intel_kmd_type type)
+{
+ switch (type) {
+ case INTEL_KMD_TYPE_I915:
+ return anv_i915_kmd_backend_get();
+ case INTEL_KMD_TYPE_XE:
+ return anv_xe_kmd_backend_get();
+ case INTEL_KMD_TYPE_STUB:
+ return anv_stub_kmd_backend_get();
+ default:
+ return NULL;
+ }
+}
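
A hedged sketch of backend selection during device initialization; the exact field names on the anv device structures are assumptions:

const struct anv_kmd_backend *backend =
   anv_kmd_backend_get(physical_device->info.kmd_type);
if (backend == NULL) {
   return vk_errorf(physical_device, VK_ERROR_INCOMPATIBLE_DRIVER,
                    "unsupported kernel mode driver");
}
device->kmd_backend = backend;
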
diff --git a/src/intel/vulkan/anv_kmd_backend.h b/src/intel/vulkan/anv_kmd_backend.h
new file mode 100644
index 00000000000..13d3799858e
--- /dev/null
+++ b/src/intel/vulkan/anv_kmd_backend.h
@@ -0,0 +1,136 @@
+/*
+ * Copyright © 2023 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#pragma once
+
+#include <stdint.h>
+
+#include "vulkan/vulkan_core.h"
+#include "vk_sync.h"
+
+#include "dev/intel_device_info.h"
+#include "dev/intel_kmd.h"
+
+struct anv_bo;
+enum anv_bo_alloc_flags;
+struct anv_cmd_buffer;
+struct anv_device;
+struct anv_queue;
+struct anv_query_pool;
+struct anv_utrace_submit;
+struct anv_sparse_submission;
+struct anv_trtt_batch_bo;
+
+enum anv_vm_bind_op {
+ /* bind vma specified in anv_vm_bind */
+ ANV_VM_BIND,
+ /* unbind vma specified in anv_vm_bind */
+ ANV_VM_UNBIND,
+ /* unbind all vmas of anv_vm_bind::bo, address and size fields must be set to 0 */
+ ANV_VM_UNBIND_ALL,
+};
+
+struct anv_vm_bind {
+ struct anv_bo *bo; /* Or NULL in case of a NULL binding. */
+ uint64_t address; /* Includes the resource offset. */
+ uint64_t bo_offset; /* Also known as the memory offset. */
+ uint64_t size;
+ enum anv_vm_bind_op op;
+};
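
For illustration, two bind descriptors following the conventions documented above: a regular bind at a VMA, and a full unbind where address and size must stay 0. The variables 'bo' and 'bo_vma_address' are assumed to come from the caller:

struct anv_vm_bind bind = {
   .bo = bo,
   .address = bo_vma_address,   /* includes the resource offset */
   .bo_offset = 0,
   .size = bo->size,
   .op = ANV_VM_BIND,
};

struct anv_vm_bind unbind_all = {
   .bo = bo,
   .address = 0,                /* must be 0 for ANV_VM_UNBIND_ALL */
   .size = 0,                   /* must be 0 for ANV_VM_UNBIND_ALL */
   .op = ANV_VM_UNBIND_ALL,
};
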
+
+/* These flags apply only to the vm_bind() ioctl backend operations, not to
+ * the higher-level concept of resource address binding. In other words: they
+ * don't apply to TR-TT, which also uses other structs with "vm_bind" in their
+ * names.
+ */
+enum anv_vm_bind_flags {
+ ANV_VM_BIND_FLAG_NONE = 0,
+ /* The most recent bind_timeline wait point is waited for during every
+ * command submission. This flag allows the vm_bind operation to create a
+ * new timeline point and signal it upon completion.
+ */
+ ANV_VM_BIND_FLAG_SIGNAL_BIND_TIMELINE = 1 << 0,
+};
+
+struct anv_kmd_backend {
+ /*
+ * Create a gem buffer.
+ * Return the gem handle in case of success otherwise returns 0.
+ */
+ uint32_t (*gem_create)(struct anv_device *device,
+ const struct intel_memory_class_instance **regions,
+ uint16_t num_regions, uint64_t size,
+ enum anv_bo_alloc_flags alloc_flags,
+ uint64_t *actual_size);
+ uint32_t (*gem_create_userptr)(struct anv_device *device, void *mem, uint64_t size);
+ void (*gem_close)(struct anv_device *device, struct anv_bo *bo);
+ /* Returns MAP_FAILED on error */
+ void *(*gem_mmap)(struct anv_device *device, struct anv_bo *bo,
+ uint64_t offset, uint64_t size, void *placed_addr);
+
+ /*
+ * Bind things however you want.
+    * This is intended for sparse resources, so it's a little lower level
+    * than the _bo variants below.
+ */
+ VkResult (*vm_bind)(struct anv_device *device,
+ struct anv_sparse_submission *submit,
+ enum anv_vm_bind_flags flags);
+
+ /*
+ * Fully bind or unbind a BO.
+ * This is intended for general buffer creation/destruction, so it creates
+    * a new point in the bind_timeline, which will be waited on the next
+    * time a batch is submitted.
+ */
+ VkResult (*vm_bind_bo)(struct anv_device *device, struct anv_bo *bo);
+ VkResult (*vm_unbind_bo)(struct anv_device *device, struct anv_bo *bo);
+
+ VkResult (*execute_simple_batch)(struct anv_queue *queue,
+ struct anv_bo *batch_bo,
+ uint32_t batch_bo_size,
+ bool is_companion_rcs_batch);
+ VkResult (*execute_trtt_batch)(struct anv_sparse_submission *submit,
+ struct anv_trtt_batch_bo *trtt_bbo);
+ VkResult (*queue_exec_locked)(struct anv_queue *queue,
+ uint32_t wait_count,
+ const struct vk_sync_wait *waits,
+ uint32_t cmd_buffer_count,
+ struct anv_cmd_buffer **cmd_buffers,
+ uint32_t signal_count,
+ const struct vk_sync_signal *signals,
+ struct anv_query_pool *perf_query_pool,
+ uint32_t perf_query_pass,
+ struct anv_utrace_submit *utrace_submit);
+ VkResult (*queue_exec_trace)(struct anv_queue *queue,
+ struct anv_utrace_submit *submit);
+ uint32_t (*bo_alloc_flags_to_bo_flags)(struct anv_device *device,
+ enum anv_bo_alloc_flags alloc_flags);
+};
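
A sketch of how callers dispatch through this vtable when a BO becomes resident and when it is freed; actual anv call sites wrap this with more bookkeeping:

/* On allocation: make the BO resident in the VM. */
VkResult result = device->kmd_backend->vm_bind_bo(device, bo);
if (result != VK_SUCCESS)
   return result;

/* ... the BO is usable by subsequent submissions ... */

/* On free: drop the mapping before releasing the GEM handle. */
device->kmd_backend->vm_unbind_bo(device, bo);
device->kmd_backend->gem_close(device, bo);
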
+
+const struct anv_kmd_backend *anv_kmd_backend_get(enum intel_kmd_type type);
+
+/* Internal functions, should only be called by anv_kmd_backend_get() */
+const struct anv_kmd_backend *anv_i915_kmd_backend_get(void);
+const struct anv_kmd_backend *anv_xe_kmd_backend_get(void);
+const struct anv_kmd_backend *anv_stub_kmd_backend_get(void);
diff --git a/src/intel/vulkan/anv_measure.c b/src/intel/vulkan/anv_measure.c
index 2ac654b7c05..8e778946ea8 100644
--- a/src/intel/vulkan/anv_measure.c
+++ b/src/intel/vulkan/anv_measure.c
@@ -28,7 +28,7 @@
#include <sys/types.h>
#include "common/intel_measure.h"
-#include "util/debug.h"
+#include "util/u_debug.h"
struct anv_measure_batch {
struct anv_bo *bo;
@@ -38,32 +38,6 @@ struct anv_measure_batch {
void
anv_measure_device_init(struct anv_physical_device *device)
{
- switch (device->info.verx10) {
- case 125:
- device->cmd_emit_timestamp = &gfx125_cmd_emit_timestamp;
- break;
- case 120:
- device->cmd_emit_timestamp = &gfx12_cmd_emit_timestamp;
- break;
- case 110:
- device->cmd_emit_timestamp = &gfx11_cmd_emit_timestamp;
- break;
- case 90:
- device->cmd_emit_timestamp = &gfx9_cmd_emit_timestamp;
- break;
- case 80:
- device->cmd_emit_timestamp = &gfx8_cmd_emit_timestamp;
- break;
- case 75:
- device->cmd_emit_timestamp = &gfx75_cmd_emit_timestamp;
- break;
- case 70:
- device->cmd_emit_timestamp = &gfx7_cmd_emit_timestamp;
- break;
- default:
- assert(false);
- }
-
/* initialise list of measure structures that await rendering */
struct intel_measure_device *measure_device = &device->measure_device;
intel_measure_init(measure_device);
@@ -108,21 +82,25 @@ anv_measure_init(struct anv_cmd_buffer *cmd_buffer)
const size_t batch_bytes = sizeof(struct anv_measure_batch) +
config->batch_size * sizeof(struct intel_measure_snapshot);
struct anv_measure_batch * measure =
- vk_alloc(&cmd_buffer->pool->alloc,
+ vk_alloc(&cmd_buffer->vk.pool->alloc,
batch_bytes, 8,
VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
memset(measure, 0, batch_bytes);
+ cmd_buffer->measure = measure;
+ if(config->cpu_measure)
+ return;
+
ASSERTED VkResult result =
anv_device_alloc_bo(device, "measure data",
config->batch_size * sizeof(uint64_t),
- ANV_BO_ALLOC_MAPPED,
+ ANV_BO_ALLOC_MAPPED |
+ ANV_BO_ALLOC_HOST_CACHED_COHERENT |
+ ANV_BO_ALLOC_INTERNAL,
0,
(struct anv_bo**)&measure->bo);
measure->base.timestamps = measure->bo->map;
assert(result == VK_SUCCESS);
-
- cmd_buffer->measure = measure;
}
static void
@@ -135,33 +113,37 @@ anv_measure_start_snapshot(struct anv_cmd_buffer *cmd_buffer,
struct anv_measure_batch *measure = cmd_buffer->measure;
struct anv_physical_device *device = cmd_buffer->device->physical;
struct intel_measure_device *measure_device = &device->measure_device;
+ struct intel_measure_config *config = config_from_command_buffer(cmd_buffer);
+ enum anv_timestamp_capture_type capture_type;
+ unsigned index = measure->base.index++;
- const unsigned device_frame = measure_device->frame;
-
- /* if the command buffer is not associated with a frame, associate it with
- * the most recent acquired frame
- */
- if (measure->base.frame == 0)
- measure->base.frame = device_frame;
-
- uintptr_t framebuffer = (uintptr_t)cmd_buffer->state.framebuffer;
-
- if (!measure->base.framebuffer &&
- cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY)
- /* secondary command buffer inherited the framebuffer from the primary */
- measure->base.framebuffer = framebuffer;
+ if (event_name == NULL)
+ event_name = intel_measure_snapshot_string(type);
- /* verify framebuffer has been properly tracked */
- assert(type == INTEL_SNAPSHOT_END ||
- framebuffer == measure->base.framebuffer ||
- framebuffer == 0 ); /* compute has no framebuffer */
+ if (config->cpu_measure) {
+ intel_measure_print_cpu_result(measure_device->frame,
+ measure->base.batch_count,
+ measure->base.batch_size,
+ index/2,
+ measure->base.event_count,
+ count,
+ event_name);
+ return;
+ }
- unsigned index = measure->base.index++;
- (*device->cmd_emit_timestamp)(batch, measure->bo, index * sizeof(uint64_t));
+ if ((batch->engine_class == INTEL_ENGINE_CLASS_COPY) ||
+ (batch->engine_class == INTEL_ENGINE_CLASS_VIDEO))
+ capture_type = ANV_TIMESTAMP_CAPTURE_TOP_OF_PIPE;
+ else
+ capture_type = ANV_TIMESTAMP_CAPTURE_AT_CS_STALL;
- if (event_name == NULL)
- event_name = intel_measure_snapshot_string(type);
+ (*device->cmd_emit_timestamp)(batch, cmd_buffer->device,
+ (struct anv_address) {
+ .bo = measure->bo,
+ .offset = index * sizeof(uint64_t) },
+ capture_type,
+ NULL);
struct intel_measure_snapshot *snapshot = &(measure->base.snapshots[index]);
memset(snapshot, 0, sizeof(*snapshot));
@@ -169,18 +151,23 @@ anv_measure_start_snapshot(struct anv_cmd_buffer *cmd_buffer,
snapshot->count = (unsigned) count;
snapshot->event_count = measure->base.event_count;
snapshot->event_name = event_name;
- snapshot->framebuffer = framebuffer;
-
- if (type == INTEL_SNAPSHOT_COMPUTE && cmd_buffer->state.compute.pipeline) {
- snapshot->cs = (uintptr_t) cmd_buffer->state.compute.pipeline->cs;
- } else if (cmd_buffer->state.gfx.pipeline) {
+ snapshot->renderpass = (type == INTEL_SNAPSHOT_COMPUTE) ? 0
+ : measure->base.renderpass;
+
+ if (type == INTEL_SNAPSHOT_COMPUTE && cmd_buffer->state.compute.base.pipeline) {
+ const struct anv_compute_pipeline *pipeline =
+ anv_pipeline_to_compute(cmd_buffer->state.compute.base.pipeline);
+ snapshot->cs = pipeline->source_hash;
+ } else if (type == INTEL_SNAPSHOT_DRAW && cmd_buffer->state.gfx.base.pipeline) {
const struct anv_graphics_pipeline *pipeline =
- cmd_buffer->state.gfx.pipeline;
- snapshot->vs = (uintptr_t) pipeline->shaders[MESA_SHADER_VERTEX];
- snapshot->tcs = (uintptr_t) pipeline->shaders[MESA_SHADER_TESS_CTRL];
- snapshot->tes = (uintptr_t) pipeline->shaders[MESA_SHADER_TESS_EVAL];
- snapshot->gs = (uintptr_t) pipeline->shaders[MESA_SHADER_GEOMETRY];
- snapshot->fs = (uintptr_t) pipeline->shaders[MESA_SHADER_FRAGMENT];
+ anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
+ snapshot->vs = pipeline->base.source_hashes[MESA_SHADER_VERTEX];
+ snapshot->tcs = pipeline->base.source_hashes[MESA_SHADER_TESS_CTRL];
+ snapshot->tes = pipeline->base.source_hashes[MESA_SHADER_TESS_EVAL];
+ snapshot->gs = pipeline->base.source_hashes[MESA_SHADER_GEOMETRY];
+ snapshot->fs = pipeline->base.source_hashes[MESA_SHADER_FRAGMENT];
+ snapshot->ms = pipeline->base.source_hashes[MESA_SHADER_MESH];
+ snapshot->ts = pipeline->base.source_hashes[MESA_SHADER_TASK];
}
}
@@ -191,11 +178,26 @@ anv_measure_end_snapshot(struct anv_cmd_buffer *cmd_buffer,
struct anv_batch *batch = &cmd_buffer->batch;
struct anv_measure_batch *measure = cmd_buffer->measure;
struct anv_physical_device *device = cmd_buffer->device->physical;
-
+ struct intel_measure_config *config = config_from_command_buffer(cmd_buffer);
+ enum anv_timestamp_capture_type capture_type;
unsigned index = measure->base.index++;
assert(index % 2 == 1);
- (*device->cmd_emit_timestamp)(batch, measure->bo, index * sizeof(uint64_t));
+ if (config->cpu_measure)
+ return;
+
+ if ((batch->engine_class == INTEL_ENGINE_CLASS_COPY) ||
+ (batch->engine_class == INTEL_ENGINE_CLASS_VIDEO))
+ capture_type = ANV_TIMESTAMP_CAPTURE_END_OF_PIPE;
+ else
+ capture_type = ANV_TIMESTAMP_CAPTURE_AT_CS_STALL;
+
+ (*device->cmd_emit_timestamp)(batch, cmd_buffer->device,
+ (struct anv_address) {
+ .bo = measure->bo,
+ .offset = index * sizeof(uint64_t) },
+ capture_type,
+ NULL);
struct intel_measure_snapshot *snapshot = &(measure->base.snapshots[index]);
memset(snapshot, 0, sizeof(*snapshot));
@@ -207,7 +209,7 @@ static bool
state_changed(struct anv_cmd_buffer *cmd_buffer,
enum intel_measure_snapshot_type type)
{
- uintptr_t vs=0, tcs=0, tes=0, gs=0, fs=0, cs=0;
+ uint32_t vs=0, tcs=0, tes=0, gs=0, fs=0, cs=0, ms=0, ts=0;
if (cmd_buffer->usage_flags & VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT)
/* can't record timestamps in this mode */
@@ -215,22 +217,25 @@ state_changed(struct anv_cmd_buffer *cmd_buffer,
if (type == INTEL_SNAPSHOT_COMPUTE) {
const struct anv_compute_pipeline *cs_pipe =
- cmd_buffer->state.compute.pipeline;
+ anv_pipeline_to_compute(cmd_buffer->state.compute.base.pipeline);
assert(cs_pipe);
- cs = (uintptr_t)cs_pipe->cs;
+ cs = cs_pipe->source_hash;
} else if (type == INTEL_SNAPSHOT_DRAW) {
- const struct anv_graphics_pipeline *gfx = cmd_buffer->state.gfx.pipeline;
+ const struct anv_graphics_pipeline *gfx =
+ anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
assert(gfx);
- vs = (uintptr_t) gfx->shaders[MESA_SHADER_VERTEX];
- tcs = (uintptr_t) gfx->shaders[MESA_SHADER_TESS_CTRL];
- tes = (uintptr_t) gfx->shaders[MESA_SHADER_TESS_EVAL];
- gs = (uintptr_t) gfx->shaders[MESA_SHADER_GEOMETRY];
- fs = (uintptr_t) gfx->shaders[MESA_SHADER_FRAGMENT];
+ vs = gfx->base.source_hashes[MESA_SHADER_VERTEX];
+ tcs = gfx->base.source_hashes[MESA_SHADER_TESS_CTRL];
+ tes = gfx->base.source_hashes[MESA_SHADER_TESS_EVAL];
+ gs = gfx->base.source_hashes[MESA_SHADER_GEOMETRY];
+ fs = gfx->base.source_hashes[MESA_SHADER_FRAGMENT];
+ ms = gfx->base.source_hashes[MESA_SHADER_MESH];
+ ts = gfx->base.source_hashes[MESA_SHADER_TASK];
}
/* else blorp, all programs NULL */
return intel_measure_state_changed(&cmd_buffer->measure->base,
- vs, tcs, tes, gs, fs, cs);
+ vs, tcs, tes, gs, fs, cs, ms, ts);
}
void
@@ -315,25 +320,15 @@ anv_measure_reset(struct anv_cmd_buffer *cmd_buffer)
* yet been processed
*/
intel_measure_gather(&device->physical->measure_device,
- &device->info);
+ device->info);
assert(cmd_buffer->device != NULL);
measure->base.index = 0;
- measure->base.framebuffer = 0;
+ measure->base.renderpass = 0;
measure->base.frame = 0;
measure->base.event_count = 0;
list_inithead(&measure->base.link);
-
- anv_device_release_bo(device, measure->bo);
- ASSERTED VkResult result =
- anv_device_alloc_bo(device, "measure data",
- config->batch_size * sizeof(uint64_t),
- ANV_BO_ALLOC_MAPPED,
- 0,
- (struct anv_bo**)&measure->bo);
- measure->base.timestamps = measure->bo->map;
- assert(result == VK_SUCCESS);
}
void
@@ -354,8 +349,9 @@ anv_measure_destroy(struct anv_cmd_buffer *cmd_buffer)
*/
intel_measure_gather(&physical->measure_device, &physical->info);
- anv_device_release_bo(device, measure->bo);
- vk_free(&cmd_buffer->pool->alloc, measure);
+ if (measure->bo != NULL)
+ anv_device_release_bo(device, measure->bo);
+ vk_free(&cmd_buffer->vk.pool->alloc, measure);
cmd_buffer->measure = NULL;
}
@@ -395,19 +391,30 @@ _anv_measure_submit(struct anv_cmd_buffer *cmd_buffer)
if (measure == NULL)
return;
- if (measure->base.index == 0)
+ struct intel_measure_batch *base = &measure->base;
+ if (base->index == 0)
/* no snapshots were started */
return;
/* finalize snapshots and enqueue them */
static unsigned cmd_buffer_count = 0;
- measure->base.batch_count = p_atomic_inc_return(&cmd_buffer_count);
+ base->batch_count = p_atomic_inc_return(&cmd_buffer_count);
+ base->batch_size = cmd_buffer->total_batch_size;
+ base->frame = measure_device->frame;
- if (measure->base.index %2 == 1) {
- anv_measure_end_snapshot(cmd_buffer, measure->base.event_count);
- measure->base.event_count = 0;
+ if (base->index %2 == 1) {
+ anv_measure_end_snapshot(cmd_buffer, base->event_count);
+ base->event_count = 0;
}
+ if (config->cpu_measure)
+ return;
+
+ /* Mark the final timestamp as 'not completed'. This marker will be used
+ * to verify that rendering is complete.
+ */
+ base->timestamps[base->index - 1] = 0;
+
/* add to the list of submitted snapshots */
pthread_mutex_lock(&measure_device->mutex);
list_addtail(&measure->base.link, &measure_device->queued_snapshots);
@@ -418,7 +425,7 @@ _anv_measure_submit(struct anv_cmd_buffer *cmd_buffer)
* Hook for the start of a frame.
*/
void
-anv_measure_acquire(struct anv_device *device)
+_anv_measure_acquire(struct anv_device *device)
{
struct intel_measure_config *config = config_from_device(device);
struct intel_measure_device *measure_device = &device->physical->measure_device;
@@ -456,14 +463,10 @@ _anv_measure_beginrenderpass(struct anv_cmd_buffer *cmd_buffer)
{
struct intel_measure_config *config = config_from_command_buffer(cmd_buffer);
struct anv_measure_batch *measure = cmd_buffer->measure;
+ struct anv_physical_device *device = cmd_buffer->device->physical;
+ struct intel_measure_device *measure_device = &device->measure_device;
- if (!config)
- return;
- if (measure == NULL)
- return;
-
- if (measure->base.framebuffer == (uintptr_t) cmd_buffer->state.framebuffer)
- /* no change */
+ if (!config || !measure)
return;
bool filtering = (config->flags & (INTEL_MEASURE_RENDERPASS |
@@ -475,7 +478,8 @@ _anv_measure_beginrenderpass(struct anv_cmd_buffer *cmd_buffer)
measure->base.event_count = 0;
}
- measure->base.framebuffer = (uintptr_t) cmd_buffer->state.framebuffer;
+ measure->base.renderpass =
+ (uintptr_t) p_atomic_inc_return(&measure_device->render_pass_count);
}
void
diff --git a/src/intel/vulkan/anv_measure.h b/src/intel/vulkan/anv_measure.h
index bca0fc0c207..a058a5ac51e 100644
--- a/src/intel/vulkan/anv_measure.h
+++ b/src/intel/vulkan/anv_measure.h
@@ -46,7 +46,7 @@ void _anv_measure_endcommandbuffer(struct anv_cmd_buffer *cmd_buffer);
void _anv_measure_beginrenderpass(struct anv_cmd_buffer *cmd_buffer);
/* tracks frame progression */
-void anv_measure_acquire(struct anv_device *device);
+void _anv_measure_acquire(struct anv_device *device);
/* should be combined with endcommandbuffer */
void _anv_measure_submit(struct anv_cmd_buffer *cmd_buffer);
@@ -55,6 +55,10 @@ void
_anv_measure_add_secondary(struct anv_cmd_buffer *primary,
struct anv_cmd_buffer *secondary);
+#define anv_measure_acquire(device) \
+ if (unlikely(device->physical->measure_device.config)) \
+ _anv_measure_acquire(device)
+
#define anv_measure_snapshot(cmd_buffer, type, event_name, count) \
if (unlikely(cmd_buffer->measure)) \
_anv_measure_snapshot(cmd_buffer, type, event_name, count)
diff --git a/src/intel/vulkan/anv_mesh_perprim_wa.c b/src/intel/vulkan/anv_mesh_perprim_wa.c
new file mode 100644
index 00000000000..f46d6a1082b
--- /dev/null
+++ b/src/intel/vulkan/anv_mesh_perprim_wa.c
@@ -0,0 +1,533 @@
+/*
+ * Copyright © 2022 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "anv_private.h"
+#include "nir_builder.h"
+
+/*
+ * Wa_18019110168 for gfx 12.5.
+ *
+ * This file implements workaround for HW bug, which leads to fragment shader
+ * reading incorrect per-primitive data if mesh shader, in addition to writing
+ * per-primitive data, also writes to gl_ClipDistance.
+ *
+ * The suggested solution to that bug is to not use per-primitive data by:
+ * - creating new vertices for provoking vertices shared by multiple primitives
+ * - converting per-primitive attributes read by fragment shader to flat
+ * per-vertex attributes for the provoking vertex
+ * - modifying fragment shader to read those per-vertex attributes
+ *
+ * There are at least 2 types of failures not handled very well:
+ * - if the number of varying slots overflows, then only some attributes will
+ * be converted, leading to corruption of those unconverted attributes
+ * - if the overall MUE size is so large it doesn't fit in URB, then URB
+ * allocation will fail in some way; unfortunately there's no good way to
+ *   tell how big the MUE will be at this moment and back out
+ *
+ * This workaround needs to be applied before linking, so that unused outputs
+ * created by this code are removed at link time.
+ *
+ * This workaround can be controlled by a driconf option to either disable it,
+ * lower its scope or force enable it.
+ *
+ * Option "anv_mesh_conv_prim_attrs_to_vert_attrs" is evaluated like this:
+ * value == 0 - disable workaround
+ * value < 0 - enable ONLY if workaround is required
+ * value > 0 - enable ALWAYS, even if it's not required
+ * abs(value) >= 1 - attribute conversion
+ * abs(value) >= 2 - attribute conversion and vertex duplication
+ *
+ * Default: -2 (both parts of the work around, ONLY if it's required)
+ *
+ */
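
A purely illustrative decode of the driconf value described above, following the rules listed in the comment; this is not the driver's actual option parsing:

static void
decode_wa_option(int value, bool wa_required,
                 bool *convert_attrs, bool *dup_vertices)
{
   /* 0 disables; negative enables only when required; positive forces. */
   const bool enable = value > 0 || (value < 0 && wa_required);

   *convert_attrs = enable;                                /* abs(value) >= 1 */
   *dup_vertices = enable && (value >= 2 || value <= -2);  /* abs(value) >= 2 */
}
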
+
+static bool
+anv_mesh_convert_attrs_prim_to_vert(struct nir_shader *nir,
+ gl_varying_slot *wa_mapping,
+ uint64_t fs_inputs,
+ const VkGraphicsPipelineCreateInfo *pCreateInfo,
+ void *mem_ctx,
+ const bool dup_vertices,
+ const bool force_conversion)
+{
+ uint64_t per_primitive_outputs = nir->info.per_primitive_outputs;
+ per_primitive_outputs &= ~BITFIELD64_BIT(VARYING_SLOT_PRIMITIVE_INDICES);
+
+ if (per_primitive_outputs == 0)
+ return false;
+
+ uint64_t outputs_written = nir->info.outputs_written;
+ uint64_t other_outputs = outputs_written & ~per_primitive_outputs;
+
+ if ((other_outputs & (VARYING_BIT_CLIP_DIST0 | VARYING_BIT_CLIP_DIST1)) == 0)
+ if (!force_conversion)
+ return false;
+
+ uint64_t all_outputs = outputs_written;
+ unsigned attrs = 0;
+
+ uint64_t remapped_outputs = outputs_written & per_primitive_outputs;
+ remapped_outputs &= ~BITFIELD64_BIT(VARYING_SLOT_CULL_PRIMITIVE);
+
+ /* Skip locations not read by the fragment shader, because they will
+ * be eliminated at linking time. Note that some fs inputs may be
+ * removed only after optimizations, so it's possible that we will
+ * create too many variables.
+ */
+ remapped_outputs &= fs_inputs;
+
+ /* Figure out the mapping between per-primitive and new per-vertex outputs. */
+ nir_foreach_shader_out_variable(var, nir) {
+ int location = var->data.location;
+
+ if (!(BITFIELD64_BIT(location) & remapped_outputs))
+ continue;
+
+      /* Although primitive shading rate, layer and viewport have a predefined
+       * place in the MUE Primitive Header (so we can't really move them
+       * anywhere), we have to copy them to per-vertex space if the fragment
+       * shader reads them.
+       */
+ assert(location == VARYING_SLOT_PRIMITIVE_SHADING_RATE ||
+ location == VARYING_SLOT_LAYER ||
+ location == VARYING_SLOT_VIEWPORT ||
+ location == VARYING_SLOT_PRIMITIVE_ID ||
+ location >= VARYING_SLOT_VAR0);
+
+ const struct glsl_type *type = var->type;
+ if (nir_is_arrayed_io(var, MESA_SHADER_MESH) || var->data.per_view) {
+ assert(glsl_type_is_array(type));
+ type = glsl_get_array_element(type);
+ }
+
+ unsigned num_slots = glsl_count_attribute_slots(type, false);
+
+ for (gl_varying_slot slot = VARYING_SLOT_VAR0; slot <= VARYING_SLOT_VAR31; slot++) {
+ uint64_t mask = BITFIELD64_MASK(num_slots) << slot;
+ if ((all_outputs & mask) == 0) {
+ wa_mapping[location] = slot;
+ all_outputs |= mask;
+ attrs++;
+ break;
+ }
+ }
+
+ if (wa_mapping[location] == 0) {
+         fprintf(stderr, "Not enough space for hardware per-primitive data corruption workaround.\n");
+ break;
+ }
+ }
+
+ if (attrs == 0)
+ if (!force_conversion)
+ return false;
+
+ unsigned provoking_vertex = 0;
+
+ const VkPipelineRasterizationStateCreateInfo *rs_info = pCreateInfo->pRasterizationState;
+ const VkPipelineRasterizationProvokingVertexStateCreateInfoEXT *rs_pv_info =
+ vk_find_struct_const(rs_info, PIPELINE_RASTERIZATION_PROVOKING_VERTEX_STATE_CREATE_INFO_EXT);
+ if (rs_pv_info && rs_pv_info->provokingVertexMode == VK_PROVOKING_VERTEX_MODE_LAST_VERTEX_EXT)
+ provoking_vertex = 2;
+
+ unsigned vertices_per_primitive =
+ mesa_vertices_per_prim(nir->info.mesh.primitive_type);
+
+ nir_function_impl *impl = nir_shader_get_entrypoint(nir);
+ nir_builder b = nir_builder_at(nir_after_impl(impl));
+
+ /* wait for all subgroups to finish */
+ nir_barrier(&b, SCOPE_WORKGROUP);
+
+ nir_def *zero = nir_imm_int(&b, 0);
+
+ nir_def *local_invocation_index = nir_load_local_invocation_index(&b);
+
+ nir_def *cmp = nir_ieq(&b, local_invocation_index, zero);
+ nir_if *if_stmt = nir_push_if(&b, cmp);
+ {
+ nir_variable *primitive_count_var = NULL;
+ nir_variable *primitive_indices_var = NULL;
+
+ unsigned num_other_variables = 0;
+ nir_foreach_shader_out_variable(var, b.shader) {
+ if ((BITFIELD64_BIT(var->data.location) & other_outputs) == 0)
+ continue;
+ num_other_variables++;
+ }
+
+ nir_deref_instr **per_vertex_derefs =
+ ralloc_array(mem_ctx, nir_deref_instr *, num_other_variables);
+
+ unsigned num_per_vertex_variables = 0;
+
+ unsigned processed = 0;
+ nir_foreach_shader_out_variable(var, b.shader) {
+ if ((BITFIELD64_BIT(var->data.location) & other_outputs) == 0)
+ continue;
+
+ switch (var->data.location) {
+ case VARYING_SLOT_PRIMITIVE_COUNT:
+ primitive_count_var = var;
+ break;
+ case VARYING_SLOT_PRIMITIVE_INDICES:
+ primitive_indices_var = var;
+ break;
+ default: {
+ const struct glsl_type *type = var->type;
+ assert(glsl_type_is_array(type));
+ const struct glsl_type *array_element_type =
+ glsl_get_array_element(type);
+
+ if (dup_vertices) {
+               /*
+                * Resize the type of the array output to make space for one
+                * extra vertex attribute per primitive, so that the provoking
+                * vertex is never shared between primitives.
+                */
+ const struct glsl_type *new_type =
+ glsl_array_type(array_element_type,
+ glsl_get_length(type) +
+ nir->info.mesh.max_primitives_out,
+ 0);
+
+ var->type = new_type;
+ }
+
+ per_vertex_derefs[num_per_vertex_variables++] =
+ nir_build_deref_var(&b, var);
+ break;
+ }
+ }
+
+ ++processed;
+ }
+ assert(processed == num_other_variables);
+
+ assert(primitive_count_var != NULL);
+ assert(primitive_indices_var != NULL);
+
+ /* Update types of derefs to match type of variables they (de)reference. */
+ if (dup_vertices) {
+ nir_foreach_function_impl(impl, b.shader) {
+ nir_foreach_block(block, impl) {
+ nir_foreach_instr(instr, block) {
+ if (instr->type != nir_instr_type_deref)
+ continue;
+
+ nir_deref_instr *deref = nir_instr_as_deref(instr);
+ if (deref->deref_type != nir_deref_type_var)
+ continue;
+
+ if (deref->var->type != deref->type)
+ deref->type = deref->var->type;
+ }
+ }
+ }
+ }
+
+ /* indexed by slot of per-prim attribute */
+ struct {
+ nir_deref_instr *per_prim_deref;
+ nir_deref_instr *per_vert_deref;
+ } mapping[VARYING_SLOT_MAX] = {{NULL, NULL}, };
+
+ /* Create new per-vertex output variables mirroring per-primitive variables
+ * and create derefs for both old and new variables.
+ */
+ nir_foreach_shader_out_variable(var, b.shader) {
+ gl_varying_slot location = var->data.location;
+
+ if ((BITFIELD64_BIT(location) & (outputs_written & per_primitive_outputs)) == 0)
+ continue;
+ if (wa_mapping[location] == 0)
+ continue;
+
+ const struct glsl_type *type = var->type;
+ assert(glsl_type_is_array(type));
+ const struct glsl_type *array_element_type = glsl_get_array_element(type);
+
+ const struct glsl_type *new_type =
+ glsl_array_type(array_element_type,
+ nir->info.mesh.max_vertices_out +
+ (dup_vertices ? nir->info.mesh.max_primitives_out : 0),
+ 0);
+
+ nir_variable *new_var =
+ nir_variable_create(b.shader, nir_var_shader_out, new_type, var->name);
+ assert(wa_mapping[location] >= VARYING_SLOT_VAR0);
+ assert(wa_mapping[location] <= VARYING_SLOT_VAR31);
+ new_var->data.location = wa_mapping[location];
+ new_var->data.interpolation = INTERP_MODE_FLAT;
+
+ mapping[location].per_vert_deref = nir_build_deref_var(&b, new_var);
+ mapping[location].per_prim_deref = nir_build_deref_var(&b, var);
+ }
+
+ nir_def *trueconst = nir_imm_true(&b);
+
+ /*
+ * for each Primitive (0 : primitiveCount)
+ * if VertexUsed[PrimitiveIndices[Primitive][provoking vertex]]
+ * create 1 new vertex at offset "Vertex"
+ * copy per vert attributes of provoking vertex to the new one
+ * update PrimitiveIndices[Primitive][provoking vertex]
+ * Vertex++
+ * else
+ * VertexUsed[PrimitiveIndices[Primitive][provoking vertex]] := true
+ *
+ * for each attribute : mapping
+ * copy per_prim_attr(Primitive) to per_vert_attr[Primitive][provoking vertex]
+ */
+
+ /* primitive count */
+ nir_def *primitive_count = nir_load_var(&b, primitive_count_var);
+
+ /* primitive index */
+ nir_variable *primitive_var =
+ nir_local_variable_create(impl, glsl_uint_type(), "Primitive");
+ nir_deref_instr *primitive_deref = nir_build_deref_var(&b, primitive_var);
+ nir_store_deref(&b, primitive_deref, zero, 1);
+
+ /* vertex index */
+ nir_variable *vertex_var =
+ nir_local_variable_create(impl, glsl_uint_type(), "Vertex");
+ nir_deref_instr *vertex_deref = nir_build_deref_var(&b, vertex_var);
+ nir_store_deref(&b, vertex_deref, nir_imm_int(&b, nir->info.mesh.max_vertices_out), 1);
+
+ /* used vertices bitvector */
+ const struct glsl_type *used_vertex_type =
+ glsl_array_type(glsl_bool_type(),
+ nir->info.mesh.max_vertices_out,
+ 0);
+ nir_variable *used_vertex_var =
+ nir_local_variable_create(impl, used_vertex_type, "VertexUsed");
+ nir_deref_instr *used_vertex_deref =
+ nir_build_deref_var(&b, used_vertex_var);
+ /* Initialize it as "not used" */
+ for (unsigned i = 0; i < nir->info.mesh.max_vertices_out; ++i) {
+ nir_deref_instr *indexed_used_vertex_deref =
+ nir_build_deref_array(&b, used_vertex_deref, nir_imm_int(&b, i));
+ nir_store_deref(&b, indexed_used_vertex_deref, nir_imm_false(&b), 1);
+ }
+
+ nir_loop *loop = nir_push_loop(&b);
+ {
+ nir_def *primitive = nir_load_deref(&b, primitive_deref);
+ nir_def *cmp = nir_ige(&b, primitive, primitive_count);
+
+ nir_if *loop_check = nir_push_if(&b, cmp);
+ nir_jump(&b, nir_jump_break);
+ nir_pop_if(&b, loop_check);
+
+ nir_deref_instr *primitive_indices_deref =
+ nir_build_deref_var(&b, primitive_indices_var);
+ nir_deref_instr *indexed_primitive_indices_deref;
+ nir_def *src_vertex;
+ nir_def *prim_indices;
+
+      /* array of vectors; we have to extract the index out of the array deref */
+ indexed_primitive_indices_deref = nir_build_deref_array(&b, primitive_indices_deref, primitive);
+ prim_indices = nir_load_deref(&b, indexed_primitive_indices_deref);
+ src_vertex = nir_channel(&b, prim_indices, provoking_vertex);
+
+ nir_def *dst_vertex = nir_load_deref(&b, vertex_deref);
+
+ nir_deref_instr *indexed_used_vertex_deref =
+ nir_build_deref_array(&b, used_vertex_deref, src_vertex);
+ nir_def *used_vertex = nir_load_deref(&b, indexed_used_vertex_deref);
+ if (!dup_vertices)
+ used_vertex = nir_imm_false(&b);
+
+ nir_if *vertex_used_check = nir_push_if(&b, used_vertex);
+ {
+ for (unsigned a = 0; a < num_per_vertex_variables; ++a) {
+ nir_deref_instr *attr_arr = per_vertex_derefs[a];
+ nir_deref_instr *src = nir_build_deref_array(&b, attr_arr, src_vertex);
+ nir_deref_instr *dst = nir_build_deref_array(&b, attr_arr, dst_vertex);
+
+ nir_copy_deref(&b, dst, src);
+ }
+
+ /* replace one component of primitive indices vector */
+ nir_def *new_val =
+ nir_vector_insert_imm(&b, prim_indices, dst_vertex, provoking_vertex);
+
+ /* and store complete vector */
+ nir_store_deref(&b, indexed_primitive_indices_deref, new_val,
+ BITFIELD_MASK(vertices_per_primitive));
+
+ nir_store_deref(&b, vertex_deref, nir_iadd_imm(&b, dst_vertex, 1), 1);
+
+ for (unsigned i = 0; i < ARRAY_SIZE(mapping); ++i) {
+ if (!mapping[i].per_vert_deref)
+ continue;
+
+ nir_deref_instr *src =
+ nir_build_deref_array(&b, mapping[i].per_prim_deref, primitive);
+ nir_deref_instr *dst =
+ nir_build_deref_array(&b, mapping[i].per_vert_deref, dst_vertex);
+
+ nir_copy_deref(&b, dst, src);
+ }
+ }
+ nir_push_else(&b, vertex_used_check);
+ {
+ nir_store_deref(&b, indexed_used_vertex_deref, trueconst, 1);
+
+ for (unsigned i = 0; i < ARRAY_SIZE(mapping); ++i) {
+ if (!mapping[i].per_vert_deref)
+ continue;
+
+ nir_deref_instr *src =
+ nir_build_deref_array(&b, mapping[i].per_prim_deref, primitive);
+ nir_deref_instr *dst =
+ nir_build_deref_array(&b, mapping[i].per_vert_deref, src_vertex);
+
+ nir_copy_deref(&b, dst, src);
+ }
+
+ }
+ nir_pop_if(&b, vertex_used_check);
+
+ nir_store_deref(&b, primitive_deref, nir_iadd_imm(&b, primitive, 1), 1);
+ }
+ nir_pop_loop(&b, loop);
+ }
+ nir_pop_if(&b, if_stmt); /* local_invocation_index == 0 */
+
+ if (dup_vertices)
+ nir->info.mesh.max_vertices_out += nir->info.mesh.max_primitives_out;
+
+ if (should_print_nir(nir)) {
+ printf("%s\n", __func__);
+ nir_print_shader(nir, stdout);
+ }
+
+ /* deal with copy_derefs */
+ NIR_PASS(_, nir, nir_split_var_copies);
+ NIR_PASS(_, nir, nir_lower_var_copies);
+
+ nir_shader_gather_info(nir, impl);
+
+ return true;
+}
+
+static bool
+anv_frag_update_derefs_instr(struct nir_builder *b, nir_instr *instr, void *data)
+{
+ if (instr->type != nir_instr_type_deref)
+ return false;
+
+ nir_deref_instr *deref = nir_instr_as_deref(instr);
+ if (deref->deref_type != nir_deref_type_var)
+ return false;
+
+ nir_variable *var = deref->var;
+ if (!(var->data.mode & nir_var_shader_in))
+ return false;
+
+ int location = var->data.location;
+ nir_deref_instr **new_derefs = (nir_deref_instr **)data;
+ if (new_derefs[location] == NULL)
+ return false;
+
+ nir_instr_remove(&deref->instr);
+ nir_def_rewrite_uses(&deref->def, &new_derefs[location]->def);
+
+ return true;
+}
+
+static bool
+anv_frag_update_derefs(nir_shader *shader, nir_deref_instr **mapping)
+{
+ return nir_shader_instructions_pass(shader, anv_frag_update_derefs_instr,
+ nir_metadata_none, (void *)mapping);
+}
+
+/* Update fragment shader inputs with new ones. */
+static void
+anv_frag_convert_attrs_prim_to_vert(struct nir_shader *nir,
+ gl_varying_slot *wa_mapping)
+{
+ /* indexed by slot of per-prim attribute */
+ nir_deref_instr *new_derefs[VARYING_SLOT_MAX] = {NULL, };
+
+ nir_function_impl *impl = nir_shader_get_entrypoint(nir);
+ nir_builder b = nir_builder_at(nir_before_impl(impl));
+
+ nir_foreach_shader_in_variable_safe(var, nir) {
+ gl_varying_slot location = var->data.location;
+ gl_varying_slot new_location = wa_mapping[location];
+ if (new_location == 0)
+ continue;
+
+ assert(wa_mapping[new_location] == 0);
+
+ nir_variable *new_var =
+ nir_variable_create(b.shader, nir_var_shader_in, var->type, var->name);
+ new_var->data.location = new_location;
+ new_var->data.location_frac = var->data.location_frac;
+ new_var->data.interpolation = INTERP_MODE_FLAT;
+
+ new_derefs[location] = nir_build_deref_var(&b, new_var);
+ }
+
+ NIR_PASS(_, nir, anv_frag_update_derefs, new_derefs);
+
+ nir_shader_gather_info(nir, impl);
+}
+
+void
+anv_apply_per_prim_attr_wa(struct nir_shader *ms_nir,
+ struct nir_shader *fs_nir,
+ struct anv_device *device,
+ const VkGraphicsPipelineCreateInfo *info)
+{
+ const struct intel_device_info *devinfo = device->info;
+
+ int mesh_conv_prim_attrs_to_vert_attrs =
+ device->physical->instance->mesh_conv_prim_attrs_to_vert_attrs;
+ if (mesh_conv_prim_attrs_to_vert_attrs < 0 &&
+ !intel_needs_workaround(devinfo, 18019110168))
+ mesh_conv_prim_attrs_to_vert_attrs = 0;
+
+ if (mesh_conv_prim_attrs_to_vert_attrs != 0) {
+ uint64_t fs_inputs = 0;
+ nir_foreach_shader_in_variable(var, fs_nir)
+ fs_inputs |= BITFIELD64_BIT(var->data.location);
+
+ void *stage_ctx = ralloc_context(NULL);
+
+ gl_varying_slot wa_mapping[VARYING_SLOT_MAX] = { 0, };
+
+ const bool dup_vertices = abs(mesh_conv_prim_attrs_to_vert_attrs) >= 2;
+ const bool force_conversion = mesh_conv_prim_attrs_to_vert_attrs > 0;
+
+ if (anv_mesh_convert_attrs_prim_to_vert(ms_nir, wa_mapping,
+ fs_inputs, info, stage_ctx,
+ dup_vertices, force_conversion))
+ anv_frag_convert_attrs_prim_to_vert(fs_nir, wa_mapping);
+
+ ralloc_free(stage_ctx);
+ }
+}
diff --git a/src/intel/vulkan/anv_nir.h b/src/intel/vulkan/anv_nir.h
index 0ffed5dfc0f..435b9065979 100644
--- a/src/intel/vulkan/anv_nir.h
+++ b/src/intel/vulkan/anv_nir.h
@@ -31,63 +31,94 @@
extern "C" {
#endif
-bool anv_check_for_primitive_replication(nir_shader **shaders,
- struct anv_graphics_pipeline *pipeline);
+/* This map represents a mapping where the key is the NIR
+ * nir_intrinsic_resource_intel::block index. It allows mapping bindless UBO
+ * accesses to their descriptor entry.
+ *
+ * This map only lives temporarily between the anv_nir_apply_pipeline_layout()
+ * and anv_nir_compute_push_layout() passes.
+ */
+struct anv_pipeline_push_map {
+ uint32_t block_count;
+ struct anv_pipeline_binding *block_to_descriptor;
+};
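+
+/* A minimal usage sketch (assumed consumer code, not declared here): a later
+ * pass resolves the block index recorded on a resource intrinsic back to its
+ * binding:
+ *
+ *    assert(block < push_map->block_count);
+ *    const struct anv_pipeline_binding *binding =
+ *       &push_map->block_to_descriptor[block];
+ */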
+
+bool anv_check_for_primitive_replication(struct anv_device *device,
+ VkShaderStageFlags stages,
+ nir_shader **shaders,
+ uint32_t view_mask);
-bool anv_nir_lower_multiview(nir_shader *shader,
- struct anv_graphics_pipeline *pipeline);
+bool anv_nir_lower_load_patch_vertices_in(nir_shader *shader);
+
+bool anv_nir_lower_multiview(nir_shader *shader, uint32_t view_mask,
+ bool use_primitive_replication);
bool anv_nir_lower_ycbcr_textures(nir_shader *shader,
- const struct anv_pipeline_layout *layout);
+ const struct anv_pipeline_sets_layout *layout);
static inline nir_address_format
anv_nir_ssbo_addr_format(const struct anv_physical_device *pdevice,
- bool robust_buffer_access)
+ enum brw_robustness_flags robust_flags)
{
- if (pdevice->has_a64_buffer_access) {
- if (robust_buffer_access)
- return nir_address_format_64bit_bounded_global;
- else
- return nir_address_format_64bit_global_32bit_offset;
- } else {
- return nir_address_format_32bit_index_offset;
- }
+ if (robust_flags & BRW_ROBUSTNESS_SSBO)
+ return nir_address_format_64bit_bounded_global;
+ else
+ return nir_address_format_64bit_global_32bit_offset;
}
static inline nir_address_format
anv_nir_ubo_addr_format(const struct anv_physical_device *pdevice,
- bool robust_buffer_access)
+ enum brw_robustness_flags robust_flags)
{
- if (pdevice->has_a64_buffer_access) {
- if (robust_buffer_access)
- return nir_address_format_64bit_bounded_global;
- else
- return nir_address_format_64bit_global_32bit_offset;
- } else {
- return nir_address_format_32bit_index_offset;
- }
+ if (robust_flags & BRW_ROBUSTNESS_UBO)
+ return nir_address_format_64bit_bounded_global;
+ else
+ return nir_address_format_64bit_global_32bit_offset;
}
bool anv_nir_lower_ubo_loads(nir_shader *shader);
-void anv_nir_apply_pipeline_layout(const struct anv_physical_device *pdevice,
- bool robust_buffer_access,
- const struct anv_pipeline_layout *layout,
- nir_shader *shader,
- struct anv_pipeline_bind_map *map);
+void anv_nir_apply_pipeline_layout(nir_shader *shader,
+ const struct anv_physical_device *pdevice,
+ enum brw_robustness_flags robust_flags,
+ bool independent_sets,
+ const struct anv_pipeline_sets_layout *layout,
+ struct anv_pipeline_bind_map *map,
+ struct anv_pipeline_push_map *push_map,
+ void *push_map_mem_ctx);
-void anv_nir_compute_push_layout(const struct anv_physical_device *pdevice,
- bool robust_buffer_access,
- nir_shader *nir,
+void anv_nir_compute_push_layout(nir_shader *nir,
+ const struct anv_physical_device *pdevice,
+ enum brw_robustness_flags robust_flags,
+ bool fragment_dynamic,
struct brw_stage_prog_data *prog_data,
struct anv_pipeline_bind_map *map,
+ const struct anv_pipeline_push_map *push_map,
+ enum anv_descriptor_set_layout_type desc_type,
void *mem_ctx);
void anv_nir_validate_push_layout(struct brw_stage_prog_data *prog_data,
struct anv_pipeline_bind_map *map);
+bool anv_nir_update_resource_intel_block(nir_shader *shader);
+
+bool anv_nir_lower_resource_intel(nir_shader *shader,
+ const struct anv_physical_device *device,
+ enum anv_descriptor_set_layout_type desc_type);
+
bool anv_nir_add_base_work_group_id(nir_shader *shader);
+uint32_t anv_nir_compute_used_push_descriptors(nir_shader *shader,
+ const struct anv_pipeline_sets_layout *layout);
+
+bool anv_nir_loads_push_desc_buffer(nir_shader *nir,
+ const struct anv_pipeline_sets_layout *layout,
+ const struct anv_pipeline_bind_map *bind_map);
+
+uint32_t anv_nir_push_desc_ubo_fully_promoted(nir_shader *nir,
+ const struct anv_pipeline_sets_layout *layout,
+ const struct anv_pipeline_bind_map *bind_map);
+
#ifdef __cplusplus
}
#endif
diff --git a/src/intel/vulkan/anv_nir_add_base_work_group_id.c b/src/intel/vulkan/anv_nir_add_base_work_group_id.c
deleted file mode 100644
index 97596214de9..00000000000
--- a/src/intel/vulkan/anv_nir_add_base_work_group_id.c
+++ /dev/null
@@ -1,72 +0,0 @@
-/*
- * Copyright © 2017 Intel Corporation
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- */
-
-#include "anv_nir.h"
-#include "nir/nir_builder.h"
-#include "compiler/brw_compiler.h"
-
-bool
-anv_nir_add_base_work_group_id(nir_shader *shader)
-{
- assert(shader->info.stage == MESA_SHADER_COMPUTE);
-
- nir_builder b;
- bool progress = false;
- nir_foreach_function(function, shader) {
- if (!function->impl)
- continue;
-
- nir_builder_init(&b, function->impl);
-
- nir_foreach_block(block, function->impl) {
- nir_foreach_instr_safe(instr, block) {
- if (instr->type != nir_instr_type_intrinsic)
- continue;
-
- nir_intrinsic_instr *load_id = nir_instr_as_intrinsic(instr);
- if (load_id->intrinsic != nir_intrinsic_load_workgroup_id)
- continue;
-
- b.cursor = nir_after_instr(&load_id->instr);
-
- nir_ssa_def *load_base =
- nir_load_push_constant(&b, 3, 32, nir_imm_int(&b, 0),
- .base = offsetof(struct anv_push_constants, cs.base_work_group_id),
- .range = 3 * sizeof(uint32_t));
-
- nir_ssa_def *id = nir_iadd(&b, &load_id->dest.ssa,
- load_base);
-
- nir_ssa_def_rewrite_uses_after(&load_id->dest.ssa,
- id,
- id->parent_instr);
- progress = true;
- }
- }
-
- nir_metadata_preserve(function->impl, nir_metadata_block_index |
- nir_metadata_dominance);
- }
-
- return progress;
-}
diff --git a/src/intel/vulkan/anv_nir_apply_pipeline_layout.c b/src/intel/vulkan/anv_nir_apply_pipeline_layout.c
index 0f508490110..19183a85949 100644
--- a/src/intel/vulkan/anv_nir_apply_pipeline_layout.c
+++ b/src/intel/vulkan/anv_nir_apply_pipeline_layout.c
@@ -22,23 +22,33 @@
*/
#include "anv_nir.h"
-#include "program/prog_parameter.h"
#include "nir/nir_builder.h"
#include "compiler/brw_nir.h"
#include "util/mesa-sha1.h"
#include "util/set.h"
+#include "vk_enum_to_str.h"
+
+#include "genxml/genX_bits.h"
+
/* Sampler tables don't actually have a maximum size but we pick one just so
* that we don't end up emitting too much state on-the-fly.
*/
#define MAX_SAMPLER_TABLE_SIZE 128
#define BINDLESS_OFFSET 255
+#define sizeof_field(type, field) sizeof(((type *)0)->field)
+
+enum binding_property {
+ BINDING_PROPERTY_NORMAL = BITFIELD_BIT(0),
+ BINDING_PROPERTY_PUSHABLE = BITFIELD_BIT(1),
+ BINDING_PROPERTY_EMBEDDED_SAMPLER = BITFIELD_BIT(2),
+};
+
struct apply_pipeline_layout_state {
const struct anv_physical_device *pdevice;
- const struct anv_pipeline_layout *layout;
- bool add_bounds_checks;
+ const struct anv_pipeline_sets_layout *layout;
nir_address_format desc_addr_format;
nir_address_format ssbo_addr_format;
nir_address_format ubo_addr_format;
@@ -48,17 +58,50 @@ struct apply_pipeline_layout_state {
bool uses_constants;
bool has_dynamic_buffers;
+ bool has_independent_sets;
uint8_t constants_offset;
struct {
bool desc_buffer_used;
uint8_t desc_offset;
- uint8_t *use_count;
- uint8_t *surface_offsets;
- uint8_t *sampler_offsets;
+ struct {
+ uint8_t use_count;
+
+ /* Binding table offset */
+ uint8_t surface_offset;
+
+ /* Sampler table offset */
+ uint8_t sampler_offset;
+
+ /* Embedded sampler index */
+ uint16_t embedded_sampler_index;
+
+ /* Properties of the binding */
+ enum binding_property properties;
+
+      /* Each binding is identified with a unique identifier for push
+       * computation.
+       */
+ uint32_t push_block;
+ } *binding;
} set[MAX_SETS];
};
+/* For a given binding, tells us how many binding table entries are needed per
+ * element.
+ */
+static uint32_t
+bti_multiplier(const struct apply_pipeline_layout_state *state,
+ uint32_t set, uint32_t binding)
+{
+ const struct anv_descriptor_set_layout *set_layout =
+ state->layout->set[set].layout;
+ const struct anv_descriptor_set_binding_layout *bind_layout =
+ &set_layout->binding[binding];
+
+ return bind_layout->max_plane_count == 0 ? 1 : bind_layout->max_plane_count;
+}
+
static nir_address_format
addr_format_for_desc_type(VkDescriptorType desc_type,
struct apply_pipeline_layout_state *state)
@@ -72,7 +115,7 @@ addr_format_for_desc_type(VkDescriptorType desc_type,
case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC:
return state->ubo_addr_format;
- case VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT:
+ case VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK:
return state->desc_addr_format;
default:
@@ -84,18 +127,68 @@ static void
add_binding(struct apply_pipeline_layout_state *state,
uint32_t set, uint32_t binding)
{
+ const struct anv_descriptor_set_layout *set_layout =
+ state->layout->set[set].layout;
const struct anv_descriptor_set_binding_layout *bind_layout =
- &state->layout->set[set].layout->binding[binding];
+ &set_layout->binding[binding];
- if (state->set[set].use_count[binding] < UINT8_MAX)
- state->set[set].use_count[binding]++;
+ assert(set < state->layout->num_sets);
+ assert(binding < state->layout->set[set].layout->binding_count);
+
+ if (state->set[set].binding[binding].use_count < UINT8_MAX)
+ state->set[set].binding[binding].use_count++;
/* Only flag the descriptor buffer as used if there's actually data for
* this binding. This lets us be lazy and call this function constantly
* without worrying about unnecessarily enabling the buffer.
*/
- if (anv_descriptor_size(bind_layout))
+ if (bind_layout->descriptor_surface_stride)
state->set[set].desc_buffer_used = true;
+
+ if (bind_layout->dynamic_offset_index >= 0)
+ state->has_dynamic_buffers = true;
+
+ state->set[set].binding[binding].properties |= BINDING_PROPERTY_NORMAL;
+
+ if (set_layout->flags & VK_DESCRIPTOR_SET_LAYOUT_CREATE_EMBEDDED_IMMUTABLE_SAMPLERS_BIT_EXT)
+ state->set[set].binding[binding].properties |= BINDING_PROPERTY_EMBEDDED_SAMPLER;
+}
+
+const VkDescriptorSetLayoutCreateFlags non_pushable_set_flags =
+ VK_DESCRIPTOR_SET_LAYOUT_CREATE_DESCRIPTOR_BUFFER_BIT_EXT |
+ VK_DESCRIPTOR_SET_LAYOUT_CREATE_EMBEDDED_IMMUTABLE_SAMPLERS_BIT_EXT;
+
+const VkDescriptorBindingFlags non_pushable_binding_flags =
+ VK_DESCRIPTOR_BINDING_UPDATE_AFTER_BIND_BIT |
+ VK_DESCRIPTOR_BINDING_UPDATE_UNUSED_WHILE_PENDING_BIT |
+ VK_DESCRIPTOR_BINDING_PARTIALLY_BOUND_BIT;
+
+static void
+add_binding_type(struct apply_pipeline_layout_state *state,
+ uint32_t set, uint32_t binding, VkDescriptorType type)
+{
+ add_binding(state, set, binding);
+
+ const struct anv_descriptor_set_layout *set_layout =
+ state->layout->set[set].layout;
+ const struct anv_descriptor_set_binding_layout *bind_layout =
+ &set_layout->binding[binding];
+
+   /* We can't push from descriptor buffers, but we can for push descriptors */
+ const bool is_set_pushable =
+ (set_layout->flags & non_pushable_set_flags) == 0 ||
+ set_layout->flags & VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR;
+ const bool is_binding_pushable =
+ (bind_layout->flags & non_pushable_binding_flags) == 0;
+
+ if (is_set_pushable && is_binding_pushable &&
+ (state->layout->set[set].layout->binding[binding].type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER ||
+ state->layout->set[set].layout->binding[binding].type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC ||
+ state->layout->set[set].layout->binding[binding].type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK ||
+ state->layout->set[set].layout->binding[binding].type == VK_DESCRIPTOR_TYPE_MUTABLE_EXT) &&
+ (type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER ||
+ type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK))
+ state->set[set].binding[binding].properties |= BINDING_PROPERTY_PUSHABLE;
}
static void
@@ -127,28 +220,22 @@ get_used_bindings(UNUSED nir_builder *_b, nir_instr *instr, void *_state)
nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
switch (intrin->intrinsic) {
case nir_intrinsic_vulkan_resource_index:
- add_binding(state, nir_intrinsic_desc_set(intrin),
- nir_intrinsic_binding(intrin));
+ add_binding_type(state,
+ nir_intrinsic_desc_set(intrin),
+ nir_intrinsic_binding(intrin),
+ nir_intrinsic_desc_type(intrin));
break;
case nir_intrinsic_image_deref_load:
case nir_intrinsic_image_deref_store:
- case nir_intrinsic_image_deref_atomic_add:
- case nir_intrinsic_image_deref_atomic_imin:
- case nir_intrinsic_image_deref_atomic_umin:
- case nir_intrinsic_image_deref_atomic_imax:
- case nir_intrinsic_image_deref_atomic_umax:
- case nir_intrinsic_image_deref_atomic_and:
- case nir_intrinsic_image_deref_atomic_or:
- case nir_intrinsic_image_deref_atomic_xor:
- case nir_intrinsic_image_deref_atomic_exchange:
- case nir_intrinsic_image_deref_atomic_comp_swap:
- case nir_intrinsic_image_deref_atomic_fadd:
+ case nir_intrinsic_image_deref_atomic:
+ case nir_intrinsic_image_deref_atomic_swap:
case nir_intrinsic_image_deref_size:
case nir_intrinsic_image_deref_samples:
case nir_intrinsic_image_deref_load_param_intel:
case nir_intrinsic_image_deref_load_raw_intel:
case nir_intrinsic_image_deref_store_raw_intel:
+ case nir_intrinsic_image_deref_sparse_load:
add_deref_src_binding(state, intrin->src[0]);
break;
@@ -200,11 +287,14 @@ descriptor_has_bti(nir_intrinsic_instr *intrin,
const struct anv_descriptor_set_binding_layout *bind_layout =
&state->layout->set[set].layout->binding[binding];
+ if (state->set[set].binding[binding].properties & BINDING_PROPERTY_EMBEDDED_SAMPLER)
+ return false;
+
uint32_t surface_index;
if (bind_layout->data & ANV_DESCRIPTOR_INLINE_UNIFORM)
surface_index = state->set[set].desc_offset;
else
- surface_index = state->set[set].surface_offsets[binding];
+ surface_index = state->set[set].binding[binding].surface_offset;
/* Only lower to a BTI message if we have a valid binding table index. */
return surface_index < MAX_BINDING_TABLE_SIZE;
@@ -216,12 +306,7 @@ descriptor_address_format(nir_intrinsic_instr *intrin,
{
assert(intrin->intrinsic == nir_intrinsic_vulkan_resource_index);
- uint32_t set = nir_intrinsic_desc_set(intrin);
- uint32_t binding = nir_intrinsic_binding(intrin);
- const struct anv_descriptor_set_binding_layout *bind_layout =
- &state->layout->set[set].layout->binding[binding];
-
- return addr_format_for_desc_type(bind_layout->type, state);
+ return addr_format_for_desc_type(nir_intrinsic_desc_type(intrin), state);
}
static nir_intrinsic_instr *
@@ -242,23 +327,23 @@ nir_deref_find_descriptor(nir_deref_instr *deref,
nir_intrinsic_instr *intrin = nir_src_as_intrinsic(deref->parent);
if (!intrin || intrin->intrinsic != nir_intrinsic_load_vulkan_descriptor)
- return false;
+ return NULL;
return find_descriptor_for_index_src(intrin->src[0], state);
}
-static nir_ssa_def *
+static nir_def *
build_load_descriptor_mem(nir_builder *b,
- nir_ssa_def *desc_addr, unsigned desc_offset,
+ nir_def *desc_addr, unsigned desc_offset,
unsigned num_components, unsigned bit_size,
- struct apply_pipeline_layout_state *state)
+ const struct apply_pipeline_layout_state *state)
{
switch (state->desc_addr_format) {
case nir_address_format_64bit_global_32bit_offset: {
- nir_ssa_def *base_addr =
- nir_pack_64_2x32(b, nir_channels(b, desc_addr, 0x3));
- nir_ssa_def *offset32 =
+ nir_def *base_addr =
+ nir_pack_64_2x32(b, nir_trim_vector(b, desc_addr, 2));
+ nir_def *offset32 =
nir_iadd_imm(b, nir_channel(b, desc_addr, 3), desc_offset);
return nir_load_global_constant_offset(b, num_components, bit_size,
@@ -268,8 +353,8 @@ build_load_descriptor_mem(nir_builder *b,
}
case nir_address_format_32bit_index_offset: {
- nir_ssa_def *surface_index = nir_channel(b, desc_addr, 0);
- nir_ssa_def *offset32 =
+ nir_def *surface_index = nir_channel(b, desc_addr, 0);
+ nir_def *offset32 =
nir_iadd_imm(b, nir_channel(b, desc_addr, 1), desc_offset);
return nir_load_ubo(b, num_components, bit_size,
@@ -277,7 +362,7 @@ build_load_descriptor_mem(nir_builder *b,
.align_mul = 8,
.align_offset = desc_offset % 8,
.range_base = 0,
- .range = ~0);
+ .range = num_components * bit_size / 8);
}
default:
@@ -285,6 +370,183 @@ build_load_descriptor_mem(nir_builder *b,
}
}
+/* When using direct descriptors, we do not have a structure to read in memory
+ * like anv_address_range_descriptor where all the fields perfectly match the
+ * vec4 address format we need to generate for A64 messages. Instead we need
+ * to build the vec4 by parsing the RENDER_SURFACE_STATE structure. That is
+ * easy enough for the surface address, but a lot less fun for the size, where
+ * you have to combine 3 fields scattered over multiple dwords, add one to the
+ * total and check the surface type to deal with null descriptors.
+ *
+ * Fortunately we can reuse the Auxiliary Surface Address field to stash our
+ * buffer size and just load a vec4.
+ */
+static nir_def *
+build_optimized_load_render_surface_state_address(nir_builder *b,
+ nir_def *desc_addr,
+ struct apply_pipeline_layout_state *state)
+
+{
+ const struct intel_device_info *devinfo = &state->pdevice->info;
+
+ nir_def *surface_addr =
+ build_load_descriptor_mem(b, desc_addr,
+ RENDER_SURFACE_STATE_SurfaceBaseAddress_start(devinfo) / 8,
+ 4, 32, state);
+ nir_def *addr_ldw = nir_channel(b, surface_addr, 0);
+ nir_def *addr_udw = nir_channel(b, surface_addr, 1);
+ nir_def *length = nir_channel(b, surface_addr, 3);
+
+ return nir_vec4(b, addr_ldw, addr_udw, length, nir_imm_int(b, 0));
+}
+
+/* When using direct descriptors, we do not have a structure to read in memory
+ * like anv_address_range_descriptor where all the fields perfectly match the
+ * vec4 address format we need to generate for A64 messages. Instead we need
+ * to build the vec4 by parsing the RENDER_SURFACE_STATE structure. That is
+ * easy enough for the surface address, but a lot less fun for the size.
+ */
+static nir_def *
+build_non_optimized_load_render_surface_state_address(nir_builder *b,
+ nir_def *desc_addr,
+ struct apply_pipeline_layout_state *state)
+
+{
+ const struct intel_device_info *devinfo = &state->pdevice->info;
+
+ assert(((RENDER_SURFACE_STATE_SurfaceBaseAddress_start(devinfo) +
+ RENDER_SURFACE_STATE_SurfaceBaseAddress_bits(devinfo) - 1) -
+ RENDER_SURFACE_STATE_Width_start(devinfo)) / 8 <= 32);
+
+ nir_def *surface_addr =
+ build_load_descriptor_mem(b, desc_addr,
+ RENDER_SURFACE_STATE_SurfaceBaseAddress_start(devinfo) / 8,
+ DIV_ROUND_UP(RENDER_SURFACE_STATE_SurfaceBaseAddress_bits(devinfo), 32),
+ 32, state);
+ nir_def *addr_ldw = nir_channel(b, surface_addr, 0);
+ nir_def *addr_udw = nir_channel(b, surface_addr, 1);
+
+ /* Take all the RENDER_SURFACE_STATE fields from the beginning of the
+ * structure up to the Depth field.
+ */
+ const uint32_t type_sizes_dwords =
+ DIV_ROUND_UP(RENDER_SURFACE_STATE_Depth_start(devinfo) +
+ RENDER_SURFACE_STATE_Depth_bits(devinfo), 32);
+ nir_def *type_sizes =
+ build_load_descriptor_mem(b, desc_addr, 0, type_sizes_dwords, 32, state);
+
+ const unsigned width_start = RENDER_SURFACE_STATE_Width_start(devinfo);
+ /* SKL PRMs, Volume 2d: Command Reference: Structures, RENDER_SURFACE_STATE
+ *
+ * Width: "bits [6:0] of the number of entries in the buffer - 1"
+ * Height: "bits [20:7] of the number of entries in the buffer - 1"
+ * Depth: "bits [31:21] of the number of entries in the buffer - 1"
+ */
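+   /*
+    * Worked example (illustrative values): a buffer with 1000 entries stores
+    * the value 999 split across the three fields:
+    *
+    *    Width  = 999 & 0x7f          = 103  (bits [6:0])
+    *    Height = (999 >> 7) & 0x3fff = 7    (bits [20:7])
+    *    Depth  = (999 >> 21) & 0x7ff = 0    (bits [31:21])
+    *
+    * The code below reassembles 103 | (7 << 7) | (0 << 21) = 999 and adds 1.
+    */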
+ const unsigned width_bits = 7;
+ nir_def *width =
+ nir_iand_imm(b,
+ nir_ishr_imm(b,
+ nir_channel(b, type_sizes, width_start / 32),
+ width_start % 32),
+ (1u << width_bits) - 1);
+
+ const unsigned height_start = RENDER_SURFACE_STATE_Height_start(devinfo);
+ const unsigned height_bits = RENDER_SURFACE_STATE_Height_bits(devinfo);
+ nir_def *height =
+ nir_iand_imm(b,
+ nir_ishr_imm(b,
+ nir_channel(b, type_sizes, height_start / 32),
+ height_start % 32),
+ (1u << height_bits) - 1);
+
+ const unsigned depth_start = RENDER_SURFACE_STATE_Depth_start(devinfo);
+ const unsigned depth_bits = RENDER_SURFACE_STATE_Depth_bits(devinfo);
+ nir_def *depth =
+ nir_iand_imm(b,
+ nir_ishr_imm(b,
+ nir_channel(b, type_sizes, depth_start / 32),
+ depth_start % 32),
+ (1u << depth_bits) - 1);
+
+ nir_def *length = width;
+ length = nir_ior(b, length, nir_ishl_imm(b, height, width_bits));
+ length = nir_ior(b, length, nir_ishl_imm(b, depth, width_bits + height_bits));
+ length = nir_iadd_imm(b, length, 1);
+
+   /* Check the surface type; if it's SURFTYPE_NULL, set the length of the
+    * buffer to 0.
+    */
+ const unsigned type_start = RENDER_SURFACE_STATE_SurfaceType_start(devinfo);
+ const unsigned type_dw = type_start / 32;
+ nir_def *type =
+ nir_iand_imm(b,
+ nir_ishr_imm(b,
+ nir_channel(b, type_sizes, type_dw),
+ type_start % 32),
+ (1u << RENDER_SURFACE_STATE_SurfaceType_bits(devinfo)) - 1);
+
+ length = nir_bcsel(b,
+ nir_ieq_imm(b, type, 7 /* SURFTYPE_NULL */),
+ nir_imm_int(b, 0), length);
+
+ return nir_vec4(b, addr_ldw, addr_udw, length, nir_imm_int(b, 0));
+}
+
+static inline nir_def *
+build_load_render_surface_state_address(nir_builder *b,
+ nir_def *desc_addr,
+ struct apply_pipeline_layout_state *state)
+{
+ if (state->pdevice->isl_dev.buffer_length_in_aux_addr)
+ return build_optimized_load_render_surface_state_address(b, desc_addr, state);
+ return build_non_optimized_load_render_surface_state_address(b, desc_addr, state);
+}
+
+/* Load the depth of a 3D storage image.
+ *
+ * Either by reading the indirect descriptor value, or reading the value from
+ * RENDER_SURFACE_STATE.
+ *
+ * This is necessary for VK_EXT_image_sliced_view_of_3d.
+ */
+static nir_def *
+build_load_storage_3d_image_depth(nir_builder *b,
+ nir_def *desc_addr,
+ nir_def *resinfo_depth,
+ struct apply_pipeline_layout_state *state)
+
+{
+ const struct intel_device_info *devinfo = &state->pdevice->info;
+
+ if (state->layout->type == ANV_PIPELINE_DESCRIPTOR_SET_LAYOUT_TYPE_INDIRECT) {
+ return build_load_descriptor_mem(
+ b, desc_addr,
+ offsetof(struct anv_storage_image_descriptor, image_depth),
+ 1, 32, state);
+ } else {
+ nir_def *data = build_load_descriptor_mem(
+ b, desc_addr,
+ RENDER_SURFACE_STATE_RenderTargetViewExtent_start(devinfo) / 8,
+ 1, 32, state);
+ nir_def *depth =
+ nir_ushr_imm(
+ b, data,
+ RENDER_SURFACE_STATE_RenderTargetViewExtent_start(devinfo) % 32);
+ depth = nir_iand_imm(
+ b, depth,
+ (1u << RENDER_SURFACE_STATE_RenderTargetViewExtent_bits(devinfo)) - 1);
+ depth = nir_iadd_imm(b, depth, 1);
+
+ /* Return the minimum between the RESINFO value and the
+ * RENDER_SURFACE_STATE::RenderTargetViewExtent value.
+ *
+ * Both are expressed for the current view LOD, but in the case of a
+ * SURFTYPE_NULL, RESINFO will return the right value, while the -1
+ * value in RENDER_SURFACE_STATE should be ignored.
+ */
+ return nir_umin(b, resinfo_depth, depth);
+ }
+}
/** Build a Vulkan resource index
*
* A "resource index" is the term used by our SPIR-V parser and the relevant
@@ -305,9 +567,10 @@ build_load_descriptor_mem(nir_builder *b,
* The load_vulkan_descriptor intrinsic exists to provide a transition point
* between these two forms of derefs: descriptor and memory.
*/
-static nir_ssa_def *
-build_res_index(nir_builder *b, uint32_t set, uint32_t binding,
- nir_ssa_def *array_index, nir_address_format addr_format,
+static nir_def *
+build_res_index(nir_builder *b,
+ uint32_t set, uint32_t binding,
+ nir_def *array_index,
struct apply_pipeline_layout_state *state)
{
const struct anv_descriptor_set_binding_layout *bind_layout =
@@ -315,75 +578,94 @@ build_res_index(nir_builder *b, uint32_t set, uint32_t binding,
uint32_t array_size = bind_layout->array_size;
- switch (addr_format) {
+ uint32_t set_idx;
+ switch (state->desc_addr_format) {
case nir_address_format_64bit_global_32bit_offset:
- case nir_address_format_64bit_bounded_global: {
- uint32_t set_idx;
- switch (state->desc_addr_format) {
- case nir_address_format_64bit_global_32bit_offset:
- set_idx = set;
- break;
-
- case nir_address_format_32bit_index_offset:
- assert(state->set[set].desc_offset < MAX_BINDING_TABLE_SIZE);
- set_idx = state->set[set].desc_offset;
- break;
-
- default:
- unreachable("Unsupported address format");
- }
-
- assert(bind_layout->dynamic_offset_index < MAX_DYNAMIC_BUFFERS);
- uint32_t dynamic_offset_index = 0xff; /* No dynamic offset */
- if (bind_layout->dynamic_offset_index >= 0) {
- dynamic_offset_index =
- state->layout->set[set].dynamic_offset_start +
- bind_layout->dynamic_offset_index;
- }
+      /* Descriptor set buffer accesses will go through A64 messages, so the
+       * index to get the descriptor set buffer address is located in
+       * anv_push_constants::desc_surface_offsets and is indexed by the set
+       * number.
+       */
+ set_idx = set;
+ break;
- const uint32_t packed = (set_idx << 16) | dynamic_offset_index;
+ case nir_address_format_32bit_index_offset:
+ /* Descriptor set buffer accesses will go through the binding table. The
+ * offset is the entry in the binding table.
+ */
+ assert(state->set[set].desc_offset < MAX_BINDING_TABLE_SIZE);
+ set_idx = state->set[set].desc_offset;
+ break;
- return nir_vec4(b, nir_imm_int(b, packed),
- nir_imm_int(b, bind_layout->descriptor_offset),
- nir_imm_int(b, array_size - 1),
- array_index);
+ default:
+ unreachable("Unsupported address format");
}
- case nir_address_format_32bit_index_offset: {
- assert(state->desc_addr_format == nir_address_format_32bit_index_offset);
- if (bind_layout->type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT) {
- uint32_t surface_index = state->set[set].desc_offset;
- return nir_imm_ivec2(b, surface_index,
- bind_layout->descriptor_offset);
+ assert(bind_layout->dynamic_offset_index < MAX_DYNAMIC_BUFFERS);
+ nir_def *dynamic_offset_index;
+ if (bind_layout->dynamic_offset_index >= 0) {
+ if (state->has_independent_sets) {
+ nir_def *dynamic_offset_start =
+ nir_load_desc_set_dynamic_index_intel(b, nir_imm_int(b, set));
+ dynamic_offset_index =
+ nir_iadd_imm(b, dynamic_offset_start,
+ bind_layout->dynamic_offset_index);
+ } else {
+ dynamic_offset_index =
+ nir_imm_int(b,
+ state->layout->set[set].dynamic_offset_start +
+ bind_layout->dynamic_offset_index);
+ }
} else {
- uint32_t surface_index = state->set[set].surface_offsets[binding];
- assert(array_size > 0 && array_size <= UINT16_MAX);
- assert(surface_index <= UINT16_MAX);
- uint32_t packed = ((array_size - 1) << 16) | surface_index;
- return nir_vec2(b, array_index, nir_imm_int(b, packed));
+ dynamic_offset_index = nir_imm_int(b, 0xff); /* No dynamic offset */
}
- }
- default:
- unreachable("Unsupported address format");
- }
+ const uint32_t desc_bti = state->set[set].binding[binding].surface_offset;
+   /* We don't care about the stride field for inline uniforms (see
+    * build_desc_addr_for_res_index), but for anything else the stride should
+    * be aligned to 8 bytes because we store it divided by 8 in the packed
+    * info, which lets us encode a stride of up to 2040 (8 * 255).
+    */
+ assert(bind_layout->type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK ||
+ bind_layout->descriptor_surface_stride % 8 == 0);
+ const uint32_t desc_stride =
+ bind_layout->type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK ? 0 :
+ bind_layout->descriptor_surface_stride / 8;
+
+ nir_def *packed =
+ nir_ior_imm(b,
+ dynamic_offset_index,
+ (desc_stride << 24) |
+ (desc_bti << 16) |
+ (set_idx << 8));
+
+ return nir_vec4(b, packed,
+ nir_imm_int(b, bind_layout->descriptor_surface_offset),
+ nir_imm_int(b, array_size - 1),
+ array_index);
}
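+
+/* Layout of the packed dword built above and decoded by unpack_res_index()
+ * below (a summary of the code, not a separately documented format):
+ *
+ *    bits  [7:0] : dynamic offset base index (0xff means no dynamic offset)
+ *    bits [15:8] : descriptor set index (or descriptor buffer BTI entry)
+ *    bits [23:16]: binding table surface offset of the binding
+ *    bits [31:24]: descriptor surface stride / 8
+ */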
struct res_index_defs {
- nir_ssa_def *set_idx;
- nir_ssa_def *dyn_offset_base;
- nir_ssa_def *desc_offset_base;
- nir_ssa_def *array_index;
+ nir_def *bti_idx;
+ nir_def *set_idx;
+ nir_def *dyn_offset_base;
+ nir_def *desc_offset_base;
+ nir_def *array_index;
+ nir_def *desc_stride;
};
static struct res_index_defs
-unpack_res_index(nir_builder *b, nir_ssa_def *index)
+unpack_res_index(nir_builder *b, nir_def *index)
{
struct res_index_defs defs;
- nir_ssa_def *packed = nir_channel(b, index, 0);
- defs.set_idx = nir_extract_u16(b, packed, nir_imm_int(b, 1));
- defs.dyn_offset_base = nir_extract_u16(b, packed, nir_imm_int(b, 0));
+ nir_def *packed = nir_channel(b, index, 0);
+ defs.desc_stride =
+ nir_imul_imm(b, nir_extract_u8(b, packed, nir_imm_int(b, 3)), 8);
+ defs.bti_idx = nir_extract_u8(b, packed, nir_imm_int(b, 2));
+ defs.set_idx = nir_extract_u8(b, packed, nir_imm_int(b, 1));
+ defs.dyn_offset_base = nir_extract_u8(b, packed, nir_imm_int(b, 0));
defs.desc_offset_base = nir_channel(b, index, 1);
defs.array_index = nir_umin(b, nir_channel(b, index, 2),
@@ -392,6 +674,22 @@ unpack_res_index(nir_builder *b, nir_ssa_def *index)
return defs;
}
+/** Whether a surface is accessed through the bindless surface state heap */
+static bool
+is_binding_bindless(unsigned set, unsigned binding, bool sampler,
+ const struct apply_pipeline_layout_state *state)
+{
+   /* Has a binding table entry been allocated for this binding? */
+ if (sampler &&
+ state->set[set].binding[binding].sampler_offset != BINDLESS_OFFSET)
+ return false;
+ if (!sampler &&
+ state->set[set].binding[binding].surface_offset != BINDLESS_OFFSET)
+ return false;
+
+ return true;
+}
+
/** Adjust a Vulkan resource index
*
* This is the equivalent of nir_deref_type_ptr_as_array for resource indices.
@@ -400,25 +698,13 @@ unpack_res_index(nir_builder *b, nir_ssa_def *index)
* vulkan_resource_index intrinsic and we have to do it based on nothing but
* the address format.
*/
-static nir_ssa_def *
-build_res_reindex(nir_builder *b, nir_ssa_def *orig, nir_ssa_def *delta,
- nir_address_format addr_format)
+static nir_def *
+build_res_reindex(nir_builder *b, nir_def *orig, nir_def *delta)
{
- switch (addr_format) {
- case nir_address_format_64bit_global_32bit_offset:
- case nir_address_format_64bit_bounded_global:
- return nir_vec4(b, nir_channel(b, orig, 0),
- nir_channel(b, orig, 1),
- nir_channel(b, orig, 2),
- nir_iadd(b, nir_channel(b, orig, 3), delta));
-
- case nir_address_format_32bit_index_offset:
- return nir_vec2(b, nir_iadd(b, nir_channel(b, orig, 0), delta),
- nir_channel(b, orig, 1));
-
- default:
- unreachable("Unhandled address format");
- }
+ return nir_vec4(b, nir_channel(b, orig, 0),
+ nir_channel(b, orig, 1),
+ nir_channel(b, orig, 2),
+ nir_iadd(b, nir_channel(b, orig, 3), delta));
}
/** Get the address for a descriptor given its resource index
@@ -431,38 +717,31 @@ build_res_reindex(nir_builder *b, nir_ssa_def *orig, nir_ssa_def *delta,
* determine the descriptor stride for array descriptors. The bind_layout is
* optional for buffer descriptor types.
*/
-static nir_ssa_def *
-build_desc_addr(nir_builder *b,
- const struct anv_descriptor_set_binding_layout *bind_layout,
- const VkDescriptorType desc_type,
- nir_ssa_def *index, nir_address_format addr_format,
- struct apply_pipeline_layout_state *state)
+static nir_def *
+build_desc_addr_for_res_index(nir_builder *b,
+ const VkDescriptorType desc_type,
+ nir_def *index, nir_address_format addr_format,
+ struct apply_pipeline_layout_state *state)
{
+ struct res_index_defs res = unpack_res_index(b, index);
+
+ nir_def *desc_offset = res.desc_offset_base;
+ if (desc_type != VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK) {
+ /* Compute the actual descriptor offset. For inline uniform blocks,
+ * the array index is ignored as they are only allowed to be a single
+ * descriptor (not an array) and there is no concept of a "stride".
+       */
+ desc_offset =
+ nir_iadd(b, desc_offset, nir_imul(b, res.array_index, res.desc_stride));
+ }
+
switch (addr_format) {
case nir_address_format_64bit_global_32bit_offset:
case nir_address_format_64bit_bounded_global: {
- struct res_index_defs res = unpack_res_index(b, index);
-
- nir_ssa_def *desc_offset = res.desc_offset_base;
- if (desc_type != VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT) {
- /* Compute the actual descriptor offset. For inline uniform blocks,
- * the array index is ignored as they are only allowed to be a single
- * descriptor (not an array) and there is no concept of a "stride".
- *
- * We use the bind_layout, if available, because it provides a more
- * accurate descriptor size.
- */
- const unsigned stride = bind_layout ?
- anv_descriptor_size(bind_layout) :
- anv_descriptor_type_size(state->pdevice, desc_type);
-
- desc_offset =
- nir_iadd(b, desc_offset, nir_imul_imm(b, res.array_index, stride));
- }
-
switch (state->desc_addr_format) {
case nir_address_format_64bit_global_32bit_offset: {
- nir_ssa_def *base_addr =
+ nir_def *base_addr =
nir_load_desc_set_address_intel(b, res.set_idx);
return nir_vec4(b, nir_unpack_64_2x32_split_x(b, base_addr),
nir_unpack_64_2x32_split_y(b, base_addr),
@@ -479,15 +758,272 @@ build_desc_addr(nir_builder *b,
}
case nir_address_format_32bit_index_offset:
- assert(desc_type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT);
+ assert(desc_type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK);
assert(state->desc_addr_format == nir_address_format_32bit_index_offset);
- return index;
+ return nir_vec2(b, res.set_idx, desc_offset);
+
+ default:
+ unreachable("Unhandled address format");
+ }
+}
+
+static nir_def *
+build_desc_addr_for_binding(nir_builder *b,
+ unsigned set, unsigned binding,
+ nir_def *array_index,
+ const struct apply_pipeline_layout_state *state)
+{
+ const struct anv_descriptor_set_binding_layout *bind_layout =
+ &state->layout->set[set].layout->binding[binding];
+
+ switch (state->desc_addr_format) {
+ case nir_address_format_64bit_global_32bit_offset:
+ case nir_address_format_64bit_bounded_global: {
+ nir_def *set_addr = nir_load_desc_set_address_intel(b, nir_imm_int(b, set));
+ nir_def *desc_offset =
+ nir_iadd_imm(b,
+ nir_imul_imm(b,
+ array_index,
+ bind_layout->descriptor_surface_stride),
+ bind_layout->descriptor_surface_offset);
+
+ return nir_vec4(b, nir_unpack_64_2x32_split_x(b, set_addr),
+ nir_unpack_64_2x32_split_y(b, set_addr),
+ nir_imm_int(b, UINT32_MAX),
+ desc_offset);
+ }
+
+ case nir_address_format_32bit_index_offset:
+ return nir_vec2(b,
+ nir_imm_int(b, state->set[set].desc_offset),
+ nir_iadd_imm(b,
+ nir_imul_imm(b,
+ array_index,
+ bind_layout->descriptor_surface_stride),
+ bind_layout->descriptor_surface_offset));
default:
unreachable("Unhandled address format");
}
}
+static unsigned
+binding_descriptor_offset(const struct apply_pipeline_layout_state *state,
+ const struct anv_descriptor_set_binding_layout *bind_layout,
+ bool sampler)
+{
+ if (sampler &&
+ state->layout->type == ANV_PIPELINE_DESCRIPTOR_SET_LAYOUT_TYPE_DIRECT)
+ return bind_layout->descriptor_sampler_offset;
+
+ return bind_layout->descriptor_surface_offset;
+}
+
+static unsigned
+binding_descriptor_stride(const struct apply_pipeline_layout_state *state,
+ const struct anv_descriptor_set_binding_layout *bind_layout,
+ bool sampler)
+{
+ if (sampler &&
+ state->layout->type == ANV_PIPELINE_DESCRIPTOR_SET_LAYOUT_TYPE_DIRECT)
+ return bind_layout->descriptor_sampler_stride;
+
+ return bind_layout->descriptor_surface_stride;
+}
+
+static nir_def *
+build_surface_index_for_binding(nir_builder *b,
+ unsigned set, unsigned binding,
+ nir_def *array_index,
+ unsigned plane,
+ bool non_uniform,
+ const struct apply_pipeline_layout_state *state)
+{
+ const struct anv_descriptor_set_binding_layout *bind_layout =
+ &state->layout->set[set].layout->binding[binding];
+ const unsigned descriptor_offset =
+ binding_descriptor_offset(state, bind_layout, false /* sampler */);
+ const unsigned descriptor_stride =
+ binding_descriptor_stride(state, bind_layout, false /* sampler */);
+ const bool is_bindless =
+ is_binding_bindless(set, binding, false /* sampler */, state);
+
+ nir_def *set_offset, *surface_index;
+ if (is_bindless) {
+ if (state->layout->type == ANV_PIPELINE_DESCRIPTOR_SET_LAYOUT_TYPE_INDIRECT) {
+ set_offset = nir_imm_int(b, 0xdeaddead);
+
+ nir_def *desc_addr =
+ build_desc_addr_for_binding(b, set, binding, array_index, state);
+
+ surface_index =
+ build_load_descriptor_mem(b, desc_addr, 0, 1, 32, state);
+ } else {
+ set_offset =
+ nir_load_push_constant(b, 1, 32, nir_imm_int(b, 0),
+ .base = offsetof(struct anv_push_constants,
+ desc_surface_offsets[set]),
+ .range = sizeof_field(struct anv_push_constants,
+ desc_surface_offsets[set]));
+
+         /* With bindless, indexes are offsets in the descriptor buffer */
+ surface_index =
+ nir_iadd_imm(b,
+ nir_imul_imm(b, array_index, descriptor_stride),
+ descriptor_offset);
+ if (plane != 0) {
+ assert(plane < bind_layout->max_plane_count);
+ surface_index = nir_iadd_imm(b, surface_index,
+ plane * (descriptor_stride /
+ bind_layout->max_plane_count));
+ }
+
+ assert(descriptor_offset % 64 == 0);
+ assert(descriptor_stride % 64 == 0);
+ }
+ } else {
+ /* Unused */
+ set_offset = nir_imm_int(b, 0xdeaddead);
+
+ unsigned bti_stride = bti_multiplier(state, set, binding);
+ assert(bti_stride >= 1);
+
+ /* For Ycbcr descriptors, add the plane offset */
+ unsigned element_index = plane;
+
+ /* With the binding table, it's an index in the table */
+ surface_index =
+ nir_iadd_imm(b, nir_imul_imm(b, array_index, bti_stride),
+ state->set[set].binding[binding].surface_offset + element_index);
+ assert(state->set[set].binding[binding].surface_offset < MAX_BINDING_TABLE_SIZE);
+ }
+
+ return nir_resource_intel(b,
+ set_offset,
+ surface_index,
+ array_index,
+ nir_imm_int(b, 0) /* bindless_base_offset */,
+ .desc_set = set,
+ .binding = binding,
+ .resource_block_intel = state->set[set].binding[binding].push_block,
+ .resource_access_intel =
+ (is_bindless ? nir_resource_intel_bindless : 0) |
+ (non_uniform ? nir_resource_intel_non_uniform : 0) |
+ ((state->set[set].binding[binding].properties &
+ BINDING_PROPERTY_PUSHABLE) ? nir_resource_intel_pushable : 0));
+}
+
+static nir_def *
+build_sampler_handle_for_binding(nir_builder *b,
+ unsigned set, unsigned binding,
+ nir_def *array_index,
+ unsigned plane,
+ bool non_uniform,
+ const struct apply_pipeline_layout_state *state)
+{
+ const struct anv_descriptor_set_binding_layout *bind_layout =
+ &state->layout->set[set].layout->binding[binding];
+ const unsigned descriptor_offset =
+ binding_descriptor_offset(state, bind_layout, true /* sampler */);
+ const unsigned descriptor_stride =
+ binding_descriptor_stride(state, bind_layout, true /* sampler */);
+ const bool is_embedded =
+ state->set[set].binding[binding].properties & BINDING_PROPERTY_EMBEDDED_SAMPLER;
+ const bool is_bindless =
+ is_binding_bindless(set, binding, true /* sampler */, state);
+ nir_def *set_offset, *sampler_index, *sampler_base_offset = nir_imm_int(b, 0);
+
+ if (is_embedded) {
+ set_offset = nir_imm_int(b, 0xdeaddead);
+ sampler_index = nir_load_reloc_const_intel(
+ b, BRW_SHADER_RELOC_EMBEDDED_SAMPLER_HANDLE +
+ state->set[set].binding[binding].embedded_sampler_index);
+ } else if (is_bindless) {
+ if (state->layout->type == ANV_PIPELINE_DESCRIPTOR_SET_LAYOUT_TYPE_INDIRECT) {
+ set_offset = nir_imm_int(b, 0xdeaddead);
+
+ nir_def *desc_addr =
+ build_desc_addr_for_binding(b, set, binding, array_index, state);
+
+         /* This is an anv_sampled_image_descriptor; the sampler handle is
+          * always in component 1.
+          */
+ nir_def *desc_data =
+ build_load_descriptor_mem(b, desc_addr, 0, 2, 32, state);
+
+ sampler_index = nir_channel(b, desc_data, 1);
+ } else {
+ set_offset =
+ nir_load_push_constant(b, 1, 32, nir_imm_int(b, 0),
+ .base = offsetof(struct anv_push_constants,
+ desc_sampler_offsets[set]),
+ .range = sizeof_field(struct anv_push_constants,
+ desc_sampler_offsets[set]));
+
+ uint32_t base_offset = descriptor_offset;
+
+         /* The SAMPLER_STATE can only be located at a 64 byte offset in the
+          * combined image/sampler case. Combined image/sampler is not
+          * supported with mutable descriptor types.
+          */
+ if (bind_layout->data & ANV_DESCRIPTOR_SURFACE_SAMPLER)
+ base_offset += ANV_SURFACE_STATE_SIZE;
+
+ if (plane != 0) {
+ assert(plane < bind_layout->max_plane_count);
+ base_offset += plane * (descriptor_stride /
+ bind_layout->max_plane_count);
+ }
+
+ sampler_index =
+ nir_iadd_imm(b,
+ nir_imul_imm(b, array_index, descriptor_stride),
+ base_offset);
+ }
+ } else {
+ /* Unused */
+ set_offset = nir_imm_int(b, 0xdeaddead);
+
+ sampler_index =
+ nir_iadd_imm(b, array_index,
+ state->set[set].binding[binding].sampler_offset + plane);
+ }
+
+ nir_resource_data_intel sampler_resource = nir_resource_intel_sampler;
+ if (is_bindless)
+ sampler_resource |= nir_resource_intel_bindless;
+ if (is_embedded)
+ sampler_resource |= nir_resource_intel_sampler_embedded;
+ if (non_uniform)
+ sampler_resource |= nir_resource_intel_non_uniform;
+
+ return nir_resource_intel(b,
+ set_offset,
+ sampler_index,
+ array_index,
+ sampler_base_offset,
+ .desc_set = set,
+ .binding = binding,
+ .resource_access_intel = sampler_resource);
+}
+
+static nir_def *
+build_buffer_dynamic_offset_for_res_index(nir_builder *b,
+ nir_def *dyn_offset_base,
+ nir_def *array_index,
+ struct apply_pipeline_layout_state *state)
+{
+ nir_def *dyn_offset_idx = nir_iadd(b, dyn_offset_base, array_index);
+
+ nir_def *dyn_load =
+ nir_load_push_constant(b, 1, 32, nir_imul_imm(b, dyn_offset_idx, 4),
+ .base = offsetof(struct anv_push_constants, dynamic_offsets),
+ .range = sizeof_field(struct anv_push_constants, dynamic_offsets));
+
+ return nir_bcsel(b, nir_ieq_imm(b, dyn_offset_base, 0xff),
+ nir_imm_int(b, 0), dyn_load);
+}
+
/** Convert a Vulkan resource index into a buffer address
*
* In some cases, this does a memory load from the descriptor set and, in
@@ -495,62 +1031,52 @@ build_desc_addr(nir_builder *b,
*
* See build_res_index for details about each resource index format.
*/
-static nir_ssa_def *
-build_buffer_addr_for_res_index(nir_builder *b,
- const VkDescriptorType desc_type,
- nir_ssa_def *res_index,
- nir_address_format addr_format,
- struct apply_pipeline_layout_state *state)
+static nir_def *
+build_indirect_buffer_addr_for_res_index(nir_builder *b,
+ const VkDescriptorType desc_type,
+ nir_def *res_index,
+ nir_address_format addr_format,
+ struct apply_pipeline_layout_state *state)
{
- if (desc_type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT) {
+ struct res_index_defs res = unpack_res_index(b, res_index);
+
+ if (desc_type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK) {
assert(addr_format == state->desc_addr_format);
- return build_desc_addr(b, NULL, desc_type, res_index, addr_format, state);
+ return build_desc_addr_for_res_index(b, desc_type, res_index,
+ addr_format, state);
} else if (addr_format == nir_address_format_32bit_index_offset) {
- nir_ssa_def *array_index = nir_channel(b, res_index, 0);
- nir_ssa_def *packed = nir_channel(b, res_index, 1);
- nir_ssa_def *array_max = nir_extract_u16(b, packed, nir_imm_int(b, 1));
- nir_ssa_def *surface_index = nir_extract_u16(b, packed, nir_imm_int(b, 0));
-
- if (state->add_bounds_checks)
- array_index = nir_umin(b, array_index, array_max);
-
- return nir_vec2(b, nir_iadd(b, surface_index, array_index),
+ return nir_vec2(b, nir_iadd(b, res.bti_idx, res.array_index),
nir_imm_int(b, 0));
}
- nir_ssa_def *desc_addr =
- build_desc_addr(b, NULL, desc_type, res_index, addr_format, state);
+ nir_def *desc_addr =
+ build_desc_addr_for_res_index(b, desc_type, res_index,
+ addr_format, state);
- nir_ssa_def *desc = build_load_descriptor_mem(b, desc_addr, 0, 4, 32, state);
+ nir_def *desc = build_load_descriptor_mem(b, desc_addr, 0, 4, 32, state);
if (state->has_dynamic_buffers) {
- struct res_index_defs res = unpack_res_index(b, res_index);
-
/* This shader has dynamic offsets and we have no way of knowing
* (save from the dynamic offset base index) if this buffer has a
* dynamic offset.
*/
- nir_ssa_def *dyn_offset_idx =
+ nir_def *dyn_offset_idx =
nir_iadd(b, res.dyn_offset_base, res.array_index);
- if (state->add_bounds_checks) {
- dyn_offset_idx = nir_umin(b, dyn_offset_idx,
- nir_imm_int(b, MAX_DYNAMIC_BUFFERS));
- }
- nir_ssa_def *dyn_load =
+ nir_def *dyn_load =
nir_load_push_constant(b, 1, 32, nir_imul_imm(b, dyn_offset_idx, 4),
.base = offsetof(struct anv_push_constants, dynamic_offsets),
.range = MAX_DYNAMIC_BUFFERS * 4);
- nir_ssa_def *dynamic_offset =
+ nir_def *dynamic_offset =
nir_bcsel(b, nir_ieq_imm(b, res.dyn_offset_base, 0xff),
nir_imm_int(b, 0), dyn_load);
/* The dynamic offset gets added to the base pointer so that we
* have a sliding window range.
*/
- nir_ssa_def *base_ptr =
- nir_pack_64_2x32(b, nir_channels(b, desc, 0x3));
+ nir_def *base_ptr =
+ nir_pack_64_2x32(b, nir_trim_vector(b, desc, 2));
base_ptr = nir_iadd(b, base_ptr, nir_u2u64(b, dynamic_offset));
desc = nir_vec4(b, nir_unpack_64_2x32_split_x(b, base_ptr),
nir_unpack_64_2x32_split_y(b, base_ptr),
@@ -568,50 +1094,138 @@ build_buffer_addr_for_res_index(nir_builder *b,
nir_imm_int(b, 0));
}
+static nir_def *
+build_direct_buffer_addr_for_res_index(nir_builder *b,
+ const VkDescriptorType desc_type,
+ nir_def *res_index,
+ nir_address_format addr_format,
+ struct apply_pipeline_layout_state *state)
+{
+ if (desc_type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK) {
+ assert(addr_format == state->desc_addr_format);
+ return build_desc_addr_for_res_index(b, desc_type, res_index,
+ addr_format, state);
+ } else if (addr_format == nir_address_format_32bit_index_offset) {
+ struct res_index_defs res = unpack_res_index(b, res_index);
+
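+ /* The index component is the offset of the selected element's
+ * descriptor: the binding's base offset plus array_index times the
+ * descriptor stride.
+ */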
+ return nir_vec2(b, nir_iadd(b, res.desc_offset_base,
+ nir_imul(b, res.array_index, res.desc_stride)),
+ nir_imm_int(b, 0));
+ }
+
+ nir_def *desc_addr =
+ build_desc_addr_for_res_index(b, desc_type, res_index,
+ addr_format, state);
+
+ nir_def *addr =
+ build_load_render_surface_state_address(b, desc_addr, state);
+
+ if (state->has_dynamic_buffers) {
+ struct res_index_defs res = unpack_res_index(b, res_index);
+
+ /* This shader has dynamic offsets and we have no way of knowing (save
+ * for the dynamic offset base index) if this buffer has a dynamic
+ * offset.
+ */
+ nir_def *dynamic_offset =
+ build_buffer_dynamic_offset_for_res_index(
+ b, res.dyn_offset_base, res.array_index, state);
+
+ /* The dynamic offset gets added to the base pointer so that we
+ * have a sliding window range.
+ */
+ nir_def *base_ptr =
+ nir_pack_64_2x32(b, nir_trim_vector(b, addr, 2));
+ base_ptr = nir_iadd(b, base_ptr, nir_u2u64(b, dynamic_offset));
+ addr = nir_vec4(b, nir_unpack_64_2x32_split_x(b, base_ptr),
+ nir_unpack_64_2x32_split_y(b, base_ptr),
+ nir_channel(b, addr, 2),
+ nir_channel(b, addr, 3));
+ }
+
+ /* The last element of the vec4 is always zero.
+ *
+ * See also struct anv_address_range_descriptor
+ */
+ return nir_vec4(b, nir_channel(b, addr, 0),
+ nir_channel(b, addr, 1),
+ nir_channel(b, addr, 2),
+ nir_imm_int(b, 0));
+}
+
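+/* Pick between the indirect path (descriptor data loaded back from the
+ * descriptor buffer) and the direct path (address derived from the
+ * RENDER_SURFACE_STATE) based on the pipeline layout type.
+ */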
+static nir_def *
+build_buffer_addr_for_res_index(nir_builder *b,
+ const VkDescriptorType desc_type,
+ nir_def *res_index,
+ nir_address_format addr_format,
+ struct apply_pipeline_layout_state *state)
+{
+ if (state->layout->type == ANV_PIPELINE_DESCRIPTOR_SET_LAYOUT_TYPE_INDIRECT)
+ return build_indirect_buffer_addr_for_res_index(b, desc_type, res_index, addr_format, state);
+ else
+ return build_direct_buffer_addr_for_res_index(b, desc_type, res_index, addr_format, state);
+}
+
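+/* Like build_buffer_addr_for_res_index(), but in the binding table case the
+ * surface index is derived from the set/binding rather than from the packed
+ * resource index.
+ */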
+static nir_def *
+build_buffer_addr_for_binding(nir_builder *b,
+ const VkDescriptorType desc_type,
+ unsigned set,
+ unsigned binding,
+ nir_def *res_index,
+ nir_address_format addr_format,
+ struct apply_pipeline_layout_state *state)
+{
+ if (addr_format != nir_address_format_32bit_index_offset)
+ return build_buffer_addr_for_res_index(b, desc_type, res_index, addr_format, state);
+
+ if (desc_type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK) {
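+ /* Inline uniform blocks live directly in the descriptor buffer: use the
+ * set's descriptor buffer binding table entry and the binding's offset
+ * within the set.
+ */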
+ const struct anv_descriptor_set_binding_layout *bind_layout =
+ &state->layout->set[set].layout->binding[binding];
+ return nir_vec2(b,
+ nir_imm_int(b, state->set[set].desc_offset),
+ nir_imm_int(b, bind_layout->descriptor_surface_offset));
+ }
+
+ struct res_index_defs res = unpack_res_index(b, res_index);
+
+ return nir_vec2(b,
+ build_surface_index_for_binding(b, set, binding, res.array_index,
+ 0 /* plane */,
+ false /* non_uniform */,
+ state),
+ nir_imm_int(b, 0));
+}
+
/** Loads descriptor memory for a variable-based deref chain
*
* The deref chain has to terminate at a variable with a descriptor_set and
* binding set. This is used for images, textures, and samplers.
*/
-static nir_ssa_def *
-build_load_var_deref_descriptor_mem(nir_builder *b, nir_deref_instr *deref,
- unsigned desc_offset,
- unsigned num_components, unsigned bit_size,
+static nir_def *
+build_load_var_deref_surface_handle(nir_builder *b, nir_deref_instr *deref,
+ bool non_uniform,
+ bool *out_is_bindless,
struct apply_pipeline_layout_state *state)
{
nir_variable *var = nir_deref_instr_get_variable(deref);
const uint32_t set = var->data.descriptor_set;
const uint32_t binding = var->data.binding;
- const struct anv_descriptor_set_binding_layout *bind_layout =
- &state->layout->set[set].layout->binding[binding];
- nir_ssa_def *array_index;
+ *out_is_bindless =
+ is_binding_bindless(set, binding, false /* sampler */, state);
+
+ nir_def *array_index;
if (deref->deref_type != nir_deref_type_var) {
assert(deref->deref_type == nir_deref_type_array);
assert(nir_deref_instr_parent(deref)->deref_type == nir_deref_type_var);
- assert(deref->arr.index.is_ssa);
array_index = deref->arr.index.ssa;
} else {
array_index = nir_imm_int(b, 0);
}
- /* It doesn't really matter what address format we choose as everything
- * will constant-fold nicely. Choose one that uses the actual descriptor
- * buffer so we don't run into issues index/offset assumptions.
- */
- const nir_address_format addr_format =
- nir_address_format_64bit_bounded_global;
-
- nir_ssa_def *res_index =
- build_res_index(b, set, binding, array_index, addr_format, state);
-
- nir_ssa_def *desc_addr =
- build_desc_addr(b, bind_layout, bind_layout->type,
- res_index, addr_format, state);
-
- return build_load_descriptor_mem(b, desc_addr, desc_offset,
- num_components, bit_size, state);
+ return build_surface_index_for_binding(b, set, binding, array_index,
+ 0 /* plane */, non_uniform, state);
}
/** A recursive form of build_res_index()
@@ -621,7 +1235,7 @@ build_load_var_deref_descriptor_mem(nir_builder *b, nir_deref_instr *deref,
* hopes of better CSE. This means the cursor is not where you left it when
* this function returns.
*/
-static nir_ssa_def *
+static nir_def *
build_res_index_for_chain(nir_builder *b, nir_intrinsic_instr *intrin,
nir_address_format addr_format,
uint32_t *set, uint32_t *binding,
@@ -629,22 +1243,19 @@ build_res_index_for_chain(nir_builder *b, nir_intrinsic_instr *intrin,
{
if (intrin->intrinsic == nir_intrinsic_vulkan_resource_index) {
b->cursor = nir_before_instr(&intrin->instr);
- assert(intrin->src[0].is_ssa);
*set = nir_intrinsic_desc_set(intrin);
*binding = nir_intrinsic_binding(intrin);
- return build_res_index(b, *set, *binding, intrin->src[0].ssa,
- addr_format, state);
+ return build_res_index(b, *set, *binding, intrin->src[0].ssa, state);
} else {
assert(intrin->intrinsic == nir_intrinsic_vulkan_resource_reindex);
nir_intrinsic_instr *parent = nir_src_as_intrinsic(intrin->src[0]);
- nir_ssa_def *index =
+ nir_def *index =
build_res_index_for_chain(b, parent, addr_format,
set, binding, state);
b->cursor = nir_before_instr(&intrin->instr);
- assert(intrin->src[1].is_ssa);
- return build_res_reindex(b, index, intrin->src[1].ssa, addr_format);
+ return build_res_reindex(b, index, intrin->src[1].ssa);
}
}
@@ -652,22 +1263,23 @@ build_res_index_for_chain(nir_builder *b, nir_intrinsic_instr *intrin,
*
* The cursor is not where you left it when this function returns.
*/
-static nir_ssa_def *
+static nir_def *
build_buffer_addr_for_idx_intrin(nir_builder *b,
nir_intrinsic_instr *idx_intrin,
nir_address_format addr_format,
struct apply_pipeline_layout_state *state)
{
uint32_t set = UINT32_MAX, binding = UINT32_MAX;
- nir_ssa_def *res_index =
+ nir_def *res_index =
build_res_index_for_chain(b, idx_intrin, addr_format,
&set, &binding, state);
const struct anv_descriptor_set_binding_layout *bind_layout =
&state->layout->set[set].layout->binding[binding];
- return build_buffer_addr_for_res_index(b, bind_layout->type,
- res_index, addr_format, state);
+ return build_buffer_addr_for_binding(b, bind_layout->type,
+ set, binding, res_index,
+ addr_format, state);
}
/** Builds a buffer address for deref chain
@@ -677,14 +1289,14 @@ build_buffer_addr_for_idx_intrin(nir_builder *b,
*
* The cursor is not where you left it when this function returns.
*/
-static nir_ssa_def *
+static nir_def *
build_buffer_addr_for_deref(nir_builder *b, nir_deref_instr *deref,
nir_address_format addr_format,
struct apply_pipeline_layout_state *state)
{
nir_deref_instr *parent = nir_deref_instr_parent(deref);
if (parent) {
- nir_ssa_def *addr =
+ nir_def *addr =
build_buffer_addr_for_deref(b, parent, addr_format, state);
b->cursor = nir_before_instr(&deref->instr);
@@ -717,23 +1329,35 @@ try_lower_direct_buffer_intrinsic(nir_builder *b,
return false;
}
+ const unsigned set = nir_intrinsic_desc_set(desc);
+ const unsigned binding = nir_intrinsic_binding(desc);
+
+ const struct anv_descriptor_set_binding_layout *bind_layout =
+ &state->layout->set[set].layout->binding[binding];
+
nir_address_format addr_format = descriptor_address_format(desc, state);
+ /* Although we could lower non-uniform binding table accesses with
+ * nir_opt_non_uniform_access, we might as well use an A64 message and
+ * avoid the loops inserted by that lowering pass.
+ */
+ if (nir_intrinsic_access(intrin) & ACCESS_NON_UNIFORM)
+ return false;
+
if (nir_deref_mode_is(deref, nir_var_mem_ssbo)) {
/* 64-bit atomics only support A64 messages so we can't lower them to
* the index+offset model.
*/
- if (is_atomic && nir_dest_bit_size(intrin->dest) == 64 &&
+ if (is_atomic && intrin->def.bit_size == 64 &&
!state->pdevice->info.has_lsc)
return false;
- /* Normal binding table-based messages can't handle non-uniform access
- * so we have to fall back to A64.
+ /* If we don't have a BTI for this binding and we're using indirect
+ * descriptors, we'll use A64 messages. This is handled in the main
+ * lowering path.
*/
- if (nir_intrinsic_access(intrin) & ACCESS_NON_UNIFORM)
- return false;
-
- if (!descriptor_has_bti(desc, state))
+ if (state->layout->type == ANV_PIPELINE_DESCRIPTOR_SET_LAYOUT_TYPE_INDIRECT &&
+ !descriptor_has_bti(desc, state))
return false;
/* Rewrite to 32bit_index_offset whenever we can */
@@ -741,12 +1365,36 @@ try_lower_direct_buffer_intrinsic(nir_builder *b,
} else {
assert(nir_deref_mode_is(deref, nir_var_mem_ubo));
- /* Rewrite to 32bit_index_offset whenever we can */
- if (descriptor_has_bti(desc, state))
+ /* If we don't have a BTI for this binding and we're using indirect
+ * descriptors, we'll use A64 messages. This is handled in the main
+ * lowering path.
+ *
+ * We make an exception for uniform blocks which are built from the
+ * descriptor set base address + offset. There is no indirect data to
+ * fetch.
+ */
+ if (state->layout->type == ANV_PIPELINE_DESCRIPTOR_SET_LAYOUT_TYPE_INDIRECT &&
+ bind_layout->type != VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK &&
+ !descriptor_has_bti(desc, state))
+ return false;
+
+ /* If this is an inline uniform and the shader stage is bindless, we
+ * can't switch to 32bit_index_offset.
+ */
+ if (bind_layout->type != VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK ||
+ !brw_shader_stage_requires_bindless_resources(b->shader->info.stage))
addr_format = nir_address_format_32bit_index_offset;
}
- nir_ssa_def *addr =
+ /* If a dynamic buffer has not been assigned a binding table entry, we
+ * need to bail here.
+ */
+ if ((bind_layout->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC ||
+ bind_layout->type == VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC) &&
+ !descriptor_has_bti(desc, state))
+ return false;
+
+ nir_def *addr =
build_buffer_addr_for_deref(b, deref, addr_format, state);
b->cursor = nir_before_instr(&intrin->instr);
@@ -772,26 +1420,22 @@ lower_load_accel_struct_desc(nir_builder *b,
nir_address_format_64bit_bounded_global;
uint32_t set = UINT32_MAX, binding = UINT32_MAX;
- nir_ssa_def *res_index =
+ nir_def *res_index =
build_res_index_for_chain(b, idx_intrin, addr_format,
&set, &binding, state);
- const struct anv_descriptor_set_binding_layout *bind_layout =
- &state->layout->set[set].layout->binding[binding];
-
b->cursor = nir_before_instr(&load_desc->instr);
- nir_ssa_def *desc_addr =
- build_desc_addr(b, bind_layout, bind_layout->type,
- res_index, addr_format, state);
+ struct res_index_defs res = unpack_res_index(b, res_index);
+ nir_def *desc_addr =
+ build_desc_addr_for_binding(b, set, binding, res.array_index, state);
/* Acceleration structure descriptors are always uint64_t */
- nir_ssa_def *desc = build_load_descriptor_mem(b, desc_addr, 0, 1, 64, state);
+ nir_def *desc = build_load_descriptor_mem(b, desc_addr, 0, 1, 64, state);
- assert(load_desc->dest.is_ssa);
- assert(load_desc->dest.ssa.bit_size == 64);
- assert(load_desc->dest.ssa.num_components == 1);
- nir_ssa_def_rewrite_uses(&load_desc->dest.ssa, desc);
+ assert(load_desc->def.bit_size == 64);
+ assert(load_desc->def.num_components == 1);
+ nir_def_rewrite_uses(&load_desc->def, desc);
nir_instr_remove(&load_desc->instr);
return true;
@@ -811,20 +1455,8 @@ lower_direct_buffer_instr(nir_builder *b, nir_instr *instr, void *_state)
case nir_intrinsic_store_deref:
return try_lower_direct_buffer_intrinsic(b, intrin, false, state);
- case nir_intrinsic_deref_atomic_add:
- case nir_intrinsic_deref_atomic_imin:
- case nir_intrinsic_deref_atomic_umin:
- case nir_intrinsic_deref_atomic_imax:
- case nir_intrinsic_deref_atomic_umax:
- case nir_intrinsic_deref_atomic_and:
- case nir_intrinsic_deref_atomic_or:
- case nir_intrinsic_deref_atomic_xor:
- case nir_intrinsic_deref_atomic_exchange:
- case nir_intrinsic_deref_atomic_comp_swap:
- case nir_intrinsic_deref_atomic_fadd:
- case nir_intrinsic_deref_atomic_fmin:
- case nir_intrinsic_deref_atomic_fmax:
- case nir_intrinsic_deref_atomic_fcomp_swap:
+ case nir_intrinsic_deref_atomic:
+ case nir_intrinsic_deref_atomic_swap:
return try_lower_direct_buffer_intrinsic(b, intrin, true, state);
case nir_intrinsic_get_ssbo_size: {
@@ -833,23 +1465,30 @@ lower_direct_buffer_instr(nir_builder *b, nir_instr *instr, void *_state)
*/
nir_intrinsic_instr *idx_intrin =
find_descriptor_for_index_src(intrin->src[0], state);
- if (idx_intrin == NULL || !descriptor_has_bti(idx_intrin, state))
+ if (idx_intrin == NULL)
return false;
- b->cursor = nir_before_instr(&intrin->instr);
-
/* We just checked that this is a BTI descriptor */
const nir_address_format addr_format =
nir_address_format_32bit_index_offset;
- nir_ssa_def *buffer_addr =
- build_buffer_addr_for_idx_intrin(b, idx_intrin, addr_format, state);
-
b->cursor = nir_before_instr(&intrin->instr);
- nir_ssa_def *bti = nir_channel(b, buffer_addr, 0);
- nir_instr_rewrite_src(&intrin->instr, &intrin->src[0],
- nir_src_for_ssa(bti));
+ uint32_t set = UINT32_MAX, binding = UINT32_MAX;
+ nir_def *res_index =
+ build_res_index_for_chain(b, idx_intrin, addr_format,
+ &set, &binding, state);
+
+ bool non_uniform = nir_intrinsic_access(intrin) & ACCESS_NON_UNIFORM;
+
+ nir_def *surface_index =
+ build_surface_index_for_binding(b, set, binding,
+ nir_channel(b, res_index, 3),
+ 0 /* plane */,
+ non_uniform,
+ state);
+
+ nir_src_rewrite(&intrin->src[0], surface_index);
_mesa_set_add(state->lowered_instrs, intrin);
return true;
}
@@ -871,20 +1510,15 @@ lower_res_index_intrinsic(nir_builder *b, nir_intrinsic_instr *intrin,
{
b->cursor = nir_before_instr(&intrin->instr);
- nir_address_format addr_format =
- addr_format_for_desc_type(nir_intrinsic_desc_type(intrin), state);
-
- assert(intrin->src[0].is_ssa);
- nir_ssa_def *index =
+ nir_def *index =
build_res_index(b, nir_intrinsic_desc_set(intrin),
nir_intrinsic_binding(intrin),
intrin->src[0].ssa,
- addr_format, state);
+ state);
- assert(intrin->dest.is_ssa);
- assert(intrin->dest.ssa.bit_size == index->bit_size);
- assert(intrin->dest.ssa.num_components == index->num_components);
- nir_ssa_def_rewrite_uses(&intrin->dest.ssa, index);
+ assert(intrin->def.bit_size == index->bit_size);
+ assert(intrin->def.num_components == index->num_components);
+ nir_def_rewrite_uses(&intrin->def, index);
nir_instr_remove(&intrin->instr);
return true;
@@ -896,19 +1530,13 @@ lower_res_reindex_intrinsic(nir_builder *b, nir_intrinsic_instr *intrin,
{
b->cursor = nir_before_instr(&intrin->instr);
- nir_address_format addr_format =
- addr_format_for_desc_type(nir_intrinsic_desc_type(intrin), state);
-
- assert(intrin->src[0].is_ssa && intrin->src[1].is_ssa);
- nir_ssa_def *index =
+ nir_def *index =
build_res_reindex(b, intrin->src[0].ssa,
- intrin->src[1].ssa,
- addr_format);
+ intrin->src[1].ssa);
- assert(intrin->dest.is_ssa);
- assert(intrin->dest.ssa.bit_size == index->bit_size);
- assert(intrin->dest.ssa.num_components == index->num_components);
- nir_ssa_def_rewrite_uses(&intrin->dest.ssa, index);
+ assert(intrin->def.bit_size == index->bit_size);
+ assert(intrin->def.num_components == index->num_components);
+ nir_def_rewrite_uses(&intrin->def, index);
nir_instr_remove(&intrin->instr);
return true;
@@ -923,40 +1551,14 @@ lower_load_vulkan_descriptor(nir_builder *b, nir_intrinsic_instr *intrin,
const VkDescriptorType desc_type = nir_intrinsic_desc_type(intrin);
nir_address_format addr_format = addr_format_for_desc_type(desc_type, state);
- assert(intrin->dest.is_ssa);
- nir_foreach_use(src, &intrin->dest.ssa) {
- if (src->parent_instr->type != nir_instr_type_deref)
- continue;
-
- nir_deref_instr *cast = nir_instr_as_deref(src->parent_instr);
- assert(cast->deref_type == nir_deref_type_cast);
- switch (desc_type) {
- case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER:
- case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC:
- cast->cast.align_mul = ANV_UBO_ALIGNMENT;
- cast->cast.align_offset = 0;
- break;
-
- case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER:
- case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC:
- cast->cast.align_mul = ANV_SSBO_ALIGNMENT;
- cast->cast.align_offset = 0;
- break;
-
- default:
- break;
- }
- }
-
- assert(intrin->src[0].is_ssa);
- nir_ssa_def *desc =
- build_buffer_addr_for_res_index(b, desc_type, intrin->src[0].ssa,
+ nir_def *desc =
+ build_buffer_addr_for_res_index(b,
+ desc_type, intrin->src[0].ssa,
addr_format, state);
- assert(intrin->dest.is_ssa);
- assert(intrin->dest.ssa.bit_size == desc->bit_size);
- assert(intrin->dest.ssa.num_components == desc->num_components);
- nir_ssa_def_rewrite_uses(&intrin->dest.ssa, desc);
+ assert(intrin->def.bit_size == desc->bit_size);
+ assert(intrin->def.num_components == desc->num_components);
+ nir_def_rewrite_uses(&intrin->def, desc);
nir_instr_remove(&intrin->instr);
return true;
@@ -971,35 +1573,37 @@ lower_get_ssbo_size(nir_builder *b, nir_intrinsic_instr *intrin,
b->cursor = nir_before_instr(&intrin->instr);
- nir_address_format addr_format =
- addr_format_for_desc_type(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, state);
-
- assert(intrin->src[0].is_ssa);
- nir_ssa_def *desc =
- build_buffer_addr_for_res_index(b, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
- intrin->src[0].ssa, addr_format, state);
-
- switch (addr_format) {
- case nir_address_format_64bit_global_32bit_offset:
- case nir_address_format_64bit_bounded_global: {
- nir_ssa_def *size = nir_channel(b, desc, 2);
- nir_ssa_def_rewrite_uses(&intrin->dest.ssa, size);
- nir_instr_remove(&intrin->instr);
- break;
- }
+ const nir_address_format addr_format =
+ nir_address_format_64bit_bounded_global;
- case nir_address_format_32bit_index_offset:
- /* The binding table index is the first component of the address. The
- * back-end wants a scalar binding table index source.
+ nir_def *desc_addr =
+ nir_build_addr_iadd_imm(
+ b,
+ build_desc_addr_for_res_index(b,
+ VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
+ intrin->src[0].ssa,
+ addr_format, state),
+ addr_format,
+ nir_var_mem_ssbo,
+ state->pdevice->isl_dev.ss.size);
+
+ nir_def *desc_range;
+ if (state->layout->type == ANV_PIPELINE_DESCRIPTOR_SET_LAYOUT_TYPE_INDIRECT) {
+ /* Load the anv_address_range_descriptor */
+ desc_range =
+ build_load_descriptor_mem(b, desc_addr, 0, 4, 32, state);
+ } else {
+ /* Build a vec4 similar to anv_address_range_descriptor using the
+ * RENDER_SURFACE_STATE.
*/
- nir_instr_rewrite_src(&intrin->instr, &intrin->src[0],
- nir_src_for_ssa(nir_channel(b, desc, 0)));
- break;
-
- default:
- unreachable("Unsupported address format");
+ desc_range =
+ build_load_render_surface_state_address(b, desc_addr, state);
}
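+ /* In both layouts the buffer size lives in the third component of the
+ * range descriptor.
+ */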
+ nir_def *size = nir_channel(b, desc_range, 2);
+ nir_def_rewrite_uses(&intrin->def, size);
+ nir_instr_remove(&intrin->instr);
+
return true;
}
@@ -1008,53 +1612,67 @@ lower_image_intrinsic(nir_builder *b, nir_intrinsic_instr *intrin,
struct apply_pipeline_layout_state *state)
{
nir_deref_instr *deref = nir_src_as_deref(intrin->src[0]);
- nir_variable *var = nir_deref_instr_get_variable(deref);
-
- unsigned set = var->data.descriptor_set;
- unsigned binding = var->data.binding;
- unsigned binding_offset = state->set[set].surface_offsets[binding];
b->cursor = nir_before_instr(&intrin->instr);
- ASSERTED const bool use_bindless = state->pdevice->has_bindless_images;
+ bool non_uniform = nir_intrinsic_access(intrin) & ACCESS_NON_UNIFORM;
+ bool is_bindless;
+ nir_def *handle =
+ build_load_var_deref_surface_handle(b, deref, non_uniform,
+ &is_bindless, state);
+ nir_rewrite_image_intrinsic(intrin, handle, is_bindless);
+
+ return true;
+}
- if (intrin->intrinsic == nir_intrinsic_image_deref_load_param_intel) {
- b->cursor = nir_instr_remove(&intrin->instr);
+static bool
+lower_image_size_intrinsic(nir_builder *b, nir_intrinsic_instr *intrin,
+ struct apply_pipeline_layout_state *state)
+{
+ if (nir_intrinsic_image_dim(intrin) != GLSL_SAMPLER_DIM_3D)
+ return lower_image_intrinsic(b, intrin, state);
- assert(!use_bindless); /* Otherwise our offsets would be wrong */
- const unsigned param = nir_intrinsic_base(intrin);
+ nir_deref_instr *deref = nir_src_as_deref(intrin->src[0]);
- nir_ssa_def *desc =
- build_load_var_deref_descriptor_mem(b, deref, param * 16,
- intrin->dest.ssa.num_components,
- intrin->dest.ssa.bit_size, state);
+ b->cursor = nir_before_instr(&intrin->instr);
- nir_ssa_def_rewrite_uses(&intrin->dest.ssa, desc);
- } else if (binding_offset > MAX_BINDING_TABLE_SIZE) {
- const bool write_only =
- (var->data.access & ACCESS_NON_READABLE) != 0;
- nir_ssa_def *desc =
- build_load_var_deref_descriptor_mem(b, deref, 0, 2, 32, state);
- nir_ssa_def *handle = nir_channel(b, desc, write_only ? 1 : 0);
- nir_rewrite_image_intrinsic(intrin, handle, true);
- } else {
- unsigned array_size =
- state->layout->set[set].layout->binding[binding].array_size;
-
- nir_ssa_def *index = NULL;
- if (deref->deref_type != nir_deref_type_var) {
- assert(deref->deref_type == nir_deref_type_array);
- index = nir_ssa_for_src(b, deref->arr.index, 1);
- if (state->add_bounds_checks)
- index = nir_umin(b, index, nir_imm_int(b, array_size - 1));
- } else {
- index = nir_imm_int(b, 0);
- }
+ bool non_uniform = nir_intrinsic_access(intrin) & ACCESS_NON_UNIFORM;
+ bool is_bindless;
+ nir_def *handle =
+ build_load_var_deref_surface_handle(b, deref, non_uniform,
+ &is_bindless, state);
+ nir_rewrite_image_intrinsic(intrin, handle, is_bindless);
- index = nir_iadd_imm(b, index, binding_offset);
- nir_rewrite_image_intrinsic(intrin, index, false);
+ nir_variable *var = nir_deref_instr_get_variable(deref);
+ const uint32_t set = var->data.descriptor_set;
+ const uint32_t binding = var->data.binding;
+
+ nir_def *array_index;
+ if (deref->deref_type != nir_deref_type_var) {
+ assert(deref->deref_type == nir_deref_type_array);
+ assert(nir_deref_instr_parent(deref)->deref_type == nir_deref_type_var);
+ array_index = deref->arr.index.ssa;
+ } else {
+ array_index = nir_imm_int(b, 0);
}
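+ /* The depth of a 3D storage image comes from its descriptor; load it
+ * and splice it into component 2 of the image size result below.
+ */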
+ nir_def *desc_addr = build_desc_addr_for_binding(
+ b, set, binding, array_index, state);
+
+ b->cursor = nir_after_instr(&intrin->instr);
+
+ nir_def *image_depth =
+ build_load_storage_3d_image_depth(b, desc_addr,
+ nir_channel(b, &intrin->def, 2),
+ state);
+
+ nir_def *comps[4] = {};
+ for (unsigned c = 0; c < intrin->def.num_components; c++)
+ comps[c] = c == 2 ? image_depth : nir_channel(b, &intrin->def, c);
+
+ nir_def *vec = nir_vec(b, comps, intrin->def.num_components);
+ nir_def_rewrite_uses_after(&intrin->def, vec, vec->parent_instr);
+
return true;
}
@@ -1068,40 +1686,45 @@ lower_load_constant(nir_builder *b, nir_intrinsic_instr *intrin,
* by constant folding.
*/
assert(!nir_src_is_const(intrin->src[0]));
- nir_ssa_def *offset = nir_iadd_imm(b, nir_ssa_for_src(b, intrin->src[0], 1),
+ nir_def *offset = nir_iadd_imm(b, intrin->src[0].ssa,
nir_intrinsic_base(intrin));
- nir_ssa_def *data;
- if (state->pdevice->use_softpin) {
- unsigned load_size = intrin->dest.ssa.num_components *
- intrin->dest.ssa.bit_size / 8;
- unsigned load_align = intrin->dest.ssa.bit_size / 8;
+ unsigned load_size = intrin->def.num_components *
+ intrin->def.bit_size / 8;
+ unsigned load_align = intrin->def.bit_size / 8;
- assert(load_size < b->shader->constant_data_size);
- unsigned max_offset = b->shader->constant_data_size - load_size;
- offset = nir_umin(b, offset, nir_imm_int(b, max_offset));
+ assert(load_size < b->shader->constant_data_size);
+ unsigned max_offset = b->shader->constant_data_size - load_size;
+ offset = nir_umin(b, offset, nir_imm_int(b, max_offset));
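+ /* The constant data address comes from relocation constants resolved
+ * once the shader's constant data buffer address is known.
+ */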
- nir_ssa_def *const_data_base_addr = nir_pack_64_2x32_split(b,
+ nir_def *const_data_addr = nir_pack_64_2x32_split(b,
+ nir_iadd(b,
nir_load_reloc_const_intel(b, BRW_SHADER_RELOC_CONST_DATA_ADDR_LOW),
- nir_load_reloc_const_intel(b, BRW_SHADER_RELOC_CONST_DATA_ADDR_HIGH));
+ offset),
+ nir_load_reloc_const_intel(b, BRW_SHADER_RELOC_CONST_DATA_ADDR_HIGH));
- data = nir_load_global_constant(b, nir_iadd(b, const_data_base_addr,
- nir_u2u64(b, offset)),
- load_align,
- intrin->dest.ssa.num_components,
- intrin->dest.ssa.bit_size);
- } else {
- nir_ssa_def *index = nir_imm_int(b, state->constants_offset);
-
- data = nir_load_ubo(b, intrin->num_components, intrin->dest.ssa.bit_size,
- index, offset,
- .align_mul = intrin->dest.ssa.bit_size / 8,
- .align_offset = 0,
- .range_base = nir_intrinsic_base(intrin),
- .range = nir_intrinsic_range(intrin));
- }
+ nir_def *data =
+ nir_load_global_constant(b, const_data_addr,
+ load_align,
+ intrin->def.num_components,
+ intrin->def.bit_size);
+
+ nir_def_rewrite_uses(&intrin->def, data);
+
+ return true;
+}
+
+static bool
+lower_base_workgroup_id(nir_builder *b, nir_intrinsic_instr *intrin,
+ struct apply_pipeline_layout_state *state)
+{
+ b->cursor = nir_instr_remove(&intrin->instr);
- nir_ssa_def_rewrite_uses(&intrin->dest.ssa, data);
+ nir_def *base_workgroup_id =
+ nir_load_push_constant(b, 3, 32, nir_imm_int(b, 0),
+ .base = offsetof(struct anv_push_constants, cs.base_work_group_id),
+ .range = sizeof_field(struct anv_push_constants, cs.base_work_group_id));
+ nir_def_rewrite_uses(&intrin->def, base_workgroup_id);
return true;
}
@@ -1109,7 +1732,7 @@ lower_load_constant(nir_builder *b, nir_intrinsic_instr *intrin,
static void
lower_tex_deref(nir_builder *b, nir_tex_instr *tex,
nir_tex_src_type deref_src_type,
- unsigned *base_index, unsigned plane,
+ unsigned base_index, unsigned plane,
struct apply_pipeline_layout_state *state)
{
int deref_src_idx = nir_tex_instr_src_index(tex, deref_src_type);
@@ -1119,91 +1742,44 @@ lower_tex_deref(nir_builder *b, nir_tex_instr *tex,
nir_deref_instr *deref = nir_src_as_deref(tex->src[deref_src_idx].src);
nir_variable *var = nir_deref_instr_get_variable(deref);
- unsigned set = var->data.descriptor_set;
- unsigned binding = var->data.binding;
- unsigned array_size =
- state->layout->set[set].layout->binding[binding].array_size;
+ const bool is_sampler = deref_src_type == nir_tex_src_sampler_deref;
+ const unsigned set = var->data.descriptor_set;
+ const unsigned binding = var->data.binding;
+ const bool bindless = is_binding_bindless(set, binding, is_sampler, state);
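+ /* The deref source gets rewritten to either a binding table offset or a
+ * bindless handle; the source type chosen below reflects which.
+ */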
- unsigned binding_offset;
- if (deref_src_type == nir_tex_src_texture_deref) {
- binding_offset = state->set[set].surface_offsets[binding];
+ nir_def *array_index = NULL;
+ if (deref->deref_type != nir_deref_type_var) {
+ assert(deref->deref_type == nir_deref_type_array);
+
+ array_index = deref->arr.index.ssa;
} else {
- assert(deref_src_type == nir_tex_src_sampler_deref);
- binding_offset = state->set[set].sampler_offsets[binding];
+ array_index = nir_imm_int(b, 0);
}
nir_tex_src_type offset_src_type;
- nir_ssa_def *index = NULL;
- if (binding_offset > MAX_BINDING_TABLE_SIZE) {
- const unsigned plane_offset =
- plane * sizeof(struct anv_sampled_image_descriptor);
-
- nir_ssa_def *desc =
- build_load_var_deref_descriptor_mem(b, deref, plane_offset,
- 2, 32, state);
-
- if (deref_src_type == nir_tex_src_texture_deref) {
- offset_src_type = nir_tex_src_texture_handle;
- index = nir_channel(b, desc, 0);
- } else {
- assert(deref_src_type == nir_tex_src_sampler_deref);
- offset_src_type = nir_tex_src_sampler_handle;
- index = nir_channel(b, desc, 1);
- }
+ nir_def *index;
+ if (deref_src_type == nir_tex_src_texture_deref) {
+ index = build_surface_index_for_binding(b, set, binding, array_index,
+ plane,
+ tex->texture_non_uniform,
+ state);
+ offset_src_type = bindless ?
+ nir_tex_src_texture_handle :
+ nir_tex_src_texture_offset;
} else {
- if (deref_src_type == nir_tex_src_texture_deref) {
- offset_src_type = nir_tex_src_texture_offset;
- } else {
- assert(deref_src_type == nir_tex_src_sampler_deref);
- offset_src_type = nir_tex_src_sampler_offset;
- }
-
- *base_index = binding_offset + plane;
-
- if (deref->deref_type != nir_deref_type_var) {
- assert(deref->deref_type == nir_deref_type_array);
-
- if (nir_src_is_const(deref->arr.index)) {
- unsigned arr_index = MIN2(nir_src_as_uint(deref->arr.index), array_size - 1);
- struct anv_sampler **immutable_samplers =
- state->layout->set[set].layout->binding[binding].immutable_samplers;
- if (immutable_samplers) {
- /* Array of YCbCr samplers are tightly packed in the binding
- * tables, compute the offset of an element in the array by
- * adding the number of planes of all preceding elements.
- */
- unsigned desc_arr_index = 0;
- for (int i = 0; i < arr_index; i++)
- desc_arr_index += immutable_samplers[i]->n_planes;
- *base_index += desc_arr_index;
- } else {
- *base_index += arr_index;
- }
- } else {
- /* From VK_KHR_sampler_ycbcr_conversion:
- *
- * If sampler Y’CBCR conversion is enabled, the combined image
- * sampler must be indexed only by constant integral expressions
- * when aggregated into arrays in shader code, irrespective of
- * the shaderSampledImageArrayDynamicIndexing feature.
- */
- assert(nir_tex_instr_src_index(tex, nir_tex_src_plane) == -1);
-
- index = nir_ssa_for_src(b, deref->arr.index, 1);
+ assert(deref_src_type == nir_tex_src_sampler_deref);
- if (state->add_bounds_checks)
- index = nir_umin(b, index, nir_imm_int(b, array_size - 1));
- }
- }
+ index = build_sampler_handle_for_binding(b, set, binding, array_index,
+ plane,
+ tex->sampler_non_uniform,
+ state);
+ offset_src_type = bindless ?
+ nir_tex_src_sampler_handle :
+ nir_tex_src_sampler_offset;
}
- if (index) {
- nir_instr_rewrite_src(&tex->instr, &tex->src[deref_src_idx].src,
- nir_src_for_ssa(index));
- tex->src[deref_src_idx].src_type = offset_src_type;
- } else {
- nir_tex_instr_remove_src(tex, deref_src_idx);
- }
+ nir_src_rewrite(&tex->src[deref_src_idx].src, index);
+ tex->src[deref_src_idx].src_type = offset_src_type;
}
static uint32_t
@@ -1220,106 +1796,51 @@ tex_instr_get_and_remove_plane_src(nir_tex_instr *tex)
return plane;
}
-static nir_ssa_def *
-build_def_array_select(nir_builder *b, nir_ssa_def **srcs, nir_ssa_def *idx,
+static nir_def *
+build_def_array_select(nir_builder *b, nir_def **srcs, nir_def *idx,
unsigned start, unsigned end)
{
if (start == end - 1) {
return srcs[start];
} else {
unsigned mid = start + (end - start) / 2;
- return nir_bcsel(b, nir_ilt(b, idx, nir_imm_int(b, mid)),
+ return nir_bcsel(b, nir_ilt_imm(b, idx, mid),
build_def_array_select(b, srcs, idx, start, mid),
build_def_array_select(b, srcs, idx, mid, end));
}
}
-static void
-lower_gfx7_tex_swizzle(nir_builder *b, nir_tex_instr *tex, unsigned plane,
- struct apply_pipeline_layout_state *state)
-{
- assert(state->pdevice->info.verx10 == 70);
- if (tex->sampler_dim == GLSL_SAMPLER_DIM_BUF ||
- nir_tex_instr_is_query(tex) ||
- tex->op == nir_texop_tg4 || /* We can't swizzle TG4 */
- (tex->is_shadow && tex->is_new_style_shadow))
- return;
-
- int deref_src_idx = nir_tex_instr_src_index(tex, nir_tex_src_texture_deref);
- assert(deref_src_idx >= 0);
-
- nir_deref_instr *deref = nir_src_as_deref(tex->src[deref_src_idx].src);
- nir_variable *var = nir_deref_instr_get_variable(deref);
-
- unsigned set = var->data.descriptor_set;
- unsigned binding = var->data.binding;
- const struct anv_descriptor_set_binding_layout *bind_layout =
- &state->layout->set[set].layout->binding[binding];
-
- if ((bind_layout->data & ANV_DESCRIPTOR_TEXTURE_SWIZZLE) == 0)
- return;
-
- b->cursor = nir_before_instr(&tex->instr);
-
- const unsigned plane_offset =
- plane * sizeof(struct anv_texture_swizzle_descriptor);
- nir_ssa_def *swiz =
- build_load_var_deref_descriptor_mem(b, deref, plane_offset,
- 1, 32, state);
-
- b->cursor = nir_after_instr(&tex->instr);
-
- assert(tex->dest.ssa.bit_size == 32);
- assert(tex->dest.ssa.num_components == 4);
-
- /* Initializing to undef is ok; nir_opt_undef will clean it up. */
- nir_ssa_def *undef = nir_ssa_undef(b, 1, 32);
- nir_ssa_def *comps[8];
- for (unsigned i = 0; i < ARRAY_SIZE(comps); i++)
- comps[i] = undef;
-
- comps[ISL_CHANNEL_SELECT_ZERO] = nir_imm_int(b, 0);
- if (nir_alu_type_get_base_type(tex->dest_type) == nir_type_float)
- comps[ISL_CHANNEL_SELECT_ONE] = nir_imm_float(b, 1);
- else
- comps[ISL_CHANNEL_SELECT_ONE] = nir_imm_int(b, 1);
- comps[ISL_CHANNEL_SELECT_RED] = nir_channel(b, &tex->dest.ssa, 0);
- comps[ISL_CHANNEL_SELECT_GREEN] = nir_channel(b, &tex->dest.ssa, 1);
- comps[ISL_CHANNEL_SELECT_BLUE] = nir_channel(b, &tex->dest.ssa, 2);
- comps[ISL_CHANNEL_SELECT_ALPHA] = nir_channel(b, &tex->dest.ssa, 3);
-
- nir_ssa_def *swiz_comps[4];
- for (unsigned i = 0; i < 4; i++) {
- nir_ssa_def *comp_swiz = nir_extract_u8(b, swiz, nir_imm_int(b, i));
- swiz_comps[i] = build_def_array_select(b, comps, comp_swiz, 0, 8);
- }
- nir_ssa_def *swiz_tex_res = nir_vec(b, swiz_comps, 4);
-
- /* Rewrite uses before we insert so we don't rewrite this use */
- nir_ssa_def_rewrite_uses_after(&tex->dest.ssa,
- swiz_tex_res,
- swiz_tex_res->parent_instr);
-}
-
static bool
lower_tex(nir_builder *b, nir_tex_instr *tex,
struct apply_pipeline_layout_state *state)
{
unsigned plane = tex_instr_get_and_remove_plane_src(tex);
- /* On Ivy Bridge and Bay Trail, we have to swizzle in the shader. Do this
- * before we lower the derefs away so we can still find the descriptor.
- */
- if (state->pdevice->info.verx10 == 70)
- lower_gfx7_tex_swizzle(b, tex, plane, state);
-
b->cursor = nir_before_instr(&tex->instr);
lower_tex_deref(b, tex, nir_tex_src_texture_deref,
- &tex->texture_index, plane, state);
-
+ tex->texture_index, plane, state);
lower_tex_deref(b, tex, nir_tex_src_sampler_deref,
- &tex->sampler_index, plane, state);
+ tex->sampler_index, plane, state);
+
+ /* The whole lot will be embedded in the offset/handle source */
+ tex->texture_index = 0;
+ tex->sampler_index = 0;
+
+ return true;
+}
+
+static bool
+lower_ray_query_globals(nir_builder *b, nir_intrinsic_instr *intrin,
+ struct apply_pipeline_layout_state *state)
+{
+ b->cursor = nir_instr_remove(&intrin->instr);
+
+ nir_def *rq_globals =
+ nir_load_push_constant(b, 1, 64, nir_imm_int(b, 0),
+ .base = offsetof(struct anv_push_constants, ray_query_globals),
+ .range = sizeof_field(struct anv_push_constants, ray_query_globals));
+ nir_def_rewrite_uses(&intrin->def, rq_globals);
return true;
}
@@ -1343,25 +1864,22 @@ apply_pipeline_layout(nir_builder *b, nir_instr *instr, void *_state)
return lower_get_ssbo_size(b, intrin, state);
case nir_intrinsic_image_deref_load:
case nir_intrinsic_image_deref_store:
- case nir_intrinsic_image_deref_atomic_add:
- case nir_intrinsic_image_deref_atomic_imin:
- case nir_intrinsic_image_deref_atomic_umin:
- case nir_intrinsic_image_deref_atomic_imax:
- case nir_intrinsic_image_deref_atomic_umax:
- case nir_intrinsic_image_deref_atomic_and:
- case nir_intrinsic_image_deref_atomic_or:
- case nir_intrinsic_image_deref_atomic_xor:
- case nir_intrinsic_image_deref_atomic_exchange:
- case nir_intrinsic_image_deref_atomic_comp_swap:
- case nir_intrinsic_image_deref_atomic_fadd:
- case nir_intrinsic_image_deref_size:
+ case nir_intrinsic_image_deref_atomic:
+ case nir_intrinsic_image_deref_atomic_swap:
case nir_intrinsic_image_deref_samples:
case nir_intrinsic_image_deref_load_param_intel:
case nir_intrinsic_image_deref_load_raw_intel:
case nir_intrinsic_image_deref_store_raw_intel:
+ case nir_intrinsic_image_deref_sparse_load:
return lower_image_intrinsic(b, intrin, state);
+ case nir_intrinsic_image_deref_size:
+ return lower_image_size_intrinsic(b, intrin, state);
case nir_intrinsic_load_constant:
return lower_load_constant(b, intrin, state);
+ case nir_intrinsic_load_base_workgroup_id:
+ return lower_base_workgroup_id(b, intrin, state);
+ case nir_intrinsic_load_ray_query_global_intel:
+ return lower_ray_query_globals(b, intrin, state);
default:
return false;
}
@@ -1393,66 +1911,300 @@ compare_binding_infos(const void *_a, const void *_b)
return a->binding - b->binding;
}
+#ifndef NDEBUG
+static void
+anv_validate_pipeline_layout(const struct anv_pipeline_sets_layout *layout,
+ nir_shader *shader)
+{
+ nir_foreach_function_impl(impl, shader) {
+ nir_foreach_block(block, impl) {
+ nir_foreach_instr(instr, block) {
+ if (instr->type != nir_instr_type_intrinsic)
+ continue;
+
+ nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
+ if (intrin->intrinsic != nir_intrinsic_vulkan_resource_index)
+ continue;
+
+ unsigned set = nir_intrinsic_desc_set(intrin);
+ assert(layout->set[set].layout);
+ }
+ }
+ }
+}
+#endif
+
+static bool
+binding_is_promotable_to_push(const struct anv_descriptor_set_layout *set_layout,
+ const struct anv_descriptor_set_binding_layout *bind_layout)
+{
+ if (set_layout->flags & VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR)
+ return true;
+
+ if (set_layout->flags & (VK_DESCRIPTOR_SET_LAYOUT_CREATE_DESCRIPTOR_BUFFER_BIT_EXT |
+ VK_DESCRIPTOR_SET_LAYOUT_CREATE_EMBEDDED_IMMUTABLE_SAMPLERS_BIT_EXT))
+ return false;
+
+ return (bind_layout->flags & non_pushable_binding_flags) == 0;
+}
+
+static void
+add_null_bti_entry(struct anv_pipeline_bind_map *map)
+{
+ map->surface_to_descriptor[map->surface_count++] =
+ (struct anv_pipeline_binding) {
+ .set = ANV_DESCRIPTOR_SET_NULL,
+ };
+ assert(map->surface_count <= MAX_BINDING_TABLE_SIZE);
+}
+
+static void
+add_bti_entry(struct anv_pipeline_bind_map *map,
+ uint32_t set,
+ uint32_t binding,
+ uint32_t element,
+ uint32_t plane,
+ const struct anv_descriptor_set_binding_layout *bind_layout)
+{
+ map->surface_to_descriptor[map->surface_count++] =
+ (struct anv_pipeline_binding) {
+ .set = set,
+ .binding = binding,
+ .index = bind_layout->descriptor_index + element,
+ .set_offset = bind_layout->descriptor_surface_offset +
+ element * bind_layout->descriptor_surface_stride +
+ plane * bind_layout->descriptor_data_surface_size,
+ .plane = plane,
+ };
+ assert(map->surface_count <= MAX_BINDING_TABLE_SIZE);
+}
+
+static void
+add_dynamic_bti_entry(struct anv_pipeline_bind_map *map,
+ uint32_t set,
+ uint32_t binding,
+ uint32_t element,
+ const struct anv_pipeline_sets_layout *layout,
+ const struct anv_descriptor_set_binding_layout *bind_layout)
+{
+ map->surface_to_descriptor[map->surface_count++] =
+ (struct anv_pipeline_binding) {
+ .set = set,
+ .binding = binding,
+ .index = bind_layout->descriptor_index + element,
+ .set_offset = bind_layout->descriptor_surface_offset +
+ element * bind_layout->descriptor_surface_stride,
+ .dynamic_offset_index = bind_layout->dynamic_offset_index + element,
+ };
+ assert(map->surface_count <= MAX_BINDING_TABLE_SIZE);
+}
+
+static void
+add_sampler_entry(struct anv_pipeline_bind_map *map,
+ uint32_t set,
+ uint32_t binding,
+ uint32_t element,
+ uint32_t plane,
+ const struct anv_pipeline_sets_layout *layout,
+ const struct anv_descriptor_set_binding_layout *bind_layout)
+{
+ assert((bind_layout->descriptor_index + element) < layout->set[set].layout->descriptor_count);
+ map->sampler_to_descriptor[map->sampler_count++] =
+ (struct anv_pipeline_binding) {
+ .set = set,
+ .binding = binding,
+ .index = bind_layout->descriptor_index + element,
+ .plane = plane,
+ };
+}
+
+static void
+add_push_entry(struct anv_pipeline_push_map *push_map,
+ uint32_t set,
+ uint32_t binding,
+ uint32_t element,
+ const struct anv_pipeline_sets_layout *layout,
+ const struct anv_descriptor_set_binding_layout *bind_layout)
+{
+ push_map->block_to_descriptor[push_map->block_count++] =
+ (struct anv_pipeline_binding) {
+ .set = set,
+ .binding = binding,
+ .index = bind_layout->descriptor_index + element,
+ .dynamic_offset_index = bind_layout->dynamic_offset_index + element,
+ };
+}
+
+static void
+add_embedded_sampler_entry(struct apply_pipeline_layout_state *state,
+ struct anv_pipeline_bind_map *map,
+ uint32_t set, uint32_t binding)
+{
+ state->set[set].binding[binding].embedded_sampler_index =
+ map->embedded_sampler_count;
+ struct anv_pipeline_embedded_sampler_binding *sampler =
+ &map->embedded_sampler_to_binding[map->embedded_sampler_count++];
+ const struct anv_descriptor_set_layout *set_layout =
+ state->layout->set[set].layout;
+ const struct anv_descriptor_set_binding_layout *bind_layout =
+ &set_layout->binding[binding];
+
+ *sampler = (struct anv_pipeline_embedded_sampler_binding) {
+ .set = set,
+ .binding = binding,
+ };
+
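+ /* The embedded sampler key captures the packed SAMPLER_STATE (state_no_bc)
+ * together with the border color values.
+ */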
+ assert(sizeof(sampler->key.sampler) ==
+ sizeof(bind_layout->immutable_samplers[0]->state_no_bc[0]));
+ memcpy(sampler->key.sampler,
+ bind_layout->immutable_samplers[0]->state_no_bc[0],
+ sizeof(sampler->key.sampler));
+
+ assert(sizeof(sampler->key.color) ==
+ sizeof(bind_layout->immutable_samplers[0]->vk.border_color_value.uint32));
+ memcpy(sampler->key.color,
+ bind_layout->immutable_samplers[0]->vk.border_color_value.uint32,
+ sizeof(sampler->key.color));
+}
+
+static bool
+binding_should_use_surface_binding_table(const struct apply_pipeline_layout_state *state,
+ const struct anv_descriptor_set_binding_layout *binding)
+{
+ if ((binding->data & ANV_DESCRIPTOR_BTI_SURFACE_STATE) == 0)
+ return false;
+
+ if (state->pdevice->always_use_bindless &&
+ (binding->data & ANV_DESCRIPTOR_SURFACE))
+ return false;
+
+ return true;
+}
+
+static bool
+binding_should_use_sampler_binding_table(const struct apply_pipeline_layout_state *state,
+ const struct anv_descriptor_set_binding_layout *binding)
+{
+ if ((binding->data & ANV_DESCRIPTOR_BTI_SAMPLER_STATE) == 0)
+ return false;
+
+ if (state->pdevice->always_use_bindless &&
+ (binding->data & ANV_DESCRIPTOR_SAMPLER))
+ return false;
+
+ return true;
+}
+
void
-anv_nir_apply_pipeline_layout(const struct anv_physical_device *pdevice,
- bool robust_buffer_access,
- const struct anv_pipeline_layout *layout,
- nir_shader *shader,
- struct anv_pipeline_bind_map *map)
+anv_nir_apply_pipeline_layout(nir_shader *shader,
+ const struct anv_physical_device *pdevice,
+ enum brw_robustness_flags robust_flags,
+ bool independent_sets,
+ const struct anv_pipeline_sets_layout *layout,
+ struct anv_pipeline_bind_map *map,
+ struct anv_pipeline_push_map *push_map,
+ void *push_map_mem_ctx)
{
void *mem_ctx = ralloc_context(NULL);
+#ifndef NDEBUG
+ /* We should not have any reference to a descriptor set that is not
+ * provided through the pipeline layout (layout->set[set].layout = NULL).
+ */
+ anv_validate_pipeline_layout(layout, shader);
+#endif
+
+ const bool bindless_stage =
+ brw_shader_stage_requires_bindless_resources(shader->info.stage);
struct apply_pipeline_layout_state state = {
.pdevice = pdevice,
.layout = layout,
- .add_bounds_checks = robust_buffer_access,
- .desc_addr_format = brw_shader_stage_is_bindless(shader->info.stage) ?
+ .desc_addr_format = bindless_stage ?
nir_address_format_64bit_global_32bit_offset :
nir_address_format_32bit_index_offset,
- .ssbo_addr_format = anv_nir_ssbo_addr_format(pdevice, robust_buffer_access),
- .ubo_addr_format = anv_nir_ubo_addr_format(pdevice, robust_buffer_access),
+ .ssbo_addr_format = anv_nir_ssbo_addr_format(pdevice, robust_flags),
+ .ubo_addr_format = anv_nir_ubo_addr_format(pdevice, robust_flags),
.lowered_instrs = _mesa_pointer_set_create(mem_ctx),
+ .has_independent_sets = independent_sets,
};
+ /* Compute the amount of push block items required. */
+ unsigned push_block_count = 0;
for (unsigned s = 0; s < layout->num_sets; s++) {
+ if (!layout->set[s].layout)
+ continue;
+
const unsigned count = layout->set[s].layout->binding_count;
- state.set[s].use_count = rzalloc_array(mem_ctx, uint8_t, count);
- state.set[s].surface_offsets = rzalloc_array(mem_ctx, uint8_t, count);
- state.set[s].sampler_offsets = rzalloc_array(mem_ctx, uint8_t, count);
+ state.set[s].binding = rzalloc_array_size(mem_ctx, sizeof(state.set[s].binding[0]), count);
+
+ const struct anv_descriptor_set_layout *set_layout = layout->set[s].layout;
+ for (unsigned b = 0; b < set_layout->binding_count; b++) {
+ if (set_layout->binding[b].type != VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK)
+ push_block_count += set_layout->binding[b].array_size;
+ }
}
+ /* Find all used sets/bindings */
nir_shader_instructions_pass(shader, get_used_bindings,
nir_metadata_all, &state);
+ /* Assign a BTI to each used descriptor set */
for (unsigned s = 0; s < layout->num_sets; s++) {
if (state.desc_addr_format != nir_address_format_32bit_index_offset) {
state.set[s].desc_offset = BINDLESS_OFFSET;
} else if (state.set[s].desc_buffer_used) {
map->surface_to_descriptor[map->surface_count] =
(struct anv_pipeline_binding) {
- .set = ANV_DESCRIPTOR_SET_DESCRIPTORS,
+ .set = (layout->type == ANV_PIPELINE_DESCRIPTOR_SET_LAYOUT_TYPE_BUFFER) ?
+ ANV_DESCRIPTOR_SET_DESCRIPTORS_BUFFER :
+ ANV_DESCRIPTOR_SET_DESCRIPTORS,
+ .binding = UINT32_MAX,
.index = s,
};
- state.set[s].desc_offset = map->surface_count;
- map->surface_count++;
+ state.set[s].desc_offset = map->surface_count++;
}
}
- if (state.uses_constants && !pdevice->use_softpin) {
- state.constants_offset = map->surface_count;
- map->surface_to_descriptor[map->surface_count].set =
- ANV_DESCRIPTOR_SET_SHADER_CONSTANTS;
- map->surface_count++;
- }
+ /* Assign a block index for each surface */
+ push_map->block_to_descriptor =
+ rzalloc_array(push_map_mem_ctx, struct anv_pipeline_binding,
+ map->surface_count + push_block_count);
+
+ memcpy(push_map->block_to_descriptor,
+ map->surface_to_descriptor,
+ sizeof(push_map->block_to_descriptor[0]) * map->surface_count);
+ push_map->block_count = map->surface_count;
+ /* Count used bindings, assign embedded sampler indices & add push blocks
+ * for promotion to push constants
+ */
unsigned used_binding_count = 0;
for (uint32_t set = 0; set < layout->num_sets; set++) {
struct anv_descriptor_set_layout *set_layout = layout->set[set].layout;
+ if (!set_layout)
+ continue;
+
for (unsigned b = 0; b < set_layout->binding_count; b++) {
- if (state.set[set].use_count[b] == 0)
+ if (state.set[set].binding[b].use_count == 0)
continue;
used_binding_count++;
+
+ const struct anv_descriptor_set_binding_layout *bind_layout =
+ &set_layout->binding[b];
+
+ if (state.set[set].binding[b].properties & BINDING_PROPERTY_EMBEDDED_SAMPLER)
+ add_embedded_sampler_entry(&state, map, set, b);
+
+ if (binding_is_promotable_to_push(set_layout, bind_layout)) {
+ if (bind_layout->type != VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK) {
+ state.set[set].binding[b].push_block = push_map->block_count;
+ for (unsigned i = 0; i < bind_layout->array_size; i++)
+ add_push_entry(push_map, set, b, i, layout, bind_layout);
+ } else {
+ state.set[set].binding[b].push_block = state.set[set].desc_offset;
+ }
+ }
}
}
@@ -1461,8 +2213,11 @@ anv_nir_apply_pipeline_layout(const struct anv_physical_device *pdevice,
used_binding_count = 0;
for (uint32_t set = 0; set < layout->num_sets; set++) {
const struct anv_descriptor_set_layout *set_layout = layout->set[set].layout;
+ if (!set_layout)
+ continue;
+
for (unsigned b = 0; b < set_layout->binding_count; b++) {
- if (state.set[set].use_count[b] == 0)
+ if (state.set[set].binding[b].use_count == 0)
continue;
const struct anv_descriptor_set_binding_layout *binding =
@@ -1474,14 +2229,13 @@ anv_nir_apply_pipeline_layout(const struct anv_physical_device *pdevice,
* everything which does not support bindless super higher priority
* than things which do.
*/
- uint16_t score = ((uint16_t)state.set[set].use_count[b] << 7) /
+ uint16_t score = ((uint16_t)state.set[set].binding[b].use_count << 7) /
binding->array_size;
/* If the descriptor type doesn't support bindless then put it at the
* beginning so we guarantee it gets a slot.
*/
- if (!anv_descriptor_supports_bindless(pdevice, binding, true) ||
- !anv_descriptor_supports_bindless(pdevice, binding, false))
+ if (!anv_descriptor_supports_bindless(pdevice, set_layout, binding))
score |= 1 << 15;
infos[used_binding_count++] = (struct binding_info) {
@@ -1500,58 +2254,59 @@ anv_nir_apply_pipeline_layout(const struct anv_physical_device *pdevice,
for (unsigned i = 0; i < used_binding_count; i++) {
unsigned set = infos[i].set, b = infos[i].binding;
+ assert(layout->set[set].layout);
+ const struct anv_descriptor_set_layout *set_layout =
+ layout->set[set].layout;
const struct anv_descriptor_set_binding_layout *binding =
- &layout->set[set].layout->binding[b];
+ &set_layout->binding[b];
const uint32_t array_size = binding->array_size;
if (binding->dynamic_offset_index >= 0)
state.has_dynamic_buffers = true;
- if (binding->data & ANV_DESCRIPTOR_SURFACE_STATE) {
- if (map->surface_count + array_size > MAX_BINDING_TABLE_SIZE ||
- anv_descriptor_requires_bindless(pdevice, binding, false) ||
- brw_shader_stage_is_bindless(shader->info.stage)) {
+ const unsigned array_multiplier = bti_multiplier(&state, set, b);
+ assert(array_multiplier >= 1);
+
+ /* Assume bindless by default */
+ state.set[set].binding[b].surface_offset = BINDLESS_OFFSET;
+ state.set[set].binding[b].sampler_offset = BINDLESS_OFFSET;
+
+ if (binding_should_use_surface_binding_table(&state, binding)) {
+ if (map->surface_count + array_size * array_multiplier > MAX_BINDING_TABLE_SIZE ||
+ anv_descriptor_requires_bindless(pdevice, set_layout, binding) ||
+ brw_shader_stage_requires_bindless_resources(shader->info.stage)) {
/* If this descriptor doesn't fit in the binding table or if it
* requires bindless for some reason, flag it as bindless.
*/
- assert(anv_descriptor_supports_bindless(pdevice, binding, false));
- state.set[set].surface_offsets[b] = BINDLESS_OFFSET;
+ assert(anv_descriptor_supports_bindless(pdevice, set_layout, binding));
} else {
- state.set[set].surface_offsets[b] = map->surface_count;
+ state.set[set].binding[b].surface_offset = map->surface_count;
if (binding->dynamic_offset_index < 0) {
struct anv_sampler **samplers = binding->immutable_samplers;
+ uint8_t max_planes = bti_multiplier(&state, set, b);
for (unsigned i = 0; i < binding->array_size; i++) {
uint8_t planes = samplers ? samplers[i]->n_planes : 1;
- for (uint8_t p = 0; p < planes; p++) {
- map->surface_to_descriptor[map->surface_count++] =
- (struct anv_pipeline_binding) {
- .set = set,
- .index = binding->descriptor_index + i,
- .plane = p,
- };
+ for (uint8_t p = 0; p < max_planes; p++) {
+ if (p < planes) {
+ add_bti_entry(map, set, b, i, p, binding);
+ } else {
+ add_null_bti_entry(map);
+ }
}
}
} else {
- for (unsigned i = 0; i < binding->array_size; i++) {
- map->surface_to_descriptor[map->surface_count++] =
- (struct anv_pipeline_binding) {
- .set = set,
- .index = binding->descriptor_index + i,
- .dynamic_offset_index =
- layout->set[set].dynamic_offset_start +
- binding->dynamic_offset_index + i,
- };
- }
+ for (unsigned i = 0; i < binding->array_size; i++)
+ add_dynamic_bti_entry(map, set, b, i, layout, binding);
}
}
assert(map->surface_count <= MAX_BINDING_TABLE_SIZE);
}
- if (binding->data & ANV_DESCRIPTOR_SAMPLER_STATE) {
- if (map->sampler_count + array_size > MAX_SAMPLER_TABLE_SIZE ||
- anv_descriptor_requires_bindless(pdevice, binding, true) ||
- brw_shader_stage_is_bindless(shader->info.stage)) {
+ if (binding_should_use_sampler_binding_table(&state, binding)) {
+ if (map->sampler_count + array_size * array_multiplier > MAX_SAMPLER_TABLE_SIZE ||
+ anv_descriptor_requires_bindless(pdevice, set_layout, binding) ||
+ brw_shader_stage_requires_bindless_resources(shader->info.stage)) {
/* If this descriptor doesn't fit in the binding table or if it
* requires bindless for some reason, flag it as bindless.
*
@@ -1559,60 +2314,29 @@ anv_nir_apply_pipeline_layout(const struct anv_physical_device *pdevice,
* using indirect sends thanks to bindless samplers being packed
* less tightly than the sampler table.
*/
- assert(anv_descriptor_supports_bindless(pdevice, binding, true));
- state.set[set].sampler_offsets[b] = BINDLESS_OFFSET;
+ assert(anv_descriptor_supports_bindless(pdevice, set_layout, binding));
} else {
- state.set[set].sampler_offsets[b] = map->sampler_count;
- struct anv_sampler **samplers = binding->immutable_samplers;
+ state.set[set].binding[b].sampler_offset = map->sampler_count;
+ uint8_t max_planes = bti_multiplier(&state, set, b);
for (unsigned i = 0; i < binding->array_size; i++) {
- uint8_t planes = samplers ? samplers[i]->n_planes : 1;
- for (uint8_t p = 0; p < planes; p++) {
- map->sampler_to_descriptor[map->sampler_count++] =
- (struct anv_pipeline_binding) {
- .set = set,
- .index = binding->descriptor_index + i,
- .plane = p,
- };
+ for (uint8_t p = 0; p < max_planes; p++) {
+ add_sampler_entry(map, set, b, i, p, layout, binding);
}
}
}
}
- }
- nir_foreach_uniform_variable(var, shader) {
- const struct glsl_type *glsl_type = glsl_without_array(var->type);
-
- if (!glsl_type_is_image(glsl_type))
- continue;
-
- enum glsl_sampler_dim dim = glsl_get_sampler_dim(glsl_type);
-
- const uint32_t set = var->data.descriptor_set;
- const uint32_t binding = var->data.binding;
- const struct anv_descriptor_set_binding_layout *bind_layout =
- &layout->set[set].layout->binding[binding];
- const uint32_t array_size = bind_layout->array_size;
-
- if (state.set[set].use_count[binding] == 0)
- continue;
-
- if (state.set[set].surface_offsets[binding] >= MAX_BINDING_TABLE_SIZE)
- continue;
-
- struct anv_pipeline_binding *pipe_binding =
- &map->surface_to_descriptor[state.set[set].surface_offsets[binding]];
- for (unsigned i = 0; i < array_size; i++) {
- assert(pipe_binding[i].set == set);
- assert(pipe_binding[i].index == bind_layout->descriptor_index + i);
-
- if (dim == GLSL_SAMPLER_DIM_SUBPASS ||
- dim == GLSL_SAMPLER_DIM_SUBPASS_MS)
- pipe_binding[i].input_attachment_index = var->data.index + i;
-
- /* NOTE: This is a uint8_t so we really do need to != 0 here */
- pipe_binding[i].write_only =
- (var->data.access & ACCESS_NON_READABLE) != 0;
+ if (binding->data & ANV_DESCRIPTOR_INLINE_UNIFORM) {
+ state.set[set].binding[b].surface_offset = state.set[set].desc_offset;
}
+
+#if 0
+ fprintf(stderr, "set=%u binding=%u surface_offset=0x%08x require_bindless=%u type=%s\n",
+ set, b,
+ state.set[set].binding[b].surface_offset,
+ anv_descriptor_requires_bindless(pdevice, set_layout, binding),
+ vk_DescriptorType_to_str(binding->type));
+#endif
}
/* Before we do the normal lowering, we look for any SSBO operations
@@ -1667,6 +2391,27 @@ anv_nir_apply_pipeline_layout(const struct anv_physical_device *pdevice,
assert(map->sampler_count == 0);
}
+#if 0
+ fprintf(stderr, "bti:\n");
+ for (unsigned i = 0; i < map->surface_count; i++) {
+ fprintf(stderr, " %03i: set=%03u binding=%06i index=%u plane=%u set_offset=0x%08x dyn_offset=0x%08x\n", i,
+ map->surface_to_descriptor[i].set,
+ map->surface_to_descriptor[i].binding,
+ map->surface_to_descriptor[i].index,
+ map->surface_to_descriptor[i].plane,
+ map->surface_to_descriptor[i].set_offset,
+ map->surface_to_descriptor[i].dynamic_offset_index);
+ }
+ fprintf(stderr, "sti:\n");
+ for (unsigned i = 0; i < map->sampler_count; i++) {
+ fprintf(stderr, " %03i: set=%03u binding=%06i index=%u plane=%u\n", i,
+ map->sampler_to_descriptor[i].set,
+ map->sampler_to_descriptor[i].binding,
+ map->sampler_to_descriptor[i].index,
+ map->sampler_to_descriptor[i].plane);
+ }
+#endif
+
/* Now that we're done computing the surface and sampler portions of the
* bind map, hash them. This lets us quickly determine if the actual
* mapping has changed and not just a no-op pipeline change.
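/* Editorial sketch, not part of the patch: how the new per-binding
 * multiplier lays out binding table entries.  Each array element now
 * occupies max_planes slots, with null entries padding the unused
 * planes, so a shader can index "element * max_planes + plane".
 * array_size, max_planes and the plane counts below are made up.
 */
#include <stdio.h>

int main(void)
{
   const unsigned array_size = 3;
   const unsigned max_planes = 2;              /* bti_multiplier() result */
   const unsigned planes[3]  = { 1, 2, 1 };    /* per-sampler plane count */

   unsigned surface_count = 0;
   for (unsigned i = 0; i < array_size; i++) {
      for (unsigned p = 0; p < max_planes; p++) {
         if (p < planes[i])
            printf("bti %2u -> element %u plane %u\n", surface_count, i, p);
         else
            printf("bti %2u -> null entry (padding)\n", surface_count);
         surface_count++;
      }
   }
   return 0;
}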
diff --git a/src/intel/vulkan/anv_nir_compute_push_layout.c b/src/intel/vulkan/anv_nir_compute_push_layout.c
index 526e1a48f0b..74e59e4cb28 100644
--- a/src/intel/vulkan/anv_nir_compute_push_layout.c
+++ b/src/intel/vulkan/anv_nir_compute_push_layout.c
@@ -29,11 +29,14 @@
#define sizeof_field(type, field) sizeof(((type *)0)->field)
void
-anv_nir_compute_push_layout(const struct anv_physical_device *pdevice,
- bool robust_buffer_access,
- nir_shader *nir,
+anv_nir_compute_push_layout(nir_shader *nir,
+ const struct anv_physical_device *pdevice,
+ enum brw_robustness_flags robust_flags,
+ bool fragment_dynamic,
struct brw_stage_prog_data *prog_data,
struct anv_pipeline_bind_map *map,
+ const struct anv_pipeline_push_map *push_map,
+ enum anv_descriptor_set_layout_type desc_type,
void *mem_ctx)
{
const struct brw_compiler *compiler = pdevice->compiler;
@@ -42,11 +45,8 @@ anv_nir_compute_push_layout(const struct anv_physical_device *pdevice,
bool has_const_ubo = false;
unsigned push_start = UINT_MAX, push_end = 0;
- nir_foreach_function(function, nir) {
- if (!function->impl)
- continue;
-
- nir_foreach_block(block, function->impl) {
+ nir_foreach_function_impl(impl, nir) {
+ nir_foreach_block(block, impl) {
nir_foreach_instr(instr, block) {
if (instr->type != nir_instr_type_intrinsic)
continue;
@@ -54,7 +54,7 @@ anv_nir_compute_push_layout(const struct anv_physical_device *pdevice,
nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
switch (intrin->intrinsic) {
case nir_intrinsic_load_ubo:
- if (nir_src_is_const(intrin->src[0]) &&
+ if (brw_nir_ubo_surface_index_is_pushable(intrin->src[0]) &&
nir_src_is_const(intrin->src[1]))
has_const_ubo = true;
break;
@@ -68,11 +68,25 @@ anv_nir_compute_push_layout(const struct anv_physical_device *pdevice,
}
case nir_intrinsic_load_desc_set_address_intel:
- push_start = MIN2(push_start,
- offsetof(struct anv_push_constants, desc_sets));
- push_end = MAX2(push_end, push_start +
- sizeof_field(struct anv_push_constants, desc_sets));
+ case nir_intrinsic_load_desc_set_dynamic_index_intel: {
+ unsigned base = offsetof(struct anv_push_constants,
+ desc_surface_offsets);
+ push_start = MIN2(push_start, base);
+ push_end = MAX2(push_end, base +
+ sizeof_field(struct anv_push_constants,
+ desc_surface_offsets));
+
+ if (desc_type == ANV_PIPELINE_DESCRIPTOR_SET_LAYOUT_TYPE_BUFFER &&
+ !pdevice->uses_ex_bso) {
+ base = offsetof(struct anv_push_constants,
+ surfaces_base_offset);
+ push_start = MIN2(push_start, base);
+ push_end = MAX2(push_end, base +
+ sizeof_field(struct anv_push_constants,
+ surfaces_base_offset));
+ }
break;
+ }
default:
break;
@@ -84,11 +98,10 @@ anv_nir_compute_push_layout(const struct anv_physical_device *pdevice,
const bool has_push_intrinsic = push_start <= push_end;
const bool push_ubo_ranges =
- pdevice->info.verx10 >= 75 &&
has_const_ubo && nir->info.stage != MESA_SHADER_COMPUTE &&
- !brw_shader_stage_is_bindless(nir->info.stage);
+ !brw_shader_stage_requires_bindless_resources(nir->info.stage);
- if (push_ubo_ranges && robust_buffer_access) {
+ if (push_ubo_ranges && (robust_flags & BRW_ROBUSTNESS_UBO)) {
/* We can't on-the-fly adjust our push ranges because doing so would
* mess up the layout in the shader. When robustBufferAccess is
* enabled, we push a mask into the shader indicating which pushed
@@ -102,6 +115,14 @@ anv_nir_compute_push_layout(const struct anv_physical_device *pdevice,
push_end = MAX2(push_end, push_reg_mask_end);
}
+ if (nir->info.stage == MESA_SHADER_FRAGMENT && fragment_dynamic) {
+ const uint32_t fs_msaa_flags_start =
+ offsetof(struct anv_push_constants, gfx.fs_msaa_flags);
+ const uint32_t fs_msaa_flags_end = fs_msaa_flags_start + sizeof(uint32_t);
+ push_start = MIN2(push_start, fs_msaa_flags_start);
+ push_end = MAX2(push_end, fs_msaa_flags_end);
+ }
+
if (nir->info.stage == MESA_SHADER_COMPUTE && devinfo->verx10 < 125) {
/* For compute shaders, we always have to have the subgroup ID. The
* back-end compiler will "helpfully" add it for us in the last push
@@ -118,13 +139,11 @@ anv_nir_compute_push_layout(const struct anv_physical_device *pdevice,
* push_end (no push constants is indicated by push_start = UINT_MAX).
*/
push_start = MIN2(push_start, push_end);
- push_start = align_down_u32(push_start, 32);
+ push_start = ROUND_DOWN_TO(push_start, 32);
- /* For vec4 our push data size needs to be aligned to a vec4 and for
- * scalar, it needs to be aligned to a DWORD.
- */
- const unsigned align = compiler->scalar_stage[nir->info.stage] ? 4 : 16;
- nir->num_uniforms = ALIGN(push_end - push_start, align);
+ /* For scalar, push data size needs to be aligned to a DWORD. */
+ const unsigned alignment = 4;
+ nir->num_uniforms = ALIGN(push_end - push_start, alignment);
prog_data->nr_params = nir->num_uniforms / 4;
prog_data->param = rzalloc_array(mem_ctx, uint32_t, prog_data->nr_params);
@@ -135,35 +154,80 @@ anv_nir_compute_push_layout(const struct anv_physical_device *pdevice,
};
if (has_push_intrinsic) {
- nir_foreach_function(function, nir) {
- if (!function->impl)
- continue;
+ nir_foreach_function_impl(impl, nir) {
+ nir_builder build = nir_builder_create(impl);
+ nir_builder *b = &build;
- nir_builder build, *b = &build;
- nir_builder_init(b, function->impl);
-
- nir_foreach_block(block, function->impl) {
+ nir_foreach_block(block, impl) {
nir_foreach_instr_safe(instr, block) {
if (instr->type != nir_instr_type_intrinsic)
continue;
nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
switch (intrin->intrinsic) {
- case nir_intrinsic_load_push_constant:
+ case nir_intrinsic_load_push_constant: {
+ /* With bindless shaders we load uniforms with SEND
+ * messages. All the push constants are located after the
+ * RT_DISPATCH_GLOBALS. We just need to add the offset to
+ * the address right after RT_DISPATCH_GLOBALS (see
+ * brw_nir_lower_rt_intrinsics.c).
+ */
+ unsigned base_offset =
+ brw_shader_stage_requires_bindless_resources(nir->info.stage) ? 0 : push_start;
intrin->intrinsic = nir_intrinsic_load_uniform;
nir_intrinsic_set_base(intrin,
nir_intrinsic_base(intrin) -
- push_start);
+ base_offset);
break;
+ }
case nir_intrinsic_load_desc_set_address_intel: {
+ assert(brw_shader_stage_requires_bindless_resources(nir->info.stage));
b->cursor = nir_before_instr(&intrin->instr);
- nir_ssa_def *pc_load = nir_load_uniform(b, 1, 64,
- nir_imul_imm(b, intrin->src[0].ssa, sizeof(uint64_t)),
- .base = offsetof(struct anv_push_constants, desc_sets),
- .range = sizeof_field(struct anv_push_constants, desc_sets),
- .dest_type = nir_type_uint64);
- nir_ssa_def_rewrite_uses(&intrin->dest.ssa, pc_load);
+ nir_def *desc_offset = nir_load_uniform(b, 1, 32,
+ nir_imul_imm(b, intrin->src[0].ssa, sizeof(uint32_t)),
+ .base = offsetof(struct anv_push_constants,
+ desc_surface_offsets),
+ .range = sizeof_field(struct anv_push_constants,
+ desc_surface_offsets),
+ .dest_type = nir_type_uint32);
+ desc_offset = nir_iand_imm(b, desc_offset, ANV_DESCRIPTOR_SET_OFFSET_MASK);
+ if (desc_type == ANV_PIPELINE_DESCRIPTOR_SET_LAYOUT_TYPE_BUFFER &&
+ !pdevice->uses_ex_bso) {
+ nir_def *bindless_base_offset = nir_load_uniform(
+ b, 1, 32,
+ nir_imm_int(b, 0),
+ .base = offsetof(struct anv_push_constants,
+ surfaces_base_offset),
+ .range = sizeof_field(struct anv_push_constants,
+ surfaces_base_offset),
+ .dest_type = nir_type_uint32);
+ desc_offset = nir_iadd(b, bindless_base_offset, desc_offset);
+ }
+ nir_def *desc_addr =
+ nir_pack_64_2x32_split(
+ b, desc_offset,
+ nir_load_reloc_const_intel(
+ b,
+ desc_type == ANV_PIPELINE_DESCRIPTOR_SET_LAYOUT_TYPE_BUFFER ?
+ BRW_SHADER_RELOC_DESCRIPTORS_BUFFER_ADDR_HIGH :
+ BRW_SHADER_RELOC_DESCRIPTORS_ADDR_HIGH));
+ nir_def_rewrite_uses(&intrin->def, desc_addr);
+ break;
+ }
+
+ case nir_intrinsic_load_desc_set_dynamic_index_intel: {
+ b->cursor = nir_before_instr(&intrin->instr);
+ nir_def *pc_load = nir_load_uniform(b, 1, 32,
+ nir_imul_imm(b, intrin->src[0].ssa, sizeof(uint32_t)),
+ .base = offsetof(struct anv_push_constants,
+ desc_surface_offsets),
+ .range = sizeof_field(struct anv_push_constants,
+ desc_surface_offsets),
+ .dest_type = nir_type_uint32);
+ pc_load = nir_iand_imm(
+ b, pc_load, ANV_DESCRIPTOR_SET_DYNAMIC_INDEX_MASK);
+ nir_def_rewrite_uses(&intrin->def, pc_load);
break;
}
@@ -176,15 +240,9 @@ anv_nir_compute_push_layout(const struct anv_physical_device *pdevice,
}
if (push_ubo_ranges) {
- brw_nir_analyze_ubo_ranges(compiler, nir, NULL, prog_data->ubo_ranges);
+ brw_nir_analyze_ubo_ranges(compiler, nir, prog_data->ubo_ranges);
- /* The vec4 back-end pushes at most 32 regs while the scalar back-end
- * pushes up to 64. This is primarily because the scalar back-end has a
- * massively more competent register allocator and so the risk of
- * spilling due to UBO pushing isn't nearly as high.
- */
- const unsigned max_push_regs =
- compiler->scalar_stage[nir->info.stage] ? 64 : 32;
+ const unsigned max_push_regs = 64;
unsigned total_push_regs = push_constant_range.length;
for (unsigned i = 0; i < 4; i++) {
@@ -199,7 +257,7 @@ anv_nir_compute_push_layout(const struct anv_physical_device *pdevice,
if (push_constant_range.length > 0)
map->push_ranges[n++] = push_constant_range;
- if (robust_buffer_access) {
+ if (robust_flags & BRW_ROBUSTNESS_UBO) {
const uint32_t push_reg_mask_offset =
offsetof(struct anv_push_constants, push_reg_mask[nir->info.stage]);
assert(push_reg_mask_offset >= push_start);
@@ -214,13 +272,14 @@ anv_nir_compute_push_layout(const struct anv_physical_device *pdevice,
if (ubo_range->length == 0)
continue;
- if (n >= 4 || (n == 3 && compiler->constant_buffer_0_is_relative)) {
+ if (n >= 4) {
memset(ubo_range, 0, sizeof(*ubo_range));
continue;
}
+ assert(ubo_range->block < push_map->block_count);
const struct anv_pipeline_binding *binding =
- &map->surface_to_descriptor[ubo_range->block];
+ &push_map->block_to_descriptor[ubo_range->block];
map->push_ranges[n++] = (struct anv_push_range) {
.set = binding->set,
@@ -231,7 +290,8 @@ anv_nir_compute_push_layout(const struct anv_physical_device *pdevice,
};
/* We only bother to shader-zero pushed client UBOs */
- if (binding->set < MAX_SETS && robust_buffer_access) {
+ if (binding->set < MAX_SETS &&
+ (robust_flags & BRW_ROBUSTNESS_UBO)) {
prog_data->zero_push_reg |= BITFIELD64_RANGE(range_start_reg,
ubo_range->length);
}
@@ -250,6 +310,27 @@ anv_nir_compute_push_layout(const struct anv_physical_device *pdevice,
map->push_ranges[0] = push_constant_range;
}
+ if (nir->info.stage == MESA_SHADER_FRAGMENT && fragment_dynamic) {
+ struct brw_wm_prog_data *wm_prog_data =
+ container_of(prog_data, struct brw_wm_prog_data, base);
+
+ const uint32_t fs_msaa_flags_offset =
+ offsetof(struct anv_push_constants, gfx.fs_msaa_flags);
+ assert(fs_msaa_flags_offset >= push_start);
+ wm_prog_data->msaa_flags_param =
+ (fs_msaa_flags_offset - push_start) / 4;
+ }
+
+#if 0
+ fprintf(stderr, "stage=%s push ranges:\n", gl_shader_stage_name(nir->info.stage));
+ for (unsigned i = 0; i < ARRAY_SIZE(map->push_ranges); i++)
+ fprintf(stderr, " range%i: %03u-%03u set=%u index=%u\n", i,
+ map->push_ranges[i].start,
+ map->push_ranges[i].length,
+ map->push_ranges[i].set,
+ map->push_ranges[i].index);
+#endif
+
/* Now that we're done computing the push constant portion of the
* bind map, hash it. This lets us quickly determine if the actual
* mapping has changed and not just a no-op pipeline change.
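/* Editorial sketch, not part of the patch: the push constant window
 * arithmetic above, with made-up byte offsets.  The start is rounded
 * down to 32 bytes, the size is DWORD-aligned for the scalar backend,
 * and nr_params counts DWORDs.
 */
#include <stdio.h>

#define ROUND_DOWN_TO(x, a) ((x) & ~((a) - 1u))
#define ALIGN(x, a)         (((x) + (a) - 1u) & ~((a) - 1u))

int main(void)
{
   unsigned push_start = 44, push_end = 131;   /* byte offsets touched */

   push_start = ROUND_DOWN_TO(push_start, 32);                /* 32  */
   unsigned num_uniforms = ALIGN(push_end - push_start, 4);   /* 100 */
   unsigned nr_params = num_uniforms / 4;                     /* 25  */

   printf("start=%u size=%u dwords=%u\n", push_start, num_uniforms, nr_params);
   return 0;
}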
diff --git a/src/intel/vulkan/anv_nir_lower_load_patch_vertices_in.c b/src/intel/vulkan/anv_nir_lower_load_patch_vertices_in.c
new file mode 100644
index 00000000000..a9e0fde6f2e
--- /dev/null
+++ b/src/intel/vulkan/anv_nir_lower_load_patch_vertices_in.c
@@ -0,0 +1,66 @@
+/*
+ * Copyright © 2023 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+/**
+ * This file implements the lowering required for
+ * VK_EXT_extended_dynamic_state2 extendedDynamicState2PatchControlPoints.
+ *
+ * When VK_DYNAMIC_STATE_PATCH_CONTROL_POINTS_EXT is set on a pipeline, we
+ * need to compile the TCS shader assuming the max (32) number of control
+ * points. The actual value is provided through push constants.
+ */
+
+#include "anv_nir.h"
+#include "nir_builder.h"
+
+#define sizeof_field(type, field) sizeof(((type *)0)->field)
+
+static bool
+lower_patch_vertices_in_instr(nir_builder *b, nir_intrinsic_instr *load,
+ UNUSED void *_data)
+{
+ if (load->intrinsic != nir_intrinsic_load_patch_vertices_in)
+ return false;
+
+ b->cursor = nir_before_instr(&load->instr);
+
+ nir_def_rewrite_uses(
+ &load->def,
+ nir_load_push_constant(
+ b, 1, 32,
+ nir_imm_int(b, 0),
+ .base = offsetof(struct anv_push_constants, gfx.tcs_input_vertices),
+ .range = sizeof_field(struct anv_push_constants, gfx.tcs_input_vertices)));
+ nir_instr_remove(&load->instr);
+
+ return true;
+}
+
+bool
+anv_nir_lower_load_patch_vertices_in(nir_shader *shader)
+{
+ return nir_shader_intrinsics_pass(shader, lower_patch_vertices_in_instr,
+ nir_metadata_block_index |
+ nir_metadata_dominance,
+ NULL);
+}
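/* Editorial sketch, not part of the patch: the situation this pass
 * handles.  With dynamic patch control points the TCS is compiled for
 * the maximum of 32, and gl_PatchVerticesIn is lowered to a push
 * constant load of gfx.tcs_input_vertices.  The values below are
 * illustrative only.
 */
#include <stdbool.h>
#include <stdio.h>

int main(void)
{
   bool dynamic_control_points = true;   /* VK_DYNAMIC_STATE_PATCH_CONTROL_POINTS_EXT */
   unsigned static_count = 4;            /* patchControlPoints from the pipeline */

   unsigned compile_count = dynamic_control_points ? 32 : static_count;
   printf("compile TCS for %u control points; gl_PatchVerticesIn %s\n",
          compile_count,
          dynamic_control_points ? "comes from push constants at draw time"
                                 : "stays a compile-time constant");
   return 0;
}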
diff --git a/src/intel/vulkan/anv_nir_lower_multiview.c b/src/intel/vulkan/anv_nir_lower_multiview.c
index 63d9f5a2e8e..b26dd6970db 100644
--- a/src/intel/vulkan/anv_nir_lower_multiview.c
+++ b/src/intel/vulkan/anv_nir_lower_multiview.c
@@ -23,7 +23,7 @@
#include "anv_nir.h"
#include "nir/nir_builder.h"
-#include "util/debug.h"
+#include "util/u_debug.h"
/**
* This file implements the lowering required for VK_KHR_multiview.
@@ -42,11 +42,12 @@ struct lower_multiview_state {
uint32_t view_mask;
- nir_ssa_def *instance_id;
- nir_ssa_def *view_index;
+ nir_def *instance_id_with_views;
+ nir_def *instance_id;
+ nir_def *view_index;
};
-static nir_ssa_def *
+static nir_def *
build_instance_id(struct lower_multiview_state *state)
{
assert(state->builder.shader->info.stage == MESA_SHADER_VERTEX);
@@ -54,27 +55,31 @@ build_instance_id(struct lower_multiview_state *state)
if (state->instance_id == NULL) {
nir_builder *b = &state->builder;
- b->cursor = nir_before_block(nir_start_block(b->impl));
+ b->cursor =
+ nir_after_instr(state->instance_id_with_views->parent_instr);
/* We use instancing for implementing multiview. The actual instance id
* is given by dividing instance_id by the number of views in this
* subpass.
*/
state->instance_id =
- nir_idiv(b, nir_load_instance_id(b),
+ nir_idiv(b, state->instance_id_with_views,
nir_imm_int(b, util_bitcount(state->view_mask)));
}
return state->instance_id;
}
-static nir_ssa_def *
+static nir_def *
build_view_index(struct lower_multiview_state *state)
{
+ assert(state->builder.shader->info.stage != MESA_SHADER_FRAGMENT);
+
if (state->view_index == NULL) {
nir_builder *b = &state->builder;
- b->cursor = nir_before_block(nir_start_block(b->impl));
+ b->cursor =
+ nir_after_instr(state->instance_id_with_views->parent_instr);
assert(state->view_mask != 0);
if (util_bitcount(state->view_mask) == 1) {
@@ -88,9 +93,9 @@ build_view_index(struct lower_multiview_state *state)
* id is given by instance_id % view_count. We then have to convert
* that to an actual view id.
*/
- nir_ssa_def *compacted =
- nir_umod(b, nir_load_instance_id(b),
- nir_imm_int(b, util_bitcount(state->view_mask)));
+ nir_def *compacted =
+ nir_umod_imm(b, state->instance_id_with_views,
+ util_bitcount(state->view_mask));
if (util_is_power_of_two_or_zero(state->view_mask + 1)) {
/* If we have a full view mask, then compacted is what we want */
@@ -107,24 +112,24 @@ build_view_index(struct lower_multiview_state *state)
remap |= (uint64_t)bit << (i++ * 4);
}
- nir_ssa_def *shift = nir_imul(b, compacted, nir_imm_int(b, 4));
+ nir_def *shift = nir_imul_imm(b, compacted, 4);
/* One of these days, when we have int64 everywhere, this will be
* easier.
*/
- nir_ssa_def *shifted;
+ nir_def *shifted;
if (remap <= UINT32_MAX) {
shifted = nir_ushr(b, nir_imm_int(b, remap), shift);
} else {
- nir_ssa_def *shifted_low =
+ nir_def *shifted_low =
nir_ushr(b, nir_imm_int(b, remap), shift);
- nir_ssa_def *shifted_high =
+ nir_def *shifted_high =
nir_ushr(b, nir_imm_int(b, remap >> 32),
- nir_isub(b, shift, nir_imm_int(b, 32)));
- shifted = nir_bcsel(b, nir_ilt(b, shift, nir_imm_int(b, 32)),
+ nir_iadd_imm(b, shift, -32));
+ shifted = nir_bcsel(b, nir_ilt_imm(b, shift, 32),
shifted_low, shifted_high);
}
- state->view_index = nir_iand(b, shifted, nir_imm_int(b, 0xf));
+ state->view_index = nir_iand_imm(b, shifted, 0xf);
}
} else {
const struct glsl_type *type = glsl_int_type();
@@ -157,7 +162,7 @@ is_load_view_index(const nir_instr *instr, const void *data)
nir_instr_as_intrinsic(instr)->intrinsic == nir_intrinsic_load_view_index;
}
-static nir_ssa_def *
+static nir_def *
replace_load_view_index_with_zero(struct nir_builder *b,
nir_instr *instr, void *data)
{
@@ -165,12 +170,19 @@ replace_load_view_index_with_zero(struct nir_builder *b,
return nir_imm_zero(b, 1, 32);
}
+static nir_def *
+replace_load_view_index_with_layer_id(struct nir_builder *b,
+ nir_instr *instr, void *data)
+{
+ assert(is_load_view_index(instr, data));
+ return nir_load_layer_id(b);
+}
+
bool
-anv_nir_lower_multiview(nir_shader *shader,
- struct anv_graphics_pipeline *pipeline)
+anv_nir_lower_multiview(nir_shader *shader, uint32_t view_mask,
+ bool use_primitive_replication)
{
assert(shader->info.stage != MESA_SHADER_COMPUTE);
- uint32_t view_mask = pipeline->subpass->view_mask;
/* If multiview isn't enabled, just lower the ViewIndex builtin to zero. */
if (view_mask == 0) {
@@ -178,6 +190,11 @@ anv_nir_lower_multiview(nir_shader *shader,
replace_load_view_index_with_zero, NULL);
}
+ if (shader->info.stage == MESA_SHADER_FRAGMENT) {
+ return nir_shader_lower_instructions(shader, is_load_view_index,
+ replace_load_view_index_with_layer_id, NULL);
+ }
+
/* This pass assumes a single entrypoint */
nir_function_impl *entrypoint = nir_shader_get_entrypoint(shader);
@@ -186,16 +203,11 @@ anv_nir_lower_multiview(nir_shader *shader,
* view, then it is possible to use the feature instead of instancing to
* implement multiview.
*/
- if (pipeline->use_primitive_replication) {
- if (shader->info.stage == MESA_SHADER_FRAGMENT)
- return false;
-
- bool progress = nir_lower_multiview(shader, pipeline->subpass->view_mask);
+ if (use_primitive_replication) {
+ bool progress = nir_lower_multiview(shader, view_mask);
if (progress) {
- nir_builder b;
- nir_builder_init(&b, entrypoint);
- b.cursor = nir_before_cf_list(&entrypoint->body);
+ nir_builder b = nir_builder_at(nir_before_impl(entrypoint));
/* Fill Layer ID with zero. Replication will use that as base to
* apply the RTAI offsets.
@@ -214,81 +226,92 @@ anv_nir_lower_multiview(nir_shader *shader,
.view_mask = view_mask,
};
- nir_builder_init(&state.builder, entrypoint);
+ state.builder = nir_builder_at(nir_before_impl(entrypoint));
+ nir_builder *b = &state.builder;
- bool progress = false;
- nir_foreach_block(block, entrypoint) {
- nir_foreach_instr_safe(instr, block) {
- if (instr->type != nir_instr_type_intrinsic)
- continue;
-
- nir_intrinsic_instr *load = nir_instr_as_intrinsic(instr);
-
- if (load->intrinsic != nir_intrinsic_load_instance_id &&
- load->intrinsic != nir_intrinsic_load_view_index)
- continue;
-
- assert(load->dest.is_ssa);
-
- nir_ssa_def *value;
- if (load->intrinsic == nir_intrinsic_load_instance_id) {
- value = build_instance_id(&state);
- } else {
- assert(load->intrinsic == nir_intrinsic_load_view_index);
- value = build_view_index(&state);
- }
-
- nir_ssa_def_rewrite_uses(&load->dest.ssa, value);
-
- nir_instr_remove(&load->instr);
- progress = true;
- }
- }
+ /* Save the original "instance ID" which is the actual instance ID
+ * multiplied by the number of views.
+ */
+ state.instance_id_with_views = nir_load_instance_id(b);
/* The view index is available in all stages but the instance id is only
* available in the VS. If it's not a fragment shader, we need to pass
* the view index on to the next stage.
*/
- if (shader->info.stage != MESA_SHADER_FRAGMENT) {
- nir_ssa_def *view_index = build_view_index(&state);
+ nir_def *view_index = build_view_index(&state);
+
+ assert(view_index->parent_instr->block == nir_start_block(entrypoint));
+ b->cursor = nir_after_instr(view_index->parent_instr);
- nir_builder *b = &state.builder;
+ /* Unless there is only one possible view index (that would be set
+ * directly), pass it to the next stage.
+ */
+ nir_variable *view_index_out = NULL;
+ if (util_bitcount(state.view_mask) != 1) {
+ view_index_out = nir_variable_create(shader, nir_var_shader_out,
+ glsl_int_type(), "view index");
+ view_index_out->data.location = VARYING_SLOT_VIEW_INDEX;
+ }
- assert(view_index->parent_instr->block == nir_start_block(entrypoint));
- b->cursor = nir_after_instr(view_index->parent_instr);
+ nir_variable *layer_id_out =
+ nir_variable_create(shader, nir_var_shader_out,
+ glsl_int_type(), "layer ID");
+ layer_id_out->data.location = VARYING_SLOT_LAYER;
- /* Unless there is only one possible view index (that would be set
- * directly), pass it to the next stage. */
- if (util_bitcount(state.view_mask) != 1) {
- nir_variable *view_index_out =
- nir_variable_create(shader, nir_var_shader_out,
- glsl_int_type(), "view index");
- view_index_out->data.location = VARYING_SLOT_VIEW_INDEX;
+ if (shader->info.stage != MESA_SHADER_GEOMETRY) {
+ if (view_index_out)
nir_store_var(b, view_index_out, view_index, 0x1);
- }
- nir_variable *layer_id_out =
- nir_variable_create(shader, nir_var_shader_out,
- glsl_int_type(), "layer ID");
- layer_id_out->data.location = VARYING_SLOT_LAYER;
nir_store_var(b, layer_id_out, view_index, 0x1);
-
- progress = true;
}
- if (progress) {
- nir_metadata_preserve(entrypoint, nir_metadata_block_index |
- nir_metadata_dominance);
+ nir_foreach_block(block, entrypoint) {
+ nir_foreach_instr_safe(instr, block) {
+ if (instr->type != nir_instr_type_intrinsic)
+ continue;
+
+ nir_intrinsic_instr *load = nir_instr_as_intrinsic(instr);
+
+ switch (load->intrinsic) {
+ case nir_intrinsic_load_instance_id:
+ if (&load->def != state.instance_id_with_views) {
+ nir_def_rewrite_uses(&load->def, build_instance_id(&state));
+ nir_instr_remove(&load->instr);
+ }
+ break;
+ case nir_intrinsic_load_view_index:
+ nir_def_rewrite_uses(&load->def, view_index);
+ nir_instr_remove(&load->instr);
+ break;
+ case nir_intrinsic_emit_vertex_with_counter:
+ /* In geometry shaders, outputs become undefined after every
+ * EmitVertex() call. We need to re-emit them for each vertex.
+ */
+ b->cursor = nir_before_instr(instr);
+ if (view_index_out)
+ nir_store_var(b, view_index_out, view_index, 0x1);
+
+ nir_store_var(b, layer_id_out, view_index, 0x1);
+ break;
+ default:
+ break;
+ }
+ }
}
- return progress;
+ nir_metadata_preserve(entrypoint, nir_metadata_block_index |
+ nir_metadata_dominance);
+
+ return true;
}
bool
-anv_check_for_primitive_replication(nir_shader **shaders,
- struct anv_graphics_pipeline *pipeline)
+anv_check_for_primitive_replication(struct anv_device *device,
+ VkShaderStageFlags stages,
+ nir_shader **shaders,
+ uint32_t view_mask)
{
- assert(pipeline->base.device->info.ver >= 12);
+ assert(device->info->ver >= 12);
static int primitive_replication_max_views = -1;
if (primitive_replication_max_views < 0) {
@@ -300,7 +323,7 @@ anv_check_for_primitive_replication(nir_shader **shaders,
primitive_replication_max_views =
MIN2(MAX_VIEWS_FOR_PRIMITIVE_REPLICATION,
- env_var_as_unsigned("ANV_PRIMITIVE_REPLICATION_MAX_VIEWS",
+ debug_get_num_option("ANV_PRIMITIVE_REPLICATION_MAX_VIEWS",
default_max_views));
}
@@ -308,18 +331,15 @@ anv_check_for_primitive_replication(nir_shader **shaders,
* later than Vertex. In that case only the last stage can refer to
* gl_ViewIndex.
*/
- if (pipeline->active_stages != (VK_SHADER_STAGE_VERTEX_BIT |
- VK_SHADER_STAGE_FRAGMENT_BIT)) {
+ if (stages & ~(VK_SHADER_STAGE_VERTEX_BIT | VK_SHADER_STAGE_FRAGMENT_BIT))
return false;
- }
- uint32_t view_mask = pipeline->subpass->view_mask;
- int view_count = util_bitcount(view_mask);
- if (view_count == 1 || view_count > primitive_replication_max_views)
+ /* It's possible we have no vertex shader yet (with pipeline libraries) */
+ if (!(stages & VK_SHADER_STAGE_VERTEX_BIT))
return false;
- /* We can't access the view index in the fragment shader. */
- if (nir_shader_uses_view_index(shaders[MESA_SHADER_FRAGMENT]))
+ int view_count = util_bitcount(view_mask);
+ if (view_count == 1 || view_count > primitive_replication_max_views)
return false;
return nir_can_lower_multiview(shaders[MESA_SHADER_VERTEX]);
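/* Editorial sketch, not part of the patch: the instancing math used by
 * this lowering.  The hardware instance ID is the application instance
 * ID times the view count; a packed nibble table remaps the compacted
 * view slot back to the real view index.  view_mask below is made up.
 */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
   const uint32_t view_mask = 0xa;   /* views 1 and 3 enabled */
   unsigned view_count = 0;
   uint64_t remap = 0;
   unsigned slot = 0;

   for (unsigned bit = 0; bit < 32; bit++) {
      if (view_mask & (1u << bit)) {
         remap |= (uint64_t)bit << (slot++ * 4);
         view_count++;
      }
   }

   for (uint32_t hw_instance = 0; hw_instance < 4; hw_instance++) {
      uint32_t instance_id = hw_instance / view_count;
      uint32_t compacted   = hw_instance % view_count;
      uint32_t view_index  = (remap >> (compacted * 4)) & 0xf;
      printf("hw=%u -> instance=%u view=%u\n", hw_instance, instance_id, view_index);
   }
   return 0;
}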
diff --git a/src/intel/vulkan/anv_nir_lower_resource_intel.c b/src/intel/vulkan/anv_nir_lower_resource_intel.c
new file mode 100644
index 00000000000..92b18bf51b9
--- /dev/null
+++ b/src/intel/vulkan/anv_nir_lower_resource_intel.c
@@ -0,0 +1,170 @@
+/*
+ * Copyright © 2022 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "anv_nir.h"
+#include "nir_builder.h"
+
+/* This pass updates the block index in the resource_intel intrinsics if the
+ * array index is constant.
+ *
+ * This pass must be run before anv_nir_compute_push_layout().
+ */
+static bool
+update_resource_intel_block(nir_builder *b, nir_intrinsic_instr *intrin,
+ UNUSED void *data)
+{
+ if (intrin->intrinsic != nir_intrinsic_resource_intel)
+ return false;
+
+ /* If the array index into the descriptor binding is not constant, we won't
+ * be able to turn this load_ubo into a push constant. In that case, or if
+ * the resource is not flagged pushable, set the block to 0xffffffff.
+ *
+ * Otherwise, add the array index to the block index so that when
+ * anv_nir_compute_push_layout() uses the block value, it picks the right
+ * surface within the binding's array.
+ */
+ if (!nir_src_is_const(intrin->src[2]) ||
+ !(nir_intrinsic_resource_access_intel(intrin) &
+ nir_resource_intel_pushable)) {
+ nir_intrinsic_set_resource_block_intel(intrin, 0xffffffff);
+ nir_intrinsic_set_resource_access_intel(
+ intrin,
+ nir_intrinsic_resource_access_intel(intrin) &
+ ~nir_resource_intel_pushable);
+ } else {
+ nir_intrinsic_set_resource_block_intel(
+ intrin,
+ nir_intrinsic_resource_block_intel(intrin) +
+ nir_src_as_uint(intrin->src[2]));
+ }
+
+ return true;
+}
+
+bool
+anv_nir_update_resource_intel_block(nir_shader *shader)
+{
+ return nir_shader_intrinsics_pass(shader, update_resource_intel_block,
+ nir_metadata_all,
+ NULL);
+}
+
+struct lower_resource_state {
+ enum anv_descriptor_set_layout_type desc_type;
+ const struct anv_physical_device *device;
+};
+
+/* This pass lowers the resource_intel surface_index source, combining the
+ * descriptor set offset with the surface offset in the descriptor set.
+ *
+ * This pass must be run after anv_nir_compute_push_layout() because we want
+ * the push constant selection to see whether the surface offset is constant.
+ * Once the offsets are combined, that constant detection no longer works.
+ */
+static bool
+lower_resource_intel(nir_builder *b, nir_intrinsic_instr *intrin, void *data)
+{
+ if (intrin->intrinsic != nir_intrinsic_resource_intel)
+ return false;
+
+ const bool is_bindless =
+ (nir_intrinsic_resource_access_intel(intrin) &
+ nir_resource_intel_bindless) != 0;
+ const bool is_sampler =
+ (nir_intrinsic_resource_access_intel(intrin) &
+ nir_resource_intel_sampler) != 0;
+ const bool is_embedded_sampler =
+ (nir_intrinsic_resource_access_intel(intrin) &
+ nir_resource_intel_sampler_embedded) != 0;
+ const struct lower_resource_state *state = data;
+
+ /* Ignore binding table accesses & embedded samplers */
+ if (is_embedded_sampler) {
+ assert(state->desc_type == ANV_PIPELINE_DESCRIPTOR_SET_LAYOUT_TYPE_BUFFER);
+ return false;
+ }
+
+ if (!is_bindless)
+ return true;
+
+ b->cursor = nir_before_instr(&intrin->instr);
+
+ nir_def *set_offset = intrin->src[0].ssa;
+ nir_def *binding_offset = intrin->src[1].ssa;
+
+ /* With indirect descriptors, the surface handles are loaded from the
+ * descriptor buffer and do not need any offset.
+ */
+ if (state->desc_type == ANV_PIPELINE_DESCRIPTOR_SET_LAYOUT_TYPE_DIRECT ||
+ state->desc_type == ANV_PIPELINE_DESCRIPTOR_SET_LAYOUT_TYPE_BUFFER) {
+ if (!state->device->uses_ex_bso) {
+ /* We're trying to reduce the number of instructions in the shaders
+ * to compute surface handles. The assumption is that we're using
+ * more surface handles than sampler handles (UBO, SSBO, images,
+ * etc...) so it's worth optimizing that case.
+ *
+ * Surface handles in the extended descriptor message have to be
+ * shifted left by 6 prior to ex_bso (bits 31:12 in extended
+ * descriptor, match bits 25:6 of the surface handle). We have to
+ * combine 2 parts in the shader to build the final surface handle,
+ * base offset of the descriptor set (in the push constant, located
+ * in resource_intel::src[0]) and the relative descriptor offset
+ * (resource_intel::src[1]).
+ *
+ * For convenience, up to here, resource_intel::src[1] is in bytes.
+ * We now have to shift it left by 6 to match the shifted left by 6
+ * done for the push constant value provided in
+ * resource_intel::src[0]. That way the shader can just do a single
+ * ADD and get the surface handle.
+ */
+ if (!is_sampler)
+ binding_offset = nir_ishl_imm(b, binding_offset, 6);
+ }
+
+ nir_src_rewrite(&intrin->src[1],
+ nir_iadd(b, set_offset, binding_offset));
+ }
+
+ /* Now-unused values: set offset, array index */
+ nir_src_rewrite(&intrin->src[0], nir_imm_int(b, 0xdeaddeed));
+ nir_src_rewrite(&intrin->src[2], nir_imm_int(b, 0xdeaddeed));
+
+ return true;
+}
+
+bool
+anv_nir_lower_resource_intel(nir_shader *shader,
+ const struct anv_physical_device *device,
+ enum anv_descriptor_set_layout_type desc_type)
+{
+ struct lower_resource_state state = {
+ .desc_type = desc_type,
+ .device = device,
+ };
+ return nir_shader_intrinsics_pass(shader, lower_resource_intel,
+ nir_metadata_block_index |
+ nir_metadata_dominance,
+ &state);
+}
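/* Editorial sketch, not part of the patch: the single-ADD surface handle
 * combination described above for the non-ex_bso buffer layout.  The
 * set base offset is assumed to arrive pre-shifted by 6 in the push
 * constant; the binding offset is in bytes and gets the same shift.
 * All values are made up.
 */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
   uint32_t set_base_shifted     = 0x1000u << 6;  /* resource_intel::src[0] */
   uint32_t binding_offset_bytes = 0x40;          /* resource_intel::src[1] */

   uint32_t handle = set_base_shifted + (binding_offset_bytes << 6);
   printf("combined surface handle bits: 0x%08x\n", handle);
   return 0;
}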
diff --git a/src/intel/vulkan/anv_nir_lower_ubo_loads.c b/src/intel/vulkan/anv_nir_lower_ubo_loads.c
index 35b963835e6..c85c656d296 100644
--- a/src/intel/vulkan/anv_nir_lower_ubo_loads.c
+++ b/src/intel/vulkan/anv_nir_lower_ubo_loads.c
@@ -25,29 +25,26 @@
#include "nir_builder.h"
static bool
-lower_ubo_load_instr(nir_builder *b, nir_instr *instr, UNUSED void *_data)
+lower_ubo_load_instr(nir_builder *b, nir_intrinsic_instr *load,
+ UNUSED void *_data)
{
- if (instr->type != nir_instr_type_intrinsic)
- return false;
-
- nir_intrinsic_instr *load = nir_instr_as_intrinsic(instr);
if (load->intrinsic != nir_intrinsic_load_global_constant_offset &&
load->intrinsic != nir_intrinsic_load_global_constant_bounded)
return false;
- b->cursor = nir_before_instr(instr);
+ b->cursor = nir_before_instr(&load->instr);
- nir_ssa_def *base_addr = load->src[0].ssa;
- nir_ssa_def *bound = NULL;
+ nir_def *base_addr = load->src[0].ssa;
+ nir_def *bound = NULL;
if (load->intrinsic == nir_intrinsic_load_global_constant_bounded)
bound = load->src[2].ssa;
- unsigned bit_size = load->dest.ssa.bit_size;
+ unsigned bit_size = load->def.bit_size;
assert(bit_size >= 8 && bit_size % 8 == 0);
unsigned byte_size = bit_size / 8;
- nir_ssa_def *val;
- if (nir_src_is_const(load->src[1])) {
+ nir_def *val;
+ if (!nir_src_is_divergent(load->src[0]) && nir_src_is_const(load->src[1])) {
uint32_t offset = nir_src_as_uint(load->src[1]);
/* Things should be component-aligned. */
@@ -59,17 +56,16 @@ lower_ubo_load_instr(nir_builder *b, nir_instr *instr, UNUSED void *_data)
uint64_t aligned_offset = offset - suboffset;
/* Load two just in case we go over a 64B boundary */
- nir_ssa_def *data[2];
+ nir_def *data[2];
for (unsigned i = 0; i < 2; i++) {
- nir_ssa_def *pred;
+ nir_def *pred;
if (bound) {
- pred = nir_ilt(b, nir_imm_int(b, aligned_offset + i * 64 + 63),
- bound);
+ pred = nir_igt_imm(b, bound, aligned_offset + i * 64 + 63);
} else {
pred = nir_imm_true(b);
}
- nir_ssa_def *addr = nir_iadd_imm(b, base_addr,
+ nir_def *addr = nir_iadd_imm(b, base_addr,
aligned_offset + i * 64);
data[i] = nir_load_global_const_block_intel(b, 16, addr, pred);
@@ -78,21 +74,21 @@ lower_ubo_load_instr(nir_builder *b, nir_instr *instr, UNUSED void *_data)
val = nir_extract_bits(b, data, 2, suboffset * 8,
load->num_components, bit_size);
} else {
- nir_ssa_def *offset = load->src[1].ssa;
- nir_ssa_def *addr = nir_iadd(b, base_addr, nir_u2u64(b, offset));
+ nir_def *offset = load->src[1].ssa;
+ nir_def *addr = nir_iadd(b, base_addr, nir_u2u64(b, offset));
if (bound) {
- nir_ssa_def *zero = nir_imm_zero(b, load->num_components, bit_size);
+ nir_def *zero = nir_imm_zero(b, load->num_components, bit_size);
unsigned load_size = byte_size * load->num_components;
- nir_ssa_def *in_bounds =
+ nir_def *in_bounds =
nir_ilt(b, nir_iadd_imm(b, offset, load_size - 1), bound);
nir_push_if(b, in_bounds);
- nir_ssa_def *load_val =
- nir_build_load_global_constant(b, load->dest.ssa.num_components,
- load->dest.ssa.bit_size, addr,
+ nir_def *load_val =
+ nir_build_load_global_constant(b, load->def.num_components,
+ load->def.bit_size, addr,
.access = nir_intrinsic_access(load),
.align_mul = nir_intrinsic_align_mul(load),
.align_offset = nir_intrinsic_align_offset(load));
@@ -101,15 +97,15 @@ lower_ubo_load_instr(nir_builder *b, nir_instr *instr, UNUSED void *_data)
val = nir_if_phi(b, load_val, zero);
} else {
- val = nir_build_load_global_constant(b, load->dest.ssa.num_components,
- load->dest.ssa.bit_size, addr,
+ val = nir_build_load_global_constant(b, load->def.num_components,
+ load->def.bit_size, addr,
.access = nir_intrinsic_access(load),
.align_mul = nir_intrinsic_align_mul(load),
.align_offset = nir_intrinsic_align_offset(load));
}
}
- nir_ssa_def_rewrite_uses(&load->dest.ssa, val);
+ nir_def_rewrite_uses(&load->def, val);
nir_instr_remove(&load->instr);
return true;
@@ -118,8 +114,7 @@ lower_ubo_load_instr(nir_builder *b, nir_instr *instr, UNUSED void *_data)
bool
anv_nir_lower_ubo_loads(nir_shader *shader)
{
- return nir_shader_instructions_pass(shader, lower_ubo_load_instr,
- nir_metadata_block_index |
- nir_metadata_dominance,
+ return nir_shader_intrinsics_pass(shader, lower_ubo_load_instr,
+ nir_metadata_none,
NULL);
}
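/* Editorial sketch, not part of the patch: the constant-offset path
 * above.  The byte offset is split into a 64B-aligned base and a
 * sub-offset (assumed offset % 64 here), and two 64B block loads are
 * issued so a value straddling a 64B boundary can still be extracted.
 * The offsets are made up.
 */
#include <stdio.h>

int main(void)
{
   unsigned offset = 180;       /* byte offset of the UBO load */
   unsigned load_bytes = 16;    /* e.g. a vec4 */

   unsigned suboffset = offset % 64;              /* 52  */
   unsigned aligned_offset = offset - suboffset;  /* 128 */

   printf("block loads at %u and %u; extract %u bytes at sub-offset %u\n",
          aligned_offset, aligned_offset + 64, load_bytes, suboffset);
   return 0;
}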
diff --git a/src/intel/vulkan/anv_nir_lower_ycbcr_textures.c b/src/intel/vulkan/anv_nir_lower_ycbcr_textures.c
deleted file mode 100644
index a1504120247..00000000000
--- a/src/intel/vulkan/anv_nir_lower_ycbcr_textures.c
+++ /dev/null
@@ -1,367 +0,0 @@
-/*
- * Copyright © 2017 Intel Corporation
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- */
-
-#include "anv_nir.h"
-#include "anv_private.h"
-#include "nir/nir.h"
-#include "nir/nir_builder.h"
-#include "nir/nir_vulkan.h"
-
-struct ycbcr_state {
- nir_builder *builder;
- nir_ssa_def *image_size;
- nir_tex_instr *origin_tex;
- nir_deref_instr *tex_deref;
- struct anv_ycbcr_conversion *conversion;
-};
-
-/* TODO: we should probably replace this with a push constant/uniform. */
-static nir_ssa_def *
-get_texture_size(struct ycbcr_state *state, nir_deref_instr *texture)
-{
- if (state->image_size)
- return state->image_size;
-
- nir_builder *b = state->builder;
- const struct glsl_type *type = texture->type;
- nir_tex_instr *tex = nir_tex_instr_create(b->shader, 1);
-
- tex->op = nir_texop_txs;
- tex->sampler_dim = glsl_get_sampler_dim(type);
- tex->is_array = glsl_sampler_type_is_array(type);
- tex->is_shadow = glsl_sampler_type_is_shadow(type);
- tex->dest_type = nir_type_int32;
-
- tex->src[0].src_type = nir_tex_src_texture_deref;
- tex->src[0].src = nir_src_for_ssa(&texture->dest.ssa);
-
- nir_ssa_dest_init(&tex->instr, &tex->dest,
- nir_tex_instr_dest_size(tex), 32, NULL);
- nir_builder_instr_insert(b, &tex->instr);
-
- state->image_size = nir_i2f32(b, &tex->dest.ssa);
-
- return state->image_size;
-}
-
-static nir_ssa_def *
-implicit_downsampled_coord(nir_builder *b,
- nir_ssa_def *value,
- nir_ssa_def *max_value,
- int div_scale)
-{
- return nir_fadd(b,
- value,
- nir_fdiv(b,
- nir_imm_float(b, 1.0f),
- nir_fmul(b,
- nir_imm_float(b, div_scale),
- max_value)));
-}
-
-static nir_ssa_def *
-implicit_downsampled_coords(struct ycbcr_state *state,
- nir_ssa_def *old_coords,
- const struct anv_format_plane *plane_format)
-{
- nir_builder *b = state->builder;
- struct anv_ycbcr_conversion *conversion = state->conversion;
- nir_ssa_def *image_size = get_texture_size(state, state->tex_deref);
- nir_ssa_def *comp[4] = { NULL, };
- int c;
-
- for (c = 0; c < ARRAY_SIZE(conversion->chroma_offsets); c++) {
- if (plane_format->denominator_scales[c] > 1 &&
- conversion->chroma_offsets[c] == VK_CHROMA_LOCATION_COSITED_EVEN) {
- comp[c] = implicit_downsampled_coord(b,
- nir_channel(b, old_coords, c),
- nir_channel(b, image_size, c),
- plane_format->denominator_scales[c]);
- } else {
- comp[c] = nir_channel(b, old_coords, c);
- }
- }
-
- /* Leave other coordinates untouched */
- for (; c < old_coords->num_components; c++)
- comp[c] = nir_channel(b, old_coords, c);
-
- return nir_vec(b, comp, old_coords->num_components);
-}
-
-static nir_ssa_def *
-create_plane_tex_instr_implicit(struct ycbcr_state *state,
- uint32_t plane)
-{
- nir_builder *b = state->builder;
- struct anv_ycbcr_conversion *conversion = state->conversion;
- const struct anv_format_plane *plane_format =
- &conversion->format->planes[plane];
- nir_tex_instr *old_tex = state->origin_tex;
- nir_tex_instr *tex = nir_tex_instr_create(b->shader, old_tex->num_srcs + 1);
-
- for (uint32_t i = 0; i < old_tex->num_srcs; i++) {
- tex->src[i].src_type = old_tex->src[i].src_type;
-
- switch (old_tex->src[i].src_type) {
- case nir_tex_src_coord:
- if (plane_format->has_chroma && conversion->chroma_reconstruction) {
- assert(old_tex->src[i].src.is_ssa);
- tex->src[i].src =
- nir_src_for_ssa(implicit_downsampled_coords(state,
- old_tex->src[i].src.ssa,
- plane_format));
- break;
- }
- FALLTHROUGH;
- default:
- nir_src_copy(&tex->src[i].src, &old_tex->src[i].src);
- break;
- }
- }
- tex->src[tex->num_srcs - 1].src = nir_src_for_ssa(nir_imm_int(b, plane));
- tex->src[tex->num_srcs - 1].src_type = nir_tex_src_plane;
-
- tex->sampler_dim = old_tex->sampler_dim;
- tex->dest_type = old_tex->dest_type;
-
- tex->op = old_tex->op;
- tex->coord_components = old_tex->coord_components;
- tex->is_new_style_shadow = old_tex->is_new_style_shadow;
- tex->component = old_tex->component;
-
- tex->texture_index = old_tex->texture_index;
- tex->sampler_index = old_tex->sampler_index;
- tex->is_array = old_tex->is_array;
-
- nir_ssa_dest_init(&tex->instr, &tex->dest,
- old_tex->dest.ssa.num_components,
- nir_dest_bit_size(old_tex->dest), NULL);
- nir_builder_instr_insert(b, &tex->instr);
-
- return &tex->dest.ssa;
-}
-
-static unsigned
-channel_to_component(enum isl_channel_select channel)
-{
- switch (channel) {
- case ISL_CHANNEL_SELECT_RED:
- return 0;
- case ISL_CHANNEL_SELECT_GREEN:
- return 1;
- case ISL_CHANNEL_SELECT_BLUE:
- return 2;
- case ISL_CHANNEL_SELECT_ALPHA:
- return 3;
- default:
- unreachable("invalid channel");
- return 0;
- }
-}
-
-static enum isl_channel_select
-swizzle_channel(struct isl_swizzle swizzle, unsigned channel)
-{
- switch (channel) {
- case 0:
- return swizzle.r;
- case 1:
- return swizzle.g;
- case 2:
- return swizzle.b;
- case 3:
- return swizzle.a;
- default:
- unreachable("invalid channel");
- return 0;
- }
-}
-
-static bool
-try_lower_tex_ycbcr(const struct anv_pipeline_layout *layout,
- nir_builder *builder,
- nir_tex_instr *tex)
-{
- int deref_src_idx = nir_tex_instr_src_index(tex, nir_tex_src_texture_deref);
- assert(deref_src_idx >= 0);
- nir_deref_instr *deref = nir_src_as_deref(tex->src[deref_src_idx].src);
-
- nir_variable *var = nir_deref_instr_get_variable(deref);
- const struct anv_descriptor_set_layout *set_layout =
- layout->set[var->data.descriptor_set].layout;
- const struct anv_descriptor_set_binding_layout *binding =
- &set_layout->binding[var->data.binding];
-
- /* For the following instructions, we don't apply any change and let the
- * instruction apply to the first plane.
- */
- if (tex->op == nir_texop_txs ||
- tex->op == nir_texop_query_levels ||
- tex->op == nir_texop_lod)
- return false;
-
- if (binding->immutable_samplers == NULL)
- return false;
-
- assert(tex->texture_index == 0);
- unsigned array_index = 0;
- if (deref->deref_type != nir_deref_type_var) {
- assert(deref->deref_type == nir_deref_type_array);
- if (!nir_src_is_const(deref->arr.index))
- return false;
- array_index = nir_src_as_uint(deref->arr.index);
- array_index = MIN2(array_index, binding->array_size - 1);
- }
- const struct anv_sampler *sampler = binding->immutable_samplers[array_index];
-
- if (sampler->conversion == NULL)
- return false;
-
- struct ycbcr_state state = {
- .builder = builder,
- .origin_tex = tex,
- .tex_deref = deref,
- .conversion = sampler->conversion,
- };
-
- builder->cursor = nir_before_instr(&tex->instr);
-
- const struct anv_format *format = state.conversion->format;
- const struct isl_format_layout *y_isl_layout = NULL;
- for (uint32_t p = 0; p < format->n_planes; p++) {
- if (!format->planes[p].has_chroma)
- y_isl_layout = isl_format_get_layout(format->planes[p].isl_format);
- }
- assert(y_isl_layout != NULL);
- uint8_t y_bpc = y_isl_layout->channels_array[0].bits;
-
- /* |ycbcr_comp| holds components in the order : Cr-Y-Cb */
- nir_ssa_def *zero = nir_imm_float(builder, 0.0f);
- nir_ssa_def *one = nir_imm_float(builder, 1.0f);
- /* Use extra 2 channels for following swizzle */
- nir_ssa_def *ycbcr_comp[5] = { zero, zero, zero, one, zero };
-
- uint8_t ycbcr_bpcs[5];
- memset(ycbcr_bpcs, y_bpc, sizeof(ycbcr_bpcs));
-
- /* Go through all the planes and gather the samples into a |ycbcr_comp|
- * while applying a swizzle required by the spec:
- *
- * R, G, B should respectively map to Cr, Y, Cb
- */
- for (uint32_t p = 0; p < format->n_planes; p++) {
- const struct anv_format_plane *plane_format = &format->planes[p];
- nir_ssa_def *plane_sample = create_plane_tex_instr_implicit(&state, p);
-
- for (uint32_t pc = 0; pc < 4; pc++) {
- enum isl_channel_select ycbcr_swizzle =
- swizzle_channel(plane_format->ycbcr_swizzle, pc);
- if (ycbcr_swizzle == ISL_CHANNEL_SELECT_ZERO)
- continue;
-
- unsigned ycbcr_component = channel_to_component(ycbcr_swizzle);
- ycbcr_comp[ycbcr_component] = nir_channel(builder, plane_sample, pc);
-
- /* Also compute the number of bits for each component. */
- const struct isl_format_layout *isl_layout =
- isl_format_get_layout(plane_format->isl_format);
- ycbcr_bpcs[ycbcr_component] = isl_layout->channels_array[pc].bits;
- }
- }
-
- /* Now remaps components to the order specified by the conversion. */
- nir_ssa_def *swizzled_comp[4] = { NULL, };
- uint32_t swizzled_bpcs[4] = { 0, };
-
- for (uint32_t i = 0; i < ARRAY_SIZE(state.conversion->mapping); i++) {
- /* Maps to components in |ycbcr_comp| */
- static const uint32_t swizzle_mapping[] = {
- [VK_COMPONENT_SWIZZLE_ZERO] = 4,
- [VK_COMPONENT_SWIZZLE_ONE] = 3,
- [VK_COMPONENT_SWIZZLE_R] = 0,
- [VK_COMPONENT_SWIZZLE_G] = 1,
- [VK_COMPONENT_SWIZZLE_B] = 2,
- [VK_COMPONENT_SWIZZLE_A] = 3,
- };
- const VkComponentSwizzle m = state.conversion->mapping[i];
-
- if (m == VK_COMPONENT_SWIZZLE_IDENTITY) {
- swizzled_comp[i] = ycbcr_comp[i];
- swizzled_bpcs[i] = ycbcr_bpcs[i];
- } else {
- swizzled_comp[i] = ycbcr_comp[swizzle_mapping[m]];
- swizzled_bpcs[i] = ycbcr_bpcs[swizzle_mapping[m]];
- }
- }
-
- nir_ssa_def *result = nir_vec(builder, swizzled_comp, 4);
- if (state.conversion->ycbcr_model != VK_SAMPLER_YCBCR_MODEL_CONVERSION_RGB_IDENTITY) {
- result = nir_convert_ycbcr_to_rgb(builder,
- state.conversion->ycbcr_model,
- state.conversion->ycbcr_range,
- result,
- swizzled_bpcs);
- }
-
- nir_ssa_def_rewrite_uses(&tex->dest.ssa, result);
- nir_instr_remove(&tex->instr);
-
- return true;
-}
-
-bool
-anv_nir_lower_ycbcr_textures(nir_shader *shader,
- const struct anv_pipeline_layout *layout)
-{
- bool progress = false;
-
- nir_foreach_function(function, shader) {
- if (!function->impl)
- continue;
-
- bool function_progress = false;
- nir_builder builder;
- nir_builder_init(&builder, function->impl);
-
- nir_foreach_block(block, function->impl) {
- nir_foreach_instr_safe(instr, block) {
- if (instr->type != nir_instr_type_tex)
- continue;
-
- nir_tex_instr *tex = nir_instr_as_tex(instr);
- function_progress |= try_lower_tex_ycbcr(layout, &builder, tex);
- }
- }
-
- if (function_progress) {
- nir_metadata_preserve(function->impl,
- nir_metadata_block_index |
- nir_metadata_dominance);
- }
-
- progress |= function_progress;
- }
-
- return progress;
-}
diff --git a/src/intel/vulkan/anv_nir_push_descriptor_analysis.c b/src/intel/vulkan/anv_nir_push_descriptor_analysis.c
new file mode 100644
index 00000000000..c6dcb03769d
--- /dev/null
+++ b/src/intel/vulkan/anv_nir_push_descriptor_analysis.c
@@ -0,0 +1,261 @@
+/*
+ * Copyright © 2022 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "anv_nir.h"
+
+#include "compiler/brw_nir.h"
+
+const struct anv_descriptor_set_layout *
+anv_pipeline_layout_get_push_set(const struct anv_pipeline_sets_layout *layout,
+ uint8_t *set_idx)
+{
+ for (unsigned s = 0; s < ARRAY_SIZE(layout->set); s++) {
+ struct anv_descriptor_set_layout *set_layout = layout->set[s].layout;
+
+ if (!set_layout ||
+ !(set_layout->flags &
+ VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR))
+ continue;
+
+ if (set_idx)
+ *set_idx = s;
+
+ return set_layout;
+ }
+
+ return NULL;
+}
+
+/* This function returns a bitfield of the descriptors used in the push
+ * descriptor set. It must be called before anv_nir_apply_pipeline_layout(),
+ * as the information it needs is lost once the pipeline layout is applied.
+ */
+uint32_t
+anv_nir_compute_used_push_descriptors(nir_shader *shader,
+ const struct anv_pipeline_sets_layout *layout)
+{
+ uint8_t push_set;
+ const struct anv_descriptor_set_layout *push_set_layout =
+ anv_pipeline_layout_get_push_set(layout, &push_set);
+ if (push_set_layout == NULL)
+ return 0;
+
+ uint32_t used_push_bindings = 0;
+ nir_foreach_variable_with_modes(var, shader,
+ nir_var_uniform |
+ nir_var_image |
+ nir_var_mem_ubo |
+ nir_var_mem_ssbo) {
+ if (var->data.descriptor_set == push_set) {
+ uint32_t desc_idx =
+ push_set_layout->binding[var->data.binding].descriptor_index;
+ assert(desc_idx < MAX_PUSH_DESCRIPTORS);
+ used_push_bindings |= BITFIELD_BIT(desc_idx);
+ }
+ }
+
+ nir_foreach_function_impl(impl, shader) {
+ nir_foreach_block(block, impl) {
+ nir_foreach_instr(instr, block) {
+ if (instr->type != nir_instr_type_intrinsic)
+ continue;
+
+ nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
+ if (intrin->intrinsic != nir_intrinsic_vulkan_resource_index)
+ continue;
+
+ uint8_t set = nir_intrinsic_desc_set(intrin);
+ if (set != push_set)
+ continue;
+
+ uint32_t binding = nir_intrinsic_binding(intrin);
+ uint32_t desc_idx =
+ push_set_layout->binding[binding].descriptor_index;
+ assert(desc_idx < MAX_PUSH_DESCRIPTORS);
+
+ used_push_bindings |= BITFIELD_BIT(desc_idx);
+ }
+ }
+ }
+
+ return used_push_bindings;
+}
+
+/* This function checks whether the shader accesses the push descriptor
+ * buffer. This function must be called after anv_nir_compute_push_layout().
+ */
+bool
+anv_nir_loads_push_desc_buffer(nir_shader *nir,
+ const struct anv_pipeline_sets_layout *layout,
+ const struct anv_pipeline_bind_map *bind_map)
+{
+ uint8_t push_set;
+ const struct anv_descriptor_set_layout *push_set_layout =
+ anv_pipeline_layout_get_push_set(layout, &push_set);
+ if (push_set_layout == NULL)
+ return false;
+
+ nir_foreach_function_impl(impl, nir) {
+ nir_foreach_block(block, impl) {
+ nir_foreach_instr(instr, block) {
+ if (instr->type != nir_instr_type_intrinsic)
+ continue;
+
+ nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
+ if (intrin->intrinsic != nir_intrinsic_load_ubo)
+ continue;
+
+ const unsigned bt_idx =
+ brw_nir_ubo_surface_index_get_bti(intrin->src[0]);
+ if (bt_idx == UINT32_MAX)
+ continue;
+
+ const struct anv_pipeline_binding *binding =
+ &bind_map->surface_to_descriptor[bt_idx];
+ if ((binding->set == ANV_DESCRIPTOR_SET_DESCRIPTORS ||
+ binding->set == ANV_DESCRIPTOR_SET_DESCRIPTORS_BUFFER) &&
+ binding->index == push_set) {
+ return true;
+ }
+ }
+ }
+ }
+
+ return false;
+}
+
+/* This function computes a bitfield of all the UBO bindings in the push
+ * descriptor set that are fully promoted to push constants. If a binding's
+ * bit in the field is set, the corresponding binding table entry will not be
+ * accessed by the shader. This function must be called after
+ * anv_nir_compute_push_layout().
+ */
+uint32_t
+anv_nir_push_desc_ubo_fully_promoted(nir_shader *nir,
+ const struct anv_pipeline_sets_layout *layout,
+ const struct anv_pipeline_bind_map *bind_map)
+{
+ uint8_t push_set;
+ const struct anv_descriptor_set_layout *push_set_layout =
+ anv_pipeline_layout_get_push_set(layout, &push_set);
+ if (push_set_layout == NULL)
+ return 0;
+
+ /* Assume every UBO can be promoted first. */
+ uint32_t ubos_fully_promoted = 0;
+ for (uint32_t b = 0; b < push_set_layout->binding_count; b++) {
+ const struct anv_descriptor_set_binding_layout *bind_layout =
+ &push_set_layout->binding[b];
+ if (bind_layout->type == -1)
+ continue;
+
+ assert(bind_layout->descriptor_index < MAX_PUSH_DESCRIPTORS);
+ if (bind_layout->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER)
+ ubos_fully_promoted |= BITFIELD_BIT(bind_layout->descriptor_index);
+ }
+
+ /* For each load_ubo intrinsic, if the descriptor index or the offset is
+ * not a constant, we cannot promote it to a push constant. Otherwise,
+ * check the offset + size against the push ranges.
+ */
+ nir_foreach_function_impl(impl, nir) {
+ nir_foreach_block(block, impl) {
+ nir_foreach_instr(instr, block) {
+ if (instr->type != nir_instr_type_intrinsic)
+ continue;
+
+ nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
+ if (intrin->intrinsic != nir_intrinsic_load_ubo)
+ continue;
+
+ /* Don't check the load_ubo from descriptor buffers */
+ nir_intrinsic_instr *resource =
+ intrin->src[0].ssa->parent_instr->type == nir_instr_type_intrinsic ?
+ nir_instr_as_intrinsic(intrin->src[0].ssa->parent_instr) : NULL;
+ if (resource == NULL || resource->intrinsic != nir_intrinsic_resource_intel)
+ continue;
+
+ /* Skip load_ubo not loading from the push descriptor */
+ if (nir_intrinsic_desc_set(resource) != push_set)
+ continue;
+
+ uint32_t binding = nir_intrinsic_binding(resource);
+
+ /* If we have indirect indexing into the binding, no push promotion
+ * is possible for the entire binding.
+ */
+ if (!nir_src_is_const(resource->src[1])) {
+ for (uint32_t i = 0; i < push_set_layout->binding[binding].array_size; i++) {
+ ubos_fully_promoted &=
+ ~BITFIELD_BIT(push_set_layout->binding[binding].descriptor_index + i);
+ }
+ continue;
+ }
+
+ const nir_const_value *const_bt_id =
+ nir_src_as_const_value(resource->src[1]);
+ uint32_t bt_id = const_bt_id[0].u32;
+
+ const struct anv_pipeline_binding *pipe_bind =
+ &bind_map->surface_to_descriptor[bt_id];
+
+ const uint32_t desc_idx =
+ push_set_layout->binding[binding].descriptor_index;
+
+ /* If the offset in the entry is dynamic, we can't tell whether the
+ * load was promoted or not.
+ */
+ const nir_const_value *const_load_offset =
+ nir_src_as_const_value(intrin->src[1]);
+ if (const_load_offset == NULL) {
+ ubos_fully_promoted &= ~BITFIELD_BIT(desc_idx);
+ continue;
+ }
+
+ /* Check if the load was promoted to a push constant. */
+ const unsigned load_offset = const_load_offset[0].u32;
+ const int load_bytes = nir_intrinsic_dest_components(intrin) *
+ (intrin->def.bit_size / 8);
+
+ bool promoted = false;
+ for (unsigned i = 0; i < ARRAY_SIZE(bind_map->push_ranges); i++) {
+ if (bind_map->push_ranges[i].set == pipe_bind->set &&
+ bind_map->push_ranges[i].index == desc_idx &&
+ bind_map->push_ranges[i].start * 32 <= load_offset &&
+ (bind_map->push_ranges[i].start +
+ bind_map->push_ranges[i].length) * 32 >=
+ (load_offset + load_bytes)) {
+ promoted = true;
+ break;
+ }
+ }
+
+ if (!promoted)
+ ubos_fully_promoted &= ~BITFIELD_BIT(desc_idx);
+ }
+ }
+ }
+
+ return ubos_fully_promoted;
+}
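
Editor's note: the promotion test above compares byte offsets against push ranges expressed in 32-byte units, hence the `* 32` scaling on `start` and `start + length`. Below is a minimal standalone sketch of that comparison; the struct and function names are illustrative, not the driver's API.

#include <stdbool.h>
#include <stdint.h>

/* Hypothetical stand-in for a push range expressed in 32-byte units. */
struct push_range_example {
   uint32_t start;    /* in 32-byte units */
   uint32_t length;   /* in 32-byte units */
};

/* Returns true when [load_offset, load_offset + load_bytes) lies entirely
 * inside the range, mirroring the start * 32 / (start + length) * 32 test
 * in anv_nir_push_desc_ubo_fully_promoted() above.
 */
static bool
example_load_is_promoted(const struct push_range_example *r,
                         uint32_t load_offset, uint32_t load_bytes)
{
   const uint32_t range_start = r->start * 32;
   const uint32_t range_end = (r->start + r->length) * 32;
   return range_start <= load_offset &&
          load_offset + load_bytes <= range_end;
}

/* Example: {start = 2, length = 1} covers bytes [64, 96), so a 16-byte load
 * at offset 64 is promoted while a 16-byte load at offset 88 is not.
 */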
diff --git a/src/intel/vulkan/anv_pass.c b/src/intel/vulkan/anv_pass.c
deleted file mode 100644
index 634a3a3e24e..00000000000
--- a/src/intel/vulkan/anv_pass.c
+++ /dev/null
@@ -1,490 +0,0 @@
-/*
- * Copyright © 2015 Intel Corporation
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- */
-
-#include "anv_private.h"
-
-#include "vk_format.h"
-#include "vk_util.h"
-
-static void
-anv_render_pass_add_subpass_dep(struct anv_device *device,
- struct anv_render_pass *pass,
- const VkSubpassDependency2KHR *dep)
-{
- if (dep->dstSubpass == VK_SUBPASS_EXTERNAL) {
- pass->subpass_flushes[pass->subpass_count] |=
- anv_pipe_invalidate_bits_for_access_flags(device, dep->dstAccessMask);
- } else {
- assert(dep->dstSubpass < pass->subpass_count);
- pass->subpass_flushes[dep->dstSubpass] |=
- anv_pipe_invalidate_bits_for_access_flags(device, dep->dstAccessMask);
- }
-
- if (dep->srcSubpass == VK_SUBPASS_EXTERNAL) {
- pass->subpass_flushes[0] |=
- anv_pipe_flush_bits_for_access_flags(device, dep->srcAccessMask);
- } else {
- assert(dep->srcSubpass < pass->subpass_count);
- pass->subpass_flushes[dep->srcSubpass + 1] |=
- anv_pipe_flush_bits_for_access_flags(device, dep->srcAccessMask);
- }
-}
-
-/* Do a second "compile" step on a render pass */
-static void
-anv_render_pass_compile(struct anv_render_pass *pass)
-{
- /* The CreateRenderPass code zeros the entire render pass and also uses a
- * designated initializer for filling these out. There's no need for us to
- * do it again.
- *
- * for (uint32_t i = 0; i < pass->attachment_count; i++) {
- * pass->attachments[i].usage = 0;
- * pass->attachments[i].first_subpass_layout = VK_IMAGE_LAYOUT_UNDEFINED;
- * }
- */
-
- VkImageUsageFlags all_usage = 0;
- for (uint32_t i = 0; i < pass->subpass_count; i++) {
- struct anv_subpass *subpass = &pass->subpasses[i];
-
- /* We don't allow depth_stencil_attachment to be non-NULL and be
- * VK_ATTACHMENT_UNUSED. This way something can just check for NULL
- * and be guaranteed that they have a valid attachment.
- */
- if (subpass->depth_stencil_attachment &&
- subpass->depth_stencil_attachment->attachment == VK_ATTACHMENT_UNUSED)
- subpass->depth_stencil_attachment = NULL;
-
- if (subpass->ds_resolve_attachment &&
- subpass->ds_resolve_attachment->attachment == VK_ATTACHMENT_UNUSED)
- subpass->ds_resolve_attachment = NULL;
-
- for (uint32_t j = 0; j < subpass->attachment_count; j++) {
- struct anv_subpass_attachment *subpass_att = &subpass->attachments[j];
- if (subpass_att->attachment == VK_ATTACHMENT_UNUSED)
- continue;
-
- struct anv_render_pass_attachment *pass_att =
- &pass->attachments[subpass_att->attachment];
-
- pass_att->usage |= subpass_att->usage;
- pass_att->last_subpass_idx = i;
-
- all_usage |= subpass_att->usage;
-
- if (pass_att->first_subpass_layout == VK_IMAGE_LAYOUT_UNDEFINED) {
- pass_att->first_subpass_layout = subpass_att->layout;
- assert(pass_att->first_subpass_layout != VK_IMAGE_LAYOUT_UNDEFINED);
- }
-
- if (subpass_att->usage == VK_IMAGE_USAGE_INPUT_ATTACHMENT_BIT &&
- subpass->depth_stencil_attachment &&
- subpass_att->attachment == subpass->depth_stencil_attachment->attachment)
- subpass->has_ds_self_dep = true;
- }
-
- /* We have to handle resolve attachments specially */
- subpass->has_color_resolve = false;
- if (subpass->resolve_attachments) {
- for (uint32_t j = 0; j < subpass->color_count; j++) {
- struct anv_subpass_attachment *color_att =
- &subpass->color_attachments[j];
- struct anv_subpass_attachment *resolve_att =
- &subpass->resolve_attachments[j];
- if (resolve_att->attachment == VK_ATTACHMENT_UNUSED)
- continue;
-
- subpass->has_color_resolve = true;
-
- assert(color_att->attachment < pass->attachment_count);
- struct anv_render_pass_attachment *color_pass_att =
- &pass->attachments[color_att->attachment];
-
- assert(resolve_att->usage == VK_IMAGE_USAGE_TRANSFER_DST_BIT);
- assert(color_att->usage == VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT);
- color_pass_att->usage |= VK_IMAGE_USAGE_TRANSFER_SRC_BIT;
- }
- }
-
- if (subpass->ds_resolve_attachment) {
- struct anv_subpass_attachment *ds_att =
- subpass->depth_stencil_attachment;
- UNUSED struct anv_subpass_attachment *resolve_att =
- subpass->ds_resolve_attachment;
-
- assert(ds_att->attachment < pass->attachment_count);
- struct anv_render_pass_attachment *ds_pass_att =
- &pass->attachments[ds_att->attachment];
-
- assert(resolve_att->usage == VK_IMAGE_USAGE_TRANSFER_DST_BIT);
- assert(ds_att->usage == VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT);
- ds_pass_att->usage |= VK_IMAGE_USAGE_TRANSFER_SRC_BIT;
- }
-
- for (uint32_t j = 0; j < subpass->attachment_count; j++)
- assert(__builtin_popcount(subpass->attachments[j].usage) == 1);
- }
-
- /* From the Vulkan 1.0.39 spec:
- *
- * If there is no subpass dependency from VK_SUBPASS_EXTERNAL to the
- * first subpass that uses an attachment, then an implicit subpass
- * dependency exists from VK_SUBPASS_EXTERNAL to the first subpass it is
- * used in. The subpass dependency operates as if defined with the
- * following parameters:
- *
- * VkSubpassDependency implicitDependency = {
- * .srcSubpass = VK_SUBPASS_EXTERNAL;
- * .dstSubpass = firstSubpass; // First subpass attachment is used in
- * .srcStageMask = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT;
- * .dstStageMask = VK_PIPELINE_STAGE_ALL_COMMANDS_BIT;
- * .srcAccessMask = 0;
- * .dstAccessMask = VK_ACCESS_INPUT_ATTACHMENT_READ_BIT |
- * VK_ACCESS_COLOR_ATTACHMENT_READ_BIT |
- * VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT |
- * VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_READ_BIT |
- * VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT;
- * .dependencyFlags = 0;
- * };
- *
- * Similarly, if there is no subpass dependency from the last subpass
- * that uses an attachment to VK_SUBPASS_EXTERNAL, then an implicit
- * subpass dependency exists from the last subpass it is used in to
- * VK_SUBPASS_EXTERNAL. The subpass dependency operates as if defined
- * with the following parameters:
- *
- * VkSubpassDependency implicitDependency = {
- * .srcSubpass = lastSubpass; // Last subpass attachment is used in
- * .dstSubpass = VK_SUBPASS_EXTERNAL;
- * .srcStageMask = VK_PIPELINE_STAGE_ALL_COMMANDS_BIT;
- * .dstStageMask = VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT;
- * .srcAccessMask = VK_ACCESS_INPUT_ATTACHMENT_READ_BIT |
- * VK_ACCESS_COLOR_ATTACHMENT_READ_BIT |
- * VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT |
- * VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_READ_BIT |
- * VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT;
- * .dstAccessMask = 0;
- * .dependencyFlags = 0;
- * };
- *
- * We could implement this by walking over all of the attachments and
- * subpasses and checking to see if any of them don't have an external
- * dependency. Or, we could just be lazy and add a couple extra flushes.
- * We choose to be lazy.
- *
- * From the documentation for vkCmdNextSubpass:
- *
- * "Moving to the next subpass automatically performs any multisample
- * resolve operations in the subpass being ended. End-of-subpass
- * multisample resolves are treated as color attachment writes for the
- * purposes of synchronization. This applies to resolve operations for
- * both color and depth/stencil attachments. That is, they are
- * considered to execute in the
- * VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT pipeline stage and
- * their writes are synchronized with
- * VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT."
- *
- * Therefore, the above flags concerning color attachments also apply to
- * color and depth/stencil resolve attachments.
- */
- if (all_usage & VK_IMAGE_USAGE_INPUT_ATTACHMENT_BIT) {
- pass->subpass_flushes[0] |=
- ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT;
- }
- if (all_usage & (VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT |
- VK_IMAGE_USAGE_TRANSFER_DST_BIT)) {
- pass->subpass_flushes[pass->subpass_count] |=
- ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT;
- }
- if (all_usage & VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT) {
- pass->subpass_flushes[pass->subpass_count] |=
- ANV_PIPE_DEPTH_CACHE_FLUSH_BIT;
- }
-}
-
-static unsigned
-num_subpass_attachments2(const VkSubpassDescription2KHR *desc)
-{
- const VkSubpassDescriptionDepthStencilResolveKHR *ds_resolve =
- vk_find_struct_const(desc->pNext,
- SUBPASS_DESCRIPTION_DEPTH_STENCIL_RESOLVE_KHR);
-
- return desc->inputAttachmentCount +
- desc->colorAttachmentCount +
- (desc->pResolveAttachments ? desc->colorAttachmentCount : 0) +
- (desc->pDepthStencilAttachment != NULL) +
- (ds_resolve && ds_resolve->pDepthStencilResolveAttachment);
-}
-
-static bool
-vk_image_layout_depth_only(VkImageLayout layout)
-{
- switch (layout) {
- case VK_IMAGE_LAYOUT_DEPTH_READ_ONLY_OPTIMAL:
- case VK_IMAGE_LAYOUT_DEPTH_ATTACHMENT_OPTIMAL:
- return true;
-
- default:
- return false;
- }
-}
-
-/* From the Vulkan Specification 1.2.166 - VkAttachmentReference2:
- *
- * "If layout only specifies the layout of the depth aspect of the
- * attachment, the layout of the stencil aspect is specified by the
- * stencilLayout member of a VkAttachmentReferenceStencilLayout structure
- * included in the pNext chain. Otherwise, layout describes the layout for
- * all relevant image aspects."
- */
-static VkImageLayout
-stencil_ref_layout(const VkAttachmentReference2KHR *att_ref)
-{
- if (!vk_image_layout_depth_only(att_ref->layout))
- return att_ref->layout;
-
- const VkAttachmentReferenceStencilLayoutKHR *stencil_ref =
- vk_find_struct_const(att_ref->pNext,
- ATTACHMENT_REFERENCE_STENCIL_LAYOUT_KHR);
- if (!stencil_ref)
- return VK_IMAGE_LAYOUT_UNDEFINED;
- return stencil_ref->stencilLayout;
-}
-
-/* From the Vulkan Specification 1.2.166 - VkAttachmentDescription2:
- *
- * "If format is a depth/stencil format, and initialLayout only specifies
- * the initial layout of the depth aspect of the attachment, the initial
- * layout of the stencil aspect is specified by the stencilInitialLayout
- * member of a VkAttachmentDescriptionStencilLayout structure included in
- * the pNext chain. Otherwise, initialLayout describes the initial layout
- * for all relevant image aspects."
- */
-static VkImageLayout
-stencil_desc_layout(const VkAttachmentDescription2KHR *att_desc, bool final)
-{
- if (!vk_format_has_stencil(att_desc->format))
- return VK_IMAGE_LAYOUT_UNDEFINED;
-
- const VkImageLayout main_layout =
- final ? att_desc->finalLayout : att_desc->initialLayout;
- if (!vk_image_layout_depth_only(main_layout))
- return main_layout;
-
- const VkAttachmentDescriptionStencilLayoutKHR *stencil_desc =
- vk_find_struct_const(att_desc->pNext,
- ATTACHMENT_DESCRIPTION_STENCIL_LAYOUT_KHR);
- assert(stencil_desc);
- return final ?
- stencil_desc->stencilFinalLayout :
- stencil_desc->stencilInitialLayout;
-}
-
-VkResult anv_CreateRenderPass2(
- VkDevice _device,
- const VkRenderPassCreateInfo2KHR* pCreateInfo,
- const VkAllocationCallbacks* pAllocator,
- VkRenderPass* pRenderPass)
-{
- ANV_FROM_HANDLE(anv_device, device, _device);
-
- assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_RENDER_PASS_CREATE_INFO_2_KHR);
-
- VK_MULTIALLOC(ma);
- VK_MULTIALLOC_DECL(&ma, struct anv_render_pass, pass, 1);
- VK_MULTIALLOC_DECL(&ma, struct anv_subpass, subpasses,
- pCreateInfo->subpassCount);
- VK_MULTIALLOC_DECL(&ma, struct anv_render_pass_attachment, attachments,
- pCreateInfo->attachmentCount);
- VK_MULTIALLOC_DECL(&ma, enum anv_pipe_bits, subpass_flushes,
- pCreateInfo->subpassCount + 1);
-
- uint32_t subpass_attachment_count = 0;
- for (uint32_t i = 0; i < pCreateInfo->subpassCount; i++) {
- subpass_attachment_count +=
- num_subpass_attachments2(&pCreateInfo->pSubpasses[i]);
- }
- VK_MULTIALLOC_DECL(&ma, struct anv_subpass_attachment, subpass_attachments,
- subpass_attachment_count);
-
- if (!vk_object_multizalloc(&device->vk, &ma, pAllocator,
- VK_OBJECT_TYPE_RENDER_PASS))
- return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
-
- /* Clear the subpasses along with the parent pass. This required because
- * each array member of anv_subpass must be a valid pointer if not NULL.
- */
- pass->attachment_count = pCreateInfo->attachmentCount;
- pass->subpass_count = pCreateInfo->subpassCount;
- pass->attachments = attachments;
- pass->subpass_flushes = subpass_flushes;
-
- for (uint32_t i = 0; i < pCreateInfo->attachmentCount; i++) {
- pass->attachments[i] = (struct anv_render_pass_attachment) {
- .format = pCreateInfo->pAttachments[i].format,
- .samples = pCreateInfo->pAttachments[i].samples,
- .load_op = pCreateInfo->pAttachments[i].loadOp,
- .store_op = pCreateInfo->pAttachments[i].storeOp,
- .stencil_load_op = pCreateInfo->pAttachments[i].stencilLoadOp,
- .initial_layout = pCreateInfo->pAttachments[i].initialLayout,
- .final_layout = pCreateInfo->pAttachments[i].finalLayout,
-
- .stencil_initial_layout = stencil_desc_layout(&pCreateInfo->pAttachments[i],
- false),
- .stencil_final_layout = stencil_desc_layout(&pCreateInfo->pAttachments[i],
- true),
- };
- }
-
- for (uint32_t i = 0; i < pCreateInfo->subpassCount; i++) {
- const VkSubpassDescription2KHR *desc = &pCreateInfo->pSubpasses[i];
- struct anv_subpass *subpass = &pass->subpasses[i];
-
- subpass->input_count = desc->inputAttachmentCount;
- subpass->color_count = desc->colorAttachmentCount;
- subpass->attachment_count = num_subpass_attachments2(desc);
- subpass->attachments = subpass_attachments;
- subpass->view_mask = desc->viewMask;
-
- if (desc->inputAttachmentCount > 0) {
- subpass->input_attachments = subpass_attachments;
- subpass_attachments += desc->inputAttachmentCount;
-
- for (uint32_t j = 0; j < desc->inputAttachmentCount; j++) {
- subpass->input_attachments[j] = (struct anv_subpass_attachment) {
- .usage = VK_IMAGE_USAGE_INPUT_ATTACHMENT_BIT,
- .attachment = desc->pInputAttachments[j].attachment,
- .layout = desc->pInputAttachments[j].layout,
- .stencil_layout = stencil_ref_layout(&desc->pInputAttachments[j]),
- };
- }
- }
-
- if (desc->colorAttachmentCount > 0) {
- subpass->color_attachments = subpass_attachments;
- subpass_attachments += desc->colorAttachmentCount;
-
- for (uint32_t j = 0; j < desc->colorAttachmentCount; j++) {
- subpass->color_attachments[j] = (struct anv_subpass_attachment) {
- .usage = VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT,
- .attachment = desc->pColorAttachments[j].attachment,
- .layout = desc->pColorAttachments[j].layout,
- };
- }
- }
-
- if (desc->pResolveAttachments) {
- subpass->resolve_attachments = subpass_attachments;
- subpass_attachments += desc->colorAttachmentCount;
-
- for (uint32_t j = 0; j < desc->colorAttachmentCount; j++) {
- subpass->resolve_attachments[j] = (struct anv_subpass_attachment) {
- .usage = VK_IMAGE_USAGE_TRANSFER_DST_BIT,
- .attachment = desc->pResolveAttachments[j].attachment,
- .layout = desc->pResolveAttachments[j].layout,
- };
- }
- }
-
- if (desc->pDepthStencilAttachment) {
- subpass->depth_stencil_attachment = subpass_attachments++;
-
- *subpass->depth_stencil_attachment = (struct anv_subpass_attachment) {
- .usage = VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT,
- .attachment = desc->pDepthStencilAttachment->attachment,
- .layout = desc->pDepthStencilAttachment->layout,
- .stencil_layout = stencil_ref_layout(desc->pDepthStencilAttachment),
- };
- }
-
- const VkSubpassDescriptionDepthStencilResolveKHR *ds_resolve =
- vk_find_struct_const(desc->pNext,
- SUBPASS_DESCRIPTION_DEPTH_STENCIL_RESOLVE_KHR);
-
- if (ds_resolve && ds_resolve->pDepthStencilResolveAttachment) {
- subpass->ds_resolve_attachment = subpass_attachments++;
-
- *subpass->ds_resolve_attachment = (struct anv_subpass_attachment) {
- .usage = VK_IMAGE_USAGE_TRANSFER_DST_BIT,
- .attachment = ds_resolve->pDepthStencilResolveAttachment->attachment,
- .layout = ds_resolve->pDepthStencilResolveAttachment->layout,
- .stencil_layout = stencil_ref_layout(ds_resolve->pDepthStencilResolveAttachment),
- };
- subpass->depth_resolve_mode = ds_resolve->depthResolveMode;
- subpass->stencil_resolve_mode = ds_resolve->stencilResolveMode;
- }
- }
-
- for (uint32_t i = 0; i < pCreateInfo->dependencyCount; i++) {
- anv_render_pass_add_subpass_dep(device, pass,
- &pCreateInfo->pDependencies[i]);
- }
-
- vk_foreach_struct(ext, pCreateInfo->pNext) {
- switch (ext->sType) {
- default:
- anv_debug_ignored_stype(ext->sType);
- }
- }
-
- anv_render_pass_compile(pass);
-
- *pRenderPass = anv_render_pass_to_handle(pass);
-
- return VK_SUCCESS;
-}
-
-void anv_DestroyRenderPass(
- VkDevice _device,
- VkRenderPass _pass,
- const VkAllocationCallbacks* pAllocator)
-{
- ANV_FROM_HANDLE(anv_device, device, _device);
- ANV_FROM_HANDLE(anv_render_pass, pass, _pass);
-
- if (!pass)
- return;
-
- vk_object_free(&device->vk, pAllocator, pass);
-}
-
-void anv_GetRenderAreaGranularity(
- VkDevice device,
- VkRenderPass renderPass,
- VkExtent2D* pGranularity)
-{
- ANV_FROM_HANDLE(anv_render_pass, pass, renderPass);
-
- /* This granularity satisfies HiZ fast clear alignment requirements
- * for all sample counts.
- */
- for (unsigned i = 0; i < pass->subpass_count; ++i) {
- if (pass->subpasses[i].depth_stencil_attachment) {
- *pGranularity = (VkExtent2D) { .width = 8, .height = 4 };
- return;
- }
- }
-
- *pGranularity = (VkExtent2D) { 1, 1 };
-}
diff --git a/src/intel/vulkan/anv_perf.c b/src/intel/vulkan/anv_perf.c
index 560da6a7c31..3b23067ab23 100644
--- a/src/intel/vulkan/anv_perf.c
+++ b/src/intel/vulkan/anv_perf.c
@@ -36,39 +36,21 @@
void
anv_physical_device_init_perf(struct anv_physical_device *device, int fd)
{
- const struct intel_device_info *devinfo = &device->info;
-
device->perf = NULL;
- /* We need self modifying batches. The i915 parser prevents it on
- * Gfx7.5 :( maybe one day.
- */
- if (devinfo->ver < 8)
- return;
-
struct intel_perf_config *perf = intel_perf_new(NULL);
intel_perf_init_metrics(perf, &device->info, fd,
false /* pipeline statistics */,
true /* register snapshots */);
- if (!perf->n_queries) {
- if (perf->platform_supported) {
- static bool warned_once = false;
-
- if (!warned_once) {
- mesa_logw("Performance support disabled, "
- "consider sysctl dev.i915.perf_stream_paranoid=0\n");
- warned_once = true;
- }
- }
+ if (!perf->n_queries)
goto err;
- }
/* We need DRM_I915_PERF_PROP_HOLD_PREEMPTION support, only available in
* perf revision 2.
*/
- if (!(INTEL_DEBUG & DEBUG_NO_OACONFIG)) {
+ if (!INTEL_DEBUG(DEBUG_NO_OACONFIG)) {
if (!intel_perf_has_hold_preemption(perf))
goto err;
}
@@ -89,10 +71,13 @@ anv_physical_device_init_perf(struct anv_physical_device *device, int fd)
break;
case INTEL_PERF_QUERY_FIELD_TYPE_SRM_PERFCNT:
case INTEL_PERF_QUERY_FIELD_TYPE_SRM_RPSTAT:
+ case INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_A:
case INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_B:
case INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_C:
device->n_perf_query_commands += field->size / 4;
break;
+ default:
+ unreachable("Unhandled register type");
}
}
device->n_perf_query_commands *= 2; /* Begin & End */
@@ -124,9 +109,10 @@ anv_device_perf_open(struct anv_device *device, uint64_t metric_id)
properties[p++] = metric_id;
properties[p++] = DRM_I915_PERF_PROP_OA_FORMAT;
- properties[p++] = device->info.ver >= 8 ?
- I915_OA_FORMAT_A32u40_A4u32_B8_C8 :
- I915_OA_FORMAT_A45_B8_C8;
+ properties[p++] =
+ device->info->verx10 >= 125 ?
+ I915_OA_FORMAT_A24u40_A14u32_B8_C8 :
+ I915_OA_FORMAT_A32u40_A4u32_B8_C8;
properties[p++] = DRM_I915_PERF_PROP_OA_EXPONENT;
properties[p++] = 31; /* slowest sampling period */
@@ -141,8 +127,12 @@ anv_device_perf_open(struct anv_device *device, uint64_t metric_id)
* Gfx11 for instance we use the full EU array. Initially when perf was
* enabled we would use only half on Gfx11 because of functional
* requirements.
+ *
+ * Temporarily disable this option on Gfx12.5+; the kernel doesn't appear
+ * to support it.
*/
- if (intel_perf_has_global_sseu(device->physical->perf)) {
+ if (intel_perf_has_global_sseu(device->physical->perf) &&
+ device->info->verx10 < 125) {
properties[p++] = DRM_I915_PERF_PROP_GLOBAL_SSEU;
properties[p++] = (uintptr_t) &device->physical->perf->sseu;
}
@@ -223,9 +213,9 @@ VkResult anv_AcquirePerformanceConfigurationINTEL(
config = vk_object_alloc(&device->vk, NULL, sizeof(*config),
VK_OBJECT_TYPE_PERFORMANCE_CONFIGURATION_INTEL);
if (!config)
- return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+ return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
- if (!(INTEL_DEBUG & DEBUG_NO_OACONFIG)) {
+ if (!INTEL_DEBUG(DEBUG_NO_OACONFIG)) {
config->register_config =
intel_perf_load_configuration(device->physical->perf, device->fd,
INTEL_PERF_QUERY_GUID_MDAPI);
@@ -258,7 +248,7 @@ VkResult anv_ReleasePerformanceConfigurationINTEL(
ANV_FROM_HANDLE(anv_device, device, _device);
ANV_FROM_HANDLE(anv_performance_configuration_intel, config, _configuration);
- if (!(INTEL_DEBUG & DEBUG_NO_OACONFIG))
+ if (!INTEL_DEBUG(DEBUG_NO_OACONFIG))
intel_ioctl(device->fd, DRM_IOCTL_I915_PERF_REMOVE_CONFIG, &config->config_id);
ralloc_free(config->register_config);
@@ -276,7 +266,7 @@ VkResult anv_QueueSetPerformanceConfigurationINTEL(
ANV_FROM_HANDLE(anv_performance_configuration_intel, config, _configuration);
struct anv_device *device = queue->device;
- if (!(INTEL_DEBUG & DEBUG_NO_OACONFIG)) {
+ if (!INTEL_DEBUG(DEBUG_NO_OACONFIG)) {
if (device->perf_fd < 0) {
device->perf_fd = anv_device_perf_open(device, config->config_id);
if (device->perf_fd < 0)
@@ -285,7 +275,7 @@ VkResult anv_QueueSetPerformanceConfigurationINTEL(
int ret = intel_ioctl(device->perf_fd, I915_PERF_IOCTL_CONFIG,
(void *)(uintptr_t) config->config_id);
if (ret < 0)
- return anv_device_set_lost(device, "i915-perf config failed: %m");
+ return vk_device_set_lost(&device->vk, "i915-perf config failed: %m");
}
}
@@ -346,15 +336,25 @@ VkResult anv_EnumeratePhysicalDeviceQueueFamilyPerformanceQueryCountersKHR(
uint32_t desc_count = *pCounterCount;
- VK_OUTARRAY_MAKE(out, pCounters, pCounterCount);
- VK_OUTARRAY_MAKE(out_desc, pCounterDescriptions, &desc_count);
+ VK_OUTARRAY_MAKE_TYPED(VkPerformanceCounterKHR, out, pCounters, pCounterCount);
+ VK_OUTARRAY_MAKE_TYPED(VkPerformanceCounterDescriptionKHR, out_desc,
+ pCounterDescriptions, &desc_count);
+
+ /* We cannot support performance queries on anything other than RCS,
+ * because the MI_REPORT_PERF_COUNT command is not available on other
+ * engines.
+ */
+ struct anv_queue_family *queue_family =
+ &pdevice->queue.families[queueFamilyIndex];
+ if (queue_family->engine_class != INTEL_ENGINE_CLASS_RENDER)
+ return vk_outarray_status(&out);
for (int c = 0; c < (perf ? perf->n_counters : 0); c++) {
const struct intel_perf_query_counter *intel_counter = perf->counter_infos[c].counter;
- vk_outarray_append(&out, counter) {
+ vk_outarray_append_typed(VkPerformanceCounterKHR, &out, counter) {
counter->unit = intel_perf_counter_unit_to_vk_unit[intel_counter->units];
- counter->scope = VK_QUERY_SCOPE_COMMAND_KHR;
+ counter->scope = VK_PERFORMANCE_COUNTER_SCOPE_COMMAND_KHR;
counter->storage = intel_perf_counter_data_type_to_vk_storage[intel_counter->data_type];
unsigned char sha1_result[20];
@@ -364,9 +364,12 @@ VkResult anv_EnumeratePhysicalDeviceQueueFamilyPerformanceQueryCountersKHR(
memcpy(counter->uuid, sha1_result, sizeof(counter->uuid));
}
- vk_outarray_append(&out_desc, desc) {
+ vk_outarray_append_typed(VkPerformanceCounterDescriptionKHR, &out_desc, desc) {
desc->flags = 0; /* None so far. */
- snprintf(desc->name, sizeof(desc->name), "%s", intel_counter->name);
+ snprintf(desc->name, sizeof(desc->name), "%s",
+ INTEL_DEBUG(DEBUG_PERF_SYMBOL_NAMES) ?
+ intel_counter->symbol_name :
+ intel_counter->name);
snprintf(desc->category, sizeof(desc->category), "%s", intel_counter->category);
snprintf(desc->description, sizeof(desc->description), "%s", intel_counter->desc);
}
@@ -405,7 +408,7 @@ VkResult anv_AcquireProfilingLockKHR(
assert(device->perf_fd == -1);
- if (!(INTEL_DEBUG & DEBUG_NO_OACONFIG)) {
+ if (!INTEL_DEBUG(DEBUG_NO_OACONFIG)) {
fd = anv_device_perf_open(device, first_metric_set->oa_metrics_set_id);
if (fd < 0)
return VK_TIMEOUT;
@@ -420,7 +423,7 @@ void anv_ReleaseProfilingLockKHR(
{
ANV_FROM_HANDLE(anv_device, device, _device);
- if (!(INTEL_DEBUG & DEBUG_NO_OACONFIG)) {
+ if (!INTEL_DEBUG(DEBUG_NO_OACONFIG)) {
assert(device->perf_fd >= 0);
close(device->perf_fd);
}
@@ -433,10 +436,12 @@ anv_perf_write_pass_results(struct intel_perf_config *perf,
const struct intel_perf_query_result *accumulated_results,
union VkPerformanceCounterResultKHR *results)
{
+ const struct intel_perf_query_info *query = pool->pass_query[pass];
+
for (uint32_t c = 0; c < pool->n_counters; c++) {
const struct intel_perf_counter_pass *counter_pass = &pool->counter_pass[c];
- if (counter_pass->pass != pass)
+ if (counter_pass->query != query)
continue;
switch (pool->pass_query[pass]->kind) {
diff --git a/src/intel/vulkan/anv_pipeline.c b/src/intel/vulkan/anv_pipeline.c
index 864c9733224..6d417fda354 100644
--- a/src/intel/vulkan/anv_pipeline.c
+++ b/src/intel/vulkan/anv_pipeline.c
@@ -30,80 +30,122 @@
#include "util/mesa-sha1.h"
#include "util/os_time.h"
#include "common/intel_l3_config.h"
-#include "common/intel_disasm.h"
#include "common/intel_sample_positions.h"
+#include "compiler/brw_disasm.h"
#include "anv_private.h"
#include "compiler/brw_nir.h"
#include "compiler/brw_nir_rt.h"
+#include "compiler/intel_nir.h"
#include "anv_nir.h"
#include "nir/nir_xfb_info.h"
#include "spirv/nir_spirv.h"
+#include "vk_nir_convert_ycbcr.h"
+#include "vk_nir.h"
+#include "vk_pipeline.h"
+#include "vk_render_pass.h"
#include "vk_util.h"
-/* Needed for SWIZZLE macros */
-#include "program/prog_instruction.h"
+struct lower_set_vtx_and_prim_count_state {
+ nir_variable *primitive_count;
+};
-// Shader functions
-#define SPIR_V_MAGIC_NUMBER 0x07230203
+static nir_variable *
+anv_nir_prim_count_store(nir_builder *b, nir_def *val)
+{
+ nir_variable *primitive_count =
+ nir_variable_create(b->shader,
+ nir_var_shader_out,
+ glsl_uint_type(),
+ "gl_PrimitiveCountNV");
+ primitive_count->data.location = VARYING_SLOT_PRIMITIVE_COUNT;
+ primitive_count->data.interpolation = INTERP_MODE_NONE;
+
+ nir_def *local_invocation_index = nir_load_local_invocation_index(b);
+
+ nir_def *cmp = nir_ieq_imm(b, local_invocation_index, 0);
+ nir_if *if_stmt = nir_push_if(b, cmp);
+ {
+ nir_deref_instr *prim_count_deref = nir_build_deref_var(b, primitive_count);
+ nir_store_deref(b, prim_count_deref, val, 1);
+ }
+ nir_pop_if(b, if_stmt);
-struct anv_spirv_debug_data {
- struct anv_device *device;
- const struct vk_shader_module *module;
-};
+ return primitive_count;
+}
-static void anv_spirv_nir_debug(void *private_data,
- enum nir_spirv_debug_level level,
- size_t spirv_offset,
- const char *message)
+static bool
+anv_nir_lower_set_vtx_and_prim_count_instr(nir_builder *b,
+ nir_intrinsic_instr *intrin,
+ void *data)
{
- struct anv_spirv_debug_data *debug_data = private_data;
- struct anv_instance *instance = debug_data->device->physical->instance;
+ if (intrin->intrinsic != nir_intrinsic_set_vertex_and_primitive_count)
+ return false;
- static const VkDebugReportFlagsEXT vk_flags[] = {
- [NIR_SPIRV_DEBUG_LEVEL_INFO] = VK_DEBUG_REPORT_INFORMATION_BIT_EXT,
- [NIR_SPIRV_DEBUG_LEVEL_WARNING] = VK_DEBUG_REPORT_WARNING_BIT_EXT,
- [NIR_SPIRV_DEBUG_LEVEL_ERROR] = VK_DEBUG_REPORT_ERROR_BIT_EXT,
- };
- char buffer[256];
+ /* Detect some cases of invalid primitive count. They might lead to URB
+ * memory corruption, where workgroups overwrite each other's output memory.
+ */
+ if (nir_src_is_const(intrin->src[1]) &&
+ nir_src_as_uint(intrin->src[1]) > b->shader->info.mesh.max_primitives_out) {
+ assert(!"number of primitives bigger than max specified");
+ }
+
+ struct lower_set_vtx_and_prim_count_state *state = data;
+ /* This intrinsic should show up only once. */
+ assert(state->primitive_count == NULL);
+
+ b->cursor = nir_before_instr(&intrin->instr);
+
+ state->primitive_count = anv_nir_prim_count_store(b, intrin->src[1].ssa);
- snprintf(buffer, sizeof(buffer), "SPIR-V offset %lu: %s", (unsigned long) spirv_offset, message);
+ nir_instr_remove(&intrin->instr);
- vk_debug_report(&instance->vk, vk_flags[level],
- &debug_data->module->base,
- 0, 0, "anv", buffer);
+ return true;
+}
+
+static bool
+anv_nir_lower_set_vtx_and_prim_count(nir_shader *nir)
+{
+ struct lower_set_vtx_and_prim_count_state state = { NULL, };
+
+ nir_shader_intrinsics_pass(nir, anv_nir_lower_set_vtx_and_prim_count_instr,
+ nir_metadata_none,
+ &state);
+
+ /* If we didn't find set_vertex_and_primitive_count, then we have to
+ * insert a store of value 0 to primitive_count.
+ */
+ if (state.primitive_count == NULL) {
+ nir_builder b;
+ nir_function_impl *entrypoint = nir_shader_get_entrypoint(nir);
+ b = nir_builder_at(nir_before_impl(entrypoint));
+ nir_def *zero = nir_imm_int(&b, 0);
+ state.primitive_count = anv_nir_prim_count_store(&b, zero);
+ }
+
+ assert(state.primitive_count != NULL);
+ return true;
}
/* Eventually, this will become part of anv_CreateShader. Unfortunately,
* we can't do that yet because we don't have the ability to copy nir.
*/
static nir_shader *
-anv_shader_compile_to_nir(struct anv_device *device,
- void *mem_ctx,
- const struct vk_shader_module *module,
- const char *entrypoint_name,
- gl_shader_stage stage,
- const VkSpecializationInfo *spec_info)
+anv_shader_stage_to_nir(struct anv_device *device,
+ const VkPipelineShaderStageCreateInfo *stage_info,
+ enum brw_robustness_flags robust_flags,
+ void *mem_ctx)
{
const struct anv_physical_device *pdevice = device->physical;
const struct brw_compiler *compiler = pdevice->compiler;
+ gl_shader_stage stage = vk_to_mesa_shader_stage(stage_info->stage);
const nir_shader_compiler_options *nir_options =
- compiler->glsl_compiler_options[stage].NirOptions;
+ compiler->nir_options[stage];
- uint32_t *spirv = (uint32_t *) module->data;
- assert(spirv[0] == SPIR_V_MAGIC_NUMBER);
- assert(module->size % 4 == 0);
-
- uint32_t num_spec_entries = 0;
- struct nir_spirv_specialization *spec_entries =
- vk_spec_info_to_nir_spirv(spec_info, &num_spec_entries);
-
- struct anv_spirv_debug_data spirv_debug_data = {
- .device = device,
- .module = module,
- };
- struct spirv_to_nir_options spirv_options = {
- .frag_coord_is_sysval = true,
+ const bool rt_enabled = ANV_SUPPORT_RT && pdevice->info.has_ray_tracing;
+ const struct spirv_to_nir_options spirv_options = {
.caps = {
+ .amd_image_gather_bias_lod = pdevice->info.ver >= 20,
+ .cooperative_matrix = anv_has_cooperative_matrix(pdevice),
.demote_to_helper_invocation = true,
.derivative_group = true,
.descriptor_array_dynamic_indexing = true,
@@ -111,51 +153,60 @@ anv_shader_compile_to_nir(struct anv_device *device,
.descriptor_indexing = true,
.device_group = true,
.draw_parameters = true,
- .float16 = pdevice->info.ver >= 8,
+ .float16 = true,
.float32_atomic_add = pdevice->info.has_lsc,
- .float32_atomic_min_max = pdevice->info.ver >= 9,
- .float64 = pdevice->info.ver >= 8,
+ .float32_atomic_min_max = true,
+ .float64 = true,
.float64_atomic_min_max = pdevice->info.has_lsc,
- .fragment_shader_sample_interlock = pdevice->info.ver >= 9,
- .fragment_shader_pixel_interlock = pdevice->info.ver >= 9,
+ .fragment_shader_sample_interlock = true,
+ .fragment_shader_pixel_interlock = true,
.geometry_streams = true,
+ .image_read_without_format = true,
.image_write_without_format = true,
- .int8 = pdevice->info.ver >= 8,
- .int16 = pdevice->info.ver >= 8,
- .int64 = pdevice->info.ver >= 8,
- .int64_atomics = pdevice->info.ver >= 9 && pdevice->use_softpin,
- .integer_functions2 = pdevice->info.ver >= 8,
+ .int8 = true,
+ .int16 = true,
+ .int64 = true,
+ .int64_atomics = true,
+ .integer_functions2 = true,
+ .mesh_shading = pdevice->vk.supported_extensions.EXT_mesh_shader,
+ .mesh_shading_nv = false,
.min_lod = true,
.multiview = true,
- .physical_storage_buffer_address = pdevice->has_a64_buffer_access,
- .post_depth_coverage = pdevice->info.ver >= 9,
+ .physical_storage_buffer_address = true,
+ .post_depth_coverage = true,
+ .quad_control = true,
.runtime_descriptor_array = true,
- .float_controls = pdevice->info.ver >= 8,
- .ray_tracing = pdevice->info.has_ray_tracing,
+ .float_controls = true,
+ .float_controls2 = true,
+ .ray_cull_mask = rt_enabled,
+ .ray_query = rt_enabled,
+ .ray_tracing = rt_enabled,
+ .ray_tracing_position_fetch = rt_enabled,
.shader_clock = true,
.shader_viewport_index_layer = true,
- .stencil_export = pdevice->info.ver >= 9,
- .storage_8bit = pdevice->info.ver >= 8,
- .storage_16bit = pdevice->info.ver >= 8,
+ .sparse_residency = pdevice->sparse_type != ANV_SPARSE_TYPE_NOT_SUPPORTED,
+ .stencil_export = true,
+ .storage_8bit = true,
+ .storage_16bit = true,
.subgroup_arithmetic = true,
.subgroup_basic = true,
.subgroup_ballot = true,
+ .subgroup_dispatch = true,
.subgroup_quad = true,
+ .subgroup_rotate = true,
.subgroup_uniform_control_flow = true,
.subgroup_shuffle = true,
.subgroup_vote = true,
.tessellation = true,
- .transform_feedback = pdevice->info.ver >= 8,
+ .transform_feedback = true,
.variable_pointers = true,
.vk_memory_model = true,
.vk_memory_model_device_scope = true,
.workgroup_memory_explicit_layout = true,
.fragment_shading_rate = pdevice->info.ver >= 11,
},
- .ubo_addr_format =
- anv_nir_ubo_addr_format(pdevice, device->robust_buffer_access),
- .ssbo_addr_format =
- anv_nir_ssbo_addr_format(pdevice, device->robust_buffer_access),
+ .ubo_addr_format = anv_nir_ubo_addr_format(pdevice, robust_flags),
+ .ssbo_addr_format = anv_nir_ssbo_addr_format(pdevice, robust_flags),
.phys_ssbo_addr_format = nir_address_format_64bit_global,
.push_const_addr_format = nir_address_format_logical,
@@ -164,89 +215,36 @@ anv_shader_compile_to_nir(struct anv_device *device,
* with certain code / code generators.
*/
.shared_addr_format = nir_address_format_32bit_offset,
- .debug = {
- .func = anv_spirv_nir_debug,
- .private_data = &spirv_debug_data,
- },
- };
+ .min_ubo_alignment = ANV_UBO_ALIGNMENT,
+ .min_ssbo_alignment = ANV_SSBO_ALIGNMENT,
+ };
- nir_shader *nir =
- spirv_to_nir(spirv, module->size / 4,
- spec_entries, num_spec_entries,
- stage, entrypoint_name, &spirv_options, nir_options);
- if (!nir) {
- free(spec_entries);
+ nir_shader *nir;
+ VkResult result =
+ vk_pipeline_shader_stage_to_nir(&device->vk, stage_info,
+ &spirv_options, nir_options,
+ mem_ctx, &nir);
+ if (result != VK_SUCCESS)
return NULL;
- }
-
- assert(nir->info.stage == stage);
- nir_validate_shader(nir, "after spirv_to_nir");
- nir_validate_ssa_dominance(nir, "after spirv_to_nir");
- ralloc_steal(mem_ctx, nir);
-
- free(spec_entries);
- if (INTEL_DEBUG & intel_debug_flag_for_shader_stage(stage)) {
+ if (INTEL_DEBUG(intel_debug_flag_for_shader_stage(stage))) {
fprintf(stderr, "NIR (from SPIR-V) for %s shader:\n",
gl_shader_stage_name(stage));
nir_print_shader(nir, stderr);
}
- /* We have to lower away local constant initializers right before we
- * inline functions. That way they get properly initialized at the top
- * of the function and not at the top of its caller.
- */
- NIR_PASS_V(nir, nir_lower_variable_initializers, nir_var_function_temp);
- NIR_PASS_V(nir, nir_lower_returns);
- NIR_PASS_V(nir, nir_inline_functions);
- NIR_PASS_V(nir, nir_copy_prop);
- NIR_PASS_V(nir, nir_opt_deref);
-
- /* Pick off the single entrypoint that we want */
- foreach_list_typed_safe(nir_function, func, node, &nir->functions) {
- if (!func->is_entrypoint)
- exec_node_remove(&func->node);
- }
- assert(exec_list_length(&nir->functions) == 1);
-
- /* Now that we've deleted all but the main function, we can go ahead and
- * lower the rest of the constant initializers. We do this here so that
- * nir_remove_dead_variables and split_per_member_structs below see the
- * corresponding stores.
- */
- NIR_PASS_V(nir, nir_lower_variable_initializers, ~0);
-
- /* Split member structs. We do this before lower_io_to_temporaries so that
- * it doesn't lower system values to temporaries by accident.
- */
- NIR_PASS_V(nir, nir_split_var_copies);
- NIR_PASS_V(nir, nir_split_per_member_structs);
-
- NIR_PASS_V(nir, nir_remove_dead_variables,
- nir_var_shader_in | nir_var_shader_out | nir_var_system_value |
- nir_var_shader_call_data | nir_var_ray_hit_attrib,
- NULL);
-
- NIR_PASS_V(nir, nir_propagate_invariant, false);
NIR_PASS_V(nir, nir_lower_io_to_temporaries,
nir_shader_get_entrypoint(nir), true, false);
- NIR_PASS_V(nir, nir_lower_frexp);
-
- /* Vulkan uses the separate-shader linking model */
- nir->info.separate_shader = true;
-
- brw_preprocess_nir(compiler, nir, NULL);
-
return nir;
}
-VkResult
+static VkResult
anv_pipeline_init(struct anv_pipeline *pipeline,
struct anv_device *device,
enum anv_pipeline_type type,
- VkPipelineCreateFlags flags,
+ VkPipelineCreateFlags2KHR flags,
const VkAllocationCallbacks *pAllocator)
{
VkResult result;
@@ -264,8 +262,9 @@ anv_pipeline_init(struct anv_pipeline *pipeline,
pipeline->batch.relocs = &pipeline->batch_relocs;
pipeline->batch.status = VK_SUCCESS;
+ const bool uses_relocs = device->physical->uses_relocs;
result = anv_reloc_list_init(&pipeline->batch_relocs,
- pipeline->batch.alloc);
+ pipeline->batch.alloc, uses_relocs);
if (result != VK_SUCCESS)
return result;
@@ -276,16 +275,40 @@ anv_pipeline_init(struct anv_pipeline *pipeline,
util_dynarray_init(&pipeline->executables, pipeline->mem_ctx);
+ anv_pipeline_sets_layout_init(&pipeline->layout, device,
+ false /* independent_sets */);
+
return VK_SUCCESS;
}
-void
+static void
+anv_pipeline_init_layout(struct anv_pipeline *pipeline,
+ struct anv_pipeline_layout *pipeline_layout)
+{
+ if (pipeline_layout) {
+ struct anv_pipeline_sets_layout *layout = &pipeline_layout->sets_layout;
+ for (uint32_t s = 0; s < layout->num_sets; s++) {
+ if (layout->set[s].layout == NULL)
+ continue;
+
+ anv_pipeline_sets_layout_add(&pipeline->layout, s,
+ layout->set[s].layout);
+ }
+ }
+
+ anv_pipeline_sets_layout_hash(&pipeline->layout);
+ assert(!pipeline_layout ||
+ !memcmp(pipeline->layout.sha1,
+ pipeline_layout->sets_layout.sha1,
+ sizeof(pipeline_layout->sets_layout.sha1)));
+}
+
+static void
anv_pipeline_finish(struct anv_pipeline *pipeline,
- struct anv_device *device,
- const VkAllocationCallbacks *pAllocator)
+ struct anv_device *device)
{
- anv_reloc_list_finish(&pipeline->batch_relocs,
- pAllocator ? pAllocator : &device->vk.alloc);
+ anv_pipeline_sets_layout_fini(&pipeline->layout);
+ anv_reloc_list_finish(&pipeline->batch_relocs);
ralloc_free(pipeline->mem_ctx);
vk_object_base_finish(&pipeline->base);
}
@@ -301,19 +324,27 @@ void anv_DestroyPipeline(
if (!pipeline)
return;
+ ANV_RMV(resource_destroy, device, pipeline);
+
switch (pipeline->type) {
+ case ANV_PIPELINE_GRAPHICS_LIB: {
+ struct anv_graphics_lib_pipeline *gfx_pipeline =
+ anv_pipeline_to_graphics_lib(pipeline);
+
+ for (unsigned s = 0; s < ARRAY_SIZE(gfx_pipeline->base.shaders); s++) {
+ if (gfx_pipeline->base.shaders[s])
+ anv_shader_bin_unref(device, gfx_pipeline->base.shaders[s]);
+ }
+ break;
+ }
+
case ANV_PIPELINE_GRAPHICS: {
struct anv_graphics_pipeline *gfx_pipeline =
anv_pipeline_to_graphics(pipeline);
- if (gfx_pipeline->blend_state.map)
- anv_state_pool_free(&device->dynamic_state_pool, gfx_pipeline->blend_state);
- if (gfx_pipeline->cps_state.map)
- anv_state_pool_free(&device->dynamic_state_pool, gfx_pipeline->cps_state);
-
- for (unsigned s = 0; s < ARRAY_SIZE(gfx_pipeline->shaders); s++) {
- if (gfx_pipeline->shaders[s])
- anv_shader_bin_unref(device, gfx_pipeline->shaders[s]);
+ for (unsigned s = 0; s < ARRAY_SIZE(gfx_pipeline->base.shaders); s++) {
+ if (gfx_pipeline->base.shaders[s])
+ anv_shader_bin_unref(device, gfx_pipeline->base.shaders[s]);
}
break;
}
@@ -343,358 +374,436 @@ void anv_DestroyPipeline(
unreachable("invalid pipeline type");
}
- anv_pipeline_finish(pipeline, device, pAllocator);
+ anv_pipeline_finish(pipeline, device);
vk_free2(&device->vk.alloc, pAllocator, pipeline);
}
-static const uint32_t vk_to_intel_primitive_type[] = {
- [VK_PRIMITIVE_TOPOLOGY_POINT_LIST] = _3DPRIM_POINTLIST,
- [VK_PRIMITIVE_TOPOLOGY_LINE_LIST] = _3DPRIM_LINELIST,
- [VK_PRIMITIVE_TOPOLOGY_LINE_STRIP] = _3DPRIM_LINESTRIP,
- [VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST] = _3DPRIM_TRILIST,
- [VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP] = _3DPRIM_TRISTRIP,
- [VK_PRIMITIVE_TOPOLOGY_TRIANGLE_FAN] = _3DPRIM_TRIFAN,
- [VK_PRIMITIVE_TOPOLOGY_LINE_LIST_WITH_ADJACENCY] = _3DPRIM_LINELIST_ADJ,
- [VK_PRIMITIVE_TOPOLOGY_LINE_STRIP_WITH_ADJACENCY] = _3DPRIM_LINESTRIP_ADJ,
- [VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST_WITH_ADJACENCY] = _3DPRIM_TRILIST_ADJ,
- [VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP_WITH_ADJACENCY] = _3DPRIM_TRISTRIP_ADJ,
-};
+struct anv_pipeline_stage {
+ gl_shader_stage stage;
-static void
-populate_sampler_prog_key(const struct intel_device_info *devinfo,
- struct brw_sampler_prog_key_data *key)
-{
- /* Almost all multisampled textures are compressed. The only time when we
- * don't compress a multisampled texture is for 16x MSAA with a surface
- * width greater than 8k which is a bit of an edge case. Since the sampler
- * just ignores the MCS parameter to ld2ms when MCS is disabled, it's safe
- * to tell the compiler to always assume compression.
- */
- key->compressed_multisample_layout_mask = ~0;
-
- /* SkyLake added support for 16x MSAA. With this came a new message for
- * reading from a 16x MSAA surface with compression. The new message was
- * needed because now the MCS data is 64 bits instead of 32 or lower as is
- * the case for 8x, 4x, and 2x. The key->msaa_16 bit-field controls which
- * message we use. Fortunately, the 16x message works for 8x, 4x, and 2x
- * so we can just use it unconditionally. This may not be quite as
- * efficient but it saves us from recompiling.
+ struct vk_pipeline_robustness_state rstate;
+
+ /* VkComputePipelineCreateInfo, VkGraphicsPipelineCreateInfo or
+ * VkRayTracingPipelineCreateInfoKHR pNext field
*/
- if (devinfo->ver >= 9)
- key->msaa_16 = ~0;
+ const void *pipeline_pNext;
+ const VkPipelineShaderStageCreateInfo *info;
- /* XXX: Handle texture swizzle on HSW- */
- for (int i = 0; i < MAX_SAMPLERS; i++) {
- /* Assume color sampler, no swizzling. (Works for BDW+) */
- key->swizzles[i] = SWIZZLE_XYZW;
- }
-}
+ unsigned char shader_sha1[20];
+ uint32_t source_hash;
+
+ union brw_any_prog_key key;
+
+ struct {
+ gl_shader_stage stage;
+ unsigned char sha1[20];
+ } cache_key;
+
+ nir_shader *nir;
+
+ struct {
+ nir_shader *nir;
+ struct anv_shader_bin *bin;
+ } imported;
+
+ struct anv_push_descriptor_info push_desc_info;
+
+ enum gl_subgroup_size subgroup_size_type;
+
+ enum brw_robustness_flags robust_flags;
+
+ struct anv_pipeline_bind_map bind_map;
+
+ bool uses_bt_for_push_descs;
+
+ enum anv_dynamic_push_bits dynamic_push_values;
+
+ union brw_any_prog_data prog_data;
+
+ uint32_t num_stats;
+ struct brw_compile_stats stats[3];
+ char *disasm[3];
+
+ VkPipelineCreationFeedback feedback;
+ uint32_t feedback_idx;
+
+ const unsigned *code;
+
+ struct anv_shader_bin *bin;
+};
static void
-populate_base_prog_key(const struct intel_device_info *devinfo,
- VkPipelineShaderStageCreateFlags flags,
- bool robust_buffer_acccess,
- struct brw_base_prog_key *key)
+anv_stage_allocate_bind_map_tables(struct anv_pipeline *pipeline,
+ struct anv_pipeline_stage *stage,
+ void *mem_ctx)
{
- if (flags & VK_PIPELINE_SHADER_STAGE_CREATE_ALLOW_VARYING_SUBGROUP_SIZE_BIT_EXT)
- key->subgroup_size_type = BRW_SUBGROUP_SIZE_VARYING;
- else
- key->subgroup_size_type = BRW_SUBGROUP_SIZE_API_CONSTANT;
-
- key->robust_buffer_access = robust_buffer_acccess;
+ struct anv_pipeline_binding *surface_bindings =
+ brw_shader_stage_requires_bindless_resources(stage->stage) ? NULL :
+ rzalloc_array(mem_ctx, struct anv_pipeline_binding, 256);
+ struct anv_pipeline_binding *sampler_bindings =
+ brw_shader_stage_requires_bindless_resources(stage->stage) ? NULL :
+ rzalloc_array(mem_ctx, struct anv_pipeline_binding, 256);
+ struct anv_pipeline_embedded_sampler_binding *embedded_sampler_bindings =
+ rzalloc_array(mem_ctx, struct anv_pipeline_embedded_sampler_binding,
+ anv_pipeline_sets_layout_embedded_sampler_count(
+ &pipeline->layout));
+
+ stage->bind_map = (struct anv_pipeline_bind_map) {
+ .surface_to_descriptor = surface_bindings,
+ .sampler_to_descriptor = sampler_bindings,
+ .embedded_sampler_to_binding = embedded_sampler_bindings,
+ };
+}
- populate_sampler_prog_key(devinfo, &key->tex);
+static enum brw_robustness_flags
+anv_get_robust_flags(const struct vk_pipeline_robustness_state *rstate)
+{
+ return
+ ((rstate->storage_buffers !=
+ VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_DISABLED_EXT) ?
+ BRW_ROBUSTNESS_SSBO : 0) |
+ ((rstate->uniform_buffers !=
+ VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_DISABLED_EXT) ?
+ BRW_ROBUSTNESS_UBO : 0);
}
static void
-populate_vs_prog_key(const struct intel_device_info *devinfo,
- VkPipelineShaderStageCreateFlags flags,
- bool robust_buffer_acccess,
- struct brw_vs_prog_key *key)
+populate_base_prog_key(struct anv_pipeline_stage *stage,
+ const struct anv_device *device)
{
- memset(key, 0, sizeof(*key));
-
- populate_base_prog_key(devinfo, flags, robust_buffer_acccess, &key->base);
+ stage->key.base.robust_flags = anv_get_robust_flags(&stage->rstate);
+ stage->key.base.limit_trig_input_range =
+ device->physical->instance->limit_trig_input_range;
+}
- /* XXX: Handle vertex input work-arounds */
+static void
+populate_vs_prog_key(struct anv_pipeline_stage *stage,
+ const struct anv_device *device)
+{
+ memset(&stage->key, 0, sizeof(stage->key));
- /* XXX: Handle sampler_prog_key */
+ populate_base_prog_key(stage, device);
}
static void
-populate_tcs_prog_key(const struct intel_device_info *devinfo,
- VkPipelineShaderStageCreateFlags flags,
- bool robust_buffer_acccess,
- unsigned input_vertices,
- struct brw_tcs_prog_key *key)
+populate_tcs_prog_key(struct anv_pipeline_stage *stage,
+ const struct anv_device *device,
+ unsigned input_vertices)
{
- memset(key, 0, sizeof(*key));
+ memset(&stage->key, 0, sizeof(stage->key));
- populate_base_prog_key(devinfo, flags, robust_buffer_acccess, &key->base);
+ populate_base_prog_key(stage, device);
- key->input_vertices = input_vertices;
+ stage->key.tcs.input_vertices = input_vertices;
}
static void
-populate_tes_prog_key(const struct intel_device_info *devinfo,
- VkPipelineShaderStageCreateFlags flags,
- bool robust_buffer_acccess,
- struct brw_tes_prog_key *key)
+populate_tes_prog_key(struct anv_pipeline_stage *stage,
+ const struct anv_device *device)
{
- memset(key, 0, sizeof(*key));
+ memset(&stage->key, 0, sizeof(stage->key));
- populate_base_prog_key(devinfo, flags, robust_buffer_acccess, &key->base);
+ populate_base_prog_key(stage, device);
}
static void
-populate_gs_prog_key(const struct intel_device_info *devinfo,
- VkPipelineShaderStageCreateFlags flags,
- bool robust_buffer_acccess,
- struct brw_gs_prog_key *key)
+populate_gs_prog_key(struct anv_pipeline_stage *stage,
+ const struct anv_device *device)
{
- memset(key, 0, sizeof(*key));
+ memset(&stage->key, 0, sizeof(stage->key));
- populate_base_prog_key(devinfo, flags, robust_buffer_acccess, &key->base);
+ populate_base_prog_key(stage, device);
}
static bool
-pipeline_has_coarse_pixel(const struct anv_graphics_pipeline *pipeline,
- const VkPipelineFragmentShadingRateStateCreateInfoKHR *fsr_info)
+pipeline_has_coarse_pixel(const BITSET_WORD *dynamic,
+ const struct vk_multisample_state *ms,
+ const struct vk_fragment_shading_rate_state *fsr)
{
- if (pipeline->sample_shading_enable)
- return false;
-
- /* Not dynamic & not specified for the pipeline. */
- if ((pipeline->dynamic_states & ANV_CMD_DIRTY_DYNAMIC_SHADING_RATE) == 0 && !fsr_info)
+ /* The Vulkan 1.2.199 spec says:
+ *
+ * "If any of the following conditions are met, Cxy' must be set to
+ * {1,1}:
+ *
+ * * If Sample Shading is enabled.
+ * * [...]"
+ *
+ * And "sample shading" is defined as follows:
+ *
+ * "Sample shading is enabled for a graphics pipeline:
+ *
+ * * If the interface of the fragment shader entry point of the
+ * graphics pipeline includes an input variable decorated with
+ * SampleId or SamplePosition. In this case minSampleShadingFactor
+ * takes the value 1.0.
+ *
+ * * Else if the sampleShadingEnable member of the
+ * VkPipelineMultisampleStateCreateInfo structure specified when
+ * creating the graphics pipeline is set to VK_TRUE. In this case
+ * minSampleShadingFactor takes the value of
+ * VkPipelineMultisampleStateCreateInfo::minSampleShading.
+ *
+ * Otherwise, sample shading is considered disabled."
+ *
+ * The first bullet above is handled by the back-end compiler because those
+ * inputs both force per-sample dispatch. The second bullet is handled
+ * here. Note that whether sample shading is enabled has nothing to do
+ * with minSampleShading.
+ */
+ if (ms != NULL && ms->sample_shading_enable)
return false;
/* Not dynamic & pipeline has a 1x1 fragment shading rate with no
- * possibility for element of the pipeline to change the value.
+ * possibility for any element of the pipeline to change the value, or the
+ * fragment shading rate is not specified at all.
*/
- if ((pipeline->dynamic_states & ANV_CMD_DIRTY_DYNAMIC_SHADING_RATE) == 0 &&
- fsr_info->fragmentSize.width <= 1 &&
- fsr_info->fragmentSize.height <= 1 &&
- fsr_info->combinerOps[0] == VK_FRAGMENT_SHADING_RATE_COMBINER_OP_KEEP_KHR &&
- fsr_info->combinerOps[1] == VK_FRAGMENT_SHADING_RATE_COMBINER_OP_KEEP_KHR)
+ if (!BITSET_TEST(dynamic, MESA_VK_DYNAMIC_FSR) &&
+ (fsr == NULL ||
+ (fsr->fragment_size.width <= 1 &&
+ fsr->fragment_size.height <= 1 &&
+ fsr->combiner_ops[0] == VK_FRAGMENT_SHADING_RATE_COMBINER_OP_KEEP_KHR &&
+ fsr->combiner_ops[1] == VK_FRAGMENT_SHADING_RATE_COMBINER_OP_KEEP_KHR)))
return false;
return true;
}
static void
-populate_wm_prog_key(const struct anv_graphics_pipeline *pipeline,
- VkPipelineShaderStageCreateFlags flags,
- bool robust_buffer_acccess,
- const struct anv_subpass *subpass,
- const VkPipelineMultisampleStateCreateInfo *ms_info,
- const VkPipelineFragmentShadingRateStateCreateInfoKHR *fsr_info,
- struct brw_wm_prog_key *key)
+populate_task_prog_key(struct anv_pipeline_stage *stage,
+ const struct anv_device *device)
+{
+ memset(&stage->key, 0, sizeof(stage->key));
+
+ populate_base_prog_key(stage, device);
+}
+
+static void
+populate_mesh_prog_key(struct anv_pipeline_stage *stage,
+ const struct anv_device *device,
+ bool compact_mue)
+{
+ memset(&stage->key, 0, sizeof(stage->key));
+
+ populate_base_prog_key(stage, device);
+
+ stage->key.mesh.compact_mue = compact_mue;
+}
+
+static uint32_t
+rp_color_mask(const struct vk_render_pass_state *rp)
+{
+ if (rp == NULL || !vk_render_pass_state_has_attachment_info(rp))
+ return ((1u << MAX_RTS) - 1);
+
+ uint32_t color_mask = 0;
+ for (uint32_t i = 0; i < rp->color_attachment_count; i++) {
+ if (rp->color_attachment_formats[i] != VK_FORMAT_UNDEFINED)
+ color_mask |= BITFIELD_BIT(i);
+ }
+
+ return color_mask;
+}
+
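
Editor's note: a small worked example of rp_color_mask() above, with illustrative values. With three color attachments where attachment 1 has VK_FORMAT_UNDEFINED, the mask keeps a hole for the unused slot, which is why the nr_color_regions computation further down uses util_last_bit() rather than a popcount.

#include <assert.h>
#include <stdint.h>

int main(void)
{
   /* 0 stands in for VK_FORMAT_UNDEFINED, non-zero for any real format. */
   const int formats[] = { 1, 0, 1 };
   uint32_t mask = 0;

   for (uint32_t i = 0; i < 3; i++) {
      if (formats[i] != 0)
         mask |= 1u << i;
   }

   assert(mask == 0x5);   /* 0b101: attachments 0 and 2 are written */
   /* util_last_bit(0x5) == 3, so all three render target slots are kept. */
   return 0;
}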
+static void
+populate_wm_prog_key(struct anv_pipeline_stage *stage,
+ const struct anv_graphics_base_pipeline *pipeline,
+ const BITSET_WORD *dynamic,
+ const struct vk_multisample_state *ms,
+ const struct vk_fragment_shading_rate_state *fsr,
+ const struct vk_render_pass_state *rp,
+ const enum brw_sometimes is_mesh)
{
const struct anv_device *device = pipeline->base.device;
- const struct intel_device_info *devinfo = &device->info;
- memset(key, 0, sizeof(*key));
+ memset(&stage->key, 0, sizeof(stage->key));
+
+ populate_base_prog_key(stage, device);
- populate_base_prog_key(devinfo, flags, robust_buffer_acccess, &key->base);
+ struct brw_wm_prog_key *key = &stage->key.wm;
 /* We set this to 0 here and set it to the actual value before we call
* brw_compile_fs.
*/
key->input_slots_valid = 0;
- /* Vulkan doesn't specify a default */
- key->high_quality_derivatives = false;
-
/* XXX Vulkan doesn't appear to specify */
key->clamp_fragment_color = false;
key->ignore_sample_mask_out = false;
- assert(subpass->color_count <= MAX_RTS);
- for (uint32_t i = 0; i < subpass->color_count; i++) {
- if (subpass->color_attachments[i].attachment != VK_ATTACHMENT_UNUSED)
- key->color_outputs_valid |= (1 << i);
- }
-
- key->nr_color_regions = subpass->color_count;
+ assert(rp == NULL || rp->color_attachment_count <= MAX_RTS);
+ /* Consider all inputs as valid until we look at the NIR variables. */
+ key->color_outputs_valid = rp_color_mask(rp);
+ key->nr_color_regions = util_last_bit(key->color_outputs_valid);
/* To reduce possible shader recompilations we would need to know if
* there is a SampleMask output variable to compute if we should emit
* code to workaround the issue that hardware disables alpha to coverage
* when there is SampleMask output.
+ *
+ * If the pipeline we compile the fragment shader in includes the output
+ * interface, then we can be sure whether alpha_coverage is enabled or not.
+ * If we don't have that output interface, then we have to compile the
+ * shader with some conditionals.
*/
- key->alpha_to_coverage = ms_info && ms_info->alphaToCoverageEnable;
-
- /* Vulkan doesn't support fixed-function alpha test */
- key->alpha_test_replicate_alpha = false;
-
- if (ms_info) {
- /* We should probably pull this out of the shader, but it's fairly
- * harmless to compute it and then let dead-code take care of it.
+ if (ms != NULL) {
+ /* VUID-VkGraphicsPipelineCreateInfo-rasterizerDiscardEnable-00751:
+ *
+ * "If the pipeline is being created with fragment shader state,
+ * pMultisampleState must be a valid pointer to a valid
+ * VkPipelineMultisampleStateCreateInfo structure"
+ *
+ * It's also required for the fragment output interface.
*/
- if (ms_info->rasterizationSamples > 1) {
- key->persample_interp = ms_info->sampleShadingEnable &&
- (ms_info->minSampleShading * ms_info->rasterizationSamples) > 1;
- key->multisample_fbo = true;
- }
+ key->multisample_fbo =
+ BITSET_TEST(dynamic, MESA_VK_DYNAMIC_MS_RASTERIZATION_SAMPLES) ?
+ BRW_SOMETIMES :
+ ms->rasterization_samples > 1 ? BRW_ALWAYS : BRW_NEVER;
+ key->persample_interp =
+ BITSET_TEST(dynamic, MESA_VK_DYNAMIC_MS_RASTERIZATION_SAMPLES) ?
+ BRW_SOMETIMES :
+ (ms->sample_shading_enable &&
+ (ms->min_sample_shading * ms->rasterization_samples) > 1) ?
+ BRW_ALWAYS : BRW_NEVER;
+ key->alpha_to_coverage =
+ BITSET_TEST(dynamic, MESA_VK_DYNAMIC_MS_ALPHA_TO_COVERAGE_ENABLE) ?
+ BRW_SOMETIMES :
+ (ms->alpha_to_coverage_enable ? BRW_ALWAYS : BRW_NEVER);
+
+ /* TODO: We should make this dynamic */
+ if (device->physical->instance->sample_mask_out_opengl_behaviour)
+ key->ignore_sample_mask_out = !key->multisample_fbo;
+ } else {
+ /* Consider all inputs as valid until we look at the NIR variables. */
+ key->color_outputs_valid = (1u << MAX_RTS) - 1;
+ key->nr_color_regions = MAX_RTS;
- key->frag_coord_adds_sample_pos = key->persample_interp;
+ key->alpha_to_coverage = BRW_SOMETIMES;
+ key->multisample_fbo = BRW_SOMETIMES;
+ key->persample_interp = BRW_SOMETIMES;
}
- key->coarse_pixel =
- device->vk.enabled_extensions.KHR_fragment_shading_rate &&
- pipeline_has_coarse_pixel(pipeline, fsr_info);
-}
-
-static void
-populate_cs_prog_key(const struct intel_device_info *devinfo,
- VkPipelineShaderStageCreateFlags flags,
- bool robust_buffer_acccess,
- const VkPipelineShaderStageRequiredSubgroupSizeCreateInfoEXT *rss_info,
- struct brw_cs_prog_key *key)
-{
- memset(key, 0, sizeof(*key));
+ key->mesh_input = is_mesh;
- populate_base_prog_key(devinfo, flags, robust_buffer_acccess, &key->base);
-
- if (rss_info) {
- assert(key->base.subgroup_size_type != BRW_SUBGROUP_SIZE_VARYING);
+ /* Vulkan doesn't support fixed-function alpha test */
+ key->alpha_test_replicate_alpha = false;
- /* These enum values are expressly chosen to be equal to the subgroup
- * size that they require.
- */
- assert(rss_info->requiredSubgroupSize == 8 ||
- rss_info->requiredSubgroupSize == 16 ||
- rss_info->requiredSubgroupSize == 32);
- key->base.subgroup_size_type = rss_info->requiredSubgroupSize;
- } else if (flags & VK_PIPELINE_SHADER_STAGE_CREATE_REQUIRE_FULL_SUBGROUPS_BIT_EXT) {
- /* If the client expressly requests full subgroups and they don't
- * specify a subgroup size, we need to pick one. If they're requested
- * varying subgroup sizes, we set it to UNIFORM and let the back-end
- * compiler pick. Otherwise, we specify the API value of 32.
- * Performance will likely be terrible in this case but there's nothing
- * we can do about that. The client should have chosen a size.
- */
- if (flags & VK_PIPELINE_SHADER_STAGE_CREATE_ALLOW_VARYING_SUBGROUP_SIZE_BIT_EXT)
- key->base.subgroup_size_type = BRW_SUBGROUP_SIZE_UNIFORM;
- else
- key->base.subgroup_size_type = BRW_SUBGROUP_SIZE_REQUIRE_32;
- }
+ key->coarse_pixel =
+ device->vk.enabled_extensions.KHR_fragment_shading_rate &&
+ pipeline_has_coarse_pixel(dynamic, ms, fsr);
}
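
As an illustration of the never/sometimes/always selection used above, here is a minimal standalone sketch; the enum and the tristate_from_state() helper are stand-ins for brw_sometimes and the inline logic in populate_wm_prog_key(), not driver code.

#include <stdbool.h>
#include <stdio.h>

/* Illustrative sketch, not part of this patch: when a piece of state is
 * dynamic, the key cannot bake in a value and the compiler must handle both
 * cases (SOMETIMES); otherwise the static value is recorded as ALWAYS/NEVER.
 */
enum tristate { TS_NEVER, TS_SOMETIMES, TS_ALWAYS };

static enum tristate
tristate_from_state(bool is_dynamic, bool static_value)
{
   if (is_dynamic)
      return TS_SOMETIMES;
   return static_value ? TS_ALWAYS : TS_NEVER;
}

int main(void)
{
   printf("%d\n", tristate_from_state(false, true));   /* TS_ALWAYS */
   printf("%d\n", tristate_from_state(true, false));   /* TS_SOMETIMES */
   return 0;
}
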
static void
-populate_bs_prog_key(const struct intel_device_info *devinfo,
- VkPipelineShaderStageCreateFlags flags,
- bool robust_buffer_access,
- struct brw_bs_prog_key *key)
+populate_cs_prog_key(struct anv_pipeline_stage *stage,
+ const struct anv_device *device)
{
- memset(key, 0, sizeof(*key));
+ memset(&stage->key, 0, sizeof(stage->key));
- populate_base_prog_key(devinfo, flags, robust_buffer_access, &key->base);
+ populate_base_prog_key(stage, device);
}
-struct anv_pipeline_stage {
- gl_shader_stage stage;
-
- const struct vk_shader_module *module;
- const char *entrypoint;
- const VkSpecializationInfo *spec_info;
-
- unsigned char shader_sha1[20];
-
- union brw_any_prog_key key;
+static void
+populate_bs_prog_key(struct anv_pipeline_stage *stage,
+ const struct anv_device *device,
+ uint32_t ray_flags)
+{
+ memset(&stage->key, 0, sizeof(stage->key));
- struct {
- gl_shader_stage stage;
- unsigned char sha1[20];
- } cache_key;
+ populate_base_prog_key(stage, device);
- nir_shader *nir;
+ stage->key.bs.pipeline_ray_flags = ray_flags;
+}
- struct anv_pipeline_binding surface_to_descriptor[256];
- struct anv_pipeline_binding sampler_to_descriptor[256];
- struct anv_pipeline_bind_map bind_map;
+static void
+anv_stage_write_shader_hash(struct anv_pipeline_stage *stage,
+ const struct anv_device *device)
+{
+ vk_pipeline_robustness_state_fill(&device->vk,
+ &stage->rstate,
+ stage->pipeline_pNext,
+ stage->info->pNext);
- union brw_any_prog_data prog_data;
+ vk_pipeline_hash_shader_stage(stage->info, &stage->rstate, stage->shader_sha1);
- uint32_t num_stats;
- struct brw_compile_stats stats[3];
- char *disasm[3];
+ stage->robust_flags = anv_get_robust_flags(&stage->rstate);
- VkPipelineCreationFeedbackEXT feedback;
+ /* Use lowest dword of source shader sha1 for shader hash. */
+ stage->source_hash = ((uint32_t*)stage->shader_sha1)[0];
+}
- const unsigned *code;
+static bool
+anv_graphics_pipeline_stage_fragment_dynamic(const struct anv_pipeline_stage *stage)
+{
+ if (stage->stage != MESA_SHADER_FRAGMENT)
+ return false;
- struct anv_shader_bin *bin;
-};
+ return stage->key.wm.persample_interp == BRW_SOMETIMES ||
+ stage->key.wm.multisample_fbo == BRW_SOMETIMES ||
+ stage->key.wm.alpha_to_coverage == BRW_SOMETIMES;
+}
static void
-anv_pipeline_hash_shader(const struct vk_shader_module *module,
- const char *entrypoint,
- gl_shader_stage stage,
- const VkSpecializationInfo *spec_info,
- unsigned char *sha1_out)
+anv_pipeline_hash_common(struct mesa_sha1 *ctx,
+ const struct anv_pipeline *pipeline)
{
- struct mesa_sha1 ctx;
- _mesa_sha1_init(&ctx);
+ struct anv_device *device = pipeline->device;
- _mesa_sha1_update(&ctx, module->sha1, sizeof(module->sha1));
- _mesa_sha1_update(&ctx, entrypoint, strlen(entrypoint));
- _mesa_sha1_update(&ctx, &stage, sizeof(stage));
- if (spec_info) {
- _mesa_sha1_update(&ctx, spec_info->pMapEntries,
- spec_info->mapEntryCount *
- sizeof(*spec_info->pMapEntries));
- _mesa_sha1_update(&ctx, spec_info->pData,
- spec_info->dataSize);
- }
+ _mesa_sha1_update(ctx, pipeline->layout.sha1, sizeof(pipeline->layout.sha1));
- _mesa_sha1_final(&ctx, sha1_out);
+ const bool indirect_descriptors = device->physical->indirect_descriptors;
+ _mesa_sha1_update(ctx, &indirect_descriptors, sizeof(indirect_descriptors));
+
+ const bool rba = device->robust_buffer_access;
+ _mesa_sha1_update(ctx, &rba, sizeof(rba));
+
+ const int spilling_rate = device->physical->compiler->spilling_rate;
+ _mesa_sha1_update(ctx, &spilling_rate, sizeof(spilling_rate));
}
static void
-anv_pipeline_hash_graphics(struct anv_graphics_pipeline *pipeline,
- struct anv_pipeline_layout *layout,
+anv_pipeline_hash_graphics(struct anv_graphics_base_pipeline *pipeline,
struct anv_pipeline_stage *stages,
+ uint32_t view_mask,
unsigned char *sha1_out)
{
+ const struct anv_device *device = pipeline->base.device;
struct mesa_sha1 ctx;
_mesa_sha1_init(&ctx);
- _mesa_sha1_update(&ctx, &pipeline->subpass->view_mask,
- sizeof(pipeline->subpass->view_mask));
+ anv_pipeline_hash_common(&ctx, &pipeline->base);
- if (layout)
- _mesa_sha1_update(&ctx, layout->sha1, sizeof(layout->sha1));
+ _mesa_sha1_update(&ctx, &view_mask, sizeof(view_mask));
- const bool rba = pipeline->base.device->robust_buffer_access;
- _mesa_sha1_update(&ctx, &rba, sizeof(rba));
-
- for (unsigned s = 0; s < ARRAY_SIZE(pipeline->shaders); s++) {
- if (stages[s].entrypoint) {
+ for (uint32_t s = 0; s < ANV_GRAPHICS_SHADER_STAGE_COUNT; s++) {
+ if (pipeline->base.active_stages & BITFIELD_BIT(s)) {
_mesa_sha1_update(&ctx, stages[s].shader_sha1,
sizeof(stages[s].shader_sha1));
_mesa_sha1_update(&ctx, &stages[s].key, brw_prog_key_size(s));
}
}
+ if (stages[MESA_SHADER_MESH].info || stages[MESA_SHADER_TASK].info) {
+ const uint8_t afs = device->physical->instance->assume_full_subgroups;
+ _mesa_sha1_update(&ctx, &afs, sizeof(afs));
+ }
+
_mesa_sha1_final(&ctx, sha1_out);
}
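
The overall shape of the cache-key hashing above (pipeline-wide inputs first, then each active stage's source hash and compile key) can be sketched standalone; FNV-1a stands in for Mesa's _mesa_sha1_* helpers and the struct below is purely illustrative.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Illustrative sketch, not part of this patch. */
static uint64_t
fnv1a(uint64_t h, const void *data, size_t len)
{
   const uint8_t *p = data;
   for (size_t i = 0; i < len; i++)
      h = (h ^ p[i]) * 0x100000001b3ull;
   return h;
}

struct fake_stage {
   uint8_t shader_sha1[20];   /* per-stage source hash */
   uint8_t key[32];           /* per-stage compile key */
};

int main(void)
{
   uint64_t h = 0xcbf29ce484222325ull;

   /* Pipeline-wide inputs first: layout hash, robustness, etc. */
   const uint8_t layout_sha1[20] = {0};
   const int robust = 1;
   h = fnv1a(h, layout_sha1, sizeof(layout_sha1));
   h = fnv1a(h, &robust, sizeof(robust));

   /* Then each active stage: its source hash and its compile key. */
   struct fake_stage stages[2];
   memset(stages, 0, sizeof(stages));
   for (int s = 0; s < 2; s++) {
      h = fnv1a(h, stages[s].shader_sha1, sizeof(stages[s].shader_sha1));
      h = fnv1a(h, stages[s].key, sizeof(stages[s].key));
   }

   printf("cache key: %016llx\n", (unsigned long long)h);
   return 0;
}
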
static void
anv_pipeline_hash_compute(struct anv_compute_pipeline *pipeline,
- struct anv_pipeline_layout *layout,
struct anv_pipeline_stage *stage,
unsigned char *sha1_out)
{
+ const struct anv_device *device = pipeline->base.device;
struct mesa_sha1 ctx;
_mesa_sha1_init(&ctx);
- if (layout)
- _mesa_sha1_update(&ctx, layout->sha1, sizeof(layout->sha1));
+ anv_pipeline_hash_common(&ctx, &pipeline->base);
- const bool rba = pipeline->base.device->robust_buffer_access;
- _mesa_sha1_update(&ctx, &rba, sizeof(rba));
+ const uint8_t afs = device->physical->instance->assume_full_subgroups;
+ _mesa_sha1_update(&ctx, &afs, sizeof(afs));
_mesa_sha1_update(&ctx, stage->shader_sha1,
sizeof(stage->shader_sha1));
@@ -705,18 +814,13 @@ anv_pipeline_hash_compute(struct anv_compute_pipeline *pipeline,
static void
anv_pipeline_hash_ray_tracing_shader(struct anv_ray_tracing_pipeline *pipeline,
- struct anv_pipeline_layout *layout,
struct anv_pipeline_stage *stage,
unsigned char *sha1_out)
{
struct mesa_sha1 ctx;
_mesa_sha1_init(&ctx);
- if (layout != NULL)
- _mesa_sha1_update(&ctx, layout->sha1, sizeof(layout->sha1));
-
- const bool rba = pipeline->base.device->robust_buffer_access;
- _mesa_sha1_update(&ctx, &rba, sizeof(rba));
+ anv_pipeline_hash_common(&ctx, &pipeline->base);
_mesa_sha1_update(&ctx, stage->shader_sha1, sizeof(stage->shader_sha1));
_mesa_sha1_update(&ctx, &stage->key, sizeof(stage->key.bs));
@@ -726,7 +830,6 @@ anv_pipeline_hash_ray_tracing_shader(struct anv_ray_tracing_pipeline *pipeline,
static void
anv_pipeline_hash_ray_tracing_combined_shader(struct anv_ray_tracing_pipeline *pipeline,
- struct anv_pipeline_layout *layout,
struct anv_pipeline_stage *intersection,
struct anv_pipeline_stage *any_hit,
unsigned char *sha1_out)
@@ -734,8 +837,8 @@ anv_pipeline_hash_ray_tracing_combined_shader(struct anv_ray_tracing_pipeline *p
struct mesa_sha1 ctx;
_mesa_sha1_init(&ctx);
- if (layout != NULL)
- _mesa_sha1_update(&ctx, layout->sha1, sizeof(layout->sha1));
+ _mesa_sha1_update(&ctx, pipeline->base.layout.sha1,
+ sizeof(pipeline->base.layout.sha1));
const bool rba = pipeline->base.device->robust_buffer_access;
_mesa_sha1_update(&ctx, &rba, sizeof(rba));
@@ -750,14 +853,14 @@ anv_pipeline_hash_ray_tracing_combined_shader(struct anv_ray_tracing_pipeline *p
static nir_shader *
anv_pipeline_stage_get_nir(struct anv_pipeline *pipeline,
- struct anv_pipeline_cache *cache,
+ struct vk_pipeline_cache *cache,
void *mem_ctx,
struct anv_pipeline_stage *stage)
{
const struct brw_compiler *compiler =
pipeline->device->physical->compiler;
const nir_shader_compiler_options *nir_options =
- compiler->glsl_compiler_options[stage->stage].NirOptions;
+ compiler->nir_options[stage->stage];
nir_shader *nir;
nir = anv_device_search_for_nir(pipeline->device, cache,
@@ -769,12 +872,8 @@ anv_pipeline_stage_get_nir(struct anv_pipeline *pipeline,
return nir;
}
- nir = anv_shader_compile_to_nir(pipeline->device,
- mem_ctx,
- stage->module,
- stage->entrypoint,
- stage->stage,
- stage->spec_info);
+ nir = anv_shader_stage_to_nir(pipeline->device, stage->info,
+ stage->key.base.robust_flags, mem_ctx);
if (nir) {
anv_device_upload_nir(pipeline->device, cache, nir, stage->shader_sha1);
return nir;
@@ -783,6 +882,29 @@ anv_pipeline_stage_get_nir(struct anv_pipeline *pipeline,
return NULL;
}
+static const struct vk_ycbcr_conversion_state *
+lookup_ycbcr_conversion(const void *_sets_layout, uint32_t set,
+ uint32_t binding, uint32_t array_index)
+{
+ const struct anv_pipeline_sets_layout *sets_layout = _sets_layout;
+
+ assert(set < MAX_SETS);
+ assert(binding < sets_layout->set[set].layout->binding_count);
+ const struct anv_descriptor_set_binding_layout *bind_layout =
+ &sets_layout->set[set].layout->binding[binding];
+
+ if (bind_layout->immutable_samplers == NULL)
+ return NULL;
+
+ array_index = MIN2(array_index, bind_layout->array_size - 1);
+
+ const struct anv_sampler *sampler =
+ bind_layout->immutable_samplers[array_index];
+
+ return sampler && sampler->vk.ycbcr_conversion ?
+ &sampler->vk.ycbcr_conversion->state : NULL;
+}
+
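
The lookup_ycbcr_conversion() callback above follows a common C pattern: the lowering pass receives an opaque void * context and the callback casts it back to the concrete layout type. A minimal standalone sketch of that pattern (all names below are placeholders, not driver code):

#include <stdio.h>

/* Illustrative sketch, not part of this patch. */
struct layout { int binding_count; };

typedef const char *(*lookup_cb)(const void *ctx, int binding);

static const char *
lookup(const void *ctx, int binding)
{
   const struct layout *l = ctx;   /* recover the typed context */
   return binding < l->binding_count ? "conversion" : NULL;
}

/* Stands in for a lowering pass that only knows about the callback. */
static void
run_pass(lookup_cb cb, const void *ctx)
{
   const char *r = cb(ctx, 0);
   printf("%s\n", r ? r : "none");
}

int main(void)
{
   struct layout l = { .binding_count = 4 };
   run_pass(lookup, &l);
   return 0;
}
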
static void
shared_type_info(const struct glsl_type *type, unsigned *size, unsigned *align)
{
@@ -795,11 +917,91 @@ shared_type_info(const struct glsl_type *type, unsigned *size, unsigned *align)
*align = comp_size * (length == 3 ? 4 : length);
}
+static enum anv_dynamic_push_bits
+anv_nir_compute_dynamic_push_bits(nir_shader *shader)
+{
+ enum anv_dynamic_push_bits ret = 0;
+
+ nir_foreach_function_impl(impl, shader) {
+ nir_foreach_block(block, impl) {
+ nir_foreach_instr(instr, block) {
+ if (instr->type != nir_instr_type_intrinsic)
+ continue;
+
+ nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
+ if (intrin->intrinsic != nir_intrinsic_load_push_constant)
+ continue;
+
+ switch (nir_intrinsic_base(intrin)) {
+ case offsetof(struct anv_push_constants, gfx.tcs_input_vertices):
+ ret |= ANV_DYNAMIC_PUSH_INPUT_VERTICES;
+ break;
+
+ default:
+ break;
+ }
+ }
+ }
+ }
+
+ return ret;
+}
+
+static void
+anv_fixup_subgroup_size(struct anv_device *device, struct shader_info *info)
+{
+ switch (info->stage) {
+ case MESA_SHADER_COMPUTE:
+ case MESA_SHADER_TASK:
+ case MESA_SHADER_MESH:
+ break;
+ default:
+ return;
+ }
+
+ unsigned local_size = info->workgroup_size[0] *
+ info->workgroup_size[1] *
+ info->workgroup_size[2];
+
+   /* Games don't always request full subgroups when they should,
+    * which can cause bugs, as they may expect a bigger subgroup size
+    * than the one we choose for execution.
+ */
+ if (device->physical->instance->assume_full_subgroups &&
+ info->uses_wide_subgroup_intrinsics &&
+ info->subgroup_size == SUBGROUP_SIZE_API_CONSTANT &&
+ local_size &&
+ local_size % BRW_SUBGROUP_SIZE == 0)
+ info->subgroup_size = SUBGROUP_SIZE_FULL_SUBGROUPS;
+
+ /* If the client requests that we dispatch full subgroups but doesn't
+ * allow us to pick a subgroup size, we have to smash it to the API
+ * value of 32. Performance will likely be terrible in this case but
+ * there's nothing we can do about that. The client should have chosen
+ * a size.
+ */
+ if (info->subgroup_size == SUBGROUP_SIZE_FULL_SUBGROUPS)
+ info->subgroup_size =
+ device->physical->instance->assume_full_subgroups != 0 ?
+ device->physical->instance->assume_full_subgroups : BRW_SUBGROUP_SIZE;
+
+ /* Cooperative matrix extension requires that all invocations in a subgroup
+ * be active. As a result, when the application does not request a specific
+ * subgroup size, we must use SIMD32.
+ */
+ if (info->stage == MESA_SHADER_COMPUTE && info->cs.has_cooperative_matrix &&
+ info->subgroup_size < SUBGROUP_SIZE_REQUIRE_8) {
+ info->subgroup_size = BRW_SUBGROUP_SIZE;
+ }
+}
+
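
A reduced, standalone sketch of the resolution order in anv_fixup_subgroup_size() above; the constants and the boolean assume_full_subgroups parameter simplify the driver's SUBGROUP_SIZE_* and instance-option handling and are not the real values.

#include <stdbool.h>
#include <stdio.h>

/* Illustrative sketch, not part of this patch. */
#define API_CONSTANT      0   /* app relies on the API-visible constant */
#define FULL_SUBGROUPS    1   /* app asked for full subgroups, no size given */
#define HW_SUBGROUP_SIZE 32

static unsigned
resolve_subgroup_size(bool assume_full_subgroups, bool uses_wide_intrinsics,
                      unsigned local_size, unsigned requested)
{
   /* Workaround: promote API_CONSTANT to FULL_SUBGROUPS when the instance
    * option is set and the workgroup size divides evenly. */
   if (assume_full_subgroups && uses_wide_intrinsics &&
       requested == API_CONSTANT && local_size &&
       local_size % HW_SUBGROUP_SIZE == 0)
      requested = FULL_SUBGROUPS;

   /* Full subgroups without an explicit size collapse to the API value. */
   if (requested == FULL_SUBGROUPS)
      return HW_SUBGROUP_SIZE;

   return requested;
}

int main(void)
{
   printf("%u\n", resolve_subgroup_size(true, true, 64, API_CONSTANT));
   return 0;
}
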
static void
anv_pipeline_lower_nir(struct anv_pipeline *pipeline,
void *mem_ctx,
struct anv_pipeline_stage *stage,
- struct anv_pipeline_layout *layout)
+ struct anv_pipeline_sets_layout *layout,
+ uint32_t view_mask,
+ bool use_primitive_replication)
{
const struct anv_physical_device *pdevice = pipeline->device->physical;
const struct brw_compiler *compiler = pdevice->compiler;
@@ -808,80 +1010,153 @@ anv_pipeline_lower_nir(struct anv_pipeline *pipeline,
nir_shader *nir = stage->nir;
if (nir->info.stage == MESA_SHADER_FRAGMENT) {
- /* Check if sample shading is enabled in the shader and toggle
- * it on for the pipeline independent if sampleShadingEnable is set.
- */
- nir_shader_gather_info(nir, nir_shader_get_entrypoint(nir));
- if (nir->info.fs.uses_sample_shading)
- anv_pipeline_to_graphics(pipeline)->sample_shading_enable = true;
+ NIR_PASS(_, nir, nir_lower_wpos_center);
+ NIR_PASS(_, nir, nir_lower_input_attachments,
+ &(nir_input_attachment_options) {
+ .use_fragcoord_sysval = true,
+ .use_layer_id_sysval = true,
+ });
+ }
+
+ if (nir->info.stage == MESA_SHADER_MESH ||
+ nir->info.stage == MESA_SHADER_TASK) {
+ nir_lower_compute_system_values_options options = {
+ .lower_cs_local_id_to_index = true,
+ .lower_workgroup_id_to_index = true,
+ /* nir_lower_idiv generates expensive code */
+ .shortcut_1d_workgroup_id = compiler->devinfo->verx10 >= 125,
+ };
- NIR_PASS_V(nir, nir_lower_wpos_center,
- anv_pipeline_to_graphics(pipeline)->sample_shading_enable);
- NIR_PASS_V(nir, nir_lower_input_attachments,
- &(nir_input_attachment_options) {
- .use_fragcoord_sysval = true,
- .use_layer_id_sysval = true,
- });
+ NIR_PASS(_, nir, nir_lower_compute_system_values, &options);
}
- NIR_PASS_V(nir, anv_nir_lower_ycbcr_textures, layout);
+ NIR_PASS(_, nir, nir_vk_lower_ycbcr_tex, lookup_ycbcr_conversion, layout);
- if (pipeline->type == ANV_PIPELINE_GRAPHICS) {
- NIR_PASS_V(nir, anv_nir_lower_multiview,
- anv_pipeline_to_graphics(pipeline));
+ if (pipeline->type == ANV_PIPELINE_GRAPHICS ||
+ pipeline->type == ANV_PIPELINE_GRAPHICS_LIB) {
+ NIR_PASS(_, nir, anv_nir_lower_multiview, view_mask,
+ use_primitive_replication);
}
+ if (nir->info.stage == MESA_SHADER_COMPUTE && nir->info.cs.has_cooperative_matrix) {
+ anv_fixup_subgroup_size(pipeline->device, &nir->info);
+ NIR_PASS(_, nir, brw_nir_lower_cmat, nir->info.subgroup_size);
+ NIR_PASS_V(nir, nir_lower_indirect_derefs, nir_var_function_temp, 16);
+ }
+
+ /* The patch control points are delivered through a push constant when
+ * dynamic.
+ */
+ if (nir->info.stage == MESA_SHADER_TESS_CTRL &&
+ stage->key.tcs.input_vertices == 0)
+ NIR_PASS(_, nir, anv_nir_lower_load_patch_vertices_in);
+
nir_shader_gather_info(nir, nir_shader_get_entrypoint(nir));
- NIR_PASS_V(nir, brw_nir_lower_storage_image, compiler->devinfo);
+ NIR_PASS(_, nir, brw_nir_lower_storage_image,
+ &(struct brw_nir_lower_storage_image_opts) {
+               /* Anv only supports Gfx9+, which has better-defined typed-read
+                * behavior, so we only have to care about lowering loads.
+ */
+ .devinfo = compiler->devinfo,
+ .lower_loads = true,
+ });
- NIR_PASS_V(nir, nir_lower_explicit_io, nir_var_mem_global,
- nir_address_format_64bit_global);
- NIR_PASS_V(nir, nir_lower_explicit_io, nir_var_mem_push_const,
- nir_address_format_32bit_offset);
+ NIR_PASS(_, nir, nir_lower_explicit_io, nir_var_mem_global,
+ nir_address_format_64bit_global);
+ NIR_PASS(_, nir, nir_lower_explicit_io, nir_var_mem_push_const,
+ nir_address_format_32bit_offset);
+
+ NIR_PASS(_, nir, brw_nir_lower_ray_queries, &pdevice->info);
+
+ stage->push_desc_info.used_descriptors =
+ anv_nir_compute_used_push_descriptors(nir, layout);
+
+ struct anv_pipeline_push_map push_map = {};
/* Apply the actual pipeline layout to UBOs, SSBOs, and textures */
- anv_nir_apply_pipeline_layout(pdevice,
- pipeline->device->robust_buffer_access,
- layout, nir, &stage->bind_map);
+ NIR_PASS_V(nir, anv_nir_apply_pipeline_layout,
+ pdevice, stage->key.base.robust_flags,
+ layout->independent_sets,
+ layout, &stage->bind_map, &push_map, mem_ctx);
- NIR_PASS_V(nir, nir_lower_explicit_io, nir_var_mem_ubo,
- anv_nir_ubo_addr_format(pdevice,
- pipeline->device->robust_buffer_access));
- NIR_PASS_V(nir, nir_lower_explicit_io, nir_var_mem_ssbo,
- anv_nir_ssbo_addr_format(pdevice,
- pipeline->device->robust_buffer_access));
+ NIR_PASS(_, nir, nir_lower_explicit_io, nir_var_mem_ubo,
+ anv_nir_ubo_addr_format(pdevice, stage->key.base.robust_flags));
+ NIR_PASS(_, nir, nir_lower_explicit_io, nir_var_mem_ssbo,
+ anv_nir_ssbo_addr_format(pdevice, stage->key.base.robust_flags));
/* First run copy-prop to get rid of all of the vec() that address
* calculations often create and then constant-fold so that, when we
* get to anv_nir_lower_ubo_loads, we can detect constant offsets.
*/
- NIR_PASS_V(nir, nir_copy_prop);
- NIR_PASS_V(nir, nir_opt_constant_folding);
+ bool progress;
+ do {
+ progress = false;
+ NIR_PASS(progress, nir, nir_opt_algebraic);
+ NIR_PASS(progress, nir, nir_copy_prop);
+ NIR_PASS(progress, nir, nir_opt_constant_folding);
+ NIR_PASS(progress, nir, nir_opt_dce);
+ } while (progress);
+
+ /* Required for nir_divergence_analysis() which is needed for
+ * anv_nir_lower_ubo_loads.
+ */
+ NIR_PASS(_, nir, nir_convert_to_lcssa, true, true);
+ nir_divergence_analysis(nir);
+
+ NIR_PASS(_, nir, anv_nir_lower_ubo_loads);
- NIR_PASS_V(nir, anv_nir_lower_ubo_loads);
+ NIR_PASS(_, nir, nir_opt_remove_phis);
- /* We don't support non-uniform UBOs and non-uniform SSBO access is
- * handled naturally by falling back to A64 messages.
+ enum nir_lower_non_uniform_access_type lower_non_uniform_access_types =
+ nir_lower_non_uniform_texture_access |
+ nir_lower_non_uniform_image_access |
+ nir_lower_non_uniform_get_ssbo_size;
+
+ /* In practice, most shaders do not have non-uniform-qualified
+ * accesses (see
+ * https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/17558#note_1475069)
+    * so we first run a cheaper check that will most likely find nothing.
*/
- NIR_PASS_V(nir, nir_lower_non_uniform_access,
- &(nir_lower_non_uniform_access_options) {
- .types = nir_lower_non_uniform_texture_access |
- nir_lower_non_uniform_image_access,
+ if (nir_has_non_uniform_access(nir, lower_non_uniform_access_types)) {
+ NIR_PASS(_, nir, nir_opt_non_uniform_access);
+
+ /* We don't support non-uniform UBOs and non-uniform SSBO access is
+ * handled naturally by falling back to A64 messages.
+ */
+ NIR_PASS(_, nir, nir_lower_non_uniform_access,
+ &(nir_lower_non_uniform_access_options) {
+ .types = lower_non_uniform_access_types,
.callback = NULL,
- });
+ });
+
+ NIR_PASS(_, nir, intel_nir_lower_non_uniform_resource_intel);
+ NIR_PASS(_, nir, intel_nir_cleanup_resource_intel);
+ NIR_PASS(_, nir, nir_opt_dce);
+ }
+
+ NIR_PASS_V(nir, anv_nir_update_resource_intel_block);
- anv_nir_compute_push_layout(pdevice, pipeline->device->robust_buffer_access,
- nir, prog_data, &stage->bind_map, mem_ctx);
+ stage->dynamic_push_values = anv_nir_compute_dynamic_push_bits(nir);
+
+ NIR_PASS_V(nir, anv_nir_compute_push_layout,
+ pdevice, stage->key.base.robust_flags,
+ anv_graphics_pipeline_stage_fragment_dynamic(stage),
+ prog_data, &stage->bind_map, &push_map,
+ pipeline->layout.type, mem_ctx);
+
+ NIR_PASS_V(nir, anv_nir_lower_resource_intel, pdevice,
+ pipeline->layout.type);
if (gl_shader_stage_uses_workgroup(nir->info.stage)) {
if (!nir->info.shared_memory_explicit_layout) {
- NIR_PASS_V(nir, nir_lower_vars_to_explicit_types,
- nir_var_mem_shared, shared_type_info);
+ NIR_PASS(_, nir, nir_lower_vars_to_explicit_types,
+ nir_var_mem_shared, shared_type_info);
}
- NIR_PASS_V(nir, nir_lower_explicit_io,
- nir_var_mem_shared, nir_address_format_32bit_offset);
+ NIR_PASS(_, nir, nir_lower_explicit_io,
+ nir_var_mem_shared, nir_address_format_32bit_offset);
if (nir->info.zero_initialize_shared_memory &&
nir->info.shared_size > 0) {
@@ -894,11 +1169,22 @@ anv_pipeline_lower_nir(struct anv_pipeline *pipeline,
assert(shared_size <=
intel_calculate_slm_size(compiler->devinfo->ver, nir->info.shared_size));
- NIR_PASS_V(nir, nir_zero_initialize_shared_memory,
- shared_size, chunk_size);
+ NIR_PASS(_, nir, nir_zero_initialize_shared_memory,
+ shared_size, chunk_size);
}
}
+ if (gl_shader_stage_is_compute(nir->info.stage) ||
+ gl_shader_stage_is_mesh(nir->info.stage)) {
+ NIR_PASS(_, nir, brw_nir_lower_cs_intrinsics, compiler->devinfo,
+ &stage->prog_data.cs);
+ }
+
+ stage->push_desc_info.used_set_buffer =
+ anv_nir_loads_push_desc_buffer(nir, layout, &stage->bind_map);
+ stage->push_desc_info.fully_promoted_ubo_descriptors =
+ anv_nir_push_desc_ubo_fully_promoted(nir, layout, &stage->bind_map);
+
stage->nir = nir;
}
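
The copy-prop/constant-fold loop above runs a set of passes until none of them reports progress; the pattern itself, with placeholder passes instead of real NIR ones, looks like this (a sketch, not driver code):

#include <stdbool.h>
#include <stdio.h>

/* Illustrative sketch, not part of this patch. */
struct ir { int foldable; };

/* Each pass returns true if it changed the IR (placeholder behaviour). */
static bool
pass_fold(struct ir *ir)
{
   if (ir->foldable > 0) {
      ir->foldable--;
      return true;
   }
   return false;
}

static bool
pass_dce(struct ir *ir)
{
   (void)ir;
   return false;
}

int main(void)
{
   struct ir ir = { .foldable = 3 };
   bool progress;
   do {
      progress = false;
      progress |= pass_fold(&ir);
      progress |= pass_dce(&ir);
   } while (progress);
   printf("done, foldable left: %d\n", ir.foldable);
   return 0;
}
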
@@ -914,14 +1200,19 @@ anv_pipeline_link_vs(const struct brw_compiler *compiler,
static void
anv_pipeline_compile_vs(const struct brw_compiler *compiler,
void *mem_ctx,
- struct anv_graphics_pipeline *pipeline,
- struct anv_pipeline_stage *vs_stage)
+ struct anv_graphics_base_pipeline *pipeline,
+ struct anv_pipeline_stage *vs_stage,
+ uint32_t view_mask)
{
/* When using Primitive Replication for multiview, each view gets its own
* position slot.
*/
- uint32_t pos_slots = pipeline->use_primitive_replication ?
- anv_subpass_view_count(pipeline->subpass) : 1;
+ uint32_t pos_slots =
+ (vs_stage->nir->info.per_view_outputs & VARYING_BIT_POS) ?
+ MAX2(1, util_bitcount(view_mask)) : 1;
+
+ /* Only position is allowed to be per-view */
+ assert(!(vs_stage->nir->info.per_view_outputs & ~VARYING_BIT_POS));
brw_compute_vue_map(compiler->devinfo,
&vs_stage->prog_data.vs.base.vue_map,
@@ -932,14 +1223,18 @@ anv_pipeline_compile_vs(const struct brw_compiler *compiler,
vs_stage->num_stats = 1;
struct brw_compile_vs_params params = {
- .nir = vs_stage->nir,
+ .base = {
+ .nir = vs_stage->nir,
+ .stats = vs_stage->stats,
+ .log_data = pipeline->base.device,
+ .mem_ctx = mem_ctx,
+ .source_hash = vs_stage->source_hash,
+ },
.key = &vs_stage->key.vs,
.prog_data = &vs_stage->prog_data.vs,
- .stats = vs_stage->stats,
- .log_data = pipeline->base.device,
};
- vs_stage->code = brw_compile_vs(compiler, mem_ctx, &params);
+ vs_stage->code = brw_compile_vs(compiler, &params);
}
static void
@@ -973,10 +1268,10 @@ merge_tess_info(struct shader_info *tes_info,
tcs_info->tess.spacing == tes_info->tess.spacing);
tes_info->tess.spacing |= tcs_info->tess.spacing;
- assert(tcs_info->tess.primitive_mode == 0 ||
- tes_info->tess.primitive_mode == 0 ||
- tcs_info->tess.primitive_mode == tes_info->tess.primitive_mode);
- tes_info->tess.primitive_mode |= tcs_info->tess.primitive_mode;
+ assert(tcs_info->tess._primitive_mode == 0 ||
+ tes_info->tess._primitive_mode == 0 ||
+ tcs_info->tess._primitive_mode == tes_info->tess._primitive_mode);
+ tes_info->tess._primitive_mode |= tcs_info->tess._primitive_mode;
tes_info->tess.ccw |= tcs_info->tess.ccw;
tes_info->tess.point_mode |= tcs_info->tess.point_mode;
}
@@ -1001,12 +1296,8 @@ anv_pipeline_link_tcs(const struct brw_compiler *compiler,
* this comes from the SPIR-V, which is part of the hash used for the
* pipeline cache. So it should be safe.
*/
- tcs_stage->key.tcs.tes_primitive_mode =
- tes_stage->nir->info.tess.primitive_mode;
- tcs_stage->key.tcs.quads_workaround =
- compiler->devinfo->ver < 9 &&
- tes_stage->nir->info.tess.primitive_mode == 7 /* GL_QUADS */ &&
- tes_stage->nir->info.tess.spacing == TESS_SPACING_EQUAL;
+ tcs_stage->key.tcs._tes_primitive_mode =
+ tes_stage->nir->info.tess._primitive_mode;
}
static void
@@ -1022,11 +1313,20 @@ anv_pipeline_compile_tcs(const struct brw_compiler *compiler,
tcs_stage->nir->info.patch_outputs_written;
tcs_stage->num_stats = 1;
- tcs_stage->code = brw_compile_tcs(compiler, device, mem_ctx,
- &tcs_stage->key.tcs,
- &tcs_stage->prog_data.tcs,
- tcs_stage->nir, -1,
- tcs_stage->stats, NULL);
+
+ struct brw_compile_tcs_params params = {
+ .base = {
+ .nir = tcs_stage->nir,
+ .stats = tcs_stage->stats,
+ .log_data = device,
+ .mem_ctx = mem_ctx,
+ .source_hash = tcs_stage->source_hash,
+ },
+ .key = &tcs_stage->key.tcs,
+ .prog_data = &tcs_stage->prog_data.tcs,
+ };
+
+ tcs_stage->code = brw_compile_tcs(compiler, &params);
}
static void
@@ -1051,12 +1351,21 @@ anv_pipeline_compile_tes(const struct brw_compiler *compiler,
tcs_stage->nir->info.patch_outputs_written;
tes_stage->num_stats = 1;
- tes_stage->code = brw_compile_tes(compiler, device, mem_ctx,
- &tes_stage->key.tes,
- &tcs_stage->prog_data.tcs.base.vue_map,
- &tes_stage->prog_data.tes,
- tes_stage->nir, -1,
- tes_stage->stats, NULL);
+
+ struct brw_compile_tes_params params = {
+ .base = {
+ .nir = tes_stage->nir,
+ .stats = tes_stage->stats,
+ .log_data = device,
+ .mem_ctx = mem_ctx,
+ .source_hash = tes_stage->source_hash,
+ },
+ .key = &tes_stage->key.tes,
+ .prog_data = &tes_stage->prog_data.tes,
+ .input_vue_map = &tcs_stage->prog_data.tcs.base.vue_map,
+ };
+
+ tes_stage->code = brw_compile_tes(compiler, &params);
}
static void
@@ -1081,17 +1390,120 @@ anv_pipeline_compile_gs(const struct brw_compiler *compiler,
gs_stage->nir->info.separate_shader, 1);
gs_stage->num_stats = 1;
- gs_stage->code = brw_compile_gs(compiler, device, mem_ctx,
- &gs_stage->key.gs,
- &gs_stage->prog_data.gs,
- gs_stage->nir, -1,
- gs_stage->stats, NULL);
+
+ struct brw_compile_gs_params params = {
+ .base = {
+ .nir = gs_stage->nir,
+ .stats = gs_stage->stats,
+ .log_data = device,
+ .mem_ctx = mem_ctx,
+ .source_hash = gs_stage->source_hash,
+ },
+ .key = &gs_stage->key.gs,
+ .prog_data = &gs_stage->prog_data.gs,
+ };
+
+ gs_stage->code = brw_compile_gs(compiler, &params);
+}
+
+static void
+anv_pipeline_link_task(const struct brw_compiler *compiler,
+ struct anv_pipeline_stage *task_stage,
+ struct anv_pipeline_stage *next_stage)
+{
+ assert(next_stage);
+ assert(next_stage->stage == MESA_SHADER_MESH);
+ brw_nir_link_shaders(compiler, task_stage->nir, next_stage->nir);
+}
+
+static void
+anv_pipeline_compile_task(const struct brw_compiler *compiler,
+ void *mem_ctx,
+ struct anv_device *device,
+ struct anv_pipeline_stage *task_stage)
+{
+ task_stage->num_stats = 1;
+
+ struct brw_compile_task_params params = {
+ .base = {
+ .nir = task_stage->nir,
+ .stats = task_stage->stats,
+ .log_data = device,
+ .mem_ctx = mem_ctx,
+ .source_hash = task_stage->source_hash,
+ },
+ .key = &task_stage->key.task,
+ .prog_data = &task_stage->prog_data.task,
+ };
+
+ task_stage->code = brw_compile_task(compiler, &params);
+}
+
+static void
+anv_pipeline_link_mesh(const struct brw_compiler *compiler,
+ struct anv_pipeline_stage *mesh_stage,
+ struct anv_pipeline_stage *next_stage)
+{
+ if (next_stage) {
+ brw_nir_link_shaders(compiler, mesh_stage->nir, next_stage->nir);
+ }
+}
+
+static void
+anv_pipeline_compile_mesh(const struct brw_compiler *compiler,
+ void *mem_ctx,
+ struct anv_device *device,
+ struct anv_pipeline_stage *mesh_stage,
+ struct anv_pipeline_stage *prev_stage)
+{
+ mesh_stage->num_stats = 1;
+
+ struct brw_compile_mesh_params params = {
+ .base = {
+ .nir = mesh_stage->nir,
+ .stats = mesh_stage->stats,
+ .log_data = device,
+ .mem_ctx = mem_ctx,
+ .source_hash = mesh_stage->source_hash,
+ },
+ .key = &mesh_stage->key.mesh,
+ .prog_data = &mesh_stage->prog_data.mesh,
+ };
+
+ if (prev_stage) {
+ assert(prev_stage->stage == MESA_SHADER_TASK);
+ params.tue_map = &prev_stage->prog_data.task.map;
+ }
+
+ mesh_stage->code = brw_compile_mesh(compiler, &params);
}
static void
anv_pipeline_link_fs(const struct brw_compiler *compiler,
- struct anv_pipeline_stage *stage)
+ struct anv_pipeline_stage *stage,
+ const struct vk_render_pass_state *rp)
{
+   /* Initially the valid outputs value is set to consider all possible
+    * render targets valid (see populate_wm_prog_key()), before we look at
+    * the shader variables. Here we look at the shader's output variables
+    * and compute the correct number of render target outputs.
+ */
+ stage->key.wm.color_outputs_valid = 0;
+ nir_foreach_shader_out_variable_safe(var, stage->nir) {
+ if (var->data.location < FRAG_RESULT_DATA0)
+ continue;
+
+ const unsigned rt = var->data.location - FRAG_RESULT_DATA0;
+ const unsigned array_len =
+ glsl_type_is_array(var->type) ? glsl_get_length(var->type) : 1;
+ assert(rt + array_len <= MAX_RTS);
+
+ stage->key.wm.color_outputs_valid |= BITFIELD_RANGE(rt, array_len);
+ }
+ stage->key.wm.color_outputs_valid &= rp_color_mask(rp);
+ stage->key.wm.nr_color_regions =
+ util_last_bit(stage->key.wm.color_outputs_valid);
+
unsigned num_rt_bindings;
struct anv_pipeline_binding rt_bindings[MAX_RTS];
if (stage->key.wm.nr_color_regions > 0) {
@@ -1101,12 +1513,15 @@ anv_pipeline_link_fs(const struct brw_compiler *compiler,
rt_bindings[rt] = (struct anv_pipeline_binding) {
.set = ANV_DESCRIPTOR_SET_COLOR_ATTACHMENTS,
.index = rt,
+ .binding = UINT32_MAX,
};
} else {
/* Setup a null render target */
rt_bindings[rt] = (struct anv_pipeline_binding) {
.set = ANV_DESCRIPTOR_SET_COLOR_ATTACHMENTS,
.index = UINT32_MAX,
+ .binding = UINT32_MAX,
};
}
}
@@ -1125,53 +1540,6 @@ anv_pipeline_link_fs(const struct brw_compiler *compiler,
typed_memcpy(stage->bind_map.surface_to_descriptor,
rt_bindings, num_rt_bindings);
stage->bind_map.surface_count += num_rt_bindings;
-
- /* Now that we've set up the color attachments, we can go through and
- * eliminate any shader outputs that map to VK_ATTACHMENT_UNUSED in the
- * hopes that dead code can clean them up in this and any earlier shader
- * stages.
- */
- nir_function_impl *impl = nir_shader_get_entrypoint(stage->nir);
- bool deleted_output = false;
- nir_foreach_shader_out_variable_safe(var, stage->nir) {
- /* TODO: We don't delete depth/stencil writes. We probably could if the
- * subpass doesn't have a depth/stencil attachment.
- */
- if (var->data.location < FRAG_RESULT_DATA0)
- continue;
-
- const unsigned rt = var->data.location - FRAG_RESULT_DATA0;
-
- /* If this is the RT at location 0 and we have alpha to coverage
- * enabled we still need that write because it will affect the coverage
- * mask even if it's never written to a color target.
- */
- if (rt == 0 && stage->key.wm.alpha_to_coverage)
- continue;
-
- const unsigned array_len =
- glsl_type_is_array(var->type) ? glsl_get_length(var->type) : 1;
- assert(rt + array_len <= MAX_RTS);
-
- if (rt >= MAX_RTS || !(stage->key.wm.color_outputs_valid &
- BITFIELD_RANGE(rt, array_len))) {
- deleted_output = true;
- var->data.mode = nir_var_function_temp;
- exec_node_remove(&var->node);
- exec_list_push_tail(&impl->locals, &var->node);
- }
- }
-
- if (deleted_output)
- nir_fixup_deref_modes(stage->nir);
-
- /* We stored the number of subpass color attachments in nr_color_regions
- * when calculating the key for caching. Now that we've computed the bind
- * map, we can reduce this to the actual max before we go into the back-end
- * compiler.
- */
- stage->key.wm.nr_color_regions =
- util_last_bit(stage->key.wm.color_outputs_valid);
}
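
The mask computation in anv_pipeline_link_fs() above reduces to a bit-range OR per declared output followed by a last-bit count; here is a standalone sketch with BITFIELD_RANGE/util_last_bit expanded into plain expressions (the output locations and MAX_RTS value below are made up):

#include <stdint.h>
#include <stdio.h>

/* Illustrative sketch, not part of this patch. */
#define MAX_RTS 8

struct fs_output { unsigned location; unsigned array_len; };

int main(void)
{
   /* Two declared outputs: RT0, and an array of 2 starting at RT2. */
   const struct fs_output outs[] = { { 0, 1 }, { 2, 2 } };

   uint32_t valid = 0;
   for (unsigned i = 0; i < sizeof(outs) / sizeof(outs[0]); i++) {
      if (outs[i].location + outs[i].array_len > MAX_RTS)
         continue;
      /* BITFIELD_RANGE(location, array_len) */
      valid |= ((1u << outs[i].array_len) - 1) << outs[i].location;
   }

   /* util_last_bit(valid) */
   unsigned nr_color_regions = valid ? 32 - (unsigned)__builtin_clz(valid) : 0;

   printf("color_outputs_valid=0x%x nr_color_regions=%u\n",
          valid, nr_color_regions);
   return 0;
}
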
static void
@@ -1179,45 +1547,61 @@ anv_pipeline_compile_fs(const struct brw_compiler *compiler,
void *mem_ctx,
struct anv_device *device,
struct anv_pipeline_stage *fs_stage,
- struct anv_pipeline_stage *prev_stage)
+ struct anv_pipeline_stage *prev_stage,
+ struct anv_graphics_base_pipeline *pipeline,
+ uint32_t view_mask,
+ bool use_primitive_replication)
{
- /* TODO: we could set this to 0 based on the information in nir_shader, but
- * we need this before we call spirv_to_nir.
+ /* When using Primitive Replication for multiview, each view gets its own
+ * position slot.
+ */
+ uint32_t pos_slots = use_primitive_replication ?
+ MAX2(1, util_bitcount(view_mask)) : 1;
+
+ /* If we have a previous stage we can use that to deduce valid slots.
+    * Otherwise, rely on the inputs of the fragment shader itself.
*/
- assert(prev_stage);
- fs_stage->key.wm.input_slots_valid =
- prev_stage->prog_data.vue.vue_map.slots_valid;
+ if (prev_stage) {
+ fs_stage->key.wm.input_slots_valid =
+ prev_stage->prog_data.vue.vue_map.slots_valid;
+ } else {
+ struct intel_vue_map prev_vue_map;
+ brw_compute_vue_map(compiler->devinfo,
+ &prev_vue_map,
+ fs_stage->nir->info.inputs_read,
+ fs_stage->nir->info.separate_shader,
+ pos_slots);
+
+ fs_stage->key.wm.input_slots_valid = prev_vue_map.slots_valid;
+ }
struct brw_compile_fs_params params = {
- .nir = fs_stage->nir,
+ .base = {
+ .nir = fs_stage->nir,
+ .stats = fs_stage->stats,
+ .log_data = device,
+ .mem_ctx = mem_ctx,
+ .source_hash = fs_stage->source_hash,
+ },
.key = &fs_stage->key.wm,
.prog_data = &fs_stage->prog_data.wm,
.allow_spilling = true,
- .stats = fs_stage->stats,
- .log_data = device,
+ .max_polygons = UCHAR_MAX,
};
- fs_stage->code = brw_compile_fs(compiler, mem_ctx, &params);
+ if (prev_stage && prev_stage->stage == MESA_SHADER_MESH) {
+ params.mue_map = &prev_stage->prog_data.mesh.map;
+ /* TODO(mesh): Slots valid, do we even use/rely on it? */
+ }
+
+ fs_stage->code = brw_compile_fs(compiler, &params);
- fs_stage->num_stats = (uint32_t)fs_stage->prog_data.wm.dispatch_8 +
+ fs_stage->num_stats = (uint32_t)!!fs_stage->prog_data.wm.dispatch_multi +
+ (uint32_t)fs_stage->prog_data.wm.dispatch_8 +
(uint32_t)fs_stage->prog_data.wm.dispatch_16 +
(uint32_t)fs_stage->prog_data.wm.dispatch_32;
-
- if (fs_stage->key.wm.color_outputs_valid == 0 &&
- !fs_stage->prog_data.wm.has_side_effects &&
- !fs_stage->prog_data.wm.uses_omask &&
- !fs_stage->key.wm.alpha_to_coverage &&
- !fs_stage->prog_data.wm.uses_kill &&
- fs_stage->prog_data.wm.computed_depth_mode == BRW_PSCDEPTH_OFF &&
- !fs_stage->prog_data.wm.computed_stencil) {
- /* This fragment shader has no outputs and no side effects. Go ahead
- * and return the code pointer so we don't accidentally think the
- * compile failed but zero out prog_data which will set program_size to
- * zero and disable the stage.
- */
- memset(&fs_stage->prog_data, 0, sizeof(fs_stage->prog_data));
- }
+ assert(fs_stage->num_stats <= ARRAY_SIZE(fs_stage->stats));
}
static void
@@ -1229,14 +1613,14 @@ anv_pipeline_add_executable(struct anv_pipeline *pipeline,
char *nir = NULL;
if (stage->nir &&
(pipeline->flags &
- VK_PIPELINE_CREATE_CAPTURE_INTERNAL_REPRESENTATIONS_BIT_KHR)) {
+ VK_PIPELINE_CREATE_2_CAPTURE_INTERNAL_REPRESENTATIONS_BIT_KHR)) {
nir = nir_shader_as_str(stage->nir, pipeline->mem_ctx);
}
char *disasm = NULL;
if (stage->code &&
(pipeline->flags &
- VK_PIPELINE_CREATE_CAPTURE_INTERNAL_REPRESENTATIONS_BIT_KHR)) {
+ VK_PIPELINE_CREATE_2_CAPTURE_INTERNAL_REPRESENTATIONS_BIT_KHR)) {
char *stream_data = NULL;
size_t stream_size = 0;
FILE *stream = open_memstream(&stream_data, &stream_size);
@@ -1262,6 +1646,12 @@ anv_pipeline_add_executable(struct anv_pipeline *pipeline,
fprintf(stream, "Vulkan push constants and API params");
break;
+ case ANV_DESCRIPTOR_SET_DESCRIPTORS_BUFFER:
+ fprintf(stream, "Descriptor buffer (desc buffer) for set %d (start=%dB)",
+ stage->bind_map.push_ranges[i].index,
+ stage->bind_map.push_ranges[i].start * 32);
+ break;
+
case ANV_DESCRIPTOR_SET_DESCRIPTORS:
fprintf(stream, "Descriptor buffer for set %d (start=%dB)",
stage->bind_map.push_ranges[i].index,
@@ -1271,11 +1661,6 @@ anv_pipeline_add_executable(struct anv_pipeline *pipeline,
case ANV_DESCRIPTOR_SET_NUM_WORK_GROUPS:
unreachable("gl_NumWorkgroups is never pushed");
- case ANV_DESCRIPTOR_SET_SHADER_CONSTANTS:
- fprintf(stream, "Inline shader constant data (start=%dB)",
- stage->bind_map.push_ranges[i].start * 32);
- break;
-
case ANV_DESCRIPTOR_SET_COLOR_ATTACHMENTS:
unreachable("Color attachments can't be pushed");
@@ -1294,8 +1679,8 @@ anv_pipeline_add_executable(struct anv_pipeline *pipeline,
/* Creating this is far cheaper than it looks. It's perfectly fine to
* do it for every binary.
*/
- intel_disassemble(&pipeline->device->info,
- stage->code, code_offset, stream);
+ brw_disassemble_with_errors(&pipeline->device->physical->compiler->isa,
+ stage->code, code_offset, stream);
fclose(stream);
@@ -1319,8 +1704,7 @@ anv_pipeline_add_executable(struct anv_pipeline *pipeline,
static void
anv_pipeline_add_executables(struct anv_pipeline *pipeline,
- struct anv_pipeline_stage *stage,
- struct anv_shader_bin *bin)
+ struct anv_pipeline_stage *stage)
{
if (stage->stage == MESA_SHADER_FRAGMENT) {
/* We pull the prog data and stats out of the anv_shader_bin because
@@ -1328,10 +1712,11 @@ anv_pipeline_add_executables(struct anv_pipeline *pipeline,
* looked up the shader in a cache.
*/
const struct brw_wm_prog_data *wm_prog_data =
- (const struct brw_wm_prog_data *)bin->prog_data;
- struct brw_compile_stats *stats = bin->stats;
+ (const struct brw_wm_prog_data *)stage->bin->prog_data;
+ struct brw_compile_stats *stats = stage->bin->stats;
- if (wm_prog_data->dispatch_8) {
+ if (wm_prog_data->dispatch_8 ||
+ wm_prog_data->dispatch_multi) {
anv_pipeline_add_executable(pipeline, stage, stats++, 0);
}
@@ -1345,551 +1730,950 @@ anv_pipeline_add_executables(struct anv_pipeline *pipeline,
wm_prog_data->prog_offset_32);
}
} else {
- anv_pipeline_add_executable(pipeline, stage, bin->stats, 0);
+ anv_pipeline_add_executable(pipeline, stage, stage->bin->stats, 0);
}
}
static void
-anv_pipeline_init_from_cached_graphics(struct anv_graphics_pipeline *pipeline)
+anv_pipeline_account_shader(struct anv_pipeline *pipeline,
+ struct anv_shader_bin *shader)
{
- /* TODO: Cache this pipeline-wide information. */
+ pipeline->scratch_size = MAX2(pipeline->scratch_size,
+ shader->prog_data->total_scratch);
- /* Primitive replication depends on information from all the shaders.
- * Recover this bit from the fact that we have more than one position slot
- * in the vertex shader when using it.
- */
- assert(pipeline->active_stages & VK_SHADER_STAGE_VERTEX_BIT);
- int pos_slots = 0;
- const struct brw_vue_prog_data *vue_prog_data =
- (const void *) pipeline->shaders[MESA_SHADER_VERTEX]->prog_data;
- const struct brw_vue_map *vue_map = &vue_prog_data->vue_map;
- for (int i = 0; i < vue_map->num_slots; i++) {
- if (vue_map->slot_to_varying[i] == VARYING_SLOT_POS)
- pos_slots++;
+ pipeline->ray_queries = MAX2(pipeline->ray_queries,
+ shader->prog_data->ray_queries);
+
+ if (shader->push_desc_info.used_set_buffer) {
+ pipeline->use_push_descriptor_buffer |=
+ mesa_to_vk_shader_stage(shader->stage);
}
- pipeline->use_primitive_replication = pos_slots > 1;
+ if (shader->push_desc_info.used_descriptors &
+ ~shader->push_desc_info.fully_promoted_ubo_descriptors)
+ pipeline->use_push_descriptor |= mesa_to_vk_shader_stage(shader->stage);
}
-static VkResult
-anv_pipeline_compile_graphics(struct anv_graphics_pipeline *pipeline,
- struct anv_pipeline_cache *cache,
- const VkGraphicsPipelineCreateInfo *info)
+/* This function returns true if a shader should not be looked at because of
+ * fast linking. Instead we should use the shader binaries provided by
+ * libraries.
+ */
+static bool
+anv_graphics_pipeline_skip_shader_compile(struct anv_graphics_base_pipeline *pipeline,
+ struct anv_pipeline_stage *stages,
+ bool link_optimize,
+ gl_shader_stage stage)
{
- VkPipelineCreationFeedbackEXT pipeline_feedback = {
- .flags = VK_PIPELINE_CREATION_FEEDBACK_VALID_BIT_EXT,
- };
- int64_t pipeline_start = os_time_get_nano();
-
- const struct brw_compiler *compiler = pipeline->base.device->physical->compiler;
- struct anv_pipeline_stage stages[MESA_SHADER_STAGES] = {};
-
- pipeline->active_stages = 0;
+ /* Always skip non active stages */
+ if (!anv_pipeline_base_has_stage(pipeline, stage))
+ return true;
- /* Information on which states are considered dynamic. */
- const VkPipelineDynamicStateCreateInfo *dyn_info =
- info->pDynamicState;
- uint32_t dynamic_states = 0;
- if (dyn_info) {
- for (unsigned i = 0; i < dyn_info->dynamicStateCount; i++)
- dynamic_states |=
- anv_cmd_dirty_bit_for_vk_dynamic_state(dyn_info->pDynamicStates[i]);
- }
+ /* When link optimizing, consider all stages */
+ if (link_optimize)
+ return false;
- VkResult result;
- for (uint32_t i = 0; i < info->stageCount; i++) {
- const VkPipelineShaderStageCreateInfo *sinfo = &info->pStages[i];
- gl_shader_stage stage = vk_to_mesa_shader_stage(sinfo->stage);
+ /* Otherwise check if the stage was specified through
+ * VkGraphicsPipelineCreateInfo
+ */
+ assert(stages[stage].info != NULL || stages[stage].imported.bin != NULL);
+ return stages[stage].info == NULL;
+}
- pipeline->active_stages |= sinfo->stage;
+static void
+anv_graphics_pipeline_init_keys(struct anv_graphics_base_pipeline *pipeline,
+ const struct vk_graphics_pipeline_state *state,
+ struct anv_pipeline_stage *stages)
+{
+ for (uint32_t s = 0; s < ANV_GRAPHICS_SHADER_STAGE_COUNT; s++) {
+ if (!anv_pipeline_base_has_stage(pipeline, s))
+ continue;
int64_t stage_start = os_time_get_nano();
- stages[stage].stage = stage;
- stages[stage].module = vk_shader_module_from_handle(sinfo->module);
- stages[stage].entrypoint = sinfo->pName;
- stages[stage].spec_info = sinfo->pSpecializationInfo;
- anv_pipeline_hash_shader(stages[stage].module,
- stages[stage].entrypoint,
- stage,
- stages[stage].spec_info,
- stages[stage].shader_sha1);
-
- const struct intel_device_info *devinfo = &pipeline->base.device->info;
- switch (stage) {
+ const struct anv_device *device = pipeline->base.device;
+ switch (stages[s].stage) {
case MESA_SHADER_VERTEX:
- populate_vs_prog_key(devinfo, sinfo->flags,
- pipeline->base.device->robust_buffer_access,
- &stages[stage].key.vs);
+ populate_vs_prog_key(&stages[s], device);
break;
case MESA_SHADER_TESS_CTRL:
- populate_tcs_prog_key(devinfo, sinfo->flags,
- pipeline->base.device->robust_buffer_access,
- info->pTessellationState->patchControlPoints,
- &stages[stage].key.tcs);
+ populate_tcs_prog_key(&stages[s],
+ device,
+ BITSET_TEST(state->dynamic,
+ MESA_VK_DYNAMIC_TS_PATCH_CONTROL_POINTS) ?
+ 0 : state->ts->patch_control_points);
break;
case MESA_SHADER_TESS_EVAL:
- populate_tes_prog_key(devinfo, sinfo->flags,
- pipeline->base.device->robust_buffer_access,
- &stages[stage].key.tes);
+ populate_tes_prog_key(&stages[s], device);
break;
case MESA_SHADER_GEOMETRY:
- populate_gs_prog_key(devinfo, sinfo->flags,
- pipeline->base.device->robust_buffer_access,
- &stages[stage].key.gs);
+ populate_gs_prog_key(&stages[s], device);
break;
case MESA_SHADER_FRAGMENT: {
+         /* Assume rasterization is enabled in any of the following cases:
+ *
+ * - We're a pipeline library without pre-rasterization information
+ *
+          *    - Rasterization is not disabled in the non-dynamic state
+ *
+ * - Rasterization disable is dynamic
+ */
const bool raster_enabled =
- !info->pRasterizationState->rasterizerDiscardEnable ||
- dynamic_states & ANV_CMD_DIRTY_DYNAMIC_RASTERIZER_DISCARD_ENABLE;
- populate_wm_prog_key(pipeline, sinfo->flags,
- pipeline->base.device->robust_buffer_access,
- pipeline->subpass,
- raster_enabled ? info->pMultisampleState : NULL,
- vk_find_struct_const(info->pNext,
- PIPELINE_FRAGMENT_SHADING_RATE_STATE_CREATE_INFO_KHR),
- &stages[stage].key.wm);
+ state->rs == NULL ||
+ !state->rs->rasterizer_discard_enable ||
+ BITSET_TEST(state->dynamic, MESA_VK_DYNAMIC_RS_RASTERIZER_DISCARD_ENABLE);
+ enum brw_sometimes is_mesh = BRW_NEVER;
+ if (device->vk.enabled_extensions.EXT_mesh_shader) {
+ if (anv_pipeline_base_has_stage(pipeline, MESA_SHADER_VERTEX))
+ is_mesh = BRW_NEVER;
+ else if (anv_pipeline_base_has_stage(pipeline, MESA_SHADER_MESH))
+ is_mesh = BRW_ALWAYS;
+ else {
+ assert(pipeline->base.type == ANV_PIPELINE_GRAPHICS_LIB);
+ is_mesh = BRW_SOMETIMES;
+ }
+ }
+ populate_wm_prog_key(&stages[s],
+ pipeline,
+ state->dynamic,
+ raster_enabled ? state->ms : NULL,
+ state->fsr, state->rp, is_mesh);
break;
}
+
+ case MESA_SHADER_TASK:
+ populate_task_prog_key(&stages[s], device);
+ break;
+
+ case MESA_SHADER_MESH: {
+ const bool compact_mue =
+ !(pipeline->base.type == ANV_PIPELINE_GRAPHICS_LIB &&
+ !anv_pipeline_base_has_stage(pipeline, MESA_SHADER_FRAGMENT));
+ populate_mesh_prog_key(&stages[s], device, compact_mue);
+ break;
+ }
+
default:
unreachable("Invalid graphics shader stage");
}
- stages[stage].feedback.duration += os_time_get_nano() - stage_start;
- stages[stage].feedback.flags |= VK_PIPELINE_CREATION_FEEDBACK_VALID_BIT_EXT;
+ stages[s].feedback.duration += os_time_get_nano() - stage_start;
+ stages[s].feedback.flags |= VK_PIPELINE_CREATION_FEEDBACK_VALID_BIT;
}
+}
+
+static void
+anv_graphics_lib_retain_shaders(struct anv_graphics_base_pipeline *pipeline,
+ struct anv_pipeline_stage *stages,
+ bool will_compile)
+{
+ /* There isn't much point in retaining NIR shaders on final pipelines. */
+ assert(pipeline->base.type == ANV_PIPELINE_GRAPHICS_LIB);
+
+ struct anv_graphics_lib_pipeline *lib = (struct anv_graphics_lib_pipeline *) pipeline;
- if (pipeline->active_stages & VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT)
- pipeline->active_stages |= VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT;
+ for (int s = 0; s < ARRAY_SIZE(pipeline->shaders); s++) {
+ if (!anv_pipeline_base_has_stage(pipeline, s))
+ continue;
- assert(pipeline->active_stages & VK_SHADER_STAGE_VERTEX_BIT);
+ memcpy(lib->retained_shaders[s].shader_sha1, stages[s].shader_sha1,
+ sizeof(stages[s].shader_sha1));
- ANV_FROM_HANDLE(anv_pipeline_layout, layout, info->layout);
+ lib->retained_shaders[s].subgroup_size_type = stages[s].subgroup_size_type;
- unsigned char sha1[20];
- anv_pipeline_hash_graphics(pipeline, layout, stages, sha1);
+ nir_shader *nir = stages[s].nir != NULL ? stages[s].nir : stages[s].imported.nir;
+ assert(nir != NULL);
+
+ if (!will_compile) {
+ lib->retained_shaders[s].nir = nir;
+ } else {
+ lib->retained_shaders[s].nir =
+ nir_shader_clone(pipeline->base.mem_ctx, nir);
+ }
+ }
+}
+
+static bool
+anv_graphics_pipeline_load_cached_shaders(struct anv_graphics_base_pipeline *pipeline,
+ struct vk_pipeline_cache *cache,
+ struct anv_pipeline_stage *stages,
+ bool link_optimize,
+ VkPipelineCreationFeedback *pipeline_feedback)
+{
+ struct anv_device *device = pipeline->base.device;
+ unsigned cache_hits = 0, found = 0, imported = 0;
for (unsigned s = 0; s < ARRAY_SIZE(pipeline->shaders); s++) {
- if (!stages[s].entrypoint)
+ if (!anv_pipeline_base_has_stage(pipeline, s))
continue;
- stages[s].cache_key.stage = s;
- memcpy(stages[s].cache_key.sha1, sha1, sizeof(sha1));
- }
+ int64_t stage_start = os_time_get_nano();
- const bool skip_cache_lookup =
- (pipeline->base.flags & VK_PIPELINE_CREATE_CAPTURE_INTERNAL_REPRESENTATIONS_BIT_KHR);
+ bool cache_hit;
+ stages[s].bin =
+ anv_device_search_for_kernel(device, cache, &stages[s].cache_key,
+ sizeof(stages[s].cache_key), &cache_hit);
+ if (stages[s].bin) {
+ found++;
+ pipeline->shaders[s] = stages[s].bin;
+ }
- if (!skip_cache_lookup) {
- unsigned found = 0;
- unsigned cache_hits = 0;
+ if (cache_hit) {
+ cache_hits++;
+ stages[s].feedback.flags |=
+ VK_PIPELINE_CREATION_FEEDBACK_APPLICATION_PIPELINE_CACHE_HIT_BIT;
+ }
+ stages[s].feedback.duration += os_time_get_nano() - stage_start;
+ }
+
+   /* When not link optimizing, look up the missing shaders in the imported
+ * libraries.
+ */
+ if (!link_optimize) {
for (unsigned s = 0; s < ARRAY_SIZE(pipeline->shaders); s++) {
- if (!stages[s].entrypoint)
+ if (!anv_pipeline_base_has_stage(pipeline, s))
continue;
- int64_t stage_start = os_time_get_nano();
+ if (pipeline->shaders[s] != NULL)
+ continue;
- bool cache_hit;
- struct anv_shader_bin *bin =
- anv_device_search_for_kernel(pipeline->base.device, cache,
- &stages[s].cache_key,
- sizeof(stages[s].cache_key), &cache_hit);
- if (bin) {
- found++;
- pipeline->shaders[s] = bin;
- }
+ if (stages[s].imported.bin == NULL)
+ continue;
- if (cache_hit) {
- cache_hits++;
- stages[s].feedback.flags |=
- VK_PIPELINE_CREATION_FEEDBACK_APPLICATION_PIPELINE_CACHE_HIT_BIT_EXT;
- }
- stages[s].feedback.duration += os_time_get_nano() - stage_start;
+ stages[s].bin = stages[s].imported.bin;
+ pipeline->shaders[s] = anv_shader_bin_ref(stages[s].imported.bin);
+ pipeline->source_hashes[s] = stages[s].source_hash;
+ imported++;
}
+ }
- if (found == __builtin_popcount(pipeline->active_stages)) {
- if (cache_hits == found) {
- pipeline_feedback.flags |=
- VK_PIPELINE_CREATION_FEEDBACK_APPLICATION_PIPELINE_CACHE_HIT_BIT_EXT;
- }
- /* We found all our shaders in the cache. We're done. */
- for (unsigned s = 0; s < ARRAY_SIZE(pipeline->shaders); s++) {
- if (!stages[s].entrypoint)
- continue;
+ if ((found + imported) == __builtin_popcount(pipeline->base.active_stages)) {
+ if (cache_hits == found && found != 0) {
+ pipeline_feedback->flags |=
+ VK_PIPELINE_CREATION_FEEDBACK_APPLICATION_PIPELINE_CACHE_HIT_BIT;
+ }
+ /* We found all our shaders in the cache. We're done. */
+ for (unsigned s = 0; s < ARRAY_SIZE(pipeline->shaders); s++) {
+ if (pipeline->shaders[s] == NULL)
+ continue;
- anv_pipeline_add_executables(&pipeline->base, &stages[s],
- pipeline->shaders[s]);
- }
- anv_pipeline_init_from_cached_graphics(pipeline);
- goto done;
- } else if (found > 0) {
- /* We found some but not all of our shaders. This shouldn't happen
- * most of the time but it can if we have a partially populated
- * pipeline cache.
+ /* Only add the executables when we're not importing or doing link
+ * optimizations. The imported executables are added earlier. Link
+ * optimization can produce different binaries.
*/
- assert(found < __builtin_popcount(pipeline->active_stages));
-
- vk_debug_report(&pipeline->base.device->physical->instance->vk,
- VK_DEBUG_REPORT_WARNING_BIT_EXT |
- VK_DEBUG_REPORT_PERFORMANCE_WARNING_BIT_EXT,
- &cache->base, 0, 0, "anv",
- "Found a partial pipeline in the cache. This is "
- "most likely caused by an incomplete pipeline cache "
- "import or export");
-
- /* We're going to have to recompile anyway, so just throw away our
- * references to the shaders in the cache. We'll get them out of the
- * cache again as part of the compilation process.
- */
- for (unsigned s = 0; s < ARRAY_SIZE(pipeline->shaders); s++) {
- stages[s].feedback.flags = 0;
- if (pipeline->shaders[s]) {
- anv_shader_bin_unref(pipeline->base.device, pipeline->shaders[s]);
- pipeline->shaders[s] = NULL;
- }
+ if (stages[s].imported.bin == NULL || link_optimize)
+ anv_pipeline_add_executables(&pipeline->base, &stages[s]);
+ pipeline->source_hashes[s] = stages[s].source_hash;
+ }
+ return true;
+ } else if (found > 0) {
+ /* We found some but not all of our shaders. This shouldn't happen most
+ * of the time but it can if we have a partially populated pipeline
+ * cache.
+ */
+ assert(found < __builtin_popcount(pipeline->base.active_stages));
+
+ /* With GPL, this might well happen if the app does an optimized
+ * link.
+ */
+ if (!pipeline->base.device->vk.enabled_extensions.EXT_graphics_pipeline_library) {
+ vk_perf(VK_LOG_OBJS(cache ? &cache->base :
+ &pipeline->base.device->vk.base),
+ "Found a partial pipeline in the cache. This is "
+ "most likely caused by an incomplete pipeline cache "
+ "import or export");
+ }
+
+ /* We're going to have to recompile anyway, so just throw away our
+ * references to the shaders in the cache. We'll get them out of the
+ * cache again as part of the compilation process.
+ */
+ for (unsigned s = 0; s < ARRAY_SIZE(pipeline->shaders); s++) {
+ stages[s].feedback.flags = 0;
+ if (pipeline->shaders[s]) {
+ anv_shader_bin_unref(device, pipeline->shaders[s]);
+ pipeline->shaders[s] = NULL;
}
}
}
- if (info->flags & VK_PIPELINE_CREATE_FAIL_ON_PIPELINE_COMPILE_REQUIRED_BIT_EXT)
- return VK_PIPELINE_COMPILE_REQUIRED_EXT;
+ return false;
+}
- void *pipeline_ctx = ralloc_context(NULL);
+static const gl_shader_stage graphics_shader_order[] = {
+ MESA_SHADER_VERTEX,
+ MESA_SHADER_TESS_CTRL,
+ MESA_SHADER_TESS_EVAL,
+ MESA_SHADER_GEOMETRY,
- for (unsigned s = 0; s < ARRAY_SIZE(pipeline->shaders); s++) {
- if (!stages[s].entrypoint)
+ MESA_SHADER_TASK,
+ MESA_SHADER_MESH,
+
+ MESA_SHADER_FRAGMENT,
+};
+
+/* This function loads NIR only for stages specified in
+ * VkGraphicsPipelineCreateInfo::pStages[]
+ */
+static VkResult
+anv_graphics_pipeline_load_nir(struct anv_graphics_base_pipeline *pipeline,
+ struct vk_pipeline_cache *cache,
+ struct anv_pipeline_stage *stages,
+ void *mem_ctx,
+ bool need_clone)
+{
+ for (unsigned s = 0; s < ANV_GRAPHICS_SHADER_STAGE_COUNT; s++) {
+ if (!anv_pipeline_base_has_stage(pipeline, s))
continue;
int64_t stage_start = os_time_get_nano();
assert(stages[s].stage == s);
- assert(pipeline->shaders[s] == NULL);
-
- stages[s].bind_map = (struct anv_pipeline_bind_map) {
- .surface_to_descriptor = stages[s].surface_to_descriptor,
- .sampler_to_descriptor = stages[s].sampler_to_descriptor
- };
- stages[s].nir = anv_pipeline_stage_get_nir(&pipeline->base, cache,
- pipeline_ctx,
- &stages[s]);
- if (stages[s].nir == NULL) {
- result = vk_error(VK_ERROR_UNKNOWN);
- goto fail;
+      /* Only use the NIR created from the pStages[] element if we don't have
+ * an imported library for the same stage.
+ */
+ if (stages[s].imported.bin == NULL) {
+ stages[s].nir = anv_pipeline_stage_get_nir(&pipeline->base, cache,
+ mem_ctx, &stages[s]);
+ if (stages[s].nir == NULL)
+ return vk_error(pipeline, VK_ERROR_UNKNOWN);
+ } else {
+ stages[s].nir = need_clone ?
+ nir_shader_clone(mem_ctx, stages[s].imported.nir) :
+ stages[s].imported.nir;
}
- /* This is rather ugly.
+ stages[s].feedback.duration += os_time_get_nano() - stage_start;
+ }
+
+ return VK_SUCCESS;
+}
+
+static void
+anv_pipeline_nir_preprocess(struct anv_pipeline *pipeline,
+ struct anv_pipeline_stage *stage)
+{
+ struct anv_device *device = pipeline->device;
+ const struct brw_compiler *compiler = device->physical->compiler;
+
+ const struct nir_lower_sysvals_to_varyings_options sysvals_to_varyings = {
+ .point_coord = true,
+ };
+ NIR_PASS(_, stage->nir, nir_lower_sysvals_to_varyings, &sysvals_to_varyings);
+
+ const nir_opt_access_options opt_access_options = {
+ .is_vulkan = true,
+ };
+ NIR_PASS(_, stage->nir, nir_opt_access, &opt_access_options);
+
+ /* Vulkan uses the separate-shader linking model */
+ stage->nir->info.separate_shader = true;
+
+ struct brw_nir_compiler_opts opts = {
+ .softfp64 = device->fp64_nir,
+ /* Assume robustness with EXT_pipeline_robustness because this can be
+ * turned on/off per pipeline and we have no visibility on this here.
+ */
+ .robust_image_access = device->vk.enabled_features.robustImageAccess ||
+ device->vk.enabled_features.robustImageAccess2 ||
+ device->vk.enabled_extensions.EXT_pipeline_robustness,
+ .input_vertices = stage->nir->info.stage == MESA_SHADER_TESS_CTRL ?
+ stage->key.tcs.input_vertices : 0,
+ };
+ brw_preprocess_nir(compiler, stage->nir, &opts);
+
+ if (stage->nir->info.stage == MESA_SHADER_MESH) {
+ NIR_PASS(_, stage->nir, anv_nir_lower_set_vtx_and_prim_count);
+ NIR_PASS(_, stage->nir, nir_opt_dce);
+ NIR_PASS(_, stage->nir, nir_remove_dead_variables, nir_var_shader_out, NULL);
+ }
+
+ NIR_PASS(_, stage->nir, nir_opt_barrier_modes);
+
+ nir_shader_gather_info(stage->nir, nir_shader_get_entrypoint(stage->nir));
+}
+
+static void
+anv_fill_pipeline_creation_feedback(const struct anv_graphics_base_pipeline *pipeline,
+ VkPipelineCreationFeedback *pipeline_feedback,
+ const VkGraphicsPipelineCreateInfo *info,
+ struct anv_pipeline_stage *stages)
+{
+ const VkPipelineCreationFeedbackCreateInfo *create_feedback =
+ vk_find_struct_const(info->pNext, PIPELINE_CREATION_FEEDBACK_CREATE_INFO);
+ if (create_feedback) {
+ *create_feedback->pPipelineCreationFeedback = *pipeline_feedback;
+
+ /* VkPipelineCreationFeedbackCreateInfo:
+ *
+ * "An implementation must set or clear the
+ * VK_PIPELINE_CREATION_FEEDBACK_VALID_BIT in
+ * VkPipelineCreationFeedback::flags for pPipelineCreationFeedback
+ * and every element of pPipelineStageCreationFeedbacks."
*
- * Any variable annotated as interpolated by sample essentially disables
- * coarse pixel shading. Unfortunately the CTS tests exercising this set
- * the varying value in the previous stage using a constant. Our NIR
- * infrastructure is clever enough to lookup variables across stages and
- * constant fold, removing the variable. So in order to comply with CTS
- * we have check variables here.
*/
- if (s == MESA_SHADER_FRAGMENT) {
- nir_foreach_variable_in_list(var, &stages[s].nir->variables) {
- if (var->data.sample) {
- stages[s].key.wm.coarse_pixel = false;
- break;
+ for (uint32_t i = 0; i < create_feedback->pipelineStageCreationFeedbackCount; i++) {
+ create_feedback->pPipelineStageCreationFeedbacks[i].flags &=
+ ~VK_PIPELINE_CREATION_FEEDBACK_VALID_BIT;
+ }
+ /* This part is not really specified in the Vulkan spec at the moment.
+ * We're kind of guessing what the CTS wants. We may need to update this
+ * when https://gitlab.khronos.org/vulkan/vulkan/-/issues/3115 is
+ * clarified.
+ */
+ for (uint32_t s = 0; s < ANV_GRAPHICS_SHADER_STAGE_COUNT; s++) {
+ if (!anv_pipeline_base_has_stage(pipeline, s))
+ continue;
+
+ if (stages[s].feedback_idx < create_feedback->pipelineStageCreationFeedbackCount) {
+ create_feedback->pPipelineStageCreationFeedbacks[
+ stages[s].feedback_idx] = stages[s].feedback;
+ }
+ }
+ }
+}
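For context, a sketch of the application-facing structure this helper fills in; the function name and parameters are illustrative, not part of the driver.
#include <vulkan/vulkan.h>
/* Build the struct an application chains into
 * VkGraphicsPipelineCreateInfo::pNext to receive the feedback written above.
 */
static VkPipelineCreationFeedbackCreateInfo
example_feedback_chain(VkPipelineCreationFeedback *pipeline_feedback,
                       VkPipelineCreationFeedback *stage_feedbacks,
                       uint32_t stage_count)
{
   return (VkPipelineCreationFeedbackCreateInfo) {
      .sType = VK_STRUCTURE_TYPE_PIPELINE_CREATION_FEEDBACK_CREATE_INFO,
      .pPipelineCreationFeedback = pipeline_feedback,
      .pipelineStageCreationFeedbackCount = stage_count,
      .pPipelineStageCreationFeedbacks = stage_feedbacks,
   };
}
After pipeline creation the application should check VK_PIPELINE_CREATION_FEEDBACK_VALID_BIT in each flags field before trusting the durations, which is the bit being cleared and set above.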
+
+static uint32_t
+anv_graphics_pipeline_imported_shader_count(struct anv_pipeline_stage *stages)
+{
+ uint32_t count = 0;
+ for (uint32_t s = 0; s < ANV_GRAPHICS_SHADER_STAGE_COUNT; s++) {
+ if (stages[s].imported.bin != NULL)
+ count++;
+ }
+ return count;
+}
+
+static VkResult
+anv_graphics_pipeline_compile(struct anv_graphics_base_pipeline *pipeline,
+ struct anv_pipeline_stage *stages,
+ struct vk_pipeline_cache *cache,
+ VkPipelineCreationFeedback *pipeline_feedback,
+ const VkGraphicsPipelineCreateInfo *info,
+ const struct vk_graphics_pipeline_state *state)
+{
+ int64_t pipeline_start = os_time_get_nano();
+
+ struct anv_device *device = pipeline->base.device;
+ const struct intel_device_info *devinfo = device->info;
+ const struct brw_compiler *compiler = device->physical->compiler;
+
+ /* Set up the shaders given in this VkGraphicsPipelineCreateInfo::pStages[].
+ * Other shaders imported from libraries should have been added by
+ * anv_graphics_pipeline_import_lib().
+ */
+ uint32_t shader_count = anv_graphics_pipeline_imported_shader_count(stages);
+ for (uint32_t i = 0; i < info->stageCount; i++) {
+ gl_shader_stage stage = vk_to_mesa_shader_stage(info->pStages[i].stage);
+
+ /* If a pipeline library is loaded in this stage, we should ignore the
+ * pStages[] entry of the same stage.
+ */
+ if (stages[stage].imported.bin != NULL)
+ continue;
+
+ stages[stage].stage = stage;
+ stages[stage].pipeline_pNext = info->pNext;
+ stages[stage].info = &info->pStages[i];
+ stages[stage].feedback_idx = shader_count++;
+
+ anv_stage_write_shader_hash(&stages[stage], device);
+ }
+
+ /* Prepare shader keys for all shaders in pipeline->base.active_stages
+ * (this includes libraries) before generating the hash for cache look up.
+ *
+ * We're doing this because the spec states that:
+ *
+ * "When an implementation is looking up a pipeline in a pipeline cache,
+ * if that pipeline is being created using linked libraries,
+ * implementations should always return an equivalent pipeline created
+ * with VK_PIPELINE_CREATE_LINK_TIME_OPTIMIZATION_BIT_EXT if available,
+ * whether or not that bit was specified."
+ *
+ * So even if the application does not request link optimization, we have
+ * to do our cache lookup with the entire set of shader sha1s so that we
+ * can find the best optimized pipeline, as if we had compiled all the
+ * shaders together with full knowledge of the graphics state.
+ */
+ anv_graphics_pipeline_init_keys(pipeline, state, stages);
+
+ uint32_t view_mask = state->rp ? state->rp->view_mask : 0;
+
+ unsigned char sha1[20];
+ anv_pipeline_hash_graphics(pipeline, stages, view_mask, sha1);
+
+ for (unsigned s = 0; s < ANV_GRAPHICS_SHADER_STAGE_COUNT; s++) {
+ if (!anv_pipeline_base_has_stage(pipeline, s))
+ continue;
+
+ stages[s].cache_key.stage = s;
+ memcpy(stages[s].cache_key.sha1, sha1, sizeof(sha1));
+ }
+
+ const bool retain_shaders =
+ pipeline->base.flags & VK_PIPELINE_CREATE_2_RETAIN_LINK_TIME_OPTIMIZATION_INFO_BIT_EXT;
+ const bool link_optimize =
+ pipeline->base.flags & VK_PIPELINE_CREATE_2_LINK_TIME_OPTIMIZATION_BIT_EXT;
+
+ VkResult result = VK_SUCCESS;
+ const bool skip_cache_lookup =
+ (pipeline->base.flags & VK_PIPELINE_CREATE_CAPTURE_INTERNAL_REPRESENTATIONS_BIT_KHR);
+
+ if (!skip_cache_lookup) {
+ bool found_all_shaders =
+ anv_graphics_pipeline_load_cached_shaders(pipeline, cache, stages,
+ link_optimize,
+ pipeline_feedback);
+
+ if (found_all_shaders) {
+ /* If we need to retain shaders, we need to also load from the NIR
+ * cache.
+ */
+ if (pipeline->base.type == ANV_PIPELINE_GRAPHICS_LIB && retain_shaders) {
+ result = anv_graphics_pipeline_load_nir(pipeline, cache,
+ stages,
+ pipeline->base.mem_ctx,
+ false /* need_clone */);
+ if (result != VK_SUCCESS) {
+ vk_perf(VK_LOG_OBJS(cache ? &cache->base :
+ &pipeline->base.device->vk.base),
+ "Found all ISA shaders in the cache but not all NIR shaders.");
}
+
+ anv_graphics_lib_retain_shaders(pipeline, stages, false /* will_compile */);
+ }
+
+ if (result == VK_SUCCESS)
+ goto done;
+
+ for (unsigned s = 0; s < ANV_GRAPHICS_SHADER_STAGE_COUNT; s++) {
+ if (!anv_pipeline_base_has_stage(pipeline, s))
+ continue;
+
+ if (stages[s].nir) {
+ ralloc_free(stages[s].nir);
+ stages[s].nir = NULL;
+ }
+
+ assert(pipeline->shaders[s] != NULL);
+ anv_shader_bin_unref(device, pipeline->shaders[s]);
+ pipeline->shaders[s] = NULL;
}
}
+ }
- stages[s].feedback.duration += os_time_get_nano() - stage_start;
+ if (pipeline->base.flags & VK_PIPELINE_CREATE_2_FAIL_ON_PIPELINE_COMPILE_REQUIRED_BIT_KHR)
+ return VK_PIPELINE_COMPILE_REQUIRED;
+
+ void *tmp_ctx = ralloc_context(NULL);
+
+ result = anv_graphics_pipeline_load_nir(pipeline, cache, stages,
+ tmp_ctx, link_optimize /* need_clone */);
+ if (result != VK_SUCCESS)
+ goto fail;
+
+ /* Retain shaders now if asked, this only applies to libraries */
+ if (pipeline->base.type == ANV_PIPELINE_GRAPHICS_LIB && retain_shaders)
+ anv_graphics_lib_retain_shaders(pipeline, stages, true /* will_compile */);
+
+ /* The following steps will be executed for the shaders we need to compile:
+ *
+ * - specified through VkGraphicsPipelineCreateInfo::pStages[]
+ *
+ * - or compiled from libraries with retained shaders (libraries
+ * compiled with CREATE_RETAIN_LINK_TIME_OPTIMIZATION_INFO_BIT) if the
+ * pipeline has the CREATE_LINK_TIME_OPTIMIZATION_BIT flag.
+ */
+
+ /* Preprocess all NIR shaders. */
+ for (int s = 0; s < ARRAY_SIZE(pipeline->shaders); s++) {
+ if (anv_graphics_pipeline_skip_shader_compile(pipeline, stages,
+ link_optimize, s))
+ continue;
+
+ anv_stage_allocate_bind_map_tables(&pipeline->base, &stages[s], tmp_ctx);
+
+ anv_pipeline_nir_preprocess(&pipeline->base, &stages[s]);
+ }
+
+ if (stages[MESA_SHADER_MESH].info && stages[MESA_SHADER_FRAGMENT].info) {
+ anv_apply_per_prim_attr_wa(stages[MESA_SHADER_MESH].nir,
+ stages[MESA_SHADER_FRAGMENT].nir,
+ device,
+ info);
}
/* Walk backwards to link */
struct anv_pipeline_stage *next_stage = NULL;
- for (int s = ARRAY_SIZE(pipeline->shaders) - 1; s >= 0; s--) {
- if (!stages[s].entrypoint)
+ for (int i = ARRAY_SIZE(graphics_shader_order) - 1; i >= 0; i--) {
+ gl_shader_stage s = graphics_shader_order[i];
+ if (anv_graphics_pipeline_skip_shader_compile(pipeline, stages,
+ link_optimize, s))
continue;
+ struct anv_pipeline_stage *stage = &stages[s];
+
switch (s) {
case MESA_SHADER_VERTEX:
- anv_pipeline_link_vs(compiler, &stages[s], next_stage);
+ anv_pipeline_link_vs(compiler, stage, next_stage);
break;
case MESA_SHADER_TESS_CTRL:
- anv_pipeline_link_tcs(compiler, &stages[s], next_stage);
+ anv_pipeline_link_tcs(compiler, stage, next_stage);
break;
case MESA_SHADER_TESS_EVAL:
- anv_pipeline_link_tes(compiler, &stages[s], next_stage);
+ anv_pipeline_link_tes(compiler, stage, next_stage);
break;
case MESA_SHADER_GEOMETRY:
- anv_pipeline_link_gs(compiler, &stages[s], next_stage);
+ anv_pipeline_link_gs(compiler, stage, next_stage);
+ break;
+ case MESA_SHADER_TASK:
+ anv_pipeline_link_task(compiler, stage, next_stage);
+ break;
+ case MESA_SHADER_MESH:
+ anv_pipeline_link_mesh(compiler, stage, next_stage);
break;
case MESA_SHADER_FRAGMENT:
- anv_pipeline_link_fs(compiler, &stages[s]);
+ anv_pipeline_link_fs(compiler, stage, state->rp);
break;
default:
unreachable("Invalid graphics shader stage");
}
- next_stage = &stages[s];
+ next_stage = stage;
}
- if (pipeline->base.device->info.ver >= 12 &&
- pipeline->subpass->view_mask != 0) {
+ bool use_primitive_replication = false;
+ if (devinfo->ver >= 12 && view_mask != 0) {
/* For some pipelines HW Primitive Replication can be used instead of
* instancing to implement Multiview. This depends on how viewIndex is
* used in all the active shaders, so this check can't be done per
* individual shader.
*/
- nir_shader *shaders[MESA_SHADER_STAGES] = {};
- for (unsigned s = 0; s < MESA_SHADER_STAGES; s++)
+ nir_shader *shaders[ANV_GRAPHICS_SHADER_STAGE_COUNT] = {};
+ for (unsigned s = 0; s < ARRAY_SIZE(shaders); s++)
shaders[s] = stages[s].nir;
- pipeline->use_primitive_replication =
- anv_check_for_primitive_replication(shaders, pipeline);
- } else {
- pipeline->use_primitive_replication = false;
+ use_primitive_replication =
+ anv_check_for_primitive_replication(device,
+ pipeline->base.active_stages,
+ shaders, view_mask);
}
struct anv_pipeline_stage *prev_stage = NULL;
- for (unsigned s = 0; s < ARRAY_SIZE(pipeline->shaders); s++) {
- if (!stages[s].entrypoint)
+ for (unsigned i = 0; i < ARRAY_SIZE(graphics_shader_order); i++) {
+ gl_shader_stage s = graphics_shader_order[i];
+ if (anv_graphics_pipeline_skip_shader_compile(pipeline, stages,
+ link_optimize, s))
continue;
+ struct anv_pipeline_stage *stage = &stages[s];
+
int64_t stage_start = os_time_get_nano();
- void *stage_ctx = ralloc_context(NULL);
+ anv_pipeline_lower_nir(&pipeline->base, tmp_ctx, stage,
+ &pipeline->base.layout, view_mask,
+ use_primitive_replication);
+
+ struct shader_info *cur_info = &stage->nir->info;
- anv_pipeline_lower_nir(&pipeline->base, stage_ctx, &stages[s], layout);
+ if (prev_stage && compiler->nir_options[s]->unify_interfaces) {
+ struct shader_info *prev_info = &prev_stage->nir->info;
- if (prev_stage && compiler->glsl_compiler_options[s].NirOptions->unify_interfaces) {
- prev_stage->nir->info.outputs_written |= stages[s].nir->info.inputs_read &
+ prev_info->outputs_written |= cur_info->inputs_read &
~(VARYING_BIT_TESS_LEVEL_INNER | VARYING_BIT_TESS_LEVEL_OUTER);
- stages[s].nir->info.inputs_read |= prev_stage->nir->info.outputs_written &
+ cur_info->inputs_read |= prev_info->outputs_written &
~(VARYING_BIT_TESS_LEVEL_INNER | VARYING_BIT_TESS_LEVEL_OUTER);
- prev_stage->nir->info.patch_outputs_written |= stages[s].nir->info.patch_inputs_read;
- stages[s].nir->info.patch_inputs_read |= prev_stage->nir->info.patch_outputs_written;
+ prev_info->patch_outputs_written |= cur_info->patch_inputs_read;
+ cur_info->patch_inputs_read |= prev_info->patch_outputs_written;
}
- ralloc_free(stage_ctx);
+ anv_fixup_subgroup_size(device, cur_info);
- stages[s].feedback.duration += os_time_get_nano() - stage_start;
+ stage->feedback.duration += os_time_get_nano() - stage_start;
- prev_stage = &stages[s];
+ prev_stage = stage;
+ }
+
+ /* When the platform can write the per-primitive shading rate variable,
+ * KHR_fragment_shading_rate is enabled, and either:
+ * - there could be a fragment shader but we don't have it yet, or
+ * - the fragment shader needs the fragment shading rate,
+ *
+ * figure out the last geometry stage that should write the primitive
+ * shading rate, and ensure it is marked as used there. The backend will
+ * write a default value if the shader doesn't actually write it.
+ *
+ * We iterate backwards over the stages and stop on the first shader that
+ * can set the value.
+ *
+ * Don't apply this to MESH stages, as this is a per-primitive thing.
+ */
+ if (devinfo->has_coarse_pixel_primitive_and_cb &&
+ device->vk.enabled_extensions.KHR_fragment_shading_rate &&
+ pipeline_has_coarse_pixel(state->dynamic, state->ms, state->fsr) &&
+ (!stages[MESA_SHADER_FRAGMENT].info ||
+ stages[MESA_SHADER_FRAGMENT].key.wm.coarse_pixel) &&
+ stages[MESA_SHADER_MESH].nir == NULL) {
+ struct anv_pipeline_stage *last_psr = NULL;
+
+ for (unsigned i = 0; i < ARRAY_SIZE(graphics_shader_order); i++) {
+ gl_shader_stage s =
+ graphics_shader_order[ARRAY_SIZE(graphics_shader_order) - i - 1];
+
+ if (anv_graphics_pipeline_skip_shader_compile(pipeline, stages,
+ link_optimize, s) ||
+ !gl_shader_stage_can_set_fragment_shading_rate(s))
+ continue;
+
+ last_psr = &stages[s];
+ break;
+ }
+
+ /* Only set primitive shading rate if there is a pre-rasterization
+ * shader in this pipeline/pipeline-library.
+ */
+ if (last_psr)
+ last_psr->nir->info.outputs_written |= VARYING_BIT_PRIMITIVE_SHADING_RATE;
}
prev_stage = NULL;
- for (unsigned s = 0; s < MESA_SHADER_STAGES; s++) {
- if (!stages[s].entrypoint)
+ for (unsigned i = 0; i < ARRAY_SIZE(graphics_shader_order); i++) {
+ gl_shader_stage s = graphics_shader_order[i];
+ struct anv_pipeline_stage *stage = &stages[s];
+
+ if (anv_graphics_pipeline_skip_shader_compile(pipeline, stages, link_optimize, s))
continue;
int64_t stage_start = os_time_get_nano();
void *stage_ctx = ralloc_context(NULL);
- nir_xfb_info *xfb_info = NULL;
- if (s == MESA_SHADER_VERTEX ||
- s == MESA_SHADER_TESS_EVAL ||
- s == MESA_SHADER_GEOMETRY)
- xfb_info = nir_gather_xfb_info(stages[s].nir, stage_ctx);
-
switch (s) {
case MESA_SHADER_VERTEX:
anv_pipeline_compile_vs(compiler, stage_ctx, pipeline,
- &stages[s]);
+ stage, view_mask);
break;
case MESA_SHADER_TESS_CTRL:
- anv_pipeline_compile_tcs(compiler, stage_ctx, pipeline->base.device,
- &stages[s], prev_stage);
+ anv_pipeline_compile_tcs(compiler, stage_ctx, device,
+ stage, prev_stage);
break;
case MESA_SHADER_TESS_EVAL:
- anv_pipeline_compile_tes(compiler, stage_ctx, pipeline->base.device,
- &stages[s], prev_stage);
+ anv_pipeline_compile_tes(compiler, stage_ctx, device,
+ stage, prev_stage);
break;
case MESA_SHADER_GEOMETRY:
- anv_pipeline_compile_gs(compiler, stage_ctx, pipeline->base.device,
- &stages[s], prev_stage);
+ anv_pipeline_compile_gs(compiler, stage_ctx, device,
+ stage, prev_stage);
+ break;
+ case MESA_SHADER_TASK:
+ anv_pipeline_compile_task(compiler, stage_ctx, device,
+ stage);
+ break;
+ case MESA_SHADER_MESH:
+ anv_pipeline_compile_mesh(compiler, stage_ctx, device,
+ stage, prev_stage);
break;
case MESA_SHADER_FRAGMENT:
- anv_pipeline_compile_fs(compiler, stage_ctx, pipeline->base.device,
- &stages[s], prev_stage);
+ anv_pipeline_compile_fs(compiler, stage_ctx, device,
+ stage, prev_stage, pipeline,
+ view_mask,
+ use_primitive_replication);
break;
default:
unreachable("Invalid graphics shader stage");
}
- if (stages[s].code == NULL) {
+ if (stage->code == NULL) {
ralloc_free(stage_ctx);
- result = vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+ result = vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
goto fail;
}
- anv_nir_validate_push_layout(&stages[s].prog_data.base,
- &stages[s].bind_map);
-
- struct anv_shader_bin *bin =
- anv_device_upload_kernel(pipeline->base.device, cache, s,
- &stages[s].cache_key,
- sizeof(stages[s].cache_key),
- stages[s].code,
- stages[s].prog_data.base.program_size,
- &stages[s].prog_data.base,
- brw_prog_data_size(s),
- stages[s].stats, stages[s].num_stats,
- xfb_info, &stages[s].bind_map);
- if (!bin) {
+ anv_nir_validate_push_layout(&stage->prog_data.base,
+ &stage->bind_map);
+
+ struct anv_shader_upload_params upload_params = {
+ .stage = s,
+ .key_data = &stage->cache_key,
+ .key_size = sizeof(stage->cache_key),
+ .kernel_data = stage->code,
+ .kernel_size = stage->prog_data.base.program_size,
+ .prog_data = &stage->prog_data.base,
+ .prog_data_size = brw_prog_data_size(s),
+ .stats = stage->stats,
+ .num_stats = stage->num_stats,
+ .xfb_info = stage->nir->xfb_info,
+ .bind_map = &stage->bind_map,
+ .push_desc_info = &stage->push_desc_info,
+ .dynamic_push_values = stage->dynamic_push_values,
+ };
+
+ stage->bin =
+ anv_device_upload_kernel(device, cache, &upload_params);
+ if (!stage->bin) {
ralloc_free(stage_ctx);
- result = vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+ result = vk_error(pipeline, VK_ERROR_OUT_OF_HOST_MEMORY);
goto fail;
}
- anv_pipeline_add_executables(&pipeline->base, &stages[s], bin);
+ anv_pipeline_add_executables(&pipeline->base, stage);
+ pipeline->source_hashes[s] = stage->source_hash;
+ pipeline->shaders[s] = stage->bin;
- pipeline->shaders[s] = bin;
ralloc_free(stage_ctx);
- stages[s].feedback.duration += os_time_get_nano() - stage_start;
+ stage->feedback.duration += os_time_get_nano() - stage_start;
- prev_stage = &stages[s];
+ prev_stage = stage;
}
- ralloc_free(pipeline_ctx);
+ /* Finally add the imported shaders that were not compiled as part of this
+ * step.
+ */
+ for (unsigned s = 0; s < ARRAY_SIZE(pipeline->shaders); s++) {
+ if (!anv_pipeline_base_has_stage(pipeline, s))
+ continue;
+
+ if (pipeline->shaders[s] != NULL)
+ continue;
-done:
+ /* We should have recompiled everything with link optimization. */
+ assert(!link_optimize);
- if (pipeline->shaders[MESA_SHADER_FRAGMENT] &&
- pipeline->shaders[MESA_SHADER_FRAGMENT]->prog_data->program_size == 0) {
- /* This can happen if we decided to implicitly disable the fragment
- * shader. See anv_pipeline_compile_fs().
- */
- anv_shader_bin_unref(pipeline->base.device,
- pipeline->shaders[MESA_SHADER_FRAGMENT]);
- pipeline->shaders[MESA_SHADER_FRAGMENT] = NULL;
- pipeline->active_stages &= ~VK_SHADER_STAGE_FRAGMENT_BIT;
+ struct anv_pipeline_stage *stage = &stages[s];
+
+ pipeline->source_hashes[s] = stage->source_hash;
+ pipeline->shaders[s] = anv_shader_bin_ref(stage->imported.bin);
}
- pipeline_feedback.duration = os_time_get_nano() - pipeline_start;
+ ralloc_free(tmp_ctx);
- const VkPipelineCreationFeedbackCreateInfoEXT *create_feedback =
- vk_find_struct_const(info->pNext, PIPELINE_CREATION_FEEDBACK_CREATE_INFO_EXT);
- if (create_feedback) {
- *create_feedback->pPipelineCreationFeedback = pipeline_feedback;
+done:
- assert(info->stageCount == create_feedback->pipelineStageCreationFeedbackCount);
- for (uint32_t i = 0; i < info->stageCount; i++) {
- gl_shader_stage s = vk_to_mesa_shader_stage(info->pStages[i].stage);
- create_feedback->pPipelineStageCreationFeedbacks[i] = stages[s].feedback;
- }
+ /* Write the feedback index into the pipeline */
+ for (unsigned s = 0; s < ARRAY_SIZE(pipeline->shaders); s++) {
+ if (!anv_pipeline_base_has_stage(pipeline, s))
+ continue;
+
+ struct anv_pipeline_stage *stage = &stages[s];
+ pipeline->feedback_index[s] = stage->feedback_idx;
+ pipeline->robust_flags[s] = stage->robust_flags;
+
+ anv_pipeline_account_shader(&pipeline->base, pipeline->shaders[s]);
+ }
+
+ pipeline_feedback->duration = os_time_get_nano() - pipeline_start;
+
+ if (pipeline->shaders[MESA_SHADER_FRAGMENT]) {
+ pipeline->fragment_dynamic =
+ anv_graphics_pipeline_stage_fragment_dynamic(
+ &stages[MESA_SHADER_FRAGMENT]);
}
return VK_SUCCESS;
fail:
- ralloc_free(pipeline_ctx);
+ ralloc_free(tmp_ctx);
for (unsigned s = 0; s < ARRAY_SIZE(pipeline->shaders); s++) {
if (pipeline->shaders[s])
- anv_shader_bin_unref(pipeline->base.device, pipeline->shaders[s]);
+ anv_shader_bin_unref(device, pipeline->shaders[s]);
}
return result;
}
-VkResult
+static VkResult
anv_pipeline_compile_cs(struct anv_compute_pipeline *pipeline,
- struct anv_pipeline_cache *cache,
- const VkComputePipelineCreateInfo *info,
- const struct vk_shader_module *module,
- const char *entrypoint,
- const VkSpecializationInfo *spec_info)
-{
- VkPipelineCreationFeedbackEXT pipeline_feedback = {
- .flags = VK_PIPELINE_CREATION_FEEDBACK_VALID_BIT_EXT,
+ struct vk_pipeline_cache *cache,
+ const VkComputePipelineCreateInfo *info)
+{
+ ASSERTED const VkPipelineShaderStageCreateInfo *sinfo = &info->stage;
+ assert(sinfo->stage == VK_SHADER_STAGE_COMPUTE_BIT);
+
+ VkPipelineCreationFeedback pipeline_feedback = {
+ .flags = VK_PIPELINE_CREATION_FEEDBACK_VALID_BIT,
};
int64_t pipeline_start = os_time_get_nano();
- const struct brw_compiler *compiler = pipeline->base.device->physical->compiler;
+ struct anv_device *device = pipeline->base.device;
+ const struct brw_compiler *compiler = device->physical->compiler;
struct anv_pipeline_stage stage = {
.stage = MESA_SHADER_COMPUTE,
- .module = module,
- .entrypoint = entrypoint,
- .spec_info = spec_info,
+ .info = &info->stage,
+ .pipeline_pNext = info->pNext,
.cache_key = {
.stage = MESA_SHADER_COMPUTE,
},
.feedback = {
- .flags = VK_PIPELINE_CREATION_FEEDBACK_VALID_BIT_EXT,
+ .flags = VK_PIPELINE_CREATION_FEEDBACK_VALID_BIT,
},
};
- anv_pipeline_hash_shader(stage.module,
- stage.entrypoint,
- MESA_SHADER_COMPUTE,
- stage.spec_info,
- stage.shader_sha1);
+ anv_stage_write_shader_hash(&stage, device);
- struct anv_shader_bin *bin = NULL;
-
- const VkPipelineShaderStageRequiredSubgroupSizeCreateInfoEXT *rss_info =
- vk_find_struct_const(info->stage.pNext,
- PIPELINE_SHADER_STAGE_REQUIRED_SUBGROUP_SIZE_CREATE_INFO_EXT);
-
- populate_cs_prog_key(&pipeline->base.device->info, info->stage.flags,
- pipeline->base.device->robust_buffer_access,
- rss_info, &stage.key.cs);
-
- ANV_FROM_HANDLE(anv_pipeline_layout, layout, info->layout);
+ populate_cs_prog_key(&stage, device);
const bool skip_cache_lookup =
(pipeline->base.flags & VK_PIPELINE_CREATE_CAPTURE_INTERNAL_REPRESENTATIONS_BIT_KHR);
- anv_pipeline_hash_compute(pipeline, layout, &stage, stage.cache_key.sha1);
+ anv_pipeline_hash_compute(pipeline, &stage, stage.cache_key.sha1);
bool cache_hit = false;
if (!skip_cache_lookup) {
- bin = anv_device_search_for_kernel(pipeline->base.device, cache,
- &stage.cache_key,
- sizeof(stage.cache_key),
- &cache_hit);
+ stage.bin = anv_device_search_for_kernel(device, cache,
+ &stage.cache_key,
+ sizeof(stage.cache_key),
+ &cache_hit);
}
- if (bin == NULL &&
- (info->flags & VK_PIPELINE_CREATE_FAIL_ON_PIPELINE_COMPILE_REQUIRED_BIT_EXT))
- return VK_PIPELINE_COMPILE_REQUIRED_EXT;
+ if (stage.bin == NULL &&
+ (pipeline->base.flags & VK_PIPELINE_CREATE_FAIL_ON_PIPELINE_COMPILE_REQUIRED_BIT))
+ return VK_PIPELINE_COMPILE_REQUIRED;
void *mem_ctx = ralloc_context(NULL);
- if (bin == NULL) {
+ if (stage.bin == NULL) {
int64_t stage_start = os_time_get_nano();
- stage.bind_map = (struct anv_pipeline_bind_map) {
- .surface_to_descriptor = stage.surface_to_descriptor,
- .sampler_to_descriptor = stage.sampler_to_descriptor
- };
+ anv_stage_allocate_bind_map_tables(&pipeline->base, &stage, mem_ctx);
/* Set up a binding for the gl_NumWorkGroups */
stage.bind_map.surface_count = 1;
stage.bind_map.surface_to_descriptor[0] = (struct anv_pipeline_binding) {
.set = ANV_DESCRIPTOR_SET_NUM_WORK_GROUPS,
+ .binding = UINT32_MAX,
};
stage.nir = anv_pipeline_stage_get_nir(&pipeline->base, cache, mem_ctx, &stage);
if (stage.nir == NULL) {
ralloc_free(mem_ctx);
- return vk_error(VK_ERROR_UNKNOWN);
+ return vk_error(pipeline, VK_ERROR_UNKNOWN);
}
- NIR_PASS_V(stage.nir, anv_nir_add_base_work_group_id);
+ anv_pipeline_nir_preprocess(&pipeline->base, &stage);
- anv_pipeline_lower_nir(&pipeline->base, mem_ctx, &stage, layout);
+ anv_pipeline_lower_nir(&pipeline->base, mem_ctx, &stage,
+ &pipeline->base.layout, 0 /* view_mask */,
+ false /* use_primitive_replication */);
- NIR_PASS_V(stage.nir, brw_nir_lower_cs_intrinsics);
+ anv_fixup_subgroup_size(device, &stage.nir->info);
stage.num_stats = 1;
struct brw_compile_cs_params params = {
- .nir = stage.nir,
+ .base = {
+ .nir = stage.nir,
+ .stats = stage.stats,
+ .log_data = device,
+ .mem_ctx = mem_ctx,
+ },
.key = &stage.key.cs,
.prog_data = &stage.prog_data.cs,
- .stats = stage.stats,
- .log_data = pipeline->base.device,
};
- stage.code = brw_compile_cs(compiler, mem_ctx, &params);
+ stage.code = brw_compile_cs(compiler, &params);
if (stage.code == NULL) {
ralloc_free(mem_ctx);
- return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+ return vk_error(pipeline, VK_ERROR_OUT_OF_HOST_MEMORY);
}
anv_nir_validate_push_layout(&stage.prog_data.base, &stage.bind_map);
@@ -1900,588 +2684,662 @@ anv_pipeline_compile_cs(struct anv_compute_pipeline *pipeline,
stage.bind_map.surface_to_descriptor[0].set = ANV_DESCRIPTOR_SET_NULL;
}
- const unsigned code_size = stage.prog_data.base.program_size;
- bin = anv_device_upload_kernel(pipeline->base.device, cache,
- MESA_SHADER_COMPUTE,
- &stage.cache_key, sizeof(stage.cache_key),
- stage.code, code_size,
- &stage.prog_data.base,
- sizeof(stage.prog_data.cs),
- stage.stats, stage.num_stats,
- NULL, &stage.bind_map);
- if (!bin) {
+ struct anv_shader_upload_params upload_params = {
+ .stage = MESA_SHADER_COMPUTE,
+ .key_data = &stage.cache_key,
+ .key_size = sizeof(stage.cache_key),
+ .kernel_data = stage.code,
+ .kernel_size = stage.prog_data.base.program_size,
+ .prog_data = &stage.prog_data.base,
+ .prog_data_size = sizeof(stage.prog_data.cs),
+ .stats = stage.stats,
+ .num_stats = stage.num_stats,
+ .bind_map = &stage.bind_map,
+ .push_desc_info = &stage.push_desc_info,
+ .dynamic_push_values = stage.dynamic_push_values,
+ };
+
+ stage.bin = anv_device_upload_kernel(device, cache, &upload_params);
+ if (!stage.bin) {
ralloc_free(mem_ctx);
- return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+ return vk_error(pipeline, VK_ERROR_OUT_OF_HOST_MEMORY);
}
stage.feedback.duration = os_time_get_nano() - stage_start;
}
- anv_pipeline_add_executables(&pipeline->base, &stage, bin);
+ anv_pipeline_account_shader(&pipeline->base, stage.bin);
+ anv_pipeline_add_executables(&pipeline->base, &stage);
+ pipeline->source_hash = stage.source_hash;
ralloc_free(mem_ctx);
if (cache_hit) {
stage.feedback.flags |=
- VK_PIPELINE_CREATION_FEEDBACK_APPLICATION_PIPELINE_CACHE_HIT_BIT_EXT;
+ VK_PIPELINE_CREATION_FEEDBACK_APPLICATION_PIPELINE_CACHE_HIT_BIT;
pipeline_feedback.flags |=
- VK_PIPELINE_CREATION_FEEDBACK_APPLICATION_PIPELINE_CACHE_HIT_BIT_EXT;
+ VK_PIPELINE_CREATION_FEEDBACK_APPLICATION_PIPELINE_CACHE_HIT_BIT;
}
pipeline_feedback.duration = os_time_get_nano() - pipeline_start;
- const VkPipelineCreationFeedbackCreateInfoEXT *create_feedback =
- vk_find_struct_const(info->pNext, PIPELINE_CREATION_FEEDBACK_CREATE_INFO_EXT);
+ const VkPipelineCreationFeedbackCreateInfo *create_feedback =
+ vk_find_struct_const(info->pNext, PIPELINE_CREATION_FEEDBACK_CREATE_INFO);
if (create_feedback) {
*create_feedback->pPipelineCreationFeedback = pipeline_feedback;
- assert(create_feedback->pipelineStageCreationFeedbackCount == 1);
- create_feedback->pPipelineStageCreationFeedbacks[0] = stage.feedback;
+ if (create_feedback->pipelineStageCreationFeedbackCount) {
+ assert(create_feedback->pipelineStageCreationFeedbackCount == 1);
+ create_feedback->pPipelineStageCreationFeedbacks[0] = stage.feedback;
+ }
}
- pipeline->cs = bin;
+ pipeline->cs = stage.bin;
return VK_SUCCESS;
}
-/**
- * Copy pipeline state not marked as dynamic.
- * Dynamic state is pipeline state which hasn't been provided at pipeline
- * creation time, but is dynamically provided afterwards using various
- * vkCmdSet* functions.
- *
- * The set of state considered "non_dynamic" is determined by the pieces of
- * state that have their corresponding VkDynamicState enums omitted from
- * VkPipelineDynamicStateCreateInfo::pDynamicStates.
- *
- * @param[out] pipeline Destination non_dynamic state.
- * @param[in] pCreateInfo Source of non_dynamic state to be copied.
- */
-static void
-copy_non_dynamic_state(struct anv_graphics_pipeline *pipeline,
- const VkGraphicsPipelineCreateInfo *pCreateInfo)
+static VkResult
+anv_compute_pipeline_create(struct anv_device *device,
+ struct vk_pipeline_cache *cache,
+ const VkComputePipelineCreateInfo *pCreateInfo,
+ const VkAllocationCallbacks *pAllocator,
+ VkPipeline *pPipeline)
{
- anv_cmd_dirty_mask_t states = ANV_CMD_DIRTY_DYNAMIC_ALL;
- struct anv_subpass *subpass = pipeline->subpass;
+ struct anv_compute_pipeline *pipeline;
+ VkResult result;
- pipeline->dynamic_state = default_dynamic_state;
+ assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO);
- states &= ~pipeline->dynamic_states;
+ pipeline = vk_zalloc2(&device->vk.alloc, pAllocator, sizeof(*pipeline), 8,
+ VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
+ if (pipeline == NULL)
+ return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
- struct anv_dynamic_state *dynamic = &pipeline->dynamic_state;
+ result = anv_pipeline_init(&pipeline->base, device,
+ ANV_PIPELINE_COMPUTE,
+ vk_compute_pipeline_create_flags(pCreateInfo),
+ pAllocator);
+ if (result != VK_SUCCESS) {
+ vk_free2(&device->vk.alloc, pAllocator, pipeline);
+ return result;
+ }
- bool raster_discard =
- pCreateInfo->pRasterizationState->rasterizerDiscardEnable &&
- !(pipeline->dynamic_states & ANV_CMD_DIRTY_DYNAMIC_RASTERIZER_DISCARD_ENABLE);
- /* Section 9.2 of the Vulkan 1.0.15 spec says:
- *
- * pViewportState is [...] NULL if the pipeline
- * has rasterization disabled.
- */
- if (!raster_discard) {
- assert(pCreateInfo->pViewportState);
-
- dynamic->viewport.count = pCreateInfo->pViewportState->viewportCount;
- if (states & ANV_CMD_DIRTY_DYNAMIC_VIEWPORT) {
- typed_memcpy(dynamic->viewport.viewports,
- pCreateInfo->pViewportState->pViewports,
- pCreateInfo->pViewportState->viewportCount);
- }
+ ANV_FROM_HANDLE(anv_pipeline_layout, pipeline_layout, pCreateInfo->layout);
+ anv_pipeline_init_layout(&pipeline->base, pipeline_layout);
- dynamic->scissor.count = pCreateInfo->pViewportState->scissorCount;
- if (states & ANV_CMD_DIRTY_DYNAMIC_SCISSOR) {
- typed_memcpy(dynamic->scissor.scissors,
- pCreateInfo->pViewportState->pScissors,
- pCreateInfo->pViewportState->scissorCount);
- }
- }
+ pipeline->base.active_stages = VK_SHADER_STAGE_COMPUTE_BIT;
- if (states & ANV_CMD_DIRTY_DYNAMIC_LINE_WIDTH) {
- assert(pCreateInfo->pRasterizationState);
- dynamic->line_width = pCreateInfo->pRasterizationState->lineWidth;
- }
+ anv_batch_set_storage(&pipeline->base.batch, ANV_NULL_ADDRESS,
+ pipeline->batch_data, sizeof(pipeline->batch_data));
- if (states & ANV_CMD_DIRTY_DYNAMIC_DEPTH_BIAS) {
- assert(pCreateInfo->pRasterizationState);
- dynamic->depth_bias.bias =
- pCreateInfo->pRasterizationState->depthBiasConstantFactor;
- dynamic->depth_bias.clamp =
- pCreateInfo->pRasterizationState->depthBiasClamp;
- dynamic->depth_bias.slope =
- pCreateInfo->pRasterizationState->depthBiasSlopeFactor;
+ result = anv_pipeline_compile_cs(pipeline, cache, pCreateInfo);
+ if (result != VK_SUCCESS) {
+ anv_pipeline_finish(&pipeline->base, device);
+ vk_free2(&device->vk.alloc, pAllocator, pipeline);
+ return result;
}
- if (states & ANV_CMD_DIRTY_DYNAMIC_CULL_MODE) {
- assert(pCreateInfo->pRasterizationState);
- dynamic->cull_mode =
- pCreateInfo->pRasterizationState->cullMode;
- }
+ anv_genX(device->info, compute_pipeline_emit)(pipeline);
- if (states & ANV_CMD_DIRTY_DYNAMIC_FRONT_FACE) {
- assert(pCreateInfo->pRasterizationState);
- dynamic->front_face =
- pCreateInfo->pRasterizationState->frontFace;
- }
+ ANV_RMV(compute_pipeline_create, device, pipeline, false);
- if (states & ANV_CMD_DIRTY_DYNAMIC_PRIMITIVE_TOPOLOGY) {
- assert(pCreateInfo->pInputAssemblyState);
- dynamic->primitive_topology = pCreateInfo->pInputAssemblyState->topology;
- }
+ *pPipeline = anv_pipeline_to_handle(&pipeline->base);
- if (states & ANV_CMD_DIRTY_DYNAMIC_RASTERIZER_DISCARD_ENABLE) {
- assert(pCreateInfo->pRasterizationState);
- dynamic->raster_discard =
- pCreateInfo->pRasterizationState->rasterizerDiscardEnable;
- }
+ return pipeline->base.batch.status;
+}
- if (states & ANV_CMD_DIRTY_DYNAMIC_DEPTH_BIAS_ENABLE) {
- assert(pCreateInfo->pRasterizationState);
- dynamic->depth_bias_enable =
- pCreateInfo->pRasterizationState->depthBiasEnable;
- }
+VkResult anv_CreateComputePipelines(
+ VkDevice _device,
+ VkPipelineCache pipelineCache,
+ uint32_t count,
+ const VkComputePipelineCreateInfo* pCreateInfos,
+ const VkAllocationCallbacks* pAllocator,
+ VkPipeline* pPipelines)
+{
+ ANV_FROM_HANDLE(anv_device, device, _device);
+ ANV_FROM_HANDLE(vk_pipeline_cache, pipeline_cache, pipelineCache);
- if (states & ANV_CMD_DIRTY_DYNAMIC_PRIMITIVE_RESTART_ENABLE) {
- assert(pCreateInfo->pInputAssemblyState);
- dynamic->primitive_restart_enable =
- pCreateInfo->pInputAssemblyState->primitiveRestartEnable;
- }
+ VkResult result = VK_SUCCESS;
- /* Section 9.2 of the Vulkan 1.0.15 spec says:
- *
- * pColorBlendState is [...] NULL if the pipeline has rasterization
- * disabled or if the subpass of the render pass the pipeline is
- * created against does not use any color attachments.
- */
- bool uses_color_att = false;
- for (unsigned i = 0; i < subpass->color_count; ++i) {
- if (subpass->color_attachments[i].attachment != VK_ATTACHMENT_UNUSED) {
- uses_color_att = true;
+ unsigned i;
+ for (i = 0; i < count; i++) {
+ const VkPipelineCreateFlags2KHR flags =
+ vk_compute_pipeline_create_flags(&pCreateInfos[i]);
+ VkResult res = anv_compute_pipeline_create(device, pipeline_cache,
+ &pCreateInfos[i],
+ pAllocator, &pPipelines[i]);
+
+ if (res == VK_SUCCESS)
+ continue;
+
+ /* Bail out on the first error != VK_PIPELINE_COMPILE_REQUIRED as it
+ * is not obvious which error should be reported when there are two
+ * different failures.
+ */
+ result = res;
+ if (res != VK_PIPELINE_COMPILE_REQUIRED)
break;
- }
- }
- if (uses_color_att && !raster_discard) {
- assert(pCreateInfo->pColorBlendState);
+ pPipelines[i] = VK_NULL_HANDLE;
- if (states & ANV_CMD_DIRTY_DYNAMIC_BLEND_CONSTANTS)
- typed_memcpy(dynamic->blend_constants,
- pCreateInfo->pColorBlendState->blendConstants, 4);
+ if (flags & VK_PIPELINE_CREATE_2_EARLY_RETURN_ON_FAILURE_BIT_KHR)
+ break;
}
- /* If there is no depthstencil attachment, then don't read
- * pDepthStencilState. The Vulkan spec states that pDepthStencilState may
- * be NULL in this case. Even if pDepthStencilState is non-NULL, there is
- * no need to override the depthstencil defaults in
- * anv_pipeline::dynamic_state when there is no depthstencil attachment.
- *
- * Section 9.2 of the Vulkan 1.0.15 spec says:
- *
- * pDepthStencilState is [...] NULL if the pipeline has rasterization
- * disabled or if the subpass of the render pass the pipeline is created
- * against does not use a depth/stencil attachment.
- */
- if (!raster_discard && subpass->depth_stencil_attachment) {
- assert(pCreateInfo->pDepthStencilState);
-
- if (states & ANV_CMD_DIRTY_DYNAMIC_DEPTH_BOUNDS) {
- dynamic->depth_bounds.min =
- pCreateInfo->pDepthStencilState->minDepthBounds;
- dynamic->depth_bounds.max =
- pCreateInfo->pDepthStencilState->maxDepthBounds;
- }
+ for (; i < count; i++)
+ pPipelines[i] = VK_NULL_HANDLE;
- if (states & ANV_CMD_DIRTY_DYNAMIC_STENCIL_COMPARE_MASK) {
- dynamic->stencil_compare_mask.front =
- pCreateInfo->pDepthStencilState->front.compareMask;
- dynamic->stencil_compare_mask.back =
- pCreateInfo->pDepthStencilState->back.compareMask;
- }
+ return result;
+}
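An application-side sketch of the two flags this loop and anv_pipeline_compile_cs react to; the wrapper function is hypothetical. FAIL_ON_PIPELINE_COMPILE_REQUIRED turns a cache miss into VK_PIPELINE_COMPILE_REQUIRED instead of a compile, and EARLY_RETURN_ON_FAILURE stops the batch at the first failure, with the remaining handles set to VK_NULL_HANDLE as done above.
#include <assert.h>
#include <vulkan/vulkan.h>
static VkResult
example_create_from_cache_only(VkDevice device, VkPipelineCache cache,
                               const VkComputePipelineCreateInfo *infos,
                               uint32_t count, VkPipeline *pipelines)
{
   VkComputePipelineCreateInfo local[8];
   assert(count <= 8);
   for (uint32_t i = 0; i < count; i++) {
      local[i] = infos[i];
      local[i].flags |= VK_PIPELINE_CREATE_FAIL_ON_PIPELINE_COMPILE_REQUIRED_BIT |
                        VK_PIPELINE_CREATE_EARLY_RETURN_ON_FAILURE_BIT;
   }
   /* Returns VK_PIPELINE_COMPILE_REQUIRED as soon as one pipeline is not in
    * the cache; the application can then schedule a real compile elsewhere.
    */
   return vkCreateComputePipelines(device, cache, count, local, NULL, pipelines);
}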
- if (states & ANV_CMD_DIRTY_DYNAMIC_STENCIL_WRITE_MASK) {
- dynamic->stencil_write_mask.front =
- pCreateInfo->pDepthStencilState->front.writeMask;
- dynamic->stencil_write_mask.back =
- pCreateInfo->pDepthStencilState->back.writeMask;
- }
+/**
+ * Calculate the desired L3 partitioning based on the current state of the
+ * pipeline. For now this simply returns the conservative defaults calculated
+ * by get_default_l3_weights(), but we could probably do better by gathering
+ * more statistics from the pipeline state (e.g. guess of expected URB usage
+ * and bound surfaces), or by using feedback from performance counters.
+ */
+void
+anv_pipeline_setup_l3_config(struct anv_pipeline *pipeline, bool needs_slm)
+{
+ const struct intel_device_info *devinfo = pipeline->device->info;
- if (states & ANV_CMD_DIRTY_DYNAMIC_STENCIL_REFERENCE) {
- dynamic->stencil_reference.front =
- pCreateInfo->pDepthStencilState->front.reference;
- dynamic->stencil_reference.back =
- pCreateInfo->pDepthStencilState->back.reference;
- }
+ const struct intel_l3_weights w =
+ intel_get_default_l3_weights(devinfo, true, needs_slm);
- if (states & ANV_CMD_DIRTY_DYNAMIC_DEPTH_TEST_ENABLE) {
- dynamic->depth_test_enable =
- pCreateInfo->pDepthStencilState->depthTestEnable;
- }
+ pipeline->l3_config = intel_get_l3_config(devinfo, w);
+}
- if (states & ANV_CMD_DIRTY_DYNAMIC_DEPTH_WRITE_ENABLE) {
- dynamic->depth_write_enable =
- pCreateInfo->pDepthStencilState->depthWriteEnable;
- }
+static uint32_t
+get_vs_input_elements(const struct brw_vs_prog_data *vs_prog_data)
+{
+ /* Pull inputs_read out of the VS prog data */
+ const uint64_t inputs_read = vs_prog_data->inputs_read;
+ const uint64_t double_inputs_read =
+ vs_prog_data->double_inputs_read & inputs_read;
+ assert((inputs_read & ((1 << VERT_ATTRIB_GENERIC0) - 1)) == 0);
+ const uint32_t elements = inputs_read >> VERT_ATTRIB_GENERIC0;
+ const uint32_t elements_double = double_inputs_read >> VERT_ATTRIB_GENERIC0;
+
+ return __builtin_popcount(elements) -
+ __builtin_popcount(elements_double) / 2;
+}
- if (states & ANV_CMD_DIRTY_DYNAMIC_DEPTH_COMPARE_OP) {
- dynamic->depth_compare_op =
- pCreateInfo->pDepthStencilState->depthCompareOp;
- }
+static void
+anv_graphics_pipeline_emit(struct anv_graphics_pipeline *pipeline,
+ const struct vk_graphics_pipeline_state *state)
+{
+ pipeline->view_mask = state->rp->view_mask;
- if (states & ANV_CMD_DIRTY_DYNAMIC_DEPTH_BOUNDS_TEST_ENABLE) {
- dynamic->depth_bounds_test_enable =
- pCreateInfo->pDepthStencilState->depthBoundsTestEnable;
- }
+ anv_pipeline_setup_l3_config(&pipeline->base.base, false);
- if (states & ANV_CMD_DIRTY_DYNAMIC_STENCIL_TEST_ENABLE) {
- dynamic->stencil_test_enable =
- pCreateInfo->pDepthStencilState->stencilTestEnable;
- }
+ if (anv_pipeline_is_primitive(pipeline)) {
+ const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
- if (states & ANV_CMD_DIRTY_DYNAMIC_STENCIL_OP) {
- const VkPipelineDepthStencilStateCreateInfo *info =
- pCreateInfo->pDepthStencilState;
- memcpy(&dynamic->stencil_op.front, &info->front,
- sizeof(dynamic->stencil_op.front));
- memcpy(&dynamic->stencil_op.back, &info->back,
- sizeof(dynamic->stencil_op.back));
- }
+ /* The total number of vertex elements we need to program. We might need
+ * a couple more to implement some of the draw parameters.
+ */
+ pipeline->svgs_count =
+ (vs_prog_data->uses_vertexid ||
+ vs_prog_data->uses_instanceid ||
+ vs_prog_data->uses_firstvertex ||
+ vs_prog_data->uses_baseinstance) + vs_prog_data->uses_drawid;
+
+ pipeline->vs_input_elements = get_vs_input_elements(vs_prog_data);
+
+ pipeline->vertex_input_elems =
+ (BITSET_TEST(state->dynamic, MESA_VK_DYNAMIC_VI) ?
+ 0 : pipeline->vs_input_elements) + pipeline->svgs_count;
+
+ /* Our implementation of VK_KHR_multiview uses instancing to draw the
+ * different views when primitive replication cannot be used. If the
+ * client asks for instancing, we need to multiply both the client's
+ * instance count at draw time and the instance divisor in the vertex
+ * bindings by the number of views, to ensure that we repeat the client's
+ * per-instance data once for each view.
+ */
+ const bool uses_primitive_replication =
+ anv_pipeline_get_last_vue_prog_data(pipeline)->vue_map.num_pos_slots > 1;
+ pipeline->instance_multiplier = 1;
+ if (pipeline->view_mask && !uses_primitive_replication)
+ pipeline->instance_multiplier = util_bitcount(pipeline->view_mask);
+ } else {
+ assert(anv_pipeline_is_mesh(pipeline));
+ /* TODO(mesh): Mesh vs. Multiview with Instancing. */
}
- const VkPipelineRasterizationLineStateCreateInfoEXT *line_state =
- vk_find_struct_const(pCreateInfo->pRasterizationState->pNext,
- PIPELINE_RASTERIZATION_LINE_STATE_CREATE_INFO_EXT);
- if (!raster_discard && line_state && line_state->stippledLineEnable) {
- if (states & ANV_CMD_DIRTY_DYNAMIC_LINE_STIPPLE) {
- dynamic->line_stipple.factor = line_state->lineStippleFactor;
- dynamic->line_stipple.pattern = line_state->lineStipplePattern;
- }
- }
- const VkPipelineMultisampleStateCreateInfo *ms_info =
- pCreateInfo->pRasterizationState->rasterizerDiscardEnable ? NULL :
- pCreateInfo->pMultisampleState;
- if (states & ANV_CMD_DIRTY_DYNAMIC_SAMPLE_LOCATIONS) {
- const VkPipelineSampleLocationsStateCreateInfoEXT *sl_info = ms_info ?
- vk_find_struct_const(ms_info, PIPELINE_SAMPLE_LOCATIONS_STATE_CREATE_INFO_EXT) : NULL;
-
- if (sl_info) {
- dynamic->sample_locations.samples =
- sl_info->sampleLocationsInfo.sampleLocationsCount;
- const VkSampleLocationEXT *positions =
- sl_info->sampleLocationsInfo.pSampleLocations;
- for (uint32_t i = 0; i < dynamic->sample_locations.samples; i++) {
- dynamic->sample_locations.locations[i].x = positions[i].x;
- dynamic->sample_locations.locations[i].y = positions[i].y;
- }
- }
+ pipeline->dynamic_patch_control_points =
+ anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_CTRL) &&
+ BITSET_TEST(state->dynamic, MESA_VK_DYNAMIC_TS_PATCH_CONTROL_POINTS) &&
+ (pipeline->base.shaders[MESA_SHADER_TESS_CTRL]->dynamic_push_values &
+ ANV_DYNAMIC_PUSH_INPUT_VERTICES);
+
+ if (pipeline->base.shaders[MESA_SHADER_FRAGMENT] && state->ms) {
+ pipeline->sample_shading_enable = state->ms->sample_shading_enable;
+ pipeline->min_sample_shading = state->ms->min_sample_shading;
}
- /* Ensure we always have valid values for sample_locations. */
- if (pipeline->base.device->vk.enabled_extensions.EXT_sample_locations &&
- dynamic->sample_locations.samples == 0) {
- dynamic->sample_locations.samples =
- ms_info ? ms_info->rasterizationSamples : 1;
- const struct intel_sample_position *positions =
- intel_get_sample_positions(dynamic->sample_locations.samples);
- for (uint32_t i = 0; i < dynamic->sample_locations.samples; i++) {
- dynamic->sample_locations.locations[i].x = positions[i].x;
- dynamic->sample_locations.locations[i].y = positions[i].y;
- }
+
+ const struct anv_device *device = pipeline->base.base.device;
+ const struct intel_device_info *devinfo = device->info;
+ anv_genX(devinfo, graphics_pipeline_emit)(pipeline, state);
+}
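A small worked sketch of the instance_multiplier computed above, assuming the same semantics: with view_mask = 0x5 (views 0 and 2) and no primitive replication, util_bitcount(0x5) = 2, so each draw effectively runs with twice the client's instance count. The helper name is hypothetical.
#include "util/bitscan.h"
static uint32_t
example_instance_multiplier(uint32_t view_mask, bool uses_primitive_replication)
{
   if (view_mask == 0 || uses_primitive_replication)
      return 1;                      /* no multiview, or HW replication */
   return util_bitcount(view_mask);  /* one HW instance per enabled view */
}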
+
+static void
+anv_graphics_pipeline_import_layout(struct anv_graphics_base_pipeline *pipeline,
+ struct anv_pipeline_sets_layout *layout)
+{
+ pipeline->base.layout.independent_sets |= layout->independent_sets;
+
+ for (uint32_t s = 0; s < layout->num_sets; s++) {
+ if (layout->set[s].layout == NULL)
+ continue;
+
+ anv_pipeline_sets_layout_add(&pipeline->base.layout, s,
+ layout->set[s].layout);
}
+}
- if (states & ANV_CMD_DIRTY_DYNAMIC_COLOR_BLEND_STATE) {
- if (!pCreateInfo->pRasterizationState->rasterizerDiscardEnable &&
- uses_color_att) {
- assert(pCreateInfo->pColorBlendState);
- const VkPipelineColorWriteCreateInfoEXT *color_write_info =
- vk_find_struct_const(pCreateInfo->pColorBlendState->pNext,
- PIPELINE_COLOR_WRITE_CREATE_INFO_EXT);
+static void
+anv_graphics_pipeline_import_lib(struct anv_graphics_base_pipeline *pipeline,
+ bool link_optimize,
+ bool retain_shaders,
+ struct anv_pipeline_stage *stages,
+ struct anv_graphics_lib_pipeline *lib)
+{
+ struct anv_pipeline_sets_layout *lib_layout =
+ &lib->base.base.layout;
+ anv_graphics_pipeline_import_layout(pipeline, lib_layout);
- if (color_write_info) {
- dynamic->color_writes = 0;
- for (uint32_t i = 0; i < color_write_info->attachmentCount; i++) {
- dynamic->color_writes |=
- color_write_info->pColorWriteEnables[i] ? (1u << i) : 0;
- }
- }
+ /* We can't have shaders specified twice through libraries. */
+ assert((pipeline->base.active_stages & lib->base.base.active_stages) == 0);
+
+ /* VK_EXT_graphics_pipeline_library:
+ *
+ * "To perform link time optimizations,
+ * VK_PIPELINE_CREATE_RETAIN_LINK_TIME_OPTIMIZATION_INFO_BIT_EXT must
+ * be specified on all pipeline libraries that are being linked
+ * together. Implementations should retain any additional information
+ * needed to perform optimizations at the final link step when this bit
+ * is present."
+ */
+ assert(!link_optimize || lib->retain_shaders);
+
+ pipeline->base.active_stages |= lib->base.base.active_stages;
+
+ /* Propagate the fragment dynamic flag, unless we're doing link
+ * optimization, in which case we'll have all the state information and
+ * this will never be dynamic.
+ */
+ if (!link_optimize) {
+ if (lib->base.fragment_dynamic) {
+ assert(lib->base.base.active_stages & VK_SHADER_STAGE_FRAGMENT_BIT);
+ pipeline->fragment_dynamic = true;
}
}
- const VkPipelineFragmentShadingRateStateCreateInfoKHR *fsr_state =
- vk_find_struct_const(pCreateInfo->pNext,
- PIPELINE_FRAGMENT_SHADING_RATE_STATE_CREATE_INFO_KHR);
- if (fsr_state) {
- if (states & ANV_CMD_DIRTY_DYNAMIC_SHADING_RATE)
- dynamic->fragment_shading_rate = fsr_state->fragmentSize;
- }
+ uint32_t shader_count = anv_graphics_pipeline_imported_shader_count(stages);
+ for (uint32_t s = 0; s < ARRAY_SIZE(lib->base.shaders); s++) {
+ if (lib->base.shaders[s] == NULL)
+ continue;
- pipeline->dynamic_state_mask = states;
+ stages[s].stage = s;
+ stages[s].feedback_idx = shader_count + lib->base.feedback_index[s];
+ stages[s].robust_flags = lib->base.robust_flags[s];
- /* Mark states that can either be dynamic or fully baked into the pipeline.
- */
- pipeline->static_state_mask = states &
- (ANV_CMD_DIRTY_DYNAMIC_SAMPLE_LOCATIONS |
- ANV_CMD_DIRTY_DYNAMIC_COLOR_BLEND_STATE |
- ANV_CMD_DIRTY_DYNAMIC_SHADING_RATE |
- ANV_CMD_DIRTY_DYNAMIC_RASTERIZER_DISCARD_ENABLE |
- ANV_CMD_DIRTY_DYNAMIC_LOGIC_OP |
- ANV_CMD_DIRTY_DYNAMIC_PRIMITIVE_TOPOLOGY);
-}
+ /* Always import the shader sha1; it will be used for cache lookup. */
+ memcpy(stages[s].shader_sha1, lib->retained_shaders[s].shader_sha1,
+ sizeof(stages[s].shader_sha1));
+ stages[s].source_hash = lib->base.source_hashes[s];
-static void
-anv_pipeline_validate_create_info(const VkGraphicsPipelineCreateInfo *info)
-{
-#ifdef DEBUG
- struct anv_render_pass *renderpass = NULL;
- struct anv_subpass *subpass = NULL;
+ stages[s].subgroup_size_type = lib->retained_shaders[s].subgroup_size_type;
+ stages[s].imported.nir = lib->retained_shaders[s].nir;
+ stages[s].imported.bin = lib->base.shaders[s];
+ }
- /* Assert that all required members of VkGraphicsPipelineCreateInfo are
- * present. See the Vulkan 1.0.28 spec, Section 9.2 Graphics Pipelines.
+ /* When not link optimizing, import the executables (shader descriptions
+ * for VK_KHR_pipeline_executable_properties). With link optimization there
+ * is a chance it'll produce different binaries, so we'll add the optimized
+ * version later.
*/
- assert(info->sType == VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO);
-
- renderpass = anv_render_pass_from_handle(info->renderPass);
- assert(renderpass);
-
- assert(info->subpass < renderpass->subpass_count);
- subpass = &renderpass->subpasses[info->subpass];
-
- assert(info->stageCount >= 1);
- assert(info->pVertexInputState);
- assert(info->pInputAssemblyState);
- assert(info->pRasterizationState);
- if (!info->pRasterizationState->rasterizerDiscardEnable) {
- assert(info->pViewportState);
- assert(info->pMultisampleState);
-
- if (subpass && subpass->depth_stencil_attachment)
- assert(info->pDepthStencilState);
-
- if (subpass && subpass->color_count > 0) {
- bool all_color_unused = true;
- for (int i = 0; i < subpass->color_count; i++) {
- if (subpass->color_attachments[i].attachment != VK_ATTACHMENT_UNUSED)
- all_color_unused = false;
- }
- /* pColorBlendState is ignored if the pipeline has rasterization
- * disabled or if the subpass of the render pass the pipeline is
- * created against does not use any color attachments.
- */
- assert(info->pColorBlendState || all_color_unused);
+ if (!link_optimize) {
+ util_dynarray_foreach(&lib->base.base.executables,
+ struct anv_pipeline_executable, exe) {
+ util_dynarray_append(&pipeline->base.executables,
+ struct anv_pipeline_executable, *exe);
}
}
+}
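For context, a sketch of the application-side VK_EXT_graphics_pipeline_library usage that drives this import path; the helper and its parameters are illustrative. Libraries are created with VK_PIPELINE_CREATE_LIBRARY_BIT_KHR (plus RETAIN_LINK_TIME_OPTIMIZATION_INFO_BIT_EXT if they may later be optimized), and the final pipeline links them through VkPipelineLibraryCreateInfoKHR, optionally with LINK_TIME_OPTIMIZATION_BIT_EXT.
#include <vulkan/vulkan.h>
/* Fill the pNext chain for linking previously created library pipelines;
 * `libs` would hold e.g. a pre-rasterization and a fragment-shader library.
 */
static void
example_fill_link_info(const VkPipeline *libs, uint32_t lib_count, bool optimize,
                       VkPipelineLibraryCreateInfoKHR *link,
                       VkGraphicsPipelineCreateInfo *info)
{
   *link = (VkPipelineLibraryCreateInfoKHR) {
      .sType = VK_STRUCTURE_TYPE_PIPELINE_LIBRARY_CREATE_INFO_KHR,
      .libraryCount = lib_count,
      .pLibraries = libs,
   };
   *info = (VkGraphicsPipelineCreateInfo) {
      .sType = VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO,
      .pNext = link,
      /* Optimized linking requires the libraries to have been built with
       * RETAIN_LINK_TIME_OPTIMIZATION_INFO, which is what the assert in the
       * import code above checks.
       */
      .flags = optimize ? VK_PIPELINE_CREATE_LINK_TIME_OPTIMIZATION_BIT_EXT : 0,
   };
}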
- for (uint32_t i = 0; i < info->stageCount; ++i) {
- switch (info->pStages[i].stage) {
- case VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT:
- case VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT:
- assert(info->pTessellationState);
- break;
- default:
- break;
+static void
+anv_graphics_lib_validate_shaders(struct anv_graphics_lib_pipeline *lib,
+ bool retained_shaders)
+{
+ for (uint32_t s = 0; s < ARRAY_SIZE(lib->retained_shaders); s++) {
+ if (anv_pipeline_base_has_stage(&lib->base, s)) {
+ assert(!retained_shaders || lib->retained_shaders[s].nir != NULL);
+ assert(lib->base.shaders[s] != NULL);
}
}
-#endif
}
-/**
- * Calculate the desired L3 partitioning based on the current state of the
- * pipeline. For now this simply returns the conservative defaults calculated
- * by get_default_l3_weights(), but we could probably do better by gathering
- * more statistics from the pipeline state (e.g. guess of expected URB usage
- * and bound surfaces), or by using feed-back from performance counters.
- */
-void
-anv_pipeline_setup_l3_config(struct anv_pipeline *pipeline, bool needs_slm)
+static VkResult
+anv_graphics_lib_pipeline_create(struct anv_device *device,
+ struct vk_pipeline_cache *cache,
+ const VkGraphicsPipelineCreateInfo *pCreateInfo,
+ const VkAllocationCallbacks *pAllocator,
+ VkPipeline *pPipeline)
{
- const struct intel_device_info *devinfo = &pipeline->device->info;
+ struct anv_pipeline_stage stages[ANV_GRAPHICS_SHADER_STAGE_COUNT] = {};
+ VkPipelineCreationFeedback pipeline_feedback = {
+ .flags = VK_PIPELINE_CREATION_FEEDBACK_VALID_BIT,
+ };
+ int64_t pipeline_start = os_time_get_nano();
- const struct intel_l3_weights w =
- intel_get_default_l3_weights(devinfo, true, needs_slm);
+ struct anv_graphics_lib_pipeline *pipeline;
+ VkResult result;
- pipeline->l3_config = intel_get_l3_config(devinfo, w);
-}
+ const VkPipelineCreateFlags2KHR flags =
+ vk_graphics_pipeline_create_flags(pCreateInfo);
+ assert(flags & VK_PIPELINE_CREATE_2_LIBRARY_BIT_KHR);
-static VkLineRasterizationModeEXT
-vk_line_rasterization_mode(const VkPipelineRasterizationLineStateCreateInfoEXT *line_info,
- const VkPipelineMultisampleStateCreateInfo *ms_info)
-{
- VkLineRasterizationModeEXT line_mode =
- line_info ? line_info->lineRasterizationMode :
- VK_LINE_RASTERIZATION_MODE_DEFAULT_EXT;
+ const VkPipelineLibraryCreateInfoKHR *libs_info =
+ vk_find_struct_const(pCreateInfo->pNext,
+ PIPELINE_LIBRARY_CREATE_INFO_KHR);
- if (line_mode == VK_LINE_RASTERIZATION_MODE_DEFAULT_EXT) {
- if (ms_info && ms_info->rasterizationSamples > 1) {
- return VK_LINE_RASTERIZATION_MODE_RECTANGULAR_EXT;
- } else {
- return VK_LINE_RASTERIZATION_MODE_BRESENHAM_EXT;
+ pipeline = vk_zalloc2(&device->vk.alloc, pAllocator, sizeof(*pipeline), 8,
+ VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
+ if (pipeline == NULL)
+ return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
+
+ result = anv_pipeline_init(&pipeline->base.base, device,
+ ANV_PIPELINE_GRAPHICS_LIB, flags,
+ pAllocator);
+ if (result != VK_SUCCESS) {
+ vk_free2(&device->vk.alloc, pAllocator, pipeline);
+ if (result == VK_PIPELINE_COMPILE_REQUIRED)
+ *pPipeline = VK_NULL_HANDLE;
+ return result;
+ }
+
+ /* Capture the retain state before we compile/load any shader. */
+ pipeline->retain_shaders =
+ (flags & VK_PIPELINE_CREATE_2_RETAIN_LINK_TIME_OPTIMIZATION_INFO_BIT_EXT) != 0;
+
+ /* If we have libraries, import them first. */
+ if (libs_info) {
+ for (uint32_t i = 0; i < libs_info->libraryCount; i++) {
+ ANV_FROM_HANDLE(anv_pipeline, pipeline_lib, libs_info->pLibraries[i]);
+ struct anv_graphics_lib_pipeline *gfx_pipeline_lib =
+ anv_pipeline_to_graphics_lib(pipeline_lib);
+
+ vk_graphics_pipeline_state_merge(&pipeline->state, &gfx_pipeline_lib->state);
+ anv_graphics_pipeline_import_lib(&pipeline->base,
+ false /* link_optimize */,
+ pipeline->retain_shaders,
+ stages, gfx_pipeline_lib);
}
}
- return line_mode;
+ result = vk_graphics_pipeline_state_fill(&device->vk,
+ &pipeline->state, pCreateInfo,
+ NULL /* driver_rp */,
+ 0 /* driver_rp_flags */,
+ &pipeline->all_state, NULL, 0, NULL);
+ if (result != VK_SUCCESS) {
+ anv_pipeline_finish(&pipeline->base.base, device);
+ vk_free2(&device->vk.alloc, pAllocator, pipeline);
+ return result;
+ }
+
+ pipeline->base.base.active_stages = pipeline->state.shader_stages;
+
+ /* After we've imported all the libraries' layouts, import the pipeline
+ * layout and hash the whole lot.
+ */
+ ANV_FROM_HANDLE(anv_pipeline_layout, pipeline_layout, pCreateInfo->layout);
+ if (pipeline_layout != NULL) {
+ anv_graphics_pipeline_import_layout(&pipeline->base,
+ &pipeline_layout->sets_layout);
+ }
+
+ anv_pipeline_sets_layout_hash(&pipeline->base.base.layout);
+
+ /* Compile shaders. We can skip this if there are no active stages in
+ * this pipeline.
+ */
+ if (pipeline->base.base.active_stages != 0) {
+ result = anv_graphics_pipeline_compile(&pipeline->base, stages,
+ cache, &pipeline_feedback,
+ pCreateInfo, &pipeline->state);
+ if (result != VK_SUCCESS) {
+ anv_pipeline_finish(&pipeline->base.base, device);
+ vk_free2(&device->vk.alloc, pAllocator, pipeline);
+ return result;
+ }
+ }
+
+ pipeline_feedback.duration = os_time_get_nano() - pipeline_start;
+
+ anv_fill_pipeline_creation_feedback(&pipeline->base, &pipeline_feedback,
+ pCreateInfo, stages);
+
+ anv_graphics_lib_validate_shaders(
+ pipeline,
+ flags & VK_PIPELINE_CREATE_2_RETAIN_LINK_TIME_OPTIMIZATION_INFO_BIT_EXT);
+
+ *pPipeline = anv_pipeline_to_handle(&pipeline->base.base);
+
+ return VK_SUCCESS;
}
-VkResult
-anv_graphics_pipeline_init(struct anv_graphics_pipeline *pipeline,
- struct anv_device *device,
- struct anv_pipeline_cache *cache,
- const VkGraphicsPipelineCreateInfo *pCreateInfo,
- const VkAllocationCallbacks *alloc)
+static VkResult
+anv_graphics_pipeline_create(struct anv_device *device,
+ struct vk_pipeline_cache *cache,
+ const VkGraphicsPipelineCreateInfo *pCreateInfo,
+ const VkAllocationCallbacks *pAllocator,
+ VkPipeline *pPipeline)
{
+ struct anv_pipeline_stage stages[ANV_GRAPHICS_SHADER_STAGE_COUNT] = {};
+ VkPipelineCreationFeedback pipeline_feedback = {
+ .flags = VK_PIPELINE_CREATION_FEEDBACK_VALID_BIT,
+ };
+ int64_t pipeline_start = os_time_get_nano();
+
+ struct anv_graphics_pipeline *pipeline;
VkResult result;
- anv_pipeline_validate_create_info(pCreateInfo);
+ const VkPipelineCreateFlags2KHR flags =
+ vk_graphics_pipeline_create_flags(pCreateInfo);
+ assert((flags & VK_PIPELINE_CREATE_2_LIBRARY_BIT_KHR) == 0);
- result = anv_pipeline_init(&pipeline->base, device,
- ANV_PIPELINE_GRAPHICS, pCreateInfo->flags,
- alloc);
- if (result != VK_SUCCESS)
+ const VkPipelineLibraryCreateInfoKHR *libs_info =
+ vk_find_struct_const(pCreateInfo->pNext,
+ PIPELINE_LIBRARY_CREATE_INFO_KHR);
+
+ pipeline = vk_zalloc2(&device->vk.alloc, pAllocator, sizeof(*pipeline), 8,
+ VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
+ if (pipeline == NULL)
+ return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
+
+ /* Initialize some information required by shaders */
+ result = anv_pipeline_init(&pipeline->base.base, device,
+ ANV_PIPELINE_GRAPHICS, flags,
+ pAllocator);
+ if (result != VK_SUCCESS) {
+ vk_free2(&device->vk.alloc, pAllocator, pipeline);
return result;
+ }
- anv_batch_set_storage(&pipeline->base.batch, ANV_NULL_ADDRESS,
- pipeline->batch_data, sizeof(pipeline->batch_data));
+ const bool link_optimize =
+ (flags & VK_PIPELINE_CREATE_2_LINK_TIME_OPTIMIZATION_BIT_EXT) != 0;
- ANV_FROM_HANDLE(anv_render_pass, render_pass, pCreateInfo->renderPass);
- assert(pCreateInfo->subpass < render_pass->subpass_count);
- pipeline->subpass = &render_pass->subpasses[pCreateInfo->subpass];
+ struct vk_graphics_pipeline_all_state all;
+ struct vk_graphics_pipeline_state state = { };
- assert(pCreateInfo->pRasterizationState);
+ /* If we have libraries, import them first. */
+ if (libs_info) {
+ for (uint32_t i = 0; i < libs_info->libraryCount; i++) {
+ ANV_FROM_HANDLE(anv_pipeline, pipeline_lib, libs_info->pLibraries[i]);
+ struct anv_graphics_lib_pipeline *gfx_pipeline_lib =
+ anv_pipeline_to_graphics_lib(pipeline_lib);
- if (pCreateInfo->pDynamicState) {
- /* Remove all of the states that are marked as dynamic */
- uint32_t count = pCreateInfo->pDynamicState->dynamicStateCount;
- for (uint32_t s = 0; s < count; s++) {
- pipeline->dynamic_states |= anv_cmd_dirty_bit_for_vk_dynamic_state(
- pCreateInfo->pDynamicState->pDynamicStates[s]);
+ /* If we have link time optimization, all libraries must be created
+ * with
+ * VK_PIPELINE_CREATE_RETAIN_LINK_TIME_OPTIMIZATION_INFO_BIT_EXT.
+ */
+ assert(!link_optimize || gfx_pipeline_lib->retain_shaders);
+
+ vk_graphics_pipeline_state_merge(&state, &gfx_pipeline_lib->state);
+ anv_graphics_pipeline_import_lib(&pipeline->base,
+ link_optimize,
+ false,
+ stages,
+ gfx_pipeline_lib);
}
}
- copy_non_dynamic_state(pipeline, pCreateInfo);
- pipeline->depth_clamp_enable = pCreateInfo->pRasterizationState->depthClampEnable;
+ result = vk_graphics_pipeline_state_fill(&device->vk, &state, pCreateInfo,
+ NULL /* driver_rp */,
+ 0 /* driver_rp_flags */,
+ &all, NULL, 0, NULL);
+ if (result != VK_SUCCESS) {
+ anv_pipeline_finish(&pipeline->base.base, device);
+ vk_free2(&device->vk.alloc, pAllocator, pipeline);
+ return result;
+ }
+
+ pipeline->dynamic_state.vi = &pipeline->vertex_input;
+ pipeline->dynamic_state.ms.sample_locations = &pipeline->base.sample_locations;
+ vk_dynamic_graphics_state_fill(&pipeline->dynamic_state, &state);
+
+ pipeline->base.base.active_stages = state.shader_stages;
+
+ /* Sanity check on the shaders */
+ assert(pipeline->base.base.active_stages & VK_SHADER_STAGE_VERTEX_BIT ||
+ pipeline->base.base.active_stages & VK_SHADER_STAGE_MESH_BIT_EXT);
- /* Previously we enabled depth clipping when !depthClampEnable.
- * DepthClipStateCreateInfo now makes depth clipping explicit so if the
- * clipping info is available, use its enable value to determine clipping,
- * otherwise fallback to the previous !depthClampEnable logic.
+ if (anv_pipeline_is_mesh(pipeline)) {
+ assert(device->physical->vk.supported_extensions.EXT_mesh_shader);
+ }
+
+ /* After we've imported all the libraries' layouts, import the pipeline
+ * layout and hash the whole lot.
*/
- const VkPipelineRasterizationDepthClipStateCreateInfoEXT *clip_info =
- vk_find_struct_const(pCreateInfo->pRasterizationState->pNext,
- PIPELINE_RASTERIZATION_DEPTH_CLIP_STATE_CREATE_INFO_EXT);
- pipeline->depth_clip_enable = clip_info ? clip_info->depthClipEnable : !pipeline->depth_clamp_enable;
+ ANV_FROM_HANDLE(anv_pipeline_layout, pipeline_layout, pCreateInfo->layout);
+ if (pipeline_layout != NULL) {
+ anv_graphics_pipeline_import_layout(&pipeline->base,
+ &pipeline_layout->sets_layout);
+ }
- pipeline->sample_shading_enable =
- !pCreateInfo->pRasterizationState->rasterizerDiscardEnable &&
- pCreateInfo->pMultisampleState &&
- pCreateInfo->pMultisampleState->sampleShadingEnable;
+ anv_pipeline_sets_layout_hash(&pipeline->base.base.layout);
- result = anv_pipeline_compile_graphics(pipeline, cache, pCreateInfo);
+ /* Compile shaders. All required information should have been copied in
+ * the previous step. We can skip this if there are no active stages in
+ * the pipeline.
+ */
+ result = anv_graphics_pipeline_compile(&pipeline->base, stages,
+ cache, &pipeline_feedback,
+ pCreateInfo, &state);
if (result != VK_SUCCESS) {
- anv_pipeline_finish(&pipeline->base, device, alloc);
+ anv_pipeline_finish(&pipeline->base.base, device);
+ vk_free2(&device->vk.alloc, pAllocator, pipeline);
return result;
}
- assert(pipeline->shaders[MESA_SHADER_VERTEX]);
-
- anv_pipeline_setup_l3_config(&pipeline->base, false);
-
- const VkPipelineVertexInputStateCreateInfo *vi_info =
- pCreateInfo->pVertexInputState;
+ /* Prepare a batch for the commands and emit all the non-dynamic ones.
+ */
+ anv_batch_set_storage(&pipeline->base.base.batch, ANV_NULL_ADDRESS,
+ pipeline->batch_data, sizeof(pipeline->batch_data));
- const uint64_t inputs_read = get_vs_prog_data(pipeline)->inputs_read;
+ if (pipeline->base.base.active_stages & VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT)
+ pipeline->base.base.active_stages |= VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT;
- for (uint32_t i = 0; i < vi_info->vertexAttributeDescriptionCount; i++) {
- const VkVertexInputAttributeDescription *desc =
- &vi_info->pVertexAttributeDescriptions[i];
+ if (anv_pipeline_is_mesh(pipeline))
+ assert(device->physical->vk.supported_extensions.EXT_mesh_shader);
- if (inputs_read & (1ull << (VERT_ATTRIB_GENERIC0 + desc->location)))
- pipeline->vb_used |= 1 << desc->binding;
- }
+ anv_graphics_pipeline_emit(pipeline, &state);
- for (uint32_t i = 0; i < vi_info->vertexBindingDescriptionCount; i++) {
- const VkVertexInputBindingDescription *desc =
- &vi_info->pVertexBindingDescriptions[i];
+ pipeline_feedback.duration = os_time_get_nano() - pipeline_start;
- pipeline->vb[desc->binding].stride = desc->stride;
+ anv_fill_pipeline_creation_feedback(&pipeline->base, &pipeline_feedback,
+ pCreateInfo, stages);
- /* Step rate is programmed per vertex element (attribute), not
- * binding. Set up a map of which bindings step per instance, for
- * reference by vertex element setup. */
- switch (desc->inputRate) {
- default:
- case VK_VERTEX_INPUT_RATE_VERTEX:
- pipeline->vb[desc->binding].instanced = false;
- break;
- case VK_VERTEX_INPUT_RATE_INSTANCE:
- pipeline->vb[desc->binding].instanced = true;
- break;
- }
+ ANV_RMV(graphics_pipeline_create, device, pipeline, false);
- pipeline->vb[desc->binding].instance_divisor = 1;
- }
+ *pPipeline = anv_pipeline_to_handle(&pipeline->base.base);
- const VkPipelineVertexInputDivisorStateCreateInfoEXT *vi_div_state =
- vk_find_struct_const(vi_info->pNext,
- PIPELINE_VERTEX_INPUT_DIVISOR_STATE_CREATE_INFO_EXT);
- if (vi_div_state) {
- for (uint32_t i = 0; i < vi_div_state->vertexBindingDivisorCount; i++) {
- const VkVertexInputBindingDivisorDescriptionEXT *desc =
- &vi_div_state->pVertexBindingDivisors[i];
+ return pipeline->base.base.batch.status;
+}
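/* Illustrative application-side sketch (not part of this patch): linking
 * previously created graphics pipeline library parts into a complete
 * pipeline, the path handled by anv_graphics_pipeline_create() above.
 * Assumes <vulkan/vulkan.h>; the handles passed in are placeholders
 * supplied by the caller.
 */
static VkResult
example_link_graphics_pipeline_libraries(VkDevice device,
                                         VkPipelineCache cache,
                                         VkPipelineLayout layout,
                                         uint32_t library_count,
                                         const VkPipeline *libraries,
                                         VkPipeline *out_pipeline)
{
   /* Chain the libraries into the create info; the driver merges their
    * state and shaders, optionally re-optimizing at link time.
    */
   const VkPipelineLibraryCreateInfoKHR lib_info = {
      .sType = VK_STRUCTURE_TYPE_PIPELINE_LIBRARY_CREATE_INFO_KHR,
      .libraryCount = library_count,
      .pLibraries = libraries,
   };
   const VkGraphicsPipelineCreateInfo info = {
      .sType = VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO,
      .pNext = &lib_info,
      /* Requires the libraries to have been built with
       * VK_PIPELINE_CREATE_RETAIN_LINK_TIME_OPTIMIZATION_INFO_BIT_EXT.
       */
      .flags = VK_PIPELINE_CREATE_LINK_TIME_OPTIMIZATION_BIT_EXT,
      .layout = layout,
   };
   return vkCreateGraphicsPipelines(device, cache, 1, &info, NULL,
                                    out_pipeline);
}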
- pipeline->vb[desc->binding].instance_divisor = desc->divisor;
+VkResult anv_CreateGraphicsPipelines(
+ VkDevice _device,
+ VkPipelineCache pipelineCache,
+ uint32_t count,
+ const VkGraphicsPipelineCreateInfo* pCreateInfos,
+ const VkAllocationCallbacks* pAllocator,
+ VkPipeline* pPipelines)
+{
+ ANV_FROM_HANDLE(anv_device, device, _device);
+ ANV_FROM_HANDLE(vk_pipeline_cache, pipeline_cache, pipelineCache);
+
+ VkResult result = VK_SUCCESS;
+
+ unsigned i;
+ for (i = 0; i < count; i++) {
+ assert(pCreateInfos[i].sType == VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO);
+
+ const VkPipelineCreateFlags2KHR flags =
+ vk_graphics_pipeline_create_flags(&pCreateInfos[i]);
+ VkResult res;
+ if (flags & VK_PIPELINE_CREATE_2_LIBRARY_BIT_KHR) {
+ res = anv_graphics_lib_pipeline_create(device, pipeline_cache,
+ &pCreateInfos[i],
+ pAllocator,
+ &pPipelines[i]);
+ } else {
+ res = anv_graphics_pipeline_create(device,
+ pipeline_cache,
+ &pCreateInfos[i],
+ pAllocator, &pPipelines[i]);
}
- }
- /* Our implementation of VK_KHR_multiview uses instancing to draw the
- * different views. If the client asks for instancing, we need to multiply
- * the instance divisor by the number of views ensure that we repeat the
- * client's per-instance data once for each view.
- */
- if (pipeline->subpass->view_mask && !pipeline->use_primitive_replication) {
- const uint32_t view_count = anv_subpass_view_count(pipeline->subpass);
- for (uint32_t vb = 0; vb < MAX_VBS; vb++) {
- if (pipeline->vb[vb].instanced)
- pipeline->vb[vb].instance_divisor *= view_count;
- }
- }
+ if (res == VK_SUCCESS)
+ continue;
- const VkPipelineInputAssemblyStateCreateInfo *ia_info =
- pCreateInfo->pInputAssemblyState;
- const VkPipelineTessellationStateCreateInfo *tess_info =
- pCreateInfo->pTessellationState;
+ /* Bail out on the first error != VK_PIPELINE_COMPILE_REQUIRED as it
+ * is not obvious what error should be reported upon 2 different failures.
+ */
+ result = res;
+ if (res != VK_PIPELINE_COMPILE_REQUIRED)
+ break;
- if (anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_EVAL))
- pipeline->topology = _3DPRIM_PATCHLIST(tess_info->patchControlPoints);
- else
- pipeline->topology = vk_to_intel_primitive_type[ia_info->topology];
+ pPipelines[i] = VK_NULL_HANDLE;
- /* If rasterization is not enabled, ms_info must be ignored. */
- const bool raster_enabled =
- !pCreateInfo->pRasterizationState->rasterizerDiscardEnable ||
- (pipeline->dynamic_states &
- ANV_CMD_DIRTY_DYNAMIC_RASTERIZER_DISCARD_ENABLE);
+ if (flags & VK_PIPELINE_CREATE_2_EARLY_RETURN_ON_FAILURE_BIT_KHR)
+ break;
+ }
- const VkPipelineMultisampleStateCreateInfo *ms_info =
- raster_enabled ? pCreateInfo->pMultisampleState : NULL;
+ for (; i < count; i++)
+ pPipelines[i] = VK_NULL_HANDLE;
- const VkPipelineRasterizationLineStateCreateInfoEXT *line_info =
- vk_find_struct_const(pCreateInfo->pRasterizationState->pNext,
- PIPELINE_RASTERIZATION_LINE_STATE_CREATE_INFO_EXT);
+ return result;
+}
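/* Illustrative application-side sketch (not part of this patch): driving the
 * vkCreateGraphicsPipelines() entry point above with
 * VK_PIPELINE_CREATE_FAIL_ON_PIPELINE_COMPILE_REQUIRED_BIT, so a cold cache
 * shows up as VK_PIPELINE_COMPILE_REQUIRED and the real compile can be
 * retried or moved off the critical path. Assumes <vulkan/vulkan.h>;
 * create_info is caller-provided.
 */
static VkResult
example_create_pipeline_cached_only(VkDevice device,
                                    VkPipelineCache cache,
                                    VkGraphicsPipelineCreateInfo create_info,
                                    VkPipeline *out_pipeline)
{
   VkPipelineCreationFeedback feedback = { 0 };
   const VkPipelineCreationFeedbackCreateInfo feedback_info = {
      .sType = VK_STRUCTURE_TYPE_PIPELINE_CREATION_FEEDBACK_CREATE_INFO,
      .pNext = create_info.pNext,
      .pPipelineCreationFeedback = &feedback,
   };
   create_info.pNext = &feedback_info;
   create_info.flags |= VK_PIPELINE_CREATE_FAIL_ON_PIPELINE_COMPILE_REQUIRED_BIT;

   VkResult result = vkCreateGraphicsPipelines(device, cache, 1, &create_info,
                                               NULL, out_pipeline);
   if (result == VK_PIPELINE_COMPILE_REQUIRED) {
      /* Nothing usable in the cache; retry without the flag (for example on
       * a worker thread) to actually compile the shaders.
       */
      create_info.flags &= ~VK_PIPELINE_CREATE_FAIL_ON_PIPELINE_COMPILE_REQUIRED_BIT;
      result = vkCreateGraphicsPipelines(device, cache, 1, &create_info,
                                         NULL, out_pipeline);
   }
   return result;
}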
- /* Store line mode, polygon mode and rasterization samples, these are used
- * for dynamic primitive topology.
- */
- pipeline->line_mode = vk_line_rasterization_mode(line_info, ms_info);
- pipeline->polygon_mode = pCreateInfo->pRasterizationState->polygonMode;
- pipeline->rasterization_samples =
- ms_info ? ms_info->rasterizationSamples : 1;
+static bool
+should_remat_cb(nir_instr *instr, void *data)
+{
+ if (instr->type != nir_instr_type_intrinsic)
+ return false;
- return VK_SUCCESS;
+ return nir_instr_as_intrinsic(instr)->intrinsic == nir_intrinsic_resource_intel;
}
static VkResult
compile_upload_rt_shader(struct anv_ray_tracing_pipeline *pipeline,
- struct anv_pipeline_cache *cache,
+ struct vk_pipeline_cache *cache,
nir_shader *nir,
struct anv_pipeline_stage *stage,
- struct anv_shader_bin **shader_out,
void *mem_ctx)
{
const struct brw_compiler *compiler =
@@ -2491,48 +3349,64 @@ compile_upload_rt_shader(struct anv_ray_tracing_pipeline *pipeline,
nir_shader **resume_shaders = NULL;
uint32_t num_resume_shaders = 0;
if (nir->info.stage != MESA_SHADER_COMPUTE) {
- NIR_PASS_V(nir, nir_lower_shader_calls,
- nir_address_format_64bit_global,
- BRW_BTD_STACK_ALIGN,
- &resume_shaders, &num_resume_shaders, mem_ctx);
- NIR_PASS_V(nir, brw_nir_lower_shader_calls);
+ const nir_lower_shader_calls_options opts = {
+ .address_format = nir_address_format_64bit_global,
+ .stack_alignment = BRW_BTD_STACK_ALIGN,
+ .localized_loads = true,
+ .vectorizer_callback = brw_nir_should_vectorize_mem,
+ .vectorizer_data = NULL,
+ .should_remat_callback = should_remat_cb,
+ };
+
+ NIR_PASS(_, nir, nir_lower_shader_calls, &opts,
+ &resume_shaders, &num_resume_shaders, mem_ctx);
+ NIR_PASS(_, nir, brw_nir_lower_shader_calls, &stage->key.bs);
NIR_PASS_V(nir, brw_nir_lower_rt_intrinsics, devinfo);
}
for (unsigned i = 0; i < num_resume_shaders; i++) {
- NIR_PASS_V(resume_shaders[i], brw_nir_lower_shader_calls);
+ NIR_PASS(_, resume_shaders[i], brw_nir_lower_shader_calls, &stage->key.bs);
NIR_PASS_V(resume_shaders[i], brw_nir_lower_rt_intrinsics, devinfo);
}
- stage->code =
- brw_compile_bs(compiler, pipeline->base.device, mem_ctx,
- &stage->key.bs, &stage->prog_data.bs, nir,
- num_resume_shaders, resume_shaders, stage->stats, NULL);
- if (stage->code == NULL)
- return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
-
- /* Ray-tracing shaders don't have a "real" bind map */
- struct anv_pipeline_bind_map empty_bind_map = {};
+ struct brw_compile_bs_params params = {
+ .base = {
+ .nir = nir,
+ .stats = stage->stats,
+ .log_data = pipeline->base.device,
+ .mem_ctx = mem_ctx,
+ },
+ .key = &stage->key.bs,
+ .prog_data = &stage->prog_data.bs,
+ .num_resume_shaders = num_resume_shaders,
+ .resume_shaders = resume_shaders,
+ };
- const unsigned code_size = stage->prog_data.base.program_size;
- struct anv_shader_bin *bin =
- anv_device_upload_kernel(pipeline->base.device,
- cache,
- stage->stage,
- &stage->cache_key, sizeof(stage->cache_key),
- stage->code, code_size,
- &stage->prog_data.base,
- sizeof(stage->prog_data.bs),
- stage->stats, 1,
- NULL, &empty_bind_map);
- if (bin == NULL)
- return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+ stage->code = brw_compile_bs(compiler, &params);
+ if (stage->code == NULL)
+ return vk_error(pipeline, VK_ERROR_OUT_OF_HOST_MEMORY);
+
+ struct anv_shader_upload_params upload_params = {
+ .stage = stage->stage,
+ .key_data = &stage->cache_key,
+ .key_size = sizeof(stage->cache_key),
+ .kernel_data = stage->code,
+ .kernel_size = stage->prog_data.base.program_size,
+ .prog_data = &stage->prog_data.base,
+ .prog_data_size = brw_prog_data_size(stage->stage),
+ .stats = stage->stats,
+ .num_stats = 1,
+ .bind_map = &stage->bind_map,
+ .push_desc_info = &stage->push_desc_info,
+ .dynamic_push_values = stage->dynamic_push_values,
+ };
- /* TODO: Figure out executables for resume shaders */
- anv_pipeline_add_executables(&pipeline->base, stage, bin);
- util_dynarray_append(&pipeline->shaders, struct anv_shader_bin *, bin);
+ stage->bin =
+ anv_device_upload_kernel(pipeline->base.device, cache, &upload_params);
+ if (stage->bin == NULL)
+ return vk_error(pipeline, VK_ERROR_OUT_OF_HOST_MEMORY);
- *shader_out = bin;
+ anv_pipeline_add_executables(&pipeline->base, stage);
return VK_SUCCESS;
}
@@ -2595,51 +3469,72 @@ anv_pipeline_compute_ray_tracing_stacks(struct anv_ray_tracing_pipeline *pipelin
}
}
+static enum brw_rt_ray_flags
+anv_pipeline_get_pipeline_ray_flags(VkPipelineCreateFlags2KHR flags)
+{
+ uint32_t ray_flags = 0;
+
+ const bool rt_skip_triangles =
+ flags & VK_PIPELINE_CREATE_2_RAY_TRACING_SKIP_TRIANGLES_BIT_KHR;
+ const bool rt_skip_aabbs =
+ flags & VK_PIPELINE_CREATE_2_RAY_TRACING_SKIP_AABBS_BIT_KHR;
+ assert(!(rt_skip_triangles && rt_skip_aabbs));
+
+ if (rt_skip_triangles)
+ ray_flags |= BRW_RT_RAY_FLAG_SKIP_TRIANGLES;
+ else if (rt_skip_aabbs)
+ ray_flags |= BRW_RT_RAY_FLAG_SKIP_AABBS;
+
+ return ray_flags;
+}
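/* Illustrative application-side sketch (not part of this patch): requesting
 * the skip-triangles behaviour that anv_pipeline_get_pipeline_ray_flags()
 * translates into BRW_RT_RAY_FLAG_SKIP_TRIANGLES, using the
 * VK_KHR_maintenance5 flags2 mechanism. Assumes <vulkan/vulkan.h>; rt_info
 * and flags2 are caller-provided storage.
 */
static void
example_request_skip_triangles(VkRayTracingPipelineCreateInfoKHR *rt_info,
                               VkPipelineCreateFlags2CreateInfoKHR *flags2)
{
   *flags2 = (VkPipelineCreateFlags2CreateInfoKHR) {
      .sType = VK_STRUCTURE_TYPE_PIPELINE_CREATE_FLAGS_2_CREATE_INFO_KHR,
      .pNext = rt_info->pNext,
      /* Only one of SKIP_TRIANGLES / SKIP_AABBS may be set, matching the
       * assert in anv_pipeline_get_pipeline_ray_flags().
       */
      .flags = VK_PIPELINE_CREATE_2_RAY_TRACING_SKIP_TRIANGLES_BIT_KHR,
   };
   rt_info->pNext = flags2;
   rt_info->flags = 0; /* superseded by the flags2 struct when present */
}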
+
static struct anv_pipeline_stage *
anv_pipeline_init_ray_tracing_stages(struct anv_ray_tracing_pipeline *pipeline,
const VkRayTracingPipelineCreateInfoKHR *info,
- void *pipeline_ctx)
+ void *tmp_pipeline_ctx)
{
- ANV_FROM_HANDLE(anv_pipeline_layout, layout, info->layout);
-
+ struct anv_device *device = pipeline->base.device;
/* Create enough stage entries for all shader modules plus potential
* combinations in the groups.
*/
struct anv_pipeline_stage *stages =
- rzalloc_array(pipeline_ctx, struct anv_pipeline_stage, info->stageCount);
+ rzalloc_array(tmp_pipeline_ctx, struct anv_pipeline_stage, info->stageCount);
+
+ enum brw_rt_ray_flags ray_flags =
+ anv_pipeline_get_pipeline_ray_flags(pipeline->base.flags);
for (uint32_t i = 0; i < info->stageCount; i++) {
const VkPipelineShaderStageCreateInfo *sinfo = &info->pStages[i];
- if (sinfo->module == VK_NULL_HANDLE)
+ if (vk_pipeline_shader_stage_is_null(sinfo))
continue;
int64_t stage_start = os_time_get_nano();
stages[i] = (struct anv_pipeline_stage) {
.stage = vk_to_mesa_shader_stage(sinfo->stage),
- .module = vk_shader_module_from_handle(sinfo->module),
- .entrypoint = sinfo->pName,
- .spec_info = sinfo->pSpecializationInfo,
+ .pipeline_pNext = info->pNext,
+ .info = sinfo,
.cache_key = {
.stage = vk_to_mesa_shader_stage(sinfo->stage),
},
.feedback = {
- .flags = VK_PIPELINE_CREATION_FEEDBACK_VALID_BIT_EXT,
+ .flags = VK_PIPELINE_CREATION_FEEDBACK_VALID_BIT,
},
};
- populate_bs_prog_key(&pipeline->base.device->info, sinfo->flags,
- pipeline->base.device->robust_buffer_access,
- &stages[i].key.bs);
+ anv_stage_allocate_bind_map_tables(&pipeline->base, &stages[i],
+ tmp_pipeline_ctx);
+
+ pipeline->base.active_stages |= sinfo->stage;
- anv_pipeline_hash_shader(stages[i].module,
- stages[i].entrypoint,
- stages[i].stage,
- stages[i].spec_info,
- stages[i].shader_sha1);
+ anv_stage_write_shader_hash(&stages[i], device);
+
+ populate_bs_prog_key(&stages[i],
+ pipeline->base.device,
+ ray_flags);
if (stages[i].stage != MESA_SHADER_INTERSECTION) {
- anv_pipeline_hash_ray_tracing_shader(pipeline, layout, &stages[i],
+ anv_pipeline_hash_ray_tracing_shader(pipeline, &stages[i],
stages[i].cache_key.sha1);
}
@@ -2661,12 +3556,11 @@ anv_pipeline_init_ray_tracing_stages(struct anv_ray_tracing_pipeline *pipeline,
if (any_hit_idx != VK_SHADER_UNUSED_KHR) {
assert(any_hit_idx < info->stageCount);
anv_pipeline_hash_ray_tracing_combined_shader(pipeline,
- layout,
&stages[intersection_idx],
&stages[any_hit_idx],
stages[intersection_idx].cache_key.sha1);
} else {
- anv_pipeline_hash_ray_tracing_shader(pipeline, layout,
+ anv_pipeline_hash_ray_tracing_shader(pipeline,
&stages[intersection_idx],
stages[intersection_idx].cache_key.sha1);
}
@@ -2678,15 +3572,14 @@ anv_pipeline_init_ray_tracing_stages(struct anv_ray_tracing_pipeline *pipeline,
}
static bool
-anv_pipeline_load_cached_shaders(struct anv_ray_tracing_pipeline *pipeline,
- struct anv_pipeline_cache *cache,
- const VkRayTracingPipelineCreateInfoKHR *info,
- struct anv_pipeline_stage *stages,
- uint32_t *stack_max)
+anv_ray_tracing_pipeline_load_cached_shaders(struct anv_ray_tracing_pipeline *pipeline,
+ struct vk_pipeline_cache *cache,
+ const VkRayTracingPipelineCreateInfoKHR *info,
+ struct anv_pipeline_stage *stages)
{
uint32_t shaders = 0, cache_hits = 0;
for (uint32_t i = 0; i < info->stageCount; i++) {
- if (stages[i].entrypoint == NULL)
+ if (stages[i].info == NULL)
continue;
shaders++;
@@ -2701,18 +3594,11 @@ anv_pipeline_load_cached_shaders(struct anv_ray_tracing_pipeline *pipeline,
if (cache_hit) {
cache_hits++;
stages[i].feedback.flags |=
- VK_PIPELINE_CREATION_FEEDBACK_APPLICATION_PIPELINE_CACHE_HIT_BIT_EXT;
+ VK_PIPELINE_CREATION_FEEDBACK_APPLICATION_PIPELINE_CACHE_HIT_BIT;
}
- if (stages[i].bin != NULL) {
- anv_pipeline_add_executables(&pipeline->base, &stages[i], stages[i].bin);
- util_dynarray_append(&pipeline->shaders, struct anv_shader_bin *, stages[i].bin);
-
- uint32_t stack_size =
- brw_bs_prog_data_const(stages[i].bin->prog_data)->max_stack_size;
- stack_max[stages[i].stage] =
- MAX2(stack_max[stages[i].stage], stack_size);
- }
+ if (stages[i].bin != NULL)
+ anv_pipeline_add_executables(&pipeline->base, &stages[i]);
stages[i].feedback.duration += os_time_get_nano() - stage_start;
}
@@ -2722,61 +3608,54 @@ anv_pipeline_load_cached_shaders(struct anv_ray_tracing_pipeline *pipeline,
static VkResult
anv_pipeline_compile_ray_tracing(struct anv_ray_tracing_pipeline *pipeline,
- struct anv_pipeline_cache *cache,
+ void *tmp_pipeline_ctx,
+ struct anv_pipeline_stage *stages,
+ struct vk_pipeline_cache *cache,
const VkRayTracingPipelineCreateInfoKHR *info)
{
- const struct intel_device_info *devinfo = &pipeline->base.device->info;
+ const struct intel_device_info *devinfo = pipeline->base.device->info;
VkResult result;
- VkPipelineCreationFeedbackEXT pipeline_feedback = {
- .flags = VK_PIPELINE_CREATION_FEEDBACK_VALID_BIT_EXT,
+ VkPipelineCreationFeedback pipeline_feedback = {
+ .flags = VK_PIPELINE_CREATION_FEEDBACK_VALID_BIT,
};
int64_t pipeline_start = os_time_get_nano();
- void *pipeline_ctx = ralloc_context(NULL);
-
- struct anv_pipeline_stage *stages =
- anv_pipeline_init_ray_tracing_stages(pipeline, info, pipeline_ctx);
-
- ANV_FROM_HANDLE(anv_pipeline_layout, layout, info->layout);
-
const bool skip_cache_lookup =
(pipeline->base.flags & VK_PIPELINE_CREATE_CAPTURE_INTERNAL_REPRESENTATIONS_BIT_KHR);
- uint32_t stack_max[MESA_VULKAN_SHADER_STAGES] = {};
-
if (!skip_cache_lookup &&
- anv_pipeline_load_cached_shaders(pipeline, cache, info, stages, stack_max)) {
+ anv_ray_tracing_pipeline_load_cached_shaders(pipeline, cache, info, stages)) {
pipeline_feedback.flags |=
- VK_PIPELINE_CREATION_FEEDBACK_APPLICATION_PIPELINE_CACHE_HIT_BIT_EXT;
+ VK_PIPELINE_CREATION_FEEDBACK_APPLICATION_PIPELINE_CACHE_HIT_BIT;
goto done;
}
- if (info->flags & VK_PIPELINE_CREATE_FAIL_ON_PIPELINE_COMPILE_REQUIRED_BIT_EXT) {
- ralloc_free(pipeline_ctx);
- return VK_PIPELINE_COMPILE_REQUIRED_EXT;
- }
+ if (pipeline->base.flags & VK_PIPELINE_CREATE_2_FAIL_ON_PIPELINE_COMPILE_REQUIRED_BIT_KHR)
+ return VK_PIPELINE_COMPILE_REQUIRED;
for (uint32_t i = 0; i < info->stageCount; i++) {
- if (stages[i].entrypoint == NULL)
+ if (stages[i].info == NULL)
continue;
int64_t stage_start = os_time_get_nano();
stages[i].nir = anv_pipeline_stage_get_nir(&pipeline->base, cache,
- pipeline_ctx, &stages[i]);
- if (stages[i].nir == NULL) {
- ralloc_free(pipeline_ctx);
- return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
- }
+ tmp_pipeline_ctx, &stages[i]);
+ if (stages[i].nir == NULL)
+ return vk_error(pipeline, VK_ERROR_OUT_OF_HOST_MEMORY);
- anv_pipeline_lower_nir(&pipeline->base, pipeline_ctx, &stages[i], layout);
+ anv_pipeline_nir_preprocess(&pipeline->base, &stages[i]);
+
+ anv_pipeline_lower_nir(&pipeline->base, tmp_pipeline_ctx, &stages[i],
+ &pipeline->base.layout, 0 /* view_mask */,
+ false /* use_primitive_replication */);
stages[i].feedback.duration += os_time_get_nano() - stage_start;
}
for (uint32_t i = 0; i < info->stageCount; i++) {
- if (stages[i].entrypoint == NULL)
+ if (stages[i].info == NULL)
continue;
/* Shader found in cache already. */
@@ -2789,9 +3668,9 @@ anv_pipeline_compile_ray_tracing(struct anv_ray_tracing_pipeline *pipeline,
int64_t stage_start = os_time_get_nano();
- void *stage_ctx = ralloc_context(pipeline_ctx);
+ void *tmp_stage_ctx = ralloc_context(tmp_pipeline_ctx);
- nir_shader *nir = nir_shader_clone(stage_ctx, stages[i].nir);
+ nir_shader *nir = nir_shader_clone(tmp_stage_ctx, stages[i].nir);
switch (stages[i].stage) {
case MESA_SHADER_RAYGEN:
brw_nir_lower_raygen(nir);
@@ -2821,21 +3700,18 @@ anv_pipeline_compile_ray_tracing(struct anv_ray_tracing_pipeline *pipeline,
}
result = compile_upload_rt_shader(pipeline, cache, nir, &stages[i],
- &stages[i].bin, stage_ctx);
+ tmp_stage_ctx);
if (result != VK_SUCCESS) {
- ralloc_free(pipeline_ctx);
+ ralloc_free(tmp_stage_ctx);
return result;
}
- uint32_t stack_size =
- brw_bs_prog_data_const(stages[i].bin->prog_data)->max_stack_size;
- stack_max[stages[i].stage] = MAX2(stack_max[stages[i].stage], stack_size);
-
- ralloc_free(stage_ctx);
+ ralloc_free(tmp_stage_ctx);
stages[i].feedback.duration += os_time_get_nano() - stage_start;
}
+ done:
for (uint32_t i = 0; i < info->groupCount; i++) {
const VkRayTracingShaderGroupCreateInfoKHR *ginfo = &info->pGroups[i];
struct anv_rt_shader_group *group = &pipeline->groups[i];
@@ -2869,9 +3745,9 @@ anv_pipeline_compile_ray_tracing(struct anv_ray_tracing_pipeline *pipeline,
if (any_hit_idx < info->stageCount)
any_hit = stages[any_hit_idx].nir;
- void *group_ctx = ralloc_context(pipeline_ctx);
+ void *tmp_group_ctx = ralloc_context(tmp_pipeline_ctx);
nir_shader *intersection =
- nir_shader_clone(group_ctx, stages[intersection_idx].nir);
+ nir_shader_clone(tmp_group_ctx, stages[intersection_idx].nir);
brw_nir_lower_combined_intersection_any_hit(intersection, any_hit,
devinfo);
@@ -2879,20 +3755,13 @@ anv_pipeline_compile_ray_tracing(struct anv_ray_tracing_pipeline *pipeline,
result = compile_upload_rt_shader(pipeline, cache,
intersection,
&stages[intersection_idx],
- &group->intersection,
- group_ctx);
- ralloc_free(group_ctx);
+ tmp_group_ctx);
+ ralloc_free(tmp_group_ctx);
if (result != VK_SUCCESS)
return result;
- } else {
- group->intersection = stages[intersection_idx].bin;
}
- uint32_t stack_size =
- brw_bs_prog_data_const(group->intersection->prog_data)->max_stack_size;
- stack_max[MESA_SHADER_INTERSECTION] =
- MAX2(stack_max[MESA_SHADER_INTERSECTION], stack_size);
-
+ group->intersection = stages[intersection_idx].bin;
break;
}
@@ -2901,20 +3770,16 @@ anv_pipeline_compile_ray_tracing(struct anv_ray_tracing_pipeline *pipeline,
}
}
- done:
- ralloc_free(pipeline_ctx);
-
- anv_pipeline_compute_ray_tracing_stacks(pipeline, info, stack_max);
-
pipeline_feedback.duration = os_time_get_nano() - pipeline_start;
- const VkPipelineCreationFeedbackCreateInfoEXT *create_feedback =
- vk_find_struct_const(info->pNext, PIPELINE_CREATION_FEEDBACK_CREATE_INFO_EXT);
+ const VkPipelineCreationFeedbackCreateInfo *create_feedback =
+ vk_find_struct_const(info->pNext, PIPELINE_CREATION_FEEDBACK_CREATE_INFO);
if (create_feedback) {
*create_feedback->pPipelineCreationFeedback = pipeline_feedback;
- assert(info->stageCount == create_feedback->pipelineStageCreationFeedbackCount);
- for (uint32_t i = 0; i < info->stageCount; i++) {
+ uint32_t stage_count = create_feedback->pipelineStageCreationFeedbackCount;
+ assert(stage_count == 0 || info->stageCount == stage_count);
+ for (uint32_t i = 0; i < stage_count; i++) {
gl_shader_stage s = vk_to_mesa_shader_stage(info->pStages[i].stage);
create_feedback->pPipelineStageCreationFeedbacks[i] = stages[s].feedback;
}
@@ -2926,23 +3791,23 @@ anv_pipeline_compile_ray_tracing(struct anv_ray_tracing_pipeline *pipeline,
VkResult
anv_device_init_rt_shaders(struct anv_device *device)
{
+ device->bvh_build_method = ANV_BVH_BUILD_METHOD_NEW_SAH;
+
if (!device->vk.enabled_extensions.KHR_ray_tracing_pipeline)
return VK_SUCCESS;
bool cache_hit;
+ struct anv_push_descriptor_info empty_push_desc_info = {};
+ struct anv_pipeline_bind_map empty_bind_map = {};
struct brw_rt_trampoline {
char name[16];
struct brw_cs_prog_key key;
} trampoline_key = {
.name = "rt-trampoline",
- .key = {
- /* TODO: Other subgroup sizes? */
- .base.subgroup_size_type = BRW_SUBGROUP_SIZE_REQUIRE_8,
- },
};
device->rt_trampoline =
- anv_device_search_for_kernel(device, &device->default_pipeline_cache,
+ anv_device_search_for_kernel(device, device->internal_cache,
&trampoline_key, sizeof(trampoline_key),
&cache_hit);
if (device->rt_trampoline == NULL) {
@@ -2951,10 +3816,8 @@ anv_device_init_rt_shaders(struct anv_device *device)
nir_shader *trampoline_nir =
brw_nir_create_raygen_trampoline(device->physical->compiler, tmp_ctx);
- struct anv_pipeline_bind_map bind_map = {
- .surface_count = 0,
- .sampler_count = 0,
- };
+ trampoline_nir->info.subgroup_size = SUBGROUP_SIZE_REQUIRE_16;
+
uint32_t dummy_params[4] = { 0, };
struct brw_cs_prog_data trampoline_prog_data = {
.base.nr_params = 4,
@@ -2963,30 +3826,44 @@ anv_device_init_rt_shaders(struct anv_device *device)
.uses_btd_stack_ids = true,
};
struct brw_compile_cs_params params = {
- .nir = trampoline_nir,
+ .base = {
+ .nir = trampoline_nir,
+ .log_data = device,
+ .mem_ctx = tmp_ctx,
+ },
.key = &trampoline_key.key,
.prog_data = &trampoline_prog_data,
- .log_data = device,
};
const unsigned *tramp_data =
- brw_compile_cs(device->physical->compiler, tmp_ctx, &params);
+ brw_compile_cs(device->physical->compiler, &params);
+
+ struct anv_shader_upload_params upload_params = {
+ .stage = MESA_SHADER_COMPUTE,
+ .key_data = &trampoline_key,
+ .key_size = sizeof(trampoline_key),
+ .kernel_data = tramp_data,
+ .kernel_size = trampoline_prog_data.base.program_size,
+ .prog_data = &trampoline_prog_data.base,
+ .prog_data_size = sizeof(trampoline_prog_data),
+ .bind_map = &empty_bind_map,
+ .push_desc_info = &empty_push_desc_info,
+ };
device->rt_trampoline =
- anv_device_upload_kernel(device, &device->default_pipeline_cache,
- MESA_SHADER_COMPUTE,
- &trampoline_key, sizeof(trampoline_key),
- tramp_data,
- trampoline_prog_data.base.program_size,
- &trampoline_prog_data.base,
- sizeof(trampoline_prog_data),
- NULL, 0, NULL, &bind_map);
+ anv_device_upload_kernel(device, device->internal_cache,
+ &upload_params);
ralloc_free(tmp_ctx);
if (device->rt_trampoline == NULL)
- return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+ return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
}
+ /* The cache already has a reference and it's not going anywhere so there
+ * is no need to hold a second reference.
+ */
+ anv_shader_bin_unref(device, device->rt_trampoline);
+
struct brw_rt_trivial_return {
char name[16];
struct brw_bs_prog_key key;
@@ -2994,7 +3871,7 @@ anv_device_init_rt_shaders(struct anv_device *device)
.name = "rt-trivial-ret",
};
device->rt_trivial_return =
- anv_device_search_for_kernel(device, &device->default_pipeline_cache,
+ anv_device_search_for_kernel(device, device->internal_cache,
&return_key, sizeof(return_key),
&cache_hit);
if (device->rt_trivial_return == NULL) {
@@ -3002,34 +3879,48 @@ anv_device_init_rt_shaders(struct anv_device *device)
nir_shader *trivial_return_nir =
brw_nir_create_trivial_return_shader(device->physical->compiler, tmp_ctx);
- NIR_PASS_V(trivial_return_nir, brw_nir_lower_rt_intrinsics, &device->info);
+ NIR_PASS_V(trivial_return_nir, brw_nir_lower_rt_intrinsics, device->info);
- struct anv_pipeline_bind_map bind_map = {
- .surface_count = 0,
- .sampler_count = 0,
- };
struct brw_bs_prog_data return_prog_data = { 0, };
+ struct brw_compile_bs_params params = {
+ .base = {
+ .nir = trivial_return_nir,
+ .log_data = device,
+ .mem_ctx = tmp_ctx,
+ },
+ .key = &return_key.key,
+ .prog_data = &return_prog_data,
+ };
const unsigned *return_data =
- brw_compile_bs(device->physical->compiler, device, tmp_ctx,
- &return_key.key, &return_prog_data, trivial_return_nir,
- 0, 0, NULL, NULL);
+ brw_compile_bs(device->physical->compiler, &params);
+
+ struct anv_shader_upload_params upload_params = {
+ .stage = MESA_SHADER_CALLABLE,
+ .key_data = &return_key,
+ .key_size = sizeof(return_key),
+ .kernel_data = return_data,
+ .kernel_size = return_prog_data.base.program_size,
+ .prog_data = &return_prog_data.base,
+ .prog_data_size = sizeof(return_prog_data),
+ .bind_map = &empty_bind_map,
+ .push_desc_info = &empty_push_desc_info,
+ };
device->rt_trivial_return =
- anv_device_upload_kernel(device, &device->default_pipeline_cache,
- MESA_SHADER_CALLABLE,
- &return_key, sizeof(return_key),
- return_data, return_prog_data.base.program_size,
- &return_prog_data.base, sizeof(return_prog_data),
- NULL, 0, NULL, &bind_map);
+ anv_device_upload_kernel(device, device->internal_cache,
+ &upload_params);
ralloc_free(tmp_ctx);
- if (device->rt_trivial_return == NULL) {
- anv_shader_bin_unref(device, device->rt_trampoline);
- return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
- }
+ if (device->rt_trivial_return == NULL)
+ return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
}
+ /* The cache already has a reference and it's not going anywhere so there
+ * is no need to hold a second reference.
+ */
+ anv_shader_bin_unref(device, device->rt_trivial_return);
+
return VK_SUCCESS;
}
@@ -3038,34 +3929,247 @@ anv_device_finish_rt_shaders(struct anv_device *device)
{
if (!device->vk.enabled_extensions.KHR_ray_tracing_pipeline)
return;
-
- anv_shader_bin_unref(device, device->rt_trampoline);
}
-VkResult
+static void
anv_ray_tracing_pipeline_init(struct anv_ray_tracing_pipeline *pipeline,
struct anv_device *device,
- struct anv_pipeline_cache *cache,
+ struct vk_pipeline_cache *cache,
const VkRayTracingPipelineCreateInfoKHR *pCreateInfo,
const VkAllocationCallbacks *alloc)
{
- VkResult result;
-
util_dynarray_init(&pipeline->shaders, pipeline->base.mem_ctx);
- result = anv_pipeline_compile_ray_tracing(pipeline, cache, pCreateInfo);
- if (result != VK_SUCCESS)
- goto fail;
+ ANV_FROM_HANDLE(anv_pipeline_layout, pipeline_layout, pCreateInfo->layout);
+ anv_pipeline_init_layout(&pipeline->base, pipeline_layout);
anv_pipeline_setup_l3_config(&pipeline->base, /* needs_slm */ false);
+}
- return VK_SUCCESS;
+static void
+assert_rt_stage_index_valid(const VkRayTracingPipelineCreateInfoKHR* pCreateInfo,
+ uint32_t stage_idx,
+ VkShaderStageFlags valid_stages)
+{
+ if (stage_idx == VK_SHADER_UNUSED_KHR)
+ return;
-fail:
- util_dynarray_foreach(&pipeline->shaders,
- struct anv_shader_bin *, shader) {
- anv_shader_bin_unref(device, *shader);
+ assert(stage_idx < pCreateInfo->stageCount);
+ assert(util_bitcount(pCreateInfo->pStages[stage_idx].stage) == 1);
+ assert(pCreateInfo->pStages[stage_idx].stage & valid_stages);
+}
+
+static VkResult
+anv_ray_tracing_pipeline_create(
+ VkDevice _device,
+ struct vk_pipeline_cache * cache,
+ const VkRayTracingPipelineCreateInfoKHR* pCreateInfo,
+ const VkAllocationCallbacks* pAllocator,
+ VkPipeline* pPipeline)
+{
+ ANV_FROM_HANDLE(anv_device, device, _device);
+ VkResult result;
+
+ assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_RAY_TRACING_PIPELINE_CREATE_INFO_KHR);
+
+ uint32_t group_count = pCreateInfo->groupCount;
+ if (pCreateInfo->pLibraryInfo) {
+ for (uint32_t l = 0; l < pCreateInfo->pLibraryInfo->libraryCount; l++) {
+ ANV_FROM_HANDLE(anv_pipeline, library,
+ pCreateInfo->pLibraryInfo->pLibraries[l]);
+ struct anv_ray_tracing_pipeline *rt_library =
+ anv_pipeline_to_ray_tracing(library);
+ group_count += rt_library->group_count;
+ }
+ }
+
+ VK_MULTIALLOC(ma);
+ VK_MULTIALLOC_DECL(&ma, struct anv_ray_tracing_pipeline, pipeline, 1);
+ VK_MULTIALLOC_DECL(&ma, struct anv_rt_shader_group, groups, group_count);
+ if (!vk_multialloc_zalloc2(&ma, &device->vk.alloc, pAllocator,
+ VK_SYSTEM_ALLOCATION_SCOPE_DEVICE))
+ return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
+
+ result = anv_pipeline_init(&pipeline->base, device,
+ ANV_PIPELINE_RAY_TRACING,
+ vk_rt_pipeline_create_flags(pCreateInfo),
+ pAllocator);
+ if (result != VK_SUCCESS) {
+ vk_free2(&device->vk.alloc, pAllocator, pipeline);
+ return result;
}
+
+ pipeline->group_count = group_count;
+ pipeline->groups = groups;
+
+ ASSERTED const VkShaderStageFlags ray_tracing_stages =
+ VK_SHADER_STAGE_RAYGEN_BIT_KHR |
+ VK_SHADER_STAGE_ANY_HIT_BIT_KHR |
+ VK_SHADER_STAGE_CLOSEST_HIT_BIT_KHR |
+ VK_SHADER_STAGE_MISS_BIT_KHR |
+ VK_SHADER_STAGE_INTERSECTION_BIT_KHR |
+ VK_SHADER_STAGE_CALLABLE_BIT_KHR;
+
+ for (uint32_t i = 0; i < pCreateInfo->stageCount; i++)
+ assert((pCreateInfo->pStages[i].stage & ~ray_tracing_stages) == 0);
+
+ for (uint32_t i = 0; i < pCreateInfo->groupCount; i++) {
+ const VkRayTracingShaderGroupCreateInfoKHR *ginfo =
+ &pCreateInfo->pGroups[i];
+ assert_rt_stage_index_valid(pCreateInfo, ginfo->generalShader,
+ VK_SHADER_STAGE_RAYGEN_BIT_KHR |
+ VK_SHADER_STAGE_MISS_BIT_KHR |
+ VK_SHADER_STAGE_CALLABLE_BIT_KHR);
+ assert_rt_stage_index_valid(pCreateInfo, ginfo->closestHitShader,
+ VK_SHADER_STAGE_CLOSEST_HIT_BIT_KHR);
+ assert_rt_stage_index_valid(pCreateInfo, ginfo->anyHitShader,
+ VK_SHADER_STAGE_ANY_HIT_BIT_KHR);
+ assert_rt_stage_index_valid(pCreateInfo, ginfo->intersectionShader,
+ VK_SHADER_STAGE_INTERSECTION_BIT_KHR);
+ switch (ginfo->type) {
+ case VK_RAY_TRACING_SHADER_GROUP_TYPE_GENERAL_KHR:
+ assert(ginfo->generalShader < pCreateInfo->stageCount);
+ assert(ginfo->anyHitShader == VK_SHADER_UNUSED_KHR);
+ assert(ginfo->closestHitShader == VK_SHADER_UNUSED_KHR);
+ assert(ginfo->intersectionShader == VK_SHADER_UNUSED_KHR);
+ break;
+
+ case VK_RAY_TRACING_SHADER_GROUP_TYPE_TRIANGLES_HIT_GROUP_KHR:
+ assert(ginfo->generalShader == VK_SHADER_UNUSED_KHR);
+ assert(ginfo->intersectionShader == VK_SHADER_UNUSED_KHR);
+ break;
+
+ case VK_RAY_TRACING_SHADER_GROUP_TYPE_PROCEDURAL_HIT_GROUP_KHR:
+ assert(ginfo->generalShader == VK_SHADER_UNUSED_KHR);
+ break;
+
+ default:
+ unreachable("Invalid ray-tracing shader group type");
+ }
+ }
+
+ anv_ray_tracing_pipeline_init(pipeline, device, cache,
+ pCreateInfo, pAllocator);
+
+ void *tmp_ctx = ralloc_context(NULL);
+
+ struct anv_pipeline_stage *stages =
+ anv_pipeline_init_ray_tracing_stages(pipeline, pCreateInfo, tmp_ctx);
+
+ result = anv_pipeline_compile_ray_tracing(pipeline, tmp_ctx, stages,
+ cache, pCreateInfo);
+ if (result != VK_SUCCESS) {
+ ralloc_free(tmp_ctx);
+ util_dynarray_foreach(&pipeline->shaders, struct anv_shader_bin *, shader)
+ anv_shader_bin_unref(device, *shader);
+ anv_pipeline_finish(&pipeline->base, device);
+ vk_free2(&device->vk.alloc, pAllocator, pipeline);
+ return result;
+ }
+
+ /* Compute the size of the scratch BO (for register spilling) by taking the
+ * max of all the shaders in the pipeline. Also add the shaders to the list
+ * of executables.
+ */
+ uint32_t stack_max[MESA_VULKAN_SHADER_STAGES] = {};
+ for (uint32_t s = 0; s < pCreateInfo->stageCount; s++) {
+ util_dynarray_append(&pipeline->shaders,
+ struct anv_shader_bin *,
+ stages[s].bin);
+
+ uint32_t stack_size =
+ brw_bs_prog_data_const(stages[s].bin->prog_data)->max_stack_size;
+ stack_max[stages[s].stage] = MAX2(stack_max[stages[s].stage], stack_size);
+
+ anv_pipeline_account_shader(&pipeline->base, stages[s].bin);
+ }
+
+ anv_pipeline_compute_ray_tracing_stacks(pipeline, pCreateInfo, stack_max);
+
+ if (pCreateInfo->pLibraryInfo) {
+ uint32_t g = pCreateInfo->groupCount;
+ for (uint32_t l = 0; l < pCreateInfo->pLibraryInfo->libraryCount; l++) {
+ ANV_FROM_HANDLE(anv_pipeline, library,
+ pCreateInfo->pLibraryInfo->pLibraries[l]);
+ struct anv_ray_tracing_pipeline *rt_library =
+ anv_pipeline_to_ray_tracing(library);
+ for (uint32_t lg = 0; lg < rt_library->group_count; lg++) {
+ pipeline->groups[g] = rt_library->groups[lg];
+ pipeline->groups[g].imported = true;
+ g++;
+ }
+
+ /* Account for shaders in the library. */
+ util_dynarray_foreach(&rt_library->shaders,
+ struct anv_shader_bin *, shader) {
+ util_dynarray_append(&pipeline->shaders,
+ struct anv_shader_bin *,
+ anv_shader_bin_ref(*shader));
+ anv_pipeline_account_shader(&pipeline->base, *shader);
+ }
+
+ /* Add the library shaders to this pipeline's executables. */
+ util_dynarray_foreach(&rt_library->base.executables,
+ struct anv_pipeline_executable, exe) {
+ util_dynarray_append(&pipeline->base.executables,
+ struct anv_pipeline_executable, *exe);
+ }
+
+ pipeline->base.active_stages |= rt_library->base.active_stages;
+ }
+ }
+
+ anv_genX(device->info, ray_tracing_pipeline_emit)(pipeline);
+
+ ralloc_free(tmp_ctx);
+
+ ANV_RMV(rt_pipeline_create, device, pipeline, false);
+
+ *pPipeline = anv_pipeline_to_handle(&pipeline->base);
+
+ return pipeline->base.batch.status;
+}
+
+VkResult
+anv_CreateRayTracingPipelinesKHR(
+ VkDevice _device,
+ VkDeferredOperationKHR deferredOperation,
+ VkPipelineCache pipelineCache,
+ uint32_t createInfoCount,
+ const VkRayTracingPipelineCreateInfoKHR* pCreateInfos,
+ const VkAllocationCallbacks* pAllocator,
+ VkPipeline* pPipelines)
+{
+ ANV_FROM_HANDLE(vk_pipeline_cache, pipeline_cache, pipelineCache);
+
+ VkResult result = VK_SUCCESS;
+
+ unsigned i;
+ for (i = 0; i < createInfoCount; i++) {
+ const VkPipelineCreateFlags2KHR flags =
+ vk_rt_pipeline_create_flags(&pCreateInfos[i]);
+ VkResult res = anv_ray_tracing_pipeline_create(_device, pipeline_cache,
+ &pCreateInfos[i],
+ pAllocator, &pPipelines[i]);
+
+ if (res == VK_SUCCESS)
+ continue;
+
+ /* Bail out on the first error as it is not obvious what error should be
+ * reported upon 2 different failures. */
+ result = res;
+ if (result != VK_PIPELINE_COMPILE_REQUIRED)
+ break;
+
+ pPipelines[i] = VK_NULL_HANDLE;
+
+ if (flags & VK_PIPELINE_CREATE_2_EARLY_RETURN_ON_FAILURE_BIT_KHR)
+ break;
+ }
+
+ for (; i < createInfoCount; i++)
+ pPipelines[i] = VK_NULL_HANDLE;
+
return result;
}
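/* Illustrative application-side sketch (not part of this patch): the minimal
 * vkCreateRayTracingPipelinesKHR() call serviced by the entry point above,
 * with a single raygen stage and one general shader group. Assumes
 * <vulkan/vulkan.h>; the module, layout and cache handles are placeholders.
 */
static VkResult
example_create_raygen_only_pipeline(VkDevice device,
                                    VkPipelineCache cache,
                                    VkShaderModule raygen_module,
                                    VkPipelineLayout layout,
                                    VkPipeline *out_pipeline)
{
   const VkPipelineShaderStageCreateInfo stage = {
      .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
      .stage = VK_SHADER_STAGE_RAYGEN_BIT_KHR,
      .module = raygen_module,
      .pName = "main",
   };
   const VkRayTracingShaderGroupCreateInfoKHR group = {
      .sType = VK_STRUCTURE_TYPE_RAY_TRACING_SHADER_GROUP_CREATE_INFO_KHR,
      .type = VK_RAY_TRACING_SHADER_GROUP_TYPE_GENERAL_KHR,
      .generalShader = 0,
      .closestHitShader = VK_SHADER_UNUSED_KHR,
      .anyHitShader = VK_SHADER_UNUSED_KHR,
      .intersectionShader = VK_SHADER_UNUSED_KHR,
   };
   const VkRayTracingPipelineCreateInfoKHR info = {
      .sType = VK_STRUCTURE_TYPE_RAY_TRACING_PIPELINE_CREATE_INFO_KHR,
      .stageCount = 1,
      .pStages = &stage,
      .groupCount = 1,
      .pGroups = &group,
      .maxPipelineRayRecursionDepth = 1,
      .layout = layout,
   };
   return vkCreateRayTracingPipelinesKHR(device,
                                         VK_NULL_HANDLE /* deferred op */,
                                         cache, 1, &info, NULL, out_pipeline);
}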
@@ -3082,19 +4186,26 @@ VkResult anv_GetPipelineExecutablePropertiesKHR(
VkPipelineExecutablePropertiesKHR* pProperties)
{
ANV_FROM_HANDLE(anv_pipeline, pipeline, pPipelineInfo->pipeline);
- VK_OUTARRAY_MAKE(out, pProperties, pExecutableCount);
+ VK_OUTARRAY_MAKE_TYPED(VkPipelineExecutablePropertiesKHR, out,
+ pProperties, pExecutableCount);
util_dynarray_foreach (&pipeline->executables, struct anv_pipeline_executable, exe) {
- vk_outarray_append(&out, props) {
+ vk_outarray_append_typed(VkPipelineExecutablePropertiesKHR, &out, props) {
gl_shader_stage stage = exe->stage;
props->stages = mesa_to_vk_shader_stage(stage);
unsigned simd_width = exe->stats.dispatch_width;
if (stage == MESA_SHADER_FRAGMENT) {
- WRITE_STR(props->name, "%s%d %s",
- simd_width ? "SIMD" : "vec",
- simd_width ? simd_width : 4,
- _mesa_shader_stage_to_string(stage));
+ if (exe->stats.max_polygons > 1)
+ WRITE_STR(props->name, "SIMD%dx%d %s",
+ exe->stats.max_polygons,
+ simd_width / exe->stats.max_polygons,
+ _mesa_shader_stage_to_string(stage));
+ else
+ WRITE_STR(props->name, "%s%d %s",
+ simd_width ? "SIMD" : "vec",
+ simd_width ? simd_width : 4,
+ _mesa_shader_stage_to_string(stage));
} else {
WRITE_STR(props->name, "%s", _mesa_shader_stage_to_string(stage));
}
@@ -3129,26 +4240,36 @@ VkResult anv_GetPipelineExecutableStatisticsKHR(
VkPipelineExecutableStatisticKHR* pStatistics)
{
ANV_FROM_HANDLE(anv_pipeline, pipeline, pExecutableInfo->pipeline);
- VK_OUTARRAY_MAKE(out, pStatistics, pStatisticCount);
+ VK_OUTARRAY_MAKE_TYPED(VkPipelineExecutableStatisticKHR, out,
+ pStatistics, pStatisticCount);
const struct anv_pipeline_executable *exe =
anv_pipeline_get_executable(pipeline, pExecutableInfo->executableIndex);
const struct brw_stage_prog_data *prog_data;
switch (pipeline->type) {
- case ANV_PIPELINE_GRAPHICS: {
- prog_data = anv_pipeline_to_graphics(pipeline)->shaders[exe->stage]->prog_data;
+ case ANV_PIPELINE_GRAPHICS:
+ case ANV_PIPELINE_GRAPHICS_LIB: {
+ prog_data = anv_pipeline_to_graphics(pipeline)->base.shaders[exe->stage]->prog_data;
break;
}
case ANV_PIPELINE_COMPUTE: {
prog_data = anv_pipeline_to_compute(pipeline)->cs->prog_data;
break;
}
+ case ANV_PIPELINE_RAY_TRACING: {
+ struct anv_shader_bin **shader =
+ util_dynarray_element(&anv_pipeline_to_ray_tracing(pipeline)->shaders,
+ struct anv_shader_bin *,
+ pExecutableInfo->executableIndex);
+ prog_data = (*shader)->prog_data;
+ break;
+ }
default:
unreachable("invalid pipeline type");
}
- vk_outarray_append(&out, stat) {
+ vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
WRITE_STR(stat->name, "Instruction Count");
WRITE_STR(stat->description,
"Number of GEN instructions in the final generated "
@@ -3157,7 +4278,7 @@ VkResult anv_GetPipelineExecutableStatisticsKHR(
stat->value.u64 = exe->stats.instructions;
}
- vk_outarray_append(&out, stat) {
+ vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
WRITE_STR(stat->name, "SEND Count");
WRITE_STR(stat->description,
"Number of instructions in the final generated shader "
@@ -3167,7 +4288,7 @@ VkResult anv_GetPipelineExecutableStatisticsKHR(
stat->value.u64 = exe->stats.sends;
}
- vk_outarray_append(&out, stat) {
+ vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
WRITE_STR(stat->name, "Loop Count");
WRITE_STR(stat->description,
"Number of loops (not unrolled) in the final generated "
@@ -3176,7 +4297,7 @@ VkResult anv_GetPipelineExecutableStatisticsKHR(
stat->value.u64 = exe->stats.loops;
}
- vk_outarray_append(&out, stat) {
+ vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
WRITE_STR(stat->name, "Cycle Count");
WRITE_STR(stat->description,
"Estimate of the number of EU cycles required to execute "
@@ -3186,7 +4307,7 @@ VkResult anv_GetPipelineExecutableStatisticsKHR(
stat->value.u64 = exe->stats.cycles;
}
- vk_outarray_append(&out, stat) {
+ vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
WRITE_STR(stat->name, "Spill Count");
WRITE_STR(stat->description,
"Number of scratch spill operations. This gives a rough "
@@ -3197,7 +4318,7 @@ VkResult anv_GetPipelineExecutableStatisticsKHR(
stat->value.u64 = exe->stats.spills;
}
- vk_outarray_append(&out, stat) {
+ vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
WRITE_STR(stat->name, "Fill Count");
WRITE_STR(stat->description,
"Number of scratch fill operations. This gives a rough "
@@ -3208,7 +4329,7 @@ VkResult anv_GetPipelineExecutableStatisticsKHR(
stat->value.u64 = exe->stats.fills;
}
- vk_outarray_append(&out, stat) {
+ vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
WRITE_STR(stat->name, "Scratch Memory Size");
WRITE_STR(stat->description,
"Number of bytes of scratch memory required by the "
@@ -3219,15 +4340,50 @@ VkResult anv_GetPipelineExecutableStatisticsKHR(
stat->value.u64 = prog_data->total_scratch;
}
- if (gl_shader_stage_uses_workgroup(exe->stage)) {
- vk_outarray_append(&out, stat) {
- WRITE_STR(stat->name, "Workgroup Memory Size");
- WRITE_STR(stat->description,
- "Number of bytes of workgroup shared memory used by this "
- "shader including any padding.");
- stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
+ vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
+ WRITE_STR(stat->name, "Max dispatch width");
+ WRITE_STR(stat->description,
+ "Largest SIMD dispatch width.");
+ stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
+ /* Report the max dispatch width only on the smallest SIMD variant */
+ if (exe->stage != MESA_SHADER_FRAGMENT || exe->stats.dispatch_width == 8)
+ stat->value.u64 = exe->stats.max_dispatch_width;
+ else
+ stat->value.u64 = 0;
+ }
+
+ vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
+ WRITE_STR(stat->name, "Max live registers");
+ WRITE_STR(stat->description,
+ "Maximum number of registers used across the entire shader.");
+ stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
+ stat->value.u64 = exe->stats.max_live_registers;
+ }
+
+ vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
+ WRITE_STR(stat->name, "Workgroup Memory Size");
+ WRITE_STR(stat->description,
+ "Number of bytes of workgroup shared memory used by this "
+ "shader including any padding.");
+ stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
+ if (gl_shader_stage_uses_workgroup(exe->stage))
stat->value.u64 = prog_data->total_shared;
- }
+ else
+ stat->value.u64 = 0;
+ }
+
+ vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
+ uint32_t hash = pipeline->type == ANV_PIPELINE_COMPUTE ?
+ anv_pipeline_to_compute(pipeline)->source_hash :
+ (pipeline->type == ANV_PIPELINE_GRAPHICS_LIB ||
+ pipeline->type == ANV_PIPELINE_GRAPHICS) ?
+ anv_pipeline_to_graphics_base(pipeline)->source_hashes[exe->stage] :
+ 0 /* No source hash for ray tracing */;
+ WRITE_STR(stat->name, "Source hash");
+ WRITE_STR(stat->description,
+ "hash = 0x%08x. Hash generated from shader source.", hash);
+ stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
+ stat->value.u64 = hash;
}
return vk_outarray_status(&out);
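/* Illustrative application-side sketch (not part of this patch): querying the
 * statistics emitted above through VK_KHR_pipeline_executable_properties.
 * Assumes <vulkan/vulkan.h>, <stdio.h> and <inttypes.h>, and that the
 * pipeline was created with VK_PIPELINE_CREATE_CAPTURE_STATISTICS_BIT_KHR.
 */
static void
example_dump_executable_statistics(VkDevice device, VkPipeline pipeline)
{
   const VkPipelineInfoKHR pipeline_info = {
      .sType = VK_STRUCTURE_TYPE_PIPELINE_INFO_KHR,
      .pipeline = pipeline,
   };
   /* Only the executable count is needed here, so skip fetching properties. */
   uint32_t exe_count = 0;
   vkGetPipelineExecutablePropertiesKHR(device, &pipeline_info, &exe_count, NULL);

   for (uint32_t e = 0; e < exe_count; e++) {
      const VkPipelineExecutableInfoKHR exe_info = {
         .sType = VK_STRUCTURE_TYPE_PIPELINE_EXECUTABLE_INFO_KHR,
         .pipeline = pipeline,
         .executableIndex = e,
      };
      VkPipelineExecutableStatisticKHR stats[32];
      uint32_t stat_count = 32;
      for (uint32_t s = 0; s < stat_count; s++) {
         stats[s] = (VkPipelineExecutableStatisticKHR) {
            .sType = VK_STRUCTURE_TYPE_PIPELINE_EXECUTABLE_STATISTIC_KHR,
         };
      }
      vkGetPipelineExecutableStatisticsKHR(device, &exe_info, &stat_count, stats);
      for (uint32_t s = 0; s < stat_count; s++) {
         if (stats[s].format == VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR)
            printf("  %s: %" PRIu64 "\n", stats[s].name, stats[s].value.u64);
      }
   }
}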
@@ -3261,15 +4417,15 @@ VkResult anv_GetPipelineExecutableInternalRepresentationsKHR(
VkPipelineExecutableInternalRepresentationKHR* pInternalRepresentations)
{
ANV_FROM_HANDLE(anv_pipeline, pipeline, pExecutableInfo->pipeline);
- VK_OUTARRAY_MAKE(out, pInternalRepresentations,
- pInternalRepresentationCount);
+ VK_OUTARRAY_MAKE_TYPED(VkPipelineExecutableInternalRepresentationKHR, out,
+ pInternalRepresentations, pInternalRepresentationCount);
bool incomplete_text = false;
const struct anv_pipeline_executable *exe =
anv_pipeline_get_executable(pipeline, pExecutableInfo->executableIndex);
if (exe->nir) {
- vk_outarray_append(&out, ir) {
+ vk_outarray_append_typed(VkPipelineExecutableInternalRepresentationKHR, &out, ir) {
WRITE_STR(ir->name, "Final NIR");
WRITE_STR(ir->description,
"Final NIR before going into the back-end compiler");
@@ -3280,7 +4436,7 @@ VkResult anv_GetPipelineExecutableInternalRepresentationsKHR(
}
if (exe->disasm) {
- vk_outarray_append(&out, ir) {
+ vk_outarray_append_typed(VkPipelineExecutableInternalRepresentationKHR, &out, ir) {
WRITE_STR(ir->name, "GEN Assembly");
WRITE_STR(ir->description,
"Final GEN assembly for the generated shader binary");
@@ -3295,20 +4451,23 @@ VkResult anv_GetPipelineExecutableInternalRepresentationsKHR(
VkResult
anv_GetRayTracingShaderGroupHandlesKHR(
- VkDevice device,
+ VkDevice _device,
VkPipeline _pipeline,
uint32_t firstGroup,
uint32_t groupCount,
size_t dataSize,
void* pData)
{
+ ANV_FROM_HANDLE(anv_device, device, _device);
ANV_FROM_HANDLE(anv_pipeline, pipeline, _pipeline);
+
if (pipeline->type != ANV_PIPELINE_RAY_TRACING)
- return vk_error(VK_ERROR_FEATURE_NOT_PRESENT);
+ return vk_error(device, VK_ERROR_FEATURE_NOT_PRESENT);
struct anv_ray_tracing_pipeline *rt_pipeline =
anv_pipeline_to_ray_tracing(pipeline);
+ assert(firstGroup + groupCount <= rt_pipeline->group_count);
for (uint32_t i = 0; i < groupCount; i++) {
struct anv_rt_shader_group *group = &rt_pipeline->groups[firstGroup + i];
memcpy(pData, group->handle, sizeof(group->handle));
@@ -3320,15 +4479,16 @@ anv_GetRayTracingShaderGroupHandlesKHR(
VkResult
anv_GetRayTracingCaptureReplayShaderGroupHandlesKHR(
- VkDevice device,
+ VkDevice _device,
VkPipeline pipeline,
uint32_t firstGroup,
uint32_t groupCount,
size_t dataSize,
void* pData)
{
+ ANV_FROM_HANDLE(anv_device, device, _device);
unreachable("Unimplemented");
- return vk_error(VK_ERROR_FEATURE_NOT_PRESENT);
+ return vk_error(device, VK_ERROR_FEATURE_NOT_PRESENT);
}
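/* Illustrative application-side sketch (not part of this patch): fetching the
 * shader group handles written by anv_GetRayTracingShaderGroupHandlesKHR()
 * above in order to build a shader binding table. Assumes <vulkan/vulkan.h>;
 * sbt_host points to caller-allocated storage of at least
 * group_count * shaderGroupHandleSize bytes.
 */
static VkResult
example_fetch_sbt_handles(VkPhysicalDevice physical_device,
                          VkDevice device,
                          VkPipeline rt_pipeline,
                          uint32_t group_count,
                          void *sbt_host)
{
   VkPhysicalDeviceRayTracingPipelinePropertiesKHR rt_props = {
      .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_RAY_TRACING_PIPELINE_PROPERTIES_KHR,
   };
   VkPhysicalDeviceProperties2 props2 = {
      .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROPERTIES_2,
      .pNext = &rt_props,
   };
   vkGetPhysicalDeviceProperties2(physical_device, &props2);

   const size_t data_size =
      (size_t)group_count * rt_props.shaderGroupHandleSize;
   /* Handles come back tightly packed; when copying them into the SBT
    * buffer, each record's stride must be a multiple of
    * shaderGroupHandleAlignment.
    */
   return vkGetRayTracingShaderGroupHandlesKHR(device, rt_pipeline,
                                               0 /* firstGroup */, group_count,
                                               data_size, sbt_host);
}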
VkDeviceSize
diff --git a/src/intel/vulkan/anv_pipeline_cache.c b/src/intel/vulkan/anv_pipeline_cache.c
index 0bbc0849c2a..73a145664a5 100644
--- a/src/intel/vulkan/anv_pipeline_cache.c
+++ b/src/intel/vulkan/anv_pipeline_cache.c
@@ -23,15 +23,184 @@
#include "util/blob.h"
#include "util/hash_table.h"
-#include "util/debug.h"
+#include "util/u_debug.h"
#include "util/disk_cache.h"
#include "util/mesa-sha1.h"
#include "nir/nir_serialize.h"
#include "anv_private.h"
#include "nir/nir_xfb_info.h"
-#include "vulkan/util/vk_util.h"
+#include "vk_util.h"
+#include "compiler/spirv/nir_spirv.h"
+#include "shaders/float64_spv.h"
-struct anv_shader_bin *
+/**
+ * Embedded sampler management.
+ */
+
+static unsigned
+embedded_sampler_key_hash(const void *key)
+{
+ return _mesa_hash_data(key, sizeof(struct anv_embedded_sampler_key));
+}
+
+static bool
+embedded_sampler_key_equal(const void *a, const void *b)
+{
+ return memcmp(a, b, sizeof(struct anv_embedded_sampler_key)) == 0;
+}
+
+static void
+anv_embedded_sampler_free(struct anv_device *device,
+ struct anv_embedded_sampler *sampler)
+{
+ anv_state_pool_free(&device->dynamic_state_db_pool, sampler->sampler_state);
+ anv_state_pool_free(&device->dynamic_state_db_pool, sampler->border_color_state);
+ vk_free(&device->vk.alloc, sampler);
+}
+
+static struct anv_embedded_sampler *
+anv_embedded_sampler_ref(struct anv_embedded_sampler *sampler)
+{
+ sampler->ref_cnt++;
+ return sampler;
+}
+
+static void
+anv_embedded_sampler_unref(struct anv_device *device,
+ struct anv_embedded_sampler *sampler)
+{
+ simple_mtx_lock(&device->embedded_samplers.mutex);
+ if (--sampler->ref_cnt == 0) {
+ _mesa_hash_table_remove_key(device->embedded_samplers.map,
+ &sampler->key);
+ anv_embedded_sampler_free(device, sampler);
+ }
+ simple_mtx_unlock(&device->embedded_samplers.mutex);
+}
+
+void
+anv_device_init_embedded_samplers(struct anv_device *device)
+{
+ simple_mtx_init(&device->embedded_samplers.mutex, mtx_plain);
+ device->embedded_samplers.map =
+ _mesa_hash_table_create(NULL,
+ embedded_sampler_key_hash,
+ embedded_sampler_key_equal);
+}
+
+void
+anv_device_finish_embedded_samplers(struct anv_device *device)
+{
+ hash_table_foreach(device->embedded_samplers.map, entry) {
+ anv_embedded_sampler_free(device, entry->data);
+ }
+ ralloc_free(device->embedded_samplers.map);
+ simple_mtx_destroy(&device->embedded_samplers.mutex);
+}
+
+static VkResult
+anv_shader_bin_get_embedded_samplers(struct anv_device *device,
+ struct anv_shader_bin *shader,
+ const struct anv_pipeline_bind_map *bind_map)
+{
+ VkResult result = VK_SUCCESS;
+
+ simple_mtx_lock(&device->embedded_samplers.mutex);
+
+ for (uint32_t i = 0; i < bind_map->embedded_sampler_count; i++) {
+ struct hash_entry *entry =
+ _mesa_hash_table_search(device->embedded_samplers.map,
+ &bind_map->embedded_sampler_to_binding[i].key);
+ if (entry == NULL) {
+ shader->embedded_samplers[i] =
+ vk_zalloc(&device->vk.alloc,
+ sizeof(struct anv_embedded_sampler), 8,
+ VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
+ if (shader->embedded_samplers[i] == NULL) {
+ result = vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
+ goto err;
+ }
+
+ anv_genX(device->info, emit_embedded_sampler)(
+ device, shader->embedded_samplers[i],
+ &bind_map->embedded_sampler_to_binding[i]);
+ _mesa_hash_table_insert(device->embedded_samplers.map,
+ &shader->embedded_samplers[i]->key,
+ shader->embedded_samplers[i]);
+ } else {
+ shader->embedded_samplers[i] = anv_embedded_sampler_ref(entry->data);
+ }
+ }
+
+ err:
+ simple_mtx_unlock(&device->embedded_samplers.mutex);
+ return result;
+}
+
+/**
+ * vk_pipeline_cache object hooks for anv_shader_bin.
+ */
+
+static bool
+anv_shader_bin_serialize(struct vk_pipeline_cache_object *object,
+ struct blob *blob);
+
+struct vk_pipeline_cache_object *
+anv_shader_bin_deserialize(struct vk_pipeline_cache *cache,
+ const void *key_data, size_t key_size,
+ struct blob_reader *blob);
+
+static void
+anv_shader_bin_destroy(struct vk_device *_device,
+ struct vk_pipeline_cache_object *object)
+{
+ struct anv_device *device =
+ container_of(_device, struct anv_device, vk);
+
+ struct anv_shader_bin *shader =
+ container_of(object, struct anv_shader_bin, base);
+
+ for (uint32_t i = 0; i < shader->bind_map.embedded_sampler_count; i++)
+ anv_embedded_sampler_unref(device, shader->embedded_samplers[i]);
+
+ anv_state_pool_free(&device->instruction_state_pool, shader->kernel);
+ vk_pipeline_cache_object_finish(&shader->base);
+ vk_free(&device->vk.alloc, shader);
+}
+
+static const struct vk_pipeline_cache_object_ops anv_shader_bin_ops = {
+ .serialize = anv_shader_bin_serialize,
+ .deserialize = anv_shader_bin_deserialize,
+ .destroy = anv_shader_bin_destroy,
+};
+
+const struct vk_pipeline_cache_object_ops *const anv_cache_import_ops[2] = {
+ &anv_shader_bin_ops,
+ NULL
+};
+
+static void
+anv_shader_bin_rewrite_embedded_samplers(struct anv_device *device,
+ struct anv_shader_bin *shader,
+ const struct anv_pipeline_bind_map *bind_map,
+ const struct brw_stage_prog_data *prog_data_in)
+{
+ int rv_count = 0;
+ struct brw_shader_reloc_value reloc_values[BRW_MAX_EMBEDDED_SAMPLERS];
+
+ for (uint32_t i = 0; i < bind_map->embedded_sampler_count; i++) {
+ reloc_values[rv_count++] = (struct brw_shader_reloc_value) {
+ .id = BRW_SHADER_RELOC_EMBEDDED_SAMPLER_HANDLE + i,
+ .value = shader->embedded_samplers[i]->sampler_state.offset,
+ };
+ }
+
+ brw_write_shader_relocs(&device->physical->compiler->isa,
+ shader->kernel.map, prog_data_in,
+ reloc_values, rv_count);
+}
+
+static struct anv_shader_bin *
anv_shader_bin_create(struct anv_device *device,
gl_shader_stage stage,
const void *key_data, uint32_t key_size,
@@ -40,12 +209,13 @@ anv_shader_bin_create(struct anv_device *device,
uint32_t prog_data_size,
const struct brw_compile_stats *stats, uint32_t num_stats,
const nir_xfb_info *xfb_info_in,
- const struct anv_pipeline_bind_map *bind_map)
+ const struct anv_pipeline_bind_map *bind_map,
+ const struct anv_push_descriptor_info *push_desc_info,
+ enum anv_dynamic_push_bits dynamic_push_values)
{
VK_MULTIALLOC(ma);
VK_MULTIALLOC_DECL(&ma, struct anv_shader_bin, shader, 1);
- VK_MULTIALLOC_DECL_SIZE(&ma, struct anv_shader_bin_key, key,
- sizeof(*key) + key_size);
+ VK_MULTIALLOC_DECL_SIZE(&ma, void, obj_key_data, key_size);
VK_MULTIALLOC_DECL_SIZE(&ma, struct brw_stage_prog_data, prog_data,
prog_data_size);
VK_MULTIALLOC_DECL(&ma, struct brw_shader_reloc, prog_data_relocs,
@@ -59,38 +229,69 @@ anv_shader_bin_create(struct anv_device *device,
VK_MULTIALLOC_DECL(&ma, struct anv_pipeline_binding, surface_to_descriptor,
bind_map->surface_count);
VK_MULTIALLOC_DECL(&ma, struct anv_pipeline_binding, sampler_to_descriptor,
- bind_map->sampler_count);
+ bind_map->sampler_count);
+ VK_MULTIALLOC_DECL(&ma, struct anv_pipeline_embedded_sampler_binding,
+ embedded_sampler_to_binding,
+ bind_map->embedded_sampler_count);
+ VK_MULTIALLOC_DECL(&ma, struct brw_kernel_arg_desc, kernel_args,
+ bind_map->kernel_arg_count);
+ VK_MULTIALLOC_DECL(&ma, struct anv_embedded_sampler *, embedded_samplers,
+ bind_map->embedded_sampler_count);
if (!vk_multialloc_alloc(&ma, &device->vk.alloc,
VK_SYSTEM_ALLOCATION_SCOPE_DEVICE))
return NULL;
- shader->ref_cnt = 1;
+ memcpy(obj_key_data, key_data, key_size);
+ vk_pipeline_cache_object_init(&device->vk, &shader->base,
+ &anv_shader_bin_ops, obj_key_data, key_size);
shader->stage = stage;
- key->size = key_size;
- memcpy(key->data, key_data, key_size);
- shader->key = key;
-
shader->kernel =
anv_state_pool_alloc(&device->instruction_state_pool, kernel_size, 64);
memcpy(shader->kernel.map, kernel_data, kernel_size);
shader->kernel_size = kernel_size;
- uint64_t shader_data_addr = INSTRUCTION_STATE_POOL_MIN_ADDRESS +
- shader->kernel.offset +
- prog_data_in->const_data_offset;
+ if (bind_map->embedded_sampler_count > 0) {
+ shader->embedded_samplers = embedded_samplers;
+ if (anv_shader_bin_get_embedded_samplers(device, shader, bind_map) != VK_SUCCESS) {
+ anv_state_pool_free(&device->instruction_state_pool, shader->kernel);
+ vk_free(&device->vk.alloc, shader);
+ return NULL;
+ }
+ }
+
+ uint64_t shader_data_addr =
+ device->physical->va.instruction_state_pool.addr +
+ shader->kernel.offset +
+ prog_data_in->const_data_offset;
int rv_count = 0;
- struct brw_shader_reloc_value reloc_values[5];
+ struct brw_shader_reloc_value reloc_values[7];
+ assert((device->physical->va.descriptor_buffer_pool.addr & 0xffffffff) == 0);
+ reloc_values[rv_count++] = (struct brw_shader_reloc_value) {
+ .id = BRW_SHADER_RELOC_DESCRIPTORS_BUFFER_ADDR_HIGH,
+ .value = device->physical->va.descriptor_buffer_pool.addr >> 32,
+ };
+ assert((device->physical->va.indirect_descriptor_pool.addr & 0xffffffff) == 0);
+ assert((device->physical->va.internal_surface_state_pool.addr & 0xffffffff) == 0);
+ reloc_values[rv_count++] = (struct brw_shader_reloc_value) {
+ .id = BRW_SHADER_RELOC_DESCRIPTORS_ADDR_HIGH,
+ .value = device->physical->indirect_descriptors ?
+ (device->physical->va.indirect_descriptor_pool.addr >> 32) :
+ (device->physical->va.internal_surface_state_pool.addr >> 32),
+ };
+ assert((device->physical->va.instruction_state_pool.addr & 0xffffffff) == 0);
reloc_values[rv_count++] = (struct brw_shader_reloc_value) {
.id = BRW_SHADER_RELOC_CONST_DATA_ADDR_LOW,
.value = shader_data_addr,
};
+ assert((device->physical->va.instruction_state_pool.addr & 0xffffffff) == 0);
+ assert(shader_data_addr >> 32 == device->physical->va.instruction_state_pool.addr >> 32);
reloc_values[rv_count++] = (struct brw_shader_reloc_value) {
.id = BRW_SHADER_RELOC_CONST_DATA_ADDR_HIGH,
- .value = shader_data_addr >> 32,
+ .value = device->physical->va.instruction_state_pool.addr >> 32,
};
reloc_values[rv_count++] = (struct brw_shader_reloc_value) {
.id = BRW_SHADER_RELOC_SHADER_START_OFFSET,
@@ -99,9 +300,10 @@ anv_shader_bin_create(struct anv_device *device,
if (brw_shader_stage_is_bindless(stage)) {
const struct brw_bs_prog_data *bs_prog_data =
brw_bs_prog_data_const(prog_data_in);
- uint64_t resume_sbt_addr = INSTRUCTION_STATE_POOL_MIN_ADDRESS +
- shader->kernel.offset +
- bs_prog_data->resume_sbt_offset;
+ uint64_t resume_sbt_addr =
+ device->physical->va.instruction_state_pool.addr +
+ shader->kernel.offset +
+ bs_prog_data->resume_sbt_offset;
reloc_values[rv_count++] = (struct brw_shader_reloc_value) {
.id = BRW_SHADER_RELOC_RESUME_SBT_ADDR_LOW,
.value = resume_sbt_addr,
@@ -112,9 +314,12 @@ anv_shader_bin_create(struct anv_device *device,
};
}
- brw_write_shader_relocs(&device->info, shader->kernel.map, prog_data_in,
+ brw_write_shader_relocs(&device->physical->compiler->isa,
+ shader->kernel.map, prog_data_in,
reloc_values, rv_count);
+ anv_shader_bin_rewrite_embedded_samplers(device, shader, bind_map, prog_data_in);
+
memcpy(prog_data, prog_data_in, prog_data_size);
typed_memcpy(prog_data_relocs, prog_data_in->relocs,
prog_data_in->num_relocs);
@@ -138,40 +343,52 @@ anv_shader_bin_create(struct anv_device *device,
shader->xfb_info = NULL;
}
+ shader->dynamic_push_values = dynamic_push_values;
+
+ typed_memcpy(&shader->push_desc_info, push_desc_info, 1);
+
shader->bind_map = *bind_map;
+
typed_memcpy(surface_to_descriptor, bind_map->surface_to_descriptor,
bind_map->surface_count);
shader->bind_map.surface_to_descriptor = surface_to_descriptor;
+
typed_memcpy(sampler_to_descriptor, bind_map->sampler_to_descriptor,
bind_map->sampler_count);
shader->bind_map.sampler_to_descriptor = sampler_to_descriptor;
- return shader;
-}
+ typed_memcpy(embedded_sampler_to_binding, bind_map->embedded_sampler_to_binding,
+ bind_map->embedded_sampler_count);
+ shader->bind_map.embedded_sampler_to_binding = embedded_sampler_to_binding;
-void
-anv_shader_bin_destroy(struct anv_device *device,
- struct anv_shader_bin *shader)
-{
- assert(shader->ref_cnt == 0);
- anv_state_pool_free(&device->instruction_state_pool, shader->kernel);
- vk_free(&device->vk.alloc, shader);
+ typed_memcpy(kernel_args, bind_map->kernel_args,
+ bind_map->kernel_arg_count);
+ shader->bind_map.kernel_args = kernel_args;
+
+ return shader;
}
static bool
-anv_shader_bin_write_to_blob(const struct anv_shader_bin *shader,
- struct blob *blob)
+anv_shader_bin_serialize(struct vk_pipeline_cache_object *object,
+ struct blob *blob)
{
- blob_write_uint32(blob, shader->stage);
+ struct anv_shader_bin *shader =
+ container_of(object, struct anv_shader_bin, base);
- blob_write_uint32(blob, shader->key->size);
- blob_write_bytes(blob, shader->key->data, shader->key->size);
+ blob_write_uint32(blob, shader->stage);
blob_write_uint32(blob, shader->kernel_size);
blob_write_bytes(blob, shader->kernel.map, shader->kernel_size);
blob_write_uint32(blob, shader->prog_data_size);
- blob_write_bytes(blob, shader->prog_data, shader->prog_data_size);
+
+ union brw_any_prog_data prog_data;
+ assert(shader->prog_data_size <= sizeof(prog_data));
+ memcpy(&prog_data, shader->prog_data, shader->prog_data_size);
+ prog_data.base.relocs = NULL;
+ prog_data.base.param = NULL;
+ blob_write_bytes(blob, &prog_data, shader->prog_data_size);
+
blob_write_bytes(blob, shader->prog_data->relocs,
shader->prog_data->num_relocs *
sizeof(shader->prog_data->relocs[0]));
@@ -189,6 +406,12 @@ anv_shader_bin_write_to_blob(const struct anv_shader_bin *shader,
blob_write_uint32(blob, 0);
}
+ blob_write_uint32(blob, shader->dynamic_push_values);
+
+ blob_write_uint32(blob, shader->push_desc_info.used_descriptors);
+ blob_write_uint32(blob, shader->push_desc_info.fully_promoted_ubo_descriptors);
+ blob_write_uint8(blob, shader->push_desc_info.used_set_buffer);
+
blob_write_bytes(blob, shader->bind_map.surface_sha1,
sizeof(shader->bind_map.surface_sha1));
blob_write_bytes(blob, shader->bind_map.sampler_sha1,
@@ -197,26 +420,39 @@ anv_shader_bin_write_to_blob(const struct anv_shader_bin *shader,
sizeof(shader->bind_map.push_sha1));
blob_write_uint32(blob, shader->bind_map.surface_count);
blob_write_uint32(blob, shader->bind_map.sampler_count);
+ blob_write_uint32(blob, shader->bind_map.embedded_sampler_count);
+ if (shader->stage == MESA_SHADER_KERNEL) {
+ uint32_t packed = (uint32_t)shader->bind_map.kernel_args_size << 16 |
+ (uint32_t)shader->bind_map.kernel_arg_count;
+ blob_write_uint32(blob, packed);
+ }
blob_write_bytes(blob, shader->bind_map.surface_to_descriptor,
shader->bind_map.surface_count *
sizeof(*shader->bind_map.surface_to_descriptor));
blob_write_bytes(blob, shader->bind_map.sampler_to_descriptor,
shader->bind_map.sampler_count *
sizeof(*shader->bind_map.sampler_to_descriptor));
+ blob_write_bytes(blob, shader->bind_map.embedded_sampler_to_binding,
+ shader->bind_map.embedded_sampler_count *
+ sizeof(*shader->bind_map.embedded_sampler_to_binding));
+ blob_write_bytes(blob, shader->bind_map.kernel_args,
+ shader->bind_map.kernel_arg_count *
+ sizeof(*shader->bind_map.kernel_args));
blob_write_bytes(blob, shader->bind_map.push_ranges,
sizeof(shader->bind_map.push_ranges));
return !blob->out_of_memory;
}
-static struct anv_shader_bin *
-anv_shader_bin_create_from_blob(struct anv_device *device,
- struct blob_reader *blob)
+struct vk_pipeline_cache_object *
+anv_shader_bin_deserialize(struct vk_pipeline_cache *cache,
+ const void *key_data, size_t key_size,
+ struct blob_reader *blob)
{
- gl_shader_stage stage = blob_read_uint32(blob);
+ struct anv_device *device =
+ container_of(cache->base.device, struct anv_device, vk);
- uint32_t key_size = blob_read_uint32(blob);
- const void *key_data = blob_read_bytes(blob, key_size);
+ gl_shader_stage stage = blob_read_uint32(blob);
uint32_t kernel_size = blob_read_uint32(blob);
const void *kernel_data = blob_read_bytes(blob, kernel_size);
@@ -242,614 +478,205 @@ anv_shader_bin_create_from_blob(struct anv_device *device,
if (xfb_size)
xfb_info = blob_read_bytes(blob, xfb_size);
- struct anv_pipeline_bind_map bind_map;
+ enum anv_dynamic_push_bits dynamic_push_values = blob_read_uint32(blob);
+
+ struct anv_push_descriptor_info push_desc_info = {};
+ push_desc_info.used_descriptors = blob_read_uint32(blob);
+ push_desc_info.fully_promoted_ubo_descriptors = blob_read_uint32(blob);
+ push_desc_info.used_set_buffer = blob_read_uint8(blob);
+
+ struct anv_pipeline_bind_map bind_map = {};
blob_copy_bytes(blob, bind_map.surface_sha1, sizeof(bind_map.surface_sha1));
blob_copy_bytes(blob, bind_map.sampler_sha1, sizeof(bind_map.sampler_sha1));
blob_copy_bytes(blob, bind_map.push_sha1, sizeof(bind_map.push_sha1));
bind_map.surface_count = blob_read_uint32(blob);
bind_map.sampler_count = blob_read_uint32(blob);
+ bind_map.embedded_sampler_count = blob_read_uint32(blob);
+ if (stage == MESA_SHADER_KERNEL) {
+ uint32_t packed = blob_read_uint32(blob);
+ bind_map.kernel_args_size = (uint16_t)(packed >> 16);
+ bind_map.kernel_arg_count = (uint16_t)packed;
+ }
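/* Illustrative sketch, not part of this patch: round trip of the packed
 * kernel-args word written in anv_shader_bin_serialize() above, using the
 * hypothetical values kernel_args_size == 64 and kernel_arg_count == 3:
 *
 *    packed                    == (64u << 16) | 3u  == 0x00400003
 *    (uint16_t)(packed >> 16)  == 64                (kernel_args_size)
 *    (uint16_t)packed          == 3                 (kernel_arg_count)
 */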
bind_map.surface_to_descriptor = (void *)
blob_read_bytes(blob, bind_map.surface_count *
sizeof(*bind_map.surface_to_descriptor));
bind_map.sampler_to_descriptor = (void *)
blob_read_bytes(blob, bind_map.sampler_count *
sizeof(*bind_map.sampler_to_descriptor));
+ bind_map.embedded_sampler_to_binding = (void *)
+ blob_read_bytes(blob, bind_map.embedded_sampler_count *
+ sizeof(*bind_map.embedded_sampler_to_binding));
+ bind_map.kernel_args = (void *)
+ blob_read_bytes(blob, bind_map.kernel_arg_count *
+ sizeof(*bind_map.kernel_args));
blob_copy_bytes(blob, bind_map.push_ranges, sizeof(bind_map.push_ranges));
if (blob->overrun)
return NULL;
- return anv_shader_bin_create(device, stage,
- key_data, key_size,
- kernel_data, kernel_size,
- &prog_data.base, prog_data_size,
- stats, num_stats, xfb_info, &bind_map);
-}
-
-/* Remaining work:
- *
- * - Compact binding table layout so it's tight and not dependent on
- * descriptor set layout.
- *
- * - Review prog_data struct for size and cacheability: struct
- * brw_stage_prog_data has binding_table which uses a lot of uint32_t for 8
- * bit quantities etc; use bit fields for all bools, eg dual_src_blend.
- */
-
-static uint32_t
-shader_bin_key_hash_func(const void *void_key)
-{
- const struct anv_shader_bin_key *key = void_key;
- return _mesa_hash_data(key->data, key->size);
-}
-
-static bool
-shader_bin_key_compare_func(const void *void_a, const void *void_b)
-{
- const struct anv_shader_bin_key *a = void_a, *b = void_b;
- if (a->size != b->size)
- return false;
-
- return memcmp(a->data, b->data, a->size) == 0;
-}
-
-static uint32_t
-sha1_hash_func(const void *sha1)
-{
- return _mesa_hash_data(sha1, 20);
-}
-
-static bool
-sha1_compare_func(const void *sha1_a, const void *sha1_b)
-{
- return memcmp(sha1_a, sha1_b, 20) == 0;
-}
-
-void
-anv_pipeline_cache_init(struct anv_pipeline_cache *cache,
- struct anv_device *device,
- bool cache_enabled,
- bool external_sync)
-{
- vk_object_base_init(&device->vk, &cache->base,
- VK_OBJECT_TYPE_PIPELINE_CACHE);
- cache->device = device;
- cache->external_sync = external_sync;
- pthread_mutex_init(&cache->mutex, NULL);
-
- if (cache_enabled) {
- cache->cache = _mesa_hash_table_create(NULL, shader_bin_key_hash_func,
- shader_bin_key_compare_func);
- cache->nir_cache = _mesa_hash_table_create(NULL, sha1_hash_func,
- sha1_compare_func);
- } else {
- cache->cache = NULL;
- cache->nir_cache = NULL;
- }
-}
-
-void
-anv_pipeline_cache_finish(struct anv_pipeline_cache *cache)
-{
- pthread_mutex_destroy(&cache->mutex);
-
- if (cache->cache) {
- /* This is a bit unfortunate. In order to keep things from randomly
- * going away, the shader cache has to hold a reference to all shader
- * binaries it contains. We unref them when we destroy the cache.
- */
- hash_table_foreach(cache->cache, entry)
- anv_shader_bin_unref(cache->device, entry->data);
-
- _mesa_hash_table_destroy(cache->cache, NULL);
- }
-
- if (cache->nir_cache) {
- hash_table_foreach(cache->nir_cache, entry)
- ralloc_free(entry->data);
-
- _mesa_hash_table_destroy(cache->nir_cache, NULL);
- }
-
- vk_object_base_finish(&cache->base);
-}
-
-static struct anv_shader_bin *
-anv_pipeline_cache_search_locked(struct anv_pipeline_cache *cache,
- const void *key_data, uint32_t key_size)
-{
- uint32_t vla[1 + DIV_ROUND_UP(key_size, sizeof(uint32_t))];
- struct anv_shader_bin_key *key = (void *)vla;
- key->size = key_size;
- memcpy(key->data, key_data, key_size);
-
- struct hash_entry *entry = _mesa_hash_table_search(cache->cache, key);
- if (entry)
- return entry->data;
- else
- return NULL;
-}
-
-static inline void
-anv_cache_lock(struct anv_pipeline_cache *cache)
-{
- if (!cache->external_sync)
- pthread_mutex_lock(&cache->mutex);
-}
-
-static inline void
-anv_cache_unlock(struct anv_pipeline_cache *cache)
-{
- if (!cache->external_sync)
- pthread_mutex_unlock(&cache->mutex);
-}
-
-struct anv_shader_bin *
-anv_pipeline_cache_search(struct anv_pipeline_cache *cache,
- const void *key_data, uint32_t key_size)
-{
- if (!cache->cache)
- return NULL;
-
- anv_cache_lock(cache);
-
- struct anv_shader_bin *shader =
- anv_pipeline_cache_search_locked(cache, key_data, key_size);
-
- anv_cache_unlock(cache);
-
- /* We increment refcount before handing it to the caller */
- if (shader)
- anv_shader_bin_ref(shader);
-
- return shader;
-}
-
-static void
-anv_pipeline_cache_add_shader_bin(struct anv_pipeline_cache *cache,
- struct anv_shader_bin *bin)
-{
- if (!cache->cache)
- return;
-
- anv_cache_lock(cache);
-
- struct hash_entry *entry = _mesa_hash_table_search(cache->cache, bin->key);
- if (entry == NULL) {
- /* Take a reference for the cache */
- anv_shader_bin_ref(bin);
- _mesa_hash_table_insert(cache->cache, bin->key, bin);
- }
-
- anv_cache_unlock(cache);
-}
-
-static struct anv_shader_bin *
-anv_pipeline_cache_add_shader_locked(struct anv_pipeline_cache *cache,
- gl_shader_stage stage,
- const void *key_data, uint32_t key_size,
- const void *kernel_data,
- uint32_t kernel_size,
- const struct brw_stage_prog_data *prog_data,
- uint32_t prog_data_size,
- const struct brw_compile_stats *stats,
- uint32_t num_stats,
- const nir_xfb_info *xfb_info,
- const struct anv_pipeline_bind_map *bind_map)
-{
struct anv_shader_bin *shader =
- anv_pipeline_cache_search_locked(cache, key_data, key_size);
- if (shader)
- return shader;
-
- struct anv_shader_bin *bin =
- anv_shader_bin_create(cache->device, stage,
+ anv_shader_bin_create(device, stage,
key_data, key_size,
kernel_data, kernel_size,
- prog_data, prog_data_size,
- stats, num_stats, xfb_info, bind_map);
- if (!bin)
+ &prog_data.base, prog_data_size,
+ stats, num_stats, xfb_info, &bind_map,
+ &push_desc_info,
+ dynamic_push_values);
+ if (shader == NULL)
return NULL;
- _mesa_hash_table_insert(cache->cache, bin->key, bin);
-
- return bin;
-}
-
-struct anv_shader_bin *
-anv_pipeline_cache_upload_kernel(struct anv_pipeline_cache *cache,
- gl_shader_stage stage,
- const void *key_data, uint32_t key_size,
- const void *kernel_data, uint32_t kernel_size,
- const struct brw_stage_prog_data *prog_data,
- uint32_t prog_data_size,
- const struct brw_compile_stats *stats,
- uint32_t num_stats,
- const nir_xfb_info *xfb_info,
- const struct anv_pipeline_bind_map *bind_map)
-{
- if (cache->cache) {
- anv_cache_lock(cache);
-
- struct anv_shader_bin *bin =
- anv_pipeline_cache_add_shader_locked(cache, stage, key_data, key_size,
- kernel_data, kernel_size,
- prog_data, prog_data_size,
- stats, num_stats,
- xfb_info, bind_map);
-
- anv_cache_unlock(cache);
-
- /* We increment refcount before handing it to the caller */
- if (bin)
- anv_shader_bin_ref(bin);
-
- return bin;
- } else {
- /* In this case, we're not caching it so the caller owns it entirely */
- return anv_shader_bin_create(cache->device, stage,
- key_data, key_size,
- kernel_data, kernel_size,
- prog_data, prog_data_size,
- stats, num_stats,
- xfb_info, bind_map);
- }
-}
-
-static void
-anv_pipeline_cache_load(struct anv_pipeline_cache *cache,
- const void *data, size_t size)
-{
- struct anv_device *device = cache->device;
- struct anv_physical_device *pdevice = device->physical;
-
- if (cache->cache == NULL)
- return;
-
- struct blob_reader blob;
- blob_reader_init(&blob, data, size);
-
- struct vk_pipeline_cache_header header;
- blob_copy_bytes(&blob, &header, sizeof(header));
- uint32_t count = blob_read_uint32(&blob);
- if (blob.overrun)
- return;
-
- if (header.header_size < sizeof(header))
- return;
- if (header.header_version != VK_PIPELINE_CACHE_HEADER_VERSION_ONE)
- return;
- if (header.vendor_id != 0x8086)
- return;
- if (header.device_id != device->info.chipset_id)
- return;
- if (memcmp(header.uuid, pdevice->pipeline_cache_uuid, VK_UUID_SIZE) != 0)
- return;
-
- for (uint32_t i = 0; i < count; i++) {
- struct anv_shader_bin *bin =
- anv_shader_bin_create_from_blob(device, &blob);
- if (!bin)
- break;
- _mesa_hash_table_insert(cache->cache, bin->key, bin);
- }
-}
-
-VkResult anv_CreatePipelineCache(
- VkDevice _device,
- const VkPipelineCacheCreateInfo* pCreateInfo,
- const VkAllocationCallbacks* pAllocator,
- VkPipelineCache* pPipelineCache)
-{
- ANV_FROM_HANDLE(anv_device, device, _device);
- struct anv_pipeline_cache *cache;
-
- assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_PIPELINE_CACHE_CREATE_INFO);
-
- cache = vk_alloc2(&device->vk.alloc, pAllocator,
- sizeof(*cache), 8,
- VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
- if (cache == NULL)
- return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
-
- anv_pipeline_cache_init(cache, device,
- device->physical->instance->pipeline_cache_enabled,
- pCreateInfo->flags & VK_PIPELINE_CACHE_CREATE_EXTERNALLY_SYNCHRONIZED_BIT_EXT);
-
- if (pCreateInfo->initialDataSize > 0)
- anv_pipeline_cache_load(cache,
- pCreateInfo->pInitialData,
- pCreateInfo->initialDataSize);
-
- *pPipelineCache = anv_pipeline_cache_to_handle(cache);
-
- return VK_SUCCESS;
-}
-
-void anv_DestroyPipelineCache(
- VkDevice _device,
- VkPipelineCache _cache,
- const VkAllocationCallbacks* pAllocator)
-{
- ANV_FROM_HANDLE(anv_device, device, _device);
- ANV_FROM_HANDLE(anv_pipeline_cache, cache, _cache);
-
- if (!cache)
- return;
-
- anv_pipeline_cache_finish(cache);
-
- vk_free2(&device->vk.alloc, pAllocator, cache);
-}
-
-VkResult anv_GetPipelineCacheData(
- VkDevice _device,
- VkPipelineCache _cache,
- size_t* pDataSize,
- void* pData)
-{
- ANV_FROM_HANDLE(anv_device, device, _device);
- ANV_FROM_HANDLE(anv_pipeline_cache, cache, _cache);
-
- struct blob blob;
- if (pData) {
- blob_init_fixed(&blob, pData, *pDataSize);
- } else {
- blob_init_fixed(&blob, NULL, SIZE_MAX);
- }
-
- struct vk_pipeline_cache_header header = {
- .header_size = sizeof(struct vk_pipeline_cache_header),
- .header_version = VK_PIPELINE_CACHE_HEADER_VERSION_ONE,
- .vendor_id = 0x8086,
- .device_id = device->info.chipset_id,
- };
- memcpy(header.uuid, device->physical->pipeline_cache_uuid, VK_UUID_SIZE);
- blob_write_bytes(&blob, &header, sizeof(header));
-
- uint32_t count = 0;
- intptr_t count_offset = blob_reserve_uint32(&blob);
- if (count_offset < 0) {
- *pDataSize = 0;
- blob_finish(&blob);
- return VK_INCOMPLETE;
- }
-
- VkResult result = VK_SUCCESS;
- if (cache->cache) {
- hash_table_foreach(cache->cache, entry) {
- struct anv_shader_bin *shader = entry->data;
-
- size_t save_size = blob.size;
- if (!anv_shader_bin_write_to_blob(shader, &blob)) {
- /* If it fails reset to the previous size and bail */
- blob.size = save_size;
- result = VK_INCOMPLETE;
- break;
- }
-
- count++;
- }
- }
-
- blob_overwrite_uint32(&blob, count_offset, count);
-
- *pDataSize = blob.size;
-
- blob_finish(&blob);
-
- return result;
-}
-
-VkResult anv_MergePipelineCaches(
- VkDevice _device,
- VkPipelineCache destCache,
- uint32_t srcCacheCount,
- const VkPipelineCache* pSrcCaches)
-{
- ANV_FROM_HANDLE(anv_pipeline_cache, dst, destCache);
-
- if (!dst->cache)
- return VK_SUCCESS;
-
- for (uint32_t i = 0; i < srcCacheCount; i++) {
- ANV_FROM_HANDLE(anv_pipeline_cache, src, pSrcCaches[i]);
- if (!src->cache)
- continue;
-
- hash_table_foreach(src->cache, entry) {
- struct anv_shader_bin *bin = entry->data;
- assert(bin);
-
- if (_mesa_hash_table_search(dst->cache, bin->key))
- continue;
-
- anv_shader_bin_ref(bin);
- _mesa_hash_table_insert(dst->cache, bin->key, bin);
- }
- }
-
- return VK_SUCCESS;
+ return &shader->base;
}
struct anv_shader_bin *
anv_device_search_for_kernel(struct anv_device *device,
- struct anv_pipeline_cache *cache,
+ struct vk_pipeline_cache *cache,
const void *key_data, uint32_t key_size,
bool *user_cache_hit)
{
- struct anv_shader_bin *bin;
-
- *user_cache_hit = false;
-
- if (cache) {
- bin = anv_pipeline_cache_search(cache, key_data, key_size);
- if (bin) {
- *user_cache_hit = cache != &device->default_pipeline_cache;
- return bin;
- }
+ /* Use the default pipeline cache if none is specified */
+ if (cache == NULL)
+ cache = device->default_pipeline_cache;
+
+ bool cache_hit = false;
+ struct vk_pipeline_cache_object *object =
+ vk_pipeline_cache_lookup_object(cache, key_data, key_size,
+ &anv_shader_bin_ops, &cache_hit);
+ if (user_cache_hit != NULL) {
+ *user_cache_hit = object != NULL && cache_hit &&
+ cache != device->default_pipeline_cache;
}
-#ifdef ENABLE_SHADER_CACHE
- struct disk_cache *disk_cache = device->physical->disk_cache;
- if (disk_cache && device->physical->instance->pipeline_cache_enabled) {
- cache_key cache_key;
- disk_cache_compute_key(disk_cache, key_data, key_size, cache_key);
-
- size_t buffer_size;
- uint8_t *buffer = disk_cache_get(disk_cache, cache_key, &buffer_size);
- if (buffer) {
- struct blob_reader blob;
- blob_reader_init(&blob, buffer, buffer_size);
- bin = anv_shader_bin_create_from_blob(device, &blob);
- free(buffer);
-
- if (bin) {
- if (cache)
- anv_pipeline_cache_add_shader_bin(cache, bin);
- return bin;
- }
- }
- }
-#endif
+ if (object == NULL)
+ return NULL;
- return NULL;
+ return container_of(object, struct anv_shader_bin, base);
}
struct anv_shader_bin *
anv_device_upload_kernel(struct anv_device *device,
- struct anv_pipeline_cache *cache,
- gl_shader_stage stage,
- const void *key_data, uint32_t key_size,
- const void *kernel_data, uint32_t kernel_size,
- const struct brw_stage_prog_data *prog_data,
- uint32_t prog_data_size,
- const struct brw_compile_stats *stats,
- uint32_t num_stats,
- const nir_xfb_info *xfb_info,
- const struct anv_pipeline_bind_map *bind_map)
+ struct vk_pipeline_cache *cache,
+ const struct anv_shader_upload_params *params)
{
- struct anv_shader_bin *bin;
- if (cache) {
- bin = anv_pipeline_cache_upload_kernel(cache, stage, key_data, key_size,
- kernel_data, kernel_size,
- prog_data, prog_data_size,
- stats, num_stats,
- xfb_info, bind_map);
- } else {
- bin = anv_shader_bin_create(device, stage, key_data, key_size,
- kernel_data, kernel_size,
- prog_data, prog_data_size,
- stats, num_stats,
- xfb_info, bind_map);
- }
+ /* Use the default pipeline cache if none is specified */
+ if (cache == NULL)
+ cache = device->default_pipeline_cache;
- if (bin == NULL)
- return NULL;
-#ifdef ENABLE_SHADER_CACHE
- struct disk_cache *disk_cache = device->physical->disk_cache;
- if (disk_cache) {
- struct blob binary;
- blob_init(&binary);
- if (anv_shader_bin_write_to_blob(bin, &binary)) {
- cache_key cache_key;
- disk_cache_compute_key(disk_cache, key_data, key_size, cache_key);
- disk_cache_put(disk_cache, cache_key, binary.data, binary.size, NULL);
- }
+ struct anv_shader_bin *shader =
+ anv_shader_bin_create(device,
+ params->stage,
+ params->key_data,
+ params->key_size,
+ params->kernel_data,
+ params->kernel_size,
+ params->prog_data,
+ params->prog_data_size,
+ params->stats,
+ params->num_stats,
+ params->xfb_info,
+ params->bind_map,
+ params->push_desc_info,
+ params->dynamic_push_values);
+ if (shader == NULL)
+ return NULL;
- blob_finish(&binary);
- }
-#endif
+ struct vk_pipeline_cache_object *cached =
+ vk_pipeline_cache_add_object(cache, &shader->base);
- return bin;
+ return container_of(cached, struct anv_shader_bin, base);
}
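/* Illustrative sketch, not part of this patch: the lookup-then-upload pattern
 * a caller is expected to follow with the two helpers above. The wrapper name
 * and the idea of reusing the search key as params->key_data are assumptions
 * for illustration only.
 */
static struct anv_shader_bin *
example_get_or_upload_kernel(struct anv_device *device,
                             struct vk_pipeline_cache *cache,
                             const void *key_data, uint32_t key_size,
                             const struct anv_shader_upload_params *params)
{
   bool user_cache_hit;
   struct anv_shader_bin *bin =
      anv_device_search_for_kernel(device, cache, key_data, key_size,
                                   &user_cache_hit);
   if (bin != NULL)
      return bin;

   /* Cache miss: the caller compiles the shader, fills *params (with
    * key_data/key_size matching the key searched above), then uploads it,
    * which also adds it to the cache.
    */
   return anv_device_upload_kernel(device, cache, params);
}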
-struct serialized_nir {
- unsigned char sha1_key[20];
- size_t size;
- char data[0];
-};
+#define SHA1_KEY_SIZE 20
struct nir_shader *
anv_device_search_for_nir(struct anv_device *device,
- struct anv_pipeline_cache *cache,
+ struct vk_pipeline_cache *cache,
const nir_shader_compiler_options *nir_options,
- unsigned char sha1_key[20],
+ unsigned char sha1_key[SHA1_KEY_SIZE],
void *mem_ctx)
{
- if (cache && cache->nir_cache) {
- const struct serialized_nir *snir = NULL;
-
- anv_cache_lock(cache);
- struct hash_entry *entry =
- _mesa_hash_table_search(cache->nir_cache, sha1_key);
- if (entry)
- snir = entry->data;
- anv_cache_unlock(cache);
-
- if (snir) {
- struct blob_reader blob;
- blob_reader_init(&blob, snir->data, snir->size);
-
- nir_shader *nir = nir_deserialize(mem_ctx, nir_options, &blob);
- if (blob.overrun) {
- ralloc_free(nir);
- } else {
- return nir;
- }
- }
- }
+ if (cache == NULL)
+ cache = device->default_pipeline_cache;
- return NULL;
+ return vk_pipeline_cache_lookup_nir(cache, sha1_key, SHA1_KEY_SIZE,
+ nir_options, NULL, mem_ctx);
}
void
anv_device_upload_nir(struct anv_device *device,
- struct anv_pipeline_cache *cache,
+ struct vk_pipeline_cache *cache,
const struct nir_shader *nir,
- unsigned char sha1_key[20])
+ unsigned char sha1_key[SHA1_KEY_SIZE])
{
- if (cache && cache->nir_cache) {
- anv_cache_lock(cache);
- struct hash_entry *entry =
- _mesa_hash_table_search(cache->nir_cache, sha1_key);
- anv_cache_unlock(cache);
- if (entry)
- return;
-
- struct blob blob;
- blob_init(&blob);
-
- nir_serialize(&blob, nir, false);
- if (blob.out_of_memory) {
- blob_finish(&blob);
- return;
- }
+ if (cache == NULL)
+ cache = device->default_pipeline_cache;
- anv_cache_lock(cache);
- /* Because ralloc isn't thread-safe, we have to do all this inside the
- * lock. We could unlock for the big memcpy but it's probably not worth
- * the hassle.
- */
- entry = _mesa_hash_table_search(cache->nir_cache, sha1_key);
- if (entry) {
- blob_finish(&blob);
- anv_cache_unlock(cache);
- return;
- }
+ vk_pipeline_cache_add_nir(cache, sha1_key, SHA1_KEY_SIZE, nir);
+}
+
+void
+anv_load_fp64_shader(struct anv_device *device)
+{
+ const nir_shader_compiler_options *nir_options =
+ device->physical->compiler->nir_options[MESA_SHADER_VERTEX];
+
+ const char* shader_name = "float64_spv_lib";
+ struct mesa_sha1 sha1_ctx;
+ uint8_t sha1[20];
+ _mesa_sha1_init(&sha1_ctx);
+ _mesa_sha1_update(&sha1_ctx, shader_name, strlen(shader_name));
+ _mesa_sha1_final(&sha1_ctx, sha1);
+
+ device->fp64_nir =
+ anv_device_search_for_nir(device, device->internal_cache,
+ nir_options, sha1, NULL);
+
+   /* The shader was found, no need to call spirv_to_nir() again. */
+ if (device->fp64_nir)
+ return;
- struct serialized_nir *snir =
- ralloc_size(cache->nir_cache, sizeof(*snir) + blob.size);
- memcpy(snir->sha1_key, sha1_key, 20);
- snir->size = blob.size;
- memcpy(snir->data, blob.data, blob.size);
+ struct spirv_to_nir_options spirv_options = {
+ .caps = {
+ .address = true,
+ .float64 = true,
+ .int8 = true,
+ .int16 = true,
+ .int64 = true,
+ },
+ .environment = NIR_SPIRV_VULKAN,
+ .create_library = true
+ };
- blob_finish(&blob);
+ nir_shader* nir =
+ spirv_to_nir(float64_spv_source, sizeof(float64_spv_source) / 4,
+ NULL, 0, MESA_SHADER_VERTEX, "main",
+ &spirv_options, nir_options);
- _mesa_hash_table_insert(cache->nir_cache, snir->sha1_key, snir);
+ assert(nir != NULL);
- anv_cache_unlock(cache);
- }
+ nir_validate_shader(nir, "after spirv_to_nir");
+ nir_validate_ssa_dominance(nir, "after spirv_to_nir");
+
+ NIR_PASS_V(nir, nir_lower_variable_initializers, nir_var_function_temp);
+ NIR_PASS_V(nir, nir_lower_returns);
+ NIR_PASS_V(nir, nir_inline_functions);
+ NIR_PASS_V(nir, nir_opt_deref);
+
+ NIR_PASS_V(nir, nir_lower_vars_to_ssa);
+ NIR_PASS_V(nir, nir_copy_prop);
+ NIR_PASS_V(nir, nir_opt_dce);
+ NIR_PASS_V(nir, nir_opt_cse);
+ NIR_PASS_V(nir, nir_opt_gcm, true);
+ NIR_PASS_V(nir, nir_opt_peephole_select, 1, false, false);
+ NIR_PASS_V(nir, nir_opt_dce);
+
+ NIR_PASS_V(nir, nir_lower_explicit_io, nir_var_function_temp,
+ nir_address_format_62bit_generic);
+
+ anv_device_upload_nir(device, device->internal_cache,
+ nir, sha1);
+
+ device->fp64_nir = nir;
}
diff --git a/src/intel/vulkan/anv_private.h b/src/intel/vulkan/anv_private.h
index 5194a2f1887..3949a14c3f9 100644
--- a/src/intel/vulkan/anv_private.h
+++ b/src/intel/vulkan/anv_private.h
@@ -30,47 +30,82 @@
#include <pthread.h>
#include <assert.h>
#include <stdint.h>
-#include "drm-uapi/i915_drm.h"
+#include "drm-uapi/drm_fourcc.h"
#ifdef HAVE_VALGRIND
#include <valgrind.h>
#include <memcheck.h>
#define VG(x) x
-#ifndef NDEBUG
-#define __gen_validate_value(x) VALGRIND_CHECK_MEM_IS_DEFINED(&(x), sizeof(x))
-#endif
#else
#define VG(x) ((void)0)
#endif
-#include "common/intel_clflush.h"
-#include "common/intel_decoder.h"
+#include "common/intel_aux_map.h"
+#include "common/intel_bind_timeline.h"
+#include "common/intel_engine.h"
#include "common/intel_gem.h"
#include "common/intel_l3_config.h"
#include "common/intel_measure.h"
+#include "common/intel_mem.h"
+#include "common/intel_sample_positions.h"
+#include "decoder/intel_decoder.h"
#include "dev/intel_device_info.h"
#include "blorp/blorp.h"
#include "compiler/brw_compiler.h"
+#include "compiler/brw_kernel.h"
#include "compiler/brw_rt.h"
+#include "ds/intel_driver_ds.h"
#include "util/bitset.h"
#include "util/bitscan.h"
+#include "util/detect_os.h"
#include "util/macros.h"
#include "util/hash_table.h"
#include "util/list.h"
+#include "util/perf/u_trace.h"
+#include "util/set.h"
#include "util/sparse_array.h"
#include "util/u_atomic.h"
+#if DETECT_OS_ANDROID
+#include "util/u_gralloc/u_gralloc.h"
+#endif
#include "util/u_vector.h"
#include "util/u_math.h"
#include "util/vma.h"
#include "util/xmlconfig.h"
+#include "vk_acceleration_structure.h"
#include "vk_alloc.h"
+#include "vk_buffer.h"
+#include "vk_buffer_view.h"
+#include "vk_command_buffer.h"
+#include "vk_command_pool.h"
#include "vk_debug_report.h"
+#include "vk_descriptor_update_template.h"
#include "vk_device.h"
+#include "vk_device_memory.h"
+#include "vk_drm_syncobj.h"
+#include "vk_enum_defines.h"
+#include "vk_format.h"
+#include "vk_framebuffer.h"
+#include "vk_graphics_state.h"
#include "vk_image.h"
#include "vk_instance.h"
+#include "vk_pipeline_cache.h"
#include "vk_physical_device.h"
+#include "vk_sampler.h"
#include "vk_shader_module.h"
+#include "vk_sync.h"
+#include "vk_sync_timeline.h"
+#include "vk_texcompress_astc.h"
#include "vk_util.h"
+#include "vk_query_pool.h"
+#include "vk_queue.h"
+#include "vk_log.h"
+#include "vk_ycbcr_conversion.h"
+#include "vk_video.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
/* Pre-declarations needed for WSI entrypoints */
struct wl_surface;
@@ -83,7 +118,6 @@ struct anv_batch;
struct anv_buffer;
struct anv_buffer_view;
struct anv_image_view;
-struct anv_acceleration_structure;
struct anv_instance;
struct intel_aux_map_context;
@@ -96,6 +130,8 @@ struct intel_perf_query_result;
#include "anv_android.h"
#include "anv_entrypoints.h"
+#include "anv_kmd_backend.h"
+#include "anv_rmv.h"
#include "isl/isl.h"
#include "dev/intel_debug.h"
@@ -106,59 +142,7 @@ struct intel_perf_query_result;
#define NSEC_PER_SEC 1000000000ull
-/* anv Virtual Memory Layout
- * =========================
- *
- * When the anv driver is determining the virtual graphics addresses of memory
- * objects itself using the softpin mechanism, the following memory ranges
- * will be used.
- *
- * Three special considerations to notice:
- *
- * (1) the dynamic state pool is located within the same 4 GiB as the low
- * heap. This is to work around a VF cache issue described in a comment in
- * anv_physical_device_init_heaps.
- *
- * (2) the binding table pool is located at lower addresses than the surface
- * state pool, within a 4 GiB range. This allows surface state base addresses
- * to cover both binding tables (16 bit offsets) and surface states (32 bit
- * offsets).
- *
- * (3) the last 4 GiB of the address space is withheld from the high
- * heap. Various hardware units will read past the end of an object for
- * various reasons. This healthy margin prevents reads from wrapping around
- * 48-bit addresses.
- */
-#define GENERAL_STATE_POOL_MIN_ADDRESS 0x000000010000ULL /* 64 KiB */
-#define GENERAL_STATE_POOL_MAX_ADDRESS 0x00003fffffffULL
-#define LOW_HEAP_MIN_ADDRESS 0x000040000000ULL /* 1 GiB */
-#define LOW_HEAP_MAX_ADDRESS 0x00007fffffffULL
-#define DYNAMIC_STATE_POOL_MIN_ADDRESS 0x0000c0000000ULL /* 3 GiB */
-#define DYNAMIC_STATE_POOL_MAX_ADDRESS 0x0000ffffffffULL
-#define BINDING_TABLE_POOL_MIN_ADDRESS 0x000100000000ULL /* 4 GiB */
-#define BINDING_TABLE_POOL_MAX_ADDRESS 0x00013fffffffULL
-#define SURFACE_STATE_POOL_MIN_ADDRESS 0x000140000000ULL /* 5 GiB */
-#define SURFACE_STATE_POOL_MAX_ADDRESS 0x00017fffffffULL
-#define INSTRUCTION_STATE_POOL_MIN_ADDRESS 0x000180000000ULL /* 6 GiB */
-#define INSTRUCTION_STATE_POOL_MAX_ADDRESS 0x0001bfffffffULL
-#define CLIENT_VISIBLE_HEAP_MIN_ADDRESS 0x0001c0000000ULL /* 7 GiB */
-#define CLIENT_VISIBLE_HEAP_MAX_ADDRESS 0x0002bfffffffULL
-#define HIGH_HEAP_MIN_ADDRESS 0x0002c0000000ULL /* 11 GiB */
-
-#define GENERAL_STATE_POOL_SIZE \
- (GENERAL_STATE_POOL_MAX_ADDRESS - GENERAL_STATE_POOL_MIN_ADDRESS + 1)
-#define LOW_HEAP_SIZE \
- (LOW_HEAP_MAX_ADDRESS - LOW_HEAP_MIN_ADDRESS + 1)
-#define DYNAMIC_STATE_POOL_SIZE \
- (DYNAMIC_STATE_POOL_MAX_ADDRESS - DYNAMIC_STATE_POOL_MIN_ADDRESS + 1)
-#define BINDING_TABLE_POOL_SIZE \
- (BINDING_TABLE_POOL_MAX_ADDRESS - BINDING_TABLE_POOL_MIN_ADDRESS + 1)
-#define SURFACE_STATE_POOL_SIZE \
- (SURFACE_STATE_POOL_MAX_ADDRESS - SURFACE_STATE_POOL_MIN_ADDRESS + 1)
-#define INSTRUCTION_STATE_POOL_SIZE \
- (INSTRUCTION_STATE_POOL_MAX_ADDRESS - INSTRUCTION_STATE_POOL_MIN_ADDRESS + 1)
-#define CLIENT_VISIBLE_HEAP_SIZE \
- (CLIENT_VISIBLE_HEAP_MAX_ADDRESS - CLIENT_VISIBLE_HEAP_MIN_ADDRESS + 1)
+#define BINDING_TABLE_POOL_BLOCK_SIZE (65536)
/* Allowing different clear colors requires us to perform a depth resolve at
* the end of certain render passes. This is because while slow clears store
@@ -175,7 +159,16 @@ struct intel_perf_query_result;
*/
#define ANV_HZ_FC_VAL 1.0f
-#define MAX_VBS 28
+/* 3DSTATE_VERTEX_BUFFER supports 33 VBs, we use 2 for base & drawid SGVs */
+#define MAX_VBS (33 - 2)
+
+/* 3DSTATE_VERTEX_ELEMENTS supports up to 34 VEs, but our backend compiler
+ * only supports the push model of VS inputs, and we only have 128 GRFs,
+ * minus the g0 and g1 payload, which gives us a maximum of 31 VEs. Plus,
+ * we use two of them for SGVs.
+ */
+#define MAX_VES (31 - 2)
+
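/* Illustrative arithmetic, not part of this patch: per the comments above,
 *
 *    MAX_VBS == 33 - 2 == 31   (client-visible vertex buffers)
 *    MAX_VES == 31 - 2 == 29   (client-visible vertex elements)
 *
 * with the two reserved slots used for the base/drawid SGVs (see
 * ANV_SVGS_VB_INDEX / ANV_DRAWID_VB_INDEX further down).
 */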
#define MAX_XFB_BUFFERS 4
#define MAX_XFB_STREAMS 4
#define MAX_SETS 8
@@ -184,10 +177,10 @@ struct intel_perf_query_result;
#define MAX_SCISSORS 16
#define MAX_PUSH_CONSTANTS_SIZE 128
#define MAX_DYNAMIC_BUFFERS 16
-#define MAX_IMAGES 64
#define MAX_PUSH_DESCRIPTORS 32 /* Minimum requirement */
#define MAX_INLINE_UNIFORM_BLOCK_SIZE 4096
#define MAX_INLINE_UNIFORM_BLOCK_DESCRIPTORS 32
+#define MAX_EMBEDDED_SAMPLERS 2048
/* We need 16 for UBO block reads to work and 32 for push UBOs. However, we
* use 64 here to avoid cache issues. This could most likely bring it back to
* 32 if we had different virtual addresses for the different views on a given
@@ -199,6 +192,11 @@ struct intel_perf_query_result;
#define MAX_VIEWS_FOR_PRIMITIVE_REPLICATION 16
#define MAX_SAMPLE_LOCATIONS 16
+/* RENDER_SURFACE_STATE is a bit smaller (48b) but since it is aligned to 64
+ * and we can't put anything else there we use 64b.
+ */
+#define ANV_SURFACE_STATE_SIZE (64)
+
/* From the Skylake PRM Vol. 7 "Binding Table Surface State Model":
*
* "The surface state model is used when a Binding Table Index (specified
@@ -211,25 +209,6 @@ struct intel_perf_query_result;
*/
#define MAX_BINDING_TABLE_SIZE 240
-/* The kernel relocation API has a limitation of a 32-bit delta value
- * applied to the address before it is written which, in spite of it being
- * unsigned, is treated as signed . Because of the way that this maps to
- * the Vulkan API, we cannot handle an offset into a buffer that does not
- * fit into a signed 32 bits. The only mechanism we have for dealing with
- * this at the moment is to limit all VkDeviceMemory objects to a maximum
- * of 2GB each. The Vulkan spec allows us to do this:
- *
- * "Some platforms may have a limit on the maximum size of a single
- * allocation. For example, certain systems may fail to create
- * allocations with a size greater than or equal to 4GB. Such a limit is
- * implementation-dependent, and if such a failure occurs then the error
- * VK_ERROR_OUT_OF_DEVICE_MEMORY should be returned."
- *
- * We don't use vk_error here because it's not an error so much as an
- * indication to the application that the allocation is too large.
- */
-#define MAX_MEMORY_ALLOCATION_SIZE (1ull << 31)
-
#define ANV_SVGS_VB_INDEX MAX_VBS
#define ANV_DRAWID_VB_INDEX (MAX_VBS + 1)
@@ -244,6 +223,14 @@ struct intel_perf_query_result;
*/
#define ANV_PERF_QUERY_OFFSET_REG 0x2670 /* MI_ALU_REG14 */
+#define ANV_GRAPHICS_SHADER_STAGE_COUNT (MESA_SHADER_MESH + 1)
+
+/* RENDER_SURFACE_STATE is a bit smaller (48b) but since it is aligned to 64
+ * and we can't put anything else there we use 64b.
+ */
+#define ANV_SURFACE_STATE_SIZE (64)
+#define ANV_SAMPLER_STATE_SIZE (32)
+
/* For gfx12 we set the streamout buffers using 4 separate commands
* (3DSTATE_SO_BUFFER_INDEX_*) instead of 3DSTATE_SO_BUFFER. However the layout
* of the 3DSTATE_SO_BUFFER_INDEX_* commands is identical to that of
@@ -255,46 +242,28 @@ struct intel_perf_query_result;
#define SO_BUFFER_INDEX_0_CMD 0x60
#define anv_printflike(a, b) __attribute__((__format__(__printf__, a, b)))
+/* The TR-TT L1 page table entries may contain these values instead of actual
+ * pointers to indicate the regions are either NULL or invalid. We program
+ * these values to TR-TT registers, so we could change them, but it's super
+ * convenient to have the NULL value be 0 because everything is
+ * zero-initialized when allocated.
+ *
+ * Since we reserve these values for NULL/INVALID, then we can't use them as
+ * destinations for TR-TT address translation. Both values are shifted by 16
+ * bits, which results in graphics addresses 0 and 64k. In anv the first vma
+ * starts at 2MB, so we already don't use 0 and 64k for anything and there's
+ * nothing really to reserve. We could instead just reserve random 64kb
+ * ranges from any of the non-TR-TT vmas and use their addresses.
+ */
+#define ANV_TRTT_L1_NULL_TILE_VAL 0
+#define ANV_TRTT_L1_INVALID_TILE_VAL 1
+
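/* Illustrative arithmetic, not part of this patch: after the 16-bit shift
 * mentioned above, the two reserved values map to GPU addresses
 *
 *    ANV_TRTT_L1_NULL_TILE_VAL    << 16 == 0x00000  (0)
 *    ANV_TRTT_L1_INVALID_TILE_VAL << 16 == 0x10000  (64 KiB)
 *
 * both of which sit below the first anv vma at 2 MiB, so no real mapping can
 * collide with them.
 */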
static inline uint32_t
align_down_npot_u32(uint32_t v, uint32_t a)
{
return v - (v % a);
}
-static inline uint32_t
-align_down_u32(uint32_t v, uint32_t a)
-{
- assert(a != 0 && a == (a & -a));
- return v & ~(a - 1);
-}
-
-static inline uint32_t
-align_u32(uint32_t v, uint32_t a)
-{
- assert(a != 0 && a == (a & -a));
- return align_down_u32(v + a - 1, a);
-}
-
-static inline uint64_t
-align_down_u64(uint64_t v, uint64_t a)
-{
- assert(a != 0 && a == (a & -a));
- return v & ~(a - 1);
-}
-
-static inline uint64_t
-align_u64(uint64_t v, uint64_t a)
-{
- return align_down_u64(v + a - 1, a);
-}
-
-static inline int32_t
-align_i32(int32_t v, int32_t a)
-{
- assert(a != 0 && a == (a & -a));
- return (v + a - 1) & ~(a - 1);
-}
-
/** Alignment must be a power of 2. */
static inline bool
anv_is_aligned(uintmax_t n, uintmax_t a)
@@ -303,39 +272,6 @@ anv_is_aligned(uintmax_t n, uintmax_t a)
return (n & (a - 1)) == 0;
}
-static inline uint32_t
-anv_minify(uint32_t n, uint32_t levels)
-{
- if (unlikely(n == 0))
- return 0;
- else
- return MAX2(n >> levels, 1);
-}
-
-static inline float
-anv_clamp_f(float f, float min, float max)
-{
- assert(min < max);
-
- if (f > max)
- return max;
- else if (f < min)
- return min;
- else
- return f;
-}
-
-static inline bool
-anv_clear_mask(uint32_t *inout_mask, uint32_t clear_mask)
-{
- if (*inout_mask & clear_mask) {
- *inout_mask &= ~clear_mask;
- return true;
- } else {
- return false;
- }
-}
-
static inline union isl_color_value
vk_to_isl_color(VkClearColorValue color)
{
@@ -349,55 +285,26 @@ vk_to_isl_color(VkClearColorValue color)
};
}
-static inline void *anv_unpack_ptr(uintptr_t ptr, int bits, int *flags)
-{
- uintptr_t mask = (1ull << bits) - 1;
- *flags = ptr & mask;
- return (void *) (ptr & ~mask);
-}
-
-static inline uintptr_t anv_pack_ptr(void *ptr, int bits, int flags)
+static inline union isl_color_value
+vk_to_isl_color_with_format(VkClearColorValue color, enum isl_format format)
{
- uintptr_t value = (uintptr_t) ptr;
- uintptr_t mask = (1ull << bits) - 1;
- return value | (mask & flags);
-}
+ const struct isl_format_layout *fmtl = isl_format_get_layout(format);
+ union isl_color_value isl_color = { .u32 = {0, } };
-/* Whenever we generate an error, pass it through this function. Useful for
- * debugging, where we can break on it. Only call at error site, not when
- * propagating errors. Might be useful to plug in a stack trace here.
- */
+#define COPY_COLOR_CHANNEL(c, i) \
+ if (fmtl->channels.c.bits) \
+ isl_color.u32[i] = color.uint32[i]
-VkResult __vk_errorv(struct anv_instance *instance,
- const struct vk_object_base *object, VkResult error,
- const char *file, int line, const char *format,
- va_list args);
+ COPY_COLOR_CHANNEL(r, 0);
+ COPY_COLOR_CHANNEL(g, 1);
+ COPY_COLOR_CHANNEL(b, 2);
+ COPY_COLOR_CHANNEL(a, 3);
-VkResult __vk_errorf(struct anv_instance *instance,
- const struct vk_object_base *object, VkResult error,
- const char *file, int line, const char *format, ...)
- anv_printflike(6, 7);
-
-#ifdef DEBUG
-#define vk_error(error) __vk_errorf(NULL, NULL, error, __FILE__, __LINE__, NULL)
-#define vk_errorfi(instance, obj, error, format, ...)\
- __vk_errorf(instance, obj, error,\
- __FILE__, __LINE__, format, ## __VA_ARGS__)
-#define vk_errorf(device, obj, error, format, ...)\
- vk_errorfi(anv_device_instance_or_null(device),\
- obj, error, format, ## __VA_ARGS__)
-#else
+#undef COPY_COLOR_CHANNEL
-static inline VkResult __dummy_vk_error(VkResult error, UNUSED const void *ignored)
-{
- return error;
+ return isl_color;
}
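/* Illustrative sketch, not part of this patch: with a format that has no
 * alpha channel, the helper above leaves the alpha word zeroed instead of
 * copying the client value. ISL_FORMAT_R8G8B8_UNORM is just a convenient
 * example format.
 */
static union isl_color_value
example_clear_color(void)
{
   VkClearColorValue clear = { .uint32 = { 1, 2, 3, 4 } };
   /* Returns { 1, 2, 3, 0 }: the format has R/G/B bits but no alpha bits. */
   return vk_to_isl_color_with_format(clear, ISL_FORMAT_R8G8B8_UNORM);
}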
-#define vk_error(error) __dummy_vk_error(error, NULL)
-#define vk_errorfi(instance, obj, error, format, ...) __dummy_vk_error(error, instance)
-#define vk_errorf(device, obj, error, format, ...) __dummy_vk_error(error, device)
-#endif
-
/**
* Warn on ignored extension structs.
*
@@ -420,8 +327,6 @@ void __anv_perf_warn(struct anv_device *device,
const struct vk_object_base *object,
const char *file, int line, const char *format, ...)
anv_printflike(5, 6);
-void anv_loge(const char *format, ...) anv_printflike(1, 2);
-void anv_loge_v(const char *format, va_list va);
/**
* Print a FINISHME message, including its source location.
@@ -439,18 +344,20 @@ void anv_loge_v(const char *format, va_list va);
/**
* Print a perf warning message. Set INTEL_DEBUG=perf to see these.
*/
-#define anv_perf_warn(instance, obj, format, ...) \
+#define anv_perf_warn(objects_macro, format, ...) \
do { \
static bool reported = false; \
- if (!reported && (INTEL_DEBUG & DEBUG_PERF)) { \
- __anv_perf_warn(instance, obj, __FILE__, __LINE__,\
- format, ##__VA_ARGS__); \
+ if (!reported && INTEL_DEBUG(DEBUG_PERF)) { \
+ __vk_log(VK_DEBUG_UTILS_MESSAGE_SEVERITY_WARNING_BIT_EXT, \
+ VK_DEBUG_UTILS_MESSAGE_TYPE_PERFORMANCE_BIT_EXT, \
+ objects_macro, __FILE__, __LINE__, \
+ format, ## __VA_ARGS__); \
reported = true; \
} \
} while (0)
/* A non-fatal assert. Useful for debugging. */
-#ifdef DEBUG
+#if MESA_DEBUG
#define anv_assert(x) ({ \
if (unlikely(!(x))) \
mesa_loge("%s:%d ASSERT: %s", __FILE__, __LINE__, #x); \
@@ -459,89 +366,179 @@ void anv_loge_v(const char *format, va_list va);
#define anv_assert(x)
#endif
+enum anv_bo_alloc_flags {
+ /** Specifies that the BO must have a 32-bit address
+ *
+ * This is the opposite of EXEC_OBJECT_SUPPORTS_48B_ADDRESS.
+ */
+ ANV_BO_ALLOC_32BIT_ADDRESS = (1 << 0),
+
+ /** Specifies that the BO may be shared externally */
+ ANV_BO_ALLOC_EXTERNAL = (1 << 1),
+
+ /** Specifies that the BO should be mapped */
+ ANV_BO_ALLOC_MAPPED = (1 << 2),
+
+ /** Specifies that the BO should be coherent.
+ *
+    * Note: On platforms with LLC, where HOST_CACHED + HOST_COHERENT is free,
+    * the BO can get upgraded to HOST_CACHED_COHERENT
+ */
+ ANV_BO_ALLOC_HOST_COHERENT = (1 << 3),
+
+ /** Specifies that the BO should be captured in error states */
+ ANV_BO_ALLOC_CAPTURE = (1 << 4),
+
+ /** Specifies that the BO will have an address assigned by the caller
+ *
+ * Such BOs do not exist in any VMA heap.
+ */
+ ANV_BO_ALLOC_FIXED_ADDRESS = (1 << 5),
+
+ /** Enables implicit synchronization on the BO
+ *
+ * This is the opposite of EXEC_OBJECT_ASYNC.
+ */
+ ANV_BO_ALLOC_IMPLICIT_SYNC = (1 << 6),
+
+ /** Enables implicit synchronization on the BO
+ *
+ * This is equivalent to EXEC_OBJECT_WRITE.
+ */
+ ANV_BO_ALLOC_IMPLICIT_WRITE = (1 << 7),
+
+ /** Has an address which is visible to the client */
+ ANV_BO_ALLOC_CLIENT_VISIBLE_ADDRESS = (1 << 8),
+
+ /** Align the BO's virtual address to match AUX-TT requirements */
+ ANV_BO_ALLOC_AUX_TT_ALIGNED = (1 << 9),
+
+   /** This buffer is allocated from local memory and should be CPU visible */
+ ANV_BO_ALLOC_LOCAL_MEM_CPU_VISIBLE = (1 << 10),
+
+ /** For non device local allocations */
+ ANV_BO_ALLOC_NO_LOCAL_MEM = (1 << 11),
+
+   /** This buffer will be scanned out to a display */
+ ANV_BO_ALLOC_SCANOUT = (1 << 12),
+
+ /** For descriptor pools */
+ ANV_BO_ALLOC_DESCRIPTOR_POOL = (1 << 13),
+
+ /** For buffers that will be bound using TR-TT.
+ *
+ * Not for buffers used as the TR-TT page tables.
+ */
+ ANV_BO_ALLOC_TRTT = (1 << 14),
+
+ /** Protected buffer */
+ ANV_BO_ALLOC_PROTECTED = (1 << 15),
+
+ /** Specifies that the BO should be cached and incoherent. */
+ ANV_BO_ALLOC_HOST_CACHED = (1 << 16),
+
+ /** For sampler pools */
+ ANV_BO_ALLOC_SAMPLER_POOL = (1 << 17),
+
+ /** Specifies that the BO is imported.
+ *
+ * Imported BOs must also be marked as ANV_BO_ALLOC_EXTERNAL
+ */
+ ANV_BO_ALLOC_IMPORTED = (1 << 18),
+
+ /** Specify whether this BO is internal to the driver */
+ ANV_BO_ALLOC_INTERNAL = (1 << 19),
+
+ /** Allocate with CCS AUX requirements
+ *
+    * This pads the BO to include CCS data mappable through the AUX-TT and
+ * aligned to the AUX-TT requirements.
+ */
+ ANV_BO_ALLOC_AUX_CCS = (1 << 20),
+
+ /** For descriptor buffer pools */
+ ANV_BO_ALLOC_DESCRIPTOR_BUFFER_POOL = (1 << 21),
+};
+
+/** Specifies that the BO should be cached and coherent. */
+#define ANV_BO_ALLOC_HOST_CACHED_COHERENT (ANV_BO_ALLOC_HOST_COHERENT | \
+ ANV_BO_ALLOC_HOST_CACHED)
+
+
struct anv_bo {
const char *name;
+ /* The VMA heap in anv_device from which this BO takes its offset.
+ *
+ * This can only be NULL when has_fixed_address is true.
+ */
+ struct util_vma_heap *vma_heap;
+
+   /* All userptr BOs in the Xe KMD have gem_handle set to workaround_bo->gem_handle */
uint32_t gem_handle;
uint32_t refcount;
/* Index into the current validation list. This is used by the
- * validation list building alrogithm to track which buffers are already
+ * validation list building algorithm to track which buffers are already
* in the validation list so that we can ensure uniqueness.
*/
- uint32_t index;
+ uint32_t exec_obj_index;
/* Index for use with util_sparse_array_free_list */
uint32_t free_index;
/* Last known offset. This value is provided by the kernel when we
* execbuf and is used as the presumed offset for the next bunch of
- * relocations.
+ * relocations, in canonical address format.
*/
uint64_t offset;
- /** Size of the buffer not including implicit aux */
+ /** Size of the buffer */
uint64_t size;
+ /** Offset at which the CCS data is stored */
+ uint64_t ccs_offset;
+
/* Map for internally mapped BOs.
*
- * If ANV_BO_WRAPPER is set in flags, map points to the wrapped BO.
+ * If ANV_BO_ALLOC_MAPPED is set in flags, this is the map for the whole
+ * BO.
*/
void *map;
- /** Size of the implicit CCS range at the end of the buffer
- *
- * On Gfx12, CCS data is always a direct 1/256 scale-down. A single 64K
- * page of main surface data maps to a 256B chunk of CCS data and that
- * mapping is provided on TGL-LP by the AUX table which maps virtual memory
- * addresses in the main surface to virtual memory addresses for CCS data.
- *
- * Because we can't change these maps around easily and because Vulkan
- * allows two VkImages to be bound to overlapping memory regions (as long
- * as the app is careful), it's not feasible to make this mapping part of
- * the image. (On Gfx11 and earlier, the mapping was provided via
- * RENDER_SURFACE_STATE so each image had its own main -> CCS mapping.)
- * Instead, we attach the CCS data directly to the buffer object and setup
- * the AUX table mapping at BO creation time.
- *
- * This field is for internal tracking use by the BO allocator only and
- * should not be touched by other parts of the code. If something wants to
- * know if a BO has implicit CCS data, it should instead look at the
- * has_implicit_ccs boolean below.
- *
- * This data is not included in maps of this buffer.
+   /* The actual size of the BO allocated by the KMD, basically:
+ * align(size, mem_alignment)
*/
- uint32_t _ccs_size;
+ uint64_t actual_size;
/** Flags to pass to the kernel through drm_i915_exec_object2::flags */
uint32_t flags;
- /** True if this BO may be shared with other processes */
- bool is_external:1;
-
- /** True if this BO is a wrapper
- *
- * When set to true, none of the fields in this BO are meaningful except
- * for anv_bo::is_wrapper and anv_bo::map which points to the actual BO.
- * See also anv_bo_unwrap(). Wrapper BOs are not allowed when use_softpin
- * is set in the physical device.
- */
- bool is_wrapper:1;
-
- /** See also ANV_BO_ALLOC_FIXED_ADDRESS */
- bool has_fixed_address:1;
+ enum anv_bo_alloc_flags alloc_flags;
/** True if this BO wraps a host pointer */
bool from_host_ptr:1;
- /** See also ANV_BO_ALLOC_CLIENT_VISIBLE_ADDRESS */
- bool has_client_visible_address:1;
-
- /** True if this BO has implicit CCS data attached to it */
- bool has_implicit_ccs:1;
+ /** True if this BO is mapped in the GTT (only used for RMV) */
+ bool gtt_mapped:1;
};
+static inline bool
+anv_bo_is_external(const struct anv_bo *bo)
+{
+ return bo->alloc_flags & ANV_BO_ALLOC_EXTERNAL;
+}
+
+static inline bool
+anv_bo_is_vram_only(const struct anv_bo *bo)
+{
+ return !(bo->alloc_flags & (ANV_BO_ALLOC_NO_LOCAL_MEM |
+ ANV_BO_ALLOC_MAPPED |
+ ANV_BO_ALLOC_LOCAL_MEM_CPU_VISIBLE |
+ ANV_BO_ALLOC_IMPORTED));
+}
+
static inline struct anv_bo *
anv_bo_ref(struct anv_bo *bo)
{
@@ -549,14 +546,71 @@ anv_bo_ref(struct anv_bo *bo)
return bo;
}
-static inline struct anv_bo *
-anv_bo_unwrap(struct anv_bo *bo)
+enum intel_device_info_mmap_mode
+anv_bo_get_mmap_mode(struct anv_device *device, struct anv_bo *bo);
+
+static inline bool
+anv_bo_needs_host_cache_flush(enum anv_bo_alloc_flags alloc_flags)
{
- while (bo->is_wrapper)
- bo = bo->map;
- return bo;
+ return (alloc_flags & (ANV_BO_ALLOC_HOST_CACHED | ANV_BO_ALLOC_HOST_COHERENT)) ==
+ ANV_BO_ALLOC_HOST_CACHED;
+}
+
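/* Illustrative combinations, not part of this patch: with the helper above,
 * only the cached-but-not-coherent case needs manual flushes:
 *
 *    anv_bo_needs_host_cache_flush(ANV_BO_ALLOC_HOST_CACHED)          == true
 *    anv_bo_needs_host_cache_flush(ANV_BO_ALLOC_HOST_CACHED_COHERENT) == false
 *    anv_bo_needs_host_cache_flush(ANV_BO_ALLOC_HOST_COHERENT)        == false
 *    anv_bo_needs_host_cache_flush(0)                                 == false
 */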
+struct anv_address {
+ struct anv_bo *bo;
+ int64_t offset;
+};
+
+#define ANV_NULL_ADDRESS ((struct anv_address) { NULL, 0 })
+
+static inline struct anv_address
+anv_address_from_u64(uint64_t addr_u64)
+{
+ assert(addr_u64 == intel_canonical_address(addr_u64));
+ return (struct anv_address) {
+ .bo = NULL,
+ .offset = addr_u64,
+ };
+}
+
+static inline bool
+anv_address_is_null(struct anv_address addr)
+{
+ return addr.bo == NULL && addr.offset == 0;
}
+static inline uint64_t
+anv_address_physical(struct anv_address addr)
+{
+ uint64_t address = (addr.bo ? addr.bo->offset : 0ull) + addr.offset;
+ return intel_canonical_address(address);
+}
+
+static inline struct anv_address
+anv_address_add(struct anv_address addr, uint64_t offset)
+{
+ addr.offset += offset;
+ return addr;
+}
+
+static inline void *
+anv_address_map(struct anv_address addr)
+{
+ if (addr.bo == NULL)
+ return NULL;
+
+ if (addr.bo->map == NULL)
+ return NULL;
+
+ return addr.bo->map + addr.offset;
+}
+
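/* Illustrative sketch, not part of this patch: composing an address into a
 * BO and reading back both its GPU VA and CPU mapping with the helpers above.
 * The function name and the offset of 64 are arbitrary for illustration.
 */
static void
example_address_use(struct anv_bo *bo)
{
   struct anv_address addr = { .bo = bo, .offset = 0 };
   addr = anv_address_add(addr, 64);

   uint64_t gpu_va = anv_address_physical(addr); /* bo->offset + 64, canonical */
   void *cpu_ptr = anv_address_map(addr);        /* NULL if the BO is unmapped */
   (void)gpu_va;
   (void)cpu_ptr;
}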
+/* Represent a virtual address range */
+struct anv_va_range {
+ uint64_t addr;
+ uint64_t size;
+};
+
/* Represents a lock-free linked list of "free" things. This is used by
* both the block pool and the state pools. Unfortunately, in order to
* solve the ABA problem, we can't use a single uint32_t head.
@@ -571,7 +625,7 @@ union anv_free_list {
/* Make sure it's aligned to 64 bits. This will make atomic operations
* faster on 32 bit platforms.
*/
- uint64_t u64 __attribute__ ((aligned (8)));
+ alignas(8) uint64_t u64;
};
#define ANV_FREE_LIST_EMPTY ((union anv_free_list) { { UINT32_MAX, 0 } })
@@ -585,7 +639,7 @@ struct anv_block_state {
/* Make sure it's aligned to 64 bits. This will make atomic operations
* faster on 32 bit platforms.
*/
- uint64_t u64 __attribute__ ((aligned (8)));
+ alignas(8) uint64_t u64;
};
};
@@ -600,22 +654,18 @@ struct anv_block_pool {
const char *name;
struct anv_device *device;
- bool use_softpin;
-
- /* Wrapper BO for use in relocation lists. This BO is simply a wrapper
- * around the actual BO so that we grow the pool after the wrapper BO has
- * been put in a relocation list. This is only used in the non-softpin
- * case.
- */
- struct anv_bo wrapper_bo;
struct anv_bo *bos[ANV_MAX_BLOCK_POOL_BOS];
struct anv_bo *bo;
uint32_t nbos;
+ /* Maximum size of the pool */
+ uint64_t max_size;
+
+ /* Current size of the pool */
uint64_t size;
- /* The address where the start of the pool is pinned. The various bos that
+ /* The canonical address where the start of the pool is pinned. The various bos that
* are created as the pool grows will have addresses in the range
* [start_address, start_address + BLOCK_POOL_MEMFD_SIZE).
*/
@@ -627,30 +677,9 @@ struct anv_block_pool {
*/
uint32_t center_bo_offset;
- /* Current memory map of the block pool. This pointer may or may not
- * point to the actual beginning of the block pool memory. If
- * anv_block_pool_alloc_back has ever been called, then this pointer
- * will point to the "center" position of the buffer and all offsets
- * (negative or positive) given out by the block pool alloc functions
- * will be valid relative to this pointer.
- *
- * In particular, map == bo.map + center_offset
- *
- * DO NOT access this pointer directly. Use anv_block_pool_map() instead,
- * since it will handle the softpin case as well, where this points to NULL.
- */
- void *map;
- int fd;
-
- /**
- * Array of mmaps and gem handles owned by the block pool, reclaimed when
- * the block pool is destroyed.
- */
- struct u_vector mmap_cleanups;
-
struct anv_block_state state;
- struct anv_block_state back_state;
+ enum anv_bo_alloc_flags bo_alloc_flags;
};
/* Block pools are backed by a fixed-size 1GB memfd */
@@ -664,14 +693,14 @@ struct anv_block_pool {
static inline uint32_t
anv_block_pool_size(struct anv_block_pool *pool)
{
- return pool->state.end + pool->back_state.end;
+ return pool->state.end;
}
struct anv_state {
- int32_t offset;
+ int64_t offset;
uint32_t alloc_size;
- void *map;
uint32_t idx;
+ void *map;
};
#define ANV_STATE_NULL ((struct anv_state) { .alloc_size = 0 })
@@ -682,7 +711,7 @@ struct anv_fixed_size_state_pool {
};
#define ANV_MIN_STATE_SIZE_LOG2 6
-#define ANV_MAX_STATE_SIZE_LOG2 21
+#define ANV_MAX_STATE_SIZE_LOG2 22
#define ANV_STATE_BUCKETS (ANV_MAX_STATE_SIZE_LOG2 - ANV_MIN_STATE_SIZE_LOG2 + 1)
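/* Illustrative arithmetic, not part of this patch: with the limits above the
 * fixed-size buckets cover 2^6 = 64 B up to 2^22 = 4 MiB, i.e.
 * ANV_STATE_BUCKETS == 22 - 6 + 1 == 17 power-of-two size classes.
 */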
@@ -696,6 +725,7 @@ struct anv_state_table {
int fd;
struct anv_free_entry *map;
uint32_t size;
+ uint64_t max_size;
struct anv_block_state state;
struct u_vector cleanups;
};
@@ -706,16 +736,13 @@ struct anv_state_pool {
/* Offset into the relevant state base address where the state pool starts
* allocating memory.
*/
- int32_t start_offset;
+ int64_t start_offset;
struct anv_state_table table;
/* The size of blocks which will be allocated from the block pool */
uint32_t block_size;
- /** Free list for "back" allocations */
- union anv_free_list back_alloc_free_list;
-
struct anv_fixed_size_state_pool buckets[ANV_STATE_BUCKETS];
};
@@ -725,6 +752,21 @@ struct anv_state_reserved_pool {
uint32_t count;
};
+struct anv_state_reserved_array_pool {
+ struct anv_state_pool *pool;
+ simple_mtx_t mutex;
+ /* Bitfield of usable elements */
+ BITSET_WORD *states;
+ /* Backing store */
+ struct anv_state state;
+ /* Number of elements */
+ uint32_t count;
+ /* Stride between each element */
+ uint32_t stride;
+ /* Size of each element */
+ uint32_t size;
+};
+
struct anv_state_stream {
struct anv_state_pool *state_pool;
@@ -737,10 +779,42 @@ struct anv_state_stream {
/* Offset into the current block at which to allocate the next state */
uint32_t next;
+ /* Sum of all the blocks in all_blocks */
+ uint32_t total_size;
+
/* List of all blocks allocated from this pool */
struct util_dynarray all_blocks;
};
+struct anv_sparse_submission {
+ struct anv_queue *queue;
+
+ struct anv_vm_bind *binds;
+ int binds_len;
+ int binds_capacity;
+
+ uint32_t wait_count;
+ uint32_t signal_count;
+
+ struct vk_sync_wait *waits;
+ struct vk_sync_signal *signals;
+};
+
+struct anv_trtt_bind {
+ uint64_t pte_addr;
+ uint64_t entry_addr;
+};
+
+struct anv_trtt_submission {
+ struct anv_sparse_submission *sparse;
+
+ struct anv_trtt_bind *l3l2_binds;
+ struct anv_trtt_bind *l1_binds;
+
+ int l3l2_binds_len;
+ int l1_binds_len;
+};
+
/* The block_pool functions exported for testing only. The block pool should
* only be used via a state pool (see below).
*/
@@ -748,26 +822,54 @@ VkResult anv_block_pool_init(struct anv_block_pool *pool,
struct anv_device *device,
const char *name,
uint64_t start_address,
- uint32_t initial_size);
+ uint32_t initial_size,
+ uint32_t max_size);
void anv_block_pool_finish(struct anv_block_pool *pool);
-int32_t anv_block_pool_alloc(struct anv_block_pool *pool,
- uint32_t block_size, uint32_t *padding);
-int32_t anv_block_pool_alloc_back(struct anv_block_pool *pool,
- uint32_t block_size);
+VkResult anv_block_pool_alloc(struct anv_block_pool *pool,
+ uint32_t block_size,
+ int64_t *offset,
+ uint32_t *padding);
void* anv_block_pool_map(struct anv_block_pool *pool, int32_t offset, uint32_t
size);
+struct anv_state_pool_params {
+ const char *name;
+ uint64_t base_address;
+ int64_t start_offset;
+ uint32_t block_size;
+ uint32_t max_size;
+};
+
VkResult anv_state_pool_init(struct anv_state_pool *pool,
struct anv_device *device,
- const char *name,
- uint64_t base_address,
- int32_t start_offset,
- uint32_t block_size);
+ const struct anv_state_pool_params *params);
void anv_state_pool_finish(struct anv_state_pool *pool);
struct anv_state anv_state_pool_alloc(struct anv_state_pool *pool,
uint32_t state_size, uint32_t alignment);
-struct anv_state anv_state_pool_alloc_back(struct anv_state_pool *pool);
void anv_state_pool_free(struct anv_state_pool *pool, struct anv_state state);
+
+static inline struct anv_address
+anv_state_pool_state_address(struct anv_state_pool *pool, struct anv_state state)
+{
+ return (struct anv_address) {
+ .bo = pool->block_pool.bo,
+ .offset = state.offset - pool->start_offset,
+ };
+}
+
+static inline struct anv_state
+anv_state_pool_emit_data(struct anv_state_pool *pool,
+ size_t size, size_t align,
+ const void *p)
+{
+ struct anv_state state;
+
+ state = anv_state_pool_alloc(pool, size, align);
+ memcpy(state.map, p, size);
+
+ return state;
+}
+
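A minimal usage sketch (not part of this patch) combining the two helpers above; the function name is invented and it assumes the caller owns a suitable state pool.

static struct anv_address
upload_constants_example(struct anv_state_pool *pool, const float color[4])
{
   /* Allocate from the pool and copy the host data into the mapped state. */
   struct anv_state state =
      anv_state_pool_emit_data(pool, 4 * sizeof(float), 16, color);
   /* Translate the pool-relative offset into a GPU-visible address. */
   return anv_state_pool_state_address(pool, state);
}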
void anv_state_stream_init(struct anv_state_stream *stream,
struct anv_state_pool *state_pool,
uint32_t block_size);
@@ -784,6 +886,20 @@ struct anv_state anv_state_reserved_pool_alloc(struct anv_state_reserved_pool *p
void anv_state_reserved_pool_free(struct anv_state_reserved_pool *pool,
struct anv_state state);
+VkResult anv_state_reserved_array_pool_init(struct anv_state_reserved_array_pool *pool,
+ struct anv_state_pool *parent,
+ uint32_t count, uint32_t size,
+ uint32_t alignment);
+void anv_state_reserved_array_pool_finish(struct anv_state_reserved_array_pool *pool);
+struct anv_state anv_state_reserved_array_pool_alloc(struct anv_state_reserved_array_pool *pool,
+ bool alloc_back);
+struct anv_state anv_state_reserved_array_pool_alloc_index(struct anv_state_reserved_array_pool *pool,
+ unsigned idx);
+uint32_t anv_state_reserved_array_pool_state_index(struct anv_state_reserved_array_pool *pool,
+ struct anv_state state);
+void anv_state_reserved_array_pool_free(struct anv_state_reserved_array_pool *pool,
+ struct anv_state state);
+
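For illustration only (not part of this patch), the new reserved-array-pool API is expected to be driven roughly as follows; the count, size and alignment are placeholders and the wrapper function is invented.

static void
reserved_array_pool_example(struct anv_device *device)
{
   struct anv_state_reserved_array_pool pool;
   if (anv_state_reserved_array_pool_init(&pool, &device->dynamic_state_pool,
                                          1024 /* count */, 64 /* size */,
                                          64 /* alignment */) != VK_SUCCESS)
      return;

   struct anv_state state =
      anv_state_reserved_array_pool_alloc(&pool, false /* alloc_back */);
   /* The index is stable for the lifetime of the pool and can be stored. */
   uint32_t idx = anv_state_reserved_array_pool_state_index(&pool, state);
   (void)idx;

   anv_state_reserved_array_pool_free(&pool, state);
   anv_state_reserved_array_pool_finish(&pool);
}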
VkResult anv_state_table_init(struct anv_state_table *table,
struct anv_device *device,
uint32_t initial_entries);
@@ -811,11 +927,13 @@ struct anv_bo_pool {
struct anv_device *device;
+ enum anv_bo_alloc_flags bo_alloc_flags;
+
struct util_sparse_array_free_list free_list[16];
};
void anv_bo_pool_init(struct anv_bo_pool *pool, struct anv_device *device,
- const char *name);
+ const char *name, enum anv_bo_alloc_flags alloc_flags);
void anv_bo_pool_finish(struct anv_bo_pool *pool);
VkResult anv_bo_pool_alloc(struct anv_bo_pool *pool, uint32_t size,
struct anv_bo **bo_out);
@@ -846,7 +964,8 @@ struct anv_bo_cache {
pthread_mutex_t mutex;
};
-VkResult anv_bo_cache_init(struct anv_bo_cache *cache);
+VkResult anv_bo_cache_init(struct anv_bo_cache *cache,
+ struct anv_device *device);
void anv_bo_cache_finish(struct anv_bo_cache *cache);
struct anv_queue_family {
@@ -854,16 +973,17 @@ struct anv_queue_family {
VkQueueFlags queueFlags;
uint32_t queueCount;
- /* Driver internal information */
- enum drm_i915_gem_engine_class engine_class;
+ enum intel_engine_class engine_class;
};
-#define ANV_MAX_QUEUE_FAMILIES 3
+#define ANV_MAX_QUEUE_FAMILIES 5
struct anv_memory_type {
/* Standard bits passed on to the client */
VkMemoryPropertyFlags propertyFlags;
uint32_t heapIndex;
+ /* Whether this is the descriptor buffer memory type */
+ bool descriptor_buffer;
};
struct anv_memory_heap {
@@ -875,17 +995,25 @@ struct anv_memory_heap {
*
* Align it to 64 bits to make atomic operations faster on 32 bit platforms.
*/
- VkDeviceSize used __attribute__ ((aligned (8)));
+ alignas(8) VkDeviceSize used;
bool is_local_mem;
};
struct anv_memregion {
- struct drm_i915_gem_memory_class_instance region;
+ const struct intel_memory_class_instance *region;
uint64_t size;
uint64_t available;
};
+enum anv_timestamp_capture_type {
+ ANV_TIMESTAMP_CAPTURE_TOP_OF_PIPE,
+ ANV_TIMESTAMP_CAPTURE_END_OF_PIPE,
+ ANV_TIMESTAMP_CAPTURE_AT_CS_STALL,
+ ANV_TIMESTAMP_REWRITE_COMPUTE_WALKER,
+ ANV_TIMESTAMP_REWRITE_INDIRECT_DISPATCH,
+};
+
struct anv_physical_device {
struct vk_physical_device vk;
@@ -894,22 +1022,10 @@ struct anv_physical_device {
struct anv_instance * instance;
char path[20];
- struct {
- uint16_t domain;
- uint8_t bus;
- uint8_t device;
- uint8_t function;
- } pci_info;
struct intel_device_info info;
- /** Amount of "GPU memory" we want to advertise
- *
- * Clearly, this value is bogus since Intel is a UMA architecture. On
- * gfx7 platforms, we are limited by GTT size unless we want to implement
- * fine-grained tracking and GTT splitting. On Broadwell and above we are
- * practically unlimited. However, we will never report more than 3/4 of
- * the total system ram to try and avoid running out of RAM.
- */
- bool supports_48bit_addresses;
+
+ bool video_decode_enabled;
+
struct brw_compiler * compiler;
struct isl_device isl_dev;
struct intel_perf_config * perf;
@@ -918,29 +1034,14 @@ struct anv_physical_device {
* end.
*/
uint32_t n_perf_query_commands;
- int cmd_parser_version;
bool has_exec_async;
bool has_exec_capture;
- bool has_exec_fence;
- bool has_syncobj_wait;
- bool has_syncobj_wait_available;
- bool has_context_priority;
- bool has_context_isolation;
- bool has_thread_submit;
- bool has_mmap_offset;
- bool has_userptr_probe;
+ VkQueueGlobalPriorityKHR max_context_priority;
uint64_t gtt_size;
- bool use_softpin;
bool always_use_bindless;
bool use_call_secondary;
- /** True if we can access buffers using A64 messages */
- bool has_a64_buffer_access;
- /** True if we can use bindless access for images */
- bool has_bindless_images;
- /** True if we can use bindless access for samplers */
- bool has_bindless_samplers;
/** True if we can use timeline semaphores through execbuf */
bool has_exec_timeline;
@@ -951,15 +1052,78 @@ struct anv_physical_device {
*/
bool has_reg_timestamp;
- /** True if this device has implicit AUX
- *
- * If true, CCS is handled as an implicit attachment to the BO rather than
- * as an explicitly bound surface.
+ /** True if we can create protected contexts. */
+ bool has_protected_contexts;
+
+ /** Whether KMD has the ability to create VM objects */
+ bool has_vm_control;
+
+ /** True if we have the means to do sparse binding (e.g., a kernel driver
+ * with a vm_bind ioctl).
*/
- bool has_implicit_ccs;
+ enum anv_sparse_type {
+ ANV_SPARSE_TYPE_NOT_SUPPORTED = 0,
+ ANV_SPARSE_TYPE_VM_BIND,
+ ANV_SPARSE_TYPE_TRTT,
+ ANV_SPARSE_TYPE_FAKE,
+ } sparse_type;
+
+ /** True if HW supports ASTC LDR */
+ bool has_astc_ldr;
+ /** True if denorms in void extents should be flushed to zero */
+ bool flush_astc_ldr_void_extent_denorms;
+ /** True if ASTC LDR is supported via emulation */
+ bool emu_astc_ldr;
+ /* True if FCV optimization should be disabled. */
+ bool disable_fcv;
+ /**/
+ bool uses_ex_bso;
bool always_flush_cache;
+ /** True if application memory is allocated with extra AUX memory
+ *
+ * Applications quite often pool image allocations together in a single
+ * VkDeviceMemory object. On platforms like MTL, the alignment of images
+ * with compression mapped through the AUX translation tables is large:
+ * 1MB. This can create a lot of wasted space in the application memory
+ * objects.
+ *
+ * To work around this problem, we allocate CCS data at the end of
+ * VkDeviceMemory objects. This would not work well for TGL-like platforms
+ * because the AUX translation tables also contain the format of the
+ * images, but on MTL the HW ignores those values. So we can share the AUX
+ * TT entries between different images without problem.
+ *
+ * This should be only true for platforms with AUX TT.
+ */
+ bool alloc_aux_tt_mem;
+
+ /**
+ * True if the descriptor buffers hold one of the following:
+ * - anv_sampled_image_descriptor
+ * - anv_storage_image_descriptor
+ * - anv_address_range_descriptor
+ *
+ * Accessing the descriptors in a bindless fashion from the shader
+ * requires an indirection in the shader: first fetch one of the structures
+ * listed above from the descriptor buffer, then emit the send message to
+ * the fixed function (sampler, dataport, etc.) with the handle fetched
+ * above.
+ *
+ * We need to do things this way prior to DG2 because the bindless surface
+ * state space is limited to 64MB and some applications will allocate more
+ * than what the HW can support. On DG2+ we get 4GB of bindless surface state
+ * and so we can directly reference RENDER_SURFACE_STATE/SAMPLER_STATE
+ * structures instead.
+ */
+ bool indirect_descriptors;
+
+ bool uses_relocs;
+
+ /** Can the platform support cooperative matrices and is it enabled? */
+ bool has_cooperative_matrix;
+
struct {
uint32_t family_count;
struct anv_queue_family families[ANV_MAX_QUEUE_FAMILIES];
@@ -970,17 +1134,104 @@ struct anv_physical_device {
struct anv_memory_type types[VK_MAX_MEMORY_TYPES];
uint32_t heap_count;
struct anv_memory_heap heaps[VK_MAX_MEMORY_HEAPS];
- bool need_clflush;
+#ifdef SUPPORT_INTEL_INTEGRATED_GPUS
+ bool need_flush;
+#endif
+ /** Mask of memory types of normal allocations */
+ uint32_t default_buffer_mem_types;
+ /** Mask of memory types of descriptor buffers */
+ uint32_t desc_buffer_mem_types;
+ /** Mask of memory types of protected buffers/images */
+ uint32_t protected_mem_types;
} memory;
- struct anv_memregion vram;
+ struct {
+ /**
+ * General state pool
+ */
+ struct anv_va_range general_state_pool;
+ /**
+ * Low 32bit heap
+ */
+ struct anv_va_range low_heap;
+ /**
+ * Binding table pool
+ */
+ struct anv_va_range binding_table_pool;
+ /**
+ * Internal surface states for blorp & push descriptors.
+ */
+ struct anv_va_range internal_surface_state_pool;
+ /**
+ * Scratch surfaces (overlaps with internal_surface_state_pool).
+ */
+ struct anv_va_range scratch_surface_state_pool;
+ /**
+ * Bindless surface states (indirectly referred to by indirect
+ * descriptors or for direct descriptors)
+ */
+ struct anv_va_range bindless_surface_state_pool;
+ /**
+ * Dynamic state pool
+ */
+ struct anv_va_range dynamic_state_pool;
+ /**
+ * Sampler state pool
+ */
+ struct anv_va_range sampler_state_pool;
+ /**
+ * Indirect descriptor pool
+ */
+ struct anv_va_range indirect_descriptor_pool;
+ /**
+ * Indirect push descriptor pool
+ */
+ struct anv_va_range indirect_push_descriptor_pool;
+ /**
+ * Instruction state pool
+ */
+ struct anv_va_range instruction_state_pool;
+ /**
+ * Dynamic state pool when using descriptor buffers
+ */
+ struct anv_va_range dynamic_state_db_pool;
+ /**
+ * Descriptor buffers
+ */
+ struct anv_va_range descriptor_buffer_pool;
+ /**
+ * Push descriptor with descriptor buffers
+ */
+ struct anv_va_range push_descriptor_buffer_pool;
+ /**
+ * AUX-TT
+ */
+ struct anv_va_range aux_tt_pool;
+ /**
+ * Client heap
+ */
+ struct anv_va_range high_heap;
+ struct anv_va_range trtt;
+ } va;
+
+ /* Either we have a single vram region and it's all mappable, or we have
+ * both mappable & non-mappable parts. System memory is always available.
+ */
+ struct anv_memregion vram_mappable;
+ struct anv_memregion vram_non_mappable;
struct anv_memregion sys;
uint8_t driver_build_sha1[20];
uint8_t pipeline_cache_uuid[VK_UUID_SIZE];
uint8_t driver_uuid[VK_UUID_SIZE];
uint8_t device_uuid[VK_UUID_SIZE];
+ uint8_t rt_uuid[VK_UUID_SIZE];
- struct disk_cache * disk_cache;
+ /* Maximum amount of scratch space used by all the GRL kernels */
+ uint32_t max_grl_scratch_size;
+
+ struct vk_sync_type sync_syncobj_type;
+ struct vk_sync_timeline_type sync_timeline_type;
+ const struct vk_sync_type * sync_types[4];
struct wsi_device wsi_device;
int local_fd;
@@ -991,229 +1242,609 @@ struct anv_physical_device {
bool has_master;
int64_t master_major;
int64_t master_minor;
- struct drm_i915_query_engine_info * engine_info;
+ struct intel_query_engine_info * engine_info;
- void (*cmd_emit_timestamp)(struct anv_batch *, struct anv_bo *, uint32_t );
+ void (*cmd_emit_timestamp)(struct anv_batch *, struct anv_device *, struct anv_address,
+ enum anv_timestamp_capture_type, void *);
struct intel_measure_device measure_device;
-};
-struct anv_app_info {
- const char* app_name;
- uint32_t app_version;
- const char* engine_name;
- uint32_t engine_version;
- uint32_t api_version;
+ /* Value of PIPELINE_SELECT::PipelineSelection == GPGPU */
+ uint32_t gpgpu_pipeline_value;
};
-struct anv_instance {
- struct vk_instance vk;
+static inline uint32_t
+anv_physical_device_bindless_heap_size(const struct anv_physical_device *device,
+ bool descriptor_buffer)
+{
+ /* Pre-Gfx12.5, the HW bindless surface heap is only 64MB. On Gfx12.5+ it's 4GB,
+ * but we have some workarounds that require 2 heaps to overlap, so the
+ * size is dictated by our VA allocation.
+ */
+ return device->uses_ex_bso ?
+ (descriptor_buffer ?
+ device->va.descriptor_buffer_pool.size :
+ device->va.bindless_surface_state_pool.size) :
+ 64 * 1024 * 1024 /* 64 MiB */;
+}
- bool physical_devices_enumerated;
- struct list_head physical_devices;
+static inline bool
+anv_physical_device_has_vram(const struct anv_physical_device *device)
+{
+ return device->vram_mappable.size > 0;
+}
- bool pipeline_cache_enabled;
+struct anv_instance {
+ struct vk_instance vk;
struct driOptionCache dri_options;
struct driOptionCache available_dri_options;
+
+ int mesh_conv_prim_attrs_to_vert_attrs;
+ bool enable_tbimr;
+ bool external_memory_implicit_sync;
+
+ /**
+ * Workarounds for game bugs.
+ */
+ uint8_t assume_full_subgroups;
+ bool limit_trig_input_range;
+ bool sample_mask_out_opengl_behaviour;
+ bool force_filter_addr_rounding;
+ bool fp64_workaround_enabled;
+ float lower_depth_range_rate;
+ unsigned generated_indirect_threshold;
+ unsigned generated_indirect_ring_threshold;
+ unsigned query_clear_with_blorp_threshold;
+ unsigned query_copy_with_shader_threshold;
+ unsigned force_vk_vendor;
+ bool has_fake_sparse;
+ bool disable_fcv;
+ bool compression_control_enabled;
+
+ /* HW workarounds */
+ bool no_16bit;
+ bool intel_enable_wa_14018912822;
};
VkResult anv_init_wsi(struct anv_physical_device *physical_device);
void anv_finish_wsi(struct anv_physical_device *physical_device);
-struct anv_queue_submit {
- struct anv_cmd_buffer ** cmd_buffers;
- uint32_t cmd_buffer_count;
- uint32_t cmd_buffer_array_length;
-
- uint32_t fence_count;
- uint32_t fence_array_length;
- struct drm_i915_gem_exec_fence * fences;
- uint64_t * fence_values;
-
- uint32_t temporary_semaphore_count;
- uint32_t temporary_semaphore_array_length;
- struct anv_semaphore_impl * temporary_semaphores;
-
- /* Allocated only with non shareable timelines. */
- union {
- struct anv_timeline ** wait_timelines;
- uint32_t * wait_timeline_syncobjs;
- };
- uint32_t wait_timeline_count;
- uint32_t wait_timeline_array_length;
- uint64_t * wait_timeline_values;
-
- struct anv_timeline ** signal_timelines;
- uint32_t signal_timeline_count;
- uint32_t signal_timeline_array_length;
- uint64_t * signal_timeline_values;
-
- int in_fence;
- bool need_out_fence;
- int out_fence;
-
- uint32_t fence_bo_count;
- uint32_t fence_bo_array_length;
- /* An array of struct anv_bo pointers with lower bit used as a flag to
- * signal we will wait on that BO (see anv_(un)pack_ptr).
- */
- uintptr_t * fence_bos;
-
- int perf_query_pass;
- struct anv_query_pool * perf_query_pool;
-
- const VkAllocationCallbacks * alloc;
- VkSystemAllocationScope alloc_scope;
-
- struct anv_bo * simple_bo;
- uint32_t simple_bo_size;
-
- struct list_head link;
-};
-
struct anv_queue {
- struct vk_object_base base;
+ struct vk_queue vk;
struct anv_device * device;
- VkDeviceQueueCreateFlags flags;
const struct anv_queue_family * family;
- uint32_t exec_flags;
+ struct intel_batch_decode_ctx * decoder;
- /* Set once from the device api calls. */
- bool lost_signaled;
+ union {
+ uint32_t exec_flags; /* i915 */
+ uint32_t context_id; /* i915 */
+ uint32_t exec_queue_id; /* Xe */
+ };
- /* Only set once atomically by the queue */
- int lost;
- int error_line;
- const char * error_file;
- char error_msg[80];
+ /** Context/Engine id which executes the companion RCS command buffer */
+ uint32_t companion_rcs_id;
- /*
- * This mutext protects the variables below.
- */
- pthread_mutex_t mutex;
+ /** Synchronization object for debug purposes (DEBUG_SYNC) */
+ struct vk_sync *sync;
- pthread_t thread;
- pthread_cond_t cond;
-
- /*
- * A list of struct anv_queue_submit to be submitted to i915.
+ /** Companion synchronization object
+ *
+ * Vulkan command buffers can be destroyed as soon as their lifecycle moves
+ * from the Pending state to the Invalid/Executable state. This transition
+ * happens when the VkFence/VkSemaphore associated with the completion of
+ * the command buffer work is signaled.
+ *
+ * When we're using a companion command buffer to execute part of another
+ * command buffer, we need to tie the 2 work submissions together to ensure
+ * that, when the associated VkFence/VkSemaphore is signaled, both command
+ * buffers are actually unused by the HW. To do this, we run an empty batch
+ * buffer that we use to signal after both submissions:
+ *
+ * CCS --> main ---> empty_batch (with wait on companion) --> signal
+ * RCS --> companion -|
+ *
+ * When the companion batch completes, it signals companion_sync and allows
+ * empty_batch to execute. Since empty_batch is running on the main engine,
+ * we're guaranteed that upon completion both main & companion command
+ * buffers are not used by the HW anymore.
*/
- struct list_head queued_submits;
+ struct vk_sync *companion_sync;
- /* Set to true to stop the submission thread */
- bool quit;
-};
-
-struct anv_pipeline_cache {
- struct vk_object_base base;
- struct anv_device * device;
- pthread_mutex_t mutex;
-
- struct hash_table * nir_cache;
-
- struct hash_table * cache;
-
- bool external_sync;
+ struct intel_ds_queue ds;
};
struct nir_xfb_info;
struct anv_pipeline_bind_map;
+struct anv_pipeline_sets_layout;
+struct anv_push_descriptor_info;
+enum anv_dynamic_push_bits;
-void anv_pipeline_cache_init(struct anv_pipeline_cache *cache,
- struct anv_device *device,
- bool cache_enabled,
- bool external_sync);
-void anv_pipeline_cache_finish(struct anv_pipeline_cache *cache);
+void anv_device_init_embedded_samplers(struct anv_device *device);
+void anv_device_finish_embedded_samplers(struct anv_device *device);
-struct anv_shader_bin *
-anv_pipeline_cache_search(struct anv_pipeline_cache *cache,
- const void *key, uint32_t key_size);
-struct anv_shader_bin *
-anv_pipeline_cache_upload_kernel(struct anv_pipeline_cache *cache,
- gl_shader_stage stage,
- const void *key_data, uint32_t key_size,
- const void *kernel_data, uint32_t kernel_size,
- const struct brw_stage_prog_data *prog_data,
- uint32_t prog_data_size,
- const struct brw_compile_stats *stats,
- uint32_t num_stats,
- const struct nir_xfb_info *xfb_info,
- const struct anv_pipeline_bind_map *bind_map);
+extern const struct vk_pipeline_cache_object_ops *const anv_cache_import_ops[2];
struct anv_shader_bin *
anv_device_search_for_kernel(struct anv_device *device,
- struct anv_pipeline_cache *cache,
+ struct vk_pipeline_cache *cache,
const void *key_data, uint32_t key_size,
bool *user_cache_bit);
+struct anv_shader_upload_params;
+
struct anv_shader_bin *
anv_device_upload_kernel(struct anv_device *device,
- struct anv_pipeline_cache *cache,
- gl_shader_stage stage,
- const void *key_data, uint32_t key_size,
- const void *kernel_data, uint32_t kernel_size,
- const struct brw_stage_prog_data *prog_data,
- uint32_t prog_data_size,
- const struct brw_compile_stats *stats,
- uint32_t num_stats,
- const struct nir_xfb_info *xfb_info,
- const struct anv_pipeline_bind_map *bind_map);
+ struct vk_pipeline_cache *cache,
+ const struct anv_shader_upload_params *params);
struct nir_shader;
struct nir_shader_compiler_options;
struct nir_shader *
anv_device_search_for_nir(struct anv_device *device,
- struct anv_pipeline_cache *cache,
+ struct vk_pipeline_cache *cache,
const struct nir_shader_compiler_options *nir_options,
unsigned char sha1_key[20],
void *mem_ctx);
void
anv_device_upload_nir(struct anv_device *device,
- struct anv_pipeline_cache *cache,
+ struct vk_pipeline_cache *cache,
const struct nir_shader *nir,
unsigned char sha1_key[20]);
-struct anv_address {
+void
+anv_load_fp64_shader(struct anv_device *device);
+
+/**
+ * This enum tracks the various HW instructions that hold graphics state
+ * needing to be reprogrammed. Some instructions are grouped together as they
+ * pretty much need to be emitted together (like 3DSTATE_URB_*).
+ *
+ * Not all bits apply to all platforms. We build a dirty state based on
+ * the enabled extensions & device generation in anv_device.
+ */
+enum anv_gfx_state_bits {
+ /* Pipeline states */
+ ANV_GFX_STATE_URB, /* All legacy stages, including mesh */
+ ANV_GFX_STATE_VF_STATISTICS,
+ ANV_GFX_STATE_VF_SGVS,
+ ANV_GFX_STATE_VF_SGVS_2,
+ ANV_GFX_STATE_VF_SGVS_VI, /* 3DSTATE_VERTEX_ELEMENTS for sgvs elements */
+ ANV_GFX_STATE_VF_SGVS_INSTANCING, /* 3DSTATE_VF_INSTANCING for sgvs elements */
+ ANV_GFX_STATE_PRIMITIVE_REPLICATION,
+ ANV_GFX_STATE_SBE,
+ ANV_GFX_STATE_SBE_SWIZ,
+ ANV_GFX_STATE_SO_DECL_LIST,
+ ANV_GFX_STATE_VS,
+ ANV_GFX_STATE_HS,
+ ANV_GFX_STATE_DS,
+ ANV_GFX_STATE_GS,
+ ANV_GFX_STATE_PS,
+ ANV_GFX_STATE_SBE_MESH,
+ ANV_GFX_STATE_CLIP_MESH,
+ ANV_GFX_STATE_MESH_CONTROL,
+ ANV_GFX_STATE_MESH_SHADER,
+ ANV_GFX_STATE_MESH_DISTRIB,
+ ANV_GFX_STATE_TASK_CONTROL,
+ ANV_GFX_STATE_TASK_SHADER,
+ ANV_GFX_STATE_TASK_REDISTRIB,
+ /* Dynamic states */
+ ANV_GFX_STATE_BLEND_STATE, /* Just the dynamic state structure */
+ ANV_GFX_STATE_BLEND_STATE_PTR, /* The pointer to the dynamic state */
+ ANV_GFX_STATE_CLIP,
+ ANV_GFX_STATE_CC_STATE,
+ ANV_GFX_STATE_CC_STATE_PTR,
+ ANV_GFX_STATE_CPS,
+ ANV_GFX_STATE_DEPTH_BOUNDS,
+ ANV_GFX_STATE_INDEX_BUFFER,
+ ANV_GFX_STATE_LINE_STIPPLE,
+ ANV_GFX_STATE_MULTISAMPLE,
+ ANV_GFX_STATE_PS_BLEND,
+ ANV_GFX_STATE_RASTER,
+ ANV_GFX_STATE_SAMPLE_MASK,
+ ANV_GFX_STATE_SAMPLE_PATTERN,
+ ANV_GFX_STATE_SCISSOR,
+ ANV_GFX_STATE_SF,
+ ANV_GFX_STATE_STREAMOUT,
+ ANV_GFX_STATE_TE,
+ ANV_GFX_STATE_VERTEX_INPUT,
+ ANV_GFX_STATE_VF,
+ ANV_GFX_STATE_VF_TOPOLOGY,
+ ANV_GFX_STATE_VFG,
+ ANV_GFX_STATE_VIEWPORT_CC,
+ ANV_GFX_STATE_VIEWPORT_CC_PTR,
+ ANV_GFX_STATE_VIEWPORT_SF_CLIP,
+ ANV_GFX_STATE_WM,
+ ANV_GFX_STATE_WM_DEPTH_STENCIL,
+ ANV_GFX_STATE_PS_EXTRA,
+ ANV_GFX_STATE_PMA_FIX, /* Fake state to implement workaround */
+ ANV_GFX_STATE_WA_18019816803, /* Fake state to implement workaround */
+ ANV_GFX_STATE_TBIMR_TILE_PASS_INFO,
+
+ ANV_GFX_STATE_MAX,
+};
+
+const char *anv_gfx_state_bit_to_str(enum anv_gfx_state_bits state);
+
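To make the dirty-state tracking described above concrete, here is a small sketch (not part of this patch) using the util/bitset.h macros; the function and the chosen bits are only illustrative.

static void
gfx_dirty_bits_example(void)
{
   BITSET_DECLARE(dirty, ANV_GFX_STATE_MAX) = { 0 };

   /* A dynamic-state change marks the matching instruction for re-emission. */
   BITSET_SET(dirty, ANV_GFX_STATE_SCISSOR);
   BITSET_SET(dirty, ANV_GFX_STATE_VIEWPORT_CC);

   /* At draw time only the instructions whose bit is set are re-emitted. */
   unsigned i;
   BITSET_FOREACH_SET(i, dirty, ANV_GFX_STATE_MAX) {
      const char *name = anv_gfx_state_bit_to_str((enum anv_gfx_state_bits)i);
      (void)name; /* ...emit the corresponding 3DSTATE_* packet... */
   }
}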
+/* This structure tracks the values to program in the HW instructions
+ * corresponding to dynamic states of the Vulkan API. Only fields that need to
+ * be reemitted outside of the VkPipeline object are tracked here.
+ */
+struct anv_gfx_dynamic_state {
+ /* 3DSTATE_BLEND_STATE_POINTERS */
+ struct {
+ bool AlphaToCoverageEnable;
+ bool AlphaToOneEnable;
+ bool IndependentAlphaBlendEnable;
+ struct {
+ bool WriteDisableAlpha;
+ bool WriteDisableRed;
+ bool WriteDisableGreen;
+ bool WriteDisableBlue;
+
+ uint32_t LogicOpFunction;
+ bool LogicOpEnable;
+
+ bool ColorBufferBlendEnable;
+ uint32_t ColorClampRange;
+ bool PreBlendColorClampEnable;
+ bool PostBlendColorClampEnable;
+ uint32_t SourceBlendFactor;
+ uint32_t DestinationBlendFactor;
+ uint32_t ColorBlendFunction;
+ uint32_t SourceAlphaBlendFactor;
+ uint32_t DestinationAlphaBlendFactor;
+ uint32_t AlphaBlendFunction;
+ } rts[MAX_RTS];
+
+ struct anv_state state;
+ } blend;
+
+ /* 3DSTATE_CC_STATE_POINTERS */
+ struct {
+ float BlendConstantColorRed;
+ float BlendConstantColorGreen;
+ float BlendConstantColorBlue;
+ float BlendConstantColorAlpha;
+
+ struct anv_state state;
+ } cc;
+
+ /* 3DSTATE_CLIP */
+ struct {
+ uint32_t APIMode;
+ uint32_t ViewportXYClipTestEnable;
+ uint32_t MaximumVPIndex;
+ uint32_t TriangleStripListProvokingVertexSelect;
+ uint32_t LineStripListProvokingVertexSelect;
+ uint32_t TriangleFanProvokingVertexSelect;
+ } clip;
+
+ /* 3DSTATE_CPS/3DSTATE_CPS_POINTERS */
+ struct {
+ /* Gfx11 */
+ uint32_t CoarsePixelShadingMode;
+ float MinCPSizeX;
+ float MinCPSizeY;
+ /* Gfx12+ */
+ uint32_t CoarsePixelShadingStateArrayPointer;
+ } cps;
+
+ /* 3DSTATE_DEPTH_BOUNDS */
+ struct {
+ bool DepthBoundsTestEnable;
+ float DepthBoundsTestMinValue;
+ float DepthBoundsTestMaxValue;
+ } db;
+
+ /* 3DSTATE_GS */
+ struct {
+ uint32_t ReorderMode;
+ } gs;
+
+ /* 3DSTATE_LINE_STIPPLE */
+ struct {
+ uint32_t LineStipplePattern;
+ float LineStippleInverseRepeatCount;
+ uint32_t LineStippleRepeatCount;
+ } ls;
+
+ /* 3DSTATE_MULTISAMPLE */
+ struct {
+ uint32_t NumberofMultisamples;
+ } ms;
+
+ /* 3DSTATE_PS */
+ struct {
+ uint32_t PositionXYOffsetSelect;
+
+ uint32_t KernelStartPointer0;
+ uint32_t KernelStartPointer1;
+ uint32_t KernelStartPointer2;
+
+ uint32_t DispatchGRFStartRegisterForConstantSetupData0;
+ uint32_t DispatchGRFStartRegisterForConstantSetupData1;
+ uint32_t DispatchGRFStartRegisterForConstantSetupData2;
+
+ /* Pre-Gfx20 only */
+ bool _8PixelDispatchEnable;
+ bool _16PixelDispatchEnable;
+ bool _32PixelDispatchEnable;
+
+ /* Gfx20+ only */
+ bool Kernel0Enable;
+ bool Kernel1Enable;
+ uint32_t Kernel0SIMDWidth;
+ uint32_t Kernel1SIMDWidth;
+ uint32_t Kernel0PolyPackingPolicy;
+ } ps;
+
+ /* 3DSTATE_PS_EXTRA */
+ struct {
+ bool PixelShaderIsPerSample;
+ bool PixelShaderKillsPixel;
+ bool PixelShaderIsPerCoarsePixel;
+ bool EnablePSDependencyOnCPsizeChange;
+ } ps_extra;
+
+ /* 3DSTATE_PS_BLEND */
+ struct {
+ bool HasWriteableRT;
+ bool ColorBufferBlendEnable;
+ uint32_t SourceAlphaBlendFactor;
+ uint32_t DestinationAlphaBlendFactor;
+ uint32_t SourceBlendFactor;
+ uint32_t DestinationBlendFactor;
+ bool AlphaTestEnable;
+ bool IndependentAlphaBlendEnable;
+ bool AlphaToCoverageEnable;
+ } ps_blend;
+
+ /* 3DSTATE_RASTER */
+ struct {
+ uint32_t APIMode;
+ bool DXMultisampleRasterizationEnable;
+ bool AntialiasingEnable;
+ uint32_t CullMode;
+ uint32_t FrontWinding;
+ bool GlobalDepthOffsetEnableSolid;
+ bool GlobalDepthOffsetEnableWireframe;
+ bool GlobalDepthOffsetEnablePoint;
+ float GlobalDepthOffsetConstant;
+ float GlobalDepthOffsetScale;
+ float GlobalDepthOffsetClamp;
+ uint32_t FrontFaceFillMode;
+ uint32_t BackFaceFillMode;
+ bool ViewportZFarClipTestEnable;
+ bool ViewportZNearClipTestEnable;
+ bool ConservativeRasterizationEnable;
+ } raster;
+
+ /* 3DSTATE_SCISSOR_STATE_POINTERS */
+ struct {
+ uint32_t count;
+ struct {
+ uint32_t ScissorRectangleYMin;
+ uint32_t ScissorRectangleXMin;
+ uint32_t ScissorRectangleYMax;
+ uint32_t ScissorRectangleXMax;
+ } elem[MAX_SCISSORS];
+ } scissor;
+
+ /* 3DSTATE_SF */
+ struct {
+ float LineWidth;
+ uint32_t TriangleStripListProvokingVertexSelect;
+ uint32_t LineStripListProvokingVertexSelect;
+ uint32_t TriangleFanProvokingVertexSelect;
+ bool LegacyGlobalDepthBiasEnable;
+ } sf;
+
+ /* 3DSTATE_STREAMOUT */
+ struct {
+ bool RenderingDisable;
+ uint32_t RenderStreamSelect;
+ uint32_t ReorderMode;
+ uint32_t ForceRendering;
+ } so;
+
+ /* 3DSTATE_SAMPLE_MASK */
+ struct {
+ uint32_t SampleMask;
+ } sm;
+
+ /* 3DSTATE_TE */
+ struct {
+ uint32_t OutputTopology;
+ } te;
+
+ /* 3DSTATE_VF */
+ struct {
+ bool IndexedDrawCutIndexEnable;
+ uint32_t CutIndex;
+ } vf;
+
+ /* 3DSTATE_VFG */
+ struct {
+ uint32_t DistributionMode;
+ bool ListCutIndexEnable;
+ } vfg;
+
+ /* 3DSTATE_VF_TOPOLOGY */
+ struct {
+ uint32_t PrimitiveTopologyType;
+ } vft;
+
+ /* 3DSTATE_VIEWPORT_STATE_POINTERS_CC */
+ struct {
+ uint32_t count;
+ struct {
+ float MinimumDepth;
+ float MaximumDepth;
+ } elem[MAX_VIEWPORTS];
+
+ struct anv_state state;
+ } vp_cc;
+
+ /* 3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP */
+ struct {
+ uint32_t count;
+ struct {
+ float ViewportMatrixElementm00;
+ float ViewportMatrixElementm11;
+ float ViewportMatrixElementm22;
+ float ViewportMatrixElementm30;
+ float ViewportMatrixElementm31;
+ float ViewportMatrixElementm32;
+ float XMinClipGuardband;
+ float XMaxClipGuardband;
+ float YMinClipGuardband;
+ float YMaxClipGuardband;
+ float XMinViewPort;
+ float XMaxViewPort;
+ float YMinViewPort;
+ float YMaxViewPort;
+ } elem[MAX_VIEWPORTS];
+ } vp_sf_clip;
+
+ /* 3DSTATE_WM */
+ struct {
+ uint32_t ForceThreadDispatchEnable;
+ bool LineStippleEnable;
+ uint32_t BarycentricInterpolationMode;
+ } wm;
+
+ /* 3DSTATE_WM_DEPTH_STENCIL */
+ struct {
+ bool DoubleSidedStencilEnable;
+ uint32_t StencilTestMask;
+ uint32_t StencilWriteMask;
+ uint32_t BackfaceStencilTestMask;
+ uint32_t BackfaceStencilWriteMask;
+ uint32_t StencilReferenceValue;
+ uint32_t BackfaceStencilReferenceValue;
+ bool DepthTestEnable;
+ bool DepthBufferWriteEnable;
+ uint32_t DepthTestFunction;
+ bool StencilTestEnable;
+ bool StencilBufferWriteEnable;
+ uint32_t StencilFailOp;
+ uint32_t StencilPassDepthPassOp;
+ uint32_t StencilPassDepthFailOp;
+ uint32_t StencilTestFunction;
+ uint32_t BackfaceStencilFailOp;
+ uint32_t BackfaceStencilPassDepthPassOp;
+ uint32_t BackfaceStencilPassDepthFailOp;
+ uint32_t BackfaceStencilTestFunction;
+ } ds;
+
+ /* 3DSTATE_TBIMR_TILE_PASS_INFO */
+ struct {
+ unsigned TileRectangleHeight;
+ unsigned TileRectangleWidth;
+ unsigned VerticalTileCount;
+ unsigned HorizontalTileCount;
+ unsigned TBIMRBatchSize;
+ unsigned TileBoxCheck;
+ } tbimr;
+ bool use_tbimr;
+
+ bool pma_fix;
+
+ BITSET_DECLARE(dirty, ANV_GFX_STATE_MAX);
+};
+
+enum anv_internal_kernel_name {
+ ANV_INTERNAL_KERNEL_GENERATED_DRAWS,
+ ANV_INTERNAL_KERNEL_COPY_QUERY_RESULTS_COMPUTE,
+ ANV_INTERNAL_KERNEL_COPY_QUERY_RESULTS_FRAGMENT,
+ ANV_INTERNAL_KERNEL_MEMCPY_COMPUTE,
+
+ ANV_INTERNAL_KERNEL_COUNT,
+};
+
+enum anv_rt_bvh_build_method {
+ ANV_BVH_BUILD_METHOD_TRIVIAL,
+ ANV_BVH_BUILD_METHOD_NEW_SAH,
+};
+
+struct anv_device_astc_emu {
+ struct vk_texcompress_astc_state *texcompress;
+
+ /* for flush_astc_ldr_void_extent_denorms */
+ simple_mtx_t mutex;
+ VkDescriptorSetLayout ds_layout;
+ VkPipelineLayout pipeline_layout;
+ VkPipeline pipeline;
+};
+
+struct anv_trtt_batch_bo {
struct anv_bo *bo;
- int64_t offset;
+ uint32_t size;
+
+ /* Once device->trtt.timeline_handle signals timeline_val as complete we
+ * can free this struct and its members.
+ */
+ uint64_t timeline_val;
+
+ /* Part of device->trtt.in_flight_batches. */
+ struct list_head link;
};
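A hedged sketch (not part of this patch) of the lifetime rule above: once the TR-TT timeline has signaled a given value, every in-flight batch at or below that value can be reclaimed with anv_trtt_batch_bo_free() (added further down in this header). The function name is invented, locking of device->trtt.mutex is elided, and signaled_value is assumed to come from querying the timeline syncobj.

static void
trtt_reclaim_example(struct anv_device *device, uint64_t signaled_value)
{
   list_for_each_entry_safe(struct anv_trtt_batch_bo, trtt_bbo,
                            &device->trtt.in_flight_batches, link) {
      if (trtt_bbo->timeline_val <= signaled_value)
         anv_trtt_batch_bo_free(device, trtt_bbo);
   }
}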
struct anv_device {
struct vk_device vk;
struct anv_physical_device * physical;
- struct intel_device_info info;
+ const struct intel_device_info * info;
+ const struct anv_kmd_backend * kmd_backend;
struct isl_device isl_dev;
- int context_id;
+ union {
+ uint32_t context_id; /* i915 */
+ uint32_t vm_id; /* Xe */
+ };
int fd;
- bool can_chain_batches;
- bool robust_buffer_access;
- bool has_thread_submit;
pthread_mutex_t vma_mutex;
struct util_vma_heap vma_lo;
- struct util_vma_heap vma_cva;
struct util_vma_heap vma_hi;
+ struct util_vma_heap vma_desc;
+ struct util_vma_heap vma_desc_buf;
+ struct util_vma_heap vma_samplers;
+ struct util_vma_heap vma_trtt;
/** List of all anv_device_memory objects */
struct list_head memory_objects;
+ /** List of anv_image objects with a private binding for implicit CCS */
+ struct list_head image_private_objects;
+
+ /** Memory pool for batch buffers */
struct anv_bo_pool batch_bo_pool;
+ /** Memory pool for utrace timestamp buffers */
+ struct anv_bo_pool utrace_bo_pool;
+ /** Memory pool for BVH build buffers */
+ struct anv_bo_pool bvh_bo_pool;
struct anv_bo_cache bo_cache;
struct anv_state_pool general_state_pool;
+ struct anv_state_pool aux_tt_pool;
struct anv_state_pool dynamic_state_pool;
+ struct anv_state_pool dynamic_state_db_pool;
struct anv_state_pool instruction_state_pool;
struct anv_state_pool binding_table_pool;
- struct anv_state_pool surface_state_pool;
+ struct anv_state_pool scratch_surface_state_pool;
+ struct anv_state_pool internal_surface_state_pool;
+ struct anv_state_pool bindless_surface_state_pool;
+ struct anv_state_pool indirect_push_descriptor_pool;
+ struct anv_state_pool push_descriptor_buffer_pool;
struct anv_state_reserved_pool custom_border_colors;
+ struct anv_state_reserved_array_pool custom_border_colors_db;
/** BO used for various workarounds
*
@@ -1227,31 +1858,100 @@ struct anv_device {
struct anv_bo * workaround_bo;
struct anv_address workaround_address;
+ /**
+ * Workarounds for game bugs.
+ */
+ struct {
+ struct set * doom64_images;
+ } workarounds;
+
struct anv_bo * trivial_batch_bo;
struct anv_state null_surface_state;
- struct anv_pipeline_cache default_pipeline_cache;
- struct blorp_context blorp;
+ /**
+ * NULL surface state copy stored in host memory for use as a fast
+ * memcpy() source.
+ */
+ char host_null_surface_state[ANV_SURFACE_STATE_SIZE];
+
+ struct vk_pipeline_cache * default_pipeline_cache;
+ struct vk_pipeline_cache * internal_cache;
+
+ struct {
+ struct blorp_context context;
+ struct {
+ struct anv_state state;
+ struct anv_state db_state;
+ } dynamic_states[BLORP_DYNAMIC_STATE_COUNT];
+ } blorp;
struct anv_state border_colors;
+ struct anv_state border_colors_db;
struct anv_state slice_hash;
+ struct anv_state slice_hash_db;
+
+ /** An array of CPS_STATE structures grouped by MAX_VIEWPORTS elements
+ *
+ * We need to emit CPS_STATE structures for each viewport accessible by a
+ * pipeline. So rather than write many identical CPS_STATE structures
+ * dynamically, we can enumerate all possible combinaisons and then just
+ * emit a 3DSTATE_CPS_POINTERS instruction with the right offset into this
+ * array.
+ */
+ struct anv_state cps_states;
+ struct anv_state cps_states_db;
uint32_t queue_count;
struct anv_queue * queues;
struct anv_scratch_pool scratch_pool;
struct anv_bo *rt_scratch_bos[16];
+ struct anv_bo *btd_fifo_bo;
+ struct anv_address rt_uuid_addr;
+
+ /** A pre-packed VERTEX_ELEMENT_STATE feeding 0s to the VS stage
+ *
+ * For use when a pipeline has no VS input
+ */
+ uint32_t empty_vs_input[2];
+
+ bool robust_buffer_access;
+
+ uint32_t protected_session_id;
+
+ /** Shadow ray query BO
+ *
+ * The ray_query_bo only holds the current ray being traced. When using
+ * more than 1 ray query per thread, we cannot fit all the queries in
+ * there, so we need another buffer to hold query data that is not
+ * currently being used by the HW for tracing, similar to a scratch space.
+ *
+ * The size of the shadow buffer depends on the number of queries per
+ * shader.
+ */
+ struct anv_bo *ray_query_shadow_bos[16];
+ /** Ray query buffer used to communicate with the HW unit.
+ */
+ struct anv_bo *ray_query_bo;
struct anv_shader_bin *rt_trampoline;
struct anv_shader_bin *rt_trivial_return;
+ enum anv_rt_bvh_build_method bvh_build_method;
+
+ /** Draw generation shader
+ *
+ * Generates direct draw calls out of indirect parameters. Used to
+ * work around slowness with indirect draw calls.
+ */
+ struct anv_shader_bin *internal_kernels[ANV_INTERNAL_KERNEL_COUNT];
+ const struct intel_l3_config *internal_kernels_l3_config;
+
pthread_mutex_t mutex;
pthread_cond_t queue_submit;
- int _lost;
- int lost_reported;
- struct intel_batch_decode_ctx decoder_ctx;
+ struct intel_batch_decode_ctx decoder[ANV_MAX_QUEUE_FAMILIES];
/*
* When decoding a anv_cmd_buffer, we might need to search for BOs through
* the cmd_buffer's list.
@@ -1266,62 +1966,138 @@ struct anv_device {
const struct intel_l3_config *l3_config;
struct intel_debug_block_frame *debug_frame_desc;
-};
-#if defined(GFX_VERx10) && GFX_VERx10 >= 90
-#define ANV_ALWAYS_SOFTPIN true
-#else
-#define ANV_ALWAYS_SOFTPIN false
+ struct intel_ds_device ds;
+
+ nir_shader *fp64_nir;
+
+ uint32_t draw_call_count;
+ struct anv_state breakpoint;
+#if DETECT_OS_ANDROID
+ struct u_gralloc *u_gralloc;
#endif
-static inline bool
-anv_use_softpin(const struct anv_physical_device *pdevice)
+ /** Precompute all dirty graphics bits
+ *
+ * Depending on platforms, some of the dirty bits don't apply (for example
+ * 3DSTATE_PRIMITIVE_REPLICATION is only Gfx12.0+). Disabling some
+ * extensions like Mesh shaders also allow us to avoid emitting any
+ * mesh/task related instructions (we only initialize them once at device
+ * initialization).
+ */
+ BITSET_DECLARE(gfx_dirty_state, ANV_GFX_STATE_MAX);
+
+ /*
+ * Command pool for companion RCS command buffer.
+ */
+ VkCommandPool companion_rcs_cmd_pool;
+
+ struct anv_trtt {
+ pthread_mutex_t mutex;
+
+ /* Sometimes we need to run batches from places where we don't have a
+ * queue coming from the API, so we use this.
+ */
+ struct anv_queue *queue;
+
+ /* There's only one L3 table, so if l3_addr is zero that means we
+ * didn't initialize the TR-TT context yet (i.e., we're not using TR-TT
+ * yet in this context).
+ */
+ uint64_t l3_addr;
+
+ /* We don't want to access the page tables from the CPU, so just
+ * maintain a mirror that we can use.
+ */
+ uint64_t *l3_mirror;
+ uint64_t *l2_mirror;
+
+ /* We keep a dynamic list of page table bos, and each bo can store
+ * multiple page tables.
+ */
+ struct anv_bo **page_table_bos;
+ int num_page_table_bos;
+ int page_table_bos_capacity;
+
+ /* These are used to keep track of space available for more page tables
+ * within a bo.
+ */
+ struct anv_bo *cur_page_table_bo;
+ uint64_t next_page_table_bo_offset;
+
+ /* Timeline syncobj used to track completion of the TR-TT batch BOs. */
+ uint32_t timeline_handle;
+ uint64_t timeline_val;
+
+ /* List of struct anv_trtt_batch_bo batches that are in flight and can
+ * be freed once their timeline gets signaled.
+ */
+ struct list_head in_flight_batches;
+ } trtt;
+
+ /* Number of sparse resources that currently exist. This is used for a
+ * workaround that makes every memoryBarrier flush more things than it
+ * should. Some workloads create and then immediately destroy sparse
+ * resources when they start, so just counting if a sparse resource was
+ * ever created is not enough.
+ */
+ uint32_t num_sparse_resources;
+
+ struct anv_device_astc_emu astc_emu;
+
+ struct intel_bind_timeline bind_timeline; /* Xe only */
+
+ struct {
+ simple_mtx_t mutex;
+ struct hash_table *map;
+ } embedded_samplers;
+};
+
+static inline uint32_t
+anv_get_first_render_queue_index(struct anv_physical_device *pdevice)
{
-#if defined(GFX_VERx10) && GFX_VERx10 >= 90
- /* Sky Lake and later always uses softpin */
- assert(pdevice->use_softpin);
- return true;
-#elif defined(GFX_VERx10) && GFX_VERx10 < 80
- /* Haswell and earlier never use softpin */
- assert(!pdevice->use_softpin);
- return false;
-#else
- /* If we don't have a GFX_VERx10 #define, we need to look at the physical
- * device. Also, for GFX version 8, we need to look at the physical
- * device because Broadwell softpins but Cherryview doesn't.
- */
- return pdevice->use_softpin;
-#endif
+ assert(pdevice != NULL);
+
+ for (uint32_t i = 0; i < pdevice->queue.family_count; i++) {
+ if (pdevice->queue.families[i].queueFlags & VK_QUEUE_GRAPHICS_BIT) {
+ return i;
+ }
+ }
+
+ unreachable("Graphics capable queue family not found");
}
-static inline struct anv_instance *
-anv_device_instance_or_null(const struct anv_device *device)
+static inline struct anv_state
+anv_binding_table_pool_alloc(struct anv_device *device)
{
- return device ? device->physical->instance : NULL;
+ return anv_state_pool_alloc(&device->binding_table_pool,
+ device->binding_table_pool.block_size, 0);
}
-static inline struct anv_state_pool *
-anv_binding_table_pool(struct anv_device *device)
+static inline void
+anv_binding_table_pool_free(struct anv_device *device, struct anv_state state)
{
- if (anv_use_softpin(device->physical))
- return &device->binding_table_pool;
- else
- return &device->surface_state_pool;
+ anv_state_pool_free(&device->binding_table_pool, state);
}
static inline struct anv_state
-anv_binding_table_pool_alloc(struct anv_device *device)
+anv_null_surface_state_for_binding_table(struct anv_device *device)
{
- if (anv_use_softpin(device->physical))
- return anv_state_pool_alloc(&device->binding_table_pool,
- device->binding_table_pool.block_size, 0);
- else
- return anv_state_pool_alloc_back(&device->surface_state_pool);
+ struct anv_state state = device->null_surface_state;
+ if (device->physical->indirect_descriptors) {
+ state.offset += device->physical->va.bindless_surface_state_pool.addr -
+ device->physical->va.internal_surface_state_pool.addr;
+ }
+ return state;
}
-static inline void
-anv_binding_table_pool_free(struct anv_device *device, struct anv_state state) {
- anv_state_pool_free(anv_binding_table_pool(device), state);
+static inline struct anv_state
+anv_bindless_state_for_binding_table(struct anv_device *device,
+ struct anv_state state)
+{
+ state.offset += device->physical->va.bindless_surface_state_pool.addr -
+ device->physical->va.internal_surface_state_pool.addr;
+ return state;
}
static inline uint32_t
@@ -1329,92 +2105,34 @@ anv_mocs(const struct anv_device *device,
const struct anv_bo *bo,
isl_surf_usage_flags_t usage)
{
- return isl_mocs(&device->isl_dev, usage, bo && bo->is_external);
+ return isl_mocs(&device->isl_dev, usage, bo && anv_bo_is_external(bo));
}
-void anv_device_init_blorp(struct anv_device *device);
-void anv_device_finish_blorp(struct anv_device *device);
-
-void _anv_device_report_lost(struct anv_device *device);
-VkResult _anv_device_set_lost(struct anv_device *device,
- const char *file, int line,
- const char *msg, ...)
- anv_printflike(4, 5);
-VkResult _anv_queue_set_lost(struct anv_queue *queue,
- const char *file, int line,
- const char *msg, ...)
- anv_printflike(4, 5);
-#define anv_device_set_lost(dev, ...) \
- _anv_device_set_lost(dev, __FILE__, __LINE__, __VA_ARGS__)
-#define anv_queue_set_lost(queue, ...) \
- (queue)->device->has_thread_submit ? \
- _anv_queue_set_lost(queue, __FILE__, __LINE__, __VA_ARGS__) : \
- _anv_device_set_lost(queue->device, __FILE__, __LINE__, __VA_ARGS__)
-
-static inline bool
-anv_device_is_lost(struct anv_device *device)
+static inline uint32_t
+anv_mocs_for_address(const struct anv_device *device,
+ struct anv_address *addr)
{
- int lost = p_atomic_read(&device->_lost);
- if (unlikely(lost && !device->lost_reported))
- _anv_device_report_lost(device);
- return lost;
+ return anv_mocs(device, addr->bo, 0);
}
-VkResult anv_device_query_status(struct anv_device *device);
-
-
-enum anv_bo_alloc_flags {
- /** Specifies that the BO must have a 32-bit address
- *
- * This is the opposite of EXEC_OBJECT_SUPPORTS_48B_ADDRESS.
- */
- ANV_BO_ALLOC_32BIT_ADDRESS = (1 << 0),
-
- /** Specifies that the BO may be shared externally */
- ANV_BO_ALLOC_EXTERNAL = (1 << 1),
-
- /** Specifies that the BO should be mapped */
- ANV_BO_ALLOC_MAPPED = (1 << 2),
-
- /** Specifies that the BO should be snooped so we get coherency */
- ANV_BO_ALLOC_SNOOPED = (1 << 3),
-
- /** Specifies that the BO should be captured in error states */
- ANV_BO_ALLOC_CAPTURE = (1 << 4),
-
- /** Specifies that the BO will have an address assigned by the caller
- *
- * Such BOs do not exist in any VMA heap.
- */
- ANV_BO_ALLOC_FIXED_ADDRESS = (1 << 5),
-
- /** Enables implicit synchronization on the BO
- *
- * This is the opposite of EXEC_OBJECT_ASYNC.
- */
- ANV_BO_ALLOC_IMPLICIT_SYNC = (1 << 6),
-
- /** Enables implicit synchronization on the BO
- *
- * This is equivalent to EXEC_OBJECT_WRITE.
- */
- ANV_BO_ALLOC_IMPLICIT_WRITE = (1 << 7),
-
- /** Has an address which is visible to the client */
- ANV_BO_ALLOC_CLIENT_VISIBLE_ADDRESS = (1 << 8),
-
- /** This buffer has implicit CCS data attached to it */
- ANV_BO_ALLOC_IMPLICIT_CCS = (1 << 9),
-
- /** This buffer is allocated from local memory */
- ANV_BO_ALLOC_LOCAL_MEM = (1 << 10),
-};
+void anv_device_init_blorp(struct anv_device *device);
+void anv_device_finish_blorp(struct anv_device *device);
VkResult anv_device_alloc_bo(struct anv_device *device,
const char *name, uint64_t size,
enum anv_bo_alloc_flags alloc_flags,
uint64_t explicit_address,
struct anv_bo **bo);
+VkResult anv_device_map_bo(struct anv_device *device,
+ struct anv_bo *bo,
+ uint64_t offset,
+ size_t size,
+ void *placed_addr,
+ void **map_out);
+VkResult anv_device_unmap_bo(struct anv_device *device,
+ struct anv_bo *bo,
+ void *map, size_t map_size,
+ bool replace);
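For illustration only (not part of this patch), a CPU upload through the new map/unmap entry points could look like this; passing NULL as the placed address is assumed to request a regular mapping, and the wrapper is invented.

static VkResult
bo_write_example(struct anv_device *device, struct anv_bo *bo,
                 const void *data, size_t size)
{
   void *map;
   VkResult result = anv_device_map_bo(device, bo, 0 /* offset */, size,
                                       NULL /* placed_addr */, &map);
   if (result != VK_SUCCESS)
      return result;

   memcpy(map, data, size);

   return anv_device_unmap_bo(device, bo, map, size, false /* replace */);
}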
VkResult anv_device_import_bo_from_host_ptr(struct anv_device *device,
void *host_ptr, uint32_t size,
enum anv_bo_alloc_flags alloc_flags,
@@ -1426,128 +2144,124 @@ VkResult anv_device_import_bo(struct anv_device *device, int fd,
struct anv_bo **bo);
VkResult anv_device_export_bo(struct anv_device *device,
struct anv_bo *bo, int *fd_out);
+VkResult anv_device_get_bo_tiling(struct anv_device *device,
+ struct anv_bo *bo,
+ enum isl_tiling *tiling_out);
+VkResult anv_device_set_bo_tiling(struct anv_device *device,
+ struct anv_bo *bo,
+ uint32_t row_pitch_B,
+ enum isl_tiling tiling);
void anv_device_release_bo(struct anv_device *device,
struct anv_bo *bo);
+static inline void anv_device_set_physical(struct anv_device *device,
+ struct anv_physical_device *physical_device)
+{
+ device->physical = physical_device;
+ device->info = &physical_device->info;
+ device->isl_dev = physical_device->isl_dev;
+}
+
static inline struct anv_bo *
anv_device_lookup_bo(struct anv_device *device, uint32_t gem_handle)
{
return util_sparse_array_get(&device->bo_cache.bo_map, gem_handle);
}
-VkResult anv_device_bo_busy(struct anv_device *device, struct anv_bo *bo);
VkResult anv_device_wait(struct anv_device *device, struct anv_bo *bo,
int64_t timeout);
VkResult anv_queue_init(struct anv_device *device, struct anv_queue *queue,
- uint32_t exec_flags,
- const VkDeviceQueueCreateInfo *pCreateInfo);
+ const VkDeviceQueueCreateInfo *pCreateInfo,
+ uint32_t index_in_family);
void anv_queue_finish(struct anv_queue *queue);
-VkResult anv_queue_execbuf_locked(struct anv_queue *queue, struct anv_queue_submit *submit);
+VkResult anv_queue_submit(struct vk_queue *queue,
+ struct vk_queue_submit *submit);
VkResult anv_queue_submit_simple_batch(struct anv_queue *queue,
- struct anv_batch *batch);
-
-uint64_t anv_gettime_ns(void);
-uint64_t anv_get_absolute_timeout(uint64_t timeout);
-
-void* anv_gem_mmap(struct anv_device *device,
- uint32_t gem_handle, uint64_t offset, uint64_t size, uint32_t flags);
-void anv_gem_munmap(struct anv_device *device, void *p, uint64_t size);
-uint32_t anv_gem_create(struct anv_device *device, uint64_t size);
-void anv_gem_close(struct anv_device *device, uint32_t gem_handle);
-uint32_t anv_gem_create_regions(struct anv_device *device, uint64_t anv_bo_size,
- uint32_t num_regions,
- struct drm_i915_gem_memory_class_instance *regions);
-uint32_t anv_gem_userptr(struct anv_device *device, void *mem, size_t size);
-int anv_gem_busy(struct anv_device *device, uint32_t gem_handle);
+ struct anv_batch *batch,
+ bool is_companion_rcs_batch);
+VkResult anv_queue_submit_trtt_batch(struct anv_sparse_submission *submit,
+ struct anv_batch *batch);
+
+static inline void
+anv_trtt_batch_bo_free(struct anv_device *device,
+ struct anv_trtt_batch_bo *trtt_bbo)
+{
+ anv_bo_pool_free(&device->batch_bo_pool, trtt_bbo->bo);
+ list_del(&trtt_bbo->link);
+ vk_free(&device->vk.alloc, trtt_bbo);
+}
+
+void anv_queue_trace(struct anv_queue *queue, const char *label,
+ bool frame, bool begin);
+
+static inline VkResult
+anv_queue_post_submit(struct anv_queue *queue, VkResult submit_result)
+{
+ if (submit_result != VK_SUCCESS)
+ return submit_result;
+
+ VkResult result = VK_SUCCESS;
+ if (queue->sync) {
+ result = vk_sync_wait(&queue->device->vk, queue->sync, 0,
+ VK_SYNC_WAIT_COMPLETE, UINT64_MAX);
+ if (result != VK_SUCCESS)
+ result = vk_queue_set_lost(&queue->vk, "sync wait failed");
+ }
+
+ return result;
+}
+
int anv_gem_wait(struct anv_device *device, uint32_t gem_handle, int64_t *timeout_ns);
-int anv_gem_execbuffer(struct anv_device *device,
- struct drm_i915_gem_execbuffer2 *execbuf);
int anv_gem_set_tiling(struct anv_device *device, uint32_t gem_handle,
uint32_t stride, uint32_t tiling);
-int anv_gem_create_context(struct anv_device *device);
-int anv_gem_create_context_engines(struct anv_device *device,
- const struct drm_i915_query_engine_info *info,
- int num_engines,
- uint16_t *engine_classes);
-bool anv_gem_has_context_priority(int fd);
-int anv_gem_destroy_context(struct anv_device *device, int context);
-int anv_gem_set_context_param(int fd, int context, uint32_t param,
- uint64_t value);
-int anv_gem_get_context_param(int fd, int context, uint32_t param,
- uint64_t *value);
-int anv_gem_get_param(int fd, uint32_t param);
-uint64_t anv_gem_get_drm_cap(int fd, uint32_t capability);
int anv_gem_get_tiling(struct anv_device *device, uint32_t gem_handle);
-bool anv_gem_get_bit6_swizzle(int fd, uint32_t tiling);
-int anv_gem_context_get_reset_stats(int fd, int context,
- uint32_t *active, uint32_t *pending);
int anv_gem_handle_to_fd(struct anv_device *device, uint32_t gem_handle);
-int anv_gem_reg_read(int fd, uint32_t offset, uint64_t *result);
uint32_t anv_gem_fd_to_handle(struct anv_device *device, int fd);
-int anv_gem_set_caching(struct anv_device *device, uint32_t gem_handle, uint32_t caching);
-int anv_gem_set_domain(struct anv_device *device, uint32_t gem_handle,
- uint32_t read_domains, uint32_t write_domain);
-int anv_gem_sync_file_merge(struct anv_device *device, int fd1, int fd2);
-uint32_t anv_gem_syncobj_create(struct anv_device *device, uint32_t flags);
-void anv_gem_syncobj_destroy(struct anv_device *device, uint32_t handle);
-int anv_gem_syncobj_handle_to_fd(struct anv_device *device, uint32_t handle);
-uint32_t anv_gem_syncobj_fd_to_handle(struct anv_device *device, int fd);
-int anv_gem_syncobj_export_sync_file(struct anv_device *device,
- uint32_t handle);
-int anv_gem_syncobj_import_sync_file(struct anv_device *device,
- uint32_t handle, int fd);
-void anv_gem_syncobj_reset(struct anv_device *device, uint32_t handle);
-bool anv_gem_supports_syncobj_wait(int fd);
-int anv_gem_syncobj_wait(struct anv_device *device,
- const uint32_t *handles, uint32_t num_handles,
- int64_t abs_timeout_ns, bool wait_all);
-int anv_gem_syncobj_timeline_wait(struct anv_device *device,
- const uint32_t *handles, const uint64_t *points,
- uint32_t num_items, int64_t abs_timeout_ns,
- bool wait_all, bool wait_materialize);
-int anv_gem_syncobj_timeline_signal(struct anv_device *device,
- const uint32_t *handles, const uint64_t *points,
- uint32_t num_items);
-int anv_gem_syncobj_timeline_query(struct anv_device *device,
- const uint32_t *handles, uint64_t *points,
- uint32_t num_items);
-int anv_i915_query(int fd, uint64_t query_id, void *buffer,
- int32_t *buffer_len);
-struct drm_i915_query_engine_info *anv_gem_get_engine_info(int fd);
-int anv_gem_count_engines(const struct drm_i915_query_engine_info *info,
- uint16_t engine_class);
+int anv_gem_set_context_param(int fd, uint32_t context, uint32_t param,
+ uint64_t value);
+VkResult
+anv_gem_import_bo_alloc_flags_to_bo_flags(struct anv_device *device,
+ struct anv_bo *bo,
+ enum anv_bo_alloc_flags alloc_flags,
+ uint32_t *bo_flags);
+const struct intel_device_info_pat_entry *
+anv_device_get_pat_entry(struct anv_device *device,
+ enum anv_bo_alloc_flags alloc_flags);
uint64_t anv_vma_alloc(struct anv_device *device,
uint64_t size, uint64_t align,
enum anv_bo_alloc_flags alloc_flags,
- uint64_t client_address);
+ uint64_t client_address,
+ struct util_vma_heap **out_vma_heap);
void anv_vma_free(struct anv_device *device,
+ struct util_vma_heap *vma_heap,
uint64_t address, uint64_t size);
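A minimal sketch (not part of this patch) of the new pairing: the heap returned through out_vma_heap by anv_vma_alloc() must be handed back to anv_vma_free(); the sizes and flags below are placeholders and the wrapper is invented.

static void
vma_roundtrip_example(struct anv_device *device)
{
   struct util_vma_heap *heap;
   uint64_t addr = anv_vma_alloc(device, 4096 /* size */, 4096 /* align */,
                                 0 /* alloc_flags */, 0 /* client_address */,
                                 &heap);
   if (addr != 0)
      anv_vma_free(device, heap, addr, 4096);
}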
struct anv_reloc_list {
- uint32_t num_relocs;
- uint32_t array_length;
- struct drm_i915_gem_relocation_entry * relocs;
- struct anv_bo ** reloc_bos;
+ bool uses_relocs;
uint32_t dep_words;
BITSET_WORD * deps;
+ const VkAllocationCallbacks *alloc;
};
VkResult anv_reloc_list_init(struct anv_reloc_list *list,
- const VkAllocationCallbacks *alloc);
-void anv_reloc_list_finish(struct anv_reloc_list *list,
- const VkAllocationCallbacks *alloc);
+ const VkAllocationCallbacks *alloc,
+ bool uses_relocs);
+void anv_reloc_list_finish(struct anv_reloc_list *list);
+
+VkResult
+anv_reloc_list_add_bo_impl(struct anv_reloc_list *list, struct anv_bo *target_bo);
-VkResult anv_reloc_list_add(struct anv_reloc_list *list,
- const VkAllocationCallbacks *alloc,
- uint32_t offset, struct anv_bo *target_bo,
- uint32_t delta, uint64_t *address_u64_out);
+static inline VkResult
+anv_reloc_list_add_bo(struct anv_reloc_list *list, struct anv_bo *target_bo)
+{
+ return list->uses_relocs ? anv_reloc_list_add_bo_impl(list, target_bo) : VK_SUCCESS;
+}
-VkResult anv_reloc_list_add_bo(struct anv_reloc_list *list,
- const VkAllocationCallbacks *alloc,
- struct anv_bo *target_bo);
+VkResult anv_reloc_list_append(struct anv_reloc_list *list,
+ struct anv_reloc_list *other);
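Illustrative usage of the simplified relocation API above (not part of this patch); whether relocations are used at all now comes from the physical device, and the wrapper function is invented.

static VkResult
reloc_list_example(struct anv_device *device, struct anv_reloc_list *relocs,
                   struct anv_bo *bo)
{
   VkResult result = anv_reloc_list_init(relocs, &device->vk.alloc,
                                         device->physical->uses_relocs);
   if (result != VK_SUCCESS)
      return result;

   /* A no-op on platforms without relocations, records a BO dependency
    * otherwise.
    */
   return anv_reloc_list_add_bo(relocs, bo);
}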
struct anv_batch_bo {
/* Link in the anv_cmd_buffer.owned_batch_bos list */
@@ -1572,6 +2286,12 @@ struct anv_batch_bo {
struct anv_batch {
const VkAllocationCallbacks * alloc;
+ /**
+ * Sum of all the anv_batch_bo sizes allocated for this command buffer.
+ * Used to increase allocation size for long command buffers.
+ */
+ size_t allocated_batch_size;
+
struct anv_address start_addr;
void * start;
@@ -1583,7 +2303,7 @@ struct anv_batch {
/* This callback is called (with the associated user data) in the event
* that the batch runs out of space.
*/
- VkResult (*extend_cb)(struct anv_batch *, void *);
+ VkResult (*extend_cb)(struct anv_batch *, uint32_t, void *);
void * user_data;
/**
@@ -1594,12 +2314,27 @@ struct anv_batch {
* of the driver.
*/
VkResult status;
+
+ enum intel_engine_class engine_class;
+
+ /**
+ * Number of 3DPRIMITIVEs emitted for WA 16014538804
+ */
+ uint8_t num_3d_primitives_emitted;
};
void *anv_batch_emit_dwords(struct anv_batch *batch, int num_dwords);
+VkResult anv_batch_emit_ensure_space(struct anv_batch *batch, uint32_t size);
+void anv_batch_advance(struct anv_batch *batch, uint32_t size);
void anv_batch_emit_batch(struct anv_batch *batch, struct anv_batch *other);
struct anv_address anv_batch_address(struct anv_batch *batch, void *batch_location);
+static inline struct anv_address
+anv_batch_current_address(struct anv_batch *batch)
+{
+ return anv_batch_address(batch, batch->next);
+}
+
static inline void
anv_batch_set_storage(struct anv_batch *batch, struct anv_address addr,
void *map, size_t size)
@@ -1625,97 +2360,16 @@ anv_batch_has_error(struct anv_batch *batch)
}
static inline uint64_t
-anv_batch_emit_reloc(struct anv_batch *batch,
- void *location, struct anv_bo *bo, uint32_t delta)
-{
- uint64_t address_u64 = 0;
- VkResult result;
-
- if (ANV_ALWAYS_SOFTPIN) {
- address_u64 = bo->offset + delta;
- result = anv_reloc_list_add_bo(batch->relocs, batch->alloc, bo);
- } else {
- result = anv_reloc_list_add(batch->relocs, batch->alloc,
- location - batch->start, bo, delta,
- &address_u64);
- }
- if (unlikely(result != VK_SUCCESS)) {
- anv_batch_set_error(batch, result);
- return 0;
- }
-
- return address_u64;
-}
-
-
-#define ANV_NULL_ADDRESS ((struct anv_address) { NULL, 0 })
-
-static inline struct anv_address
-anv_address_from_u64(uint64_t addr_u64)
-{
- assert(addr_u64 == intel_canonical_address(addr_u64));
- return (struct anv_address) {
- .bo = NULL,
- .offset = addr_u64,
- };
-}
-
-static inline bool
-anv_address_is_null(struct anv_address addr)
-{
- return addr.bo == NULL && addr.offset == 0;
-}
-
-static inline uint64_t
-anv_address_physical(struct anv_address addr)
-{
- if (addr.bo && (ANV_ALWAYS_SOFTPIN ||
- (addr.bo->flags & EXEC_OBJECT_PINNED))) {
- assert(addr.bo->flags & EXEC_OBJECT_PINNED);
- return intel_canonical_address(addr.bo->offset + addr.offset);
- } else {
- return intel_canonical_address(addr.offset);
- }
-}
-
-static inline struct anv_address
-anv_address_add(struct anv_address addr, uint64_t offset)
-{
- addr.offset += offset;
- return addr;
-}
-
-static inline void
-write_reloc(const struct anv_device *device, void *p, uint64_t v, bool flush)
-{
- unsigned reloc_size = 0;
- if (device->info.ver >= 8) {
- reloc_size = sizeof(uint64_t);
- *(uint64_t *)p = intel_canonical_address(v);
- } else {
- reloc_size = sizeof(uint32_t);
- *(uint32_t *)p = v;
- }
-
- if (flush && !device->info.has_llc)
- intel_flush_range(p, reloc_size);
-}
-
-static inline uint64_t
_anv_combine_address(struct anv_batch *batch, void *location,
const struct anv_address address, uint32_t delta)
{
- if (address.bo == NULL) {
+ if (address.bo == NULL)
return address.offset + delta;
- } else if (batch == NULL) {
- assert(address.bo->flags & EXEC_OBJECT_PINNED);
- return anv_address_physical(anv_address_add(address, delta));
- } else {
- assert(batch->start <= location && location < batch->end);
- /* i915 relocations are signed. */
- assert(INT32_MIN <= address.offset && address.offset <= INT32_MAX);
- return anv_batch_emit_reloc(batch, location, address.bo, address.offset + delta);
- }
+
+ if (batch)
+ anv_reloc_list_add_bo(batch->relocs, address.bo);
+
+ return anv_address_physical(anv_address_add(address, delta));
}
#define __gen_address_type struct anv_address
@@ -1755,18 +2409,20 @@ _anv_combine_address(struct anv_batch *batch, void *location,
__dst; \
})
-#define anv_batch_emit_merge(batch, dwords0, dwords1) \
- do { \
- uint32_t *dw; \
- \
- STATIC_ASSERT(ARRAY_SIZE(dwords0) == ARRAY_SIZE(dwords1)); \
- dw = anv_batch_emit_dwords((batch), ARRAY_SIZE(dwords0)); \
- if (!dw) \
- break; \
- for (uint32_t i = 0; i < ARRAY_SIZE(dwords0); i++) \
- dw[i] = (dwords0)[i] | (dwords1)[i]; \
- VG(VALGRIND_CHECK_MEM_IS_DEFINED(dw, ARRAY_SIZE(dwords0) * 4));\
- } while (0)
+#define anv_batch_emit_merge(batch, cmd, pipeline, state, name) \
+ for (struct cmd name = { 0 }, \
+ *_dst = anv_batch_emit_dwords(batch, __anv_cmd_length(cmd)); \
+ __builtin_expect(_dst != NULL, 1); \
+ ({ uint32_t _partial[__anv_cmd_length(cmd)]; \
+ assert((pipeline)->state.len == __anv_cmd_length(cmd)); \
+ __anv_cmd_pack(cmd)(batch, _partial, &name); \
+ for (uint32_t i = 0; i < __anv_cmd_length(cmd); i++) { \
+ ((uint32_t *)_dst)[i] = _partial[i] | \
+ (pipeline)->batch_data[(pipeline)->state.offset + i]; \
+ } \
+ VG(VALGRIND_CHECK_MEM_IS_DEFINED(_dst, __anv_cmd_length(cmd) * 4)); \
+ _dst = NULL; \
+ }))
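For illustration, the merge macro above is meant to be used like anv_batch_emit(), with the pipeline's pre-packed dwords OR'ed into the dynamically packed ones. A minimal sketch under assumed names (pipeline->final.sf and the line-width value are placeholders, not taken from this patch):

   /* Pack the dynamic fields of 3DSTATE_SF and merge them with the dwords the
    * pipeline pre-packed at pipeline->final.sf.offset.
    */
   anv_batch_emit_merge(&cmd_buffer->batch, GENX(3DSTATE_SF),
                        pipeline, final.sf, sf) {
      sf.LineWidth = 1.0f; /* dynamic value, placeholder */
   }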
#define anv_batch_emit(batch, cmd, name) \
for (struct cmd name = { __anv_cmd_header(cmd) }, \
@@ -1797,22 +2453,18 @@ _anv_combine_address(struct anv_batch *batch, void *location,
/* #define __gen_address_offset anv_address_add */
struct anv_device_memory {
- struct vk_object_base base;
+ struct vk_device_memory vk;
struct list_head link;
struct anv_bo * bo;
const struct anv_memory_type * type;
- VkDeviceSize map_size;
- void * map;
- /* If set, we are holding reference to AHardwareBuffer
- * which we must release when memory is freed.
- */
- struct AHardwareBuffer * ahw;
+ void * map;
+ size_t map_size;
- /* If set, this memory comes from a host pointer. */
- void * host_ptr;
+ /* The map, from the user PoV is map + map_delta */
+ uint64_t map_delta;
};
/**
@@ -1846,17 +2498,6 @@ struct anv_sampled_image_descriptor {
uint32_t sampler;
};
-struct anv_texture_swizzle_descriptor {
- /** Texture swizzle
- *
- * See also nir_intrinsic_channel_select_intel
- */
- uint8_t swizzle[4];
-
- /** Unused padding to ensure the struct is a multiple of 64 bits */
- uint32_t _pad;
-};
-
/** Struct representing a storage image descriptor */
struct anv_storage_image_descriptor {
/** Bindless image handles
@@ -1864,8 +2505,29 @@ struct anv_storage_image_descriptor {
* These are expected to already be shifted such that the 20-bit
* SURFACE_STATE table index is in the top 20 bits.
*/
- uint32_t read_write;
- uint32_t write_only;
+ uint32_t vanilla;
+
+ /** Image depth
+ *
+ * By default the HW RESINFO message allows us to query the depth of an image:
+ *
+ * From the Kaby Lake docs for the RESINFO message:
+ *
+ * "Surface Type | ... | Blue
+ * --------------+-----+----------------
+ * SURFTYPE_3D | ... | (Depth+1)>>LOD"
+ *
+ * With VK_EXT_sliced_view_of_3d, we have to support a slice of a 3D image,
+ * meaning at a depth offset with a new depth value potentially reduced
+ * from the original image. Unfortunately if we change the Depth value of
+ * the image, we then run into issues with Yf/Ys tilings where the HW fetches
+ * data at incorrect locations.
+ *
+ * To solve this, we put the slice depth in the descriptor and recompose
+ * the vec3 (width, height, depth) using this field for z and xy using the
+ * RESINFO result.
+ */
+ uint32_t image_depth;
};
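The recomposition described in the comment above amounts to taking xy from the RESINFO return and z from the descriptor. A minimal sketch with hypothetical names for the lowered values (resinfo_ret and desc are not driver symbols):

   /* Size query for a sliced 3D storage image: width/height come from the HW
    * RESINFO result, depth comes from the value the driver wrote in the
    * descriptor.
    */
   uint32_t size_x = resinfo_ret[0];
   uint32_t size_y = resinfo_ret[1];
   uint32_t size_z = desc->image_depth;   /* instead of (Depth+1)>>LOD */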
/** Struct representing a address/range descriptor
@@ -1883,23 +2545,25 @@ struct anv_address_range_descriptor {
enum anv_descriptor_data {
/** The descriptor contains a BTI reference to a surface state */
- ANV_DESCRIPTOR_SURFACE_STATE = (1 << 0),
+ ANV_DESCRIPTOR_BTI_SURFACE_STATE = BITFIELD_BIT(0),
/** The descriptor contains a BTI reference to a sampler state */
- ANV_DESCRIPTOR_SAMPLER_STATE = (1 << 1),
+ ANV_DESCRIPTOR_BTI_SAMPLER_STATE = BITFIELD_BIT(1),
/** The descriptor contains an actual buffer view */
- ANV_DESCRIPTOR_BUFFER_VIEW = (1 << 2),
- /** The descriptor contains auxiliary image layout data */
- ANV_DESCRIPTOR_IMAGE_PARAM = (1 << 3),
- /** The descriptor contains auxiliary image layout data */
- ANV_DESCRIPTOR_INLINE_UNIFORM = (1 << 4),
+ ANV_DESCRIPTOR_BUFFER_VIEW = BITFIELD_BIT(2),
+ /** The descriptor contains inline uniform data */
+ ANV_DESCRIPTOR_INLINE_UNIFORM = BITFIELD_BIT(3),
/** anv_address_range_descriptor with a buffer address and range */
- ANV_DESCRIPTOR_ADDRESS_RANGE = (1 << 5),
- /** Bindless surface handle */
- ANV_DESCRIPTOR_SAMPLED_IMAGE = (1 << 6),
- /** Storage image handles */
- ANV_DESCRIPTOR_STORAGE_IMAGE = (1 << 7),
- /** Storage image handles */
- ANV_DESCRIPTOR_TEXTURE_SWIZZLE = (1 << 8),
+ ANV_DESCRIPTOR_INDIRECT_ADDRESS_RANGE = BITFIELD_BIT(4),
+ /** Bindless surface handle (through anv_sampled_image_descriptor) */
+ ANV_DESCRIPTOR_INDIRECT_SAMPLED_IMAGE = BITFIELD_BIT(5),
+ /** Storage image handles (through anv_storage_image_descriptor) */
+ ANV_DESCRIPTOR_INDIRECT_STORAGE_IMAGE = BITFIELD_BIT(6),
+ /** The descriptor contains a single RENDER_SURFACE_STATE */
+ ANV_DESCRIPTOR_SURFACE = BITFIELD_BIT(7),
+ /** The descriptor contains a SAMPLER_STATE */
+ ANV_DESCRIPTOR_SAMPLER = BITFIELD_BIT(8),
+ /** A tuple of RENDER_SURFACE_STATE & SAMPLER_STATE */
+ ANV_DESCRIPTOR_SURFACE_SAMPLER = BITFIELD_BIT(9),
};
struct anv_descriptor_set_binding_layout {
@@ -1907,7 +2571,7 @@ struct anv_descriptor_set_binding_layout {
VkDescriptorType type;
/* Flags provided when this binding was created */
- VkDescriptorBindingFlagsEXT flags;
+ VkDescriptorBindingFlags flags;
/* Bitfield representing the type of data this descriptor contains */
enum anv_descriptor_data data;
@@ -1920,38 +2584,58 @@ struct anv_descriptor_set_binding_layout {
*/
uint32_t array_size;
- /* Index into the flattend descriptor set */
+ /* Index into the flattened descriptor set */
uint32_t descriptor_index;
- /* Index into the dynamic state array for a dynamic buffer */
+ /* Index into the dynamic state array for a dynamic buffer, relative to the
+ * set.
+ */
int16_t dynamic_offset_index;
+ /* Computed surface size from data (for one plane) */
+ uint16_t descriptor_data_surface_size;
+
+ /* Computed sampler size from data (for one plane) */
+ uint16_t descriptor_data_sampler_size;
+
/* Index into the descriptor set buffer views */
int32_t buffer_view_index;
- /* Offset into the descriptor buffer where this descriptor lives */
- uint32_t descriptor_offset;
+ /* Offset into the descriptor buffer where the surface descriptor lives */
+ uint32_t descriptor_surface_offset;
- /* Immutable samplers (or NULL if no immutable samplers) */
- struct anv_sampler **immutable_samplers;
-};
+ /* Offset into the descriptor buffer where the sampler descriptor lives */
+ uint16_t descriptor_sampler_offset;
-unsigned anv_descriptor_size(const struct anv_descriptor_set_binding_layout *layout);
+ /* Precomputed surface stride (with a multiplane descriptor, the descriptor
+ * includes all the planes)
+ */
+ uint16_t descriptor_surface_stride;
-unsigned anv_descriptor_type_size(const struct anv_physical_device *pdevice,
- VkDescriptorType type);
+ /* Precomputed sampler stride (with a multiplane descriptor, the descriptor
+ * includes all the planes)
+ */
+ uint16_t descriptor_sampler_stride;
-bool anv_descriptor_supports_bindless(const struct anv_physical_device *pdevice,
- const struct anv_descriptor_set_binding_layout *binding,
- bool sampler);
+ /* Immutable samplers (or NULL if no immutable samplers) */
+ struct anv_sampler **immutable_samplers;
+};
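As a sketch of how the per-binding offsets and strides above could be combined to locate array element i (the example_ helper is hypothetical, not a driver function):

   /* Byte offset of element i of a binding inside the set's surface
    * descriptor buffer.
    */
   static inline uint32_t
   example_surface_desc_offset(const struct anv_descriptor_set_binding_layout *bind,
                               uint32_t i)
   {
      return bind->descriptor_surface_offset + i * bind->descriptor_surface_stride;
   }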
-bool anv_descriptor_requires_bindless(const struct anv_physical_device *pdevice,
- const struct anv_descriptor_set_binding_layout *binding,
- bool sampler);
+enum anv_descriptor_set_layout_type {
+ ANV_PIPELINE_DESCRIPTOR_SET_LAYOUT_TYPE_UNKNOWN,
+ ANV_PIPELINE_DESCRIPTOR_SET_LAYOUT_TYPE_INDIRECT,
+ ANV_PIPELINE_DESCRIPTOR_SET_LAYOUT_TYPE_DIRECT,
+ ANV_PIPELINE_DESCRIPTOR_SET_LAYOUT_TYPE_BUFFER,
+};
struct anv_descriptor_set_layout {
struct vk_object_base base;
+ VkDescriptorSetLayoutCreateFlags flags;
+
+ /* Type of descriptor set layout */
+ enum anv_descriptor_set_layout_type type;
+
/* Descriptor set layouts can be destroyed at almost any time */
uint32_t ref_cnt;
@@ -1975,21 +2659,43 @@ struct anv_descriptor_set_layout {
*/
VkShaderStageFlags dynamic_offset_stages[MAX_DYNAMIC_BUFFERS];
- /* Size of the descriptor buffer for this descriptor set */
- uint32_t descriptor_buffer_size;
+ /* Size of the descriptor buffer dedicated to surface states for this
+ * descriptor set
+ */
+ uint32_t descriptor_buffer_surface_size;
+
+ /* Size of the descriptor buffer dedicated to sampler states for this
+ * descriptor set
+ */
+ uint32_t descriptor_buffer_sampler_size;
+
+ /* Number of embedded samplers */
+ uint32_t embedded_sampler_count;
/* Bindings in this descriptor set */
struct anv_descriptor_set_binding_layout binding[0];
};
+bool anv_descriptor_supports_bindless(const struct anv_physical_device *pdevice,
+ const struct anv_descriptor_set_layout *set,
+ const struct anv_descriptor_set_binding_layout *binding);
+
+bool anv_descriptor_requires_bindless(const struct anv_physical_device *pdevice,
+ const struct anv_descriptor_set_layout *set,
+ const struct anv_descriptor_set_binding_layout *binding);
+
void anv_descriptor_set_layout_destroy(struct anv_device *device,
struct anv_descriptor_set_layout *layout);
-static inline void
+void anv_descriptor_set_layout_print(const struct anv_descriptor_set_layout *layout);
+
+static inline struct anv_descriptor_set_layout *
anv_descriptor_set_layout_ref(struct anv_descriptor_set_layout *layout)
{
assert(layout && layout->ref_cnt >= 1);
p_atomic_inc(&layout->ref_cnt);
+
+ return layout;
}
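Returning the layout pointer lets callers take a reference and assign in a single statement; a hedged usage sketch (caller names assumed):

   /* Reference the layout while storing it in the set. */
   set->layout = anv_descriptor_set_layout_ref(set_layout);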
static inline void
@@ -2012,12 +2718,16 @@ struct anv_descriptor {
};
struct {
+ struct anv_buffer_view *set_buffer_view;
struct anv_buffer *buffer;
uint64_t offset;
uint64_t range;
+ uint64_t bind_range;
};
struct anv_buffer_view *buffer_view;
+
+ struct vk_acceleration_structure *accel_struct;
};
};
@@ -2032,13 +2742,36 @@ struct anv_descriptor_set {
*/
uint32_t size;
- /* State relative to anv_descriptor_pool::bo */
- struct anv_state desc_mem;
+ /* Is this descriptor set a push descriptor */
+ bool is_push;
+
+ /* Bitfield of descriptors for which we need to generate surface states.
+ * Only valid for push descriptors
+ */
+ uint32_t generate_surface_states;
+
+ /* State relative to anv_descriptor_pool::surface_bo */
+ struct anv_state desc_surface_mem;
+ /* State relative to anv_descriptor_pool::sampler_bo */
+ struct anv_state desc_sampler_mem;
/* Surface state for the descriptor buffer */
struct anv_state desc_surface_state;
- /* Descriptor set address. */
- struct anv_address desc_addr;
+ /* Descriptor set address pointing to desc_surface_mem (we don't need one
+ * for samplers because they're only ever accessed by the HW through the
+ * shader sampler handle).
+ */
+ struct anv_address desc_surface_addr;
+
+ struct anv_address desc_sampler_addr;
+
+ /* Descriptor offset from the
+ * device->va.internal_surface_state_pool.addr
+ *
+ * It just needs to be added to the binding table offset to be put into the
+ * HW BTI entry.
+ */
+ uint32_t desc_offset;
uint32_t buffer_view_count;
struct anv_buffer_view *buffer_views;
@@ -2056,19 +2789,31 @@ anv_descriptor_set_is_push(struct anv_descriptor_set *set)
return set->pool == NULL;
}
-struct anv_buffer_view {
- struct vk_object_base base;
+struct anv_surface_state_data {
+ uint8_t data[ANV_SURFACE_STATE_SIZE];
+};
- enum isl_format format; /**< VkBufferViewCreateInfo::format */
- uint64_t range; /**< VkBufferViewCreateInfo::range */
+struct anv_buffer_state {
+ /** Surface state allocated from the bindless heap
+ *
+ * Only valid if anv_physical_device::indirect_descriptors is true
+ */
+ struct anv_state state;
- struct anv_address address;
+ /** Surface state after genxml packing
+ *
+ * Only valid if anv_physical_device::indirect_descriptors is false
+ */
+ struct anv_surface_state_data state_data;
+};
+
+struct anv_buffer_view {
+ struct vk_buffer_view vk;
- struct anv_state surface_state;
- struct anv_state storage_surface_state;
- struct anv_state writeonly_storage_surface_state;
+ struct anv_address address;
- struct brw_image_param storage_image_param;
+ struct anv_buffer_state general;
+ struct anv_buffer_state storage;
};
struct anv_push_descriptor_set {
@@ -2098,78 +2843,57 @@ anv_descriptor_set_address(struct anv_descriptor_set *set)
push_set->set_used_on_gpu = true;
}
- return set->desc_addr;
+ return set->desc_surface_addr;
}
-struct anv_descriptor_pool {
- struct vk_object_base base;
-
- uint32_t size;
- uint32_t next;
- uint32_t free_list;
+struct anv_descriptor_pool_heap {
+ /* BO allocated to back the pool (unused for host pools) */
+ struct anv_bo *bo;
- struct anv_bo *bo;
- struct util_vma_heap bo_heap;
+ /* Host memory allocated to back a host pool */
+ void *host_mem;
- struct anv_state_stream surface_state_stream;
- void *surface_state_free_list;
+ /* Heap tracking allocations in bo/host_mem */
+ struct util_vma_heap heap;
- struct list_head desc_sets;
-
- char data[0];
-};
-
-enum anv_descriptor_template_entry_type {
- ANV_DESCRIPTOR_TEMPLATE_ENTRY_TYPE_IMAGE,
- ANV_DESCRIPTOR_TEMPLATE_ENTRY_TYPE_BUFFER,
- ANV_DESCRIPTOR_TEMPLATE_ENTRY_TYPE_BUFFER_VIEW
+ /* Size of the heap */
+ uint32_t size;
};
-struct anv_descriptor_template_entry {
- /* The type of descriptor in this entry */
- VkDescriptorType type;
-
- /* Binding in the descriptor set */
- uint32_t binding;
-
- /* Offset at which to write into the descriptor set binding */
- uint32_t array_element;
+struct anv_descriptor_pool {
+ struct vk_object_base base;
- /* Number of elements to write into the descriptor set binding */
- uint32_t array_count;
+ struct anv_descriptor_pool_heap surfaces;
+ struct anv_descriptor_pool_heap samplers;
- /* Offset into the user provided data */
- size_t offset;
+ struct anv_state_stream surface_state_stream;
+ void *surface_state_free_list;
- /* Stride between elements into the user provided data */
- size_t stride;
-};
+ /** List of anv_descriptor_set. */
+ struct list_head desc_sets;
-struct anv_descriptor_update_template {
- struct vk_object_base base;
+ /** Heap over host_mem */
+ struct util_vma_heap host_heap;
- VkPipelineBindPoint bind_point;
+ /** Allocated size of host_mem */
+ uint32_t host_mem_size;
- /* The descriptor set this template corresponds to. This value is only
- * valid if the template was created with the templateType
- * VK_DESCRIPTOR_UPDATE_TEMPLATE_TYPE_DESCRIPTOR_SET.
+ /**
+ * VK_DESCRIPTOR_POOL_CREATE_HOST_ONLY_BIT_EXT. If set, then
+ * surface_state_stream is unused.
*/
- uint8_t set;
+ bool host_only;
- /* Number of entries in this template */
- uint32_t entry_count;
-
- /* Entries of the template */
- struct anv_descriptor_template_entry entries[0];
+ char host_mem[0];
};
-size_t
-anv_descriptor_set_layout_size(const struct anv_descriptor_set_layout *layout,
- uint32_t var_desc_count);
+bool
+anv_push_descriptor_set_init(struct anv_cmd_buffer *cmd_buffer,
+ struct anv_push_descriptor_set *push_set,
+ struct anv_descriptor_set_layout *layout);
-uint32_t
-anv_descriptor_set_layout_descriptor_buffer_size(const struct anv_descriptor_set_layout *set_layout,
- uint32_t var_desc_count);
+void
+anv_push_descriptor_set_finish(struct anv_push_descriptor_set *push_set);
void
anv_descriptor_set_write_image_view(struct anv_device *device,
@@ -2190,7 +2914,6 @@ anv_descriptor_set_write_buffer_view(struct anv_device *device,
void
anv_descriptor_set_write_buffer(struct anv_device *device,
struct anv_descriptor_set *set,
- struct anv_state_stream *alloc_stream,
VkDescriptorType type,
struct anv_buffer *buffer,
uint32_t binding,
@@ -2199,9 +2922,14 @@ anv_descriptor_set_write_buffer(struct anv_device *device,
VkDeviceSize range);
void
+anv_descriptor_write_surface_state(struct anv_device *device,
+ struct anv_descriptor *desc,
+ struct anv_state surface_state);
+
+void
anv_descriptor_set_write_acceleration_structure(struct anv_device *device,
struct anv_descriptor_set *set,
- struct anv_acceleration_structure *accel,
+ struct vk_acceleration_structure *accel,
uint32_t binding,
uint32_t element);
@@ -2214,30 +2942,23 @@ anv_descriptor_set_write_inline_uniform_data(struct anv_device *device,
size_t size);
void
+anv_descriptor_set_write(struct anv_device *device,
+ struct anv_descriptor_set *set_override,
+ uint32_t write_count,
+ const VkWriteDescriptorSet *writes);
+
+void
anv_descriptor_set_write_template(struct anv_device *device,
struct anv_descriptor_set *set,
- struct anv_state_stream *alloc_stream,
- const struct anv_descriptor_update_template *template,
+ const struct vk_descriptor_update_template *template,
const void *data);
-VkResult
-anv_descriptor_set_create(struct anv_device *device,
- struct anv_descriptor_pool *pool,
- struct anv_descriptor_set_layout *layout,
- uint32_t var_desc_count,
- struct anv_descriptor_set **out_set);
-
-void
-anv_descriptor_set_destroy(struct anv_device *device,
- struct anv_descriptor_pool *pool,
- struct anv_descriptor_set *set);
-
-#define ANV_DESCRIPTOR_SET_NULL (UINT8_MAX - 5)
-#define ANV_DESCRIPTOR_SET_PUSH_CONSTANTS (UINT8_MAX - 4)
-#define ANV_DESCRIPTOR_SET_DESCRIPTORS (UINT8_MAX - 3)
-#define ANV_DESCRIPTOR_SET_NUM_WORK_GROUPS (UINT8_MAX - 2)
-#define ANV_DESCRIPTOR_SET_SHADER_CONSTANTS (UINT8_MAX - 1)
-#define ANV_DESCRIPTOR_SET_COLOR_ATTACHMENTS UINT8_MAX
+#define ANV_DESCRIPTOR_SET_DESCRIPTORS_BUFFER (UINT8_MAX - 5)
+#define ANV_DESCRIPTOR_SET_NULL (UINT8_MAX - 4)
+#define ANV_DESCRIPTOR_SET_PUSH_CONSTANTS (UINT8_MAX - 3)
+#define ANV_DESCRIPTOR_SET_DESCRIPTORS (UINT8_MAX - 2)
+#define ANV_DESCRIPTOR_SET_NUM_WORK_GROUPS (UINT8_MAX - 1)
+#define ANV_DESCRIPTOR_SET_COLOR_ATTACHMENTS UINT8_MAX
struct anv_pipeline_binding {
/** Index in the descriptor set
@@ -2247,6 +2968,19 @@ struct anv_pipeline_binding {
*/
uint32_t index;
+ /** Binding in the descriptor set. Not valid for any of the
+ * ANV_DESCRIPTOR_SET_*
+ */
+ uint32_t binding;
+
+ /** Offset in the descriptor buffer
+ *
+ * Relative to anv_descriptor_set::desc_surface_addr. This is useful for
+ * ANV_PIPELINE_DESCRIPTOR_SET_LAYOUT_TYPE_DIRECT, to generate the binding
+ * table entry.
+ */
+ uint32_t set_offset;
+
/** The descriptor set this surface corresponds to.
*
* The special ANV_DESCRIPTOR_SET_* values above indicates that this
@@ -2261,17 +2995,39 @@ struct anv_pipeline_binding {
/** Input attachment index (relative to the subpass) */
uint8_t input_attachment_index;
- /** Dynamic offset index (for dynamic UBOs and SSBOs) */
+ /** Dynamic offset index
+ *
+ * For dynamic UBOs and SSBOs, relative to set.
+ */
uint8_t dynamic_offset_index;
};
+};
- /** For a storage image, whether it is write-only */
- uint8_t write_only;
-
- /** Pad to 64 bits so that there are no holes and we can safely memcmp
- * assuming POD zero-initialization.
+struct anv_embedded_sampler_key {
+ /** No need to track binding elements for embedded samplers as :
+ *
+ * VUID-VkDescriptorSetLayoutBinding-flags-08006:
+ *
+ * "If VkDescriptorSetLayoutCreateInfo:flags contains
+ * VK_DESCRIPTOR_SET_LAYOUT_CREATE_EMBEDDED_IMMUTABLE_SAMPLERS_BIT_EXT,
+ * descriptorCount must: less than or equal to 1"
+ *
+ * The following struct can be safely hash as it doesn't include in
+ * address/offset.
*/
- uint8_t pad;
+ uint32_t sampler[4];
+ uint32_t color[4];
+};
+
+struct anv_pipeline_embedded_sampler_binding {
+ /** The descriptor set this sampler belongs to */
+ uint8_t set;
+
+ /** The binding in the set this sampler belongs to */
+ uint32_t binding;
+
+ /** The data configuring the sampler */
+ struct anv_embedded_sampler_key key;
};
struct anv_push_range {
@@ -2281,7 +3037,7 @@ struct anv_push_range {
/** Descriptor set index */
uint8_t set;
- /** Dynamic offset index (for dynamic UBOs) */
+ /** Dynamic offset index (for dynamic UBOs), relative to set. */
uint8_t dynamic_offset_index;
/** Start offset in units of 32B */
@@ -2291,175 +3047,153 @@ struct anv_push_range {
uint8_t length;
};
-struct anv_pipeline_layout {
- struct vk_object_base base;
+struct anv_pipeline_sets_layout {
+ struct anv_device *device;
struct {
struct anv_descriptor_set_layout *layout;
uint32_t dynamic_offset_start;
} set[MAX_SETS];
+ enum anv_descriptor_set_layout_type type;
+
uint32_t num_sets;
+ uint32_t num_dynamic_buffers;
+ int push_descriptor_set_index;
+
+ bool independent_sets;
unsigned char sha1[20];
};
-struct anv_buffer {
- struct vk_object_base base;
+void anv_pipeline_sets_layout_init(struct anv_pipeline_sets_layout *layout,
+ struct anv_device *device,
+ bool independent_sets);
- struct anv_device * device;
- VkDeviceSize size;
+void anv_pipeline_sets_layout_fini(struct anv_pipeline_sets_layout *layout);
- VkBufferCreateFlags create_flags;
- VkBufferUsageFlags usage;
+void anv_pipeline_sets_layout_add(struct anv_pipeline_sets_layout *layout,
+ uint32_t set_idx,
+ struct anv_descriptor_set_layout *set_layout);
- /* Set when bound */
- struct anv_address address;
+uint32_t
+anv_pipeline_sets_layout_embedded_sampler_count(const struct anv_pipeline_sets_layout *layout);
+
+void anv_pipeline_sets_layout_hash(struct anv_pipeline_sets_layout *layout);
+
+void anv_pipeline_sets_layout_print(const struct anv_pipeline_sets_layout *layout);
+
+struct anv_pipeline_layout {
+ struct vk_object_base base;
+
+ struct anv_pipeline_sets_layout sets_layout;
};
-static inline uint64_t
-anv_buffer_get_range(struct anv_buffer *buffer, uint64_t offset, uint64_t range)
+const struct anv_descriptor_set_layout *
+anv_pipeline_layout_get_push_set(const struct anv_pipeline_sets_layout *layout,
+ uint8_t *desc_idx);
+
+struct anv_sparse_binding_data {
+ uint64_t address;
+ uint64_t size;
+
+ /* This is kept only because it's given to us by vma_alloc() and needs to be
+ * passed back to vma_free(); we have no other particular use for it
+ */
+ struct util_vma_heap *vma_heap;
+};
+
+#define ANV_SPARSE_BLOCK_SIZE (64 * 1024)
+
+static inline bool
+anv_sparse_binding_is_enabled(struct anv_device *device)
{
- assert(offset <= buffer->size);
- if (range == VK_WHOLE_SIZE) {
- return buffer->size - offset;
- } else {
- assert(range + offset >= range);
- assert(range + offset <= buffer->size);
- return range;
- }
+ return device->vk.enabled_features.sparseBinding;
}
-enum anv_cmd_dirty_bits {
- ANV_CMD_DIRTY_DYNAMIC_VIEWPORT = 1 << 0, /* VK_DYNAMIC_STATE_VIEWPORT */
- ANV_CMD_DIRTY_DYNAMIC_SCISSOR = 1 << 1, /* VK_DYNAMIC_STATE_SCISSOR */
- ANV_CMD_DIRTY_DYNAMIC_LINE_WIDTH = 1 << 2, /* VK_DYNAMIC_STATE_LINE_WIDTH */
- ANV_CMD_DIRTY_DYNAMIC_DEPTH_BIAS = 1 << 3, /* VK_DYNAMIC_STATE_DEPTH_BIAS */
- ANV_CMD_DIRTY_DYNAMIC_BLEND_CONSTANTS = 1 << 4, /* VK_DYNAMIC_STATE_BLEND_CONSTANTS */
- ANV_CMD_DIRTY_DYNAMIC_DEPTH_BOUNDS = 1 << 5, /* VK_DYNAMIC_STATE_DEPTH_BOUNDS */
- ANV_CMD_DIRTY_DYNAMIC_STENCIL_COMPARE_MASK = 1 << 6, /* VK_DYNAMIC_STATE_STENCIL_COMPARE_MASK */
- ANV_CMD_DIRTY_DYNAMIC_STENCIL_WRITE_MASK = 1 << 7, /* VK_DYNAMIC_STATE_STENCIL_WRITE_MASK */
- ANV_CMD_DIRTY_DYNAMIC_STENCIL_REFERENCE = 1 << 8, /* VK_DYNAMIC_STATE_STENCIL_REFERENCE */
- ANV_CMD_DIRTY_PIPELINE = 1 << 9,
- ANV_CMD_DIRTY_INDEX_BUFFER = 1 << 10,
- ANV_CMD_DIRTY_RENDER_TARGETS = 1 << 11,
- ANV_CMD_DIRTY_XFB_ENABLE = 1 << 12,
- ANV_CMD_DIRTY_DYNAMIC_LINE_STIPPLE = 1 << 13, /* VK_DYNAMIC_STATE_LINE_STIPPLE_EXT */
- ANV_CMD_DIRTY_DYNAMIC_CULL_MODE = 1 << 14, /* VK_DYNAMIC_STATE_CULL_MODE_EXT */
- ANV_CMD_DIRTY_DYNAMIC_FRONT_FACE = 1 << 15, /* VK_DYNAMIC_STATE_FRONT_FACE_EXT */
- ANV_CMD_DIRTY_DYNAMIC_PRIMITIVE_TOPOLOGY = 1 << 16, /* VK_DYNAMIC_STATE_PRIMITIVE_TOPOLOGY_EXT */
- ANV_CMD_DIRTY_DYNAMIC_VERTEX_INPUT_BINDING_STRIDE = 1 << 17, /* VK_DYNAMIC_STATE_VERTEX_INPUT_BINDING_STRIDE_EXT */
- ANV_CMD_DIRTY_DYNAMIC_DEPTH_TEST_ENABLE = 1 << 18, /* VK_DYNAMIC_STATE_DEPTH_TEST_ENABLE_EXT */
- ANV_CMD_DIRTY_DYNAMIC_DEPTH_WRITE_ENABLE = 1 << 19, /* VK_DYNAMIC_STATE_DEPTH_WRITE_ENABLE_EXT */
- ANV_CMD_DIRTY_DYNAMIC_DEPTH_COMPARE_OP = 1 << 20, /* VK_DYNAMIC_STATE_DEPTH_COMPARE_OP_EXT */
- ANV_CMD_DIRTY_DYNAMIC_DEPTH_BOUNDS_TEST_ENABLE = 1 << 21, /* VK_DYNAMIC_STATE_DEPTH_BOUNDS_TEST_ENABLE_EXT */
- ANV_CMD_DIRTY_DYNAMIC_STENCIL_TEST_ENABLE = 1 << 22, /* VK_DYNAMIC_STATE_STENCIL_TEST_ENABLE_EXT */
- ANV_CMD_DIRTY_DYNAMIC_STENCIL_OP = 1 << 23, /* VK_DYNAMIC_STATE_STENCIL_OP_EXT */
- ANV_CMD_DIRTY_DYNAMIC_SAMPLE_LOCATIONS = 1 << 24, /* VK_DYNAMIC_STATE_SAMPLE_LOCATIONS_EXT */
- ANV_CMD_DIRTY_DYNAMIC_COLOR_BLEND_STATE = 1 << 25, /* VK_DYNAMIC_STATE_COLOR_WRITE_ENABLE_EXT */
- ANV_CMD_DIRTY_DYNAMIC_SHADING_RATE = 1 << 26, /* VK_DYNAMIC_STATE_FRAGMENT_SHADING_RATE_KHR */
- ANV_CMD_DIRTY_DYNAMIC_RASTERIZER_DISCARD_ENABLE = 1 << 27, /* VK_DYNAMIC_STATE_RASTERIZER_DISCARD_ENABLE_EXT */
- ANV_CMD_DIRTY_DYNAMIC_DEPTH_BIAS_ENABLE = 1 << 28, /* VK_DYNAMIC_STATE_DEPTH_BIAS_ENABLE_EXT */
- ANV_CMD_DIRTY_DYNAMIC_LOGIC_OP = 1 << 29, /* VK_DYNAMIC_STATE_LOGIC_OP_EXT */
- ANV_CMD_DIRTY_DYNAMIC_PRIMITIVE_RESTART_ENABLE = 1 << 30, /* VK_DYNAMIC_STATE_PRIMITIVE_RESTART_ENABLE_EXT */
-};
-typedef uint32_t anv_cmd_dirty_mask_t;
-
-#define ANV_CMD_DIRTY_DYNAMIC_ALL \
- (ANV_CMD_DIRTY_DYNAMIC_VIEWPORT | \
- ANV_CMD_DIRTY_DYNAMIC_SCISSOR | \
- ANV_CMD_DIRTY_DYNAMIC_LINE_WIDTH | \
- ANV_CMD_DIRTY_DYNAMIC_DEPTH_BIAS | \
- ANV_CMD_DIRTY_DYNAMIC_BLEND_CONSTANTS | \
- ANV_CMD_DIRTY_DYNAMIC_DEPTH_BOUNDS | \
- ANV_CMD_DIRTY_DYNAMIC_STENCIL_COMPARE_MASK | \
- ANV_CMD_DIRTY_DYNAMIC_STENCIL_WRITE_MASK | \
- ANV_CMD_DIRTY_DYNAMIC_STENCIL_REFERENCE | \
- ANV_CMD_DIRTY_DYNAMIC_LINE_STIPPLE | \
- ANV_CMD_DIRTY_DYNAMIC_CULL_MODE | \
- ANV_CMD_DIRTY_DYNAMIC_FRONT_FACE | \
- ANV_CMD_DIRTY_DYNAMIC_PRIMITIVE_TOPOLOGY | \
- ANV_CMD_DIRTY_DYNAMIC_VERTEX_INPUT_BINDING_STRIDE | \
- ANV_CMD_DIRTY_DYNAMIC_DEPTH_TEST_ENABLE | \
- ANV_CMD_DIRTY_DYNAMIC_DEPTH_WRITE_ENABLE | \
- ANV_CMD_DIRTY_DYNAMIC_DEPTH_COMPARE_OP | \
- ANV_CMD_DIRTY_DYNAMIC_DEPTH_BOUNDS_TEST_ENABLE | \
- ANV_CMD_DIRTY_DYNAMIC_STENCIL_TEST_ENABLE | \
- ANV_CMD_DIRTY_DYNAMIC_STENCIL_OP | \
- ANV_CMD_DIRTY_DYNAMIC_SAMPLE_LOCATIONS | \
- ANV_CMD_DIRTY_DYNAMIC_COLOR_BLEND_STATE | \
- ANV_CMD_DIRTY_DYNAMIC_SHADING_RATE | \
- ANV_CMD_DIRTY_DYNAMIC_RASTERIZER_DISCARD_ENABLE | \
- ANV_CMD_DIRTY_DYNAMIC_DEPTH_BIAS_ENABLE | \
- ANV_CMD_DIRTY_DYNAMIC_LOGIC_OP | \
- ANV_CMD_DIRTY_DYNAMIC_PRIMITIVE_RESTART_ENABLE)
-
-static inline enum anv_cmd_dirty_bits
-anv_cmd_dirty_bit_for_vk_dynamic_state(VkDynamicState vk_state)
+static inline bool
+anv_sparse_residency_is_enabled(struct anv_device *device)
{
- switch (vk_state) {
- case VK_DYNAMIC_STATE_VIEWPORT:
- case VK_DYNAMIC_STATE_VIEWPORT_WITH_COUNT_EXT:
- return ANV_CMD_DIRTY_DYNAMIC_VIEWPORT;
- case VK_DYNAMIC_STATE_SCISSOR:
- case VK_DYNAMIC_STATE_SCISSOR_WITH_COUNT_EXT:
- return ANV_CMD_DIRTY_DYNAMIC_SCISSOR;
- case VK_DYNAMIC_STATE_LINE_WIDTH:
- return ANV_CMD_DIRTY_DYNAMIC_LINE_WIDTH;
- case VK_DYNAMIC_STATE_DEPTH_BIAS:
- return ANV_CMD_DIRTY_DYNAMIC_DEPTH_BIAS;
- case VK_DYNAMIC_STATE_BLEND_CONSTANTS:
- return ANV_CMD_DIRTY_DYNAMIC_BLEND_CONSTANTS;
- case VK_DYNAMIC_STATE_DEPTH_BOUNDS:
- return ANV_CMD_DIRTY_DYNAMIC_DEPTH_BOUNDS;
- case VK_DYNAMIC_STATE_STENCIL_COMPARE_MASK:
- return ANV_CMD_DIRTY_DYNAMIC_STENCIL_COMPARE_MASK;
- case VK_DYNAMIC_STATE_STENCIL_WRITE_MASK:
- return ANV_CMD_DIRTY_DYNAMIC_STENCIL_WRITE_MASK;
- case VK_DYNAMIC_STATE_STENCIL_REFERENCE:
- return ANV_CMD_DIRTY_DYNAMIC_STENCIL_REFERENCE;
- case VK_DYNAMIC_STATE_LINE_STIPPLE_EXT:
- return ANV_CMD_DIRTY_DYNAMIC_LINE_STIPPLE;
- case VK_DYNAMIC_STATE_CULL_MODE_EXT:
- return ANV_CMD_DIRTY_DYNAMIC_CULL_MODE;
- case VK_DYNAMIC_STATE_FRONT_FACE_EXT:
- return ANV_CMD_DIRTY_DYNAMIC_FRONT_FACE;
- case VK_DYNAMIC_STATE_PRIMITIVE_TOPOLOGY_EXT:
- return ANV_CMD_DIRTY_DYNAMIC_PRIMITIVE_TOPOLOGY;
- case VK_DYNAMIC_STATE_VERTEX_INPUT_BINDING_STRIDE_EXT:
- return ANV_CMD_DIRTY_DYNAMIC_VERTEX_INPUT_BINDING_STRIDE;
- case VK_DYNAMIC_STATE_DEPTH_TEST_ENABLE_EXT:
- return ANV_CMD_DIRTY_DYNAMIC_DEPTH_TEST_ENABLE;
- case VK_DYNAMIC_STATE_DEPTH_WRITE_ENABLE_EXT:
- return ANV_CMD_DIRTY_DYNAMIC_DEPTH_WRITE_ENABLE;
- case VK_DYNAMIC_STATE_DEPTH_COMPARE_OP_EXT:
- return ANV_CMD_DIRTY_DYNAMIC_DEPTH_COMPARE_OP;
- case VK_DYNAMIC_STATE_DEPTH_BOUNDS_TEST_ENABLE_EXT:
- return ANV_CMD_DIRTY_DYNAMIC_DEPTH_BOUNDS_TEST_ENABLE;
- case VK_DYNAMIC_STATE_STENCIL_TEST_ENABLE_EXT:
- return ANV_CMD_DIRTY_DYNAMIC_STENCIL_TEST_ENABLE;
- case VK_DYNAMIC_STATE_STENCIL_OP_EXT:
- return ANV_CMD_DIRTY_DYNAMIC_STENCIL_OP;
- case VK_DYNAMIC_STATE_SAMPLE_LOCATIONS_EXT:
- return ANV_CMD_DIRTY_DYNAMIC_SAMPLE_LOCATIONS;
- case VK_DYNAMIC_STATE_COLOR_WRITE_ENABLE_EXT:
- return ANV_CMD_DIRTY_DYNAMIC_COLOR_BLEND_STATE;
- case VK_DYNAMIC_STATE_FRAGMENT_SHADING_RATE_KHR:
- return ANV_CMD_DIRTY_DYNAMIC_SHADING_RATE;
- case VK_DYNAMIC_STATE_RASTERIZER_DISCARD_ENABLE_EXT:
- return ANV_CMD_DIRTY_DYNAMIC_RASTERIZER_DISCARD_ENABLE;
- case VK_DYNAMIC_STATE_DEPTH_BIAS_ENABLE_EXT:
- return ANV_CMD_DIRTY_DYNAMIC_DEPTH_BIAS_ENABLE;
- case VK_DYNAMIC_STATE_LOGIC_OP_EXT:
- return ANV_CMD_DIRTY_DYNAMIC_LOGIC_OP;
- case VK_DYNAMIC_STATE_PRIMITIVE_RESTART_ENABLE_EXT:
- return ANV_CMD_DIRTY_DYNAMIC_PRIMITIVE_RESTART_ENABLE;
- default:
- assert(!"Unsupported dynamic state");
- return 0;
- }
+ return device->vk.enabled_features.sparseResidencyBuffer ||
+ device->vk.enabled_features.sparseResidencyImage2D ||
+ device->vk.enabled_features.sparseResidencyImage3D ||
+ device->vk.enabled_features.sparseResidency2Samples ||
+ device->vk.enabled_features.sparseResidency4Samples ||
+ device->vk.enabled_features.sparseResidency8Samples ||
+ device->vk.enabled_features.sparseResidency16Samples ||
+ device->vk.enabled_features.sparseResidencyAliased;
}
+VkResult anv_init_sparse_bindings(struct anv_device *device,
+ uint64_t size,
+ struct anv_sparse_binding_data *sparse,
+ enum anv_bo_alloc_flags alloc_flags,
+ uint64_t client_address,
+ struct anv_address *out_address);
+void anv_free_sparse_bindings(struct anv_device *device,
+ struct anv_sparse_binding_data *sparse);
+VkResult anv_sparse_bind_buffer(struct anv_device *device,
+ struct anv_buffer *buffer,
+ const VkSparseMemoryBind *vk_bind,
+ struct anv_sparse_submission *submit);
+VkResult anv_sparse_bind_image_opaque(struct anv_device *device,
+ struct anv_image *image,
+ const VkSparseMemoryBind *vk_bind,
+ struct anv_sparse_submission *submit);
+VkResult anv_sparse_bind_image_memory(struct anv_queue *queue,
+ struct anv_image *image,
+ const VkSparseImageMemoryBind *bind,
+ struct anv_sparse_submission *submit);
+VkResult anv_sparse_bind(struct anv_device *device,
+ struct anv_sparse_submission *sparse_submit);
+
+VkSparseImageFormatProperties
+anv_sparse_calc_image_format_properties(struct anv_physical_device *pdevice,
+ VkImageAspectFlags aspect,
+ VkImageType vk_image_type,
+ struct isl_surf *surf);
+void anv_sparse_calc_miptail_properties(struct anv_device *device,
+ struct anv_image *image,
+ VkImageAspectFlags vk_aspect,
+ uint32_t *imageMipTailFirstLod,
+ VkDeviceSize *imageMipTailSize,
+ VkDeviceSize *imageMipTailOffset,
+ VkDeviceSize *imageMipTailStride);
+VkResult anv_sparse_image_check_support(struct anv_physical_device *pdevice,
+ VkImageCreateFlags flags,
+ VkImageTiling tiling,
+ VkSampleCountFlagBits samples,
+ VkImageType type,
+ VkFormat format);
+VkResult anv_trtt_batch_bo_new(struct anv_device *device, uint32_t batch_size,
+ struct anv_trtt_batch_bo **out_trtt_bbo);
+
+struct anv_buffer {
+ struct vk_buffer vk;
+
+ /* Set when bound */
+ struct anv_address address;
+
+ struct anv_sparse_binding_data sparse_data;
+};
+
+static inline bool
+anv_buffer_is_sparse(const struct anv_buffer *buffer)
+{
+ return buffer->vk.create_flags & VK_BUFFER_CREATE_SPARSE_BINDING_BIT;
+}
+
+enum anv_cmd_dirty_bits {
+ ANV_CMD_DIRTY_PIPELINE = 1 << 0,
+ ANV_CMD_DIRTY_INDEX_BUFFER = 1 << 1,
+ ANV_CMD_DIRTY_RENDER_AREA = 1 << 2,
+ ANV_CMD_DIRTY_RENDER_TARGETS = 1 << 3,
+ ANV_CMD_DIRTY_XFB_ENABLE = 1 << 4,
+ ANV_CMD_DIRTY_RESTART_INDEX = 1 << 5,
+ ANV_CMD_DIRTY_OCCLUSION_QUERY_ACTIVE = 1 << 6,
+ ANV_CMD_DIRTY_FS_MSAA_FLAGS = 1 << 7,
+};
+typedef enum anv_cmd_dirty_bits anv_cmd_dirty_mask_t;
enum anv_pipe_bits {
ANV_PIPE_DEPTH_CACHE_FLUSH_BIT = (1 << 0),
@@ -2479,6 +3213,20 @@ enum anv_pipe_bits {
* must reinterpret this flush as ANV_PIPE_DATA_CACHE_FLUSH_BIT.
*/
ANV_PIPE_HDC_PIPELINE_FLUSH_BIT = (1 << 14),
+ ANV_PIPE_PSS_STALL_SYNC_BIT = (1 << 15),
+
+ /*
+ * This bit flushes the data-port's Untyped L1 data cache (LSC L1).
+ */
+ ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT = (1 << 16),
+
+ /* This bit controls the flushing of the engine-specific (Render, Compute)
+ * entries from the compression cache.
+ */
+ ANV_PIPE_CCS_CACHE_FLUSH_BIT = (1 << 17),
+
+ ANV_PIPE_TLB_INVALIDATE_BIT = (1 << 18),
+
ANV_PIPE_CS_STALL_BIT = (1 << 20),
ANV_PIPE_END_OF_PIPE_SYNC_BIT = (1 << 21),
@@ -2489,225 +3237,158 @@ enum anv_pipe_bits {
*/
ANV_PIPE_NEEDS_END_OF_PIPE_SYNC_BIT = (1 << 22),
- /* This bit does not exist directly in PIPE_CONTROL. It means that render
- * target operations related to transfer commands with VkBuffer as
- * destination are ongoing. Some operations like copies on the command
- * streamer might need to be aware of this to trigger the appropriate stall
- * before they can proceed with the copy.
- */
- ANV_PIPE_RENDER_TARGET_BUFFER_WRITES = (1 << 23),
-
/* This bit does not exist directly in PIPE_CONTROL. It means that Gfx12
* AUX-TT data has changed and we need to invalidate AUX-TT data. This is
* done by writing the AUX-TT register.
*/
- ANV_PIPE_AUX_TABLE_INVALIDATE_BIT = (1 << 24),
+ ANV_PIPE_AUX_TABLE_INVALIDATE_BIT = (1 << 23),
/* This bit does not exist directly in PIPE_CONTROL. It means that a
* PIPE_CONTROL with a post-sync operation will follow. This is used to
* implement a workaround for Gfx9.
*/
- ANV_PIPE_POST_SYNC_BIT = (1 << 25),
+ ANV_PIPE_POST_SYNC_BIT = (1 << 24),
};
+/* These bits track the state of buffer writes for queries. They get cleared
+ * based on PIPE_CONTROL emissions.
+ */
+enum anv_query_bits {
+ ANV_QUERY_WRITES_RT_FLUSH = (1 << 0),
+
+ ANV_QUERY_WRITES_TILE_FLUSH = (1 << 1),
+
+ ANV_QUERY_WRITES_CS_STALL = (1 << 2),
+
+ ANV_QUERY_WRITES_DATA_FLUSH = (1 << 3),
+};
+
+/* It's not clear why DG2 doesn't have issues with L3/CS coherency. But it's
+ * likely related to performance workaround 14015868140.
+ *
+ * For now we enable this only on DG2 and platforms prior to Gfx12 where there
+ * is no tile cache.
+ */
+#define ANV_DEVINFO_HAS_COHERENT_L3_CS(devinfo) \
+ (intel_device_info_is_dg2(devinfo))
+
+/* Things we need to flush before accessing query data using the command
+ * streamer.
+ *
+ * Prior to DG2 experiments show that the command streamer is not coherent
+ * with the tile cache so we need to flush it to make any data visible to CS.
+ *
+ * Otherwise we want to flush the RT cache which is where blorp writes, either
+ * for clearing the query buffer or for clearing the destination buffer in
+ * vkCopyQueryPoolResults().
+ */
+#define ANV_QUERY_RENDER_TARGET_WRITES_PENDING_BITS(devinfo) \
+ (((!ANV_DEVINFO_HAS_COHERENT_L3_CS(devinfo) && \
+ (devinfo)->ver >= 12) ? \
+ ANV_QUERY_WRITES_TILE_FLUSH : 0) | \
+ ANV_QUERY_WRITES_RT_FLUSH | \
+ ANV_QUERY_WRITES_CS_STALL)
+#define ANV_QUERY_COMPUTE_WRITES_PENDING_BITS \
+ (ANV_QUERY_WRITES_DATA_FLUSH | \
+ ANV_QUERY_WRITES_CS_STALL)
+
+#define ANV_PIPE_QUERY_BITS(pending_query_bits) ( \
+ ((pending_query_bits & ANV_QUERY_WRITES_RT_FLUSH) ? \
+ ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT : 0) | \
+ ((pending_query_bits & ANV_QUERY_WRITES_TILE_FLUSH) ? \
+ ANV_PIPE_TILE_CACHE_FLUSH_BIT : 0) | \
+ ((pending_query_bits & ANV_QUERY_WRITES_CS_STALL) ? \
+ ANV_PIPE_CS_STALL_BIT : 0) | \
+ ((pending_query_bits & ANV_QUERY_WRITES_DATA_FLUSH) ? \
+ (ANV_PIPE_DATA_CACHE_FLUSH_BIT | \
+ ANV_PIPE_HDC_PIPELINE_FLUSH_BIT | \
+ ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT) : 0))
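As an illustration of how these macros could fit together (the fields come from the command-buffer state struct later in this header; the surrounding code is a sketch, not the driver's actual path):

   /* Before vkCmdCopyQueryPoolResults() reads the query data with the command
    * streamer, turn any still-pending query writes into PIPE_CONTROL bits.
    */
   enum anv_pipe_bits query_flushes =
      ANV_PIPE_QUERY_BITS(cmd_buffer->state.queries.buffer_write_bits);
   cmd_buffer->state.pending_pipe_bits |= query_flushes;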
+
#define ANV_PIPE_FLUSH_BITS ( \
ANV_PIPE_DEPTH_CACHE_FLUSH_BIT | \
ANV_PIPE_DATA_CACHE_FLUSH_BIT | \
ANV_PIPE_HDC_PIPELINE_FLUSH_BIT | \
+ ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT | \
ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT | \
ANV_PIPE_TILE_CACHE_FLUSH_BIT)
#define ANV_PIPE_STALL_BITS ( \
ANV_PIPE_STALL_AT_SCOREBOARD_BIT | \
ANV_PIPE_DEPTH_STALL_BIT | \
- ANV_PIPE_CS_STALL_BIT)
+ ANV_PIPE_CS_STALL_BIT | \
+ ANV_PIPE_PSS_STALL_SYNC_BIT)
#define ANV_PIPE_INVALIDATE_BITS ( \
ANV_PIPE_STATE_CACHE_INVALIDATE_BIT | \
ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT | \
ANV_PIPE_VF_CACHE_INVALIDATE_BIT | \
- ANV_PIPE_HDC_PIPELINE_FLUSH_BIT | \
ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT | \
ANV_PIPE_INSTRUCTION_CACHE_INVALIDATE_BIT | \
ANV_PIPE_AUX_TABLE_INVALIDATE_BIT)
-static inline enum anv_pipe_bits
-anv_pipe_flush_bits_for_access_flags(struct anv_device *device,
- VkAccessFlags flags)
-{
- enum anv_pipe_bits pipe_bits = 0;
-
- u_foreach_bit(b, flags) {
- switch ((VkAccessFlagBits)(1 << b)) {
- case VK_ACCESS_SHADER_WRITE_BIT:
- /* We're transitioning a buffer that was previously used as write
- * destination through the data port. To make its content available
- * to future operations, flush the hdc pipeline.
- */
- pipe_bits |= ANV_PIPE_HDC_PIPELINE_FLUSH_BIT;
- break;
- case VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT:
- /* We're transitioning a buffer that was previously used as render
- * target. To make its content available to future operations, flush
- * the render target cache.
- */
- pipe_bits |= ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT;
- break;
- case VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT:
- /* We're transitioning a buffer that was previously used as depth
- * buffer. To make its content available to future operations, flush
- * the depth cache.
- */
- pipe_bits |= ANV_PIPE_DEPTH_CACHE_FLUSH_BIT;
- break;
- case VK_ACCESS_TRANSFER_WRITE_BIT:
- /* We're transitioning a buffer that was previously used as a
- * transfer write destination. Generic write operations include color
- * & depth operations as well as buffer operations like :
- * - vkCmdClearColorImage()
- * - vkCmdClearDepthStencilImage()
- * - vkCmdBlitImage()
- * - vkCmdCopy*(), vkCmdUpdate*(), vkCmdFill*()
- *
- * Most of these operations are implemented using Blorp which writes
- * through the render target, so flush that cache to make it visible
- * to future operations. And for depth related operations we also
- * need to flush the depth cache.
- */
- pipe_bits |= ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT;
- pipe_bits |= ANV_PIPE_DEPTH_CACHE_FLUSH_BIT;
- break;
- case VK_ACCESS_MEMORY_WRITE_BIT:
- /* We're transitioning a buffer for generic write operations. Flush
- * all the caches.
- */
- pipe_bits |= ANV_PIPE_FLUSH_BITS;
- break;
- case VK_ACCESS_HOST_WRITE_BIT:
- /* We're transitioning a buffer for access by CPU. Invalidate
- * all the caches. Since data and tile caches don't have invalidate,
- * we are forced to flush those as well.
- */
- pipe_bits |= ANV_PIPE_FLUSH_BITS;
- pipe_bits |= ANV_PIPE_INVALIDATE_BITS;
- break;
- default:
- break; /* Nothing to do */
- }
- }
-
- return pipe_bits;
-}
+/* PIPE_CONTROL bits that should be set only in 3D RCS mode.
+ * For more details see genX(emit_apply_pipe_flushes).
+ */
+#define ANV_PIPE_GFX_BITS ( \
+ ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT | \
+ ANV_PIPE_DEPTH_CACHE_FLUSH_BIT | \
+ ANV_PIPE_TILE_CACHE_FLUSH_BIT | \
+ ANV_PIPE_DEPTH_STALL_BIT | \
+ ANV_PIPE_STALL_AT_SCOREBOARD_BIT | \
+ (GFX_VERx10 >= 125 ? ANV_PIPE_PSS_STALL_SYNC_BIT : 0) | \
+ ANV_PIPE_VF_CACHE_INVALIDATE_BIT)
-static inline enum anv_pipe_bits
-anv_pipe_invalidate_bits_for_access_flags(struct anv_device *device,
- VkAccessFlags flags)
-{
- enum anv_pipe_bits pipe_bits = 0;
-
- u_foreach_bit(b, flags) {
- switch ((VkAccessFlagBits)(1 << b)) {
- case VK_ACCESS_INDIRECT_COMMAND_READ_BIT:
- /* Indirect draw commands take a buffer as input that we're going to
- * read from the command streamer to load some of the HW registers
- * (see genX_cmd_buffer.c:load_indirect_parameters). This requires a
- * command streamer stall so that all the cache flushes have
- * completed before the command streamer loads from memory.
- */
- pipe_bits |= ANV_PIPE_CS_STALL_BIT;
- /* Indirect draw commands also set gl_BaseVertex & gl_BaseIndex
- * through a vertex buffer, so invalidate that cache.
- */
- pipe_bits |= ANV_PIPE_VF_CACHE_INVALIDATE_BIT;
- /* For CmdDipatchIndirect, we also load gl_NumWorkGroups through a
- * UBO from the buffer, so we need to invalidate constant cache.
- */
- pipe_bits |= ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT;
- pipe_bits |= ANV_PIPE_DATA_CACHE_FLUSH_BIT;
- /* Tile cache flush needed For CmdDipatchIndirect since command
- * streamer and vertex fetch aren't L3 coherent.
- */
- pipe_bits |= ANV_PIPE_TILE_CACHE_FLUSH_BIT;
- break;
- case VK_ACCESS_INDEX_READ_BIT:
- case VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT:
- /* We transitioning a buffer to be used for as input for vkCmdDraw*
- * commands, so we invalidate the VF cache to make sure there is no
- * stale data when we start rendering.
- */
- pipe_bits |= ANV_PIPE_VF_CACHE_INVALIDATE_BIT;
- break;
- case VK_ACCESS_UNIFORM_READ_BIT:
- /* We transitioning a buffer to be used as uniform data. Because
- * uniform is accessed through the data port & sampler, we need to
- * invalidate the texture cache (sampler) & constant cache (data
- * port) to avoid stale data.
- */
- pipe_bits |= ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT;
- if (device->physical->compiler->indirect_ubos_use_sampler)
- pipe_bits |= ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT;
- else
- pipe_bits |= ANV_PIPE_HDC_PIPELINE_FLUSH_BIT;
- break;
- case VK_ACCESS_SHADER_READ_BIT:
- case VK_ACCESS_INPUT_ATTACHMENT_READ_BIT:
- case VK_ACCESS_TRANSFER_READ_BIT:
- /* Transitioning a buffer to be read through the sampler, so
- * invalidate the texture cache, we don't want any stale data.
- */
- pipe_bits |= ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT;
- break;
- case VK_ACCESS_MEMORY_READ_BIT:
- /* Transitioning a buffer for generic read, invalidate all the
- * caches.
- */
- pipe_bits |= ANV_PIPE_INVALIDATE_BITS;
- break;
- case VK_ACCESS_MEMORY_WRITE_BIT:
- /* Generic write, make sure all previously written things land in
- * memory.
- */
- pipe_bits |= ANV_PIPE_FLUSH_BITS;
- break;
- case VK_ACCESS_CONDITIONAL_RENDERING_READ_BIT_EXT:
- /* Transitioning a buffer for conditional rendering. We'll load the
- * content of this buffer into HW registers using the command
- * streamer, so we need to stall the command streamer to make sure
- * any in-flight flush operations have completed. Needs tile cache
- * and data cache flush because command stream isn't L3 coherent yet.
- */
- pipe_bits |= ANV_PIPE_CS_STALL_BIT;
- pipe_bits |= ANV_PIPE_TILE_CACHE_FLUSH_BIT;
- pipe_bits |= ANV_PIPE_DATA_CACHE_FLUSH_BIT;
- break;
- case VK_ACCESS_HOST_READ_BIT:
- /* We're transitioning a buffer that was written by CPU. Flush
- * all the caches.
- */
- pipe_bits |= ANV_PIPE_FLUSH_BITS;
- break;
- default:
- break; /* Nothing to do */
- }
- }
+/* PIPE_CONTROL bits that should be set only in Media/GPGPU RCS mode.
+ * For more details see genX(emit_apply_pipe_flushes).
+ *
+ * Documentation says that untyped L1 dataport cache flush is controlled by
+ * HDC pipeline flush in 3D mode according to HDC_CHICKEN0 register:
+ *
+ * BSpec 47112: PIPE_CONTROL::HDC Pipeline Flush:
+ *
+ * "When the "Pipeline Select" mode in PIPELINE_SELECT command is set to
+ * "3D", HDC Pipeline Flush can also flush/invalidate the LSC Untyped L1
+ * cache based on the programming of HDC_Chicken0 register bits 13:11."
+ *
+ * "When the 'Pipeline Select' mode is set to 'GPGPU', the LSC Untyped L1
+ * cache flush is controlled by 'Untyped Data-Port Cache Flush' bit in the
+ * PIPE_CONTROL command."
+ *
+ * As part of Wa_22010960976 & Wa_14013347512, i915 is programming
+ * HDC_CHICKEN0[11:13] = 0 ("Untyped L1 is flushed, for both 3D Pipecontrol
+ * Dataport flush, and UAV coherency barrier event"). So there is no need
+ * to set "Untyped Data-Port Cache" in 3D mode.
+ *
+ * On MTL the HDC_CHICKEN0 default values changed to match what was programmed
+ * by Wa_22010960976 & Wa_14013347512 on DG2, but experiments show that the
+ * change runs a bit deeper. Even manually writing to the HDC_CHICKEN0
+ * register to force L1 untyped flush with HDC pipeline flush has no effect on
+ * MTL.
+ *
+ * It seems like the HW change completely disconnected L1 untyped flush from
+ * HDC pipeline flush with no way to bring that behavior back. So leave the L1
+ * untyped flush active in 3D mode on all platforms since it doesn't seems to
+ * cause issues there too.
+ *
+ * Maybe we'll have some GPGPU only bits here at some point.
+ */
+#define ANV_PIPE_GPGPU_BITS (0)
- return pipe_bits;
-}
+enum intel_ds_stall_flag
+anv_pipe_flush_bit_to_ds_stall_flag(enum anv_pipe_bits bits);
-#define VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV ( \
- VK_IMAGE_ASPECT_COLOR_BIT | \
- VK_IMAGE_ASPECT_PLANE_0_BIT | \
- VK_IMAGE_ASPECT_PLANE_1_BIT | \
- VK_IMAGE_ASPECT_PLANE_2_BIT)
#define VK_IMAGE_ASPECT_PLANES_BITS_ANV ( \
VK_IMAGE_ASPECT_PLANE_0_BIT | \
VK_IMAGE_ASPECT_PLANE_1_BIT | \
VK_IMAGE_ASPECT_PLANE_2_BIT)
+#define VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV ( \
+ VK_IMAGE_ASPECT_COLOR_BIT | \
+ VK_IMAGE_ASPECT_PLANES_BITS_ANV)
+
struct anv_vertex_binding {
struct anv_buffer * buffer;
VkDeviceSize offset;
- VkDeviceSize stride;
VkDeviceSize size;
};
@@ -2721,129 +3402,88 @@ struct anv_push_constants {
/** Push constant data provided by the client through vkPushConstants */
uint8_t client_data[MAX_PUSH_CONSTANTS_SIZE];
- /** Dynamic offsets for dynamic UBOs and SSBOs */
- uint32_t dynamic_offsets[MAX_DYNAMIC_BUFFERS];
-
- /* Robust access pushed registers. */
- uint64_t push_reg_mask[MESA_SHADER_STAGES];
-
- /** Pad out to a multiple of 32 bytes */
- uint32_t pad[2];
-
- /* Base addresses for descriptor sets */
- uint64_t desc_sets[MAX_SETS];
-
- struct {
- /** Base workgroup ID
- *
- * Used for vkCmdDispatchBase.
- */
- uint32_t base_work_group_id[3];
-
- /** Subgroup ID
- *
- * This is never set by software but is implicitly filled out when
- * uploading the push constants for compute shaders.
- */
- uint32_t subgroup_id;
- } cs;
-};
+#define ANV_DESCRIPTOR_SET_DYNAMIC_INDEX_MASK ((uint32_t)ANV_UBO_ALIGNMENT - 1)
+#define ANV_DESCRIPTOR_SET_OFFSET_MASK (~(uint32_t)(ANV_UBO_ALIGNMENT - 1))
-struct anv_dynamic_state {
- struct {
- uint32_t count;
- VkViewport viewports[MAX_VIEWPORTS];
- } viewport;
-
- struct {
- uint32_t count;
- VkRect2D scissors[MAX_SCISSORS];
- } scissor;
-
- float line_width;
-
- struct {
- float bias;
- float clamp;
- float slope;
- } depth_bias;
-
- float blend_constants[4];
-
- struct {
- float min;
- float max;
- } depth_bounds;
+ /**
+ * Base surface offsets for descriptor sets.
+ *
+ * The offset has a different meaning depending on a number of factors:
+ *
+ * - with descriptor sets (direct or indirect), this is relative to
+ * pdevice->va.descriptor_pool
+ *
+ * - with descriptor buffers on DG2+, relative to
+ * device->va.descriptor_buffer_pool
+ *
+ * - with descriptor buffers prior to DG2, relative to the programmed value
+ * in STATE_BASE_ADDRESS::BindlessSurfaceStateBaseAddress
+ */
+ uint32_t desc_surface_offsets[MAX_SETS];
- struct {
- uint32_t front;
- uint32_t back;
- } stencil_compare_mask;
+ /**
+ * Base sampler offsets for descriptor sets.
+ */
+ uint32_t desc_sampler_offsets[MAX_SETS];
- struct {
- uint32_t front;
- uint32_t back;
- } stencil_write_mask;
+ /** Dynamic offsets for dynamic UBOs and SSBOs */
+ uint32_t dynamic_offsets[MAX_DYNAMIC_BUFFERS];
- struct {
- uint32_t front;
- uint32_t back;
- } stencil_reference;
+ /** Surface buffer base offset
+ *
+ * Only used prior to DG2 with descriptor buffers.
+ *
+ * (surfaces_base_offset + desc_surface_offsets[set_index]) is relative to
+ * device->va.descriptor_buffer_pool and can be used to compute a 64bit
+ * address to the descriptor buffer (using load_desc_set_address_intel).
+ */
+ uint32_t surfaces_base_offset;
- struct {
- struct {
- VkStencilOp fail_op;
- VkStencilOp pass_op;
- VkStencilOp depth_fail_op;
- VkCompareOp compare_op;
- } front;
+ union {
struct {
- VkStencilOp fail_op;
- VkStencilOp pass_op;
- VkStencilOp depth_fail_op;
- VkCompareOp compare_op;
- } back;
- } stencil_op;
+ /** Dynamic MSAA value */
+ uint32_t fs_msaa_flags;
- struct {
- uint32_t factor;
- uint16_t pattern;
- } line_stipple;
+ /** Dynamic TCS input vertices */
+ uint32_t tcs_input_vertices;
+ } gfx;
- struct {
- uint32_t samples;
- VkSampleLocationEXT locations[MAX_SAMPLE_LOCATIONS];
- } sample_locations;
+ struct {
+ /** Base workgroup ID
+ *
+ * Used for vkCmdDispatchBase.
+ */
+ uint32_t base_work_group_id[3];
- VkExtent2D fragment_shading_rate;
+ /** Subgroup ID
+ *
+ * This is never set by software but is implicitly filled out when
+ * uploading the push constants for compute shaders.
+ */
+ uint32_t subgroup_id;
+ } cs;
+ };
- VkCullModeFlags cull_mode;
- VkFrontFace front_face;
- VkPrimitiveTopology primitive_topology;
- bool depth_test_enable;
- bool depth_write_enable;
- VkCompareOp depth_compare_op;
- bool depth_bounds_test_enable;
- bool stencil_test_enable;
- bool raster_discard;
- bool depth_bias_enable;
- bool primitive_restart_enable;
- VkLogicOp logic_op;
- bool dyn_vbo_stride;
- bool dyn_vbo_size;
+ /* Robust access pushed registers. */
+ uint64_t push_reg_mask[MESA_SHADER_STAGES];
- /* Bitfield, one bit per render target */
- uint8_t color_writes;
+ /** Ray query globals (RT_DISPATCH_GLOBALS) */
+ uint64_t ray_query_globals;
};
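A hedged sketch of how the ANV_DESCRIPTOR_SET_DYNAMIC_INDEX_MASK / ANV_DESCRIPTOR_SET_OFFSET_MASK macros above could split a packed desc_surface_offsets[] entry; the packing (dynamic-offset index in the low bits, ANV_UBO_ALIGNMENT-aligned offset in the upper bits) is inferred from the mask values, not stated by this patch:

   /* push is an assumed pointer to struct anv_push_constants. */
   uint32_t packed  = push->desc_surface_offsets[set_idx];
   uint32_t dyn_idx = packed & ANV_DESCRIPTOR_SET_DYNAMIC_INDEX_MASK;
   uint32_t offset  = packed & ANV_DESCRIPTOR_SET_OFFSET_MASK;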
-extern const struct anv_dynamic_state default_dynamic_state;
-
-uint32_t anv_dynamic_state_copy(struct anv_dynamic_state *dest,
- const struct anv_dynamic_state *src,
- uint32_t copy_mask);
-
struct anv_surface_state {
+ /** Surface state allocated from the bindless heap
+ *
+ * Can be NULL if unused.
+ */
struct anv_state state;
+
+ /** Surface state after genxml packing
+ *
+ * Same data as in state.
+ */
+ struct anv_surface_state_data state_data;
+
/** Address of the surface referred to by this state
*
* This address is relative to the start of the BO.
@@ -2864,31 +3504,16 @@ struct anv_surface_state {
struct anv_address clear_address;
};
-/**
- * Attachment state when recording a renderpass instance.
- *
- * The clear value is valid only if there exists a pending clear.
- */
-struct anv_attachment_state {
- enum isl_aux_usage aux_usage;
- struct anv_surface_state color;
- struct anv_surface_state input;
-
- VkImageLayout current_layout;
- VkImageLayout current_stencil_layout;
- VkImageAspectFlags pending_clear_aspects;
- VkImageAspectFlags pending_load_aspects;
- bool fast_clear;
- VkClearValue clear_value;
+struct anv_attachment {
+ VkFormat vk_format;
+ const struct anv_image_view *iview;
+ VkImageLayout layout;
+ enum isl_aux_usage aux_usage;
+ struct anv_surface_state surface_state;
- /* When multiview is active, attachments with a renderpass clear
- * operation have their respective layers cleared on the first
- * subpass that uses them, and only in that subpass. We keep track
- * of this using a bitfield to indicate which layers of an attachment
- * have not been cleared yet when multiview is active.
- */
- uint32_t pending_clear_views;
- struct anv_image_view * image_view;
+ VkResolveModeFlagBits resolve_mode;
+ const struct anv_image_view *resolve_iview;
+ VkImageLayout resolve_layout;
};
/** State tracking for vertex buffer flushes
@@ -2912,6 +3537,70 @@ struct anv_vb_cache_range {
uint64_t end;
};
+static inline void
+anv_merge_vb_cache_range(struct anv_vb_cache_range *dirty,
+ const struct anv_vb_cache_range *bound)
+{
+ if (dirty->start == dirty->end) {
+ *dirty = *bound;
+ } else if (bound->start != bound->end) {
+ dirty->start = MIN2(dirty->start, bound->start);
+ dirty->end = MAX2(dirty->end, bound->end);
+ }
+}
+
+/* Check whether we need to apply the Gfx8-9 vertex buffer workaround */
+static inline bool
+anv_gfx8_9_vb_cache_range_needs_workaround(struct anv_vb_cache_range *bound,
+ struct anv_vb_cache_range *dirty,
+ struct anv_address vb_address,
+ uint32_t vb_size)
+{
+ if (vb_size == 0) {
+ bound->start = 0;
+ bound->end = 0;
+ return false;
+ }
+
+ bound->start = intel_48b_address(anv_address_physical(vb_address));
+ bound->end = bound->start + vb_size;
+ assert(bound->end > bound->start); /* No overflow */
+
+ /* Align everything to a cache line */
+ bound->start &= ~(64ull - 1ull);
+ bound->end = align64(bound->end, 64);
+
+ anv_merge_vb_cache_range(dirty, bound);
+
+ /* If our range is larger than 32 bits, we have to flush */
+ assert(bound->end - bound->start <= (1ull << 32));
+ return (dirty->end - dirty->start) > (1ull << 32);
+}
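A usage sketch for the helper above (gfx and idx are assumed locals pointing at the graphics state declared later in this header):

   if (anv_gfx8_9_vb_cache_range_needs_workaround(&gfx->vb_bound_ranges[idx],
                                                  &gfx->vb_dirty_ranges[idx],
                                                  vb_address, vb_size)) {
      /* The combined dirty range exceeds 32 bits of address space, so the VF
       * cache must be invalidated before the next draw.
       */
      cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_VF_CACHE_INVALIDATE_BIT;
   }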
+
+/**
+ * State tracking for simple internal shaders
+ */
+struct anv_simple_shader {
+ /* The device associated with this emission */
+ struct anv_device *device;
+ /* The command buffer associated with this emission (can be NULL) */
+ struct anv_cmd_buffer *cmd_buffer;
+ /* State stream used for various internal allocations */
+ struct anv_state_stream *dynamic_state_stream;
+ struct anv_state_stream *general_state_stream;
+ /* Where to emit the commands (can be different from cmd_buffer->batch) */
+ struct anv_batch *batch;
+ /* Shader to use */
+ struct anv_shader_bin *kernel;
+ /* L3 config used by the shader */
+ const struct intel_l3_config *l3_config;
+ /* Current URB config */
+ const struct intel_urb_config *urb_cfg;
+
+ /* Managed by the simple shader helper */
+ struct anv_state bt_state;
+};
+
/** State tracking for particular pipeline bind point
*
* This struct is the base struct for anv_cmd_graphics_state and
@@ -2922,12 +3611,55 @@ struct anv_vb_cache_range {
*/
struct anv_cmd_pipeline_state {
struct anv_descriptor_set *descriptors[MAX_SETS];
- struct anv_push_descriptor_set *push_descriptors[MAX_SETS];
+ struct {
+ bool bound;
+ /**
+ * Buffer index used by this descriptor set.
+ */
+ int32_t buffer_index; /* -1 means push descriptor */
+ /**
+ * Offset of the descriptor set in the descriptor buffer.
+ */
+ uint32_t buffer_offset;
+ /**
+ * Final computed address to be emitted in the descriptor set surface
+ * state.
+ */
+ uint64_t address;
+ /**
+ * The descriptor set surface state.
+ */
+ struct anv_state state;
+ } descriptor_buffers[MAX_SETS];
+ struct anv_push_descriptor_set push_descriptor;
struct anv_push_constants push_constants;
+ /** Tracks whether the push constant data has changed and needs to be reemitted */
+ bool push_constants_data_dirty;
+
/* Push constant state allocated when flushing push constants. */
struct anv_state push_constants_state;
+
+ /**
+ * Dynamic buffer offsets.
+ *
+ * We have a maximum of MAX_DYNAMIC_BUFFERS per pipeline, but with
+ * independent sets we cannot know which how much in total is going to be
+ * used. As a result we need to store the maximum possible number per set.
+ *
+ * Those values are written into anv_push_constants::dynamic_offsets at
+ * flush time when have the pipeline with the final
+ * anv_pipeline_sets_layout.
+ */
+ struct {
+ uint32_t offsets[MAX_DYNAMIC_BUFFERS];
+ } dynamic_offsets[MAX_SETS];
+
+ /**
+ * The current bound pipeline.
+ */
+ struct anv_pipeline *pipeline;
};
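A flush-time sketch of how the per-set offsets above could be compacted into anv_push_constants::dynamic_offsets once the final sets layout is known (dynamic_offset_count on the set layout is an assumption, it does not appear in the hunks shown):

   uint32_t *out = pipe_state->push_constants.dynamic_offsets;
   for (uint32_t s = 0; s < layout->num_sets; s++) {
      if (layout->set[s].layout == NULL)
         continue;
      for (uint32_t i = 0; i < layout->set[s].layout->dynamic_offset_count; i++) {
         out[layout->set[s].dynamic_offset_start + i] =
            pipe_state->dynamic_offsets[s].offsets[i];
      }
   }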
/** State tracking for graphics pipeline
@@ -2940,7 +3672,17 @@ struct anv_cmd_pipeline_state {
struct anv_cmd_graphics_state {
struct anv_cmd_pipeline_state base;
- struct anv_graphics_pipeline *pipeline;
+ VkRenderingFlags rendering_flags;
+ VkRect2D render_area;
+ uint32_t layer_count;
+ uint32_t samples;
+ uint32_t view_mask;
+ uint32_t color_att_count;
+ struct anv_state att_states;
+ struct anv_attachment color_att[MAX_RTS];
+ struct anv_attachment depth_att;
+ struct anv_attachment stencil_att;
+ struct anv_state null_surface_state;
anv_cmd_dirty_mask_t dirty;
uint32_t vb_dirty;
@@ -2950,23 +3692,55 @@ struct anv_cmd_graphics_state {
struct anv_vb_cache_range vb_bound_ranges[33];
struct anv_vb_cache_range vb_dirty_ranges[33];
- VkShaderStageFlags push_constant_stages;
+ uint32_t restart_index;
- struct anv_dynamic_state dynamic;
+ VkShaderStageFlags push_constant_stages;
uint32_t primitive_topology;
+ bool used_task_shader;
- struct {
- struct anv_buffer *index_buffer;
- uint32_t index_type; /**< 3DSTATE_INDEX_BUFFER.IndexFormat */
- uint32_t index_offset;
- } gfx7;
+ struct anv_buffer *index_buffer;
+ uint32_t index_type; /**< 3DSTATE_INDEX_BUFFER.IndexFormat */
+ uint32_t index_offset;
+ uint32_t index_size;
+
+ struct vk_vertex_input_state vertex_input;
+ struct vk_sample_locations_state sample_locations;
+
+ /* Dynamic msaa flags; this value can be different from
+ * anv_push_constants::gfx::fs_msaa_flags, as the push constant value only
+ * needs to be updated for fragment shaders dynamically checking the value.
+ */
+ enum intel_msaa_flags fs_msaa_flags;
+
+ bool object_preemption;
+ bool has_uint_rt;
+
+ /* State tracking for Wa_14018912822. */
+ bool color_blend_zero;
+ bool alpha_blend_zero;
+
+ /**
+ * DEPTH and STENCIL attachment write state for Wa_18019816803.
+ */
+ bool ds_write_state;
+
+ /**
+ * State tracking for Wa_18020335297.
+ */
+ bool viewport_set;
+
+ struct intel_urb_config urb_cfg;
+
+ uint32_t n_occlusion_queries;
+
+ struct anv_gfx_dynamic_state dyn_state;
};
enum anv_depth_reg_mode {
ANV_DEPTH_REG_MODE_UNKNOWN = 0,
ANV_DEPTH_REG_MODE_HW_DEFAULT,
- ANV_DEPTH_REG_MODE_D16,
+ ANV_DEPTH_REG_MODE_D16_1X_MSAA,
};
/** State tracking for compute pipeline
@@ -2979,26 +3753,33 @@ enum anv_depth_reg_mode {
struct anv_cmd_compute_state {
struct anv_cmd_pipeline_state base;
- struct anv_compute_pipeline *pipeline;
-
bool pipeline_dirty;
struct anv_state push_data;
struct anv_address num_workgroups;
+
+ uint32_t scratch_size;
};
struct anv_cmd_ray_tracing_state {
struct anv_cmd_pipeline_state base;
- struct anv_ray_tracing_pipeline *pipeline;
-
bool pipeline_dirty;
struct {
struct anv_bo *bo;
struct brw_rt_scratch_layout layout;
} scratch;
+
+ struct anv_address build_priv_mem_addr;
+ size_t build_priv_mem_size;
+};
+
+enum anv_cmd_descriptor_buffer_mode {
+ ANV_CMD_DESCRIPTOR_BUFFER_MODE_UNKNOWN,
+ ANV_CMD_DESCRIPTOR_BUFFER_MODE_LEGACY,
+ ANV_CMD_DESCRIPTOR_BUFFER_MODE_BUFFER,
};
/** State required while building cmd buffer */
@@ -3013,23 +3794,63 @@ struct anv_cmd_state {
struct anv_cmd_ray_tracing_state rt;
enum anv_pipe_bits pending_pipe_bits;
+ const char * pc_reasons[4];
+ uint32_t pc_reasons_count;
+
+ /**
+ * Whether the last programmed STATE_BASE_ADDRESS references
+ * anv_device::dynamic_state_pool or anv_device::dynamic_state_pool_db for
+ * the dynamic state heap.
+ */
+ enum anv_cmd_descriptor_buffer_mode current_db_mode;
+
+ /**
+ * Whether the command buffer has pending descriptor buffers bound to it. This
+ * variable changes before anv_device::current_db_mode.
+ */
+ enum anv_cmd_descriptor_buffer_mode pending_db_mode;
+
+ struct {
+ /**
+ * Tracks operations that may interfere with queries in the destination
+ * buffer of vkCmdCopyQueryPoolResults; those operations need to have
+ * completed before we do the work of vkCmdCopyQueryPoolResults.
+ */
+ enum anv_query_bits buffer_write_bits;
+
+ /**
+ * Tracks clear operations of query buffers that can interact with
+ * vkCmdBeginQuery*, vkCmdWriteTimestamp*,
+ * vkCmdWriteAccelerationStructuresPropertiesKHR, etc...
+ *
+ * We need the clearing of the buffer to have completed before we write
+ * data with the command streamer or a shader.
+ */
+ enum anv_query_bits clear_bits;
+ } queries;
+
VkShaderStageFlags descriptors_dirty;
+ VkShaderStageFlags push_descriptors_dirty;
+ /** Tracks the 3DSTATE_CONSTANT_* instructions that need to be reemitted */
VkShaderStageFlags push_constants_dirty;
- struct anv_framebuffer * framebuffer;
- struct anv_render_pass * pass;
- struct anv_subpass * subpass;
- VkRect2D render_area;
- uint32_t restart_index;
+ struct {
+ uint64_t surfaces_address;
+ uint64_t samplers_address;
+ bool dirty;
+ VkShaderStageFlags offsets_dirty;
+ uint64_t address[MAX_SETS];
+ } descriptor_buffers;
+
struct anv_vertex_binding vertex_bindings[MAX_VBS];
bool xfb_enabled;
struct anv_xfb_binding xfb_bindings[MAX_XFB_BUFFERS];
struct anv_state binding_tables[MESA_VULKAN_SHADER_STAGES];
struct anv_state samplers[MESA_VULKAN_SHADER_STAGES];
- unsigned char sampler_sha1s[MESA_SHADER_STAGES][20];
- unsigned char surface_sha1s[MESA_SHADER_STAGES][20];
- unsigned char push_sha1s[MESA_SHADER_STAGES][20];
+ unsigned char sampler_sha1s[MESA_VULKAN_SHADER_STAGES][20];
+ unsigned char surface_sha1s[MESA_VULKAN_SHADER_STAGES][20];
+ unsigned char push_sha1s[MESA_VULKAN_SHADER_STAGES][20];
/**
* Whether or not the gfx8 PMA fix is enabled. We ensure that, at the top
@@ -3045,47 +3866,36 @@ struct anv_cmd_state {
*/
bool hiz_enabled;
- /* We ensure the registers for the gfx12 D16 fix are initalized at the
+ /* We ensure the registers for the gfx12 D16 fix are initialized at the
* first non-NULL depth stencil packet emission of every command buffer.
* For secondary command buffer execution, we transfer the state from the
* last command buffer to the primary (if known).
*/
enum anv_depth_reg_mode depth_reg_mode;
- bool conditional_render_enabled;
-
/**
- * Last rendering scale argument provided to
- * genX(cmd_buffer_emit_hashing_mode)().
+ * Whether RHWO optimization is enabled (Wa_1508744258).
*/
- unsigned current_hash_scale;
+ bool rhwo_optimization_enabled;
/**
- * Array length is anv_cmd_state::pass::attachment_count. Array content is
- * valid only when recording a render pass instance.
+ * Pending state of the RHWO optimization, to be applied at the next
+ * genX(cmd_buffer_apply_pipe_flushes).
*/
- struct anv_attachment_state * attachments;
+ bool pending_rhwo_optimization_enabled;
+
+ bool conditional_render_enabled;
/**
- * Surface states for color render targets. These are stored in a single
- * flat array. For depth-stencil attachments, the surface state is simply
- * left blank.
+ * Last rendering scale argument provided to
+ * genX(cmd_buffer_emit_hashing_mode)().
*/
- struct anv_state attachment_states;
+ unsigned current_hash_scale;
/**
- * A null surface state of the right size to match the framebuffer. This
- * is one of the states in attachment_states.
+ * A buffer used for spill/fill of ray queries.
*/
- struct anv_state null_surface_state;
-};
-
-struct anv_cmd_pool {
- struct vk_object_base base;
- VkAllocationCallbacks alloc;
- struct list_head cmd_buffers;
-
- VkCommandPoolCreateFlags flags;
+ struct anv_bo * ray_query_shadow_bo;
};
#define ANV_MIN_CMD_BUFFER_BATCH_SIZE 8192
@@ -3103,13 +3913,12 @@ enum anv_cmd_buffer_exec_mode {
struct anv_measure_batch;
struct anv_cmd_buffer {
- struct vk_object_base base;
+ struct vk_command_buffer vk;
struct anv_device * device;
+ struct anv_queue_family * queue_family;
- struct anv_cmd_pool * pool;
- struct list_head pool_link;
-
+ /** Batch where the main commands live */
struct anv_batch batch;
/* Pointer to the location in the batch where MI_BATCH_BUFFER_END was
@@ -3140,8 +3949,6 @@ struct anv_cmd_buffer {
struct anv_state bt_next;
struct anv_reloc_list surface_relocs;
- /** Last seen surface state block pool center bo offset */
- uint32_t last_ss_pool_center;
/* Serial for tracking buffer completion */
uint32_t serial;
@@ -3149,10 +3956,12 @@ struct anv_cmd_buffer {
/* Stream objects for storing temporary data */
struct anv_state_stream surface_state_stream;
struct anv_state_stream dynamic_state_stream;
+ struct anv_state_stream dynamic_state_db_stream;
struct anv_state_stream general_state_stream;
+ struct anv_state_stream indirect_push_descriptor_stream;
+ struct anv_state_stream push_descriptor_buffer_stream;
VkCommandBufferUsageFlags usage_flags;
- VkCommandBufferLevel level;
struct anv_query_pool *perf_query_pool;
@@ -3180,22 +3989,169 @@ struct anv_cmd_buffer {
uint32_t perf_reloc_idx;
/**
- * Sum of all the anv_batch_bo sizes allocated for this command buffer.
- * Used to increase allocation size for long command buffers.
+ * Sum of all the anv_batch_bo written sizes for this command buffer
+ * including any executed secondary command buffer.
*/
uint32_t total_batch_size;
+
+ struct {
+ /** Batch generating part of the anv_cmd_buffer::batch */
+ struct anv_batch batch;
+
+ /**
+ * Location in anv_cmd_buffer::batch at which we left some space to
+ * insert a MI_BATCH_BUFFER_START into the
+ * anv_cmd_buffer::generation::batch if needed.
+ */
+ struct anv_address jump_addr;
+
+ /**
+ * Location in anv_cmd_buffer::batch to which the generation batch
+ * should jump back.
+ */
+ struct anv_address return_addr;
+
+ /** List of anv_batch_bo used for generation
+ *
+ * We have to keep this separate from the anv_cmd_buffer::batch_bos that
+ * is used for a chaining optimization.
+ */
+ struct list_head batch_bos;
+
+ /** Ring buffer of generated commands
+ *
+ * When generating draws in ring mode, this buffer will hold generated
+ * 3DPRIMITIVE commands.
+ */
+ struct anv_bo *ring_bo;
+
+ /**
+ * State tracking of the generation shader (only used for the non-ring
+ * mode).
+ */
+ struct anv_simple_shader shader_state;
+ } generation;
+
+ /**
+ * A vector of anv_bo pointers for chunks of memory used by the command
+ * buffer that are too large to be allocated through dynamic_state_stream.
+ * This is the case, for example, for sufficiently large acceleration
+ * structures.
+ *
+ * Initialized by anv_cmd_buffer_init_batch_bo_chain().
+ */
+ struct u_vector dynamic_bos;
+
+ /**
+ * Structure holding tracepoints recorded in the command buffer.
+ */
+ struct u_trace trace;
+
+ /** Pointer to the last emitted COMPUTE_WALKER.
+ *
+ * This is used to edit the instruction post emission to replace the "Post
+ * Sync" field for utrace timestamp emission.
+ */
+ void *last_compute_walker;
+
+ /** Pointer to the last emitted EXECUTE_INDIRECT_DISPATCH.
+ *
+ * This is used to edit the instruction post emission to replace the "Post
+ * Sync" field for utrace timestamp emission.
+ */
+ void *last_indirect_dispatch;
+
+ struct {
+ struct anv_video_session *vid;
+ struct anv_video_session_params *params;
+ } video;
+
+ /**
+ * Companion RCS command buffer to support MSAA operations on the compute
+ * queue.
+ */
+ struct anv_cmd_buffer *companion_rcs_cmd_buffer;
+
+ /**
+ * Whether this command buffer is a companion command buffer of a compute one.
+ */
+ bool is_companion_rcs_cmd_buffer;
+
};
+extern const struct vk_command_buffer_ops anv_cmd_buffer_ops;
+
/* Determine whether we can chain a given cmd_buffer to another one. We need
- * softpin and we also need to make sure that we can edit the end of the batch
- * to point to next one, which requires the command buffer to not be used
- * simultaneously.
+ * to make sure that we can edit the end of the batch to point to next one,
+ * which requires the command buffer to not be used simultaneously.
+ *
+ * We could in theory also implement chaining with companion command buffers,
+ * but let's spare ourselves some pain and misery. This optimization has no
+ * benefit on the brand new Xe kernel driver.
*/
static inline bool
anv_cmd_buffer_is_chainable(struct anv_cmd_buffer *cmd_buffer)
{
- return anv_use_softpin(cmd_buffer->device->physical) &&
- !(cmd_buffer->usage_flags & VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT);
+ return !(cmd_buffer->usage_flags &
+ VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT) &&
+ !(cmd_buffer->is_companion_rcs_cmd_buffer);
+}
+
+static inline bool
+anv_cmd_buffer_is_render_queue(const struct anv_cmd_buffer *cmd_buffer)
+{
+ struct anv_queue_family *queue_family = cmd_buffer->queue_family;
+ return (queue_family->queueFlags & VK_QUEUE_GRAPHICS_BIT) != 0;
+}
+
+static inline bool
+anv_cmd_buffer_is_video_queue(const struct anv_cmd_buffer *cmd_buffer)
+{
+ struct anv_queue_family *queue_family = cmd_buffer->queue_family;
+ return (queue_family->queueFlags & VK_QUEUE_VIDEO_DECODE_BIT_KHR) != 0;
+}
+
+static inline bool
+anv_cmd_buffer_is_compute_queue(const struct anv_cmd_buffer *cmd_buffer)
+{
+ struct anv_queue_family *queue_family = cmd_buffer->queue_family;
+ return queue_family->engine_class == INTEL_ENGINE_CLASS_COMPUTE;
+}
+
+static inline bool
+anv_cmd_buffer_is_blitter_queue(const struct anv_cmd_buffer *cmd_buffer)
+{
+ struct anv_queue_family *queue_family = cmd_buffer->queue_family;
+ return queue_family->engine_class == INTEL_ENGINE_CLASS_COPY;
+}
+
+static inline bool
+anv_cmd_buffer_is_render_or_compute_queue(const struct anv_cmd_buffer *cmd_buffer)
+{
+ return anv_cmd_buffer_is_render_queue(cmd_buffer) ||
+ anv_cmd_buffer_is_compute_queue(cmd_buffer);
+}
+
+static inline struct anv_address
+anv_cmd_buffer_dynamic_state_address(struct anv_cmd_buffer *cmd_buffer,
+ struct anv_state state)
+{
+ if (cmd_buffer->state.current_db_mode ==
+ ANV_CMD_DESCRIPTOR_BUFFER_MODE_BUFFER) {
+ return anv_state_pool_state_address(
+ &cmd_buffer->device->dynamic_state_db_pool, state);
+ }
+ return anv_state_pool_state_address(
+ &cmd_buffer->device->dynamic_state_pool, state);
+}
+
+static inline uint64_t
+anv_cmd_buffer_descriptor_buffer_address(struct anv_cmd_buffer *cmd_buffer,
+ int32_t buffer_index)
+{
+ if (buffer_index == -1)
+ return cmd_buffer->device->physical->va.push_descriptor_buffer_pool.addr;
+
+ return cmd_buffer->state.descriptor_buffers.address[buffer_index];
}
VkResult anv_cmd_buffer_init_batch_bo_chain(struct anv_cmd_buffer *cmd_buffer);
@@ -3216,7 +4172,8 @@ VkResult anv_cmd_buffer_execbuf(struct anv_queue *queue,
VkFence fence,
int perf_query_pass);
-VkResult anv_cmd_buffer_reset(struct anv_cmd_buffer *cmd_buffer);
+void anv_cmd_buffer_reset(struct vk_command_buffer *vk_cmd_buffer,
+ UNUSED VkCommandBufferResetFlags flags);
struct anv_state anv_cmd_buffer_emit_dynamic(struct anv_cmd_buffer *cmd_buffer,
const void *data, uint32_t size, uint32_t alignment);
@@ -3230,111 +4187,153 @@ struct anv_state
anv_cmd_buffer_alloc_binding_table(struct anv_cmd_buffer *cmd_buffer,
uint32_t entries, uint32_t *state_offset);
struct anv_state
-anv_cmd_buffer_alloc_surface_state(struct anv_cmd_buffer *cmd_buffer);
+anv_cmd_buffer_alloc_surface_states(struct anv_cmd_buffer *cmd_buffer,
+ uint32_t count);
struct anv_state
anv_cmd_buffer_alloc_dynamic_state(struct anv_cmd_buffer *cmd_buffer,
uint32_t size, uint32_t alignment);
+struct anv_state
+anv_cmd_buffer_alloc_general_state(struct anv_cmd_buffer *cmd_buffer,
+ uint32_t size, uint32_t alignment);
+static inline struct anv_state
+anv_cmd_buffer_alloc_temporary_state(struct anv_cmd_buffer *cmd_buffer,
+ uint32_t size, uint32_t alignment)
+{
+ struct anv_state state =
+ anv_state_stream_alloc(&cmd_buffer->dynamic_state_stream,
+ size, alignment);
+ if (state.map == NULL)
+ anv_batch_set_error(&cmd_buffer->batch, VK_ERROR_OUT_OF_DEVICE_MEMORY);
+ return state;
+}
+static inline struct anv_address
+anv_cmd_buffer_temporary_state_address(struct anv_cmd_buffer *cmd_buffer,
+ struct anv_state state)
+{
+ return anv_state_pool_state_address(
+ &cmd_buffer->device->dynamic_state_pool, state);
+}
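+
+/* A possible usage pattern for the two helpers above (size and data below
+ * are placeholders):
+ *
+ *    struct anv_state tmp =
+ *       anv_cmd_buffer_alloc_temporary_state(cmd_buffer, size, 64);
+ *    if (tmp.map == NULL)
+ *       return;
+ *    memcpy(tmp.map, data, size);
+ *    struct anv_address addr =
+ *       anv_cmd_buffer_temporary_state_address(cmd_buffer, tmp);
+ *
+ * The resulting address can then be referenced by GPU commands emitted into
+ * the batch.
+ */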
-VkResult
-anv_cmd_buffer_new_binding_table_block(struct anv_cmd_buffer *cmd_buffer);
+void
+anv_cmd_buffer_chain_command_buffers(struct anv_cmd_buffer **cmd_buffers,
+ uint32_t num_cmd_buffers);
+void
+anv_cmd_buffer_exec_batch_debug(struct anv_queue *queue,
+ uint32_t cmd_buffer_count,
+ struct anv_cmd_buffer **cmd_buffers,
+ struct anv_query_pool *perf_query_pool,
+ uint32_t perf_query_pass);
+void
+anv_cmd_buffer_clflush(struct anv_cmd_buffer **cmd_buffers,
+ uint32_t num_cmd_buffers);
+
+void
+anv_cmd_buffer_update_pending_query_bits(struct anv_cmd_buffer *cmd_buffer,
+ enum anv_pipe_bits flushed_bits);
+
+/**
+ * An allocation tied to a command buffer.
+ *
+ * Don't use anv_cmd_alloc::address::map to write memory from userspace; use
+ * anv_cmd_alloc::map instead.
+ */
+struct anv_cmd_alloc {
+ struct anv_address address;
+ void *map;
+ size_t size;
+};
+
+#define ANV_EMPTY_ALLOC ((struct anv_cmd_alloc) { .map = NULL, .size = 0 })
+
+static inline bool
+anv_cmd_alloc_is_empty(struct anv_cmd_alloc alloc)
+{
+ return alloc.size == 0;
+}
-void gfx8_cmd_buffer_emit_viewport(struct anv_cmd_buffer *cmd_buffer);
-void gfx8_cmd_buffer_emit_depth_viewport(struct anv_cmd_buffer *cmd_buffer,
- bool depth_clamp_enable);
-void gfx7_cmd_buffer_emit_scissor(struct anv_cmd_buffer *cmd_buffer);
+struct anv_cmd_alloc
+anv_cmd_buffer_alloc_space(struct anv_cmd_buffer *cmd_buffer,
+ size_t size, uint32_t alignment,
+ bool private);
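+
+/* A rough usage sketch for anv_cmd_buffer_alloc_space() based on the
+ * declarations above (size and data are placeholders; the last argument is
+ * the private flag):
+ *
+ *    struct anv_cmd_alloc alloc =
+ *       anv_cmd_buffer_alloc_space(cmd_buffer, size, 64, false);
+ *    if (anv_cmd_alloc_is_empty(alloc))
+ *       return;
+ *    memcpy(alloc.map, data, size);
+ *
+ * Per the anv_cmd_alloc comment above, CPU writes go through alloc.map while
+ * alloc.address is what gets referenced from GPU commands.
+ */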
-void anv_cmd_buffer_setup_attachments(struct anv_cmd_buffer *cmd_buffer,
- struct anv_render_pass *pass,
- struct anv_framebuffer *framebuffer,
- const VkClearValue *clear_values);
+VkResult
+anv_cmd_buffer_new_binding_table_block(struct anv_cmd_buffer *cmd_buffer);
-void anv_cmd_buffer_emit_state_base_address(struct anv_cmd_buffer *cmd_buffer);
+void anv_cmd_buffer_emit_bt_pool_base_address(struct anv_cmd_buffer *cmd_buffer);
struct anv_state
anv_cmd_buffer_gfx_push_constants(struct anv_cmd_buffer *cmd_buffer);
struct anv_state
anv_cmd_buffer_cs_push_constants(struct anv_cmd_buffer *cmd_buffer);
-const struct anv_image_view *
-anv_cmd_buffer_get_depth_stencil_view(const struct anv_cmd_buffer *cmd_buffer);
-
VkResult
anv_cmd_buffer_alloc_blorp_binding_table(struct anv_cmd_buffer *cmd_buffer,
uint32_t num_entries,
uint32_t *state_offset,
struct anv_state *bt_state);
-void anv_cmd_buffer_dump(struct anv_cmd_buffer *cmd_buffer);
-
void anv_cmd_emit_conditional_render_predicate(struct anv_cmd_buffer *cmd_buffer);
-enum anv_fence_type {
- ANV_FENCE_TYPE_NONE = 0,
- ANV_FENCE_TYPE_BO,
- ANV_FENCE_TYPE_WSI_BO,
- ANV_FENCE_TYPE_SYNCOBJ,
- ANV_FENCE_TYPE_WSI,
+static inline unsigned
+anv_cmd_buffer_get_view_count(struct anv_cmd_buffer *cmd_buffer)
+{
+ struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
+ return MAX2(1, util_bitcount(gfx->view_mask));
+}
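+
+/* For example, with multiview enabled and gfx->view_mask == 0x5 the helper
+ * above returns 2 (two active views); with multiview disabled
+ * (gfx->view_mask == 0) it returns 1 so that a regular draw still counts as
+ * a single view.
+ */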
+
+/* Save/restore cmd buffer states for meta operations */
+enum anv_cmd_saved_state_flags {
+ ANV_CMD_SAVED_STATE_COMPUTE_PIPELINE = BITFIELD_BIT(0),
+ ANV_CMD_SAVED_STATE_DESCRIPTOR_SET_0 = BITFIELD_BIT(1),
+ ANV_CMD_SAVED_STATE_PUSH_CONSTANTS = BITFIELD_BIT(2),
+};
+
+struct anv_cmd_saved_state {
+ uint32_t flags;
+
+ struct anv_pipeline *pipeline;
+ struct anv_descriptor_set *descriptor_set;
+ uint8_t push_constants[MAX_PUSH_CONSTANTS_SIZE];
};
-enum anv_bo_fence_state {
+void anv_cmd_buffer_save_state(struct anv_cmd_buffer *cmd_buffer,
+ uint32_t flags,
+ struct anv_cmd_saved_state *state);
+
+void anv_cmd_buffer_restore_state(struct anv_cmd_buffer *cmd_buffer,
+ struct anv_cmd_saved_state *state);
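+
+/* A typical meta-operation pattern with the helpers above (illustrative
+ * only; the dispatch in the middle stands in for whatever internal work is
+ * performed):
+ *
+ *    struct anv_cmd_saved_state saved;
+ *    anv_cmd_buffer_save_state(cmd_buffer,
+ *                              ANV_CMD_SAVED_STATE_COMPUTE_PIPELINE |
+ *                              ANV_CMD_SAVED_STATE_DESCRIPTOR_SET_0 |
+ *                              ANV_CMD_SAVED_STATE_PUSH_CONSTANTS,
+ *                              &saved);
+ *    ... bind internal pipeline/descriptors and dispatch ...
+ *    anv_cmd_buffer_restore_state(cmd_buffer, &saved);
+ */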
+
+enum anv_bo_sync_state {
/** Indicates that this is a new (or newly reset fence) */
- ANV_BO_FENCE_STATE_RESET,
+ ANV_BO_SYNC_STATE_RESET,
/** Indicates that this fence has been submitted to the GPU but is still
* (as far as we know) in use by the GPU.
*/
- ANV_BO_FENCE_STATE_SUBMITTED,
+ ANV_BO_SYNC_STATE_SUBMITTED,
- ANV_BO_FENCE_STATE_SIGNALED,
+ ANV_BO_SYNC_STATE_SIGNALED,
};
-struct anv_fence_impl {
- enum anv_fence_type type;
-
- union {
- /** Fence implementation for BO fences
- *
- * These fences use a BO and a set of CPU-tracked state flags. The BO
- * is added to the object list of the last execbuf call in a QueueSubmit
- * and is marked EXEC_WRITE. The state flags track when the BO has been
- * submitted to the kernel. We need to do this because Vulkan lets you
- * wait on a fence that has not yet been submitted and I915_GEM_BUSY
- * will say it's idle in this case.
- */
- struct {
- struct anv_bo *bo;
- enum anv_bo_fence_state state;
- } bo;
-
- /** DRM syncobj handle for syncobj-based fences */
- uint32_t syncobj;
+struct anv_bo_sync {
+ struct vk_sync sync;
- /** WSI fence */
- struct wsi_fence *fence_wsi;
- };
+ enum anv_bo_sync_state state;
+ struct anv_bo *bo;
};
-struct anv_fence {
- struct vk_object_base base;
-
- /* Permanent fence state. Every fence has some form of permanent state
- * (type != ANV_SEMAPHORE_TYPE_NONE). This may be a BO to fence on (for
- * cross-process fences) or it could just be a dummy for use internally.
- */
- struct anv_fence_impl permanent;
+extern const struct vk_sync_type anv_bo_sync_type;
- /* Temporary fence state. A fence *may* have temporary state. That state
- * is added to the fence by an import operation and is reset back to
- * ANV_SEMAPHORE_TYPE_NONE when the fence is reset. A fence with temporary
- * state cannot be signaled because the fence must already be signaled
- * before the temporary state can be exported from the fence in the other
- * process and imported here.
- */
- struct anv_fence_impl temporary;
-};
+static inline bool
+vk_sync_is_anv_bo_sync(const struct vk_sync *sync)
+{
+ return sync->type == &anv_bo_sync_type;
+}
-void anv_fence_reset_temporary(struct anv_device *device,
- struct anv_fence *fence);
+VkResult anv_create_sync_for_memory(struct vk_device *device,
+ VkDeviceMemory memory,
+ bool signal_memory,
+ struct vk_sync **sync_out);
struct anv_event {
struct vk_object_base base;
@@ -3342,89 +4341,6 @@ struct anv_event {
struct anv_state state;
};
-enum anv_semaphore_type {
- ANV_SEMAPHORE_TYPE_NONE = 0,
- ANV_SEMAPHORE_TYPE_DUMMY,
- ANV_SEMAPHORE_TYPE_WSI_BO,
- ANV_SEMAPHORE_TYPE_DRM_SYNCOBJ,
- ANV_SEMAPHORE_TYPE_TIMELINE,
- ANV_SEMAPHORE_TYPE_DRM_SYNCOBJ_TIMELINE,
-};
-
-struct anv_timeline_point {
- struct list_head link;
-
- uint64_t serial;
-
- /* Number of waiter on this point, when > 0 the point should not be garbage
- * collected.
- */
- int waiting;
-
- /* BO used for synchronization. */
- struct anv_bo *bo;
-};
-
-struct anv_timeline {
- pthread_mutex_t mutex;
- pthread_cond_t cond;
-
- uint64_t highest_past;
- uint64_t highest_pending;
-
- struct list_head points;
- struct list_head free_points;
-};
-
-struct anv_semaphore_impl {
- enum anv_semaphore_type type;
-
- union {
- /* A BO representing this semaphore when type == ANV_SEMAPHORE_TYPE_BO
- * or type == ANV_SEMAPHORE_TYPE_WSI_BO. This BO will be added to the
- * object list on any execbuf2 calls for which this semaphore is used as
- * a wait or signal fence. When used as a signal fence or when type ==
- * ANV_SEMAPHORE_TYPE_WSI_BO, the EXEC_OBJECT_WRITE flag will be set.
- */
- struct anv_bo *bo;
-
- /* Sync object handle when type == ANV_SEMAPHORE_TYPE_DRM_SYNCOBJ.
- * Unlike GEM BOs, DRM sync objects aren't deduplicated by the kernel on
- * import so we don't need to bother with a userspace cache.
- */
- uint32_t syncobj;
-
- /* Non shareable timeline semaphore
- *
- * Used when kernel don't have support for timeline semaphores.
- */
- struct anv_timeline timeline;
- };
-};
-
-struct anv_semaphore {
- struct vk_object_base base;
-
- /* Permanent semaphore state. Every semaphore has some form of permanent
- * state (type != ANV_SEMAPHORE_TYPE_NONE). This may be a BO to fence on
- * (for cross-process semaphores0 or it could just be a dummy for use
- * internally.
- */
- struct anv_semaphore_impl permanent;
-
- /* Temporary semaphore state. A semaphore *may* have temporary state.
- * That state is added to the semaphore by an import operation and is reset
- * back to ANV_SEMAPHORE_TYPE_NONE when the semaphore is waited on. A
- * semaphore with temporary state cannot be signaled because the semaphore
- * must already be signaled before the temporary state can be exported from
- * the semaphore in the other process and imported here.
- */
- struct anv_semaphore_impl temporary;
-};
-
-void anv_semaphore_reset_temporary(struct anv_device *device,
- struct anv_semaphore *semaphore);
-
#define ANV_STAGE_MASK ((1 << MESA_VULKAN_SHADER_STAGES) - 1)
#define anv_foreach_stage(stage, stage_bits) \
@@ -3440,24 +4356,71 @@ struct anv_pipeline_bind_map {
uint32_t surface_count;
uint32_t sampler_count;
+ uint32_t embedded_sampler_count;
+ uint16_t kernel_args_size;
+ uint16_t kernel_arg_count;
struct anv_pipeline_binding * surface_to_descriptor;
struct anv_pipeline_binding * sampler_to_descriptor;
+ struct anv_pipeline_embedded_sampler_binding* embedded_sampler_to_binding;
+ struct brw_kernel_arg_desc * kernel_args;
struct anv_push_range push_ranges[4];
};
-struct anv_shader_bin_key {
- uint32_t size;
- uint8_t data[0];
+struct anv_push_descriptor_info {
+ /* A bitfield of descriptors used. */
+ uint32_t used_descriptors;
+
+ /* A bitfield of UBO bindings fully promoted to push constants. */
+ uint32_t fully_promoted_ubo_descriptors;
+
+ /* */
+ uint8_t used_set_buffer;
};
-struct anv_shader_bin {
- uint32_t ref_cnt;
+/* A list of values we push to implement some of the dynamic states */
+enum anv_dynamic_push_bits {
+ ANV_DYNAMIC_PUSH_INPUT_VERTICES = BITFIELD_BIT(0),
+};
+struct anv_shader_upload_params {
gl_shader_stage stage;
- const struct anv_shader_bin_key *key;
+ const void *key_data;
+ uint32_t key_size;
+
+ const void *kernel_data;
+ uint32_t kernel_size;
+
+ const struct brw_stage_prog_data *prog_data;
+ uint32_t prog_data_size;
+
+ const struct brw_compile_stats *stats;
+ uint32_t num_stats;
+
+ const struct nir_xfb_info *xfb_info;
+
+ const struct anv_pipeline_bind_map *bind_map;
+
+ const struct anv_push_descriptor_info *push_desc_info;
+
+ enum anv_dynamic_push_bits dynamic_push_values;
+};
+
+struct anv_embedded_sampler {
+ uint32_t ref_cnt;
+
+ struct anv_embedded_sampler_key key;
+
+ struct anv_state sampler_state;
+ struct anv_state border_color_state;
+};
+
+struct anv_shader_bin {
+ struct vk_pipeline_cache_object base;
+
+ gl_shader_stage stage;
struct anv_state kernel;
uint32_t kernel_size;
@@ -3470,51 +4433,33 @@ struct anv_shader_bin {
struct nir_xfb_info *xfb_info;
+ struct anv_push_descriptor_info push_desc_info;
+
struct anv_pipeline_bind_map bind_map;
-};
-struct anv_shader_bin *
-anv_shader_bin_create(struct anv_device *device,
- gl_shader_stage stage,
- const void *key, uint32_t key_size,
- const void *kernel, uint32_t kernel_size,
- const struct brw_stage_prog_data *prog_data,
- uint32_t prog_data_size,
- const struct brw_compile_stats *stats, uint32_t num_stats,
- const struct nir_xfb_info *xfb_info,
- const struct anv_pipeline_bind_map *bind_map);
+ enum anv_dynamic_push_bits dynamic_push_values;
-void
-anv_shader_bin_destroy(struct anv_device *device, struct anv_shader_bin *shader);
+ /* Not saved in the pipeline cache.
+ *
+ * Array of pointers of length bind_map.embedded_sampler_count
+ */
+ struct anv_embedded_sampler **embedded_samplers;
+};
-static inline void
+static inline struct anv_shader_bin *
anv_shader_bin_ref(struct anv_shader_bin *shader)
{
- assert(shader && shader->ref_cnt >= 1);
- p_atomic_inc(&shader->ref_cnt);
+ vk_pipeline_cache_object_ref(&shader->base);
+
+ return shader;
}
static inline void
anv_shader_bin_unref(struct anv_device *device, struct anv_shader_bin *shader)
{
- assert(shader && shader->ref_cnt >= 1);
- if (p_atomic_dec_zero(&shader->ref_cnt))
- anv_shader_bin_destroy(device, shader);
+ vk_pipeline_cache_object_unref(&device->vk, &shader->base);
}
-#define anv_shader_bin_get_bsr(bin, local_arg_offset) ({ \
- assert((local_arg_offset) % 8 == 0); \
- const struct brw_bs_prog_data *prog_data = \
- brw_bs_prog_data_const(bin->prog_data); \
- assert(prog_data->simd_size == 8 || prog_data->simd_size == 16); \
- \
- (struct GFX_BINDLESS_SHADER_RECORD) { \
- .OffsetToLocalArguments = (local_arg_offset) / 8, \
- .BindlessShaderDispatchMode = prog_data->simd_size / 16, \
- .KernelStartPointer = bin->kernel.offset, \
- }; \
-})
-
struct anv_pipeline_executable {
gl_shader_stage stage;
@@ -3526,6 +4471,7 @@ struct anv_pipeline_executable {
enum anv_pipeline_type {
ANV_PIPELINE_GRAPHICS,
+ ANV_PIPELINE_GRAPHICS_LIB,
ANV_PIPELINE_COMPUTE,
ANV_PIPELINE_RAY_TRACING,
};
@@ -3543,110 +4489,249 @@ struct anv_pipeline {
enum anv_pipeline_type type;
VkPipelineCreateFlags flags;
+ VkShaderStageFlags active_stages;
+
+ uint32_t ray_queries;
+
+ /**
+ * Mask of stages that are accessing push descriptors.
+ */
+ VkShaderStageFlags use_push_descriptor;
+
+ /**
+ * Mask of stages that are accessing the push descriptors buffer.
+ */
+ VkShaderStageFlags use_push_descriptor_buffer;
+
+ /**
+ * Maximum scratch size for all shaders in this pipeline.
+ */
+ uint32_t scratch_size;
+
+ /* Layout of the sets used by the pipeline. */
+ struct anv_pipeline_sets_layout layout;
+
struct util_dynarray executables;
const struct intel_l3_config * l3_config;
};
-struct anv_graphics_pipeline {
+/* The base graphics pipeline object only hold shaders. */
+struct anv_graphics_base_pipeline {
struct anv_pipeline base;
- uint32_t batch_data[512];
+ struct vk_sample_locations_state sample_locations;
+
+ /* Shaders */
+ struct anv_shader_bin * shaders[ANV_GRAPHICS_SHADER_STAGE_COUNT];
- /* States that are part of batch_data and should be not emitted
- * dynamically.
+ /* A small hash based on shader_info::source_sha1 for identifying
+ * shaders in renderdoc/shader-db.
*/
- anv_cmd_dirty_mask_t static_state_mask;
+ uint32_t source_hashes[ANV_GRAPHICS_SHADER_STAGE_COUNT];
- /* States that need to be reemitted in cmd_buffer_flush_dynamic_state().
- * This might cover more than the dynamic states specified at pipeline
- * creation.
+ /* Feedback index in
+ * VkPipelineCreationFeedbackCreateInfo::pPipelineStageCreationFeedbacks
+ *
+ * For pipeline libraries, we need to remember the order at creation when
+ * included into a linked pipeline.
*/
- anv_cmd_dirty_mask_t dynamic_state_mask;
+ uint32_t feedback_index[ANV_GRAPHICS_SHADER_STAGE_COUNT];
- struct anv_dynamic_state dynamic_state;
+ /* Robustness flags used to compile the shaders
+ */
+ enum brw_robustness_flags robust_flags[ANV_GRAPHICS_SHADER_STAGE_COUNT];
- /* States declared dynamic at pipeline creation. */
- anv_cmd_dirty_mask_t dynamic_states;
+ /* True if at the time the fragment shader was compiled, it didn't have all
+ * the information to avoid INTEL_MSAA_FLAG_ENABLE_DYNAMIC.
+ */
+ bool fragment_dynamic;
+};
- uint32_t topology;
+/* The library graphics pipeline object has a partial graphics state and
+ * possibly some shaders. If requested, shaders are also kept in their early
+ * NIR form.
+ */
+struct anv_graphics_lib_pipeline {
+ struct anv_graphics_base_pipeline base;
- /* These fields are required with dynamic primitive topology,
- * rasterization_samples used only with gen < 8.
- */
- VkLineRasterizationModeEXT line_mode;
- VkPolygonMode polygon_mode;
- uint32_t rasterization_samples;
+ VkGraphicsPipelineLibraryFlagsEXT lib_flags;
- struct anv_subpass * subpass;
+ struct vk_graphics_pipeline_all_state all_state;
+ struct vk_graphics_pipeline_state state;
- struct anv_shader_bin * shaders[MESA_SHADER_STAGES];
+ /* Retained shaders for link optimization. */
+ struct {
+ /* This hash is the same as computed in
+ * anv_graphics_pipeline_gather_shaders().
+ */
+ unsigned char shader_sha1[20];
- VkShaderStageFlags active_stages;
+ enum gl_subgroup_size subgroup_size_type;
+
+ /* NIR captured in anv_pipeline_stage_get_nir(), includes specialization
+ * constants.
+ */
+ nir_shader * nir;
+ } retained_shaders[ANV_GRAPHICS_SHADER_STAGE_COUNT];
+
+ /* Whether the shaders have been retained */
+ bool retain_shaders;
+};
+
+struct anv_gfx_state_ptr {
+ /* Both in dwords */
+ uint16_t offset;
+ uint16_t len;
+};
+
+/* The final graphics pipeline object has all the graphics state ready to be
+ * programmed into HW packets (dynamic_state field) or fully baked in its
+ * batch.
+ */
+struct anv_graphics_pipeline {
+ struct anv_graphics_base_pipeline base;
+
+ struct vk_vertex_input_state vertex_input;
+ struct vk_sample_locations_state sample_locations;
+ struct vk_dynamic_graphics_state dynamic_state;
+
+ /* If true, the patch control points are passed through push constants
+ * (anv_push_constants::gfx::tcs_input_vertices)
+ */
+ bool dynamic_patch_control_points;
+
+ uint32_t view_mask;
+ uint32_t instance_multiplier;
+
+ bool rp_has_ds_self_dep;
- bool writes_depth;
- bool depth_test_enable;
- bool writes_stencil;
- bool stencil_test_enable;
- bool depth_clamp_enable;
- bool depth_clip_enable;
- bool sample_shading_enable;
bool kill_pixel;
- bool depth_bounds_test_enable;
bool force_fragment_thread_dispatch;
+ bool uses_xfb;
+ bool sample_shading_enable;
+ float min_sample_shading;
- /* When primitive replication is used, subpass->view_mask will describe what
- * views to replicate.
- */
- bool use_primitive_replication;
+ /* Number of VERTEX_ELEMENT_STATE input elements used by the shader */
+ uint32_t vs_input_elements;
- struct anv_state blend_state;
+ /* Number of VERTEX_ELEMENT_STATE elements we need to implement some of the
+ * draw parameters
+ */
+ uint32_t svgs_count;
- struct anv_state cps_state;
+ /* Precomputed VERTEX_ELEMENT_STATE structures for the vertex input that
+ * can be copied into the anv_cmd_buffer behind a 3DSTATE_VERTEX_BUFFER.
+ *
+ * When MESA_VK_DYNAMIC_VI is not dynamic
+ *
+ * vertex_input_elems = vs_input_elements + svgs_count
+ *
+ * All the VERTEX_ELEMENT_STATE can be directly copied behind a
+ * 3DSTATE_VERTEX_ELEMENTS instruction in the command buffer. Otherwise
+ * this array only holds the svgs_count elements.
+ */
+ uint32_t vertex_input_elems;
+ uint32_t vertex_input_data[2 * 31 /* MAX_VES + 2 internal */];
- uint32_t vb_used;
- struct anv_pipeline_vertex_binding {
- uint32_t stride;
- bool instanced;
- uint32_t instance_divisor;
- } vb[MAX_VBS];
+ /* Precomputed CS instructions that can be directly copied into the
+ * anv_cmd_buffer.
+ */
+ uint32_t batch_data[416];
- struct {
- uint32_t sf[7];
- uint32_t depth_stencil_state[3];
- uint32_t clip[4];
- uint32_t xfb_bo_pitch[4];
- uint32_t wm[3];
- uint32_t blend_state[MAX_RTS * 2];
- uint32_t streamout_state[3];
- } gfx7;
+ /* URB setup used by this pipeline. */
+ struct intel_urb_config urb_cfg;
+ /* Fully baked instructions, ready to be emitted into the anv_cmd_buffer */
struct {
- uint32_t sf[4];
- uint32_t raster[5];
- uint32_t wm_depth_stencil[3];
- uint32_t wm[2];
- uint32_t ps_blend[2];
- uint32_t blend_state[1 + MAX_RTS * 2];
- uint32_t streamout_state[5];
- } gfx8;
-
+ struct anv_gfx_state_ptr urb;
+ struct anv_gfx_state_ptr vf_statistics;
+ struct anv_gfx_state_ptr vf_sgvs;
+ struct anv_gfx_state_ptr vf_sgvs_2;
+ struct anv_gfx_state_ptr vf_sgvs_instancing;
+ struct anv_gfx_state_ptr vf_instancing;
+ struct anv_gfx_state_ptr primitive_replication;
+ struct anv_gfx_state_ptr sbe;
+ struct anv_gfx_state_ptr sbe_swiz;
+ struct anv_gfx_state_ptr so_decl_list;
+ struct anv_gfx_state_ptr vs;
+ struct anv_gfx_state_ptr hs;
+ struct anv_gfx_state_ptr ds;
+
+ struct anv_gfx_state_ptr task_control;
+ struct anv_gfx_state_ptr task_shader;
+ struct anv_gfx_state_ptr task_redistrib;
+ struct anv_gfx_state_ptr clip_mesh;
+ struct anv_gfx_state_ptr mesh_control;
+ struct anv_gfx_state_ptr mesh_shader;
+ struct anv_gfx_state_ptr mesh_distrib;
+ struct anv_gfx_state_ptr sbe_mesh;
+ } final;
+
+ /* Pre-packed CS instructions & structures that need to be merged later
+ * with dynamic state.
+ */
struct {
- uint32_t wm_depth_stencil[4];
- } gfx9;
+ struct anv_gfx_state_ptr clip;
+ struct anv_gfx_state_ptr sf;
+ struct anv_gfx_state_ptr raster;
+ struct anv_gfx_state_ptr ms;
+ struct anv_gfx_state_ptr ps_extra;
+ struct anv_gfx_state_ptr wm;
+ struct anv_gfx_state_ptr so;
+ struct anv_gfx_state_ptr gs;
+ struct anv_gfx_state_ptr te;
+ struct anv_gfx_state_ptr ps;
+ struct anv_gfx_state_ptr vfg;
+ } partial;
};
+#define anv_batch_merge_pipeline_state(batch, dwords0, pipeline, state) \
+ do { \
+ uint32_t *dw; \
+ \
+ assert(ARRAY_SIZE(dwords0) == (pipeline)->state.len); \
+ dw = anv_batch_emit_dwords((batch), ARRAY_SIZE(dwords0)); \
+ if (!dw) \
+ break; \
+ for (uint32_t i = 0; i < ARRAY_SIZE(dwords0); i++) \
+ dw[i] = (dwords0)[i] | \
+ (pipeline)->batch_data[(pipeline)->state.offset + i]; \
+ VG(VALGRIND_CHECK_MEM_IS_DEFINED(dw, ARRAY_SIZE(dwords0) * 4)); \
+ } while (0)
+
+#define anv_batch_emit_pipeline_state(batch, pipeline, state) \
+ do { \
+ if ((pipeline)->state.len == 0) \
+ break; \
+ uint32_t *dw; \
+ dw = anv_batch_emit_dwords((batch), (pipeline)->state.len); \
+ if (!dw) \
+ break; \
+ memcpy(dw, &(pipeline)->batch_data[(pipeline)->state.offset], \
+ 4 * (pipeline)->state.len); \
+ } while (0)
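+
+/* A rough usage sketch for the two macros above (the packet, field and
+ * variable names below are illustrative only):
+ *
+ * Fully baked state is copied straight out of the pipeline batch data:
+ *
+ *    anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline, final.vs);
+ *
+ * Partial state is packed with its dynamic fields first and then OR'ed with
+ * the pre-packed dwords:
+ *
+ *    uint32_t dwords[GENX(3DSTATE_SF_length)];
+ *    struct GENX(3DSTATE_SF) sf = {
+ *       GENX(3DSTATE_SF_header),
+ *       .LineWidth = line_width,
+ *    };
+ *    GENX(3DSTATE_SF_pack)(NULL, dwords, &sf);
+ *    anv_batch_merge_pipeline_state(&cmd_buffer->batch, dwords,
+ *                                   pipeline, partial.sf);
+ */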
+
+
struct anv_compute_pipeline {
struct anv_pipeline base;
struct anv_shader_bin * cs;
uint32_t batch_data[9];
uint32_t interface_descriptor_data[8];
+
+ /* A small hash based on shader_info::source_sha1 for identifying shaders
+ * in renderdoc/shader-db.
+ */
+ uint32_t source_hash;
};
struct anv_rt_shader_group {
VkRayTracingShaderGroupTypeKHR type;
+ /* Whether this group was imported from another pipeline */
+ bool imported;
+
struct anv_shader_bin *general;
struct anv_shader_bin *closest_hit;
struct anv_shader_bin *any_hit;
@@ -3681,6 +4766,8 @@ struct anv_ray_tracing_pipeline {
}
ANV_DECL_PIPELINE_DOWNCAST(graphics, ANV_PIPELINE_GRAPHICS)
+ANV_DECL_PIPELINE_DOWNCAST(graphics_base, ANV_PIPELINE_GRAPHICS)
+ANV_DECL_PIPELINE_DOWNCAST(graphics_lib, ANV_PIPELINE_GRAPHICS_LIB)
ANV_DECL_PIPELINE_DOWNCAST(compute, ANV_PIPELINE_COMPUTE)
ANV_DECL_PIPELINE_DOWNCAST(ray_tracing, ANV_PIPELINE_RAY_TRACING)
@@ -3688,7 +4775,59 @@ static inline bool
anv_pipeline_has_stage(const struct anv_graphics_pipeline *pipeline,
gl_shader_stage stage)
{
- return (pipeline->active_stages & mesa_to_vk_shader_stage(stage)) != 0;
+ return (pipeline->base.base.active_stages & mesa_to_vk_shader_stage(stage)) != 0;
+}
+
+static inline bool
+anv_pipeline_base_has_stage(const struct anv_graphics_base_pipeline *pipeline,
+ gl_shader_stage stage)
+{
+ return (pipeline->base.active_stages & mesa_to_vk_shader_stage(stage)) != 0;
+}
+
+static inline bool
+anv_pipeline_is_primitive(const struct anv_graphics_pipeline *pipeline)
+{
+ return anv_pipeline_has_stage(pipeline, MESA_SHADER_VERTEX);
+}
+
+static inline bool
+anv_pipeline_is_mesh(const struct anv_graphics_pipeline *pipeline)
+{
+ return anv_pipeline_has_stage(pipeline, MESA_SHADER_MESH);
+}
+
+static inline bool
+anv_cmd_buffer_all_color_write_masked(const struct anv_cmd_buffer *cmd_buffer)
+{
+ const struct anv_cmd_graphics_state *state = &cmd_buffer->state.gfx;
+ const struct vk_dynamic_graphics_state *dyn =
+ &cmd_buffer->vk.dynamic_graphics_state;
+ uint8_t color_writes = dyn->cb.color_write_enables;
+
+ /* All writes disabled through vkCmdSetColorWriteEnableEXT */
+ if ((color_writes & ((1u << state->color_att_count) - 1)) == 0)
+ return true;
+
+ /* Or all write masks are empty */
+ for (uint32_t i = 0; i < state->color_att_count; i++) {
+ if (dyn->cb.attachments[i].write_mask != 0)
+ return false;
+ }
+
+ return true;
+}
+
+static inline void
+anv_cmd_graphic_state_update_has_uint_rt(struct anv_cmd_graphics_state *state)
+{
+ state->has_uint_rt = false;
+ for (unsigned a = 0; a < state->color_att_count; a++) {
+ if (vk_format_is_int(state->color_att[a].vk_format)) {
+ state->has_uint_rt = true;
+ break;
+ }
+ }
}
#define ANV_DECL_GET_GRAPHICS_PROG_DATA_FUNC(prefix, stage) \
@@ -3697,7 +4836,7 @@ get_##prefix##_prog_data(const struct anv_graphics_pipeline *pipeline) \
{ \
if (anv_pipeline_has_stage(pipeline, stage)) { \
return (const struct brw_##prefix##_prog_data *) \
- pipeline->shaders[stage]->prog_data; \
+ pipeline->base.shaders[stage]->prog_data; \
} else { \
return NULL; \
} \
@@ -3708,6 +4847,8 @@ ANV_DECL_GET_GRAPHICS_PROG_DATA_FUNC(tcs, MESA_SHADER_TESS_CTRL)
ANV_DECL_GET_GRAPHICS_PROG_DATA_FUNC(tes, MESA_SHADER_TESS_EVAL)
ANV_DECL_GET_GRAPHICS_PROG_DATA_FUNC(gs, MESA_SHADER_GEOMETRY)
ANV_DECL_GET_GRAPHICS_PROG_DATA_FUNC(wm, MESA_SHADER_FRAGMENT)
+ANV_DECL_GET_GRAPHICS_PROG_DATA_FUNC(mesh, MESA_SHADER_MESH)
+ANV_DECL_GET_GRAPHICS_PROG_DATA_FUNC(task, MESA_SHADER_TASK)
static inline const struct brw_cs_prog_data *
get_cs_prog_data(const struct anv_compute_pipeline *pipeline)
@@ -3733,62 +4874,38 @@ anv_device_init_rt_shaders(struct anv_device *device);
void
anv_device_finish_rt_shaders(struct anv_device *device);
-VkResult
-anv_pipeline_init(struct anv_pipeline *pipeline,
- struct anv_device *device,
- enum anv_pipeline_type type,
- VkPipelineCreateFlags flags,
- const VkAllocationCallbacks *pAllocator);
-
-void
-anv_pipeline_finish(struct anv_pipeline *pipeline,
- struct anv_device *device,
- const VkAllocationCallbacks *pAllocator);
+struct anv_kernel_arg {
+ bool is_ptr;
+ uint16_t size;
-VkResult
-anv_graphics_pipeline_init(struct anv_graphics_pipeline *pipeline, struct anv_device *device,
- struct anv_pipeline_cache *cache,
- const VkGraphicsPipelineCreateInfo *pCreateInfo,
- const VkAllocationCallbacks *alloc);
-
-VkResult
-anv_pipeline_compile_cs(struct anv_compute_pipeline *pipeline,
- struct anv_pipeline_cache *cache,
- const VkComputePipelineCreateInfo *info,
- const struct vk_shader_module *module,
- const char *entrypoint,
- const VkSpecializationInfo *spec_info);
+ union {
+ uint64_t u64;
+ void *ptr;
+ };
+};
-VkResult
-anv_ray_tracing_pipeline_init(struct anv_ray_tracing_pipeline *pipeline,
- struct anv_device *device,
- struct anv_pipeline_cache *cache,
- const VkRayTracingPipelineCreateInfoKHR *pCreateInfo,
- const VkAllocationCallbacks *alloc);
+struct anv_kernel {
+#ifndef NDEBUG
+ const char *name;
+#endif
+ struct anv_shader_bin *bin;
+ const struct intel_l3_config *l3_config;
+};
struct anv_format_plane {
enum isl_format isl_format:16;
struct isl_swizzle swizzle;
- /* Whether this plane contains chroma channels */
- bool has_chroma;
-
- /* For downscaling of YUV planes */
- uint8_t denominator_scales[2];
-
- /* How to map sampled ycbcr planes to a single 4 component element. */
- struct isl_swizzle ycbcr_swizzle;
-
/* What aspect is associated to this plane */
VkImageAspectFlags aspect;
};
-
struct anv_format {
struct anv_format_plane planes[3];
VkFormat vk_format;
uint8_t n_planes;
bool can_ycbcr;
+ bool can_video;
};
static inline void
@@ -3865,15 +4982,41 @@ anv_get_isl_format(const struct intel_device_info *devinfo, VkFormat vk_format,
return anv_get_format_aspect(devinfo, vk_format, aspect, tiling).isl_format;
}
+bool anv_format_supports_ccs_e(const struct intel_device_info *devinfo,
+ const enum isl_format format);
+
bool anv_formats_ccs_e_compatible(const struct intel_device_info *devinfo,
VkImageCreateFlags create_flags,
- VkFormat vk_format,
- VkImageTiling vk_tiling,
- const VkImageFormatListCreateInfoKHR *fmt_list);
+ VkFormat vk_format, VkImageTiling vk_tiling,
+ VkImageUsageFlags vk_usage,
+ const VkImageFormatListCreateInfo *fmt_list);
extern VkFormat
vk_format_from_android(unsigned android_format, unsigned android_usage);
+static inline VkFormat
+anv_get_emulation_format(const struct anv_physical_device *pdevice, VkFormat format)
+{
+ if (pdevice->flush_astc_ldr_void_extent_denorms) {
+ const struct util_format_description *desc =
+ vk_format_description(format);
+ if (desc->layout == UTIL_FORMAT_LAYOUT_ASTC &&
+ desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB)
+ return format;
+ }
+
+ if (pdevice->emu_astc_ldr)
+ return vk_texcompress_astc_emulation_format(format);
+
+ return VK_FORMAT_UNDEFINED;
+}
+
+static inline bool
+anv_is_format_emulated(const struct anv_physical_device *pdevice, VkFormat format)
+{
+ return anv_get_emulation_format(pdevice, format) != VK_FORMAT_UNDEFINED;
+}
+
static inline struct isl_swizzle
anv_swizzle_for_render(struct isl_swizzle swizzle)
{
@@ -3932,14 +5075,14 @@ struct anv_image_memory_range {
ANV_IMAGE_MEMORY_BINDING_END,
} binding;
+ uint32_t alignment;
+ uint64_t size;
+
/**
* Offset is relative to the start of the binding created by
* vkBindImageMemory, not to the start of the bo.
*/
uint64_t offset;
-
- uint64_t size;
- uint32_t alignment;
};
/**
@@ -3968,6 +5111,11 @@ struct anv_image {
bool disjoint;
/**
+ * Image is a WSI image
+ */
+ bool from_wsi;
+
+ /**
* Image was imported from an struct AHardwareBuffer. We have to delay
* final image creation until bind time.
*/
@@ -3980,6 +5128,12 @@ struct anv_image {
bool from_gralloc;
/**
+ * If not UNDEFINED, image has a hidden plane at planes[n_planes] for ASTC
+ * LDR workaround or emulation.
+ */
+ VkFormat emu_plane_format;
+
+ /**
* The memory bindings created by vkCreateImage and vkBindImageMemory.
*
* For details on the image's memory layout, see check_memory_bindings().
@@ -3996,6 +5150,7 @@ struct anv_image {
struct anv_image_binding {
struct anv_image_memory_range memory_range;
struct anv_address address;
+ struct anv_sparse_binding_data sparse_data;
} bindings[ANV_IMAGE_MEMORY_BINDING_END];
/**
@@ -4015,13 +5170,6 @@ struct anv_image {
struct anv_surface primary_surface;
/**
- * A surface which shadows the main surface and may have different
- * tiling. This is used for sampling using a tiling that isn't supported
- * for other operations.
- */
- struct anv_surface shadow_surface;
-
- /**
* The base aux usage for this image. For color images, this can be
* either CCS_E or CCS_D depending on whether or not we can reliably
* leave CCS on all the time.
@@ -4030,11 +5178,77 @@ struct anv_image {
struct anv_surface aux_surface;
+ /** Location of the compression control surface. */
+ struct anv_image_memory_range compr_ctrl_memory_range;
+
/** Location of the fast clear state. */
struct anv_image_memory_range fast_clear_memory_range;
+
+ /**
+ * Whether this image can be fast cleared with non-zero clear colors.
+ * This can happen with mutable images when formats with different bit
+ * sizes per component are used.
+ *
+ * On Gfx9+, because the clear color is stored as four 32-bit components,
+ * we can clear in R16G16_UNORM (storing two 16-bit values in components
+ * 0 & 1 of the clear color) and then draw in R32_UINT, which would
+ * interpret the clear color as a single component value, using only the
+ * first 16-bit component of the previously written clear color.
+ *
+ * On Gfx7/7.5/8, only CC_ZERO/CC_ONE clear colors are supported; this
+ * boolean will prevent the usage of CC_ONE.
+ */
+ bool can_non_zero_fast_clear;
+
+ struct {
+ /** Whether the image has CCS data mapped through AUX-TT. */
+ bool mapped;
+
+ /** Main address of the mapping. */
+ uint64_t addr;
+
+ /** Size of the mapping. */
+ uint64_t size;
+ } aux_tt;
} planes[3];
+
+ struct anv_image_memory_range vid_dmv_top_surface;
+
+ /* Link in the anv_device.image_private_objects list */
+ struct list_head link;
};
+static inline bool
+anv_image_is_sparse(const struct anv_image *image)
+{
+ return image->vk.create_flags & VK_IMAGE_CREATE_SPARSE_BINDING_BIT;
+}
+
+static inline bool
+anv_image_is_externally_shared(const struct anv_image *image)
+{
+ return image->vk.drm_format_mod != DRM_FORMAT_MOD_INVALID ||
+ image->vk.external_handle_types != 0;
+}
+
+static inline bool
+anv_image_has_private_binding(const struct anv_image *image)
+{
+ const struct anv_image_binding private_binding =
+ image->bindings[ANV_IMAGE_MEMORY_BINDING_PRIVATE];
+ return private_binding.memory_range.size != 0;
+}
+
+static inline bool
+anv_image_format_is_d16_or_s8(const struct anv_image *image)
+{
+ return image->vk.format == VK_FORMAT_D16_UNORM ||
+ image->vk.format == VK_FORMAT_D16_UNORM_S8_UINT ||
+ image->vk.format == VK_FORMAT_D24_UNORM_S8_UINT ||
+ image->vk.format == VK_FORMAT_D32_SFLOAT_S8_UINT ||
+ image->vk.format == VK_FORMAT_S8_UINT;
+}
+
/* The ordering of this enum is important */
enum anv_fast_clear_type {
/** Image does not have/support any fast-clear blocks */
@@ -4127,9 +5341,16 @@ anv_image_get_fast_clear_type_addr(const struct anv_device *device,
struct anv_address addr =
anv_image_get_clear_color_addr(device, image, aspect);
- const unsigned clear_color_state_size = device->info.ver >= 10 ?
- device->isl_dev.ss.clear_color_state_size :
- device->isl_dev.ss.clear_value_size;
+ unsigned clear_color_state_size;
+ if (device->info->ver >= 11) {
+ /* The fast clear type and the first compression state are stored in the
+ * last 2 dwords of the clear color struct. Refer to the comment in
+ * add_aux_state_tracking_buffer().
+ */
+ assert(device->isl_dev.ss.clear_color_state_size >= 32);
+ clear_color_state_size = device->isl_dev.ss.clear_color_state_size - 8;
+ } else
+ clear_color_state_size = device->isl_dev.ss.clear_value_size;
return anv_address_add(addr, clear_color_state_size);
}
@@ -4142,16 +5363,16 @@ anv_image_get_compression_state_addr(const struct anv_device *device,
assert(level < anv_image_aux_levels(image, aspect));
assert(array_layer < anv_image_aux_layers(image, aspect, level));
UNUSED uint32_t plane = anv_image_aspect_to_plane(image, aspect);
- assert(image->planes[plane].aux_usage == ISL_AUX_USAGE_CCS_E);
+ assert(isl_aux_usage_has_ccs_e(image->planes[plane].aux_usage));
- /* Relative to start of the plane's fast clear memory range */
+ /* Relative to start of the plane's fast clear type */
uint32_t offset;
offset = 4; /* Go past the fast clear type */
if (image->vk.image_type == VK_IMAGE_TYPE_3D) {
for (uint32_t l = 0; l < level; l++)
- offset += anv_minify(image->vk.extent.depth, l) * 4;
+ offset += u_minify(image->vk.extent.depth, l) * 4;
} else {
offset += level * image->vk.array_layers * 4;
}
@@ -4165,6 +5386,16 @@ anv_image_get_compression_state_addr(const struct anv_device *device,
offset);
}
+static inline const struct anv_image_memory_range *
+anv_image_get_aux_memory_range(const struct anv_image *image,
+ uint32_t plane)
+{
+ if (image->planes[plane].aux_surface.memory_range.size > 0)
+ return &image->planes[plane].aux_surface.memory_range;
+ else
+ return &image->planes[plane].compr_ctrl_memory_range;
+}
+
/* Returns true if a HiZ-enabled depth buffer can be sampled from. */
static inline bool
anv_can_sample_with_hiz(const struct intel_device_info * const devinfo,
@@ -4183,13 +5414,7 @@ anv_can_sample_with_hiz(const struct intel_device_info * const devinfo,
if (image->vk.image_type == VK_IMAGE_TYPE_3D)
return false;
- /* Allow this feature on BDW even though it is disabled in the BDW devinfo
- * struct. There's documentation which suggests that this feature actually
- * reduces performance on BDW, but it has only been observed to help so
- * far. Sampling fast-cleared blocks on BDW must also be handled with care
- * (see depth_stencil_attachment_compute_aux_usage() for more info).
- */
- if (devinfo->ver != 8 && !devinfo->has_sample_with_hiz)
+ if (!devinfo->has_sample_with_hiz)
return false;
return image->vk.samples == 1;
@@ -4212,7 +5437,7 @@ anv_can_sample_mcs_with_clear(const struct intel_device_info * const devinfo,
* See HSD 1707282275, wa_14013111325. Due to the use of
* format-reinterpretation, a simplified workaround is implemented.
*/
- if (devinfo->ver >= 12 &&
+ if (intel_needs_workaround(devinfo, 14013111325) &&
isl_format_get_layout(anv_surf->isl.format)->bpb <= 16) {
return false;
}
@@ -4225,10 +5450,50 @@ anv_image_plane_uses_aux_map(const struct anv_device *device,
const struct anv_image *image,
uint32_t plane)
{
- return device->info.has_aux_map &&
+ return device->info->has_aux_map &&
isl_aux_usage_has_ccs(image->planes[plane].aux_usage);
}
+static inline bool
+anv_image_uses_aux_map(const struct anv_device *device,
+ const struct anv_image *image)
+{
+ for (uint32_t p = 0; p < image->n_planes; ++p) {
+ if (anv_image_plane_uses_aux_map(device, image, p))
+ return true;
+ }
+
+ return false;
+}
+
+static inline bool
+anv_bo_allows_aux_map(const struct anv_device *device,
+ const struct anv_bo *bo)
+{
+ if (device->aux_map_ctx == NULL)
+ return false;
+
+ return (bo->alloc_flags & ANV_BO_ALLOC_AUX_TT_ALIGNED) != 0;
+}
+
+static inline bool
+anv_address_allows_aux_map(const struct anv_device *device,
+ struct anv_address addr)
+{
+ if (device->aux_map_ctx == NULL)
+ return false;
+
+ /* Technically, we only care about the offset at which the image is bound
+ * within the BO, but we don't have that information here. As a heuristic,
+ * rely on the BO offset instead.
+ */
+ if (anv_address_physical(addr) %
+ intel_aux_map_get_alignment(device->aux_map_ctx) != 0)
+ return false;
+
+ return true;
+}
+
void
anv_cmd_buffer_mark_image_written(struct anv_cmd_buffer *cmd_buffer,
const struct anv_image *image,
@@ -4239,6 +5504,21 @@ anv_cmd_buffer_mark_image_written(struct anv_cmd_buffer *cmd_buffer,
uint32_t layer_count);
void
+anv_cmd_buffer_mark_image_fast_cleared(struct anv_cmd_buffer *cmd_buffer,
+ const struct anv_image *image,
+ const enum isl_format format,
+ union isl_color_value clear_color);
+
+void
+anv_cmd_buffer_load_clear_color_from_image(struct anv_cmd_buffer *cmd_buffer,
+ struct anv_state state,
+ const struct anv_image *image);
+
+struct anv_image_binding *
+anv_image_aspect_to_binding(struct anv_image *image,
+ VkImageAspectFlags aspect);
+
+void
anv_image_clear_color(struct anv_cmd_buffer *cmd_buffer,
const struct anv_image *image,
VkImageAspectFlagBits aspect,
@@ -4256,19 +5536,10 @@ anv_image_clear_depth_stencil(struct anv_cmd_buffer *cmd_buffer,
VkRect2D area,
float depth_value, uint8_t stencil_value);
void
-anv_image_msaa_resolve(struct anv_cmd_buffer *cmd_buffer,
- const struct anv_image *src_image,
- enum isl_aux_usage src_aux_usage,
- uint32_t src_level, uint32_t src_base_layer,
- const struct anv_image *dst_image,
- enum isl_aux_usage dst_aux_usage,
- uint32_t dst_level, uint32_t dst_base_layer,
- VkImageAspectFlagBits aspect,
- uint32_t src_x, uint32_t src_y,
- uint32_t dst_x, uint32_t dst_y,
- uint32_t width, uint32_t height,
- uint32_t layer_count,
- enum blorp_filter filter);
+anv_attachment_msaa_resolve(struct anv_cmd_buffer *cmd_buffer,
+ const struct anv_attachment *att,
+ VkImageLayout layout,
+ VkImageAspectFlagBits aspect);
void
anv_image_hiz_op(struct anv_cmd_buffer *cmd_buffer,
const struct anv_image *image,
@@ -4299,31 +5570,69 @@ anv_image_ccs_op(struct anv_cmd_buffer *cmd_buffer,
enum isl_aux_op ccs_op, union isl_color_value *clear_value,
bool predicate);
+isl_surf_usage_flags_t
+anv_image_choose_isl_surf_usage(struct anv_physical_device *device,
+ VkImageCreateFlags vk_create_flags,
+ VkImageUsageFlags vk_usage,
+ isl_surf_usage_flags_t isl_extra_usage,
+ VkImageAspectFlagBits aspect,
+ VkImageCompressionFlagsEXT comp_flags);
+
void
-anv_image_copy_to_shadow(struct anv_cmd_buffer *cmd_buffer,
- const struct anv_image *image,
- VkImageAspectFlagBits aspect,
- uint32_t base_level, uint32_t level_count,
- uint32_t base_layer, uint32_t layer_count);
+anv_cmd_buffer_fill_area(struct anv_cmd_buffer *cmd_buffer,
+ struct anv_address address,
+ VkDeviceSize size,
+ uint32_t data);
+
+VkResult
+anv_cmd_buffer_ensure_rcs_companion(struct anv_cmd_buffer *cmd_buffer);
+
+bool
+anv_can_hiz_clear_ds_view(struct anv_device *device,
+ const struct anv_image_view *iview,
+ VkImageLayout layout,
+ VkImageAspectFlags clear_aspects,
+ float depth_clear_value,
+ VkRect2D render_area,
+ const VkQueueFlagBits queue_flags);
+
+bool
+anv_can_fast_clear_color_view(struct anv_device *device,
+ struct anv_image_view *iview,
+ VkImageLayout layout,
+ union isl_color_value clear_color,
+ uint32_t num_layers,
+ VkRect2D render_area,
+ const VkQueueFlagBits queue_flags);
enum isl_aux_state ATTRIBUTE_PURE
anv_layout_to_aux_state(const struct intel_device_info * const devinfo,
const struct anv_image *image,
const VkImageAspectFlagBits aspect,
- const VkImageLayout layout);
+ const VkImageLayout layout,
+ const VkQueueFlagBits queue_flags);
enum isl_aux_usage ATTRIBUTE_PURE
anv_layout_to_aux_usage(const struct intel_device_info * const devinfo,
const struct anv_image *image,
const VkImageAspectFlagBits aspect,
const VkImageUsageFlagBits usage,
- const VkImageLayout layout);
+ const VkImageLayout layout,
+ const VkQueueFlagBits queue_flags);
enum anv_fast_clear_type ATTRIBUTE_PURE
anv_layout_to_fast_clear_type(const struct intel_device_info * const devinfo,
const struct anv_image * const image,
const VkImageAspectFlagBits aspect,
- const VkImageLayout layout);
+ const VkImageLayout layout,
+ const VkQueueFlagBits queue_flags);
+
+bool ATTRIBUTE_PURE
+anv_layout_has_untracked_aux_writes(const struct intel_device_info * const devinfo,
+ const struct anv_image * const image,
+ const VkImageAspectFlagBits aspect,
+ const VkImageLayout layout,
+ const VkQueueFlagBits queue_flags);
static inline bool
anv_image_aspects_compatible(VkImageAspectFlags aspects1,
@@ -4347,39 +5656,44 @@ struct anv_image_view {
const struct anv_image *image; /**< VkImageViewCreateInfo::image */
unsigned n_planes;
- struct {
- uint32_t image_plane;
+ /**
+ * True if the surface states (if any) are owned by some anv_state_stream
+ * from internal_surface_state_pool.
+ */
+ bool use_surface_state_stream;
+
+ struct {
struct isl_view isl;
/**
+ * A version of the image view for storage usage (can apply 3D image
+ * slicing).
+ */
+ struct isl_view isl_storage;
+
+ /**
* RENDER_SURFACE_STATE when using image as a sampler surface with an
* image layout of SHADER_READ_ONLY_OPTIMAL or
* DEPTH_STENCIL_READ_ONLY_OPTIMAL.
*/
- struct anv_surface_state optimal_sampler_surface_state;
+ struct anv_surface_state optimal_sampler;
/**
* RENDER_SURFACE_STATE when using image as a sampler surface with an
* image layout of GENERAL.
*/
- struct anv_surface_state general_sampler_surface_state;
+ struct anv_surface_state general_sampler;
/**
- * RENDER_SURFACE_STATE when using image as a storage image. Separate
- * states for write-only and readable, using the real format for
- * write-only and the lowered format for readable.
+ * RENDER_SURFACE_STATE when using image as a storage image.
*/
- struct anv_surface_state storage_surface_state;
- struct anv_surface_state writeonly_storage_surface_state;
-
- struct brw_image_param storage_image_param;
+ struct anv_surface_state storage;
} planes[3];
};
enum anv_image_view_state_flags {
- ANV_IMAGE_VIEW_STATE_STORAGE_WRITE_ONLY = (1 << 0),
- ANV_IMAGE_VIEW_STATE_TEXTURE_OPTIMAL = (1 << 1),
+ ANV_IMAGE_VIEW_STATE_TEXTURE_OPTIMAL = (1 << 0),
};
void anv_image_fill_surface_state(struct anv_device *device,
@@ -4390,8 +5704,41 @@ void anv_image_fill_surface_state(struct anv_device *device,
enum isl_aux_usage aux_usage,
const union isl_color_value *clear_color,
enum anv_image_view_state_flags flags,
- struct anv_surface_state *state_inout,
- struct brw_image_param *image_param_out);
+ struct anv_surface_state *state_inout);
+
+
+static inline const struct anv_surface_state *
+anv_image_view_texture_surface_state(const struct anv_image_view *iview,
+ uint32_t plane, VkImageLayout layout)
+{
+ return layout == VK_IMAGE_LAYOUT_GENERAL ?
+ &iview->planes[plane].general_sampler :
+ &iview->planes[plane].optimal_sampler;
+}
+
+static inline const struct anv_surface_state *
+anv_image_view_storage_surface_state(const struct anv_image_view *iview)
+{
+ return &iview->planes[0].storage;
+}
+
+static inline bool
+anv_cmd_graphics_state_has_image_as_attachment(const struct anv_cmd_graphics_state *state,
+ const struct anv_image *image)
+{
+ for (unsigned a = 0; a < state->color_att_count; a++) {
+ if (state->color_att[a].iview &&
+ state->color_att[a].iview->image == image)
+ return true;
+ }
+
+ if (state->depth_att.iview && state->depth_att.iview->image == image)
+ return true;
+ if (state->stencil_att.iview && state->stencil_att.iview->image == image)
+ return true;
+
+ return false;
+}
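/* A minimal sketch (editor's illustration, assuming the cmd_buffer->state.gfx
 * field and a GENERAL-layout fallback) of how the helpers above could be
 * combined when choosing a sampler surface state for an image that may also
 * be bound as an attachment; this is not the driver's actual feedback-loop
 * handling.
 */
static inline const struct anv_surface_state *
example_pick_sampler_surface_state(const struct anv_cmd_buffer *cmd_buffer,
                                   const struct anv_image_view *iview,
                                   uint32_t plane, VkImageLayout layout)
{
   /* If the image is simultaneously used as a color or depth/stencil
    * attachment, fall back to the GENERAL-layout surface state in this
    * sketch.
    */
   if (anv_cmd_graphics_state_has_image_as_attachment(&cmd_buffer->state.gfx,
                                                      iview->image))
      layout = VK_IMAGE_LAYOUT_GENERAL;

   return anv_image_view_texture_surface_state(iview, plane, layout);
}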
struct anv_image_create_info {
const VkImageCreateInfo *vk_info;
@@ -4401,112 +5748,104 @@ struct anv_image_create_info {
/** These flags will be added to any derived from VkImageCreateInfo. */
isl_surf_usage_flags_t isl_extra_usage_flags;
+
+ /** An opt-in stride in pixels; should be 0 for implicit layouts */
+ uint32_t stride;
+
+ /** Whether to skip allocating the private binding */
+ bool no_private_binding_alloc;
};
-VkResult anv_image_create(VkDevice _device,
- const struct anv_image_create_info *info,
- const VkAllocationCallbacks* alloc,
- VkImage *pImage);
+VkResult anv_image_init(struct anv_device *device, struct anv_image *image,
+ const struct anv_image_create_info *create_info);
+
+void anv_image_finish(struct anv_image *image);
+
+void anv_image_get_memory_requirements(struct anv_device *device,
+ struct anv_image *image,
+ VkImageAspectFlags aspects,
+ VkMemoryRequirements2 *pMemoryRequirements);
+
+void anv_image_view_init(struct anv_device *device,
+ struct anv_image_view *iview,
+ const VkImageViewCreateInfo *pCreateInfo,
+ struct anv_state_stream *state_stream);
+
+void anv_image_view_finish(struct anv_image_view *iview);
enum isl_format
anv_isl_format_for_descriptor_type(const struct anv_device *device,
VkDescriptorType type);
-static inline VkExtent3D
-anv_sanitize_image_extent(const VkImageType imageType,
- const VkExtent3D imageExtent)
-{
- switch (imageType) {
- case VK_IMAGE_TYPE_1D:
- return (VkExtent3D) { imageExtent.width, 1, 1 };
- case VK_IMAGE_TYPE_2D:
- return (VkExtent3D) { imageExtent.width, imageExtent.height, 1 };
- case VK_IMAGE_TYPE_3D:
- return imageExtent;
- default:
- unreachable("invalid image type");
- }
-}
-
-static inline VkOffset3D
-anv_sanitize_image_offset(const VkImageType imageType,
- const VkOffset3D imageOffset)
+static inline isl_surf_usage_flags_t
+anv_isl_usage_for_descriptor_type(const VkDescriptorType type)
{
- switch (imageType) {
- case VK_IMAGE_TYPE_1D:
- return (VkOffset3D) { imageOffset.x, 0, 0 };
- case VK_IMAGE_TYPE_2D:
- return (VkOffset3D) { imageOffset.x, imageOffset.y, 0 };
- case VK_IMAGE_TYPE_3D:
- return imageOffset;
- default:
- unreachable("invalid image type");
+ switch (type) {
+ case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER:
+ case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC:
+ return ISL_SURF_USAGE_CONSTANT_BUFFER_BIT;
+ default:
+ return ISL_SURF_USAGE_STORAGE_BIT;
}
}
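/* Worked example for anv_isl_usage_for_descriptor_type() above:
 * VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER and its dynamic variant map to
 * ISL_SURF_USAGE_CONSTANT_BUFFER_BIT, while every other descriptor type
 * (e.g. VK_DESCRIPTOR_TYPE_STORAGE_BUFFER) maps to ISL_SURF_USAGE_STORAGE_BIT.
 */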
static inline uint32_t
anv_rasterization_aa_mode(VkPolygonMode raster_mode,
- VkLineRasterizationModeEXT line_mode)
+ VkLineRasterizationModeKHR line_mode)
{
if (raster_mode == VK_POLYGON_MODE_LINE &&
- line_mode == VK_LINE_RASTERIZATION_MODE_RECTANGULAR_SMOOTH_EXT)
+ line_mode == VK_LINE_RASTERIZATION_MODE_RECTANGULAR_SMOOTH_KHR)
return true;
return false;
}
-VkFormatFeatureFlags
-anv_get_image_format_features(const struct intel_device_info *devinfo,
- VkFormat vk_format,
- const struct anv_format *anv_format,
- VkImageTiling vk_tiling,
- const struct isl_drm_modifier_info *isl_mod_info);
+static inline VkLineRasterizationModeKHR
+anv_line_rasterization_mode(VkLineRasterizationModeKHR line_mode,
+ unsigned rasterization_samples)
+{
+ if (line_mode == VK_LINE_RASTERIZATION_MODE_DEFAULT_KHR) {
+ if (rasterization_samples > 1) {
+ return VK_LINE_RASTERIZATION_MODE_RECTANGULAR_KHR;
+ } else {
+ return VK_LINE_RASTERIZATION_MODE_BRESENHAM_KHR;
+ }
+ }
+ return line_mode;
+}
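/* Worked example for anv_line_rasterization_mode() above (values purely for
 * illustration): the Vulkan "default" line mode is resolved from the sample
 * count, while any explicit mode is returned unchanged.
 *
 *    anv_line_rasterization_mode(VK_LINE_RASTERIZATION_MODE_DEFAULT_KHR, 4)
 *       == VK_LINE_RASTERIZATION_MODE_RECTANGULAR_KHR
 *    anv_line_rasterization_mode(VK_LINE_RASTERIZATION_MODE_DEFAULT_KHR, 1)
 *       == VK_LINE_RASTERIZATION_MODE_BRESENHAM_KHR
 */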
+
+static inline bool
+anv_is_dual_src_blend_factor(VkBlendFactor factor)
+{
+ return factor == VK_BLEND_FACTOR_SRC1_COLOR ||
+ factor == VK_BLEND_FACTOR_ONE_MINUS_SRC1_COLOR ||
+ factor == VK_BLEND_FACTOR_SRC1_ALPHA ||
+ factor == VK_BLEND_FACTOR_ONE_MINUS_SRC1_ALPHA;
+}
+
+static inline bool
+anv_is_dual_src_blend_equation(const struct vk_color_blend_attachment_state *cb)
+{
+ return anv_is_dual_src_blend_factor(cb->src_color_blend_factor) &&
+ anv_is_dual_src_blend_factor(cb->dst_color_blend_factor) &&
+ anv_is_dual_src_blend_factor(cb->src_alpha_blend_factor) &&
+ anv_is_dual_src_blend_factor(cb->dst_alpha_blend_factor);
+}
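/* A small sketch of how the dual-source helpers above might be consulted
 * when programming blend state; the blend_enable field of
 * vk_color_blend_attachment_state is an assumption for illustration.
 */
static inline bool
example_attachment_needs_dual_src(const struct vk_color_blend_attachment_state *cb)
{
   /* Dual-source blending consumes the second fragment color output, so it
    * only matters when blending is actually enabled. */
   return cb->blend_enable && anv_is_dual_src_blend_equation(cb);
}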
+
+VkFormatFeatureFlags2
+anv_get_image_format_features2(const struct anv_physical_device *physical_device,
+ VkFormat vk_format,
+ const struct anv_format *anv_format,
+ VkImageTiling vk_tiling,
+ const struct isl_drm_modifier_info *isl_mod_info);
void anv_fill_buffer_surface_state(struct anv_device *device,
- struct anv_state state,
+ void *surface_state_ptr,
enum isl_format format,
+ struct isl_swizzle swizzle,
isl_surf_usage_flags_t usage,
struct anv_address address,
uint32_t range, uint32_t stride);
-static inline void
-anv_clear_color_from_att_state(union isl_color_value *clear_color,
- const struct anv_attachment_state *att_state,
- const struct anv_image_view *iview)
-{
- const struct isl_format_layout *view_fmtl =
- isl_format_get_layout(iview->planes[0].isl.format);
-
-#define COPY_CLEAR_COLOR_CHANNEL(c, i) \
- if (view_fmtl->channels.c.bits) \
- clear_color->u32[i] = att_state->clear_value.color.uint32[i]
-
- COPY_CLEAR_COLOR_CHANNEL(r, 0);
- COPY_CLEAR_COLOR_CHANNEL(g, 1);
- COPY_CLEAR_COLOR_CHANNEL(b, 2);
- COPY_CLEAR_COLOR_CHANNEL(a, 3);
-
-#undef COPY_CLEAR_COLOR_CHANNEL
-}
-
-
-/* Haswell border color is a bit of a disaster. Float and unorm formats use a
- * straightforward 32-bit float color in the first 64 bytes. Instead of using
- * a nice float/integer union like Gfx8+, Haswell specifies the integer border
- * color as a separate entry /after/ the float color. The layout of this entry
- * also depends on the format's bpp (with extra hacks for RG32), and overlaps.
- *
- * Since we don't know the format/bpp, we can't make any of the border colors
- * containing '1' work for all formats, as it would be in the wrong place for
- * some of them. We opt to make 32-bit integers work as this seems like the
- * most common option. Fortunately, transparent black works regardless, as
- * all zeroes is the same in every bit-size.
- */
-struct hsw_border_color {
- float float32[4];
- uint32_t _pad0[12];
- uint32_t uint32[4];
- uint32_t _pad1[108];
-};
struct gfx8_border_color {
union {
@@ -4517,24 +5856,19 @@ struct gfx8_border_color {
uint32_t _pad[12];
};
-struct anv_ycbcr_conversion {
- struct vk_object_base base;
-
- const struct anv_format * format;
- VkSamplerYcbcrModelConversion ycbcr_model;
- VkSamplerYcbcrRange ycbcr_range;
- VkComponentSwizzle mapping[4];
- VkChromaLocation chroma_offsets[2];
- VkFilter chroma_filter;
- bool chroma_reconstruction;
-};
-
struct anv_sampler {
- struct vk_object_base base;
+ struct vk_sampler vk;
+
+ /* Hash of the sampler state + border color, useful for embedded samplers
+ * and included in the descriptor layout hash.
+ */
+ unsigned char sha1[20];
uint32_t state[3][4];
+ uint32_t db_state[3][4];
+ /* Packed SAMPLER_STATE without the border color pointer. */
+ uint32_t state_no_bc[3][4];
uint32_t n_planes;
- struct anv_ycbcr_conversion *conversion;
/* Blob of sampler state data which is guaranteed to be 32-byte aligned
* and with a 32-byte stride for use as bindless samplers.
@@ -4542,107 +5876,27 @@ struct anv_sampler {
struct anv_state bindless_state;
struct anv_state custom_border_color;
-};
-
-struct anv_framebuffer {
- struct vk_object_base base;
-
- uint32_t width;
- uint32_t height;
- uint32_t layers;
-
- uint32_t attachment_count;
- struct anv_image_view * attachments[0];
-};
-
-struct anv_subpass_attachment {
- VkImageUsageFlagBits usage;
- uint32_t attachment;
- VkImageLayout layout;
-
- /* Used only with attachment containing stencil data. */
- VkImageLayout stencil_layout;
-};
-
-struct anv_subpass {
- uint32_t attachment_count;
-
- /**
- * A pointer to all attachment references used in this subpass.
- * Only valid if ::attachment_count > 0.
- */
- struct anv_subpass_attachment * attachments;
- uint32_t input_count;
- struct anv_subpass_attachment * input_attachments;
- uint32_t color_count;
- struct anv_subpass_attachment * color_attachments;
- struct anv_subpass_attachment * resolve_attachments;
-
- struct anv_subpass_attachment * depth_stencil_attachment;
- struct anv_subpass_attachment * ds_resolve_attachment;
- VkResolveModeFlagBitsKHR depth_resolve_mode;
- VkResolveModeFlagBitsKHR stencil_resolve_mode;
-
- uint32_t view_mask;
-
- /** Subpass has a depth/stencil self-dependency */
- bool has_ds_self_dep;
-
- /** Subpass has at least one color resolve attachment */
- bool has_color_resolve;
-};
-
-static inline unsigned
-anv_subpass_view_count(const struct anv_subpass *subpass)
-{
- return MAX2(1, util_bitcount(subpass->view_mask));
-}
-
-struct anv_render_pass_attachment {
- /* TODO: Consider using VkAttachmentDescription instead of storing each of
- * its members individually.
- */
- VkFormat format;
- uint32_t samples;
- VkImageUsageFlags usage;
- VkAttachmentLoadOp load_op;
- VkAttachmentStoreOp store_op;
- VkAttachmentLoadOp stencil_load_op;
- VkImageLayout initial_layout;
- VkImageLayout final_layout;
- VkImageLayout first_subpass_layout;
-
- VkImageLayout stencil_initial_layout;
- VkImageLayout stencil_final_layout;
-
- /* The subpass id in which the attachment will be used last. */
- uint32_t last_subpass_idx;
-};
-
-struct anv_render_pass {
- struct vk_object_base base;
-
- uint32_t attachment_count;
- uint32_t subpass_count;
- /* An array of subpass_count+1 flushes, one per subpass boundary */
- enum anv_pipe_bits * subpass_flushes;
- struct anv_render_pass_attachment * attachments;
- struct anv_subpass subpasses[0];
+ struct anv_state custom_border_color_db;
};
#define ANV_PIPELINE_STATISTICS_MASK 0x000007ff
struct anv_query_pool {
- struct vk_object_base base;
+ struct vk_query_pool vk;
- VkQueryType type;
- VkQueryPipelineStatisticFlags pipeline_statistics;
/** Stride between slots, in bytes */
uint32_t stride;
/** Number of slots in this query pool */
- uint32_t slots;
struct anv_bo * bo;
+ /** Offset of the KHR_performance_query small batches that update
+ * ANV_PERF_QUERY_OFFSET_REG
+ */
+ uint32_t khr_perf_preambles_offset;
+
+ /** Size of each small batch */
+ uint32_t khr_perf_preamble_stride;
+
/* KHR perf queries : */
uint32_t pass_size;
uint32_t data_offset;
@@ -4656,40 +5910,59 @@ struct anv_query_pool {
static inline uint32_t khr_perf_query_preamble_offset(const struct anv_query_pool *pool,
uint32_t pass)
{
- return pool->pass_size * pass + 8;
+ return pool->khr_perf_preambles_offset +
+ pool->khr_perf_preamble_stride * pass;
}
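/* Worked example for khr_perf_query_preamble_offset() above (numbers purely
 * illustrative): with khr_perf_preambles_offset = 4096 and
 * khr_perf_preamble_stride = 64, the preamble batch for pass 3 starts at
 * 4096 + 64 * 3 = 4288 bytes into the query pool BO.
 */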
-struct anv_acceleration_structure {
- struct vk_object_base base;
+struct anv_vid_mem {
+ struct anv_device_memory *mem;
+ VkDeviceSize offset;
+ VkDeviceSize size;
+};
- VkDeviceSize size;
- struct anv_address address;
+#define ANV_VIDEO_MEM_REQS_H264 4
+#define ANV_VIDEO_MEM_REQS_H265 9
+#define ANV_MB_WIDTH 16
+#define ANV_MB_HEIGHT 16
+#define ANV_VIDEO_H264_MAX_NUM_REF_FRAME 16
+#define ANV_VIDEO_H265_MAX_NUM_REF_FRAME 16
+#define ANV_VIDEO_H265_HCP_NUM_REF_FRAME 8
+#define ANV_MAX_H265_CTB_SIZE 64
+
+enum anv_vid_mem_h264_types {
+ ANV_VID_MEM_H264_INTRA_ROW_STORE,
+ ANV_VID_MEM_H264_DEBLOCK_FILTER_ROW_STORE,
+ ANV_VID_MEM_H264_BSD_MPC_ROW_SCRATCH,
+ ANV_VID_MEM_H264_MPR_ROW_SCRATCH,
+ ANV_VID_MEM_H264_MAX,
};
-int anv_get_instance_entrypoint_index(const char *name);
-int anv_get_device_entrypoint_index(const char *name);
-int anv_get_physical_device_entrypoint_index(const char *name);
+enum anv_vid_mem_h265_types {
+ ANV_VID_MEM_H265_DEBLOCK_FILTER_ROW_STORE_LINE,
+ ANV_VID_MEM_H265_DEBLOCK_FILTER_ROW_STORE_TILE_LINE,
+ ANV_VID_MEM_H265_DEBLOCK_FILTER_ROW_STORE_TILE_COLUMN,
+ ANV_VID_MEM_H265_METADATA_LINE,
+ ANV_VID_MEM_H265_METADATA_TILE_LINE,
+ ANV_VID_MEM_H265_METADATA_TILE_COLUMN,
+ ANV_VID_MEM_H265_SAO_LINE,
+ ANV_VID_MEM_H265_SAO_TILE_LINE,
+ ANV_VID_MEM_H265_SAO_TILE_COLUMN,
+ ANV_VID_MEM_H265_MAX,
+};
-const char *anv_get_instance_entry_name(int index);
-const char *anv_get_physical_device_entry_name(int index);
-const char *anv_get_device_entry_name(int index);
+struct anv_video_session {
+ struct vk_video_session vk;
-bool
-anv_instance_entrypoint_is_enabled(int index, uint32_t core_version,
- const struct vk_instance_extension_table *instance);
-bool
-anv_physical_device_entrypoint_is_enabled(int index, uint32_t core_version,
- const struct vk_instance_extension_table *instance);
-bool
-anv_device_entrypoint_is_enabled(int index, uint32_t core_version,
- const struct vk_instance_extension_table *instance,
- const struct vk_device_extension_table *device);
+ /* the decoder needs some private memory allocations */
+ struct anv_vid_mem vid_mem[ANV_VID_MEM_H265_MAX];
+};
-const struct vk_device_dispatch_table *
-anv_get_device_dispatch_table(const struct intel_device_info *devinfo);
+struct anv_video_session_params {
+ struct vk_video_session_parameters vk;
+};
void
-anv_dump_pipe_bits(enum anv_pipe_bits bits);
+anv_dump_pipe_bits(enum anv_pipe_bits bits, FILE *f);
static inline void
anv_add_pending_pipe_bits(struct anv_cmd_buffer* cmd_buffer,
@@ -4697,27 +5970,17 @@ anv_add_pending_pipe_bits(struct anv_cmd_buffer* cmd_buffer,
const char* reason)
{
cmd_buffer->state.pending_pipe_bits |= bits;
- if (unlikely(INTEL_DEBUG & DEBUG_PIPE_CONTROL) && bits)
- {
- fputs("pc: add ", stderr);
- anv_dump_pipe_bits(bits);
- fprintf(stderr, "reason: %s\n", reason);
+ if (INTEL_DEBUG(DEBUG_PIPE_CONTROL) && bits) {
+ fputs("pc: add ", stdout);
+ anv_dump_pipe_bits(bits, stdout);
+ fprintf(stdout, "reason: %s\n", reason);
+ }
+ /* Store the reason, if space is available */
+ if (cmd_buffer->state.pc_reasons_count <
+ ARRAY_SIZE(cmd_buffer->state.pc_reasons)) {
+ cmd_buffer->state.pc_reasons[
+ cmd_buffer->state.pc_reasons_count++] = reason;
}
-}
-
-static inline uint32_t
-anv_get_subpass_id(const struct anv_cmd_state * const cmd_state)
-{
- /* This function must be called from within a subpass. */
- assert(cmd_state->pass && cmd_state->subpass);
-
- const uint32_t subpass_id = cmd_state->subpass - cmd_state->pass->subpasses;
-
- /* The id of this subpass shouldn't exceed the number of subpasses in this
- * render pass minus 1.
- */
- assert(subpass_id < cmd_state->pass->subpass_count);
- return subpass_id;
}
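/* A usage sketch for anv_add_pending_pipe_bits() above; the specific bit and
 * reason string are assumptions for illustration only:
 *
 *    anv_add_pending_pipe_bits(cmd_buffer,
 *                              ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT,
 *                              "end of rendering");
 *
 * The bits only accumulate in cmd_buffer->state.pending_pipe_bits here; an
 * actual PIPE_CONTROL is emitted later when the pending bits are flushed.
 */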
struct anv_performance_configuration_intel {
@@ -4728,6 +5991,7 @@ struct anv_performance_configuration_intel {
uint64_t config_id;
};
+void anv_physical_device_init_va_ranges(struct anv_physical_device *device);
void anv_physical_device_init_perf(struct anv_physical_device *device, int fd);
void anv_device_perf_init(struct anv_device *device);
void anv_perf_write_pass_results(struct intel_perf_config *perf,
@@ -4735,25 +5999,108 @@ void anv_perf_write_pass_results(struct intel_perf_config *perf,
const struct intel_perf_query_result *accumulated_results,
union VkPerformanceCounterResultKHR *results);
+void anv_apply_per_prim_attr_wa(struct nir_shader *ms_nir,
+ struct nir_shader *fs_nir,
+ struct anv_device *device,
+ const VkGraphicsPipelineCreateInfo *info);
+
+/* Used to emit a series of memcpy operations */
+struct anv_memcpy_state {
+ struct anv_device *device;
+ struct anv_batch *batch;
+
+ struct anv_vb_cache_range vb_bound;
+ struct anv_vb_cache_range vb_dirty;
+};
+
+VkResult anv_device_init_internal_kernels(struct anv_device *device);
+void anv_device_finish_internal_kernels(struct anv_device *device);
+VkResult anv_device_get_internal_shader(struct anv_device *device,
+ enum anv_internal_kernel_name name,
+ struct anv_shader_bin **out_bin);
+
+VkResult anv_device_init_astc_emu(struct anv_device *device);
+void anv_device_finish_astc_emu(struct anv_device *device);
+void anv_astc_emu_process(struct anv_cmd_buffer *cmd_buffer,
+ struct anv_image *image,
+ VkImageLayout layout,
+ const VkImageSubresourceLayers *subresource,
+ VkOffset3D block_offset,
+ VkExtent3D block_extent);
+
+/* This structure is used in 2 scenarios:
+ *
+ * - copy utrace timestamps from a command buffer so that the command buffer
+ * can be resubmitted multiple times without the recorded timestamps being
+ * overwritten before they're read back
+ *
+ * - emit trace points for queue debug tagging
+ * (vkQueueBeginDebugUtilsLabelEXT/vkQueueEndDebugUtilsLabelEXT)
+ */
+struct anv_utrace_submit {
+ /* Needs to be the first field */
+ struct intel_ds_flush_data ds;
+
+ /* Batch data used to implement a copy of the timestamps recorded in another
+ * buffer.
+ */
+ struct anv_reloc_list relocs;
+ struct anv_batch batch;
+ struct util_dynarray batch_bos;
+
+ /* Stream for temporary allocations */
+ struct anv_state_stream dynamic_state_stream;
+ struct anv_state_stream general_state_stream;
+
+ /* Syncobj to be signaled when the batch completes */
+ struct vk_sync *sync;
+
+ /* Queue on which all the recorded traces are submitted */
+ struct anv_queue *queue;
+
+ /* Buffer of 64-bit timestamps (only used for timestamp copies) */
+ struct anv_bo *trace_bo;
+
+ /* Last fully read 64bit timestamp (used to rebuild the upper bits of 32bit
+ * timestamps)
+ */
+ uint64_t last_full_timestamp;
+
+ /* Memcpy state tracking (only used for timestamp copies on render engine) */
+ struct anv_memcpy_state memcpy_state;
+
+ /* Memcpy state tracking (only used for timestamp copies on compute engine) */
+ struct anv_simple_shader simple_state;
+};
+
+void anv_device_utrace_init(struct anv_device *device);
+void anv_device_utrace_finish(struct anv_device *device);
+VkResult
+anv_device_utrace_flush_cmd_buffers(struct anv_queue *queue,
+ uint32_t cmd_buffer_count,
+ struct anv_cmd_buffer **cmd_buffers,
+ struct anv_utrace_submit **out_submit);
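/* A submission-time sketch (editor's assumption about the intended use of
 * anv_device_utrace_flush_cmd_buffers(), not a copy of the driver's submit
 * path; error handling elided):
 *
 *    struct anv_utrace_submit *utrace_submit = NULL;
 *    VkResult result =
 *       anv_device_utrace_flush_cmd_buffers(queue, cmd_buffer_count,
 *                                           cmd_buffers, &utrace_submit);
 *    if (result == VK_SUCCESS && utrace_submit != NULL) {
 *       // Execute the copy batch after the real workload so the recorded
 *       // timestamps survive a resubmission of the command buffers.
 *    }
 */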
+
+static bool
+anv_has_cooperative_matrix(const struct anv_physical_device *device)
+{
+ return device->has_cooperative_matrix;
+}
+
#define ANV_FROM_HANDLE(__anv_type, __name, __handle) \
VK_FROM_HANDLE(__anv_type, __name, __handle)
-VK_DEFINE_HANDLE_CASTS(anv_cmd_buffer, base, VkCommandBuffer,
+VK_DEFINE_HANDLE_CASTS(anv_cmd_buffer, vk.base, VkCommandBuffer,
VK_OBJECT_TYPE_COMMAND_BUFFER)
VK_DEFINE_HANDLE_CASTS(anv_device, vk.base, VkDevice, VK_OBJECT_TYPE_DEVICE)
VK_DEFINE_HANDLE_CASTS(anv_instance, vk.base, VkInstance, VK_OBJECT_TYPE_INSTANCE)
VK_DEFINE_HANDLE_CASTS(anv_physical_device, vk.base, VkPhysicalDevice,
VK_OBJECT_TYPE_PHYSICAL_DEVICE)
-VK_DEFINE_HANDLE_CASTS(anv_queue, base, VkQueue, VK_OBJECT_TYPE_QUEUE)
-
-VK_DEFINE_NONDISP_HANDLE_CASTS(anv_acceleration_structure, base,
- VkAccelerationStructureKHR,
- VK_OBJECT_TYPE_ACCELERATION_STRUCTURE_KHR)
-VK_DEFINE_NONDISP_HANDLE_CASTS(anv_cmd_pool, base, VkCommandPool,
- VK_OBJECT_TYPE_COMMAND_POOL)
-VK_DEFINE_NONDISP_HANDLE_CASTS(anv_buffer, base, VkBuffer,
+VK_DEFINE_HANDLE_CASTS(anv_queue, vk.base, VkQueue, VK_OBJECT_TYPE_QUEUE)
+
+VK_DEFINE_NONDISP_HANDLE_CASTS(anv_buffer, vk.base, VkBuffer,
VK_OBJECT_TYPE_BUFFER)
-VK_DEFINE_NONDISP_HANDLE_CASTS(anv_buffer_view, base, VkBufferView,
+VK_DEFINE_NONDISP_HANDLE_CASTS(anv_buffer_view, vk.base, VkBufferView,
VK_OBJECT_TYPE_BUFFER_VIEW)
VK_DEFINE_NONDISP_HANDLE_CASTS(anv_descriptor_pool, base, VkDescriptorPool,
VK_OBJECT_TYPE_DESCRIPTOR_POOL)
@@ -4762,51 +6109,33 @@ VK_DEFINE_NONDISP_HANDLE_CASTS(anv_descriptor_set, base, VkDescriptorSet,
VK_DEFINE_NONDISP_HANDLE_CASTS(anv_descriptor_set_layout, base,
VkDescriptorSetLayout,
VK_OBJECT_TYPE_DESCRIPTOR_SET_LAYOUT)
-VK_DEFINE_NONDISP_HANDLE_CASTS(anv_descriptor_update_template, base,
- VkDescriptorUpdateTemplate,
- VK_OBJECT_TYPE_DESCRIPTOR_UPDATE_TEMPLATE)
-VK_DEFINE_NONDISP_HANDLE_CASTS(anv_device_memory, base, VkDeviceMemory,
+VK_DEFINE_NONDISP_HANDLE_CASTS(anv_device_memory, vk.base, VkDeviceMemory,
VK_OBJECT_TYPE_DEVICE_MEMORY)
-VK_DEFINE_NONDISP_HANDLE_CASTS(anv_fence, base, VkFence, VK_OBJECT_TYPE_FENCE)
VK_DEFINE_NONDISP_HANDLE_CASTS(anv_event, base, VkEvent, VK_OBJECT_TYPE_EVENT)
-VK_DEFINE_NONDISP_HANDLE_CASTS(anv_framebuffer, base, VkFramebuffer,
- VK_OBJECT_TYPE_FRAMEBUFFER)
VK_DEFINE_NONDISP_HANDLE_CASTS(anv_image, vk.base, VkImage, VK_OBJECT_TYPE_IMAGE)
VK_DEFINE_NONDISP_HANDLE_CASTS(anv_image_view, vk.base, VkImageView,
VK_OBJECT_TYPE_IMAGE_VIEW);
-VK_DEFINE_NONDISP_HANDLE_CASTS(anv_pipeline_cache, base, VkPipelineCache,
- VK_OBJECT_TYPE_PIPELINE_CACHE)
VK_DEFINE_NONDISP_HANDLE_CASTS(anv_pipeline, base, VkPipeline,
VK_OBJECT_TYPE_PIPELINE)
VK_DEFINE_NONDISP_HANDLE_CASTS(anv_pipeline_layout, base, VkPipelineLayout,
VK_OBJECT_TYPE_PIPELINE_LAYOUT)
-VK_DEFINE_NONDISP_HANDLE_CASTS(anv_query_pool, base, VkQueryPool,
+VK_DEFINE_NONDISP_HANDLE_CASTS(anv_query_pool, vk.base, VkQueryPool,
VK_OBJECT_TYPE_QUERY_POOL)
-VK_DEFINE_NONDISP_HANDLE_CASTS(anv_render_pass, base, VkRenderPass,
- VK_OBJECT_TYPE_RENDER_PASS)
-VK_DEFINE_NONDISP_HANDLE_CASTS(anv_sampler, base, VkSampler,
+VK_DEFINE_NONDISP_HANDLE_CASTS(anv_sampler, vk.base, VkSampler,
VK_OBJECT_TYPE_SAMPLER)
-VK_DEFINE_NONDISP_HANDLE_CASTS(anv_semaphore, base, VkSemaphore,
- VK_OBJECT_TYPE_SEMAPHORE)
-VK_DEFINE_NONDISP_HANDLE_CASTS(anv_ycbcr_conversion, base,
- VkSamplerYcbcrConversion,
- VK_OBJECT_TYPE_SAMPLER_YCBCR_CONVERSION)
VK_DEFINE_NONDISP_HANDLE_CASTS(anv_performance_configuration_intel, base,
VkPerformanceConfigurationINTEL,
VK_OBJECT_TYPE_PERFORMANCE_CONFIGURATION_INTEL)
+VK_DEFINE_NONDISP_HANDLE_CASTS(anv_video_session, vk.base,
+ VkVideoSessionKHR,
+ VK_OBJECT_TYPE_VIDEO_SESSION_KHR)
+VK_DEFINE_NONDISP_HANDLE_CASTS(anv_video_session_params, vk.base,
+ VkVideoSessionParametersKHR,
+ VK_OBJECT_TYPE_VIDEO_SESSION_PARAMETERS_KHR)
#define anv_genX(devinfo, thing) ({ \
__typeof(&gfx9_##thing) genX_thing; \
switch ((devinfo)->verx10) { \
- case 70: \
- genX_thing = &gfx7_##thing; \
- break; \
- case 75: \
- genX_thing = &gfx75_##thing; \
- break; \
- case 80: \
- genX_thing = &gfx8_##thing; \
- break; \
case 90: \
genX_thing = &gfx9_##thing; \
break; \
@@ -4819,6 +6148,9 @@ VK_DEFINE_NONDISP_HANDLE_CASTS(anv_performance_configuration_intel, base,
case 125: \
genX_thing = &gfx125_##thing; \
break; \
+ case 200: \
+ genX_thing = &gfx20_##thing; \
+ break; \
default: \
unreachable("Unknown hardware generation"); \
} \
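/* A dispatch sketch for the anv_genX() macro above; init_device_state is a
 * plausible genX entry point used purely for illustration:
 *
 *    anv_genX(device->info, init_device_state)(device);
 *
 * resolves to gfx9_init_device_state(), gfx125_init_device_state(),
 * gfx20_init_device_state(), etc., based on devinfo->verx10.
 */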
@@ -4829,15 +6161,6 @@ VK_DEFINE_NONDISP_HANDLE_CASTS(anv_performance_configuration_intel, base,
#ifdef genX
# include "anv_genX.h"
#else
-# define genX(x) gfx7_##x
-# include "anv_genX.h"
-# undef genX
-# define genX(x) gfx75_##x
-# include "anv_genX.h"
-# undef genX
-# define genX(x) gfx8_##x
-# include "anv_genX.h"
-# undef genX
# define genX(x) gfx9_##x
# include "anv_genX.h"
# undef genX
@@ -4850,6 +6173,13 @@ VK_DEFINE_NONDISP_HANDLE_CASTS(anv_performance_configuration_intel, base,
# define genX(x) gfx125_##x
# include "anv_genX.h"
# undef genX
+# define genX(x) gfx20_##x
+# include "anv_genX.h"
+# undef genX
+#endif
+
+#ifdef __cplusplus
+}
#endif
#endif /* ANV_PRIVATE_H */
diff --git a/src/intel/vulkan/anv_queue.c b/src/intel/vulkan/anv_queue.c
index f94223b1a30..1989016f6b2 100644
--- a/src/intel/vulkan/anv_queue.c
+++ b/src/intel/vulkan/anv_queue.c
@@ -22,2668 +22,106 @@
*/
/**
- * This file implements VkQueue, VkFence, and VkSemaphore
+ * This file implements VkQueue
*/
-#include <errno.h>
-#include <fcntl.h>
-#include <unistd.h>
-
-#include "util/os_file.h"
-
#include "anv_private.h"
-#include "anv_measure.h"
-#include "vk_util.h"
-
-#include "genxml/gen7_pack.h"
-
-uint64_t anv_gettime_ns(void)
-{
- struct timespec current;
- clock_gettime(CLOCK_MONOTONIC, &current);
- return (uint64_t)current.tv_sec * NSEC_PER_SEC + current.tv_nsec;
-}
-
-uint64_t anv_get_absolute_timeout(uint64_t timeout)
-{
- if (timeout == 0)
- return 0;
- uint64_t current_time = anv_gettime_ns();
- uint64_t max_timeout = (uint64_t) INT64_MAX - current_time;
-
- timeout = MIN2(max_timeout, timeout);
-
- return (current_time + timeout);
-}
-
-static int64_t anv_get_relative_timeout(uint64_t abs_timeout)
-{
- uint64_t now = anv_gettime_ns();
-
- /* We don't want negative timeouts.
- *
- * DRM_IOCTL_I915_GEM_WAIT uses a signed 64 bit timeout and is
- * supposed to block indefinitely for timeouts < 0. Unfortunately,
- * this was broken for a couple of kernel releases. Since there's
- * no way to know whether or not the kernel we're using is one of
- * the broken ones, the best we can do is to clamp the timeout to
- * INT64_MAX. This limits the maximum timeout from 584 years to
- * 292 years - likely not a big deal.
- */
- if (abs_timeout < now)
- return 0;
-
- uint64_t rel_timeout = abs_timeout - now;
- if (rel_timeout > (uint64_t) INT64_MAX)
- rel_timeout = INT64_MAX;
-
- return rel_timeout;
-}
-
-static void anv_semaphore_impl_cleanup(struct anv_device *device,
- struct anv_semaphore_impl *impl);
-
-static void
-anv_queue_submit_free(struct anv_device *device,
- struct anv_queue_submit *submit)
-{
- const VkAllocationCallbacks *alloc = submit->alloc;
-
- for (uint32_t i = 0; i < submit->temporary_semaphore_count; i++)
- anv_semaphore_impl_cleanup(device, &submit->temporary_semaphores[i]);
- /* Execbuf does not consume the in_fence. It's our job to close it. */
- if (submit->in_fence != -1) {
- assert(!device->has_thread_submit);
- close(submit->in_fence);
- }
- if (submit->out_fence != -1) {
- assert(!device->has_thread_submit);
- close(submit->out_fence);
- }
- vk_free(alloc, submit->fences);
- vk_free(alloc, submit->fence_values);
- vk_free(alloc, submit->temporary_semaphores);
- vk_free(alloc, submit->wait_timelines);
- vk_free(alloc, submit->wait_timeline_values);
- vk_free(alloc, submit->signal_timelines);
- vk_free(alloc, submit->signal_timeline_values);
- vk_free(alloc, submit->fence_bos);
- vk_free(alloc, submit->cmd_buffers);
- vk_free(alloc, submit);
-}
-
-static bool
-anv_queue_submit_ready_locked(struct anv_queue_submit *submit)
-{
- for (uint32_t i = 0; i < submit->wait_timeline_count; i++) {
- if (submit->wait_timeline_values[i] > submit->wait_timelines[i]->highest_pending)
- return false;
- }
-
- return true;
-}
-
-static VkResult
-anv_timeline_init(struct anv_device *device,
- struct anv_timeline *timeline,
- uint64_t initial_value)
-{
- timeline->highest_past =
- timeline->highest_pending = initial_value;
- list_inithead(&timeline->points);
- list_inithead(&timeline->free_points);
-
- return VK_SUCCESS;
-}
-static void
-anv_timeline_finish(struct anv_device *device,
- struct anv_timeline *timeline)
-{
- list_for_each_entry_safe(struct anv_timeline_point, point,
- &timeline->free_points, link) {
- list_del(&point->link);
- anv_device_release_bo(device, point->bo);
- vk_free(&device->vk.alloc, point);
- }
- list_for_each_entry_safe(struct anv_timeline_point, point,
- &timeline->points, link) {
- list_del(&point->link);
- anv_device_release_bo(device, point->bo);
- vk_free(&device->vk.alloc, point);
- }
-}
+#include "i915/anv_queue.h"
+#include "xe/anv_queue.h"
static VkResult
-anv_timeline_add_point_locked(struct anv_device *device,
- struct anv_timeline *timeline,
- uint64_t value,
- struct anv_timeline_point **point)
+anv_create_engine(struct anv_device *device,
+ struct anv_queue *queue,
+ const VkDeviceQueueCreateInfo *pCreateInfo)
{
- VkResult result = VK_SUCCESS;
-
- if (list_is_empty(&timeline->free_points)) {
- *point =
- vk_zalloc(&device->vk.alloc, sizeof(**point),
- 8, VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
- if (!(*point))
- result = vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
- if (result == VK_SUCCESS) {
- result = anv_device_alloc_bo(device, "timeline-semaphore", 4096,
- ANV_BO_ALLOC_EXTERNAL |
- ANV_BO_ALLOC_IMPLICIT_SYNC,
- 0 /* explicit_address */,
- &(*point)->bo);
- if (result != VK_SUCCESS)
- vk_free(&device->vk.alloc, *point);
- }
- } else {
- *point = list_first_entry(&timeline->free_points,
- struct anv_timeline_point, link);
- list_del(&(*point)->link);
- }
-
- if (result == VK_SUCCESS) {
- (*point)->serial = value;
- list_addtail(&(*point)->link, &timeline->points);
- }
-
- return result;
-}
-
-static VkResult
-anv_timeline_gc_locked(struct anv_device *device,
- struct anv_timeline *timeline)
-{
- list_for_each_entry_safe(struct anv_timeline_point, point,
- &timeline->points, link) {
- /* timeline->highest_pending is only incremented once submission has
- * happened. If this point has a greater serial, it means the point
- * hasn't been submitted yet.
- */
- if (point->serial > timeline->highest_pending)
- return VK_SUCCESS;
-
- /* If someone is waiting on this time point, consider it busy and don't
- * try to recycle it. There's a slim possibility that it's no longer
- * busy by the time we look at it but we would be recycling it out from
- * under a waiter and that can lead to weird races.
- *
- * We walk the list in-order so if this time point is still busy so is
- * every following time point
- */
- assert(point->waiting >= 0);
- if (point->waiting)
- return VK_SUCCESS;
-
- /* Garbage collect any signaled point. */
- VkResult result = anv_device_bo_busy(device, point->bo);
- if (result == VK_NOT_READY) {
- /* We walk the list in-order so if this time point is still busy so
- * is every following time point
- */
- return VK_SUCCESS;
- } else if (result != VK_SUCCESS) {
- return result;
- }
-
- assert(timeline->highest_past < point->serial);
- timeline->highest_past = point->serial;
-
- list_del(&point->link);
- list_add(&point->link, &timeline->free_points);
- }
-
- return VK_SUCCESS;
-}
-
-static VkResult anv_queue_submit_add_fence_bo(struct anv_queue_submit *submit,
- struct anv_bo *bo,
- bool signal);
-
-static VkResult
-anv_queue_submit_timeline_locked(struct anv_queue *queue,
- struct anv_queue_submit *submit)
-{
- VkResult result;
-
- for (uint32_t i = 0; i < submit->wait_timeline_count; i++) {
- struct anv_timeline *timeline = submit->wait_timelines[i];
- uint64_t wait_value = submit->wait_timeline_values[i];
-
- if (timeline->highest_past >= wait_value)
- continue;
-
- list_for_each_entry(struct anv_timeline_point, point, &timeline->points, link) {
- if (point->serial < wait_value)
- continue;
- result = anv_queue_submit_add_fence_bo(submit, point->bo, false);
- if (result != VK_SUCCESS)
- return result;
- break;
- }
- }
- for (uint32_t i = 0; i < submit->signal_timeline_count; i++) {
- struct anv_timeline *timeline = submit->signal_timelines[i];
- uint64_t signal_value = submit->signal_timeline_values[i];
- struct anv_timeline_point *point;
-
- result = anv_timeline_add_point_locked(queue->device, timeline,
- signal_value, &point);
- if (result != VK_SUCCESS)
- return result;
-
- result = anv_queue_submit_add_fence_bo(submit, point->bo, true);
- if (result != VK_SUCCESS)
- return result;
- }
-
- result = anv_queue_execbuf_locked(queue, submit);
-
- if (result == VK_SUCCESS) {
- /* Update the pending values in the timeline objects. */
- for (uint32_t i = 0; i < submit->signal_timeline_count; i++) {
- struct anv_timeline *timeline = submit->signal_timelines[i];
- uint64_t signal_value = submit->signal_timeline_values[i];
-
- assert(signal_value > timeline->highest_pending);
- timeline->highest_pending = signal_value;
- }
- } else {
- /* Unblock any waiter by signaling the points, the application will get
- * a device lost error code.
- */
- for (uint32_t i = 0; i < submit->signal_timeline_count; i++) {
- struct anv_timeline *timeline = submit->signal_timelines[i];
- uint64_t signal_value = submit->signal_timeline_values[i];
-
- assert(signal_value > timeline->highest_pending);
- timeline->highest_past = timeline->highest_pending = signal_value;
- }
- }
-
- return result;
-}
-
-static VkResult
-anv_queue_submit_deferred_locked(struct anv_queue *queue, uint32_t *advance)
-{
- VkResult result = VK_SUCCESS;
-
- /* Go through all the queued submissions and submit them until we find one
- * that's waiting on a point that hasn't materialized yet.
- */
- list_for_each_entry_safe(struct anv_queue_submit, submit,
- &queue->queued_submits, link) {
- if (!anv_queue_submit_ready_locked(submit))
- break;
-
- (*advance)++;
- list_del(&submit->link);
-
- result = anv_queue_submit_timeline_locked(queue, submit);
-
- anv_queue_submit_free(queue->device, submit);
-
- if (result != VK_SUCCESS)
- break;
+ switch (device->info->kmd_type) {
+ case INTEL_KMD_TYPE_I915:
+ return anv_i915_create_engine(device, queue, pCreateInfo);
+ case INTEL_KMD_TYPE_XE:
+ return anv_xe_create_engine(device, queue, pCreateInfo);
+ default:
+ unreachable("Missing");
+ return VK_ERROR_UNKNOWN;
}
-
- return result;
-}
-
-static VkResult
-anv_device_submit_deferred_locked(struct anv_device *device)
-{
- VkResult result = VK_SUCCESS;
-
- uint32_t advance;
- do {
- advance = 0;
- for (uint32_t i = 0; i < device->queue_count; i++) {
- struct anv_queue *queue = &device->queues[i];
- VkResult qres = anv_queue_submit_deferred_locked(queue, &advance);
- if (qres != VK_SUCCESS)
- result = qres;
- }
- } while (advance);
-
- return result;
}
static void
-anv_queue_submit_signal_fences(struct anv_device *device,
- struct anv_queue_submit *submit)
-{
- for (uint32_t i = 0; i < submit->fence_count; i++) {
- if (submit->fences[i].flags & I915_EXEC_FENCE_SIGNAL) {
- anv_gem_syncobj_timeline_signal(device, &submit->fences[i].handle,
- &submit->fence_values[i], 1);
- }
- }
-}
-
-static void *
-anv_queue_task(void *_queue)
-{
- struct anv_queue *queue = _queue;
-
- pthread_mutex_lock(&queue->mutex);
-
- while (!queue->quit) {
- while (!list_is_empty(&queue->queued_submits)) {
- struct anv_queue_submit *submit =
- list_first_entry(&queue->queued_submits, struct anv_queue_submit, link);
- list_del(&submit->link);
-
- pthread_mutex_unlock(&queue->mutex);
-
- VkResult result = VK_ERROR_DEVICE_LOST;
-
- /* Wait for timeline points to materialize before submitting. We need
- * to do this because we're using threads to do the submit to i915.
- * We could end up in a situation where the application submits to 2
- * queues with the first submit creating the dma-fence for the
- * second. But because the scheduling of the submission threads might
- * wakeup the second queue thread first, this would make that execbuf
- * fail because the dma-fence it depends on hasn't materialized yet.
- */
- if (!queue->lost && submit->wait_timeline_count > 0) {
- int ret = queue->device->info.no_hw ? 0 :
- anv_gem_syncobj_timeline_wait(
- queue->device, submit->wait_timeline_syncobjs,
- submit->wait_timeline_values, submit->wait_timeline_count,
- anv_get_absolute_timeout(UINT64_MAX) /* wait forever */,
- true /* wait for all */, true /* wait for materialize */);
- if (ret) {
- result = anv_queue_set_lost(queue, "timeline timeout: %s",
- strerror(errno));
- }
- }
-
- /* Now submit */
- if (!queue->lost) {
- pthread_mutex_lock(&queue->device->mutex);
- result = anv_queue_execbuf_locked(queue, submit);
- pthread_mutex_unlock(&queue->device->mutex);
- }
-
- if (result != VK_SUCCESS) {
- /* vkQueueSubmit or some other entry point will report the
- * DEVICE_LOST error at some point, but until we have emptied our
- * list of execbufs we need to wake up all the potential waiters
- * until one of them spots the error.
- */
- anv_queue_submit_signal_fences(queue->device, submit);
- }
-
- anv_queue_submit_free(queue->device, submit);
-
- pthread_mutex_lock(&queue->mutex);
- }
-
- if (!queue->quit)
- pthread_cond_wait(&queue->cond, &queue->mutex);
- }
-
- pthread_mutex_unlock(&queue->mutex);
-
- return NULL;
-}
-
-static VkResult
-anv_queue_submit_post(struct anv_queue *queue,
- struct anv_queue_submit **_submit,
- bool flush_queue)
-{
- struct anv_queue_submit *submit = *_submit;
-
- /* Wait before signal behavior means we might keep alive the
- * anv_queue_submit object a bit longer, so transfer the ownership to the
- * anv_queue.
- */
- *_submit = NULL;
- if (queue->device->has_thread_submit) {
- pthread_mutex_lock(&queue->mutex);
- pthread_cond_broadcast(&queue->cond);
- list_addtail(&submit->link, &queue->queued_submits);
- pthread_mutex_unlock(&queue->mutex);
- return VK_SUCCESS;
- } else {
- pthread_mutex_lock(&queue->device->mutex);
- list_addtail(&submit->link, &queue->queued_submits);
- VkResult result = anv_device_submit_deferred_locked(queue->device);
- if (flush_queue) {
- while (result == VK_SUCCESS && !list_is_empty(&queue->queued_submits)) {
- int ret = pthread_cond_wait(&queue->device->queue_submit,
- &queue->device->mutex);
- if (ret != 0) {
- result = anv_device_set_lost(queue->device, "wait timeout");
- break;
- }
-
- result = anv_device_submit_deferred_locked(queue->device);
- }
- }
- pthread_mutex_unlock(&queue->device->mutex);
- return result;
- }
-}
-
-VkResult
-anv_queue_init(struct anv_device *device, struct anv_queue *queue,
- uint32_t exec_flags,
- const VkDeviceQueueCreateInfo *pCreateInfo)
-{
- struct anv_physical_device *pdevice = device->physical;
- VkResult result;
-
- queue->device = device;
- queue->flags = pCreateInfo->flags;
-
- assert(pCreateInfo->queueFamilyIndex < pdevice->queue.family_count);
- queue->family = &pdevice->queue.families[pCreateInfo->queueFamilyIndex];
-
- queue->exec_flags = exec_flags;
- queue->lost = false;
- queue->quit = false;
-
- list_inithead(&queue->queued_submits);
-
- /* We only need those additional thread/mutex when using a thread for
- * submission.
- */
- if (device->has_thread_submit) {
- if (pthread_mutex_init(&queue->mutex, NULL) != 0)
- return vk_error(VK_ERROR_INITIALIZATION_FAILED);
-
- if (pthread_cond_init(&queue->cond, NULL) != 0) {
- result = vk_error(VK_ERROR_INITIALIZATION_FAILED);
- goto fail_mutex;
- }
- if (pthread_create(&queue->thread, NULL, anv_queue_task, queue)) {
- result = vk_error(VK_ERROR_INITIALIZATION_FAILED);
- goto fail_cond;
- }
- }
-
- vk_object_base_init(&device->vk, &queue->base, VK_OBJECT_TYPE_QUEUE);
-
- return VK_SUCCESS;
-
- fail_cond:
- pthread_cond_destroy(&queue->cond);
- fail_mutex:
- pthread_mutex_destroy(&queue->mutex);
-
- return result;
-}
-
-void
-anv_queue_finish(struct anv_queue *queue)
-{
- if (queue->device->has_thread_submit) {
- pthread_mutex_lock(&queue->mutex);
- pthread_cond_broadcast(&queue->cond);
- queue->quit = true;
- pthread_mutex_unlock(&queue->mutex);
-
- void *ret;
- pthread_join(queue->thread, &ret);
-
- pthread_cond_destroy(&queue->cond);
- pthread_mutex_destroy(&queue->mutex);
- }
-
- vk_object_base_finish(&queue->base);
-}
-
-static VkResult
-anv_queue_submit_add_fence_bo(struct anv_queue_submit *submit,
- struct anv_bo *bo,
- bool signal)
-{
- if (submit->fence_bo_count >= submit->fence_bo_array_length) {
- uint32_t new_len = MAX2(submit->fence_bo_array_length * 2, 64);
- uintptr_t *new_fence_bos =
- vk_realloc(submit->alloc,
- submit->fence_bos, new_len * sizeof(*submit->fence_bos),
- 8, submit->alloc_scope);
- if (new_fence_bos == NULL)
- return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
-
- submit->fence_bos = new_fence_bos;
- submit->fence_bo_array_length = new_len;
- }
-
- /* Take advantage that anv_bo structs are allocated at 8 byte alignment so we can
- * use the lowest bit to store whether this is a BO we need to signal.
- */
- submit->fence_bos[submit->fence_bo_count++] = anv_pack_ptr(bo, 1, signal);
-
- return VK_SUCCESS;
-}
-
-static VkResult
-anv_queue_submit_add_syncobj(struct anv_queue_submit* submit,
- struct anv_device *device,
- uint32_t handle, uint32_t flags,
- uint64_t value)
-{
- assert(flags != 0);
-
- if (device->has_thread_submit && (flags & I915_EXEC_FENCE_WAIT)) {
- if (submit->wait_timeline_count >= submit->wait_timeline_array_length) {
- uint32_t new_len = MAX2(submit->wait_timeline_array_length * 2, 64);
-
- uint32_t *new_wait_timeline_syncobjs =
- vk_realloc(submit->alloc,
- submit->wait_timeline_syncobjs,
- new_len * sizeof(*submit->wait_timeline_syncobjs),
- 8, submit->alloc_scope);
- if (new_wait_timeline_syncobjs == NULL)
- return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
-
- submit->wait_timeline_syncobjs = new_wait_timeline_syncobjs;
-
- uint64_t *new_wait_timeline_values =
- vk_realloc(submit->alloc,
- submit->wait_timeline_values, new_len * sizeof(*submit->wait_timeline_values),
- 8, submit->alloc_scope);
- if (new_wait_timeline_values == NULL)
- return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
-
- submit->wait_timeline_values = new_wait_timeline_values;
- submit->wait_timeline_array_length = new_len;
- }
-
- submit->wait_timeline_syncobjs[submit->wait_timeline_count] = handle;
- submit->wait_timeline_values[submit->wait_timeline_count] = value;
-
- submit->wait_timeline_count++;
- }
-
- if (submit->fence_count >= submit->fence_array_length) {
- uint32_t new_len = MAX2(submit->fence_array_length * 2, 64);
- struct drm_i915_gem_exec_fence *new_fences =
- vk_realloc(submit->alloc,
- submit->fences, new_len * sizeof(*submit->fences),
- 8, submit->alloc_scope);
- if (new_fences == NULL)
- return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
-
- submit->fences = new_fences;
-
- uint64_t *new_fence_values =
- vk_realloc(submit->alloc,
- submit->fence_values, new_len * sizeof(*submit->fence_values),
- 8, submit->alloc_scope);
- if (new_fence_values == NULL)
- return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
-
- submit->fence_values = new_fence_values;
- submit->fence_array_length = new_len;
- }
-
- submit->fences[submit->fence_count] = (struct drm_i915_gem_exec_fence) {
- .handle = handle,
- .flags = flags,
- };
- submit->fence_values[submit->fence_count] = value;
- submit->fence_count++;
-
- return VK_SUCCESS;
-}
-
-static VkResult
-anv_queue_submit_add_timeline_wait(struct anv_queue_submit* submit,
- struct anv_device *device,
- struct anv_timeline *timeline,
- uint64_t value)
-{
- if (submit->wait_timeline_count >= submit->wait_timeline_array_length) {
- uint32_t new_len = MAX2(submit->wait_timeline_array_length * 2, 64);
- struct anv_timeline **new_wait_timelines =
- vk_realloc(submit->alloc,
- submit->wait_timelines, new_len * sizeof(*submit->wait_timelines),
- 8, submit->alloc_scope);
- if (new_wait_timelines == NULL)
- return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
-
- submit->wait_timelines = new_wait_timelines;
-
- uint64_t *new_wait_timeline_values =
- vk_realloc(submit->alloc,
- submit->wait_timeline_values, new_len * sizeof(*submit->wait_timeline_values),
- 8, submit->alloc_scope);
- if (new_wait_timeline_values == NULL)
- return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
-
- submit->wait_timeline_values = new_wait_timeline_values;
-
- submit->wait_timeline_array_length = new_len;
- }
-
- submit->wait_timelines[submit->wait_timeline_count] = timeline;
- submit->wait_timeline_values[submit->wait_timeline_count] = value;
-
- submit->wait_timeline_count++;
-
- return VK_SUCCESS;
-}
-
-static VkResult
-anv_queue_submit_add_timeline_signal(struct anv_queue_submit* submit,
- struct anv_device *device,
- struct anv_timeline *timeline,
- uint64_t value)
-{
- assert(timeline->highest_pending < value);
-
- if (submit->signal_timeline_count >= submit->signal_timeline_array_length) {
- uint32_t new_len = MAX2(submit->signal_timeline_array_length * 2, 64);
- struct anv_timeline **new_signal_timelines =
- vk_realloc(submit->alloc,
- submit->signal_timelines, new_len * sizeof(*submit->signal_timelines),
- 8, submit->alloc_scope);
- if (new_signal_timelines == NULL)
- return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
-
- submit->signal_timelines = new_signal_timelines;
-
- uint64_t *new_signal_timeline_values =
- vk_realloc(submit->alloc,
- submit->signal_timeline_values, new_len * sizeof(*submit->signal_timeline_values),
- 8, submit->alloc_scope);
- if (new_signal_timeline_values == NULL)
- return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
-
- submit->signal_timeline_values = new_signal_timeline_values;
-
- submit->signal_timeline_array_length = new_len;
- }
-
- submit->signal_timelines[submit->signal_timeline_count] = timeline;
- submit->signal_timeline_values[submit->signal_timeline_count] = value;
-
- submit->signal_timeline_count++;
-
- return VK_SUCCESS;
-}
-
-static struct anv_queue_submit *
-anv_queue_submit_alloc(struct anv_device *device)
-{
- const VkAllocationCallbacks *alloc = &device->vk.alloc;
- VkSystemAllocationScope alloc_scope = VK_SYSTEM_ALLOCATION_SCOPE_DEVICE;
-
- struct anv_queue_submit *submit = vk_zalloc(alloc, sizeof(*submit), 8, alloc_scope);
- if (!submit)
- return NULL;
-
- submit->alloc = alloc;
- submit->alloc_scope = alloc_scope;
- submit->in_fence = -1;
- submit->out_fence = -1;
- submit->perf_query_pass = -1;
-
- return submit;
-}
-
-VkResult
-anv_queue_submit_simple_batch(struct anv_queue *queue,
- struct anv_batch *batch)
+anv_destroy_engine(struct anv_queue *queue)
{
- if (queue->device->info.no_hw)
- return VK_SUCCESS;
-
struct anv_device *device = queue->device;
- struct anv_queue_submit *submit = anv_queue_submit_alloc(device);
- if (!submit)
- return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
-
- bool has_syncobj_wait = device->physical->has_syncobj_wait;
- VkResult result;
- uint32_t syncobj;
- struct anv_bo *batch_bo, *sync_bo;
-
- if (has_syncobj_wait) {
- syncobj = anv_gem_syncobj_create(device, 0);
- if (!syncobj) {
- result = vk_error(VK_ERROR_OUT_OF_DEVICE_MEMORY);
- goto err_free_submit;
- }
-
- result = anv_queue_submit_add_syncobj(submit, device, syncobj,
- I915_EXEC_FENCE_SIGNAL, 0);
- } else {
- result = anv_device_alloc_bo(device, "simple-batch-sync", 4096,
- ANV_BO_ALLOC_EXTERNAL |
- ANV_BO_ALLOC_IMPLICIT_SYNC,
- 0 /* explicit_address */,
- &sync_bo);
- if (result != VK_SUCCESS)
- goto err_free_submit;
-
- result = anv_queue_submit_add_fence_bo(submit, sync_bo, true /* signal */);
- }
-
- if (result != VK_SUCCESS)
- goto err_destroy_sync_primitive;
-
- if (batch) {
- uint32_t size = align_u32(batch->next - batch->start, 8);
- result = anv_bo_pool_alloc(&device->batch_bo_pool, size, &batch_bo);
- if (result != VK_SUCCESS)
- goto err_destroy_sync_primitive;
-
- memcpy(batch_bo->map, batch->start, size);
- if (!device->info.has_llc)
- intel_flush_range(batch_bo->map, size);
-
- submit->simple_bo = batch_bo;
- submit->simple_bo_size = size;
- }
-
- result = anv_queue_submit_post(queue, &submit, true);
-
- if (result == VK_SUCCESS) {
- if (has_syncobj_wait) {
- if (anv_gem_syncobj_wait(device, &syncobj, 1,
- anv_get_absolute_timeout(INT64_MAX), true))
- result = anv_device_set_lost(device, "anv_gem_syncobj_wait failed: %m");
- anv_gem_syncobj_destroy(device, syncobj);
- } else {
- result = anv_device_wait(device, sync_bo,
- anv_get_relative_timeout(INT64_MAX));
- anv_device_release_bo(device, sync_bo);
- }
- }
-
- if (batch)
- anv_bo_pool_free(&device->batch_bo_pool, batch_bo);
-
- if (submit)
- anv_queue_submit_free(device, submit);
-
- return result;
-
- err_destroy_sync_primitive:
- if (has_syncobj_wait)
- anv_gem_syncobj_destroy(device, syncobj);
- else
- anv_device_release_bo(device, sync_bo);
- err_free_submit:
- if (submit)
- anv_queue_submit_free(device, submit);
-
- return result;
-}
-
-/* Transfer ownership of temporary semaphores from the VkSemaphore object to
- * the anv_queue_submit object. Those temporary semaphores are then freed in
- * anv_queue_submit_free() once the driver is finished with them.
- */
-static VkResult
-maybe_transfer_temporary_semaphore(struct anv_queue_submit *submit,
- struct anv_semaphore *semaphore,
- struct anv_semaphore_impl **out_impl)
-{
- struct anv_semaphore_impl *impl = &semaphore->temporary;
-
- if (impl->type == ANV_SEMAPHORE_TYPE_NONE) {
- *out_impl = &semaphore->permanent;
- return VK_SUCCESS;
- }
-
- /* BO backed timeline semaphores cannot be temporary. */
- assert(impl->type != ANV_SEMAPHORE_TYPE_TIMELINE);
-
- /*
- * There is a requirement to reset semaphore to their permanent state after
- * submission. From the Vulkan 1.0.53 spec:
- *
- * "If the import is temporary, the implementation must restore the
- * semaphore to its prior permanent state after submitting the next
- * semaphore wait operation."
- *
- * In the case we defer the actual submission to a thread because of the
- * wait-before-submit behavior required for timeline semaphores, we need to
- * make copies of the temporary syncobj to ensure they stay alive until we
- * do the actual execbuffer ioctl.
- */
- if (submit->temporary_semaphore_count >= submit->temporary_semaphore_array_length) {
- uint32_t new_len = MAX2(submit->temporary_semaphore_array_length * 2, 8);
- /* Make sure that if the realloc fails, we still have the old semaphore
- * array around to properly clean things up on failure.
- */
- struct anv_semaphore_impl *new_array =
- vk_realloc(submit->alloc,
- submit->temporary_semaphores,
- new_len * sizeof(*submit->temporary_semaphores),
- 8, submit->alloc_scope);
- if (new_array == NULL)
- return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
-
- submit->temporary_semaphores = new_array;
- submit->temporary_semaphore_array_length = new_len;
- }
-
- /* Copy anv_semaphore_impl into anv_queue_submit. */
- submit->temporary_semaphores[submit->temporary_semaphore_count++] = *impl;
- *out_impl = &submit->temporary_semaphores[submit->temporary_semaphore_count - 1];
-
- /* Clear the incoming semaphore */
- impl->type = ANV_SEMAPHORE_TYPE_NONE;
-
- return VK_SUCCESS;
-}
-
-static VkResult
-anv_queue_submit_add_in_semaphores(struct anv_queue_submit *submit,
- struct anv_device *device,
- const VkSemaphore *in_semaphores,
- const uint64_t *in_values,
- uint32_t num_in_semaphores)
-{
- VkResult result;
-
- for (uint32_t i = 0; i < num_in_semaphores; i++) {
- ANV_FROM_HANDLE(anv_semaphore, semaphore, in_semaphores[i]);
- struct anv_semaphore_impl *impl;
-
- result = maybe_transfer_temporary_semaphore(submit, semaphore, &impl);
- if (result != VK_SUCCESS)
- return result;
-
- switch (impl->type) {
- case ANV_SEMAPHORE_TYPE_WSI_BO:
- /* When using a window-system buffer as a semaphore, always enable
- * EXEC_OBJECT_WRITE. This gives us a WaR hazard with the display or
- * compositor's read of the buffer and enforces that we don't start
- * rendering until they are finished. This is exactly the
- * synchronization we want with vkAcquireNextImage.
- */
- result = anv_queue_submit_add_fence_bo(submit, impl->bo, true /* signal */);
- if (result != VK_SUCCESS)
- return result;
- break;
-
- case ANV_SEMAPHORE_TYPE_DRM_SYNCOBJ: {
- result = anv_queue_submit_add_syncobj(submit, device,
- impl->syncobj,
- I915_EXEC_FENCE_WAIT,
- 0);
- if (result != VK_SUCCESS)
- return result;
- break;
- }
-
- case ANV_SEMAPHORE_TYPE_TIMELINE:
- assert(in_values);
- if (in_values[i] == 0)
- break;
- result = anv_queue_submit_add_timeline_wait(submit, device,
- &impl->timeline,
- in_values[i]);
- if (result != VK_SUCCESS)
- return result;
- break;
-
- case ANV_SEMAPHORE_TYPE_DRM_SYNCOBJ_TIMELINE:
- assert(in_values);
- if (in_values[i] == 0)
- break;
- result = anv_queue_submit_add_syncobj(submit, device,
- impl->syncobj,
- I915_EXEC_FENCE_WAIT,
- in_values[i]);
- if (result != VK_SUCCESS)
- return result;
- break;
-
- default:
- break;
- }
- }
-
- return VK_SUCCESS;
-}
-
-static VkResult
-anv_queue_submit_add_out_semaphores(struct anv_queue_submit *submit,
- struct anv_device *device,
- const VkSemaphore *out_semaphores,
- const uint64_t *out_values,
- uint32_t num_out_semaphores)
-{
- VkResult result;
-
- for (uint32_t i = 0; i < num_out_semaphores; i++) {
- ANV_FROM_HANDLE(anv_semaphore, semaphore, out_semaphores[i]);
-
- /* Under most circumstances, out fences won't be temporary. However,
- * the spec does allow it for opaque_fd. From the Vulkan 1.0.53 spec:
- *
- * "If the import is temporary, the implementation must restore the
- * semaphore to its prior permanent state after submitting the next
- * semaphore wait operation."
- *
- * The spec says nothing whatsoever about signal operations on
- * temporarily imported semaphores so it appears they are allowed.
- * There are also CTS tests that require this to work.
- */
- struct anv_semaphore_impl *impl =
- semaphore->temporary.type != ANV_SEMAPHORE_TYPE_NONE ?
- &semaphore->temporary : &semaphore->permanent;
-
- switch (impl->type) {
- case ANV_SEMAPHORE_TYPE_DRM_SYNCOBJ: {
- /*
- * Reset the content of the syncobj so it doesn't contain a
- * previously signaled dma-fence, until one is added by EXECBUFFER by
- * the submission thread.
- */
- anv_gem_syncobj_reset(device, impl->syncobj);
-
- result = anv_queue_submit_add_syncobj(submit, device, impl->syncobj,
- I915_EXEC_FENCE_SIGNAL,
- 0);
- if (result != VK_SUCCESS)
- return result;
- break;
- }
-
- case ANV_SEMAPHORE_TYPE_TIMELINE:
- assert(out_values);
- if (out_values[i] == 0)
- break;
- result = anv_queue_submit_add_timeline_signal(submit, device,
- &impl->timeline,
- out_values[i]);
- if (result != VK_SUCCESS)
- return result;
- break;
-
- case ANV_SEMAPHORE_TYPE_DRM_SYNCOBJ_TIMELINE:
- assert(out_values);
- if (out_values[i] == 0)
- break;
- result = anv_queue_submit_add_syncobj(submit, device, impl->syncobj,
- I915_EXEC_FENCE_SIGNAL,
- out_values[i]);
- if (result != VK_SUCCESS)
- return result;
- break;
-
- default:
- break;
- }
- }
-
- return VK_SUCCESS;
-}
-
-static VkResult
-anv_queue_submit_add_fence(struct anv_queue_submit *submit,
- struct anv_device *device,
- struct anv_fence *fence)
-{
- /* Under most circumstances, out fences won't be temporary. However, the
- * spec does allow it for opaque_fd. From the Vulkan 1.0.53 spec:
- *
- * "If the import is temporary, the implementation must restore the
- * semaphore to its prior permanent state after submitting the next
- * semaphore wait operation."
- *
- * The spec says nothing whatsoever about signal operations on temporarily
- * imported semaphores so it appears they are allowed. There are also CTS
- * tests that require this to work.
- */
- struct anv_fence_impl *impl =
- fence->temporary.type != ANV_FENCE_TYPE_NONE ?
- &fence->temporary : &fence->permanent;
-
- VkResult result;
-
- switch (impl->type) {
- case ANV_FENCE_TYPE_BO:
- assert(!device->has_thread_submit);
- result = anv_queue_submit_add_fence_bo(submit, impl->bo.bo, true /* signal */);
- if (result != VK_SUCCESS)
- return result;
+ switch (device->info->kmd_type) {
+ case INTEL_KMD_TYPE_I915:
+ anv_i915_destroy_engine(device, queue);
break;
-
- case ANV_FENCE_TYPE_SYNCOBJ: {
- /*
- * For the same reason we reset the signaled binary syncobj above, also
- * reset the fence's syncobj so that they don't contain a signaled
- * dma-fence.
- */
- anv_gem_syncobj_reset(device, impl->syncobj);
-
- result = anv_queue_submit_add_syncobj(submit, device, impl->syncobj,
- I915_EXEC_FENCE_SIGNAL,
- 0);
- if (result != VK_SUCCESS)
- return result;
+ case INTEL_KMD_TYPE_XE:
+ anv_xe_destroy_engine(device, queue);
break;
- }
-
default:
- unreachable("Invalid fence type");
- }
-
- return VK_SUCCESS;
-}
-
-static void
-anv_post_queue_fence_update(struct anv_device *device, struct anv_fence *fence)
-{
- if (fence->permanent.type == ANV_FENCE_TYPE_BO) {
- assert(!device->has_thread_submit);
- /* If we have permanent BO fence, the only type of temporary possible
- * would be BO_WSI (because BO fences are not shareable). The Vulkan spec
- * also requires that the fence passed to vkQueueSubmit() be :
- *
- * * unsignaled
- * * not be associated with any other queue command that has not yet
- * completed execution on that queue
- *
- * So the only acceptable type for the temporary is NONE.
- */
- assert(fence->temporary.type == ANV_FENCE_TYPE_NONE);
-
- /* Once the execbuf has returned, we need to set the fence state to
- * SUBMITTED. We can't do this before calling execbuf because
- * anv_GetFenceStatus does take the global device lock before checking
- * fence->state.
- *
- * We set the fence state to SUBMITTED regardless of whether or not the
- * execbuf succeeds because we need to ensure that vkWaitForFences() and
- * vkGetFenceStatus() return a valid result (VK_ERROR_DEVICE_LOST or
- * VK_SUCCESS) in a finite amount of time even if execbuf fails.
- */
- fence->permanent.bo.state = ANV_BO_FENCE_STATE_SUBMITTED;
- }
-}
-
-static VkResult
-anv_queue_submit_add_cmd_buffer(struct anv_queue_submit *submit,
- struct anv_cmd_buffer *cmd_buffer,
- int perf_pass)
-{
- if (submit->cmd_buffer_count >= submit->cmd_buffer_array_length) {
- uint32_t new_len = MAX2(submit->cmd_buffer_array_length * 2, 4);
- struct anv_cmd_buffer **new_cmd_buffers =
- vk_realloc(submit->alloc,
- submit->cmd_buffers, new_len * sizeof(*submit->cmd_buffers),
- 8, submit->alloc_scope);
- if (new_cmd_buffers == NULL)
- return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
-
- submit->cmd_buffers = new_cmd_buffers;
- submit->cmd_buffer_array_length = new_len;
+ unreachable("Missing");
}
-
- submit->cmd_buffers[submit->cmd_buffer_count++] = cmd_buffer;
- /* Only update the perf_query_pool if there is one. We can decide to batch
- * 2 command buffers if the second one doesn't use a query pool, but we
- * can't drop the already chosen one.
- */
- if (cmd_buffer->perf_query_pool)
- submit->perf_query_pool = cmd_buffer->perf_query_pool;
- submit->perf_query_pass = perf_pass;
-
- return VK_SUCCESS;
}
-static bool
-anv_queue_submit_can_add_cmd_buffer(const struct anv_queue_submit *submit,
- const struct anv_cmd_buffer *cmd_buffer,
- int perf_pass)
-{
- /* If first command buffer, no problem. */
- if (submit->cmd_buffer_count == 0)
- return true;
-
- /* Can we chain the last buffer into the next one? */
- if (!anv_cmd_buffer_is_chainable(submit->cmd_buffers[submit->cmd_buffer_count - 1]))
- return false;
-
- /* A change of perf query pools between VkSubmitInfo elements means we
- * can't batch things up.
- */
- if (cmd_buffer->perf_query_pool &&
- submit->perf_query_pool &&
- submit->perf_query_pool != cmd_buffer->perf_query_pool)
- return false;
-
- /* A change of perf pass also prevents batching things up.
- */
- if (submit->perf_query_pass != -1 &&
- submit->perf_query_pass != perf_pass)
- return false;
-
- return true;
-}
-
-static bool
-anv_queue_submit_can_add_submit(const struct anv_queue_submit *submit,
- uint32_t n_wait_semaphores,
- uint32_t n_signal_semaphores,
- int perf_pass)
+VkResult
+anv_queue_init(struct anv_device *device, struct anv_queue *queue,
+ const VkDeviceQueueCreateInfo *pCreateInfo,
+ uint32_t index_in_family)
{
- /* We can add to an empty anv_queue_submit. */
- if (submit->cmd_buffer_count == 0 &&
- submit->fence_count == 0 &&
- submit->wait_timeline_count == 0 &&
- submit->signal_timeline_count == 0 &&
- submit->fence_bo_count == 0)
- return true;
-
- /* Different perf passes will require different EXECBUF ioctls. */
- if (perf_pass != submit->perf_query_pass)
- return false;
-
- /* If the current submit is signaling anything, we can't add anything. */
- if (submit->signal_timeline_count)
- return false;
-
- /* If a submit is waiting on anything, anything that happened before needs
- * to be submitted.
- */
- if (n_wait_semaphores)
- return false;
-
- return true;
-}
+ struct anv_physical_device *pdevice = device->physical;
+ assert(queue->vk.queue_family_index < pdevice->queue.family_count);
+ struct anv_queue_family *queue_family =
+ &device->physical->queue.families[pCreateInfo->queueFamilyIndex];
+ VkResult result;
-static VkResult
-anv_queue_submit_post_and_alloc_new(struct anv_queue *queue,
- struct anv_queue_submit **submit)
-{
- VkResult result = anv_queue_submit_post(queue, submit, false);
+ result = vk_queue_init(&queue->vk, &device->vk, pCreateInfo,
+ index_in_family);
if (result != VK_SUCCESS)
return result;
- *submit = anv_queue_submit_alloc(queue->device);
- if (!*submit)
- return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
- return VK_SUCCESS;
-}
-
-VkResult anv_QueueSubmit(
- VkQueue _queue,
- uint32_t submitCount,
- const VkSubmitInfo* pSubmits,
- VkFence _fence)
-{
- ANV_FROM_HANDLE(anv_queue, queue, _queue);
- ANV_FROM_HANDLE(anv_fence, fence, _fence);
- struct anv_device *device = queue->device;
-
- if (device->info.no_hw)
- return VK_SUCCESS;
+ queue->vk.driver_submit = anv_queue_submit;
+ queue->device = device;
+ queue->family = queue_family;
+ queue->decoder = &device->decoder[queue->vk.queue_family_index];
- /* Query for device status prior to submitting. Technically, we don't need
- * to do this. However, if we have a client that's submitting piles of
- * garbage, we would rather break as early as possible to keep the GPU
- * hanging contained. If we don't check here, we'll either be waiting for
- * the kernel to kick us or we'll have to wait until the client waits on a
- * fence before we actually know whether or not we've hung.
- */
- VkResult result = anv_device_query_status(device);
- if (result != VK_SUCCESS)
+ result = anv_create_engine(device, queue, pCreateInfo);
+ if (result != VK_SUCCESS) {
+ vk_queue_finish(&queue->vk);
return result;
-
- struct anv_queue_submit *submit = anv_queue_submit_alloc(device);
- if (!submit)
- return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
-
- for (uint32_t i = 0; i < submitCount; i++) {
- const struct wsi_memory_signal_submit_info *mem_signal_info =
- vk_find_struct_const(pSubmits[i].pNext,
- WSI_MEMORY_SIGNAL_SUBMIT_INFO_MESA);
- struct anv_bo *wsi_signal_bo =
- mem_signal_info && mem_signal_info->memory != VK_NULL_HANDLE ?
- anv_device_memory_from_handle(mem_signal_info->memory)->bo : NULL;
-
- const VkTimelineSemaphoreSubmitInfoKHR *timeline_info =
- vk_find_struct_const(pSubmits[i].pNext,
- TIMELINE_SEMAPHORE_SUBMIT_INFO_KHR);
- const VkPerformanceQuerySubmitInfoKHR *perf_info =
- vk_find_struct_const(pSubmits[i].pNext,
- PERFORMANCE_QUERY_SUBMIT_INFO_KHR);
- const int perf_pass = perf_info ? perf_info->counterPassIndex : 0;
- const uint64_t *wait_values =
- timeline_info && timeline_info->waitSemaphoreValueCount ?
- timeline_info->pWaitSemaphoreValues : NULL;
- const uint64_t *signal_values =
- timeline_info && timeline_info->signalSemaphoreValueCount ?
- timeline_info->pSignalSemaphoreValues : NULL;
-
- if (!anv_queue_submit_can_add_submit(submit,
- pSubmits[i].waitSemaphoreCount,
- pSubmits[i].signalSemaphoreCount,
- perf_pass)) {
- result = anv_queue_submit_post_and_alloc_new(queue, &submit);
- if (result != VK_SUCCESS)
- goto out;
- }
-
- /* Wait semaphores */
- result = anv_queue_submit_add_in_semaphores(submit,
- device,
- pSubmits[i].pWaitSemaphores,
- wait_values,
- pSubmits[i].waitSemaphoreCount);
- if (result != VK_SUCCESS)
- goto out;
-
- /* Command buffers */
- for (uint32_t j = 0; j < pSubmits[i].commandBufferCount; j++) {
- ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer,
- pSubmits[i].pCommandBuffers[j]);
- assert(cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY);
- assert(!anv_batch_has_error(&cmd_buffer->batch));
- anv_measure_submit(cmd_buffer);
-
- /* If we can't add an additional command buffer to the existing
- * anv_queue_submit, post it and create a new one.
- */
- if (!anv_queue_submit_can_add_cmd_buffer(submit, cmd_buffer, perf_pass)) {
- result = anv_queue_submit_post_and_alloc_new(queue, &submit);
- if (result != VK_SUCCESS)
- goto out;
- }
-
- result = anv_queue_submit_add_cmd_buffer(submit, cmd_buffer, perf_pass);
- if (result != VK_SUCCESS)
- goto out;
- }
-
- /* Signal semaphores */
- result = anv_queue_submit_add_out_semaphores(submit,
- device,
- pSubmits[i].pSignalSemaphores,
- signal_values,
- pSubmits[i].signalSemaphoreCount);
- if (result != VK_SUCCESS)
- goto out;
-
- /* WSI BO */
- if (wsi_signal_bo) {
- result = anv_queue_submit_add_fence_bo(submit, wsi_signal_bo,
- true /* signal */);
- if (result != VK_SUCCESS)
- goto out;
- }
- }
-
- if (fence) {
- result = anv_queue_submit_add_fence(submit, device, fence);
- if (result != VK_SUCCESS)
- goto out;
}
- result = anv_queue_submit_post(queue, &submit, false);
- if (result != VK_SUCCESS)
- goto out;
-
- if (fence)
- anv_post_queue_fence_update(device, fence);
-
-out:
- if (submit)
- anv_queue_submit_free(device, submit);
-
- if (result != VK_SUCCESS && result != VK_ERROR_DEVICE_LOST) {
- /* In the case that something has gone wrong we may end up with an
- * inconsistent state from which it may not be trivial to recover.
- * For example, we might have computed address relocations and
- * any future attempt to re-submit this job will need to know about
- * this and avoid computing relocation addresses again.
- *
- * To avoid this sort of issue, we assume that if something went
- * wrong during submission we must already be in a really bad situation
- * anyway (such as being out of memory) and return
- * VK_ERROR_DEVICE_LOST to ensure that clients do not attempt to
- * submit the same job again to this device.
- *
- * We skip doing this on VK_ERROR_DEVICE_LOST because
- * anv_device_set_lost() would have been called already by a callee of
- * anv_queue_submit().
- */
- result = anv_device_set_lost(device, "vkQueueSubmit() failed");
- }
-
- return result;
-}
-
-VkResult anv_QueueWaitIdle(
- VkQueue _queue)
-{
- ANV_FROM_HANDLE(anv_queue, queue, _queue);
-
- if (anv_device_is_lost(queue->device))
- return VK_ERROR_DEVICE_LOST;
-
- return anv_queue_submit_simple_batch(queue, NULL);
-}
-
-VkResult anv_CreateFence(
- VkDevice _device,
- const VkFenceCreateInfo* pCreateInfo,
- const VkAllocationCallbacks* pAllocator,
- VkFence* pFence)
-{
- ANV_FROM_HANDLE(anv_device, device, _device);
- struct anv_fence *fence;
-
- assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_FENCE_CREATE_INFO);
-
- fence = vk_object_zalloc(&device->vk, pAllocator, sizeof(*fence),
- VK_OBJECT_TYPE_FENCE);
- if (fence == NULL)
- return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
-
- if (device->physical->has_syncobj_wait) {
- fence->permanent.type = ANV_FENCE_TYPE_SYNCOBJ;
-
- uint32_t create_flags = 0;
- if (pCreateInfo->flags & VK_FENCE_CREATE_SIGNALED_BIT)
- create_flags |= DRM_SYNCOBJ_CREATE_SIGNALED;
-
- fence->permanent.syncobj = anv_gem_syncobj_create(device, create_flags);
- if (!fence->permanent.syncobj)
- return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
- } else {
- fence->permanent.type = ANV_FENCE_TYPE_BO;
-
- VkResult result = anv_bo_pool_alloc(&device->batch_bo_pool, 4096,
- &fence->permanent.bo.bo);
- if (result != VK_SUCCESS)
+ if (INTEL_DEBUG(DEBUG_SYNC)) {
+ result = vk_sync_create(&device->vk,
+ &device->physical->sync_syncobj_type,
+ 0, 0, &queue->sync);
+ if (result != VK_SUCCESS) {
+ anv_queue_finish(queue);
return result;
-
- if (pCreateInfo->flags & VK_FENCE_CREATE_SIGNALED_BIT) {
- fence->permanent.bo.state = ANV_BO_FENCE_STATE_SIGNALED;
- } else {
- fence->permanent.bo.state = ANV_BO_FENCE_STATE_RESET;
- }
- }
-
- *pFence = anv_fence_to_handle(fence);
-
- return VK_SUCCESS;
-}
-
-static void
-anv_fence_impl_cleanup(struct anv_device *device,
- struct anv_fence_impl *impl)
-{
- switch (impl->type) {
- case ANV_FENCE_TYPE_NONE:
- /* Dummy. Nothing to do */
- break;
-
- case ANV_FENCE_TYPE_BO:
- anv_bo_pool_free(&device->batch_bo_pool, impl->bo.bo);
- break;
-
- case ANV_FENCE_TYPE_WSI_BO:
- anv_device_release_bo(device, impl->bo.bo);
- break;
-
- case ANV_FENCE_TYPE_SYNCOBJ:
- anv_gem_syncobj_destroy(device, impl->syncobj);
- break;
-
- case ANV_FENCE_TYPE_WSI:
- impl->fence_wsi->destroy(impl->fence_wsi);
- break;
-
- default:
- unreachable("Invalid fence type");
- }
-
- impl->type = ANV_FENCE_TYPE_NONE;
-}
-
-void
-anv_fence_reset_temporary(struct anv_device *device,
- struct anv_fence *fence)
-{
- if (fence->temporary.type == ANV_FENCE_TYPE_NONE)
- return;
-
- anv_fence_impl_cleanup(device, &fence->temporary);
-}
-
-void anv_DestroyFence(
- VkDevice _device,
- VkFence _fence,
- const VkAllocationCallbacks* pAllocator)
-{
- ANV_FROM_HANDLE(anv_device, device, _device);
- ANV_FROM_HANDLE(anv_fence, fence, _fence);
-
- if (!fence)
- return;
-
- anv_fence_impl_cleanup(device, &fence->temporary);
- anv_fence_impl_cleanup(device, &fence->permanent);
-
- vk_object_free(&device->vk, pAllocator, fence);
-}
-
-VkResult anv_ResetFences(
- VkDevice _device,
- uint32_t fenceCount,
- const VkFence* pFences)
-{
- ANV_FROM_HANDLE(anv_device, device, _device);
-
- for (uint32_t i = 0; i < fenceCount; i++) {
- ANV_FROM_HANDLE(anv_fence, fence, pFences[i]);
-
- /* From the Vulkan 1.0.53 spec:
- *
- * "If any member of pFences currently has its payload imported with
- * temporary permanence, that fence’s prior permanent payload is
- * first restored. The remaining operations described therefore
- * operate on the restored payload.
- */
- anv_fence_reset_temporary(device, fence);
-
- struct anv_fence_impl *impl = &fence->permanent;
-
- switch (impl->type) {
- case ANV_FENCE_TYPE_BO:
- impl->bo.state = ANV_BO_FENCE_STATE_RESET;
- break;
-
- case ANV_FENCE_TYPE_SYNCOBJ:
- anv_gem_syncobj_reset(device, impl->syncobj);
- break;
-
- default:
- unreachable("Invalid fence type");
- }
- }
-
- return VK_SUCCESS;
-}
-
-VkResult anv_GetFenceStatus(
- VkDevice _device,
- VkFence _fence)
-{
- ANV_FROM_HANDLE(anv_device, device, _device);
- ANV_FROM_HANDLE(anv_fence, fence, _fence);
-
- if (anv_device_is_lost(device))
- return VK_ERROR_DEVICE_LOST;
-
- struct anv_fence_impl *impl =
- fence->temporary.type != ANV_FENCE_TYPE_NONE ?
- &fence->temporary : &fence->permanent;
-
- switch (impl->type) {
- case ANV_FENCE_TYPE_BO:
- case ANV_FENCE_TYPE_WSI_BO:
- switch (impl->bo.state) {
- case ANV_BO_FENCE_STATE_RESET:
- /* If it hasn't even been sent off to the GPU yet, it's not ready */
- return VK_NOT_READY;
-
- case ANV_BO_FENCE_STATE_SIGNALED:
- /* It's been signaled, return success */
- return VK_SUCCESS;
-
- case ANV_BO_FENCE_STATE_SUBMITTED: {
- VkResult result = anv_device_bo_busy(device, impl->bo.bo);
- if (result == VK_SUCCESS) {
- impl->bo.state = ANV_BO_FENCE_STATE_SIGNALED;
- return VK_SUCCESS;
- } else {
- return result;
- }
- }
- default:
- unreachable("Invalid fence status");
- }
-
- case ANV_FENCE_TYPE_SYNCOBJ: {
- if (device->has_thread_submit) {
- uint64_t binary_value = 0;
- int ret = anv_gem_syncobj_timeline_wait(device, &impl->syncobj,
- &binary_value, 1, 0,
- true /* wait_all */,
- false /* wait_materialize */);
- if (ret == -1) {
- if (errno == ETIME) {
- return VK_NOT_READY;
- } else {
- /* We don't know the real error. */
- return anv_device_set_lost(device, "drm_syncobj_wait failed: %m");
- }
- } else {
- return VK_SUCCESS;
- }
- } else {
- int ret = anv_gem_syncobj_wait(device, &impl->syncobj, 1, 0, false);
- if (ret == -1) {
- if (errno == ETIME) {
- return VK_NOT_READY;
- } else {
- /* We don't know the real error. */
- return anv_device_set_lost(device, "drm_syncobj_wait failed: %m");
- }
- } else {
- return VK_SUCCESS;
- }
- }
- }
-
- default:
- unreachable("Invalid fence type");
- }
-}
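/* The temporary-overrides-permanent selection in anv_GetFenceStatus() above
 * recurs throughout this (removed) fence code. A minimal helper capturing the
 * pattern might look like this; the example_ prefix marks it as a hypothetical
 * illustration, not code from this tree.
 */
static inline struct anv_fence_impl *
example_fence_active_impl(struct anv_fence *fence)
{
   /* A temporary payload, if any, takes precedence over the permanent one. */
   return fence->temporary.type != ANV_FENCE_TYPE_NONE ?
          &fence->temporary : &fence->permanent;
}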
-
-static VkResult
-anv_wait_for_syncobj_fences(struct anv_device *device,
- uint32_t fenceCount,
- const VkFence *pFences,
- bool waitAll,
- uint64_t abs_timeout_ns)
-{
- uint32_t *syncobjs = vk_zalloc(&device->vk.alloc,
- sizeof(*syncobjs) * fenceCount, 8,
- VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
- if (!syncobjs)
- return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
-
- for (uint32_t i = 0; i < fenceCount; i++) {
- ANV_FROM_HANDLE(anv_fence, fence, pFences[i]);
- assert(fence->permanent.type == ANV_FENCE_TYPE_SYNCOBJ);
-
- struct anv_fence_impl *impl =
- fence->temporary.type != ANV_FENCE_TYPE_NONE ?
- &fence->temporary : &fence->permanent;
-
- assert(impl->type == ANV_FENCE_TYPE_SYNCOBJ);
- syncobjs[i] = impl->syncobj;
- }
-
- int ret = 0;
- /* The gem_syncobj_wait ioctl may return early due to an inherent
- * limitation in the way it computes timeouts. Loop until we've actually
- * passed the timeout.
- */
- do {
- ret = anv_gem_syncobj_wait(device, syncobjs, fenceCount,
- abs_timeout_ns, waitAll);
- } while (ret == -1 && errno == ETIME && anv_gettime_ns() < abs_timeout_ns);
-
- vk_free(&device->vk.alloc, syncobjs);
-
- if (ret == -1) {
- if (errno == ETIME) {
- return VK_TIMEOUT;
- } else {
- /* We don't know the real error. */
- return anv_device_set_lost(device, "drm_syncobj_wait failed: %m");
- }
- } else {
- return VK_SUCCESS;
- }
-}
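/* Sketch of what the timeout helpers used above (anv_gettime_ns(),
 * anv_get_absolute_timeout()) are assumed to do; the real implementations live
 * elsewhere in the driver and may differ in detail. Assumes <time.h>.
 */
static uint64_t
example_gettime_ns(void)
{
   struct timespec ts;
   clock_gettime(CLOCK_MONOTONIC, &ts);
   return (uint64_t)ts.tv_sec * NSEC_PER_SEC + ts.tv_nsec;
}

static uint64_t
example_absolute_timeout(uint64_t timeout_ns)
{
   /* Saturate instead of wrapping when the caller passes UINT64_MAX. */
   uint64_t now = example_gettime_ns();
   return timeout_ns > UINT64_MAX - now ? UINT64_MAX : now + timeout_ns;
}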
-
-static VkResult
-anv_wait_for_bo_fences(struct anv_device *device,
- uint32_t fenceCount,
- const VkFence *pFences,
- bool waitAll,
- uint64_t abs_timeout_ns)
-{
- VkResult result = VK_SUCCESS;
- uint32_t pending_fences = fenceCount;
- while (pending_fences) {
- pending_fences = 0;
- bool signaled_fences = false;
- for (uint32_t i = 0; i < fenceCount; i++) {
- ANV_FROM_HANDLE(anv_fence, fence, pFences[i]);
-
- struct anv_fence_impl *impl =
- fence->temporary.type != ANV_FENCE_TYPE_NONE ?
- &fence->temporary : &fence->permanent;
- assert(impl->type == ANV_FENCE_TYPE_BO ||
- impl->type == ANV_FENCE_TYPE_WSI_BO);
-
- switch (impl->bo.state) {
- case ANV_BO_FENCE_STATE_RESET:
- /* This fence hasn't been submitted yet, we'll catch it the next
- * time around. Yes, this may mean we dead-loop but, short of
- * lots of locking and a condition variable, there's not much that
- * we can do about that.
- */
- pending_fences++;
- continue;
-
- case ANV_BO_FENCE_STATE_SIGNALED:
- /* This fence is not pending. If waitAll isn't set, we can return
- * early. Otherwise, we have to keep going.
- */
- if (!waitAll) {
- result = VK_SUCCESS;
- goto done;
- }
- continue;
-
- case ANV_BO_FENCE_STATE_SUBMITTED:
- /* These are the fences we really care about. Go ahead and wait
- * on it until we hit a timeout.
- */
- result = anv_device_wait(device, impl->bo.bo,
- anv_get_relative_timeout(abs_timeout_ns));
- switch (result) {
- case VK_SUCCESS:
- impl->bo.state = ANV_BO_FENCE_STATE_SIGNALED;
- signaled_fences = true;
- if (!waitAll)
- goto done;
- break;
-
- case VK_TIMEOUT:
- goto done;
-
- default:
- return result;
- }
- }
- }
-
- if (pending_fences && !signaled_fences) {
- /* If we've hit this then someone decided to vkWaitForFences before
- * they've actually submitted any of them to a queue. This is a
- * fairly pessimal case, so it's ok to lock here and use a standard
- * pthreads condition variable.
- */
- pthread_mutex_lock(&device->mutex);
-
- /* It's possible that some of the fences have changed state since the
- * last time we checked. Now that we have the lock, check for
- * pending fences again and don't wait if it's changed.
- */
- uint32_t now_pending_fences = 0;
- for (uint32_t i = 0; i < fenceCount; i++) {
- ANV_FROM_HANDLE(anv_fence, fence, pFences[i]);
- if (fence->permanent.bo.state == ANV_BO_FENCE_STATE_RESET)
- now_pending_fences++;
- }
- assert(now_pending_fences <= pending_fences);
-
- if (now_pending_fences == pending_fences) {
- struct timespec abstime = {
- .tv_sec = abs_timeout_ns / NSEC_PER_SEC,
- .tv_nsec = abs_timeout_ns % NSEC_PER_SEC,
- };
-
- ASSERTED int ret;
- ret = pthread_cond_timedwait(&device->queue_submit,
- &device->mutex, &abstime);
- assert(ret != EINVAL);
- if (anv_gettime_ns() >= abs_timeout_ns) {
- pthread_mutex_unlock(&device->mutex);
- result = VK_TIMEOUT;
- goto done;
- }
- }
-
- pthread_mutex_unlock(&device->mutex);
- }
- }
-
-done:
- if (anv_device_is_lost(device))
- return VK_ERROR_DEVICE_LOST;
-
- return result;
-}
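/* The absolute-nanosecond deadline to struct timespec conversion used with
 * pthread_cond_timedwait() above (and again in the timeline waits below) is
 * repeated inline; an illustrative helper, not part of the original code:
 */
static inline struct timespec
example_abs_timeout_to_timespec(uint64_t abs_timeout_ns)
{
   return (struct timespec) {
      .tv_sec = abs_timeout_ns / NSEC_PER_SEC,
      .tv_nsec = abs_timeout_ns % NSEC_PER_SEC,
   };
}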
-
-static VkResult
-anv_wait_for_wsi_fence(struct anv_device *device,
- struct anv_fence_impl *impl,
- uint64_t abs_timeout)
-{
- return impl->fence_wsi->wait(impl->fence_wsi, abs_timeout);
-}
-
-static VkResult
-anv_wait_for_fences(struct anv_device *device,
- uint32_t fenceCount,
- const VkFence *pFences,
- bool waitAll,
- uint64_t abs_timeout)
-{
- VkResult result = VK_SUCCESS;
-
- if (fenceCount <= 1 || waitAll) {
- for (uint32_t i = 0; i < fenceCount; i++) {
- ANV_FROM_HANDLE(anv_fence, fence, pFences[i]);
- struct anv_fence_impl *impl =
- fence->temporary.type != ANV_FENCE_TYPE_NONE ?
- &fence->temporary : &fence->permanent;
-
- switch (impl->type) {
- case ANV_FENCE_TYPE_BO:
- assert(!device->physical->has_syncobj_wait);
- FALLTHROUGH;
- case ANV_FENCE_TYPE_WSI_BO:
- result = anv_wait_for_bo_fences(device, 1, &pFences[i],
- true, abs_timeout);
- break;
- case ANV_FENCE_TYPE_SYNCOBJ:
- result = anv_wait_for_syncobj_fences(device, 1, &pFences[i],
- true, abs_timeout);
- break;
- case ANV_FENCE_TYPE_WSI:
- result = anv_wait_for_wsi_fence(device, impl, abs_timeout);
- break;
- case ANV_FENCE_TYPE_NONE:
- result = VK_SUCCESS;
- break;
- }
- if (result != VK_SUCCESS)
- return result;
}
- } else {
- do {
- for (uint32_t i = 0; i < fenceCount; i++) {
- if (anv_wait_for_fences(device, 1, &pFences[i], true, 0) == VK_SUCCESS)
- return VK_SUCCESS;
- }
- } while (anv_gettime_ns() < abs_timeout);
- result = VK_TIMEOUT;
}
- return result;
-}
-
-static bool anv_all_fences_syncobj(uint32_t fenceCount, const VkFence *pFences)
-{
- for (uint32_t i = 0; i < fenceCount; ++i) {
- ANV_FROM_HANDLE(anv_fence, fence, pFences[i]);
- struct anv_fence_impl *impl =
- fence->temporary.type != ANV_FENCE_TYPE_NONE ?
- &fence->temporary : &fence->permanent;
- if (impl->type != ANV_FENCE_TYPE_SYNCOBJ)
- return false;
- }
- return true;
-}
-
-static bool anv_all_fences_bo(uint32_t fenceCount, const VkFence *pFences)
-{
- for (uint32_t i = 0; i < fenceCount; ++i) {
- ANV_FROM_HANDLE(anv_fence, fence, pFences[i]);
- struct anv_fence_impl *impl =
- fence->temporary.type != ANV_FENCE_TYPE_NONE ?
- &fence->temporary : &fence->permanent;
- if (impl->type != ANV_FENCE_TYPE_BO &&
- impl->type != ANV_FENCE_TYPE_WSI_BO)
- return false;
- }
- return true;
-}
-
-VkResult anv_WaitForFences(
- VkDevice _device,
- uint32_t fenceCount,
- const VkFence* pFences,
- VkBool32 waitAll,
- uint64_t timeout)
-{
- ANV_FROM_HANDLE(anv_device, device, _device);
-
- if (device->info.no_hw)
- return VK_SUCCESS;
- if (anv_device_is_lost(device))
- return VK_ERROR_DEVICE_LOST;
-
- uint64_t abs_timeout = anv_get_absolute_timeout(timeout);
- if (anv_all_fences_syncobj(fenceCount, pFences)) {
- return anv_wait_for_syncobj_fences(device, fenceCount, pFences,
- waitAll, abs_timeout);
- } else if (anv_all_fences_bo(fenceCount, pFences)) {
- return anv_wait_for_bo_fences(device, fenceCount, pFences,
- waitAll, abs_timeout);
- } else {
- return anv_wait_for_fences(device, fenceCount, pFences,
- waitAll, abs_timeout);
- }
-}
-
-void anv_GetPhysicalDeviceExternalFenceProperties(
- VkPhysicalDevice physicalDevice,
- const VkPhysicalDeviceExternalFenceInfo* pExternalFenceInfo,
- VkExternalFenceProperties* pExternalFenceProperties)
-{
- ANV_FROM_HANDLE(anv_physical_device, device, physicalDevice);
-
- switch (pExternalFenceInfo->handleType) {
- case VK_EXTERNAL_FENCE_HANDLE_TYPE_OPAQUE_FD_BIT:
- case VK_EXTERNAL_FENCE_HANDLE_TYPE_SYNC_FD_BIT:
- if (device->has_syncobj_wait) {
- pExternalFenceProperties->exportFromImportedHandleTypes =
- VK_EXTERNAL_FENCE_HANDLE_TYPE_OPAQUE_FD_BIT |
- VK_EXTERNAL_FENCE_HANDLE_TYPE_SYNC_FD_BIT;
- pExternalFenceProperties->compatibleHandleTypes =
- VK_EXTERNAL_FENCE_HANDLE_TYPE_OPAQUE_FD_BIT |
- VK_EXTERNAL_FENCE_HANDLE_TYPE_SYNC_FD_BIT;
- pExternalFenceProperties->externalFenceFeatures =
- VK_EXTERNAL_FENCE_FEATURE_EXPORTABLE_BIT |
- VK_EXTERNAL_FENCE_FEATURE_IMPORTABLE_BIT;
- return;
- }
- break;
-
- default:
- break;
- }
-
- pExternalFenceProperties->exportFromImportedHandleTypes = 0;
- pExternalFenceProperties->compatibleHandleTypes = 0;
- pExternalFenceProperties->externalFenceFeatures = 0;
-}
-
-VkResult anv_ImportFenceFdKHR(
- VkDevice _device,
- const VkImportFenceFdInfoKHR* pImportFenceFdInfo)
-{
- ANV_FROM_HANDLE(anv_device, device, _device);
- ANV_FROM_HANDLE(anv_fence, fence, pImportFenceFdInfo->fence);
- int fd = pImportFenceFdInfo->fd;
-
- assert(pImportFenceFdInfo->sType ==
- VK_STRUCTURE_TYPE_IMPORT_FENCE_FD_INFO_KHR);
-
- struct anv_fence_impl new_impl = {
- .type = ANV_FENCE_TYPE_NONE,
- };
-
- switch (pImportFenceFdInfo->handleType) {
- case VK_EXTERNAL_FENCE_HANDLE_TYPE_OPAQUE_FD_BIT:
- new_impl.type = ANV_FENCE_TYPE_SYNCOBJ;
-
- new_impl.syncobj = anv_gem_syncobj_fd_to_handle(device, fd);
- if (!new_impl.syncobj)
- return vk_error(VK_ERROR_INVALID_EXTERNAL_HANDLE);
-
- break;
-
- case VK_EXTERNAL_FENCE_HANDLE_TYPE_SYNC_FD_BIT: {
- /* Sync files are a bit tricky. Because we want to continue using the
- * syncobj implementation of WaitForFences, we don't use the sync file
- * directly but instead import it into a syncobj.
- */
- new_impl.type = ANV_FENCE_TYPE_SYNCOBJ;
-
- /* "If handleType is VK_EXTERNAL_FENCE_HANDLE_TYPE_SYNC_FD_BIT, the
- * special value -1 for fd is treated like a valid sync file descriptor
- * referring to an object that has already signaled. The import
- * operation will succeed and the VkFence will have a temporarily
- * imported payload as if a valid file descriptor had been provided."
- */
- uint32_t create_flags = 0;
- if (fd == -1)
- create_flags |= DRM_SYNCOBJ_CREATE_SIGNALED;
-
- new_impl.syncobj = anv_gem_syncobj_create(device, create_flags);
- if (!new_impl.syncobj)
- return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
-
- if (fd != -1 &&
- anv_gem_syncobj_import_sync_file(device, new_impl.syncobj, fd)) {
- anv_gem_syncobj_destroy(device, new_impl.syncobj);
- return vk_errorf(device, NULL, VK_ERROR_INVALID_EXTERNAL_HANDLE,
- "syncobj sync file import failed: %m");
- }
- break;
- }
-
- default:
- return vk_error(VK_ERROR_INVALID_EXTERNAL_HANDLE);
- }
-
- /* From the Vulkan 1.0.53 spec:
- *
- * "Importing a fence payload from a file descriptor transfers
- * ownership of the file descriptor from the application to the
- * Vulkan implementation. The application must not perform any
- * operations on the file descriptor after a successful import."
- *
- * If the import fails, we leave the file descriptor open.
- */
- if (fd != -1)
- close(fd);
-
- if (pImportFenceFdInfo->flags & VK_FENCE_IMPORT_TEMPORARY_BIT) {
- anv_fence_impl_cleanup(device, &fence->temporary);
- fence->temporary = new_impl;
- } else {
- anv_fence_impl_cleanup(device, &fence->permanent);
- fence->permanent = new_impl;
- }
-
- return VK_SUCCESS;
-}
-
-/* The sideband payload of the DRM syncobj was incremented when the
- * application called vkQueueSubmit(). Here we wait for a fence with the same
- * value to materialize so that we can export it (typically as a SyncFD).
- */
-static VkResult
-wait_syncobj_materialize(struct anv_device *device,
- uint32_t syncobj,
- int *fd)
-{
- if (!device->has_thread_submit)
- return VK_SUCCESS;
-
- uint64_t binary_value = 0;
- /* We might need to wait until the fence materializes before we can
- * export to a sync FD when we use a thread for submission.
- */
- if (anv_gem_syncobj_timeline_wait(device, &syncobj, &binary_value, 1,
- anv_get_absolute_timeout(5ull * NSEC_PER_SEC),
- true /* wait_all */,
- true /* wait_materialize */))
- return anv_device_set_lost(device, "anv_gem_syncobj_timeline_wait failed: %m");
-
- return VK_SUCCESS;
-}
-
-VkResult anv_GetFenceFdKHR(
- VkDevice _device,
- const VkFenceGetFdInfoKHR* pGetFdInfo,
- int* pFd)
-{
- ANV_FROM_HANDLE(anv_device, device, _device);
- ANV_FROM_HANDLE(anv_fence, fence, pGetFdInfo->fence);
-
- assert(pGetFdInfo->sType == VK_STRUCTURE_TYPE_FENCE_GET_FD_INFO_KHR);
-
- struct anv_fence_impl *impl =
- fence->temporary.type != ANV_FENCE_TYPE_NONE ?
- &fence->temporary : &fence->permanent;
-
- assert(impl->type == ANV_FENCE_TYPE_SYNCOBJ);
- switch (pGetFdInfo->handleType) {
- case VK_EXTERNAL_FENCE_HANDLE_TYPE_OPAQUE_FD_BIT: {
- int fd = anv_gem_syncobj_handle_to_fd(device, impl->syncobj);
- if (fd < 0)
- return vk_error(VK_ERROR_TOO_MANY_OBJECTS);
-
- *pFd = fd;
- break;
- }
-
- case VK_EXTERNAL_FENCE_HANDLE_TYPE_SYNC_FD_BIT: {
- VkResult result = wait_syncobj_materialize(device, impl->syncobj, pFd);
- if (result != VK_SUCCESS)
- return result;
-
- int fd = anv_gem_syncobj_export_sync_file(device, impl->syncobj);
- if (fd < 0)
- return vk_error(VK_ERROR_TOO_MANY_OBJECTS);
-
- *pFd = fd;
- break;
- }
-
- default:
- unreachable("Invalid fence export handle type");
- }
-
- /* From the Vulkan 1.0.53 spec:
- *
- * "Export operations have the same transference as the specified handle
- * type’s import operations. [...] If the fence was using a
- * temporarily imported payload, the fence’s prior permanent payload
- * will be restored."
- */
- if (impl == &fence->temporary)
- anv_fence_impl_cleanup(device, impl);
-
- return VK_SUCCESS;
-}
-
-// Queue semaphore functions
-
-static VkSemaphoreTypeKHR
-get_semaphore_type(const void *pNext, uint64_t *initial_value)
-{
- const VkSemaphoreTypeCreateInfoKHR *type_info =
- vk_find_struct_const(pNext, SEMAPHORE_TYPE_CREATE_INFO_KHR);
-
- if (!type_info)
- return VK_SEMAPHORE_TYPE_BINARY_KHR;
-
- if (initial_value)
- *initial_value = type_info->initialValue;
- return type_info->semaphoreType;
-}
-
-static VkResult
-binary_semaphore_create(struct anv_device *device,
- struct anv_semaphore_impl *impl,
- bool exportable)
-{
- impl->type = ANV_SEMAPHORE_TYPE_DRM_SYNCOBJ;
- impl->syncobj = anv_gem_syncobj_create(device, 0);
- if (!impl->syncobj)
- return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
- return VK_SUCCESS;
-}
-
-static VkResult
-timeline_semaphore_create(struct anv_device *device,
- struct anv_semaphore_impl *impl,
- uint64_t initial_value)
-{
- if (device->has_thread_submit) {
- impl->type = ANV_SEMAPHORE_TYPE_DRM_SYNCOBJ_TIMELINE;
- impl->syncobj = anv_gem_syncobj_create(device, 0);
- if (!impl->syncobj)
- return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
- if (initial_value) {
- if (anv_gem_syncobj_timeline_signal(device,
- &impl->syncobj,
- &initial_value, 1)) {
- anv_gem_syncobj_destroy(device, impl->syncobj);
- return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
- }
- }
- } else {
- impl->type = ANV_SEMAPHORE_TYPE_TIMELINE;
- anv_timeline_init(device, &impl->timeline, initial_value);
- }
-
- return VK_SUCCESS;
-}
-
-VkResult anv_CreateSemaphore(
- VkDevice _device,
- const VkSemaphoreCreateInfo* pCreateInfo,
- const VkAllocationCallbacks* pAllocator,
- VkSemaphore* pSemaphore)
-{
- ANV_FROM_HANDLE(anv_device, device, _device);
- struct anv_semaphore *semaphore;
-
- assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO);
-
- uint64_t timeline_value = 0;
- VkSemaphoreTypeKHR sem_type = get_semaphore_type(pCreateInfo->pNext, &timeline_value);
-
- semaphore = vk_object_alloc(&device->vk, NULL, sizeof(*semaphore),
- VK_OBJECT_TYPE_SEMAPHORE);
- if (semaphore == NULL)
- return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
-
- const VkExportSemaphoreCreateInfo *export =
- vk_find_struct_const(pCreateInfo->pNext, EXPORT_SEMAPHORE_CREATE_INFO);
- VkExternalSemaphoreHandleTypeFlags handleTypes =
- export ? export->handleTypes : 0;
- VkResult result;
-
- if (handleTypes == 0) {
- if (sem_type == VK_SEMAPHORE_TYPE_BINARY_KHR)
- result = binary_semaphore_create(device, &semaphore->permanent, false);
- else
- result = timeline_semaphore_create(device, &semaphore->permanent, timeline_value);
- if (result != VK_SUCCESS) {
- vk_object_free(&device->vk, pAllocator, semaphore);
- return result;
- }
- } else if (handleTypes & VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT) {
- assert(handleTypes == VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT);
- if (sem_type == VK_SEMAPHORE_TYPE_BINARY_KHR)
- result = binary_semaphore_create(device, &semaphore->permanent, true);
- else
- result = timeline_semaphore_create(device, &semaphore->permanent, timeline_value);
+ if (queue_family->engine_class == INTEL_ENGINE_CLASS_COPY ||
+ queue_family->engine_class == INTEL_ENGINE_CLASS_COMPUTE) {
+ result = vk_sync_create(&device->vk,
+ &device->physical->sync_syncobj_type,
+ 0, 0, &queue->companion_sync);
if (result != VK_SUCCESS) {
- vk_object_free(&device->vk, pAllocator, semaphore);
+ anv_queue_finish(queue);
return result;
}
- } else if (handleTypes & VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_SYNC_FD_BIT) {
- assert(handleTypes == VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_SYNC_FD_BIT);
- assert(sem_type == VK_SEMAPHORE_TYPE_BINARY_KHR);
- semaphore->permanent.type = ANV_SEMAPHORE_TYPE_DRM_SYNCOBJ;
- semaphore->permanent.syncobj = anv_gem_syncobj_create(device, 0);
- if (!semaphore->permanent.syncobj) {
- vk_object_free(&device->vk, pAllocator, semaphore);
- return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
- }
- } else {
- assert(!"Unknown handle type");
- vk_object_free(&device->vk, pAllocator, semaphore);
- return vk_error(VK_ERROR_INVALID_EXTERNAL_HANDLE);
}
- semaphore->temporary.type = ANV_SEMAPHORE_TYPE_NONE;
-
- *pSemaphore = anv_semaphore_to_handle(semaphore);
-
return VK_SUCCESS;
}
-static void
-anv_semaphore_impl_cleanup(struct anv_device *device,
- struct anv_semaphore_impl *impl)
-{
- switch (impl->type) {
- case ANV_SEMAPHORE_TYPE_NONE:
- case ANV_SEMAPHORE_TYPE_DUMMY:
- /* Dummy. Nothing to do */
- break;
-
- case ANV_SEMAPHORE_TYPE_WSI_BO:
- anv_device_release_bo(device, impl->bo);
- break;
-
- case ANV_SEMAPHORE_TYPE_TIMELINE:
- anv_timeline_finish(device, &impl->timeline);
- break;
-
- case ANV_SEMAPHORE_TYPE_DRM_SYNCOBJ:
- case ANV_SEMAPHORE_TYPE_DRM_SYNCOBJ_TIMELINE:
- anv_gem_syncobj_destroy(device, impl->syncobj);
- break;
-
- default:
- unreachable("Invalid semaphore type");
- }
-
- impl->type = ANV_SEMAPHORE_TYPE_NONE;
-}
-
void
-anv_semaphore_reset_temporary(struct anv_device *device,
- struct anv_semaphore *semaphore)
-{
- if (semaphore->temporary.type == ANV_SEMAPHORE_TYPE_NONE)
- return;
-
- anv_semaphore_impl_cleanup(device, &semaphore->temporary);
-}
-
-void anv_DestroySemaphore(
- VkDevice _device,
- VkSemaphore _semaphore,
- const VkAllocationCallbacks* pAllocator)
-{
- ANV_FROM_HANDLE(anv_device, device, _device);
- ANV_FROM_HANDLE(anv_semaphore, semaphore, _semaphore);
-
- if (semaphore == NULL)
- return;
-
- anv_semaphore_impl_cleanup(device, &semaphore->temporary);
- anv_semaphore_impl_cleanup(device, &semaphore->permanent);
-
- vk_object_base_finish(&semaphore->base);
- vk_free(&device->vk.alloc, semaphore);
-}
-
-void anv_GetPhysicalDeviceExternalSemaphoreProperties(
- VkPhysicalDevice physicalDevice,
- const VkPhysicalDeviceExternalSemaphoreInfo* pExternalSemaphoreInfo,
- VkExternalSemaphoreProperties* pExternalSemaphoreProperties)
-{
- ANV_FROM_HANDLE(anv_physical_device, device, physicalDevice);
-
- VkSemaphoreTypeKHR sem_type =
- get_semaphore_type(pExternalSemaphoreInfo->pNext, NULL);
-
- switch (pExternalSemaphoreInfo->handleType) {
- case VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT:
- /* Timeline semaphores are not exportable, unless we have threaded
- * submission.
- */
- if (sem_type == VK_SEMAPHORE_TYPE_TIMELINE_KHR && !device->has_thread_submit)
- break;
- pExternalSemaphoreProperties->exportFromImportedHandleTypes =
- VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT;
- pExternalSemaphoreProperties->compatibleHandleTypes =
- VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT;
- pExternalSemaphoreProperties->externalSemaphoreFeatures =
- VK_EXTERNAL_SEMAPHORE_FEATURE_EXPORTABLE_BIT |
- VK_EXTERNAL_SEMAPHORE_FEATURE_IMPORTABLE_BIT;
- return;
-
- case VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_SYNC_FD_BIT:
- if (sem_type == VK_SEMAPHORE_TYPE_TIMELINE_KHR)
- break;
- if (!device->has_exec_fence)
- break;
- pExternalSemaphoreProperties->exportFromImportedHandleTypes =
- VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_SYNC_FD_BIT;
- pExternalSemaphoreProperties->compatibleHandleTypes =
- VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_SYNC_FD_BIT;
- pExternalSemaphoreProperties->externalSemaphoreFeatures =
- VK_EXTERNAL_SEMAPHORE_FEATURE_EXPORTABLE_BIT |
- VK_EXTERNAL_SEMAPHORE_FEATURE_IMPORTABLE_BIT;
- return;
-
- default:
- break;
- }
-
- pExternalSemaphoreProperties->exportFromImportedHandleTypes = 0;
- pExternalSemaphoreProperties->compatibleHandleTypes = 0;
- pExternalSemaphoreProperties->externalSemaphoreFeatures = 0;
-}
-
-VkResult anv_ImportSemaphoreFdKHR(
- VkDevice _device,
- const VkImportSemaphoreFdInfoKHR* pImportSemaphoreFdInfo)
-{
- ANV_FROM_HANDLE(anv_device, device, _device);
- ANV_FROM_HANDLE(anv_semaphore, semaphore, pImportSemaphoreFdInfo->semaphore);
- int fd = pImportSemaphoreFdInfo->fd;
-
- struct anv_semaphore_impl new_impl = {
- .type = ANV_SEMAPHORE_TYPE_NONE,
- };
-
- switch (pImportSemaphoreFdInfo->handleType) {
- case VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT:
- /* When importing non-temporarily, reuse the semaphore's existing
- * type. The Linux/DRM implementation allows binary & timeline semaphores
- * to be used interchangeably and we have no way to differentiate them.
- */
- if (pImportSemaphoreFdInfo->flags & VK_SEMAPHORE_IMPORT_TEMPORARY_BIT)
- new_impl.type = ANV_SEMAPHORE_TYPE_DRM_SYNCOBJ;
- else
- new_impl.type = semaphore->permanent.type;
-
- new_impl.syncobj = anv_gem_syncobj_fd_to_handle(device, fd);
- if (!new_impl.syncobj)
- return vk_error(VK_ERROR_INVALID_EXTERNAL_HANDLE);
-
- /* From the Vulkan spec:
- *
- * "Importing semaphore state from a file descriptor transfers
- * ownership of the file descriptor from the application to the
- * Vulkan implementation. The application must not perform any
- * operations on the file descriptor after a successful import."
- *
- * If the import fails, we leave the file descriptor open.
- */
- close(fd);
- break;
-
- case VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_SYNC_FD_BIT: {
- uint32_t create_flags = 0;
-
- if (fd == -1)
- create_flags |= DRM_SYNCOBJ_CREATE_SIGNALED;
-
- new_impl = (struct anv_semaphore_impl) {
- .type = ANV_SEMAPHORE_TYPE_DRM_SYNCOBJ,
- .syncobj = anv_gem_syncobj_create(device, create_flags),
- };
-
- if (!new_impl.syncobj)
- return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
-
- if (fd != -1) {
- if (anv_gem_syncobj_import_sync_file(device, new_impl.syncobj, fd)) {
- anv_gem_syncobj_destroy(device, new_impl.syncobj);
- return vk_errorf(device, NULL, VK_ERROR_INVALID_EXTERNAL_HANDLE,
- "syncobj sync file import failed: %m");
- }
- /* Ownership of the FD is transferred to Anv. Since we don't need it
- * anymore because the associated fence has been put into a syncobj,
- * we must close the FD.
- */
- close(fd);
- }
- break;
- }
-
- default:
- return vk_error(VK_ERROR_INVALID_EXTERNAL_HANDLE);
- }
-
- if (pImportSemaphoreFdInfo->flags & VK_SEMAPHORE_IMPORT_TEMPORARY_BIT) {
- anv_semaphore_impl_cleanup(device, &semaphore->temporary);
- semaphore->temporary = new_impl;
- } else {
- anv_semaphore_impl_cleanup(device, &semaphore->permanent);
- semaphore->permanent = new_impl;
- }
-
- return VK_SUCCESS;
-}
-
-VkResult anv_GetSemaphoreFdKHR(
- VkDevice _device,
- const VkSemaphoreGetFdInfoKHR* pGetFdInfo,
- int* pFd)
-{
- ANV_FROM_HANDLE(anv_device, device, _device);
- ANV_FROM_HANDLE(anv_semaphore, semaphore, pGetFdInfo->semaphore);
- int fd;
-
- assert(pGetFdInfo->sType == VK_STRUCTURE_TYPE_SEMAPHORE_GET_FD_INFO_KHR);
-
- struct anv_semaphore_impl *impl =
- semaphore->temporary.type != ANV_SEMAPHORE_TYPE_NONE ?
- &semaphore->temporary : &semaphore->permanent;
-
- switch (impl->type) {
- case ANV_SEMAPHORE_TYPE_DRM_SYNCOBJ:
- if (pGetFdInfo->handleType == VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_SYNC_FD_BIT) {
- VkResult result = wait_syncobj_materialize(device, impl->syncobj, pFd);
- if (result != VK_SUCCESS)
- return result;
-
- fd = anv_gem_syncobj_export_sync_file(device, impl->syncobj);
- } else {
- assert(pGetFdInfo->handleType == VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT);
- fd = anv_gem_syncobj_handle_to_fd(device, impl->syncobj);
- }
- if (fd < 0)
- return vk_error(VK_ERROR_TOO_MANY_OBJECTS);
- *pFd = fd;
- break;
-
- case ANV_SEMAPHORE_TYPE_DRM_SYNCOBJ_TIMELINE:
- assert(pGetFdInfo->handleType == VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT);
- fd = anv_gem_syncobj_handle_to_fd(device, impl->syncobj);
- if (fd < 0)
- return vk_error(VK_ERROR_TOO_MANY_OBJECTS);
- *pFd = fd;
- break;
-
- default:
- return vk_error(VK_ERROR_INVALID_EXTERNAL_HANDLE);
- }
-
- /* From the Vulkan 1.0.53 spec:
- *
- * "Export operations have the same transference as the specified handle
- * type’s import operations. [...] If the semaphore was using a
- * temporarily imported payload, the semaphore’s prior permanent payload
- * will be restored."
- */
- if (impl == &semaphore->temporary)
- anv_semaphore_impl_cleanup(device, impl);
-
- return VK_SUCCESS;
-}
-
-VkResult anv_GetSemaphoreCounterValue(
- VkDevice _device,
- VkSemaphore _semaphore,
- uint64_t* pValue)
-{
- ANV_FROM_HANDLE(anv_device, device, _device);
- ANV_FROM_HANDLE(anv_semaphore, semaphore, _semaphore);
-
- struct anv_semaphore_impl *impl =
- semaphore->temporary.type != ANV_SEMAPHORE_TYPE_NONE ?
- &semaphore->temporary : &semaphore->permanent;
-
- switch (impl->type) {
- case ANV_SEMAPHORE_TYPE_TIMELINE: {
- pthread_mutex_lock(&device->mutex);
- anv_timeline_gc_locked(device, &impl->timeline);
- *pValue = impl->timeline.highest_past;
- pthread_mutex_unlock(&device->mutex);
- return VK_SUCCESS;
- }
-
- case ANV_SEMAPHORE_TYPE_DRM_SYNCOBJ_TIMELINE: {
- int ret = anv_gem_syncobj_timeline_query(device, &impl->syncobj, pValue, 1);
-
- if (ret != 0)
- return anv_device_set_lost(device, "unable to query timeline syncobj");
-
- return VK_SUCCESS;
- }
-
- default:
- unreachable("Invalid semaphore type");
- }
-}
-
-static VkResult
-anv_timeline_wait_locked(struct anv_device *device,
- struct anv_timeline *timeline,
- uint64_t serial, uint64_t abs_timeout_ns)
-{
- /* Wait on the queue_submit condition variable until the timeline has a
- * time point pending that's at least as high as serial.
- */
- while (timeline->highest_pending < serial) {
- struct timespec abstime = {
- .tv_sec = abs_timeout_ns / NSEC_PER_SEC,
- .tv_nsec = abs_timeout_ns % NSEC_PER_SEC,
- };
-
- UNUSED int ret = pthread_cond_timedwait(&device->queue_submit,
- &device->mutex, &abstime);
- assert(ret != EINVAL);
- if (anv_gettime_ns() >= abs_timeout_ns &&
- timeline->highest_pending < serial)
- return VK_TIMEOUT;
- }
-
- while (1) {
- VkResult result = anv_timeline_gc_locked(device, timeline);
- if (result != VK_SUCCESS)
- return result;
-
- if (timeline->highest_past >= serial)
- return VK_SUCCESS;
-
- /* If we got here, our earliest time point has a busy BO */
- struct anv_timeline_point *point =
- list_first_entry(&timeline->points,
- struct anv_timeline_point, link);
-
- /* Drop the lock while we wait. */
- point->waiting++;
- pthread_mutex_unlock(&device->mutex);
-
- result = anv_device_wait(device, point->bo,
- anv_get_relative_timeout(abs_timeout_ns));
-
- /* Pick the mutex back up */
- pthread_mutex_lock(&device->mutex);
- point->waiting--;
-
- /* This covers both VK_TIMEOUT and VK_ERROR_DEVICE_LOST */
- if (result != VK_SUCCESS)
- return result;
- }
-}
-
-static VkResult
-anv_timelines_wait(struct anv_device *device,
- struct anv_timeline **timelines,
- const uint64_t *serials,
- uint32_t n_timelines,
- bool wait_all,
- uint64_t abs_timeout_ns)
-{
- if (!wait_all && n_timelines > 1) {
- pthread_mutex_lock(&device->mutex);
-
- while (1) {
- VkResult result;
- for (uint32_t i = 0; i < n_timelines; i++) {
- result =
- anv_timeline_wait_locked(device, timelines[i], serials[i], 0);
- if (result != VK_TIMEOUT)
- break;
- }
-
- if (result != VK_TIMEOUT ||
- anv_gettime_ns() >= abs_timeout_ns) {
- pthread_mutex_unlock(&device->mutex);
- return result;
- }
-
- /* If none of them are ready do a short wait so we don't completely
- * spin while holding the lock. The 10us is completely arbitrary.
- */
- uint64_t abs_short_wait_ns =
- anv_get_absolute_timeout(
- MIN2((anv_gettime_ns() - abs_timeout_ns) / 10, 10 * 1000));
- struct timespec abstime = {
- .tv_sec = abs_short_wait_ns / NSEC_PER_SEC,
- .tv_nsec = abs_short_wait_ns % NSEC_PER_SEC,
- };
- ASSERTED int ret;
- ret = pthread_cond_timedwait(&device->queue_submit,
- &device->mutex, &abstime);
- assert(ret != EINVAL);
- }
- } else {
- VkResult result = VK_SUCCESS;
- pthread_mutex_lock(&device->mutex);
- for (uint32_t i = 0; i < n_timelines; i++) {
- result =
- anv_timeline_wait_locked(device, timelines[i],
- serials[i], abs_timeout_ns);
- if (result != VK_SUCCESS)
- break;
- }
- pthread_mutex_unlock(&device->mutex);
- return result;
- }
-}
-
-VkResult anv_WaitSemaphores(
- VkDevice _device,
- const VkSemaphoreWaitInfoKHR* pWaitInfo,
- uint64_t timeout)
-{
- ANV_FROM_HANDLE(anv_device, device, _device);
- uint32_t *handles;
- struct anv_timeline **timelines;
-
- VK_MULTIALLOC(ma);
-
- VK_MULTIALLOC_DECL(&ma, uint64_t, values, pWaitInfo->semaphoreCount);
- if (device->has_thread_submit) {
- vk_multialloc_add(&ma, &handles, uint32_t, pWaitInfo->semaphoreCount);
- } else {
- vk_multialloc_add(&ma, &timelines, struct anv_timeline *,
- pWaitInfo->semaphoreCount);
- }
-
- if (!vk_multialloc_alloc(&ma, &device->vk.alloc,
- VK_SYSTEM_ALLOCATION_SCOPE_COMMAND))
- return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
-
- uint32_t handle_count = 0;
- for (uint32_t i = 0; i < pWaitInfo->semaphoreCount; i++) {
- ANV_FROM_HANDLE(anv_semaphore, semaphore, pWaitInfo->pSemaphores[i]);
- struct anv_semaphore_impl *impl =
- semaphore->temporary.type != ANV_SEMAPHORE_TYPE_NONE ?
- &semaphore->temporary : &semaphore->permanent;
-
- if (pWaitInfo->pValues[i] == 0)
- continue;
-
- if (device->has_thread_submit) {
- assert(impl->type == ANV_SEMAPHORE_TYPE_DRM_SYNCOBJ_TIMELINE);
- handles[handle_count] = impl->syncobj;
- } else {
- assert(impl->type == ANV_SEMAPHORE_TYPE_TIMELINE);
- timelines[handle_count] = &impl->timeline;
- }
- values[handle_count] = pWaitInfo->pValues[i];
- handle_count++;
- }
-
- VkResult result = VK_SUCCESS;
- if (handle_count > 0) {
- if (device->has_thread_submit) {
- int ret =
- anv_gem_syncobj_timeline_wait(device,
- handles, values, handle_count,
- anv_get_absolute_timeout(timeout),
- !(pWaitInfo->flags & VK_SEMAPHORE_WAIT_ANY_BIT_KHR),
- false);
- if (ret != 0)
- result = errno == ETIME ? VK_TIMEOUT :
- anv_device_set_lost(device, "unable to wait on timeline syncobj");
- } else {
- result =
- anv_timelines_wait(device, timelines, values, handle_count,
- !(pWaitInfo->flags & VK_SEMAPHORE_WAIT_ANY_BIT_KHR),
- anv_get_absolute_timeout(timeout));
- }
- }
-
- vk_free(&device->vk.alloc, values);
-
- return result;
-}
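/* Minimal sketch of the VK_MULTIALLOC pattern used in anv_WaitSemaphores()
 * above: one heap allocation backs several arrays and a single vk_free() of
 * the first pointer releases everything. The example_ names are hypothetical.
 */
static VkResult
example_multialloc(struct anv_device *device, uint32_t count,
                   uint64_t **out_values, uint32_t **out_handles)
{
   VK_MULTIALLOC(ma);
   VK_MULTIALLOC_DECL(&ma, uint64_t, values, count);
   VK_MULTIALLOC_DECL(&ma, uint32_t, handles, count);

   if (!vk_multialloc_alloc(&ma, &device->vk.alloc,
                            VK_SYSTEM_ALLOCATION_SCOPE_COMMAND))
      return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);

   *out_values = values;
   *out_handles = handles;
   /* A single vk_free(&device->vk.alloc, values) later releases both arrays. */
   return VK_SUCCESS;
}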
-
-VkResult anv_SignalSemaphore(
- VkDevice _device,
- const VkSemaphoreSignalInfoKHR* pSignalInfo)
+anv_queue_finish(struct anv_queue *queue)
{
- ANV_FROM_HANDLE(anv_device, device, _device);
- ANV_FROM_HANDLE(anv_semaphore, semaphore, pSignalInfo->semaphore);
-
- struct anv_semaphore_impl *impl =
- semaphore->temporary.type != ANV_SEMAPHORE_TYPE_NONE ?
- &semaphore->temporary : &semaphore->permanent;
-
- switch (impl->type) {
- case ANV_SEMAPHORE_TYPE_TIMELINE: {
- pthread_mutex_lock(&device->mutex);
-
- VkResult result = anv_timeline_gc_locked(device, &impl->timeline);
+ if (queue->sync)
+ vk_sync_destroy(&queue->device->vk, queue->sync);
- assert(pSignalInfo->value > impl->timeline.highest_pending);
+ if (queue->companion_sync)
+ vk_sync_destroy(&queue->device->vk, queue->companion_sync);
- impl->timeline.highest_pending = impl->timeline.highest_past = pSignalInfo->value;
-
- if (result == VK_SUCCESS)
- result = anv_device_submit_deferred_locked(device);
-
- pthread_cond_broadcast(&device->queue_submit);
- pthread_mutex_unlock(&device->mutex);
- return result;
- }
-
- case ANV_SEMAPHORE_TYPE_DRM_SYNCOBJ_TIMELINE: {
- /* Timeline semaphores are created with a value of 0, so signaling on 0
- * is a waste of time.
- */
- if (pSignalInfo->value == 0)
- return VK_SUCCESS;
-
- int ret = anv_gem_syncobj_timeline_signal(device, &impl->syncobj,
- &pSignalInfo->value, 1);
-
- return ret == 0 ? VK_SUCCESS :
- anv_device_set_lost(device, "unable to signal timeline syncobj");
- }
-
- default:
- unreachable("Invalid semaphore type");
- }
+ anv_destroy_engine(queue);
+ vk_queue_finish(&queue->vk);
}
diff --git a/src/intel/vulkan/anv_rmv.c b/src/intel/vulkan/anv_rmv.c
new file mode 100644
index 00000000000..a65258d8d1f
--- /dev/null
+++ b/src/intel/vulkan/anv_rmv.c
@@ -0,0 +1,864 @@
+/*
+ * Copyright © 2023 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include "anv_private.h"
+
+static VkResult
+capture_trace(VkQueue _queue)
+{
+ ANV_FROM_HANDLE(anv_queue, queue, _queue);
+
+ simple_mtx_lock(&queue->device->vk.memory_trace_data.token_mtx);
+ vk_dump_rmv_capture(&queue->device->vk.memory_trace_data);
+ simple_mtx_unlock(&queue->device->vk.memory_trace_data.token_mtx);
+
+ return VK_SUCCESS;
+}
+
+void
+anv_memory_trace_init(struct anv_device *device)
+{
+ struct vk_rmv_device_info info;
+ memset(&info, 0, sizeof(info));
+ anv_rmv_fill_device_info(device->physical, &info);
+ vk_memory_trace_init(&device->vk, &info);
+
+ if (!device->vk.memory_trace_data.is_enabled)
+ return;
+
+ device->vk.capture_trace = capture_trace;
+}
+
+static void
+fill_memory_info(const struct anv_physical_device *device,
+ struct vk_rmv_memory_info *out_info,
+ int32_t index)
+{
+ switch (index) {
+ case VK_RMV_MEMORY_LOCATION_DEVICE:
+ out_info->physical_base_address = 0;
+ out_info->size = device->memory.heaps[0].size;
+ break;
+ case VK_RMV_MEMORY_LOCATION_DEVICE_INVISIBLE:
+ out_info->physical_base_address = device->memory.heaps[0].size;
+ out_info->size = device->vram_non_mappable.size;
+ break;
+ case VK_RMV_MEMORY_LOCATION_HOST:
+ out_info->physical_base_address = 0;
+ out_info->size = device->memory.heaps[1].size;
+ break;
+ default:
+ unreachable("invalid memory index");
+ }
+}
+
+void
+anv_rmv_fill_device_info(const struct anv_physical_device *device,
+ struct vk_rmv_device_info *info)
+{
+ for (int32_t i = 0; i < VK_RMV_MEMORY_LOCATION_COUNT; ++i)
+ fill_memory_info(device, &info->memory_infos[i], i);
+
+ strncpy(info->device_name, device->info.name, sizeof(info->device_name) - 1);
+ info->pcie_revision_id = device->info.pci_revision_id;
+ info->pcie_device_id = device->info.pci_device_id;
+ /* TODO: */
+ info->pcie_family_id = 0;
+ info->minimum_shader_clock = 0;
+ info->maximum_shader_clock = 1 * 1024 * 1024 * 1024;
+ info->vram_type = VK_RMV_MEMORY_TYPE_DDR4;
+ info->vram_bus_width = 256;
+ info->vram_operations_per_clock = 1;
+ info->minimum_memory_clock = 0;
+ info->maximum_memory_clock = 1;
+ info->vram_bandwidth = 256;
+}
+
+void
+anv_memory_trace_finish(struct anv_device *device)
+{
+}
+
+static uint32_t
+resource_id_locked(struct anv_device *device, const void *obj)
+{
+ return vk_rmv_get_resource_id_locked(&device->vk, (uint64_t)(uintptr_t)obj);
+}
+
+static void
+resource_destroy_locked(struct anv_device *device, const void *obj)
+{
+ vk_rmv_destroy_resource_id_locked(&device->vk, (uint64_t)(uintptr_t)obj);
+}
+
+/* The token lock must be held when entering _locked functions */
+static void
+log_resource_bind_locked(struct anv_device *device, uint64_t resource_id,
+ struct anv_bo *bo, uint64_t offset,
+ uint64_t size)
+{
+ struct vk_rmv_resource_bind_token token = {
+ .resource_id = resource_id,
+ .is_system_memory = bo ? (bo->alloc_flags & ANV_BO_ALLOC_NO_LOCAL_MEM) : 0,
+ .address = (bo ? bo->offset : 0) + offset,
+ .size = size,
+ };
+
+ vk_rmv_emit_token(&device->vk.memory_trace_data, VK_RMV_TOKEN_TYPE_RESOURCE_BIND, &token);
+}
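/* Usage sketch for the "_locked" convention noted above: callers take the
 * token mutex and then call the _locked helper, as the public entry points
 * later in this file do. The wrapper below is hypothetical, for illustration.
 */
static void
example_log_bind(struct anv_device *device, uint64_t resource_id,
                 struct anv_bo *bo, uint64_t offset, uint64_t size)
{
   simple_mtx_lock(&device->vk.memory_trace_data.token_mtx);
   log_resource_bind_locked(device, resource_id, bo, offset, size);
   simple_mtx_unlock(&device->vk.memory_trace_data.token_mtx);
}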
+
+static void
+log_state_pool_bind_locked(struct anv_device *device, uint64_t resource_id,
+ struct anv_state_pool *pool, struct anv_state *state)
+{
+ struct vk_rmv_resource_bind_token token = {
+ .resource_id = resource_id,
+ .is_system_memory = (pool->block_pool.bo_alloc_flags &
+ ANV_BO_ALLOC_NO_LOCAL_MEM) != 0,
+ .address = anv_address_physical(
+ anv_state_pool_state_address(pool, *state)),
+ .size = state->alloc_size,
+ };
+
+ vk_rmv_emit_token(&device->vk.memory_trace_data, VK_RMV_TOKEN_TYPE_RESOURCE_BIND, &token);
+}
+
+static enum vk_rmv_memory_location
+anv_heap_index_to_memory_location(struct anv_device *device,
+ unsigned heap_index)
+{
+ if (heap_index == 0)
+ return device->physical->vram_non_mappable.size != 0 ?
+ VK_RMV_MEMORY_LOCATION_DEVICE_INVISIBLE :
+ VK_RMV_MEMORY_LOCATION_DEVICE;
+ else if (heap_index == 1)
+ return VK_RMV_MEMORY_LOCATION_HOST;
+ else
+ return VK_RMV_MEMORY_LOCATION_DEVICE;
+}
+
+static void
+anv_rmv_log_bo_gtt_unmap_locked(struct anv_device *device,
+ struct anv_bo *bo)
+{
+ if (!bo->gtt_mapped)
+ return;
+
+ struct vk_rmv_token token = {
+ .type = VK_RMV_TOKEN_TYPE_PAGE_TABLE_UPDATE,
+ .timestamp = (uint64_t)os_time_get_nano(),
+ .data = {
+ .page_table_update = {
+ .type = VK_RMV_PAGE_TABLE_UPDATE_TYPE_UPDATE,
+ .page_size = device->info->mem_alignment,
+ .page_count = DIV_ROUND_UP(bo->size,
+ device->info->mem_alignment),
+ .pid = getpid(),
+ .virtual_address = bo->offset,
+ .physical_address = bo->offset,
+ .is_unmap = true,
+ },
+ },
+ };
+ util_dynarray_append(&device->vk.memory_trace_data.tokens,
+ struct vk_rmv_token, token);
+
+ bo->gtt_mapped = false;
+}
+
+void
+anv_rmv_log_bo_gtt_unmap(struct anv_device *device,
+ struct anv_bo *bo)
+{
+ simple_mtx_lock(&device->vk.memory_trace_data.token_mtx);
+ anv_rmv_log_bo_gtt_unmap_locked(device, bo);
+ simple_mtx_unlock(&device->vk.memory_trace_data.token_mtx);
+}
+
+void
+anv_rmv_log_bo_gtt_map(struct anv_device *device,
+ struct anv_bo *bo)
+{
+ simple_mtx_lock(&device->vk.memory_trace_data.token_mtx);
+ struct vk_rmv_token token = {
+ .type = VK_RMV_TOKEN_TYPE_PAGE_TABLE_UPDATE,
+ .timestamp = (uint64_t)os_time_get_nano(),
+ .data = {
+ .page_table_update = {
+ .type = VK_RMV_PAGE_TABLE_UPDATE_TYPE_UPDATE,
+ .page_size = device->info->mem_alignment,
+ .page_count = DIV_ROUND_UP(bo->size,
+ device->info->mem_alignment),
+ .pid = getpid(),
+ .virtual_address = bo->offset,
+ .physical_address = bo->offset,
+ .is_unmap = false,
+ },
+ },
+ };
+ util_dynarray_append(&device->vk.memory_trace_data.tokens,
+ struct vk_rmv_token, token);
+
+ bo->gtt_mapped = true;
+
+ simple_mtx_unlock(&device->vk.memory_trace_data.token_mtx);
+}
+
+void
+anv_rmv_log_bos_gtt_map(struct anv_device *device,
+ struct anv_bo **bos,
+ uint32_t bo_count)
+{
+ simple_mtx_lock(&device->vk.memory_trace_data.token_mtx);
+ for (uint32_t i = 0; i < bo_count; i++) {
+ struct anv_bo *bo = bos[i];
+
+ if (bo->gtt_mapped)
+ continue;
+
+ struct vk_rmv_token token = {
+ .type = VK_RMV_TOKEN_TYPE_PAGE_TABLE_UPDATE,
+ .timestamp = (uint64_t)os_time_get_nano(),
+ .data = {
+ .page_table_update = {
+ .type = VK_RMV_PAGE_TABLE_UPDATE_TYPE_UPDATE,
+ .page_size = device->info->mem_alignment,
+ .page_count = DIV_ROUND_UP(bo->size,
+ device->info->mem_alignment),
+ .pid = getpid(),
+ .virtual_address = bo->offset,
+ .physical_address = bo->offset,
+ .is_unmap = false,
+ },
+ },
+ };
+ util_dynarray_append(&device->vk.memory_trace_data.tokens,
+ struct vk_rmv_token, token);
+
+ bo->gtt_mapped = true;
+ }
+ simple_mtx_unlock(&device->vk.memory_trace_data.token_mtx);
+}
+
+void
+anv_rmv_log_vm_binds(struct anv_device *device,
+ struct anv_vm_bind *binds,
+ uint32_t bind_count)
+{
+ simple_mtx_lock(&device->vk.memory_trace_data.token_mtx);
+ for (uint32_t i = 0; i < bind_count; i++) {
+
+ struct vk_rmv_token token = {
+ .type = VK_RMV_TOKEN_TYPE_PAGE_TABLE_UPDATE,
+ .timestamp = (uint64_t)os_time_get_nano(),
+ .data = {
+ .page_table_update = {
+ .type = VK_RMV_PAGE_TABLE_UPDATE_TYPE_UPDATE,
+ .page_size = device->info->mem_alignment,
+ .page_count = DIV_ROUND_UP(binds[i].size,
+ device->info->mem_alignment),
+ .pid = getpid(),
+ .virtual_address = binds[i].address,
+ .physical_address = binds[i].bo_offset,
+ .is_unmap = binds[i].op == ANV_VM_UNBIND,
+ },
+ },
+ };
+ util_dynarray_append(&device->vk.memory_trace_data.tokens,
+ struct vk_rmv_token, token);
+ }
+ simple_mtx_unlock(&device->vk.memory_trace_data.token_mtx);
+}
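/* The PAGE_TABLE_UPDATE token built by the GTT-map/unmap and vm-bind loggers
 * above differs only in the address, size and is_unmap fields, so it could be
 * factored as below. Illustrative sketch only, not part of this change.
 */
static struct vk_rmv_token
example_page_table_update_token(struct anv_device *device,
                                uint64_t virtual_address,
                                uint64_t physical_address,
                                uint64_t size, bool is_unmap)
{
   return (struct vk_rmv_token) {
      .type = VK_RMV_TOKEN_TYPE_PAGE_TABLE_UPDATE,
      .timestamp = (uint64_t)os_time_get_nano(),
      .data = {
         .page_table_update = {
            .type = VK_RMV_PAGE_TABLE_UPDATE_TYPE_UPDATE,
            .page_size = device->info->mem_alignment,
            .page_count = DIV_ROUND_UP(size, device->info->mem_alignment),
            .pid = getpid(),
            .virtual_address = virtual_address,
            .physical_address = physical_address,
            .is_unmap = is_unmap,
         },
      },
   };
}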
+
+void
+anv_rmv_log_heap_create(struct anv_device *device,
+ struct anv_device_memory *memory,
+ bool is_internal,
+ VkMemoryAllocateFlags alloc_flags)
+{
+ /* Do not log zero-sized device memory objects. */
+ if (!memory->vk.size)
+ return;
+
+ simple_mtx_lock(&device->vk.memory_trace_data.token_mtx);
+
+ struct vk_rmv_resource_create_token token = {
+ .type = VK_RMV_RESOURCE_TYPE_HEAP,
+ .resource_id = resource_id_locked(device, memory),
+ .is_driver_internal = is_internal,
+ .heap = {
+ .alignment = device->info->mem_alignment,
+ .size = memory->vk.size,
+ .heap_index = anv_heap_index_to_memory_location(device,
+ memory->type->heapIndex),
+ .alloc_flags = alloc_flags,
+ },
+ };
+
+ vk_rmv_emit_token(&device->vk.memory_trace_data, VK_RMV_TOKEN_TYPE_RESOURCE_CREATE, &token);
+ log_resource_bind_locked(device, token.resource_id, memory->bo, 0, memory->vk.size);
+ simple_mtx_unlock(&device->vk.memory_trace_data.token_mtx);
+}
+
+static void
+anv_rmv_log_vma_locked(struct anv_device *device, uint64_t address, uint64_t size,
+ bool internal, bool vram, bool in_invisible_vram)
+{
+ struct vk_rmv_virtual_allocate_token token = {
+ .address = address,
+ /* If all VRAM is visible, no bo will be in invisible memory. */
+ .is_in_invisible_vram = in_invisible_vram,
+ .preferred_domains = (vram ?
+ VK_RMV_KERNEL_MEMORY_DOMAIN_VRAM :
+ VK_RMV_KERNEL_MEMORY_DOMAIN_GTT),
+ .is_driver_internal = internal,
+ .page_count = DIV_ROUND_UP(size, 4096),
+ };
+
+
+ vk_rmv_emit_token(&device->vk.memory_trace_data, VK_RMV_TOKEN_TYPE_VIRTUAL_ALLOCATE, &token);
+}
+
+void
+anv_rmv_log_bo_allocate(struct anv_device *device,
+ struct anv_bo *bo)
+{
+ simple_mtx_lock(&device->vk.memory_trace_data.token_mtx);
+ anv_rmv_log_vma_locked(device, bo->offset, bo->size,
+ bo->alloc_flags & ANV_BO_ALLOC_INTERNAL,
+ (bo->alloc_flags & ANV_BO_ALLOC_NO_LOCAL_MEM) == 0,
+ device->physical->vram_non_mappable.size != 0 &&
+ (bo->alloc_flags & (ANV_BO_ALLOC_MAPPED |
+ ANV_BO_ALLOC_HOST_CACHED_COHERENT |
+ ANV_BO_ALLOC_LOCAL_MEM_CPU_VISIBLE |
+ ANV_BO_ALLOC_NO_LOCAL_MEM)) == 0);
+ simple_mtx_unlock(&device->vk.memory_trace_data.token_mtx);
+
+ if (bo->alloc_flags & ANV_BO_ALLOC_MAPPED)
+ vk_rmv_log_cpu_map(&device->vk, bo->offset, false);
+}
+
+void
+anv_rmv_log_bo_destroy(struct anv_device *device, struct anv_bo *bo)
+{
+ struct vk_rmv_virtual_free_token token = {
+ .address = bo->offset,
+ };
+
+ if (bo->alloc_flags & ANV_BO_ALLOC_MAPPED)
+ vk_rmv_log_cpu_map(&device->vk, bo->offset, true);
+ simple_mtx_lock(&device->vk.memory_trace_data.token_mtx);
+ anv_rmv_log_bo_gtt_unmap_locked(device, bo);
+ vk_rmv_emit_token(&device->vk.memory_trace_data, VK_RMV_TOKEN_TYPE_VIRTUAL_FREE, &token);
+ simple_mtx_unlock(&device->vk.memory_trace_data.token_mtx);
+}
+
+void
+anv_rmv_log_buffer_create(struct anv_device *device,
+ bool is_internal,
+ struct anv_buffer *buffer)
+{
+ simple_mtx_lock(&device->vk.memory_trace_data.token_mtx);
+ struct vk_rmv_resource_create_token token = {
+ .type = VK_RMV_RESOURCE_TYPE_BUFFER,
+ .is_driver_internal = is_internal,
+ .resource_id = resource_id_locked(device, buffer),
+ .buffer = {
+ .create_flags = buffer->vk.create_flags,
+ .size = buffer->vk.size,
+ .usage_flags = buffer->vk.usage,
+ },
+ };
+
+ vk_rmv_emit_token(&device->vk.memory_trace_data, VK_RMV_TOKEN_TYPE_RESOURCE_CREATE, &token);
+ if (buffer->vk.create_flags & VK_BUFFER_CREATE_SPARSE_BINDING_BIT) {
+ assert(buffer->sparse_data.size != 0);
+ anv_rmv_log_vma_locked(device,
+ buffer->sparse_data.address,
+ buffer->sparse_data.size,
+ false /* internal */, true /* TODO: vram */,
+ true /* in_invisible_vram */);
+ log_resource_bind_locked(device,
+ resource_id_locked(device, buffer),
+ NULL,
+ buffer->sparse_data.address,
+ buffer->sparse_data.size);
+ }
+ simple_mtx_unlock(&device->vk.memory_trace_data.token_mtx);
+}
+
+void
+anv_rmv_log_buffer_destroy(struct anv_device *device,
+ struct anv_buffer *buffer)
+{
+ simple_mtx_lock(&device->vk.memory_trace_data.token_mtx);
+ if (buffer->vk.create_flags & VK_BUFFER_CREATE_SPARSE_BINDING_BIT) {
+ struct vk_rmv_virtual_free_token token = {
+ .address = buffer->sparse_data.address,
+ };
+ vk_rmv_emit_token(&device->vk.memory_trace_data, VK_RMV_TOKEN_TYPE_VIRTUAL_FREE, &token);
+ }
+ resource_destroy_locked(device, buffer);
+ simple_mtx_unlock(&device->vk.memory_trace_data.token_mtx);
+}
+
+void
+anv_rmv_log_buffer_bind(struct anv_device *device, struct anv_buffer *buffer)
+{
+ simple_mtx_lock(&device->vk.memory_trace_data.token_mtx);
+ log_resource_bind_locked(device,
+ resource_id_locked(device, buffer),
+ buffer->address.bo,
+ buffer->address.offset, buffer->vk.size);
+ simple_mtx_unlock(&device->vk.memory_trace_data.token_mtx);
+}
+
+void
+anv_rmv_log_image_create(struct anv_device *device,
+ bool is_internal,
+ struct anv_image *image)
+{
+ simple_mtx_lock(&device->vk.memory_trace_data.token_mtx);
+ struct vk_rmv_resource_create_token token = {
+ .type = VK_RMV_RESOURCE_TYPE_IMAGE,
+ .resource_id = resource_id_locked(device, image),
+ .is_driver_internal = is_internal,
+ .image = {
+ .create_flags = image->vk.create_flags,
+ .usage_flags = image->vk.usage,
+ .type = image->vk.image_type,
+ .extent = image->vk.extent,
+ .format = image->vk.format,
+ .num_mips = image->vk.mip_levels,
+ .num_slices = image->vk.array_layers,
+ .tiling = image->vk.tiling,
+ .alignment_log2 = util_logbase2(
+ image->bindings[ANV_IMAGE_MEMORY_BINDING_MAIN].memory_range.alignment),
+ .log2_samples = util_logbase2(image->vk.samples),
+ .metadata_alignment_log2 = util_logbase2(
+ image->planes[0].aux_surface.isl.alignment_B),
+ .image_alignment_log2 = util_logbase2(
+ image->planes[0].primary_surface.isl.alignment_B),
+ .size = image->planes[0].primary_surface.memory_range.size,
+ .metadata_size = image->planes[0].aux_surface.memory_range.size,
+ .metadata_header_size = 0,
+ .metadata_offset = image->planes[0].aux_surface.memory_range.offset,
+ .metadata_header_offset = image->planes[0].aux_surface.memory_range.offset,
+ .presentable = (image->planes[0].primary_surface.isl.usage &
+ ISL_SURF_USAGE_DISPLAY_BIT) != 0,
+ },
+ };
+
+ vk_rmv_emit_token(&device->vk.memory_trace_data, VK_RMV_TOKEN_TYPE_RESOURCE_CREATE, &token);
+ if (image->vk.create_flags & VK_IMAGE_CREATE_SPARSE_BINDING_BIT) {
+ for (uint32_t b = 0; b < ARRAY_SIZE(image->bindings); b++) {
+ if (image->bindings[b].sparse_data.size != 0) {
+ anv_rmv_log_vma_locked(device,
+ image->bindings[b].sparse_data.address,
+ image->bindings[b].sparse_data.size,
+ false /* internal */, true /* TODO: vram */,
+ true /* in_invisible_vram */);
+ log_resource_bind_locked(device,
+ resource_id_locked(device, image),
+ NULL,
+ image->bindings[b].sparse_data.address,
+ image->bindings[b].sparse_data.size);
+ }
+ }
+ }
+ simple_mtx_unlock(&device->vk.memory_trace_data.token_mtx);
+}
+
+void
+anv_rmv_log_image_destroy(struct anv_device *device,
+ struct anv_image *image)
+{
+ simple_mtx_lock(&device->vk.memory_trace_data.token_mtx);
+ if (image->vk.create_flags & VK_IMAGE_CREATE_SPARSE_BINDING_BIT) {
+ for (uint32_t b = 0; b < ARRAY_SIZE(image->bindings); b++) {
+ if (image->bindings[b].sparse_data.size != 0) {
+ struct vk_rmv_virtual_free_token token = {
+ .address = image->bindings[b].sparse_data.address,
+ };
+
+ vk_rmv_emit_token(&device->vk.memory_trace_data, VK_RMV_TOKEN_TYPE_VIRTUAL_FREE, &token);
+ }
+ }
+ }
+ resource_destroy_locked(device, image);
+ simple_mtx_unlock(&device->vk.memory_trace_data.token_mtx);
+}
+
+void
+anv_rmv_log_image_bind(struct anv_device *device,
+ struct anv_image *image,
+ enum anv_image_memory_binding binding)
+{
+ simple_mtx_lock(&device->vk.memory_trace_data.token_mtx);
+ log_resource_bind_locked(device,
+ resource_id_locked(device, image),
+ image->bindings[binding].address.bo,
+ image->bindings[binding].address.offset,
+ image->bindings[binding].memory_range.size);
+ simple_mtx_unlock(&device->vk.memory_trace_data.token_mtx);
+}
+
+void
+anv_rmv_log_query_pool_create(struct anv_device *device,
+ struct anv_query_pool *pool,
+ bool is_internal)
+{
+ if (pool->vk.query_type != VK_QUERY_TYPE_OCCLUSION &&
+ pool->vk.query_type != VK_QUERY_TYPE_PIPELINE_STATISTICS &&
+ pool->vk.query_type != VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT)
+ return;
+
+ simple_mtx_lock(&device->vk.memory_trace_data.token_mtx);
+ struct vk_rmv_resource_create_token create_token = {
+ .type = VK_RMV_RESOURCE_TYPE_QUERY_HEAP,
+ .resource_id = resource_id_locked(device, pool),
+ .is_driver_internal = is_internal,
+ .query_pool = {
+ .type = pool->vk.query_type,
+ .has_cpu_access = true,
+ },
+ };
+
+ vk_rmv_emit_token(&device->vk.memory_trace_data,
+ VK_RMV_TOKEN_TYPE_RESOURCE_CREATE, &create_token);
+ log_resource_bind_locked(device, create_token.resource_id,
+ pool->bo, 0, pool->bo->size);
+ simple_mtx_unlock(&device->vk.memory_trace_data.token_mtx);
+}
+
+static void
+bind_cmd_buffer_state_stream_locked(struct anv_device *device,
+ uint64_t resource_id,
+ struct anv_state_stream *stream)
+{
+ util_dynarray_foreach(&stream->all_blocks, struct anv_state, block)
+ log_state_pool_bind_locked(device, resource_id, stream->state_pool, block);
+}
+
+void
+anv_rmv_log_cmd_buffer_create(struct anv_device *device,
+ struct anv_cmd_buffer *cmd_buffer)
+{
+ uint64_t data_size =
+ cmd_buffer->surface_state_stream.total_size +
+ cmd_buffer->dynamic_state_stream.total_size +
+ cmd_buffer->general_state_stream.total_size +
+ cmd_buffer->indirect_push_descriptor_stream.total_size;
+
+ uint64_t executable_size = 0;
+ list_for_each_entry(struct anv_batch_bo, bbo, &cmd_buffer->batch_bos, link)
+ executable_size += bbo->length;
+
+ simple_mtx_lock(&device->vk.memory_trace_data.token_mtx);
+ struct vk_rmv_resource_create_token create_token = {
+ .type = VK_RMV_RESOURCE_TYPE_COMMAND_ALLOCATOR,
+ .resource_id = resource_id_locked(device, cmd_buffer),
+ .is_driver_internal = true,
+ .command_buffer = {
+ .preferred_domain = VK_RMV_KERNEL_MEMORY_DOMAIN_GTT /* TODO */,
+ .executable_size = executable_size,
+ .app_available_executable_size = executable_size,
+ .embedded_data_size = data_size,
+ .app_available_embedded_data_size = data_size,
+ .scratch_size = 0,
+ .app_available_scratch_size = 0,
+ },
+ };
+
+ vk_rmv_emit_token(&device->vk.memory_trace_data,
+ VK_RMV_TOKEN_TYPE_RESOURCE_CREATE,
+ &create_token);
+ list_for_each_entry(struct anv_batch_bo, bbo, &cmd_buffer->batch_bos, link) {
+ log_resource_bind_locked(device, create_token.resource_id,
+ bbo->bo, 0, bbo->length);
+ }
+ bind_cmd_buffer_state_stream_locked(device, create_token.resource_id,
+ &cmd_buffer->surface_state_stream);
+ bind_cmd_buffer_state_stream_locked(device, create_token.resource_id,
+ &cmd_buffer->dynamic_state_stream);
+ bind_cmd_buffer_state_stream_locked(device, create_token.resource_id,
+ &cmd_buffer->general_state_stream);
+ bind_cmd_buffer_state_stream_locked(device, create_token.resource_id,
+ &cmd_buffer->indirect_push_descriptor_stream);
+ simple_mtx_unlock(&device->vk.memory_trace_data.token_mtx);
+}
+
+void
+anv_rmv_log_cmd_buffer_destroy(struct anv_device *device,
+ struct anv_cmd_buffer *cmd_buffer)
+{
+ simple_mtx_lock(&device->vk.memory_trace_data.token_mtx);
+ struct vk_rmv_resource_destroy_token destroy_token = {
+ .resource_id = resource_id_locked(device, cmd_buffer),
+ };
+
+ vk_rmv_emit_token(&device->vk.memory_trace_data,
+ VK_RMV_TOKEN_TYPE_RESOURCE_DESTROY, &destroy_token);
+ resource_destroy_locked(device, cmd_buffer);
+ simple_mtx_unlock(&device->vk.memory_trace_data.token_mtx);
+}
+
+void
+anv_rmv_log_sparse_add_residency(struct anv_device *device,
+ struct anv_bo *src_bo,
+ uint64_t offset)
+{
+ struct vk_rmv_resource_reference_token token = {
+ .virtual_address = src_bo->offset + offset,
+ .residency_removed = false,
+ };
+
+ simple_mtx_lock(&device->vk.memory_trace_data.token_mtx);
+ vk_rmv_emit_token(&device->vk.memory_trace_data,
+ VK_RMV_TOKEN_TYPE_RESOURCE_REFERENCE, &token);
+ simple_mtx_unlock(&device->vk.memory_trace_data.token_mtx);
+}
+
+void
+anv_rmv_log_sparse_remove_residency(struct anv_device *device,
+ struct anv_bo *src_bo,
+ uint64_t offset)
+{
+ struct vk_rmv_resource_reference_token token = {
+ .virtual_address = src_bo->offset + offset,
+ .residency_removed = true,
+ };
+
+ simple_mtx_lock(&device->vk.memory_trace_data.token_mtx);
+ vk_rmv_emit_token(&device->vk.memory_trace_data,
+ VK_RMV_TOKEN_TYPE_RESOURCE_REFERENCE, &token);
+ simple_mtx_unlock(&device->vk.memory_trace_data.token_mtx);
+}
+
+void
+anv_rmv_log_descriptor_pool_create(struct anv_device *device,
+ const VkDescriptorPoolCreateInfo *create_info,
+ struct anv_descriptor_pool *pool,
+ bool is_internal)
+{
+ simple_mtx_lock(&device->vk.memory_trace_data.token_mtx);
+ struct vk_rmv_resource_create_token create_token = {
+ .type = VK_RMV_RESOURCE_TYPE_DESCRIPTOR_POOL,
+ .resource_id = resource_id_locked(device, pool),
+ .is_driver_internal = false,
+ .descriptor_pool = {
+ .max_sets = create_info->maxSets,
+ .pool_size_count = create_info->poolSizeCount,
+ /* Using vk_rmv_token_pool_alloc frees the allocation automatically
+ * when the trace is done. */
+ .pool_sizes = malloc(create_info->poolSizeCount *
+ sizeof(VkDescriptorPoolSize)),
+ },
+ };
+
+ if (!create_token.descriptor_pool.pool_sizes) {
+ simple_mtx_unlock(&device->vk.memory_trace_data.token_mtx);
+ return;
+ }
+
+ memcpy(create_token.descriptor_pool.pool_sizes, create_info->pPoolSizes,
+ create_info->poolSizeCount * sizeof(VkDescriptorPoolSize));
+
+ vk_rmv_emit_token(&device->vk.memory_trace_data,
+ VK_RMV_TOKEN_TYPE_RESOURCE_CREATE, &create_token);
+ simple_mtx_unlock(&device->vk.memory_trace_data.token_mtx);
+
+ if (pool->surfaces.bo) {
+ struct vk_rmv_resource_bind_token bind_token = {
+ .resource_id = create_token.resource_id,
+ .is_system_memory = false,
+ .address = pool->surfaces.bo->offset,
+ .size = pool->surfaces.bo->size,
+ };
+
+ simple_mtx_lock(&device->vk.memory_trace_data.token_mtx);
+ vk_rmv_emit_token(&device->vk.memory_trace_data, VK_RMV_TOKEN_TYPE_RESOURCE_BIND, &bind_token);
+ simple_mtx_unlock(&device->vk.memory_trace_data.token_mtx);
+ }
+ if (pool->samplers.bo) {
+ struct vk_rmv_resource_bind_token bind_token = {
+ .resource_id = create_token.resource_id,
+ .is_system_memory = false,
+ .address = pool->samplers.bo->offset,
+ .size = pool->samplers.bo->size,
+ };
+
+ simple_mtx_lock(&device->vk.memory_trace_data.token_mtx);
+ vk_rmv_emit_token(&device->vk.memory_trace_data, VK_RMV_TOKEN_TYPE_RESOURCE_BIND, &bind_token);
+ simple_mtx_unlock(&device->vk.memory_trace_data.token_mtx);
+ }
+}
+
+void
+anv_rmv_log_graphics_pipeline_create(struct anv_device *device,
+ struct anv_graphics_pipeline *pipeline,
+ bool is_internal)
+{
+ struct vk_rmv_resource_create_token create_token = {
+ .type = VK_RMV_RESOURCE_TYPE_PIPELINE,
+ .resource_id = resource_id_locked(device, pipeline),
+ .is_driver_internal = is_internal,
+ .pipeline = {
+ .is_internal = is_internal,
+ .hash_lo = 0, /* TODO pipeline->pipeline_hash; */
+ .shader_stages = pipeline->base.base.active_stages,
+ },
+ };
+
+ simple_mtx_lock(&device->vk.memory_trace_data.token_mtx);
+ vk_rmv_emit_token(&device->vk.memory_trace_data, VK_RMV_TOKEN_TYPE_RESOURCE_CREATE, &create_token);
+ for (unsigned s = 0; s < ARRAY_SIZE(pipeline->base.shaders); s++) {
+ struct anv_shader_bin *shader = pipeline->base.shaders[s];
+
+ if (!shader)
+ continue;
+
+ log_state_pool_bind_locked(device, create_token.resource_id,
+ &device->instruction_state_pool,
+ &shader->kernel);
+ }
+ simple_mtx_unlock(&device->vk.memory_trace_data.token_mtx);
+}
+
+void
+anv_rmv_log_compute_pipeline_create(struct anv_device *device,
+ struct anv_compute_pipeline *pipeline,
+ bool is_internal)
+{
+ VkShaderStageFlagBits active_stages =
+ pipeline->base.type == ANV_PIPELINE_COMPUTE ?
+ VK_SHADER_STAGE_COMPUTE_BIT : VK_SHADER_STAGE_RAYGEN_BIT_KHR;
+
+ simple_mtx_lock(&device->vk.memory_trace_data.token_mtx);
+ struct vk_rmv_resource_create_token create_token = {
+ .type = VK_RMV_RESOURCE_TYPE_PIPELINE,
+ .resource_id = resource_id_locked(device, pipeline),
+ .is_driver_internal = is_internal,
+ .pipeline = {
+ .is_internal = is_internal,
+ .hash_lo = 0, /* TODO pipeline->pipeline_hash; */
+ .shader_stages = active_stages,
+ },
+ };
+
+ vk_rmv_emit_token(&device->vk.memory_trace_data, VK_RMV_TOKEN_TYPE_RESOURCE_CREATE, &create_token);
+ struct anv_shader_bin *shader = pipeline->cs;
+ log_state_pool_bind_locked(device, create_token.resource_id,
+ &device->instruction_state_pool,
+ &shader->kernel);
+ simple_mtx_unlock(&device->vk.memory_trace_data.token_mtx);
+}
+
+void
+anv_rmv_log_rt_pipeline_create(struct anv_device *device,
+ struct anv_ray_tracing_pipeline *pipeline,
+ bool is_internal)
+{
+ simple_mtx_lock(&device->vk.memory_trace_data.token_mtx);
+
+ struct vk_rmv_resource_create_token create_token = {
+ .resource_id = resource_id_locked(device, pipeline),
+ .type = VK_RMV_RESOURCE_TYPE_PIPELINE,
+ .is_driver_internal = is_internal,
+ .pipeline = {
+ .is_internal = is_internal,
+ .hash_lo = 0, /* TODO */
+ .shader_stages = pipeline->base.active_stages,
+ },
+ };
+ vk_rmv_emit_token(&device->vk.memory_trace_data, VK_RMV_TOKEN_TYPE_RESOURCE_CREATE, &create_token);
+
+ struct anv_state_pool *state_pool = &device->instruction_state_pool;
+ for (uint32_t i = 0; i < pipeline->group_count; i++) {
+ struct anv_rt_shader_group *group = &pipeline->groups[i];
+
+ if (group->imported)
+ continue;
+
+ if (group->general) {
+ log_state_pool_bind_locked(device, create_token.resource_id, state_pool,
+ &group->general->kernel);
+ }
+ if (group->closest_hit) {
+ log_state_pool_bind_locked(device, create_token.resource_id, state_pool,
+ &group->closest_hit->kernel);
+ }
+ if (group->any_hit) {
+ log_state_pool_bind_locked(device, create_token.resource_id, state_pool,
+ &group->any_hit->kernel);
+ }
+ if (group->intersection) {
+ log_state_pool_bind_locked(device, create_token.resource_id, state_pool,
+ &group->intersection->kernel);
+ }
+ }
+
+ simple_mtx_unlock(&device->vk.memory_trace_data.token_mtx);
+}
+
+void
+anv_rmv_log_event_create(struct anv_device *device,
+ struct anv_event *event,
+ VkEventCreateFlags flags,
+ bool is_internal)
+{
+ simple_mtx_lock(&device->vk.memory_trace_data.token_mtx);
+ struct vk_rmv_resource_create_token create_token = {
+ .type = VK_RMV_RESOURCE_TYPE_GPU_EVENT,
+ .resource_id = resource_id_locked(device, event),
+ .is_driver_internal = is_internal,
+ .event = {
+ .flags = flags,
+ },
+ };
+
+ vk_rmv_emit_token(&device->vk.memory_trace_data, VK_RMV_TOKEN_TYPE_RESOURCE_CREATE, &create_token);
+ log_state_pool_bind_locked(device, create_token.resource_id,
+ &device->dynamic_state_pool,
+ &event->state);
+ simple_mtx_unlock(&device->vk.memory_trace_data.token_mtx);
+}
+
+void
+anv_rmv_log_resource_destroy(struct anv_device *device, const void *obj)
+{
+ simple_mtx_lock(&device->vk.memory_trace_data.token_mtx);
+ struct vk_rmv_resource_destroy_token token = {
+ .resource_id = resource_id_locked(device, obj),
+ };
+
+ vk_rmv_emit_token(&device->vk.memory_trace_data, VK_RMV_TOKEN_TYPE_RESOURCE_DESTROY, &token);
+ resource_destroy_locked(device, obj);
+ simple_mtx_unlock(&device->vk.memory_trace_data.token_mtx);
+}
diff --git a/src/intel/vulkan/anv_rmv.h b/src/intel/vulkan/anv_rmv.h
new file mode 100644
index 00000000000..e5e94619863
--- /dev/null
+++ b/src/intel/vulkan/anv_rmv.h
@@ -0,0 +1,118 @@
+/*
+ * Copyright © 2024 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef ANV_RMV_H
+#define ANV_RMV_H
+
+#include <stdbool.h>
+#include <stdint.h>
+
+#include "vulkan/vulkan_core.h"
+
+struct anv_device;
+struct anv_device_memory;
+struct anv_physical_device;
+struct anv_descriptor_pool;
+struct anv_buffer;
+struct anv_image;
+struct anv_bo;
+struct anv_event;
+struct anv_graphics_pipeline;
+struct anv_compute_pipeline;
+struct anv_ray_tracing_pipeline;
+struct anv_cmd_buffer;
+struct anv_query_pool;
+struct anv_vm_bind;
+struct vk_rmv_device_info;
+
+enum anv_image_memory_binding;
+
+#define ANV_RMV(func, device, ...) do { \
+ if (unlikely((device)->vk.memory_trace_data.is_enabled)) \
+ anv_rmv_log_##func(device, __VA_ARGS__); \
+ } while (0)
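+
+/* Illustrative use (a hypothetical call site, not part of this header): the
+ * macro expands to a call to the matching anv_rmv_log_* function only when
+ * memory tracing is enabled, e.g.
+ *
+ *    ANV_RMV(bo_allocate, device, bo);
+ *
+ * becomes anv_rmv_log_bo_allocate(device, bo).
+ */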
+
+void anv_memory_trace_init(struct anv_device *device);
+void anv_rmv_fill_device_info(const struct anv_physical_device *device,
+ struct vk_rmv_device_info *info);
+void anv_memory_trace_finish(struct anv_device *device);
+
+void anv_rmv_log_heap_create(struct anv_device *device,
+ struct anv_device_memory *memory,
+ bool is_internal,
+ VkMemoryAllocateFlags alloc_flags);
+void anv_rmv_log_bo_gtt_map(struct anv_device *device,
+ struct anv_bo *bo);
+void anv_rmv_log_bo_gtt_unmap(struct anv_device *device,
+ struct anv_bo *bo);
+void anv_rmv_log_bos_gtt_map(struct anv_device *device,
+ struct anv_bo **bos,
+ uint32_t bo_count);
+void anv_rmv_log_vm_binds(struct anv_device *device,
+ struct anv_vm_bind *binds,
+ uint32_t bind_count);
+void anv_rmv_log_bo_allocate(struct anv_device *device,
+ struct anv_bo *bo);
+void anv_rmv_log_bo_destroy(struct anv_device *device, struct anv_bo *bo);
+void anv_rmv_log_buffer_create(struct anv_device *device,
+ bool is_internal,
+ struct anv_buffer *buffer);
+void anv_rmv_log_buffer_destroy(struct anv_device *device,
+ struct anv_buffer *buffer);
+void anv_rmv_log_buffer_bind(struct anv_device *device, struct anv_buffer *buffer);
+void anv_rmv_log_image_create(struct anv_device *device,
+ bool is_internal,
+ struct anv_image *image);
+void anv_rmv_log_image_destroy(struct anv_device *device,
+ struct anv_image *image);
+void anv_rmv_log_image_bind(struct anv_device *device,
+ struct anv_image *image,
+ enum anv_image_memory_binding binding);
+void anv_rmv_log_query_pool_create(struct anv_device *device,
+ struct anv_query_pool *pool,
+ bool is_internal);
+void anv_rmv_log_cmd_buffer_create(struct anv_device *device,
+ struct anv_cmd_buffer *cmd_buffer);
+void anv_rmv_log_cmd_buffer_destroy(struct anv_device *device,
+ struct anv_cmd_buffer *cmd_buffer);
+void anv_rmv_log_sparse_add_residency(struct anv_device *device,
+ struct anv_bo *src_bo,
+ uint64_t offset);
+void anv_rmv_log_sparse_remove_residency(struct anv_device *device,
+ struct anv_bo *src_bo,
+ uint64_t offset);
+void anv_rmv_log_descriptor_pool_create(struct anv_device *device,
+ const VkDescriptorPoolCreateInfo *create_info,
+ struct anv_descriptor_pool *pool,
+ bool is_internal);
+void anv_rmv_log_graphics_pipeline_create(struct anv_device *device,
+ struct anv_graphics_pipeline *pipeline,
+ bool is_internal);
+void anv_rmv_log_compute_pipeline_create(struct anv_device *device,
+ struct anv_compute_pipeline *pipeline,
+ bool is_internal);
+void anv_rmv_log_rt_pipeline_create(struct anv_device *device,
+ struct anv_ray_tracing_pipeline *pipeline,
+ bool is_internal);
+void anv_rmv_log_event_create(struct anv_device *device,
+ struct anv_event *event,
+ VkEventCreateFlags flags, bool is_internal);
+void anv_rmv_log_resource_destroy(struct anv_device *device, const void *obj);
+
+#endif /* ANV_RMV_H */
diff --git a/src/intel/vulkan/anv_sparse.c b/src/intel/vulkan/anv_sparse.c
new file mode 100644
index 00000000000..279dffea510
--- /dev/null
+++ b/src/intel/vulkan/anv_sparse.c
@@ -0,0 +1,1293 @@
+/*
+ * Copyright © 2022 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "anv_private.h"
+
+/* Sparse binding handling.
+ *
+ * There is one main structure passed around all over this file:
+ *
+ * - struct anv_sparse_binding_data: every resource (VkBuffer or VkImage) has
+ * a pointer to an instance of this structure. It contains the virtual
+ * memory address (VMA) used by the binding operations (which is different
+ * from the VMA used by the anv_bo it's bound to) and the VMA range size. We
+ * do not keep a record of our list of bindings (i.e., which ranges were
+ * bound to which buffers).
+ */
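+
+/* A rough lifecycle sketch (assumed from the functions below, not a formal
+ * contract): anv_init_sparse_bindings() reserves the VMA and emits an initial
+ * NULL bind, the sparse-binding path builds an anv_sparse_submission through
+ * anv_sparse_bind_buffer() / anv_sparse_bind_image_opaque() /
+ * anv_sparse_bind_image_memory() and flushes it with anv_sparse_bind(), and
+ * anv_free_sparse_bindings() unbinds and releases the VMA when the resource
+ * is destroyed.
+ */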
+
+__attribute__((format(printf, 1, 2)))
+static void
+sparse_debug(const char *format, ...)
+{
+ if (!INTEL_DEBUG(DEBUG_SPARSE))
+ return;
+
+ va_list args;
+ va_start(args, format);
+ vfprintf(stderr, format, args);
+ va_end(args);
+}
+
+static void
+dump_anv_vm_bind(struct anv_device *device,
+ const struct anv_vm_bind *bind)
+{
+ sparse_debug("[%s] ", bind->op == ANV_VM_BIND ? " bind " : "unbind");
+
+ if (bind->bo)
+ sparse_debug("bo:%04u ", bind->bo->gem_handle);
+ else
+ sparse_debug("bo:---- ");
+ sparse_debug("address:%016"PRIx64" size:%08"PRIx64" "
+ "mem_offset:%08"PRIx64"\n",
+ bind->address, bind->size, bind->bo_offset);
+}
+
+static void
+dump_anv_image(struct anv_image *i)
+{
+ if (!INTEL_DEBUG(DEBUG_SPARSE))
+ return;
+
+ sparse_debug("anv_image:\n");
+ sparse_debug("- format: %d\n", i->vk.format);
+ sparse_debug("- extent: [%d, %d, %d]\n",
+ i->vk.extent.width, i->vk.extent.height, i->vk.extent.depth);
+ sparse_debug("- mip_levels: %d array_layers: %d samples: %d\n",
+ i->vk.mip_levels, i->vk.array_layers, i->vk.samples);
+ sparse_debug("- n_planes: %d\n", i->n_planes);
+ sparse_debug("- disjoint: %d\n", i->disjoint);
+}
+
+static void
+dump_isl_surf(struct isl_surf *s)
+{
+ if (!INTEL_DEBUG(DEBUG_SPARSE))
+ return;
+
+ sparse_debug("isl_surf:\n");
+
+ const char *dim_s = s->dim == ISL_SURF_DIM_1D ? "1D" :
+ s->dim == ISL_SURF_DIM_2D ? "2D" :
+ s->dim == ISL_SURF_DIM_3D ? "3D" :
+ "(ERROR)";
+ sparse_debug("- dim: %s\n", dim_s);
+ sparse_debug("- tiling: %d (%s)\n", s->tiling,
+ isl_tiling_to_name(s->tiling));
+ sparse_debug("- format: %s\n", isl_format_get_short_name(s->format));
+ sparse_debug("- image_alignment_el: [%d, %d, %d]\n",
+ s->image_alignment_el.w, s->image_alignment_el.h,
+ s->image_alignment_el.d);
+ sparse_debug("- logical_level0_px: [%d, %d, %d, %d]\n",
+ s->logical_level0_px.w,
+ s->logical_level0_px.h,
+ s->logical_level0_px.d,
+ s->logical_level0_px.a);
+ sparse_debug("- phys_level0_sa: [%d, %d, %d, %d]\n",
+ s->phys_level0_sa.w,
+ s->phys_level0_sa.h,
+ s->phys_level0_sa.d,
+ s->phys_level0_sa.a);
+ sparse_debug("- levels: %d samples: %d\n", s->levels, s->samples);
+ sparse_debug("- size_B: %"PRIu64" alignment_B: %u\n",
+ s->size_B, s->alignment_B);
+ sparse_debug("- row_pitch_B: %u\n", s->row_pitch_B);
+ sparse_debug("- array_pitch_el_rows: %u\n", s->array_pitch_el_rows);
+
+ const struct isl_format_layout *layout = isl_format_get_layout(s->format);
+ sparse_debug("- format layout:\n");
+ sparse_debug(" - format:%d bpb:%d bw:%d bh:%d bd:%d\n",
+ layout->format, layout->bpb, layout->bw, layout->bh,
+ layout->bd);
+
+ struct isl_tile_info tile_info;
+ isl_surf_get_tile_info(s, &tile_info);
+
+ sparse_debug("- tile info:\n");
+ sparse_debug(" - format_bpb: %d\n", tile_info.format_bpb);
+ sparse_debug(" - logical_extent_el: [%d, %d, %d, %d]\n",
+ tile_info.logical_extent_el.w,
+ tile_info.logical_extent_el.h,
+ tile_info.logical_extent_el.d,
+ tile_info.logical_extent_el.a);
+ sparse_debug(" - phys_extent_B: [%d, %d]\n",
+ tile_info.phys_extent_B.w,
+ tile_info.phys_extent_B.h);
+}
+
+static VkOffset3D
+vk_offset3d_px_to_el(const VkOffset3D offset_px,
+ const struct isl_format_layout *layout)
+{
+ return (VkOffset3D) {
+ .x = offset_px.x / layout->bw,
+ .y = offset_px.y / layout->bh,
+ .z = offset_px.z / layout->bd,
+ };
+}
+
+static VkOffset3D
+vk_offset3d_el_to_px(const VkOffset3D offset_el,
+ const struct isl_format_layout *layout)
+{
+ return (VkOffset3D) {
+ .x = offset_el.x * layout->bw,
+ .y = offset_el.y * layout->bh,
+ .z = offset_el.z * layout->bd,
+ };
+}
+
+static VkExtent3D
+vk_extent3d_px_to_el(const VkExtent3D extent_px,
+ const struct isl_format_layout *layout)
+{
+ return (VkExtent3D) {
+ .width = extent_px.width / layout->bw,
+ .height = extent_px.height / layout->bh,
+ .depth = extent_px.depth / layout->bd,
+ };
+}
+
+static VkExtent3D
+vk_extent3d_el_to_px(const VkExtent3D extent_el,
+ const struct isl_format_layout *layout)
+{
+ return (VkExtent3D) {
+ .width = extent_el.width * layout->bw,
+ .height = extent_el.height * layout->bh,
+ .depth = extent_el.depth * layout->bd,
+ };
+}
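+
+/* Example of the px<->el conversions above (illustrative numbers): for a
+ * BC1-compressed surface the block layout is 4x4x1 texels, so a 64x64x1 px
+ * extent maps to a 16x16x1 el extent and back.
+ */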
+
+static bool
+isl_tiling_supports_standard_block_shapes(enum isl_tiling tiling)
+{
+ return isl_tiling_is_64(tiling) ||
+ tiling == ISL_TILING_ICL_Ys ||
+ tiling == ISL_TILING_SKL_Ys;
+}
+
+static VkExtent3D
+anv_sparse_get_standard_image_block_shape(enum isl_format format,
+ VkImageType image_type,
+ uint16_t texel_size)
+{
+ const struct isl_format_layout *layout = isl_format_get_layout(format);
+ VkExtent3D block_shape = { .width = 0, .height = 0, .depth = 0 };
+
+ switch (image_type) {
+ case VK_IMAGE_TYPE_1D:
+ /* 1D images don't have a standard block format. */
+ assert(false);
+ break;
+ case VK_IMAGE_TYPE_2D:
+ switch (texel_size) {
+ case 8:
+ block_shape = (VkExtent3D) { .width = 256, .height = 256, .depth = 1 };
+ break;
+ case 16:
+ block_shape = (VkExtent3D) { .width = 256, .height = 128, .depth = 1 };
+ break;
+ case 32:
+ block_shape = (VkExtent3D) { .width = 128, .height = 128, .depth = 1 };
+ break;
+ case 64:
+ block_shape = (VkExtent3D) { .width = 128, .height = 64, .depth = 1 };
+ break;
+ case 128:
+ block_shape = (VkExtent3D) { .width = 64, .height = 64, .depth = 1 };
+ break;
+ default:
+ fprintf(stderr, "unexpected texel_size %d\n", texel_size);
+ assert(false);
+ }
+ break;
+ case VK_IMAGE_TYPE_3D:
+ switch (texel_size) {
+ case 8:
+ block_shape = (VkExtent3D) { .width = 64, .height = 32, .depth = 32 };
+ break;
+ case 16:
+ block_shape = (VkExtent3D) { .width = 32, .height = 32, .depth = 32 };
+ break;
+ case 32:
+ block_shape = (VkExtent3D) { .width = 32, .height = 32, .depth = 16 };
+ break;
+ case 64:
+ block_shape = (VkExtent3D) { .width = 32, .height = 16, .depth = 16 };
+ break;
+ case 128:
+ block_shape = (VkExtent3D) { .width = 16, .height = 16, .depth = 16 };
+ break;
+ default:
+ fprintf(stderr, "unexpected texel_size %d\n", texel_size);
+ assert(false);
+ }
+ break;
+ default:
+ fprintf(stderr, "unexpected image_type %d\n", image_type);
+ assert(false);
+ }
+
+ return vk_extent3d_el_to_px(block_shape, layout);
+}
+
+/* Adds "bind_op" to the list in "submit", first checking whether we can
+ * simply extend the previous operation instead.
+ */
+static VkResult
+anv_sparse_submission_add(struct anv_device *device,
+ struct anv_sparse_submission *submit,
+ struct anv_vm_bind *bind_op)
+{
+ struct anv_vm_bind *prev_bind = submit->binds_len == 0 ? NULL :
+ &submit->binds[submit->binds_len - 1];
+
+ if (prev_bind &&
+ bind_op->op == prev_bind->op &&
+ bind_op->bo == prev_bind->bo &&
+ bind_op->address == prev_bind->address + prev_bind->size &&
+ (bind_op->bo_offset == prev_bind->bo_offset + prev_bind->size ||
+ prev_bind->bo == NULL)) {
+ prev_bind->size += bind_op->size;
+ return VK_SUCCESS;
+ }
+
+ if (submit->binds_len < submit->binds_capacity) {
+ submit->binds[submit->binds_len++] = *bind_op;
+ return VK_SUCCESS;
+ }
+
+ int new_capacity = MAX2(32, submit->binds_capacity * 2);
+ struct anv_vm_bind *new_binds =
+ vk_realloc(&device->vk.alloc, submit->binds,
+ new_capacity * sizeof(*new_binds), 8,
+ VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
+ if (!new_binds)
+ return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
+
+ new_binds[submit->binds_len] = *bind_op;
+
+ submit->binds = new_binds;
+ submit->binds_len++;
+ submit->binds_capacity = new_capacity;
+
+ return VK_SUCCESS;
+}
+
+/* We really want to try to have all the page tables on as few BOs as possible
+ * to benefit from cache locality and to keep the i915.ko relocation lists
+ * small. On the other hand, we don't want to waste memory on unused space.
+ */
+#define ANV_TRTT_PAGE_TABLE_BO_SIZE (2 * 1024 * 1024)
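+
+/* With 2 MiB per BO and the 4 KiB tables handed out by
+ * trtt_get_page_table_bo(), each page table BO holds 512 tables.
+ */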
+
+static VkResult
+trtt_make_page_table_bo(struct anv_device *device, struct anv_bo **bo)
+{
+ VkResult result;
+ struct anv_trtt *trtt = &device->trtt;
+
+ result = anv_device_alloc_bo(device, "trtt-page-table",
+ ANV_TRTT_PAGE_TABLE_BO_SIZE,
+ ANV_BO_ALLOC_INTERNAL,
+ 0 /* explicit_address */, bo);
+ if (result != VK_SUCCESS)
+ return result;
+
+ if (trtt->num_page_table_bos < trtt->page_table_bos_capacity) {
+ trtt->page_table_bos[trtt->num_page_table_bos++] = *bo;
+ } else {
+ int new_capacity = MAX2(8, trtt->page_table_bos_capacity * 2);
+ struct anv_bo **new_page_table_bos =
+ vk_realloc(&device->vk.alloc, trtt->page_table_bos,
+ new_capacity * sizeof(*trtt->page_table_bos), 8,
+ VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
+ if (!new_page_table_bos) {
+ anv_device_release_bo(device, *bo);
+ return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
+ }
+
+ new_page_table_bos[trtt->num_page_table_bos] = *bo;
+
+ trtt->page_table_bos = new_page_table_bos;
+ trtt->page_table_bos_capacity = new_capacity;
+ trtt->num_page_table_bos++;
+ }
+
+ trtt->cur_page_table_bo = *bo;
+ trtt->next_page_table_bo_offset = 0;
+
+ sparse_debug("new number of page table BOs: %d\n",
+ trtt->num_page_table_bos);
+
+ return VK_SUCCESS;
+}
+
+static VkResult
+trtt_get_page_table_bo(struct anv_device *device, struct anv_bo **bo,
+ uint64_t *bo_addr)
+{
+ struct anv_trtt *trtt = &device->trtt;
+ VkResult result;
+
+ if (!trtt->cur_page_table_bo) {
+ result = trtt_make_page_table_bo(device, bo);
+ if (result != VK_SUCCESS)
+ return result;
+ }
+
+ *bo = trtt->cur_page_table_bo;
+ *bo_addr = trtt->cur_page_table_bo->offset +
+ trtt->next_page_table_bo_offset;
+
+ trtt->next_page_table_bo_offset += 4096;
+ if (trtt->next_page_table_bo_offset >= ANV_TRTT_PAGE_TABLE_BO_SIZE)
+ trtt->cur_page_table_bo = NULL;
+
+ return VK_SUCCESS;
+}
+
+static VkResult
+anv_trtt_init_context_state(struct anv_queue *queue)
+{
+ struct anv_device *device = queue->device;
+ struct anv_trtt *trtt = &device->trtt;
+
+ struct drm_syncobj_create create = {
+ .handle = 0,
+ .flags = 0,
+ };
+ if (intel_ioctl(device->fd, DRM_IOCTL_SYNCOBJ_CREATE, &create))
+ return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
+ assert(create.handle != 0);
+ trtt->timeline_handle = create.handle;
+
+ struct anv_bo *l3_bo;
+ VkResult result = trtt_get_page_table_bo(device, &l3_bo, &trtt->l3_addr);
+ if (result != VK_SUCCESS)
+ return result;
+
+ trtt->l3_mirror = vk_zalloc(&device->vk.alloc, 4096, 8,
+ VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
+ if (!trtt->l3_mirror) {
+ result = vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
+ return result;
+ }
+
+ /* L3 has 512 entries, so we can have up to 512 L2 tables. */
+ trtt->l2_mirror = vk_zalloc(&device->vk.alloc, 512 * 4096, 8,
+ VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
+ if (!trtt->l2_mirror) {
+ result = vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
+ goto fail_free_l3;
+ }
+
+ result = anv_genX(device->info, init_trtt_context_state)(queue);
+
+ return result;
+
+fail_free_l3:
+ vk_free(&device->vk.alloc, trtt->l3_mirror);
+ return result;
+}
+
+static void
+anv_trtt_bind_list_add_entry(struct anv_trtt_bind *binds, int *binds_len,
+ uint64_t pte_addr, uint64_t entry_addr)
+{
+ binds[*binds_len] = (struct anv_trtt_bind) {
+ .pte_addr = pte_addr,
+ .entry_addr = entry_addr,
+ };
+ (*binds_len)++;
+}
+
+/* For L3 and L2 pages, null and invalid entries are indicated by bits 1 and 0
+ * respectively. For L1 entries, the hardware compares the addresses against
+ * what we program to the GFX_TRTT_NULL and GFX_TRTT_INVAL registers.
+ */
+#define ANV_TRTT_L3L2_NULL_ENTRY (1 << 1)
+#define ANV_TRTT_L3L2_INVALID_ENTRY (1 << 0)
+
+/* Adds elements to the anv_trtt_bind structs passed. This doesn't write the
+ * entries to the HW yet.
+ */
+static VkResult
+anv_trtt_bind_add(struct anv_device *device,
+ uint64_t trtt_addr, uint64_t dest_addr,
+ struct anv_trtt_submission *s)
+{
+ VkResult result = VK_SUCCESS;
+ struct anv_trtt *trtt = &device->trtt;
+ bool is_null_bind = dest_addr == ANV_TRTT_L1_NULL_TILE_VAL;
+
+ int l3_index = (trtt_addr >> 35) & 0x1FF;
+ int l2_index = (trtt_addr >> 26) & 0x1FF;
+ int l1_index = (trtt_addr >> 16) & 0x3FF;
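+
+ /* Illustrative decomposition (hypothetical address): for
+ * trtt_addr == (3ull << 35) | (5ull << 26) | (7ull << 16), the indices
+ * above are l3_index == 3, l2_index == 5 and l1_index == 7.
+ */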
+
+ uint64_t l2_addr = trtt->l3_mirror[l3_index];
+ if (l2_addr == ANV_TRTT_L3L2_NULL_ENTRY && is_null_bind) {
+ return VK_SUCCESS;
+ } else if (l2_addr == 0 || l2_addr == ANV_TRTT_L3L2_NULL_ENTRY) {
+ if (is_null_bind) {
+ trtt->l3_mirror[l3_index] = ANV_TRTT_L3L2_NULL_ENTRY;
+
+ anv_trtt_bind_list_add_entry(s->l3l2_binds, &s->l3l2_binds_len,
+ trtt->l3_addr + l3_index * sizeof(uint64_t),
+ ANV_TRTT_L3L2_NULL_ENTRY);
+
+ return VK_SUCCESS;
+ }
+
+ struct anv_bo *l2_bo;
+ result = trtt_get_page_table_bo(device, &l2_bo, &l2_addr);
+ if (result != VK_SUCCESS)
+ return result;
+
+ trtt->l3_mirror[l3_index] = l2_addr;
+
+ anv_trtt_bind_list_add_entry(s->l3l2_binds, &s->l3l2_binds_len,
+ trtt->l3_addr + l3_index * sizeof(uint64_t), l2_addr);
+ }
+ assert(l2_addr != 0 && l2_addr != ANV_TRTT_L3L2_NULL_ENTRY);
+
+ /* The first page in the l2_mirror corresponds to l3_index=0 and so on. */
+ uint64_t l1_addr = trtt->l2_mirror[l3_index * 512 + l2_index];
+ if (l1_addr == ANV_TRTT_L3L2_NULL_ENTRY && is_null_bind) {
+ return VK_SUCCESS;
+ } else if (l1_addr == 0 || l1_addr == ANV_TRTT_L3L2_NULL_ENTRY) {
+ if (is_null_bind) {
+ trtt->l2_mirror[l3_index * 512 + l2_index] =
+ ANV_TRTT_L3L2_NULL_ENTRY;
+
+ anv_trtt_bind_list_add_entry(s->l3l2_binds, &s->l3l2_binds_len,
+ l2_addr + l2_index * sizeof(uint64_t),
+ ANV_TRTT_L3L2_NULL_ENTRY);
+
+ return VK_SUCCESS;
+ }
+
+ struct anv_bo *l1_bo;
+ result = trtt_get_page_table_bo(device, &l1_bo, &l1_addr);
+ if (result != VK_SUCCESS)
+ return result;
+
+ trtt->l2_mirror[l3_index * 512 + l2_index] = l1_addr;
+
+ anv_trtt_bind_list_add_entry(s->l3l2_binds, &s->l3l2_binds_len,
+ l2_addr + l2_index * sizeof(uint64_t), l1_addr);
+ }
+ assert(l1_addr != 0 && l1_addr != ANV_TRTT_L3L2_NULL_ENTRY);
+
+ anv_trtt_bind_list_add_entry(s->l1_binds, &s->l1_binds_len,
+ l1_addr + l1_index * sizeof(uint32_t), dest_addr);
+
+ return VK_SUCCESS;
+}
+
+static VkResult
+anv_sparse_bind_trtt(struct anv_device *device,
+ struct anv_sparse_submission *sparse_submit)
+{
+ struct anv_trtt *trtt = &device->trtt;
+ VkResult result;
+
+ /* TR-TT submission needs a queue even when the API entry point doesn't
+ * give one, such as resource creation. */
+ if (!sparse_submit->queue)
+ sparse_submit->queue = trtt->queue;
+
+ /* These capacities are conservative estimations. For L1 binds the
+ * number will match exactly unless we skip NULL binds due to L2 already
+ * being NULL. For L3/L2 things are harder to estimate, but the resulting
+ * numbers are so small that a little overestimation won't hurt.
+ *
+ * We have assertions below to catch estimation errors.
+ */
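+ /* Worked example with hypothetical numbers: a single 128 MiB bind spans
+ * 2048 64 KiB pages, so l1_binds_capacity ends up as 2048 and
+ * l3l2_binds_capacity as 1 + (2048 / 1024 + 1) * 2 = 7.
+ */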
+ int l3l2_binds_capacity = 1;
+ int l1_binds_capacity = 0;
+ for (int b = 0; b < sparse_submit->binds_len; b++) {
+ assert(sparse_submit->binds[b].size % (64 * 1024) == 0);
+ int pages = sparse_submit->binds[b].size / (64 * 1024);
+ l1_binds_capacity += pages;
+ l3l2_binds_capacity += (pages / 1024 + 1) * 2;
+ }
+
+ STACK_ARRAY(struct anv_trtt_bind, l3l2_binds, l3l2_binds_capacity);
+ STACK_ARRAY(struct anv_trtt_bind, l1_binds, l1_binds_capacity);
+ struct anv_trtt_submission trtt_submit = {
+ .sparse = sparse_submit,
+ .l3l2_binds = l3l2_binds,
+ .l1_binds = l1_binds,
+ .l3l2_binds_len = 0,
+ .l1_binds_len = 0,
+ };
+
+ pthread_mutex_lock(&trtt->mutex);
+
+ if (!trtt->l3_addr)
+ anv_trtt_init_context_state(sparse_submit->queue);
+
+ assert(trtt->l3_addr);
+
+ for (int b = 0; b < sparse_submit->binds_len; b++) {
+ struct anv_vm_bind *vm_bind = &sparse_submit->binds[b];
+ for (size_t i = 0; i < vm_bind->size; i += 64 * 1024) {
+ uint64_t trtt_addr = vm_bind->address + i;
+ uint64_t dest_addr =
+ (vm_bind->op == ANV_VM_BIND && vm_bind->bo) ?
+ vm_bind->bo->offset + vm_bind->bo_offset + i :
+ ANV_TRTT_L1_NULL_TILE_VAL;
+
+ result = anv_trtt_bind_add(device, trtt_addr, dest_addr,
+ &trtt_submit);
+ if (result != VK_SUCCESS)
+ goto out;
+ }
+ }
+
+ assert(trtt_submit.l3l2_binds_len <= l3l2_binds_capacity);
+ assert(trtt_submit.l1_binds_len <= l1_binds_capacity);
+
+ sparse_debug("trtt_binds: num_vm_binds:%02d l3l2:%04d l1:%04d\n",
+ sparse_submit->binds_len, trtt_submit.l3l2_binds_len,
+ trtt_submit.l1_binds_len);
+
+ if (trtt_submit.l3l2_binds_len || trtt_submit.l1_binds_len)
+ result = anv_genX(device->info, write_trtt_entries)(&trtt_submit);
+
+ if (result == VK_SUCCESS)
+ ANV_RMV(vm_binds, device, sparse_submit->binds, sparse_submit->binds_len);
+
+out:
+ pthread_mutex_unlock(&trtt->mutex);
+ STACK_ARRAY_FINISH(l1_binds);
+ STACK_ARRAY_FINISH(l3l2_binds);
+ return result;
+}
+
+static VkResult
+anv_sparse_bind_vm_bind(struct anv_device *device,
+ struct anv_sparse_submission *submit)
+{
+ struct anv_queue *queue = submit->queue;
+
+ if (!queue)
+ assert(submit->wait_count == 0 && submit->signal_count == 0);
+
+ return device->kmd_backend->vm_bind(device, submit, ANV_VM_BIND_FLAG_NONE);
+}
+
+VkResult
+anv_sparse_bind(struct anv_device *device,
+ struct anv_sparse_submission *submit)
+{
+ if (INTEL_DEBUG(DEBUG_SPARSE)) {
+ for (int b = 0; b < submit->binds_len; b++)
+ dump_anv_vm_bind(device, &submit->binds[b]);
+ }
+
+ return device->physical->sparse_type == ANV_SPARSE_TYPE_TRTT ?
+ anv_sparse_bind_trtt(device, submit) :
+ anv_sparse_bind_vm_bind(device, submit);
+}
+
+VkResult
+anv_init_sparse_bindings(struct anv_device *device,
+ uint64_t size_,
+ struct anv_sparse_binding_data *sparse,
+ enum anv_bo_alloc_flags alloc_flags,
+ uint64_t client_address,
+ struct anv_address *out_address)
+{
+ uint64_t size = align64(size_, ANV_SPARSE_BLOCK_SIZE);
+
+ if (device->physical->sparse_type == ANV_SPARSE_TYPE_TRTT)
+ alloc_flags |= ANV_BO_ALLOC_TRTT;
+
+ sparse->address = anv_vma_alloc(device, size, ANV_SPARSE_BLOCK_SIZE,
+ alloc_flags,
+ intel_48b_address(client_address),
+ &sparse->vma_heap);
+ sparse->size = size;
+
+ out_address->bo = NULL;
+ out_address->offset = sparse->address;
+
+ struct anv_vm_bind bind = {
+ .bo = NULL, /* That's a NULL binding. */
+ .address = sparse->address,
+ .bo_offset = 0,
+ .size = size,
+ .op = ANV_VM_BIND,
+ };
+ struct anv_sparse_submission submit = {
+ .queue = NULL,
+ .binds = &bind,
+ .binds_len = 1,
+ .binds_capacity = 1,
+ .wait_count = 0,
+ .signal_count = 0,
+ };
+ VkResult res = anv_sparse_bind(device, &submit);
+ if (res != VK_SUCCESS) {
+ anv_vma_free(device, sparse->vma_heap, sparse->address, sparse->size);
+ return res;
+ }
+
+ p_atomic_inc(&device->num_sparse_resources);
+ return VK_SUCCESS;
+}
+
+void
+anv_free_sparse_bindings(struct anv_device *device,
+ struct anv_sparse_binding_data *sparse)
+{
+ if (!sparse->address)
+ return;
+
+ sparse_debug("%s: address:0x%016"PRIx64" size:0x%08"PRIx64"\n",
+ __func__, sparse->address, sparse->size);
+
+ p_atomic_dec(&device->num_sparse_resources);
+
+ struct anv_vm_bind unbind = {
+ .bo = NULL,
+ .address = sparse->address,
+ .bo_offset = 0,
+ .size = sparse->size,
+ .op = ANV_VM_UNBIND,
+ };
+ struct anv_sparse_submission submit = {
+ .queue = NULL,
+ .binds = &unbind,
+ .binds_len = 1,
+ .binds_capacity = 1,
+ .wait_count = 0,
+ .signal_count = 0,
+ };
+ VkResult res = anv_sparse_bind(device, &submit);
+
+ /* Our callers don't have a way to signal failure to the upper layers, so
+ * just keep the vma if we fail to unbind it. Still, let's have an
+ * assertion because this really shouldn't be happening.
+ */
+ assert(res == VK_SUCCESS);
+ if (res != VK_SUCCESS)
+ return;
+
+ anv_vma_free(device, sparse->vma_heap, sparse->address, sparse->size);
+}
+
+static VkExtent3D
+anv_sparse_calc_block_shape(struct anv_physical_device *pdevice,
+ struct isl_surf *surf)
+{
+ const struct isl_format_layout *layout =
+ isl_format_get_layout(surf->format);
+ const int Bpb = layout->bpb / 8;
+
+ struct isl_tile_info tile_info;
+ isl_surf_get_tile_info(surf, &tile_info);
+
+ VkExtent3D block_shape_el = {
+ .width = tile_info.logical_extent_el.width,
+ .height = tile_info.logical_extent_el.height,
+ .depth = tile_info.logical_extent_el.depth,
+ };
+ VkExtent3D block_shape_px = vk_extent3d_el_to_px(block_shape_el, layout);
+
+ if (surf->tiling == ISL_TILING_LINEAR) {
+ uint32_t elements_per_row = surf->row_pitch_B /
+ (block_shape_el.width * Bpb);
+ uint32_t rows_per_tile = ANV_SPARSE_BLOCK_SIZE /
+ (elements_per_row * Bpb);
+ assert(rows_per_tile * elements_per_row * Bpb == ANV_SPARSE_BLOCK_SIZE);
+
+ block_shape_px = (VkExtent3D) {
+ .width = elements_per_row * layout->bw,
+ .height = rows_per_tile * layout->bh,
+ .depth = layout->bd,
+ };
+ }
+
+ return block_shape_px;
+}
+
+VkSparseImageFormatProperties
+anv_sparse_calc_image_format_properties(struct anv_physical_device *pdevice,
+ VkImageAspectFlags aspect,
+ VkImageType vk_image_type,
+ struct isl_surf *surf)
+{
+ const struct isl_format_layout *isl_layout =
+ isl_format_get_layout(surf->format);
+ const int bpb = isl_layout->bpb;
+ assert(bpb == 8 || bpb == 16 || bpb == 32 || bpb == 64 || bpb == 128);
+ const int Bpb = bpb / 8;
+
+ VkExtent3D granularity = anv_sparse_calc_block_shape(pdevice, surf);
+ bool is_standard = false;
+ bool is_known_nonstandard_format = false;
+
+ if (vk_image_type != VK_IMAGE_TYPE_1D) {
+ VkExtent3D std_shape =
+ anv_sparse_get_standard_image_block_shape(surf->format, vk_image_type,
+ bpb);
+ /* YUV formats don't work with Tile64, which is required if we want to
+ * claim standard block shapes. The spec requires us to support all
+ * non-compressed color formats that non-sparse supports, so we can't
+ * just say YUV formats are not supported by Sparse. So we end
+ * supporting this format and anv_sparse_calc_miptail_properties() will
+ * say that everything is part of the miptail.
+ *
+ * For more details on the hardware restriction, please check
+ * isl_gfx125_filter_tiling().
+ */
+ if (pdevice->info.verx10 >= 125 && isl_format_is_yuv(surf->format))
+ is_known_nonstandard_format = true;
+
+ /* The standard block shapes (and by extension, the tiling formats they
+ * require) are simply incompatible with getting a 2D view of a 3D
+ * image.
+ */
+ if (surf->usage & ISL_SURF_USAGE_2D_3D_COMPATIBLE_BIT)
+ is_known_nonstandard_format = true;
+
+ is_standard = granularity.width == std_shape.width &&
+ granularity.height == std_shape.height &&
+ granularity.depth == std_shape.depth;
+
+ /* TODO: dEQP seems to care about the block shapes being standard even
+ * for the cases where is_known_nonstandard_format is true. Luckily as
+ * of today all of those cases are NotSupported but sooner or later we
+ * may end up getting a failure.
+ * Notice that in practice we report these cases as having the mip tail
+ * starting on mip level 0, so the reported block shapes are irrelevant
+ * since non-opaque binds are not supported. Still, dEQP seems to care.
+ */
+ assert(is_standard || is_known_nonstandard_format);
+ }
+
+ uint32_t block_size = granularity.width * granularity.height *
+ granularity.depth * Bpb;
+ bool wrong_block_size = block_size != ANV_SPARSE_BLOCK_SIZE;
+
+ return (VkSparseImageFormatProperties) {
+ .aspectMask = aspect,
+ .imageGranularity = granularity,
+ .flags = ((is_standard || is_known_nonstandard_format) ? 0 :
+ VK_SPARSE_IMAGE_FORMAT_NONSTANDARD_BLOCK_SIZE_BIT) |
+ (wrong_block_size ? VK_SPARSE_IMAGE_FORMAT_SINGLE_MIPTAIL_BIT :
+ 0),
+ };
+}
+
+/* The miptail is supposed to be this region where the tiniest mip levels
+ * are squished together in one single page, which should save us some memory.
+ * It's a hardware feature that is only available with certain tiling
+ * formats - the ones we always want to use for sparse resources.
+ *
+ * For sparse, the main feature of the miptail is that it only supports opaque
+ * binds, so you either bind the whole miptail or you bind nothing at all,
+ * there are no subresources inside it to separately bind. While the idea is
+ * that the miptail as reported by sparse should match what our hardware does,
+ * in practice we can say in our sparse functions that certain mip levels are
+ * part of the miptail while from the point of view of our hardwared they
+ * aren't.
+ *
+ * If we detect we're using the sparse-friendly tiling formats and ISL
+ * supports miptails for them, we can just trust the miptail level set by ISL
+ * and things can proceed as The Spec intended.
+ *
+ * However, if that's not the case, we have to go on a best-effort policy. We
+ * could simply declare that every mip level is part of the miptail and be
+ * done, but since that kinda defeats the purpose of Sparse we try to find
+ * what level we really should be reporting as the first miptail level based
+ * on the alignments of the surface subresources.
+ */
+void
+anv_sparse_calc_miptail_properties(struct anv_device *device,
+ struct anv_image *image,
+ VkImageAspectFlags vk_aspect,
+ uint32_t *imageMipTailFirstLod,
+ VkDeviceSize *imageMipTailSize,
+ VkDeviceSize *imageMipTailOffset,
+ VkDeviceSize *imageMipTailStride)
+{
+ const uint32_t plane = anv_image_aspect_to_plane(image, vk_aspect);
+ struct isl_surf *surf = &image->planes[plane].primary_surface.isl;
+ uint64_t binding_plane_offset =
+ image->planes[plane].primary_surface.memory_range.offset;
+ const struct isl_format_layout *isl_layout =
+ isl_format_get_layout(surf->format);
+ const int Bpb = isl_layout->bpb / 8;
+ struct isl_tile_info tile_info;
+ isl_surf_get_tile_info(surf, &tile_info);
+ uint32_t tile_size = tile_info.logical_extent_el.width * Bpb *
+ tile_info.logical_extent_el.height *
+ tile_info.logical_extent_el.depth;
+
+ uint64_t layer1_offset;
+ uint32_t x_off, y_off;
+
+ /* Treat the whole thing as a single miptail. We should have already
+ * reported this image as VK_SPARSE_IMAGE_FORMAT_SINGLE_MIPTAIL_BIT.
+ *
+ * In theory we could try to make ISL massage the alignments so that we
+ * could at least claim mip level 0 to be not part of the miptail, but
+ * that could end up wasting a lot of memory, so it's better to do
+ * nothing and focus our efforts on making things use the appropriate
+ * tiling formats that give us the standard block shapes.
+ */
+ if (tile_size != ANV_SPARSE_BLOCK_SIZE)
+ goto out_everything_is_miptail;
+
+ assert(surf->tiling != ISL_TILING_LINEAR);
+
+ if (image->vk.array_layers == 1) {
+ layer1_offset = surf->size_B;
+ } else {
+ isl_surf_get_image_offset_B_tile_sa(surf, 0, 1, 0, &layer1_offset,
+ &x_off, &y_off);
+ if (x_off || y_off)
+ goto out_everything_is_miptail;
+ }
+ assert(layer1_offset % tile_size == 0);
+
+ /* We could try to do better here, but there's not really any point since
+ * we should be supporting the appropriate tiling formats everywhere.
+ */
+ if (!isl_tiling_supports_standard_block_shapes(surf->tiling))
+ goto out_everything_is_miptail;
+
+ int miptail_first_level = surf->miptail_start_level;
+ if (miptail_first_level >= image->vk.mip_levels)
+ goto out_no_miptail;
+
+ uint64_t miptail_offset = 0;
+ isl_surf_get_image_offset_B_tile_sa(surf, miptail_first_level, 0, 0,
+ &miptail_offset,
+ &x_off, &y_off);
+ assert(x_off == 0 && y_off == 0);
+ assert(miptail_offset % tile_size == 0);
+
+ *imageMipTailFirstLod = miptail_first_level;
+ *imageMipTailSize = tile_size;
+ *imageMipTailOffset = binding_plane_offset + miptail_offset;
+ *imageMipTailStride = layer1_offset;
+ goto out_debug;
+
+out_no_miptail:
+ *imageMipTailFirstLod = image->vk.mip_levels;
+ *imageMipTailSize = 0;
+ *imageMipTailOffset = 0;
+ *imageMipTailStride = 0;
+ goto out_debug;
+
+out_everything_is_miptail:
+ *imageMipTailFirstLod = 0;
+ *imageMipTailSize = surf->size_B;
+ *imageMipTailOffset = binding_plane_offset;
+ *imageMipTailStride = 0;
+
+out_debug:
+ sparse_debug("miptail first_lod:%d size:%"PRIu64" offset:%"PRIu64" "
+ "stride:%"PRIu64"\n",
+ *imageMipTailFirstLod, *imageMipTailSize,
+ *imageMipTailOffset, *imageMipTailStride);
+}
+
+static struct anv_vm_bind
+vk_bind_to_anv_vm_bind(struct anv_sparse_binding_data *sparse,
+ const struct VkSparseMemoryBind *vk_bind)
+{
+ struct anv_vm_bind anv_bind = {
+ .bo = NULL,
+ .address = sparse->address + vk_bind->resourceOffset,
+ .bo_offset = 0,
+ .size = vk_bind->size,
+ .op = ANV_VM_BIND,
+ };
+
+ assert(vk_bind->size);
+ assert(vk_bind->resourceOffset + vk_bind->size <= sparse->size);
+
+ if (vk_bind->memory != VK_NULL_HANDLE) {
+ anv_bind.bo = anv_device_memory_from_handle(vk_bind->memory)->bo;
+ anv_bind.bo_offset = vk_bind->memoryOffset;
+ assert(vk_bind->memoryOffset + vk_bind->size <= anv_bind.bo->size);
+ }
+
+ return anv_bind;
+}
+
+static VkResult
+anv_sparse_bind_resource_memory(struct anv_device *device,
+ struct anv_sparse_binding_data *sparse,
+ uint64_t resource_size,
+ const VkSparseMemoryBind *vk_bind,
+ struct anv_sparse_submission *submit)
+{
+ struct anv_vm_bind bind = vk_bind_to_anv_vm_bind(sparse, vk_bind);
+ uint64_t rem = vk_bind->size % ANV_SPARSE_BLOCK_SIZE;
+
+ if (rem != 0) {
+ if (vk_bind->resourceOffset + vk_bind->size == resource_size)
+ bind.size += ANV_SPARSE_BLOCK_SIZE - rem;
+ else
+ return vk_error(device, VK_ERROR_VALIDATION_FAILED_EXT);
+ }
+
+ return anv_sparse_submission_add(device, submit, &bind);
+}
+
+VkResult
+anv_sparse_bind_buffer(struct anv_device *device,
+ struct anv_buffer *buffer,
+ const VkSparseMemoryBind *vk_bind,
+ struct anv_sparse_submission *submit)
+{
+ return anv_sparse_bind_resource_memory(device, &buffer->sparse_data,
+ buffer->vk.size,
+ vk_bind, submit);
+}
+
+VkResult
+anv_sparse_bind_image_opaque(struct anv_device *device,
+ struct anv_image *image,
+ const VkSparseMemoryBind *vk_bind,
+ struct anv_sparse_submission *submit)
+{
+ struct anv_image_binding *b =
+ &image->bindings[ANV_IMAGE_MEMORY_BINDING_MAIN];
+ assert(!image->disjoint);
+
+ return anv_sparse_bind_resource_memory(device, &b->sparse_data,
+ b->memory_range.size,
+ vk_bind, submit);
+}
+
+VkResult
+anv_sparse_bind_image_memory(struct anv_queue *queue,
+ struct anv_image *image,
+ const VkSparseImageMemoryBind *bind,
+ struct anv_sparse_submission *submit)
+{
+ struct anv_device *device = queue->device;
+ VkImageAspectFlags aspect = bind->subresource.aspectMask;
+ uint32_t mip_level = bind->subresource.mipLevel;
+ uint32_t array_layer = bind->subresource.arrayLayer;
+
+ assert(!(bind->flags & VK_SPARSE_MEMORY_BIND_METADATA_BIT));
+
+ struct anv_image_binding *img_binding = image->disjoint ?
+ anv_image_aspect_to_binding(image, aspect) :
+ &image->bindings[ANV_IMAGE_MEMORY_BINDING_MAIN];
+ struct anv_sparse_binding_data *sparse_data = &img_binding->sparse_data;
+
+ const uint32_t plane = anv_image_aspect_to_plane(image, aspect);
+ struct isl_surf *surf = &image->planes[plane].primary_surface.isl;
+ uint64_t binding_plane_offset =
+ image->planes[plane].primary_surface.memory_range.offset;
+ const struct isl_format_layout *layout =
+ isl_format_get_layout(surf->format);
+
+ if (INTEL_DEBUG(DEBUG_SPARSE)) {
+ sparse_debug("%s:", __func__);
+ sparse_debug("mip_level:%d array_layer:%d\n", mip_level, array_layer);
+ sparse_debug("aspect:0x%x plane:%d\n", aspect, plane);
+ sparse_debug("binding offset: [%d, %d, %d] extent: [%d, %d, %d]\n",
+ bind->offset.x, bind->offset.y, bind->offset.z,
+ bind->extent.width, bind->extent.height,
+ bind->extent.depth);
+ dump_anv_image(image);
+ dump_isl_surf(surf);
+ sparse_debug("\n");
+ }
+
+ VkExtent3D block_shape_px =
+ anv_sparse_calc_block_shape(device->physical, surf);
+ VkExtent3D block_shape_el = vk_extent3d_px_to_el(block_shape_px, layout);
+
+ /* Both bind->offset and bind->extent are in pixel units. */
+ VkOffset3D bind_offset_el = vk_offset3d_px_to_el(bind->offset, layout);
+
+ /* The spec says we only really need to align when, for a given coordinate,
+ * offset + extent equals the corresponding dimension of the image
+ * subresource, but all other non-aligned usage is invalid, so just
+ * align everything.
+ */
+ VkExtent3D bind_extent_px = {
+ .width = ALIGN_NPOT(bind->extent.width, block_shape_px.width),
+ .height = ALIGN_NPOT(bind->extent.height, block_shape_px.height),
+ .depth = ALIGN_NPOT(bind->extent.depth, block_shape_px.depth),
+ };
+ VkExtent3D bind_extent_el = vk_extent3d_px_to_el(bind_extent_px, layout);
+
+ /* A sparse block should correspond to our tile size, so this has to be
+ * either 4k or 64k depending on the tiling format. */
+ const uint64_t block_size_B = block_shape_el.width * (layout->bpb / 8) *
+ block_shape_el.height *
+ block_shape_el.depth;
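+ /* For example (assuming the standard 2D block shapes earlier in this file):
+ * a 32 bpp surface uses 128x128x1 blocks, i.e. 128 * 128 * 4 B = 64 KiB per
+ * block.
+ */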
+ /* How many blocks are necessary to form a whole line on this image? */
+ const uint32_t blocks_per_line = surf->row_pitch_B / (layout->bpb / 8) /
+ block_shape_el.width;
+ /* The loop below will try to bind a whole line of blocks at a time as
+ * they're guaranteed to be contiguous, so we calculate how many blocks
+ * that is and how big is each block to figure the bind size of a whole
+ * line.
+ */
+ uint64_t line_bind_size_in_blocks = bind_extent_el.width /
+ block_shape_el.width;
+ uint64_t line_bind_size = line_bind_size_in_blocks * block_size_B;
+ assert(line_bind_size_in_blocks != 0);
+ assert(line_bind_size != 0);
+
+ uint64_t memory_offset = bind->memoryOffset;
+ for (uint32_t z = bind_offset_el.z;
+ z < bind_offset_el.z + bind_extent_el.depth;
+ z += block_shape_el.depth) {
+ uint64_t subresource_offset_B;
+ uint32_t subresource_x_offset, subresource_y_offset;
+ isl_surf_get_image_offset_B_tile_sa(surf, mip_level, array_layer, z,
+ &subresource_offset_B,
+ &subresource_x_offset,
+ &subresource_y_offset);
+ assert(subresource_x_offset == 0 && subresource_y_offset == 0);
+ assert(subresource_offset_B % block_size_B == 0);
+
+ for (uint32_t y = bind_offset_el.y;
+ y < bind_offset_el.y + bind_extent_el.height;
+ y += block_shape_el.height) {
+ uint32_t line_block_offset = y / block_shape_el.height *
+ blocks_per_line;
+ uint64_t line_start_B = subresource_offset_B +
+ line_block_offset * block_size_B;
+ uint64_t bind_offset_B = line_start_B +
+ (bind_offset_el.x / block_shape_el.width) *
+ block_size_B;
+
+ VkSparseMemoryBind opaque_bind = {
+ .resourceOffset = binding_plane_offset + bind_offset_B,
+ .size = line_bind_size,
+ .memory = bind->memory,
+ .memoryOffset = memory_offset,
+ .flags = bind->flags,
+ };
+
+ memory_offset += line_bind_size;
+
+ assert(line_start_B % block_size_B == 0);
+ assert(opaque_bind.resourceOffset % block_size_B == 0);
+ assert(opaque_bind.size % block_size_B == 0);
+
+ struct anv_vm_bind anv_bind = vk_bind_to_anv_vm_bind(sparse_data,
+ &opaque_bind);
+ VkResult result = anv_sparse_submission_add(device, submit,
+ &anv_bind);
+ if (result != VK_SUCCESS)
+ return result;
+ }
+ }
+
+ return VK_SUCCESS;
+}
+
+VkResult
+anv_sparse_image_check_support(struct anv_physical_device *pdevice,
+ VkImageCreateFlags flags,
+ VkImageTiling tiling,
+ VkSampleCountFlagBits samples,
+ VkImageType type,
+ VkFormat vk_format)
+{
+ assert(flags & VK_IMAGE_CREATE_SPARSE_BINDING_BIT);
+
+ /* The spec says:
+ * "A sparse image created using VK_IMAGE_CREATE_SPARSE_BINDING_BIT (but
+ * not VK_IMAGE_CREATE_SPARSE_RESIDENCY_BIT) supports all formats that
+ * non-sparse usage supports, and supports both VK_IMAGE_TILING_OPTIMAL
+ * and VK_IMAGE_TILING_LINEAR tiling."
+ */
+ if (!(flags & VK_IMAGE_CREATE_SPARSE_RESIDENCY_BIT))
+ return VK_SUCCESS;
+
+ /* From here on, these are the rules:
+ * "A sparse image created using VK_IMAGE_CREATE_SPARSE_RESIDENCY_BIT
+ * supports all non-compressed color formats with power-of-two element
+ * size that non-sparse usage supports. Additional formats may also be
+ * supported and can be queried via
+ * vkGetPhysicalDeviceSparseImageFormatProperties.
+ * VK_IMAGE_TILING_LINEAR tiling is not supported."
+ */
+
+ /* We choose not to support sparse residency on emulated compressed
+ * formats due to the additional image plane. It would make the
+ * implementation extremely complicated.
+ */
+ if (anv_is_format_emulated(pdevice, vk_format))
+ return VK_ERROR_FORMAT_NOT_SUPPORTED;
+
+ /* While the spec itself says linear is not supported (see above), deqp-vk
+ * tries to create linear sparse images anyway, so we have to check for it.
+ * This is also said in VUID-VkImageCreateInfo-tiling-04121:
+ * "If tiling is VK_IMAGE_TILING_LINEAR, flags must not contain
+ * VK_IMAGE_CREATE_SPARSE_RESIDENCY_BIT"
+ */
+ if (tiling == VK_IMAGE_TILING_LINEAR)
+ return VK_ERROR_FORMAT_NOT_SUPPORTED;
+
+ /* TODO: not supported yet. */
+ if (samples != VK_SAMPLE_COUNT_1_BIT)
+ return VK_ERROR_FEATURE_NOT_PRESENT;
+
+ /* While the Vulkan spec allows us to support depth/stencil sparse images
+ * everywhere, sometimes we're not able to have them with the tiling
+ * formats that give us the standard block shapes. Having standard block
+ * shapes is higher priority than supporting depth/stencil sparse images.
+ *
+ * Please see ISL's filter_tiling() functions for accurate explanations of
+ * why depth/stencil images are not always supported with the tiling
+ * formats we want. But in short: depth/stencil support in our HW is
+ * limited to 2D and we can't build a 2D view of a 3D image with these
+ * tiling formats due to the address swizzling being different.
+ */
+ VkImageAspectFlags aspects = vk_format_aspects(vk_format);
+ if (aspects & (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT)) {
+ /* For 125+, isl_gfx125_filter_tiling() claims 3D is not supported.
+ * For the previous platforms, isl_gfx6_filter_tiling() says only 2D is
+ * supported.
+ */
+ if (pdevice->info.verx10 >= 125) {
+ if (type == VK_IMAGE_TYPE_3D)
+ return VK_ERROR_FORMAT_NOT_SUPPORTED;
+ } else {
+ if (type != VK_IMAGE_TYPE_2D)
+ return VK_ERROR_FORMAT_NOT_SUPPORTED;
+ }
+ }
+
+ const struct anv_format *anv_format = anv_get_format(vk_format);
+ if (!anv_format)
+ return VK_ERROR_FORMAT_NOT_SUPPORTED;
+
+ for (int p = 0; p < anv_format->n_planes; p++) {
+ enum isl_format isl_format = anv_format->planes[p].isl_format;
+
+ if (isl_format == ISL_FORMAT_UNSUPPORTED)
+ return VK_ERROR_FORMAT_NOT_SUPPORTED;
+
+ const struct isl_format_layout *isl_layout =
+ isl_format_get_layout(isl_format);
+
+ /* As quoted above, we only need to support the power-of-two formats.
+ * The problem with the non-power-of-two formats is that we need an
+ * integer number of pixels to fit into a sparse block, so we'd need the
+ * sparse block sizes to be, for example, 192k for 24bpp.
+ *
+ * TODO: add support for these formats.
+ */
+ if (isl_layout->bpb != 8 && isl_layout->bpb != 16 &&
+ isl_layout->bpb != 32 && isl_layout->bpb != 64 &&
+ isl_layout->bpb != 128)
+ return VK_ERROR_FORMAT_NOT_SUPPORTED;
+ }
+
+ /* These YUV formats are considered by Vulkan to be compressed 2x1 blocks.
+ * We don't need to support them since they're compressed. On Gfx12 we
+ * can't even have Tile64 for them. Once we do support these formats we'll
+ * have to report the correct block shapes because dEQP cares about them,
+ * and we'll have to adjust for the fact that ISL treats these as 16bpp 1x1
+ * blocks instead of 32bpp 2x1 compressed blocks (as block shapes are
+ * reported in units of compressed blocks).
+ */
+ if (vk_format == VK_FORMAT_G8B8G8R8_422_UNORM ||
+ vk_format == VK_FORMAT_B8G8R8G8_422_UNORM)
+ return VK_ERROR_FORMAT_NOT_SUPPORTED;
+
+ return VK_SUCCESS;
+}
+
+static VkResult
+anv_trtt_garbage_collect_batches(struct anv_device *device)
+{
+ struct anv_trtt *trtt = &device->trtt;
+
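+ /* Throttle: only pay for the syncobj query ioctl once every 8 batches. */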
+ if (trtt->timeline_val % 8 != 7)
+ return VK_SUCCESS;
+
+ uint64_t cur_timeline_val = 0;
+ struct drm_syncobj_timeline_array array = {
+ .handles = (uintptr_t)&trtt->timeline_handle,
+ .points = (uintptr_t)&cur_timeline_val,
+ .count_handles = 1,
+ .flags = 0,
+ };
+ if (intel_ioctl(device->fd, DRM_IOCTL_SYNCOBJ_QUERY, &array))
+ return vk_error(device, VK_ERROR_UNKNOWN);
+
+ list_for_each_entry_safe(struct anv_trtt_batch_bo, trtt_bbo,
+ &trtt->in_flight_batches, link) {
+ if (trtt_bbo->timeline_val > cur_timeline_val)
+ return VK_SUCCESS;
+
+ anv_trtt_batch_bo_free(device, trtt_bbo);
+ }
+
+ return VK_SUCCESS;
+}
+
+VkResult
+anv_trtt_batch_bo_new(struct anv_device *device, uint32_t batch_size,
+ struct anv_trtt_batch_bo **out_trtt_bbo)
+{
+ struct anv_trtt *trtt = &device->trtt;
+ VkResult result;
+
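+ /* Opportunistically free batch BOs whose binds have already completed. */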
+ anv_trtt_garbage_collect_batches(device);
+
+ struct anv_trtt_batch_bo *trtt_bbo =
+ vk_alloc(&device->vk.alloc, sizeof(*trtt_bbo), 8,
+ VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
+ if (!trtt_bbo)
+ return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
+
+ result = anv_bo_pool_alloc(&device->batch_bo_pool, batch_size,
+ &trtt_bbo->bo);
+ if (result != VK_SUCCESS)
+ goto out;
+
+ trtt_bbo->size = batch_size;
+ trtt_bbo->timeline_val = ++trtt->timeline_val;
+
+ list_addtail(&trtt_bbo->link, &trtt->in_flight_batches);
+
+ *out_trtt_bbo = trtt_bbo;
+
+ return VK_SUCCESS;
+out:
+ vk_free(&device->vk.alloc, trtt_bbo);
+ return result;
+}
diff --git a/src/intel/vulkan/anv_util.c b/src/intel/vulkan/anv_util.c
index b06ee760f70..15a160b6194 100644
--- a/src/intel/vulkan/anv_util.c
+++ b/src/intel/vulkan/anv_util.c
@@ -31,24 +31,6 @@
#include "anv_private.h"
#include "vk_enum_to_str.h"
-/** Log an error message. */
-void anv_printflike(1, 2)
-anv_loge(const char *format, ...)
-{
- va_list va;
-
- va_start(va, format);
- anv_loge_v(format, va);
- va_end(va);
-}
-
-/** \see anv_loge() */
-void
-anv_loge_v(const char *format, va_list va)
-{
- mesa_loge_v(format, va);
-}
-
void
__anv_perf_warn(struct anv_device *device,
const struct vk_object_base *object,
@@ -56,91 +38,119 @@ __anv_perf_warn(struct anv_device *device,
{
va_list ap;
char buffer[256];
- char report[512];
va_start(ap, format);
vsnprintf(buffer, sizeof(buffer), format, ap);
va_end(ap);
- snprintf(report, sizeof(report), "%s: %s", file, buffer);
-
- vk_debug_report(&device->physical->instance->vk,
- VK_DEBUG_REPORT_PERFORMANCE_WARNING_BIT_EXT,
- object, line, 0, "anv", report);
-
- mesa_logw("%s:%d: PERF: %s", file, line, buffer);
-}
-
-VkResult
-__vk_errorv(struct anv_instance *instance,
- const struct vk_object_base *object, VkResult error,
- const char *file, int line, const char *format, va_list ap)
-{
- char buffer[256];
- char report[512];
-
- const char *error_str = vk_Result_to_str(error);
-
- if (format) {
- vsnprintf(buffer, sizeof(buffer), format, ap);
-
- snprintf(report, sizeof(report), "%s:%d: %s (%s)", file, line, buffer,
- error_str);
+ if (object) {
+ __vk_log(VK_DEBUG_UTILS_MESSAGE_SEVERITY_WARNING_BIT_EXT,
+ VK_DEBUG_UTILS_MESSAGE_TYPE_PERFORMANCE_BIT_EXT,
+ VK_LOG_OBJS(object), file, line,
+ "PERF: %s", buffer);
} else {
- snprintf(report, sizeof(report), "%s:%d: %s", file, line, error_str);
+ __vk_log(VK_DEBUG_UTILS_MESSAGE_SEVERITY_WARNING_BIT_EXT,
+ VK_DEBUG_UTILS_MESSAGE_TYPE_PERFORMANCE_BIT_EXT,
+ VK_LOG_NO_OBJS(device->physical->instance), file, line,
+ "PERF: %s", buffer);
}
-
- if (instance) {
- vk_debug_report(&instance->vk, VK_DEBUG_REPORT_ERROR_BIT_EXT,
- object, line, 0, "anv", report);
- }
-
- mesa_loge("%s", report);
-
- return error;
-}
-
-VkResult
-__vk_errorf(struct anv_instance *instance,
- const struct vk_object_base *object, VkResult error,
- const char *file, int line, const char *format, ...)
-{
- va_list ap;
-
- va_start(ap, format);
- __vk_errorv(instance, object, error, file, line, format, ap);
- va_end(ap);
-
- return error;
}
void
-anv_dump_pipe_bits(enum anv_pipe_bits bits)
+anv_dump_pipe_bits(enum anv_pipe_bits bits, FILE *f)
{
if (bits & ANV_PIPE_DEPTH_CACHE_FLUSH_BIT)
- fputs("+depth_flush ", stderr);
+ fputs("+depth_flush ", f);
if (bits & ANV_PIPE_DATA_CACHE_FLUSH_BIT)
- fputs("+dc_flush ", stderr);
+ fputs("+dc_flush ", f);
if (bits & ANV_PIPE_HDC_PIPELINE_FLUSH_BIT)
- fputs("+hdc_flush ", stderr);
+ fputs("+hdc_flush ", f);
if (bits & ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT)
- fputs("+rt_flush ", stderr);
+ fputs("+rt_flush ", f);
if (bits & ANV_PIPE_TILE_CACHE_FLUSH_BIT)
- fputs("+tile_flush ", stderr);
+ fputs("+tile_flush ", f);
if (bits & ANV_PIPE_STATE_CACHE_INVALIDATE_BIT)
- fputs("+state_inval ", stderr);
+ fputs("+state_inval ", f);
if (bits & ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT)
- fputs("+const_inval ", stderr);
+ fputs("+const_inval ", f);
if (bits & ANV_PIPE_VF_CACHE_INVALIDATE_BIT)
- fputs("+vf_inval ", stderr);
+ fputs("+vf_inval ", f);
if (bits & ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT)
- fputs("+tex_inval ", stderr);
+ fputs("+tex_inval ", f);
if (bits & ANV_PIPE_INSTRUCTION_CACHE_INVALIDATE_BIT)
- fputs("+ic_inval ", stderr);
+ fputs("+ic_inval ", f);
if (bits & ANV_PIPE_STALL_AT_SCOREBOARD_BIT)
- fputs("+pb_stall ", stderr);
+ fputs("+pb_stall ", f);
+ if (bits & ANV_PIPE_PSS_STALL_SYNC_BIT)
+ fputs("+pss_stall ", f);
if (bits & ANV_PIPE_DEPTH_STALL_BIT)
- fputs("+depth_stall ", stderr);
- if (bits & ANV_PIPE_CS_STALL_BIT)
- fputs("+cs_stall ", stderr);
+ fputs("+depth_stall ", f);
+ if (bits & ANV_PIPE_CS_STALL_BIT ||
+ bits & ANV_PIPE_END_OF_PIPE_SYNC_BIT)
+ fputs("+cs_stall ", f);
+ if (bits & ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT)
+ fputs("+utdp_flush ", f);
+ if (bits & ANV_PIPE_CCS_CACHE_FLUSH_BIT)
+ fputs("+ccs_flush ", f);
+}
+
+const char *
+anv_gfx_state_bit_to_str(enum anv_gfx_state_bits state)
+{
+#define NAME(name) case ANV_GFX_STATE_##name: return #name;
+ switch (state) {
+ NAME(URB);
+ NAME(VF_STATISTICS);
+ NAME(VF_SGVS);
+ NAME(VF_SGVS_2);
+ NAME(VF_SGVS_INSTANCING);
+ NAME(PRIMITIVE_REPLICATION);
+ NAME(MULTISAMPLE);
+ NAME(SBE);
+ NAME(SBE_SWIZ);
+ NAME(SO_DECL_LIST);
+ NAME(VS);
+ NAME(HS);
+ NAME(DS);
+ NAME(GS);
+ NAME(PS);
+ NAME(PS_EXTRA);
+ NAME(SBE_MESH);
+ NAME(CLIP_MESH);
+ NAME(MESH_CONTROL);
+ NAME(MESH_SHADER);
+ NAME(MESH_DISTRIB);
+ NAME(TASK_CONTROL);
+ NAME(TASK_SHADER);
+ NAME(TASK_REDISTRIB);
+ NAME(BLEND_STATE_PTR);
+ NAME(CLIP);
+ NAME(CC_STATE);
+ NAME(CC_STATE_PTR);
+ NAME(CPS);
+ NAME(DEPTH_BOUNDS);
+ NAME(INDEX_BUFFER);
+ NAME(LINE_STIPPLE);
+ NAME(PS_BLEND);
+ NAME(RASTER);
+ NAME(SAMPLE_MASK);
+ NAME(SAMPLE_PATTERN);
+ NAME(SCISSOR);
+ NAME(SF);
+ NAME(STREAMOUT);
+ NAME(TE);
+ NAME(VERTEX_INPUT);
+ NAME(VF);
+ NAME(VF_TOPOLOGY);
+ NAME(VFG);
+ NAME(VIEWPORT_CC);
+ NAME(VIEWPORT_CC_PTR);
+ NAME(VIEWPORT_SF_CLIP);
+ NAME(WM);
+ NAME(WM_DEPTH_STENCIL);
+ NAME(PMA_FIX);
+ NAME(WA_18019816803);
+ NAME(TBIMR_TILE_PASS_INFO);
+ default: unreachable("invalid state");
+ }
}
diff --git a/src/intel/vulkan/anv_utrace.c b/src/intel/vulkan/anv_utrace.c
new file mode 100644
index 00000000000..9b66300a44c
--- /dev/null
+++ b/src/intel/vulkan/anv_utrace.c
@@ -0,0 +1,684 @@
+/*
+ * Copyright © 2021 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "anv_private.h"
+#include "anv_internal_kernels.h"
+
+#include "ds/intel_tracepoints.h"
+#include "genxml/gen9_pack.h"
+#include "perf/intel_perf.h"
+#include "util/perf/cpu_trace.h"
+
+#include "vk_common_entrypoints.h"
+
+/** Timestamp structure format */
+union anv_utrace_timestamp {
+ /* Timestamp written by either 2 * MI_STORE_REGISTER_MEM or
+ * PIPE_CONTROL.
+ */
+ uint64_t timestamp;
+
+ /* Timestamp written by COMPUTE_WALKER::PostSync
+ *
+ * Layout is described in PRMs.
+ * ATSM PRMs, Volume 2d: Command Reference: Structures, POSTSYNC_DATA:
+ *
+ * "The timestamp layout :
+ * [0] = 32b Context Timestamp Start
+ * [1] = 32b Global Timestamp Start
+ * [2] = 32b Context Timestamp End
+ * [3] = 32b Global Timestamp End"
+ */
+ uint32_t compute_walker[4];
+};
+
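+/* Count the command buffers with utrace points. For re-usable (non
+ * ONE_TIME_SUBMIT) command buffers, also count how many trace chunks need
+ * their timestamps copied into a submit-owned buffer, since a resubmission
+ * could otherwise overwrite them.
+ */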
+static uint32_t
+command_buffers_count_utraces(struct anv_device *device,
+ uint32_t cmd_buffer_count,
+ struct anv_cmd_buffer **cmd_buffers,
+ uint32_t *utrace_copies)
+{
+ if (!u_trace_should_process(&device->ds.trace_context))
+ return 0;
+
+ uint32_t utraces = 0;
+ for (uint32_t i = 0; i < cmd_buffer_count; i++) {
+ if (u_trace_has_points(&cmd_buffers[i]->trace)) {
+ utraces++;
+ if (!(cmd_buffers[i]->usage_flags & VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT))
+ *utrace_copies += list_length(&cmd_buffers[i]->trace.trace_chunks);
+ }
+ }
+
+ return utraces;
+}
+
+static void
+anv_utrace_delete_submit(struct u_trace_context *utctx, void *submit_data)
+{
+ struct anv_device *device =
+ container_of(utctx, struct anv_device, ds.trace_context);
+ struct anv_utrace_submit *submit = submit_data;
+
+ intel_ds_flush_data_fini(&submit->ds);
+
+ anv_state_stream_finish(&submit->dynamic_state_stream);
+ anv_state_stream_finish(&submit->general_state_stream);
+
+ if (submit->trace_bo)
+ anv_bo_pool_free(&device->utrace_bo_pool, submit->trace_bo);
+
+ util_dynarray_foreach(&submit->batch_bos, struct anv_bo *, bo)
+ anv_bo_pool_free(&device->utrace_bo_pool, *bo);
+ util_dynarray_fini(&submit->batch_bos);
+
+ vk_sync_destroy(&device->vk, submit->sync);
+
+ vk_free(&device->vk.alloc, submit);
+}
+
+static void
+anv_device_utrace_emit_gfx_copy_ts_buffer(struct u_trace_context *utctx,
+ void *cmdstream,
+ void *ts_from, uint32_t from_offset,
+ void *ts_to, uint32_t to_offset,
+ uint32_t count)
+{
+ struct anv_device *device =
+ container_of(utctx, struct anv_device, ds.trace_context);
+ struct anv_utrace_submit *submit = cmdstream;
+ struct anv_address from_addr = (struct anv_address) {
+ .bo = ts_from, .offset = from_offset * sizeof(union anv_utrace_timestamp) };
+ struct anv_address to_addr = (struct anv_address) {
+ .bo = ts_to, .offset = to_offset * sizeof(union anv_utrace_timestamp) };
+
+ anv_genX(device->info, emit_so_memcpy)(&submit->memcpy_state,
+ to_addr, from_addr,
+ count * sizeof(union anv_utrace_timestamp));
+}
+
+static void
+anv_device_utrace_emit_cs_copy_ts_buffer(struct u_trace_context *utctx,
+ void *cmdstream,
+ void *ts_from, uint32_t from_offset,
+ void *ts_to, uint32_t to_offset,
+ uint32_t count)
+{
+ struct anv_device *device =
+ container_of(utctx, struct anv_device, ds.trace_context);
+ struct anv_utrace_submit *submit = cmdstream;
+ struct anv_address from_addr = (struct anv_address) {
+ .bo = ts_from, .offset = from_offset * sizeof(union anv_utrace_timestamp) };
+ struct anv_address to_addr = (struct anv_address) {
+ .bo = ts_to, .offset = to_offset * sizeof(union anv_utrace_timestamp) };
+
+ struct anv_state push_data_state =
+ anv_genX(device->info, simple_shader_alloc_push)(
+ &submit->simple_state, sizeof(struct anv_memcpy_params));
+ struct anv_memcpy_params *params = push_data_state.map;
+
+ *params = (struct anv_memcpy_params) {
+ .num_dwords = count * sizeof(union anv_utrace_timestamp) / 4,
+ .src_addr = anv_address_physical(from_addr),
+ .dst_addr = anv_address_physical(to_addr),
+ };
+
+ anv_genX(device->info, emit_simple_shader_dispatch)(
+ &submit->simple_state, DIV_ROUND_UP(params->num_dwords, 4),
+ push_data_state);
+}
+
+static VkResult
+anv_utrace_submit_extend_batch(struct anv_batch *batch, uint32_t size,
+ void *user_data)
+{
+ struct anv_utrace_submit *submit = user_data;
+
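+ /* Size the new BO at twice the total allocated so far, with an 8KiB
+ * minimum.
+ */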
+ uint32_t alloc_size = 0;
+ util_dynarray_foreach(&submit->batch_bos, struct anv_bo *, bo)
+ alloc_size += (*bo)->size;
+ alloc_size = MAX2(alloc_size * 2, 8192);
+
+ struct anv_bo *bo;
+ VkResult result = anv_bo_pool_alloc(&submit->queue->device->utrace_bo_pool,
+ align(alloc_size, 4096),
+ &bo);
+ if (result != VK_SUCCESS)
+ return result;
+
+ util_dynarray_append(&submit->batch_bos, struct anv_bo *, bo);
+
+ batch->end += 4 * GFX9_MI_BATCH_BUFFER_START_length;
+
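+ /* Un-reserve the space kept for the chaining command, emit an
+ * MI_BATCH_BUFFER_START jumping into the new BO, then point the batch at
+ * it, again keeping room at its end for the next chain.
+ */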
+ anv_batch_emit(batch, GFX9_MI_BATCH_BUFFER_START, bbs) {
+ bbs.DWordLength = GFX9_MI_BATCH_BUFFER_START_length -
+ GFX9_MI_BATCH_BUFFER_START_length_bias;
+ bbs.SecondLevelBatchBuffer = Firstlevelbatch;
+ bbs.AddressSpaceIndicator = ASI_PPGTT;
+ bbs.BatchBufferStartAddress = (struct anv_address) { bo, 0 };
+ }
+
+ anv_batch_set_storage(batch,
+ (struct anv_address) { .bo = bo, },
+ bo->map,
+ bo->size - 4 * GFX9_MI_BATCH_BUFFER_START_length);
+
+ return VK_SUCCESS;
+}
+
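+/* Gather utrace data for a submission. For re-usable command buffers this
+ * builds a small batch copying their timestamps into a per-submit buffer,
+ * using the streamout memcpy on render engines and the internal memcpy
+ * compute kernel otherwise.
+ */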
+VkResult
+anv_device_utrace_flush_cmd_buffers(struct anv_queue *queue,
+ uint32_t cmd_buffer_count,
+ struct anv_cmd_buffer **cmd_buffers,
+ struct anv_utrace_submit **out_submit)
+{
+ struct anv_device *device = queue->device;
+ uint32_t utrace_copies = 0;
+ uint32_t utraces = command_buffers_count_utraces(device,
+ cmd_buffer_count,
+ cmd_buffers,
+ &utrace_copies);
+ if (!utraces) {
+ *out_submit = NULL;
+ return VK_SUCCESS;
+ }
+
+ VkResult result;
+ struct anv_utrace_submit *submit =
+ vk_zalloc(&device->vk.alloc, sizeof(struct anv_utrace_submit),
+ 8, VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
+ if (!submit)
+ return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
+
+ submit->queue = queue;
+
+ intel_ds_flush_data_init(&submit->ds, &queue->ds, queue->ds.submission_id);
+
+ result = vk_sync_create(&device->vk, &device->physical->sync_syncobj_type,
+ 0, 0, &submit->sync);
+ if (result != VK_SUCCESS)
+ goto error_sync;
+
+ util_dynarray_init(&submit->batch_bos, NULL);
+
+ if (utrace_copies > 0) {
+ result = anv_bo_pool_alloc(&device->utrace_bo_pool,
+ utrace_copies * 4096,
+ &submit->trace_bo);
+ if (result != VK_SUCCESS)
+ goto error_trace_buf;
+
+ const bool uses_relocs = device->physical->uses_relocs;
+ result = anv_reloc_list_init(&submit->relocs, &device->vk.alloc, uses_relocs);
+ if (result != VK_SUCCESS)
+ goto error_reloc_list;
+
+ anv_state_stream_init(&submit->dynamic_state_stream,
+ &device->dynamic_state_pool, 16384);
+ anv_state_stream_init(&submit->general_state_stream,
+ &device->general_state_pool, 16384);
+
+ submit->batch = (struct anv_batch) {
+ .alloc = &device->vk.alloc,
+ .relocs = &submit->relocs,
+ .user_data = submit,
+ .extend_cb = anv_utrace_submit_extend_batch,
+ };
+
+ /* Only engine classes where we support timestamp copies
+ *
+ * TODO: add INTEL_ENGINE_CLASS_COPY support (should be trivial ;)
+ */
+ assert(queue->family->engine_class == INTEL_ENGINE_CLASS_RENDER ||
+ queue->family->engine_class == INTEL_ENGINE_CLASS_COMPUTE);
+ if (queue->family->engine_class == INTEL_ENGINE_CLASS_RENDER) {
+
+ trace_intel_begin_trace_copy_cb(&submit->ds.trace, &submit->batch);
+
+ anv_genX(device->info, emit_so_memcpy_init)(&submit->memcpy_state,
+ device,
+ &submit->batch);
+ uint32_t num_traces = 0;
+ for (uint32_t i = 0; i < cmd_buffer_count; i++) {
+ if (cmd_buffers[i]->usage_flags & VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT) {
+ intel_ds_queue_flush_data(&queue->ds, &cmd_buffers[i]->trace,
+ &submit->ds, false);
+ } else {
+ num_traces += cmd_buffers[i]->trace.num_traces;
+ u_trace_clone_append(u_trace_begin_iterator(&cmd_buffers[i]->trace),
+ u_trace_end_iterator(&cmd_buffers[i]->trace),
+ &submit->ds.trace,
+ submit,
+ anv_device_utrace_emit_gfx_copy_ts_buffer);
+ }
+ }
+ anv_genX(device->info, emit_so_memcpy_fini)(&submit->memcpy_state);
+
+ trace_intel_end_trace_copy_cb(&submit->ds.trace, &submit->batch,
+ num_traces);
+
+ anv_genX(device->info, emit_so_memcpy_end)(&submit->memcpy_state);
+ } else {
+ struct anv_shader_bin *copy_kernel;
+ VkResult ret =
+ anv_device_get_internal_shader(device,
+ ANV_INTERNAL_KERNEL_MEMCPY_COMPUTE,
+ &copy_kernel);
+ if (ret != VK_SUCCESS)
+ goto error_batch;
+
+ trace_intel_begin_trace_copy_cb(&submit->ds.trace, &submit->batch);
+
+ submit->simple_state = (struct anv_simple_shader) {
+ .device = device,
+ .dynamic_state_stream = &submit->dynamic_state_stream,
+ .general_state_stream = &submit->general_state_stream,
+ .batch = &submit->batch,
+ .kernel = copy_kernel,
+ .l3_config = device->internal_kernels_l3_config,
+ };
+ anv_genX(device->info, emit_simple_shader_init)(&submit->simple_state);
+
+ uint32_t num_traces = 0;
+ for (uint32_t i = 0; i < cmd_buffer_count; i++) {
+ if (cmd_buffers[i]->usage_flags & VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT) {
+ intel_ds_queue_flush_data(&queue->ds, &cmd_buffers[i]->trace,
+ &submit->ds, false);
+ } else {
+ num_traces += cmd_buffers[i]->trace.num_traces;
+ u_trace_clone_append(u_trace_begin_iterator(&cmd_buffers[i]->trace),
+ u_trace_end_iterator(&cmd_buffers[i]->trace),
+ &submit->ds.trace,
+ submit,
+ anv_device_utrace_emit_cs_copy_ts_buffer);
+ }
+ }
+
+ trace_intel_end_trace_copy_cb(&submit->ds.trace, &submit->batch,
+ num_traces);
+
+ anv_genX(device->info, emit_simple_shader_end)(&submit->simple_state);
+ }
+
+ intel_ds_queue_flush_data(&queue->ds, &submit->ds.trace, &submit->ds, true);
+
+ if (submit->batch.status != VK_SUCCESS) {
+ result = submit->batch.status;
+ goto error_batch;
+ }
+ } else {
+ for (uint32_t i = 0; i < cmd_buffer_count; i++) {
+ assert(cmd_buffers[i]->usage_flags & VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT);
+ intel_ds_queue_flush_data(&queue->ds, &cmd_buffers[i]->trace,
+ &submit->ds, i == (cmd_buffer_count - 1));
+ }
+ }
+
+ *out_submit = submit;
+
+ return VK_SUCCESS;
+
+ error_batch:
+ anv_reloc_list_finish(&submit->relocs);
+ util_dynarray_foreach(&submit->batch_bos, struct anv_bo *, bo)
+ anv_bo_pool_free(&device->utrace_bo_pool, *bo);
+ error_reloc_list:
+ anv_bo_pool_free(&device->utrace_bo_pool, submit->trace_bo);
+ error_trace_buf:
+ vk_sync_destroy(&device->vk, submit->sync);
+ error_sync:
+ intel_ds_flush_data_fini(&submit->ds);
+ vk_free(&device->vk.alloc, submit);
+ return result;
+}
+
+static void *
+anv_utrace_create_ts_buffer(struct u_trace_context *utctx, uint32_t size_b)
+{
+ struct anv_device *device =
+ container_of(utctx, struct anv_device, ds.trace_context);
+
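+ /* u_trace sizes its buffers for plain 64-bit timestamps; scale that up to
+ * the size of our timestamp union.
+ */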
+ uint32_t anv_ts_size_b = (size_b / sizeof(uint64_t)) *
+ sizeof(union anv_utrace_timestamp);
+
+ struct anv_bo *bo = NULL;
+ UNUSED VkResult result =
+ anv_bo_pool_alloc(&device->utrace_bo_pool,
+ align(anv_ts_size_b, 4096),
+ &bo);
+ assert(result == VK_SUCCESS);
+
+ memset(bo->map, 0, bo->size);
+#ifdef SUPPORT_INTEL_INTEGRATED_GPUS
+ if (device->physical->memory.need_flush &&
+ anv_bo_needs_host_cache_flush(bo->alloc_flags))
+ intel_flush_range(bo->map, bo->size);
+#endif
+
+ return bo;
+}
+
+static void
+anv_utrace_destroy_ts_buffer(struct u_trace_context *utctx, void *timestamps)
+{
+ struct anv_device *device =
+ container_of(utctx, struct anv_device, ds.trace_context);
+ struct anv_bo *bo = timestamps;
+
+ anv_bo_pool_free(&device->utrace_bo_pool, bo);
+}
+
+static void
+anv_utrace_record_ts(struct u_trace *ut, void *cs,
+ void *timestamps, unsigned idx,
+ bool end_of_pipe)
+{
+ struct anv_device *device =
+ container_of(ut->utctx, struct anv_device, ds.trace_context);
+ struct anv_cmd_buffer *cmd_buffer =
+ container_of(ut, struct anv_cmd_buffer, trace);
+ /* cmd_buffer is only valid if cs == NULL */
+ struct anv_batch *batch = cs != NULL ? cs : &cmd_buffer->batch;
+ struct anv_bo *bo = timestamps;
+
+ struct anv_address ts_address = (struct anv_address) {
+ .bo = bo,
+ .offset = idx * sizeof(union anv_utrace_timestamp)
+ };
+
+ /* Is this an end-of-compute trace point? */
+ const bool is_end_compute =
+ cs == NULL &&
+ (cmd_buffer->last_compute_walker != NULL ||
+ cmd_buffer->last_indirect_dispatch != NULL) &&
+ end_of_pipe;
+
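+ /* End-of-pipe compute timestamps are captured by rewriting the post-sync
+ * of the last (indirect) dispatch instead of emitting a new capture;
+ * everything else uses a regular top/end-of-pipe capture.
+ */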
+ enum anv_timestamp_capture_type capture_type = end_of_pipe ?
+ (is_end_compute ?
+ (cmd_buffer->last_indirect_dispatch != NULL ?
+ ANV_TIMESTAMP_REWRITE_INDIRECT_DISPATCH : ANV_TIMESTAMP_REWRITE_COMPUTE_WALKER) :
+ ANV_TIMESTAMP_CAPTURE_END_OF_PIPE) : ANV_TIMESTAMP_CAPTURE_TOP_OF_PIPE;
+
+ void *addr = capture_type == ANV_TIMESTAMP_REWRITE_INDIRECT_DISPATCH ?
+ cmd_buffer->last_indirect_dispatch :
+ capture_type == ANV_TIMESTAMP_REWRITE_COMPUTE_WALKER ?
+ cmd_buffer->last_compute_walker : NULL;
+
+ device->physical->cmd_emit_timestamp(batch, device, ts_address,
+ capture_type,
+ addr);
+ if (is_end_compute) {
+ cmd_buffer->last_compute_walker = NULL;
+ cmd_buffer->last_indirect_dispatch = NULL;
+ }
+}
+
+static uint64_t
+anv_utrace_read_ts(struct u_trace_context *utctx,
+ void *timestamps, unsigned idx, void *flush_data)
+{
+ struct anv_device *device =
+ container_of(utctx, struct anv_device, ds.trace_context);
+ struct anv_bo *bo = timestamps;
+ struct anv_utrace_submit *submit = flush_data;
+
+ /* Only need to stall on results for the first entry: */
+ if (idx == 0) {
+ MESA_TRACE_SCOPE("anv utrace wait timestamps");
+ UNUSED VkResult result =
+ vk_sync_wait(&device->vk,
+ submit->sync,
+ 0,
+ VK_SYNC_WAIT_COMPLETE,
+ os_time_get_absolute_timeout(OS_TIMEOUT_INFINITE));
+ assert(result == VK_SUCCESS);
+ }
+
+ union anv_utrace_timestamp *ts = (union anv_utrace_timestamp *)bo->map;
+
+ /* Don't translate the no-timestamp marker: */
+ if (ts[idx].timestamp == U_TRACE_NO_TIMESTAMP)
+ return U_TRACE_NO_TIMESTAMP;
+
+ /* Detect a 16-byte COMPUTE_WALKER post-sync timestamp write */
+ if (ts[idx].compute_walker[2] != 0 || ts[idx].compute_walker[3] != 0) {
+ /* The timestamp written by COMPUTE_WALKER::PostSync is only 32 bits. We
+ * need to rebuild the full 64 bits using the previous timestamp. We
+ * assume that utrace reads the timestamps in order. Anyway a 32-bit
+ * timestamp only rolls over every few minutes, so in most cases the
+ * reconstruction should be correct.
+ */
+ uint64_t timestamp =
+ (submit->last_full_timestamp & 0xffffffff00000000) |
+ (uint64_t) ts[idx].compute_walker[3];
+
+ return intel_device_info_timebase_scale(device->info, timestamp);
+ }
+
+ submit->last_full_timestamp = ts[idx].timestamp;
+
+ return intel_device_info_timebase_scale(device->info, ts[idx].timestamp);
+}
+
+void
+anv_device_utrace_init(struct anv_device *device)
+{
+ anv_bo_pool_init(&device->utrace_bo_pool, device, "utrace",
+ ANV_BO_ALLOC_MAPPED | ANV_BO_ALLOC_HOST_CACHED_COHERENT);
+ intel_ds_device_init(&device->ds, device->info, device->fd,
+ device->physical->local_minor,
+ INTEL_DS_API_VULKAN);
+ u_trace_context_init(&device->ds.trace_context,
+ &device->ds,
+ anv_utrace_create_ts_buffer,
+ anv_utrace_destroy_ts_buffer,
+ anv_utrace_record_ts,
+ anv_utrace_read_ts,
+ anv_utrace_delete_submit);
+
+ for (uint32_t q = 0; q < device->queue_count; q++) {
+ struct anv_queue *queue = &device->queues[q];
+
+ intel_ds_device_init_queue(&device->ds, &queue->ds, "%s%u",
+ intel_engines_class_to_string(queue->family->engine_class),
+ queue->vk.index_in_family);
+ }
+}
+
+void
+anv_device_utrace_finish(struct anv_device *device)
+{
+ intel_ds_device_process(&device->ds, true);
+ intel_ds_device_fini(&device->ds);
+ anv_bo_pool_finish(&device->utrace_bo_pool);
+}
+
+enum intel_ds_stall_flag
+anv_pipe_flush_bit_to_ds_stall_flag(enum anv_pipe_bits bits)
+{
+ static const struct {
+ enum anv_pipe_bits anv;
+ enum intel_ds_stall_flag ds;
+ } anv_to_ds_flags[] = {
+ { .anv = ANV_PIPE_DEPTH_CACHE_FLUSH_BIT, .ds = INTEL_DS_DEPTH_CACHE_FLUSH_BIT, },
+ { .anv = ANV_PIPE_DATA_CACHE_FLUSH_BIT, .ds = INTEL_DS_DATA_CACHE_FLUSH_BIT, },
+ { .anv = ANV_PIPE_TILE_CACHE_FLUSH_BIT, .ds = INTEL_DS_TILE_CACHE_FLUSH_BIT, },
+ { .anv = ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT, .ds = INTEL_DS_RENDER_TARGET_CACHE_FLUSH_BIT, },
+ { .anv = ANV_PIPE_STATE_CACHE_INVALIDATE_BIT, .ds = INTEL_DS_STATE_CACHE_INVALIDATE_BIT, },
+ { .anv = ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT, .ds = INTEL_DS_CONST_CACHE_INVALIDATE_BIT, },
+ { .anv = ANV_PIPE_VF_CACHE_INVALIDATE_BIT, .ds = INTEL_DS_VF_CACHE_INVALIDATE_BIT, },
+ { .anv = ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT, .ds = INTEL_DS_TEXTURE_CACHE_INVALIDATE_BIT, },
+ { .anv = ANV_PIPE_INSTRUCTION_CACHE_INVALIDATE_BIT, .ds = INTEL_DS_INST_CACHE_INVALIDATE_BIT, },
+ { .anv = ANV_PIPE_DEPTH_STALL_BIT, .ds = INTEL_DS_DEPTH_STALL_BIT, },
+ { .anv = ANV_PIPE_CS_STALL_BIT, .ds = INTEL_DS_CS_STALL_BIT, },
+ { .anv = ANV_PIPE_HDC_PIPELINE_FLUSH_BIT, .ds = INTEL_DS_HDC_PIPELINE_FLUSH_BIT, },
+ { .anv = ANV_PIPE_STALL_AT_SCOREBOARD_BIT, .ds = INTEL_DS_STALL_AT_SCOREBOARD_BIT, },
+ { .anv = ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT, .ds = INTEL_DS_UNTYPED_DATAPORT_CACHE_FLUSH_BIT, },
+ { .anv = ANV_PIPE_PSS_STALL_SYNC_BIT, .ds = INTEL_DS_PSS_STALL_SYNC_BIT, },
+ { .anv = ANV_PIPE_END_OF_PIPE_SYNC_BIT, .ds = INTEL_DS_END_OF_PIPE_BIT, },
+ { .anv = ANV_PIPE_CCS_CACHE_FLUSH_BIT, .ds = INTEL_DS_CCS_CACHE_FLUSH_BIT, },
+ };
+
+ enum intel_ds_stall_flag ret = 0;
+ for (uint32_t i = 0; i < ARRAY_SIZE(anv_to_ds_flags); i++) {
+ if (anv_to_ds_flags[i].anv & bits)
+ ret |= anv_to_ds_flags[i].ds;
+ }
+
+ return ret;
+}
+
+void anv_CmdBeginDebugUtilsLabelEXT(
+ VkCommandBuffer _commandBuffer,
+ const VkDebugUtilsLabelEXT *pLabelInfo)
+{
+ VK_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, _commandBuffer);
+
+ vk_common_CmdBeginDebugUtilsLabelEXT(_commandBuffer, pLabelInfo);
+
+ trace_intel_begin_cmd_buffer_annotation(&cmd_buffer->trace);
+}
+
+void anv_CmdEndDebugUtilsLabelEXT(VkCommandBuffer _commandBuffer)
+{
+ VK_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, _commandBuffer);
+
+ if (cmd_buffer->vk.labels.size > 0) {
+ const VkDebugUtilsLabelEXT *label =
+ util_dynarray_top_ptr(&cmd_buffer->vk.labels, VkDebugUtilsLabelEXT);
+
+ trace_intel_end_cmd_buffer_annotation(&cmd_buffer->trace,
+ strlen(label->pLabelName),
+ label->pLabelName);
+ }
+
+ vk_common_CmdEndDebugUtilsLabelEXT(_commandBuffer);
+}
+
+void
+anv_queue_trace(struct anv_queue *queue, const char *label, bool frame, bool begin)
+{
+ struct anv_device *device = queue->device;
+
+ VkResult result;
+ struct anv_utrace_submit *submit =
+ vk_zalloc(&device->vk.alloc, sizeof(struct anv_utrace_submit),
+ 8, VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
+ if (!submit)
+ return;
+
+ submit->queue = queue;
+
+ intel_ds_flush_data_init(&submit->ds, &queue->ds, queue->ds.submission_id);
+
+ result = vk_sync_create(&device->vk, &device->physical->sync_syncobj_type,
+ 0, 0, &submit->sync);
+ if (result != VK_SUCCESS)
+ goto error_trace;
+
+ const bool uses_relocs = device->physical->uses_relocs;
+ result = anv_reloc_list_init(&submit->relocs, &device->vk.alloc, uses_relocs);
+ if (result != VK_SUCCESS)
+ goto error_sync;
+
+ submit->batch = (struct anv_batch) {
+ .alloc = &device->vk.alloc,
+ .relocs = &submit->relocs,
+ .user_data = submit,
+ .extend_cb = anv_utrace_submit_extend_batch,
+ };
+
+ if (frame) {
+ if (begin)
+ trace_intel_begin_frame(&submit->ds.trace, &submit->batch);
+ else
+ trace_intel_end_frame(&submit->ds.trace, &submit->batch,
+ device->debug_frame_desc->frame_id);
+ } else {
+ if (begin) {
+ trace_intel_begin_queue_annotation(&submit->ds.trace, &submit->batch);
+ } else {
+ trace_intel_end_queue_annotation(&submit->ds.trace,
+ &submit->batch,
+ strlen(label),
+ label);
+ }
+ }
+
+ anv_batch_emit(&submit->batch, GFX9_MI_BATCH_BUFFER_END, bbs);
+ anv_batch_emit(&submit->batch, GFX9_MI_NOOP, noop);
+
+ if (submit->batch.status != VK_SUCCESS) {
+ result = submit->batch.status;
+ goto error_reloc_list;
+ }
+
+ intel_ds_queue_flush_data(&queue->ds, &submit->ds.trace, &submit->ds, true);
+
+ pthread_mutex_lock(&device->mutex);
+ device->kmd_backend->queue_exec_trace(queue, submit);
+ pthread_mutex_unlock(&device->mutex);
+
+ return;
+
+ error_reloc_list:
+ anv_reloc_list_finish(&submit->relocs);
+ util_dynarray_foreach(&submit->batch_bos, struct anv_bo *, bo)
+ anv_bo_pool_free(&device->utrace_bo_pool, *bo);
+ error_sync:
+ vk_sync_destroy(&device->vk, submit->sync);
+ error_trace:
+ intel_ds_flush_data_fini(&submit->ds);
+ vk_free(&device->vk.alloc, submit);
+}
+
+void
+anv_QueueBeginDebugUtilsLabelEXT(
+ VkQueue _queue,
+ const VkDebugUtilsLabelEXT *pLabelInfo)
+{
+ VK_FROM_HANDLE(anv_queue, queue, _queue);
+
+ vk_common_QueueBeginDebugUtilsLabelEXT(_queue, pLabelInfo);
+
+ anv_queue_trace(queue, pLabelInfo->pLabelName,
+ false /* frame */, true /* begin */);
+}
+
+void
+anv_QueueEndDebugUtilsLabelEXT(VkQueue _queue)
+{
+ VK_FROM_HANDLE(anv_queue, queue, _queue);
+
+ if (queue->vk.labels.size > 0) {
+ const VkDebugUtilsLabelEXT *label =
+ util_dynarray_top_ptr(&queue->vk.labels, VkDebugUtilsLabelEXT);
+ anv_queue_trace(queue, label->pLabelName,
+ false /* frame */, false /* begin */);
+
+ intel_ds_device_process(&queue->device->ds, true);
+ }
+
+ vk_common_QueueEndDebugUtilsLabelEXT(_queue);
+}
diff --git a/src/intel/vulkan/anv_va.c b/src/intel/vulkan/anv_va.c
new file mode 100644
index 00000000000..fe05342a7f6
--- /dev/null
+++ b/src/intel/vulkan/anv_va.c
@@ -0,0 +1,195 @@
+/*
+ * Copyright © 2023 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "anv_private.h"
+
+#include "util/u_math.h"
+
+static uint64_t
+va_add(struct anv_va_range *range, uint64_t addr, uint64_t size)
+{
+ range->addr = addr;
+ range->size = size;
+
+ return addr + size;
+}
+
+static void
+va_at(struct anv_va_range *range, uint64_t addr, uint64_t size)
+{
+ range->addr = addr;
+ range->size = size;
+}
+
+static void
+anv_device_print_vas(struct anv_physical_device *device)
+{
+ fprintf(stderr, "Driver heaps:\n");
+#define PRINT_HEAP(name) \
+ fprintf(stderr, " 0x%016"PRIx64"-0x%016"PRIx64": %s\n", \
+ device->va.name.addr, \
+ device->va.name.addr + device->va.name.size, \
+ #name);
+ PRINT_HEAP(general_state_pool);
+ PRINT_HEAP(low_heap);
+ PRINT_HEAP(dynamic_state_pool);
+ PRINT_HEAP(sampler_state_pool);
+ PRINT_HEAP(binding_table_pool);
+ PRINT_HEAP(internal_surface_state_pool);
+ PRINT_HEAP(scratch_surface_state_pool);
+ PRINT_HEAP(bindless_surface_state_pool);
+ PRINT_HEAP(indirect_descriptor_pool);
+ PRINT_HEAP(indirect_push_descriptor_pool);
+ PRINT_HEAP(instruction_state_pool);
+ PRINT_HEAP(dynamic_state_db_pool);
+ PRINT_HEAP(descriptor_buffer_pool);
+ PRINT_HEAP(push_descriptor_buffer_pool);
+ PRINT_HEAP(high_heap);
+ PRINT_HEAP(trtt);
+}
+
+void
+anv_physical_device_init_va_ranges(struct anv_physical_device *device)
+{
+ /* anv Virtual Memory Layout
+ * =========================
+ *
+ * When the anv driver is determining the virtual graphics addresses of
+ * memory objects itself using the softpin mechanism, the following memory
+ * ranges will be used.
+ *
+ * Three special considerations to notice:
+ *
+ * (1) the dynamic state pool is located within the same 4 GiB as the low
+ * heap. This is to work around a VF cache issue described in a comment in
+ * anv_physical_device_init_heaps.
+ *
+ * (2) the binding table pool is located at lower addresses than the BT
+ * (binding table) surface state pool, within a 4 GiB range which also
+ * contains the bindless surface state pool. This allows surface state base
+ * addresses to cover both binding tables (16 bit offsets), the internal
+ * surface states (32 bit offsets) and the bindless surface states.
+ *
+ * (3) the last 4 GiB of the address space is withheld from the high heap.
+ * Various hardware units will read past the end of an object for various
+ * reasons. This healthy margin prevents reads from wrapping around 48-bit
+ * addresses.
+ */
+ uint64_t _1Mb = 1ull * 1024 * 1024;
+ uint64_t _1Gb = 1ull * 1024 * 1024 * 1024;
+ uint64_t _4Gb = 4ull * 1024 * 1024 * 1024;
+
+ uint64_t address = 0x000000200000ULL; /* 2MiB */
+
+ address = va_add(&device->va.general_state_pool, address,
+ _1Gb - address);
+
+ address = va_add(&device->va.low_heap, address, _1Gb);
+
+ /* The binding table pool has to be located directly in front of the
+ * surface states.
+ */
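+ /* The extra 1Gb gap keeps the internal surface state pool 2Gb-aligned
+ * (see the assert below) while the binding table pool stays directly in
+ * front of it.
+ */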
+ address += _1Gb;
+ address = va_add(&device->va.binding_table_pool, address, _1Gb);
+ address = va_add(&device->va.internal_surface_state_pool, address, 1 * _1Gb);
+ assert(device->va.internal_surface_state_pool.addr ==
+ align64(device->va.internal_surface_state_pool.addr, 2 * _1Gb));
+ /* Scratch surface state overlaps with the internal surface state */
+ va_at(&device->va.scratch_surface_state_pool,
+ device->va.internal_surface_state_pool.addr,
+ 8 * _1Mb);
+ address = va_add(&device->va.bindless_surface_state_pool, address, 2 * _1Gb);
+
+
+ /* PRMs & simulation disagree on the actual size of this heap. Take the
+ * smallest (simulation) so that it works everywhere.
+ */
+ address = align64(address, _4Gb);
+ address = va_add(&device->va.dynamic_state_pool, address, _1Gb);
+ address = va_add(&device->va.sampler_state_pool, address, 2 * _1Gb);
+
+ if (device->indirect_descriptors) {
+ /* With indirect descriptors, descriptor buffers can go anywhere, they
+ * just need to be in a 4Gb aligned range, so all shader accesses can
+ * use a relocatable upper dword for the 64bit address.
+ */
+ address = align64(address, _4Gb);
+ address = va_add(&device->va.indirect_descriptor_pool, address, 3 * _1Gb);
+ address = va_add(&device->va.indirect_push_descriptor_pool, address, _1Gb);
+ }
+
+ /* We use a trick to compute constant data offsets in the shaders to avoid
+ * unnecessary 64bit address computations (see lower_load_constant() in
+ * anv_nir_apply_pipeline_layout.c). This assumes the instruction pool is
+ * located at an address with the lower 32bits at 0.
+ */
+ address = align64(address, _4Gb);
+ address = va_add(&device->va.instruction_state_pool, address, 2 * _1Gb);
+
+ address += 1 * _1Gb;
+ address = va_add(&device->va.dynamic_state_db_pool, address, _1Gb);
+ address = va_add(&device->va.descriptor_buffer_pool, address, 2 *_1Gb);
+ assert(device->va.descriptor_buffer_pool.addr % _4Gb == 0);
+ if (device->info.verx10 >= 125)
+ address = va_add(&device->va.push_descriptor_buffer_pool, address, _1Gb - 4096);
+
+ assert(device->va.descriptor_buffer_pool.addr ==
+ align64(device->va.descriptor_buffer_pool.addr, 4 * _1Gb));
+
+ address = align64(address, device->info.mem_alignment);
+ address = va_add(&device->va.aux_tt_pool, address, 2 * _1Gb);
+
+ /* What's left to do for us is to set va.high_heap and va.trtt without
+ * overlap, but there are a few things to be considered:
+ *
+ * The TR-TT address space is governed by the GFX_TRTT_VA_RANGE register,
+ * which carves out part of the address space for TR-TT and is independent
+ * of device->gtt_size. We use 47:44 for gen9+, the values we set here
+ * should be in sync with what we write to the register.
+ *
+ * If we ever gain the capability to use more than 48 bits of address space
+ * we'll have to adjust where we put the TR-TT space (and how we set
+ * GFX_TRTT_VA_RANGE).
+ *
+ * We have to leave the last 4GiB out of the high vma range, so that no
+ * state base address + size can overflow 48 bits. For more information see
+ * the comment about Wa32bitGeneralStateOffset in anv_allocator.c
+ *
+ * Despite the comment above, before we had TR-TT we were not only avoiding
+ * the last 4GiB of the 48bit address space, but also avoiding the last
+ * 4GiB of gtt_size, so let's be on the safe side and keep that 4GiB
+ * margin below both the TR-TT space top and the gtt top.
+ */
+ assert(device->gtt_size <= (1uLL << 48));
+ uint64_t trtt_start = 0xFuLL << 44;
+ uint64_t trtt_end = (1uLL << 48) - 4 * _1Gb;
+ uint64_t addressable_top = MIN2(device->gtt_size, trtt_start) - 4 * _1Gb;
+
+ uint64_t user_heaps_size = addressable_top - address;
+ address = va_add(&device->va.high_heap, address, user_heaps_size);
+ assert(address <= trtt_start);
+ address = va_add(&device->va.trtt, trtt_start, trtt_end - trtt_start);
+
+ if (INTEL_DEBUG(DEBUG_HEAPS))
+ anv_device_print_vas(device);
+}
diff --git a/src/intel/vulkan/anv_video.c b/src/intel/vulkan/anv_video.c
new file mode 100644
index 00000000000..070c1806cc3
--- /dev/null
+++ b/src/intel/vulkan/anv_video.c
@@ -0,0 +1,435 @@
+/*
+ * Copyright © 2021 Red Hat
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "anv_private.h"
+
+#include "vk_video/vulkan_video_codecs_common.h"
+
+VkResult
+anv_CreateVideoSessionKHR(VkDevice _device,
+ const VkVideoSessionCreateInfoKHR *pCreateInfo,
+ const VkAllocationCallbacks *pAllocator,
+ VkVideoSessionKHR *pVideoSession)
+{
+ ANV_FROM_HANDLE(anv_device, device, _device);
+
+ struct anv_video_session *vid =
+ vk_alloc2(&device->vk.alloc, pAllocator, sizeof(*vid), 8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
+ if (!vid)
+ return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
+
+ memset(vid, 0, sizeof(struct anv_video_session));
+
+ VkResult result = vk_video_session_init(&device->vk,
+ &vid->vk,
+ pCreateInfo);
+ if (result != VK_SUCCESS) {
+ vk_free2(&device->vk.alloc, pAllocator, vid);
+ return result;
+ }
+
+ *pVideoSession = anv_video_session_to_handle(vid);
+ return VK_SUCCESS;
+}
+
+void
+anv_DestroyVideoSessionKHR(VkDevice _device,
+ VkVideoSessionKHR _session,
+ const VkAllocationCallbacks *pAllocator)
+{
+ ANV_FROM_HANDLE(anv_device, device, _device);
+ ANV_FROM_HANDLE(anv_video_session, vid, _session);
+ if (!_session)
+ return;
+
+ vk_object_base_finish(&vid->vk.base);
+ vk_free2(&device->vk.alloc, pAllocator, vid);
+}
+
+VkResult
+anv_CreateVideoSessionParametersKHR(VkDevice _device,
+ const VkVideoSessionParametersCreateInfoKHR *pCreateInfo,
+ const VkAllocationCallbacks *pAllocator,
+ VkVideoSessionParametersKHR *pVideoSessionParameters)
+{
+ ANV_FROM_HANDLE(anv_device, device, _device);
+ ANV_FROM_HANDLE(anv_video_session, vid, pCreateInfo->videoSession);
+ ANV_FROM_HANDLE(anv_video_session_params, templ, pCreateInfo->videoSessionParametersTemplate);
+ struct anv_video_session_params *params =
+ vk_alloc2(&device->vk.alloc, pAllocator, sizeof(*params), 8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
+ if (!params)
+ return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
+
+ VkResult result = vk_video_session_parameters_init(&device->vk,
+ &params->vk,
+ &vid->vk,
+ templ ? &templ->vk : NULL,
+ pCreateInfo);
+ if (result != VK_SUCCESS) {
+ vk_free2(&device->vk.alloc, pAllocator, params);
+ return result;
+ }
+
+ *pVideoSessionParameters = anv_video_session_params_to_handle(params);
+ return VK_SUCCESS;
+}
+
+void
+anv_DestroyVideoSessionParametersKHR(VkDevice _device,
+ VkVideoSessionParametersKHR _params,
+ const VkAllocationCallbacks *pAllocator)
+{
+ ANV_FROM_HANDLE(anv_device, device, _device);
+ ANV_FROM_HANDLE(anv_video_session_params, params, _params);
+ if (!_params)
+ return;
+ vk_video_session_parameters_finish(&device->vk, &params->vk);
+ vk_free2(&device->vk.alloc, pAllocator, params);
+}
+
+VkResult
+anv_GetPhysicalDeviceVideoCapabilitiesKHR(VkPhysicalDevice physicalDevice,
+ const VkVideoProfileInfoKHR *pVideoProfile,
+ VkVideoCapabilitiesKHR *pCapabilities)
+{
+ ANV_FROM_HANDLE(anv_physical_device, pdevice, physicalDevice);
+
+ pCapabilities->minBitstreamBufferOffsetAlignment = 32;
+ pCapabilities->minBitstreamBufferSizeAlignment = 32;
+ pCapabilities->maxCodedExtent.width = 4096;
+ pCapabilities->maxCodedExtent.height = 4096;
+ pCapabilities->flags = VK_VIDEO_CAPABILITY_SEPARATE_REFERENCE_IMAGES_BIT_KHR;
+
+ struct VkVideoDecodeCapabilitiesKHR *dec_caps = (struct VkVideoDecodeCapabilitiesKHR *)
+ vk_find_struct(pCapabilities->pNext, VIDEO_DECODE_CAPABILITIES_KHR);
+ if (dec_caps)
+ dec_caps->flags = VK_VIDEO_DECODE_CAPABILITY_DPB_AND_OUTPUT_COINCIDE_BIT_KHR;
+
+ /* H264 allows different luma and chroma bit depths, but we only support
+ * them being equal. */
+ if (pVideoProfile->lumaBitDepth != pVideoProfile->chromaBitDepth)
+ return VK_ERROR_VIDEO_PROFILE_FORMAT_NOT_SUPPORTED_KHR;
+
+ if (pVideoProfile->chromaSubsampling != VK_VIDEO_CHROMA_SUBSAMPLING_420_BIT_KHR)
+ return VK_ERROR_VIDEO_PROFILE_FORMAT_NOT_SUPPORTED_KHR;
+
+ switch (pVideoProfile->videoCodecOperation) {
+ case VK_VIDEO_CODEC_OPERATION_DECODE_H264_BIT_KHR: {
+ struct VkVideoDecodeH264CapabilitiesKHR *ext = (struct VkVideoDecodeH264CapabilitiesKHR *)
+ vk_find_struct(pCapabilities->pNext, VIDEO_DECODE_H264_CAPABILITIES_KHR);
+
+ if (pVideoProfile->lumaBitDepth != VK_VIDEO_COMPONENT_BIT_DEPTH_8_BIT_KHR)
+ return VK_ERROR_VIDEO_PROFILE_FORMAT_NOT_SUPPORTED_KHR;
+
+ pCapabilities->maxDpbSlots = 17;
+ pCapabilities->maxActiveReferencePictures = ANV_VIDEO_H264_MAX_NUM_REF_FRAME;
+ pCapabilities->pictureAccessGranularity.width = ANV_MB_WIDTH;
+ pCapabilities->pictureAccessGranularity.height = ANV_MB_HEIGHT;
+ pCapabilities->minCodedExtent.width = ANV_MB_WIDTH;
+ pCapabilities->minCodedExtent.height = ANV_MB_HEIGHT;
+
+ ext->fieldOffsetGranularity.x = 0;
+ ext->fieldOffsetGranularity.y = 0;
+ ext->maxLevelIdc = STD_VIDEO_H264_LEVEL_IDC_5_1;
+ strcpy(pCapabilities->stdHeaderVersion.extensionName, VK_STD_VULKAN_VIDEO_CODEC_H264_DECODE_EXTENSION_NAME);
+ pCapabilities->stdHeaderVersion.specVersion = VK_STD_VULKAN_VIDEO_CODEC_H264_DECODE_SPEC_VERSION;
+ break;
+ }
+ case VK_VIDEO_CODEC_OPERATION_DECODE_H265_BIT_KHR: {
+ struct VkVideoDecodeH265CapabilitiesKHR *ext = (struct VkVideoDecodeH265CapabilitiesKHR *)
+ vk_find_struct(pCapabilities->pNext, VIDEO_DECODE_H265_CAPABILITIES_KHR);
+
+ const struct VkVideoDecodeH265ProfileInfoKHR *h265_profile =
+ vk_find_struct_const(pVideoProfile->pNext,
+ VIDEO_DECODE_H265_PROFILE_INFO_KHR);
+
+ /* No hardware supports the scc extension profile */
+ if (h265_profile->stdProfileIdc != STD_VIDEO_H265_PROFILE_IDC_MAIN &&
+ h265_profile->stdProfileIdc != STD_VIDEO_H265_PROFILE_IDC_MAIN_10 &&
+ h265_profile->stdProfileIdc != STD_VIDEO_H265_PROFILE_IDC_MAIN_STILL_PICTURE &&
+ h265_profile->stdProfileIdc != STD_VIDEO_H265_PROFILE_IDC_FORMAT_RANGE_EXTENSIONS)
+ return VK_ERROR_VIDEO_PROFILE_OPERATION_NOT_SUPPORTED_KHR;
+
+ /* Skylake only supports the main profile */
+ if (h265_profile->stdProfileIdc != STD_VIDEO_H265_PROFILE_IDC_MAIN &&
+ h265_profile->stdProfileIdc != STD_VIDEO_H265_PROFILE_IDC_MAIN_STILL_PICTURE &&
+ pdevice->info.platform <= INTEL_PLATFORM_SKL)
+ return VK_ERROR_VIDEO_PROFILE_OPERATION_NOT_SUPPORTED_KHR;
+
+ /* Gfx10 and under don't support the range extension profile */
+ if (h265_profile->stdProfileIdc != STD_VIDEO_H265_PROFILE_IDC_MAIN &&
+ h265_profile->stdProfileIdc != STD_VIDEO_H265_PROFILE_IDC_MAIN_10 &&
+ h265_profile->stdProfileIdc != STD_VIDEO_H265_PROFILE_IDC_MAIN_STILL_PICTURE &&
+ pdevice->info.ver <= 10)
+ return VK_ERROR_VIDEO_PROFILE_OPERATION_NOT_SUPPORTED_KHR;
+
+ if (pVideoProfile->lumaBitDepth != VK_VIDEO_COMPONENT_BIT_DEPTH_8_BIT_KHR &&
+ pVideoProfile->lumaBitDepth != VK_VIDEO_COMPONENT_BIT_DEPTH_10_BIT_KHR)
+ return VK_ERROR_VIDEO_PROFILE_FORMAT_NOT_SUPPORTED_KHR;
+
+ pCapabilities->pictureAccessGranularity.width = ANV_MAX_H265_CTB_SIZE;
+ pCapabilities->pictureAccessGranularity.height = ANV_MAX_H265_CTB_SIZE;
+ pCapabilities->minCodedExtent.width = ANV_MAX_H265_CTB_SIZE;
+ pCapabilities->minCodedExtent.height = ANV_MAX_H265_CTB_SIZE;
+ pCapabilities->maxDpbSlots = ANV_VIDEO_H265_MAX_NUM_REF_FRAME;
+ pCapabilities->maxActiveReferencePictures = ANV_VIDEO_H265_HCP_NUM_REF_FRAME;
+
+ ext->maxLevelIdc = STD_VIDEO_H265_LEVEL_IDC_6_2;
+
+ strcpy(pCapabilities->stdHeaderVersion.extensionName, VK_STD_VULKAN_VIDEO_CODEC_H265_DECODE_EXTENSION_NAME);
+ pCapabilities->stdHeaderVersion.specVersion = VK_STD_VULKAN_VIDEO_CODEC_H265_DECODE_SPEC_VERSION;
+ break;
+ }
+ default:
+ break;
+ }
+ return VK_SUCCESS;
+}
+
+VkResult
+anv_GetPhysicalDeviceVideoFormatPropertiesKHR(VkPhysicalDevice physicalDevice,
+ const VkPhysicalDeviceVideoFormatInfoKHR *pVideoFormatInfo,
+ uint32_t *pVideoFormatPropertyCount,
+ VkVideoFormatPropertiesKHR *pVideoFormatProperties)
+{
+ VK_OUTARRAY_MAKE_TYPED(VkVideoFormatPropertiesKHR, out,
+ pVideoFormatProperties,
+ pVideoFormatPropertyCount);
+
+ bool need_10bit = false;
+ const struct VkVideoProfileListInfoKHR *prof_list = (struct VkVideoProfileListInfoKHR *)
+ vk_find_struct_const(pVideoFormatInfo->pNext, VIDEO_PROFILE_LIST_INFO_KHR);
+
+ if (prof_list) {
+ for (unsigned i = 0; i < prof_list->profileCount; i++) {
+ const VkVideoProfileInfoKHR *profile = &prof_list->pProfiles[i];
+ if (profile->lumaBitDepth & VK_VIDEO_COMPONENT_BIT_DEPTH_10_BIT_KHR ||
+ profile->chromaBitDepth & VK_VIDEO_COMPONENT_BIT_DEPTH_10_BIT_KHR)
+ need_10bit = true;
+ }
+ }
+
+ vk_outarray_append_typed(VkVideoFormatPropertiesKHR, &out, p) {
+ p->format = VK_FORMAT_G8_B8R8_2PLANE_420_UNORM;
+ p->imageType = VK_IMAGE_TYPE_2D;
+ p->imageTiling = VK_IMAGE_TILING_OPTIMAL;
+ p->imageUsageFlags = pVideoFormatInfo->imageUsage;
+ }
+
+ if (need_10bit) {
+ vk_outarray_append_typed(VkVideoFormatPropertiesKHR, &out, p) {
+ p->format = VK_FORMAT_G10X6_B10X6R10X6_2PLANE_420_UNORM_3PACK16;
+ p->imageType = VK_IMAGE_TYPE_2D;
+ p->imageTiling = VK_IMAGE_TILING_OPTIMAL;
+ p->imageUsageFlags = pVideoFormatInfo->imageUsage;
+ }
+ }
+
+ return vk_outarray_status(&out);
+}
+
+static uint64_t
+get_h264_video_mem_size(struct anv_video_session *vid, uint32_t mem_idx)
+{
+ uint32_t width_in_mb =
+ align(vid->vk.max_coded.width, ANV_MB_WIDTH) / ANV_MB_WIDTH;
+
+ switch (mem_idx) {
+ case ANV_VID_MEM_H264_INTRA_ROW_STORE:
+ return width_in_mb * 64;
+ case ANV_VID_MEM_H264_DEBLOCK_FILTER_ROW_STORE:
+ return width_in_mb * 64 * 4;
+ case ANV_VID_MEM_H264_BSD_MPC_ROW_SCRATCH:
+ return width_in_mb * 64 * 2;
+ case ANV_VID_MEM_H264_MPR_ROW_SCRATCH:
+ return width_in_mb * 64 * 2;
+ default:
+ unreachable("unknown memory");
+ }
+}
+
+static uint64_t
+get_h265_video_mem_size(struct anv_video_session *vid, uint32_t mem_idx)
+{
+ uint32_t bit_shift =
+ vid->vk.h265.profile_idc == STD_VIDEO_H265_PROFILE_IDC_MAIN_10 ? 2 : 3;
+
+ /* TODO: these sizes could be determined dynamically depending on the ctb sizes of each slice. */
+ uint32_t width_in_ctb =
+ align(vid->vk.max_coded.width, ANV_MAX_H265_CTB_SIZE) / ANV_MAX_H265_CTB_SIZE;
+ uint32_t height_in_ctb =
+ align(vid->vk.max_coded.height, ANV_MAX_H265_CTB_SIZE) / ANV_MAX_H265_CTB_SIZE;
+ uint64_t size;
+
+ switch (mem_idx) {
+ case ANV_VID_MEM_H265_DEBLOCK_FILTER_ROW_STORE_LINE:
+ case ANV_VID_MEM_H265_DEBLOCK_FILTER_ROW_STORE_TILE_LINE:
+ size = align(vid->vk.max_coded.width, 32) >> bit_shift;
+ break;
+ case ANV_VID_MEM_H265_DEBLOCK_FILTER_ROW_STORE_TILE_COLUMN:
+ size = align(vid->vk.max_coded.height + 6 * height_in_ctb, 32) >> bit_shift;
+ break;
+ case ANV_VID_MEM_H265_METADATA_LINE:
+ size = (((vid->vk.max_coded.width + 15) >> 4) * 188 + width_in_ctb * 9 + 1023) >> 9;
+ break;
+ case ANV_VID_MEM_H265_METADATA_TILE_LINE:
+ size = (((vid->vk.max_coded.width + 15) >> 4) * 172 + width_in_ctb * 9 + 1023) >> 9;
+ break;
+ case ANV_VID_MEM_H265_METADATA_TILE_COLUMN:
+ size = (((vid->vk.max_coded.height + 15) >> 4) * 176 + height_in_ctb * 89 + 1023) >> 9;
+ break;
+ case ANV_VID_MEM_H265_SAO_LINE:
+ size = align((vid->vk.max_coded.width >> 1) + width_in_ctb * 3, 16) >> bit_shift;
+ break;
+ case ANV_VID_MEM_H265_SAO_TILE_LINE:
+ size = align((vid->vk.max_coded.width >> 1) + width_in_ctb * 6, 16) >> bit_shift;
+ break;
+ case ANV_VID_MEM_H265_SAO_TILE_COLUMN:
+ size = align((vid->vk.max_coded.height >> 1) + height_in_ctb * 6, 16) >> bit_shift;
+ break;
+ default:
+ unreachable("unknown memory");
+ }
+
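+ /* The sizes above are computed in 64-byte units; convert to bytes. */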
+ return size << 6;
+}
+
+static void
+get_h264_video_session_mem_reqs(struct anv_video_session *vid,
+ VkVideoSessionMemoryRequirementsKHR *mem_reqs,
+ uint32_t *pVideoSessionMemoryRequirementsCount,
+ uint32_t memory_types)
+{
+ VK_OUTARRAY_MAKE_TYPED(VkVideoSessionMemoryRequirementsKHR,
+ out,
+ mem_reqs,
+ pVideoSessionMemoryRequirementsCount);
+
+ for (unsigned i = 0; i < ANV_VIDEO_MEM_REQS_H264; i++) {
+ uint32_t bind_index = ANV_VID_MEM_H264_INTRA_ROW_STORE + i;
+ uint64_t size = get_h264_video_mem_size(vid, i);
+
+ vk_outarray_append_typed(VkVideoSessionMemoryRequirementsKHR, &out, p) {
+ p->memoryBindIndex = bind_index;
+ p->memoryRequirements.size = size;
+ p->memoryRequirements.alignment = 4096;
+ p->memoryRequirements.memoryTypeBits = memory_types;
+ }
+ }
+}
+
+static void
+get_h265_video_session_mem_reqs(struct anv_video_session *vid,
+ VkVideoSessionMemoryRequirementsKHR *mem_reqs,
+ uint32_t *pVideoSessionMemoryRequirementsCount,
+ uint32_t memory_types)
+{
+ VK_OUTARRAY_MAKE_TYPED(VkVideoSessionMemoryRequirementsKHR,
+ out,
+ mem_reqs,
+ pVideoSessionMemoryRequirementsCount);
+
+ for (unsigned i = 0; i < ANV_VIDEO_MEM_REQS_H265; i++) {
+ uint32_t bind_index =
+ ANV_VID_MEM_H265_DEBLOCK_FILTER_ROW_STORE_LINE + i;
+ uint64_t size = get_h265_video_mem_size(vid, i);
+
+ vk_outarray_append_typed(VkVideoSessionMemoryRequirementsKHR, &out, p) {
+ p->memoryBindIndex = bind_index;
+ p->memoryRequirements.size = size;
+ p->memoryRequirements.alignment = 4096;
+ p->memoryRequirements.memoryTypeBits = memory_types;
+ }
+ }
+}
+
+VkResult
+anv_GetVideoSessionMemoryRequirementsKHR(VkDevice _device,
+ VkVideoSessionKHR videoSession,
+ uint32_t *pVideoSessionMemoryRequirementsCount,
+ VkVideoSessionMemoryRequirementsKHR *mem_reqs)
+{
+ ANV_FROM_HANDLE(anv_device, device, _device);
+ ANV_FROM_HANDLE(anv_video_session, vid, videoSession);
+
+ uint32_t memory_types =
+ (vid->vk.flags & VK_VIDEO_SESSION_CREATE_PROTECTED_CONTENT_BIT_KHR) ?
+ device->physical->memory.protected_mem_types :
+ device->physical->memory.default_buffer_mem_types;
+ switch (vid->vk.op) {
+ case VK_VIDEO_CODEC_OPERATION_DECODE_H264_BIT_KHR:
+ get_h264_video_session_mem_reqs(vid,
+ mem_reqs,
+ pVideoSessionMemoryRequirementsCount,
+ memory_types);
+ break;
+ case VK_VIDEO_CODEC_OPERATION_DECODE_H265_BIT_KHR:
+ get_h265_video_session_mem_reqs(vid,
+ mem_reqs,
+ pVideoSessionMemoryRequirementsCount,
+ memory_types);
+ break;
+ default:
+ unreachable("unknown codec");
+ }
+
+ return VK_SUCCESS;
+}
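
From the application's point of view, this entry point follows the usual Vulkan count/fill pattern. A minimal usage sketch (application-side code, not part of this change; dev and session are assumed to be a valid device and a previously created decode session):

   uint32_t count = 0;
   vkGetVideoSessionMemoryRequirementsKHR(dev, session, &count, NULL);

   VkVideoSessionMemoryRequirementsKHR reqs[16]; /* count is small for H.264/H.265 */
   assert(count <= 16);
   for (uint32_t i = 0; i < count; i++) {
      reqs[i] = (VkVideoSessionMemoryRequirementsKHR) {
         .sType = VK_STRUCTURE_TYPE_VIDEO_SESSION_MEMORY_REQUIREMENTS_KHR,
      };
   }
   vkGetVideoSessionMemoryRequirementsKHR(dev, session, &count, reqs);
   /* Each entry now carries a memoryBindIndex plus the size, 4096-byte
    * alignment and memoryTypeBits filled in above. */
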
+
+VkResult
+anv_UpdateVideoSessionParametersKHR(VkDevice _device,
+ VkVideoSessionParametersKHR _params,
+ const VkVideoSessionParametersUpdateInfoKHR *pUpdateInfo)
+{
+ ANV_FROM_HANDLE(anv_video_session_params, params, _params);
+ return vk_video_session_parameters_update(&params->vk, pUpdateInfo);
+}
+
+static void
+copy_bind(struct anv_vid_mem *dst,
+ const VkBindVideoSessionMemoryInfoKHR *src)
+{
+ dst->mem = anv_device_memory_from_handle(src->memory);
+ dst->offset = src->memoryOffset;
+ dst->size = src->memorySize;
+}
+
+VkResult
+anv_BindVideoSessionMemoryKHR(VkDevice _device,
+ VkVideoSessionKHR videoSession,
+ uint32_t bind_mem_count,
+ const VkBindVideoSessionMemoryInfoKHR *bind_mem)
+{
+ ANV_FROM_HANDLE(anv_video_session, vid, videoSession);
+
+ switch (vid->vk.op) {
+ case VK_VIDEO_CODEC_OPERATION_DECODE_H264_BIT_KHR:
+ case VK_VIDEO_CODEC_OPERATION_DECODE_H265_BIT_KHR:
+ for (unsigned i = 0; i < bind_mem_count; i++) {
+ copy_bind(&vid->vid_mem[bind_mem[i].memoryBindIndex], &bind_mem[i]);
+ }
+ break;
+ default:
+ unreachable("unknown codec");
+ }
+ return VK_SUCCESS;
+}
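
Once VkDeviceMemory objects matching those requirements have been allocated, the bind indices are fed back through vkBindVideoSessionMemoryKHR, which the code above copies one-to-one into vid->vid_mem[]. A sketch continuing the previous example (mem[i] is an assumed VkDeviceMemory satisfying reqs[i]):

   VkBindVideoSessionMemoryInfoKHR binds[16];
   for (uint32_t i = 0; i < count; i++) {
      binds[i] = (VkBindVideoSessionMemoryInfoKHR) {
         .sType = VK_STRUCTURE_TYPE_BIND_VIDEO_SESSION_MEMORY_INFO_KHR,
         .memoryBindIndex = reqs[i].memoryBindIndex,
         .memory = mem[i],
         .memoryOffset = 0,
         .memorySize = reqs[i].memoryRequirements.size,
      };
   }
   vkBindVideoSessionMemoryKHR(dev, session, count, binds);
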
diff --git a/src/intel/vulkan/anv_wsi.c b/src/intel/vulkan/anv_wsi.c
index 04d85d99d67..ab8e5d5fc6c 100644
--- a/src/intel/vulkan/anv_wsi.c
+++ b/src/intel/vulkan/anv_wsi.c
@@ -24,6 +24,9 @@
#include "anv_private.h"
#include "anv_measure.h"
#include "wsi_common.h"
+#include "vk_fence.h"
+#include "vk_queue.h"
+#include "vk_semaphore.h"
#include "vk_util.h"
static PFN_vkVoidFunction
@@ -33,46 +36,17 @@ anv_wsi_proc_addr(VkPhysicalDevice physicalDevice, const char *pName)
return vk_instance_get_proc_addr_unchecked(&pdevice->instance->vk, pName);
}
-static void
-anv_wsi_signal_semaphore_for_memory(VkDevice _device,
- VkSemaphore _semaphore,
- VkDeviceMemory _memory)
+static VkQueue
+anv_wsi_get_prime_blit_queue(VkDevice _device)
{
ANV_FROM_HANDLE(anv_device, device, _device);
- ANV_FROM_HANDLE(anv_semaphore, semaphore, _semaphore);
- ANV_FROM_HANDLE(anv_device_memory, memory, _memory);
- /* Put a BO semaphore with the image BO in the temporary. For BO binary
- * semaphores, we always set EXEC_OBJECT_WRITE so this creates a WaR
- * hazard with the display engine's read to ensure that no one writes to
- * the image before the read is complete.
- */
- anv_semaphore_reset_temporary(device, semaphore);
-
- struct anv_semaphore_impl *impl = &semaphore->temporary;
- impl->type = ANV_SEMAPHORE_TYPE_WSI_BO;
- impl->bo = anv_bo_ref(memory->bo);
-}
-
-static void
-anv_wsi_signal_fence_for_memory(VkDevice _device,
- VkFence _fence,
- VkDeviceMemory _memory)
-{
- ANV_FROM_HANDLE(anv_device, device, _device);
- ANV_FROM_HANDLE(anv_fence, fence, _fence);
- ANV_FROM_HANDLE(anv_device_memory, memory, _memory);
-
- /* Put a BO fence with the image BO in the temporary. For BO fences, we
- * always just wait until the BO isn't busy and reads from the BO should
- * count as busy.
- */
- anv_fence_reset_temporary(device, fence);
-
- struct anv_fence_impl *impl = &fence->temporary;
- impl->type = ANV_FENCE_TYPE_WSI_BO;
- impl->bo.bo = anv_bo_ref(memory->bo);
- impl->bo.state = ANV_BO_FENCE_STATE_SUBMITTED;
+ vk_foreach_queue(_queue, &device->vk) {
+ struct anv_queue *queue = (struct anv_queue *)_queue;
+ if (queue->family->queueFlags & (VK_QUEUE_GRAPHICS_BIT | VK_QUEUE_COMPUTE_BIT | VK_QUEUE_TRANSFER_BIT))
+ return vk_queue_to_handle(_queue);
+ }
+ return NULL;
}
VkResult
@@ -86,15 +60,21 @@ anv_init_wsi(struct anv_physical_device *physical_device)
&physical_device->instance->vk.alloc,
physical_device->master_fd,
&physical_device->instance->dri_options,
- false);
+ &(struct wsi_device_options){.sw_device = false});
if (result != VK_SUCCESS)
return result;
physical_device->wsi_device.supports_modifiers = true;
- physical_device->wsi_device.signal_semaphore_for_memory =
- anv_wsi_signal_semaphore_for_memory;
- physical_device->wsi_device.signal_fence_for_memory =
- anv_wsi_signal_fence_for_memory;
+ physical_device->wsi_device.get_blit_queue = anv_wsi_get_prime_blit_queue;
+ if (physical_device->info.kmd_type == INTEL_KMD_TYPE_I915) {
+ physical_device->wsi_device.signal_semaphore_with_memory = true;
+ physical_device->wsi_device.signal_fence_with_memory = true;
+ }
+
+ physical_device->vk.wsi_device = &physical_device->wsi_device;
+
+ wsi_device_setup_syncobj_fd(&physical_device->wsi_device,
+ physical_device->local_fd);
return VK_SUCCESS;
}
@@ -102,187 +82,25 @@ anv_init_wsi(struct anv_physical_device *physical_device)
void
anv_finish_wsi(struct anv_physical_device *physical_device)
{
+ physical_device->vk.wsi_device = NULL;
wsi_device_finish(&physical_device->wsi_device,
&physical_device->instance->vk.alloc);
}
-void anv_DestroySurfaceKHR(
- VkInstance _instance,
- VkSurfaceKHR _surface,
- const VkAllocationCallbacks* pAllocator)
-{
- ANV_FROM_HANDLE(anv_instance, instance, _instance);
- ICD_FROM_HANDLE(VkIcdSurfaceBase, surface, _surface);
-
- if (!surface)
- return;
-
- vk_free2(&instance->vk.alloc, pAllocator, surface);
-}
-
-VkResult anv_GetPhysicalDeviceSurfaceSupportKHR(
- VkPhysicalDevice physicalDevice,
- uint32_t queueFamilyIndex,
- VkSurfaceKHR surface,
- VkBool32* pSupported)
-{
- ANV_FROM_HANDLE(anv_physical_device, device, physicalDevice);
-
- return wsi_common_get_surface_support(&device->wsi_device,
- queueFamilyIndex,
- surface,
- pSupported);
-}
-
-VkResult anv_GetPhysicalDeviceSurfaceCapabilitiesKHR(
- VkPhysicalDevice physicalDevice,
- VkSurfaceKHR surface,
- VkSurfaceCapabilitiesKHR* pSurfaceCapabilities)
-{
- ANV_FROM_HANDLE(anv_physical_device, device, physicalDevice);
-
- return wsi_common_get_surface_capabilities(&device->wsi_device,
- surface,
- pSurfaceCapabilities);
-}
-
-VkResult anv_GetPhysicalDeviceSurfaceCapabilities2KHR(
- VkPhysicalDevice physicalDevice,
- const VkPhysicalDeviceSurfaceInfo2KHR* pSurfaceInfo,
- VkSurfaceCapabilities2KHR* pSurfaceCapabilities)
-{
- ANV_FROM_HANDLE(anv_physical_device, device, physicalDevice);
-
- return wsi_common_get_surface_capabilities2(&device->wsi_device,
- pSurfaceInfo,
- pSurfaceCapabilities);
-}
-
-VkResult anv_GetPhysicalDeviceSurfaceCapabilities2EXT(
- VkPhysicalDevice physicalDevice,
- VkSurfaceKHR surface,
- VkSurfaceCapabilities2EXT* pSurfaceCapabilities)
-{
- ANV_FROM_HANDLE(anv_physical_device, device, physicalDevice);
-
- return wsi_common_get_surface_capabilities2ext(&device->wsi_device,
- surface,
- pSurfaceCapabilities);
-}
-
-VkResult anv_GetPhysicalDeviceSurfaceFormatsKHR(
- VkPhysicalDevice physicalDevice,
- VkSurfaceKHR surface,
- uint32_t* pSurfaceFormatCount,
- VkSurfaceFormatKHR* pSurfaceFormats)
-{
- ANV_FROM_HANDLE(anv_physical_device, device, physicalDevice);
-
- return wsi_common_get_surface_formats(&device->wsi_device, surface,
- pSurfaceFormatCount, pSurfaceFormats);
-}
-
-VkResult anv_GetPhysicalDeviceSurfaceFormats2KHR(
- VkPhysicalDevice physicalDevice,
- const VkPhysicalDeviceSurfaceInfo2KHR* pSurfaceInfo,
- uint32_t* pSurfaceFormatCount,
- VkSurfaceFormat2KHR* pSurfaceFormats)
-{
- ANV_FROM_HANDLE(anv_physical_device, device, physicalDevice);
-
- return wsi_common_get_surface_formats2(&device->wsi_device, pSurfaceInfo,
- pSurfaceFormatCount, pSurfaceFormats);
-}
-
-VkResult anv_GetPhysicalDeviceSurfacePresentModesKHR(
- VkPhysicalDevice physicalDevice,
- VkSurfaceKHR surface,
- uint32_t* pPresentModeCount,
- VkPresentModeKHR* pPresentModes)
-{
- ANV_FROM_HANDLE(anv_physical_device, device, physicalDevice);
-
- return wsi_common_get_surface_present_modes(&device->wsi_device, surface,
- pPresentModeCount,
- pPresentModes);
-}
-
-VkResult anv_CreateSwapchainKHR(
- VkDevice _device,
- const VkSwapchainCreateInfoKHR* pCreateInfo,
- const VkAllocationCallbacks* pAllocator,
- VkSwapchainKHR* pSwapchain)
-{
- ANV_FROM_HANDLE(anv_device, device, _device);
- struct wsi_device *wsi_device = &device->physical->wsi_device;
- const VkAllocationCallbacks *alloc;
-
- if (pAllocator)
- alloc = pAllocator;
- else
- alloc = &device->vk.alloc;
-
- return wsi_common_create_swapchain(wsi_device, _device,
- pCreateInfo, alloc, pSwapchain);
-}
-
-void anv_DestroySwapchainKHR(
- VkDevice _device,
- VkSwapchainKHR swapchain,
- const VkAllocationCallbacks* pAllocator)
-{
- ANV_FROM_HANDLE(anv_device, device, _device);
- const VkAllocationCallbacks *alloc;
-
- if (pAllocator)
- alloc = pAllocator;
- else
- alloc = &device->vk.alloc;
-
- wsi_common_destroy_swapchain(_device, swapchain, alloc);
-}
-
-VkResult anv_GetSwapchainImagesKHR(
- VkDevice device,
- VkSwapchainKHR swapchain,
- uint32_t* pSwapchainImageCount,
- VkImage* pSwapchainImages)
-{
- return wsi_common_get_images(swapchain,
- pSwapchainImageCount,
- pSwapchainImages);
-}
-
-VkResult anv_AcquireNextImageKHR(
- VkDevice device,
- VkSwapchainKHR swapchain,
- uint64_t timeout,
- VkSemaphore semaphore,
- VkFence fence,
- uint32_t* pImageIndex)
-{
- VkAcquireNextImageInfoKHR acquire_info = {
- .sType = VK_STRUCTURE_TYPE_ACQUIRE_NEXT_IMAGE_INFO_KHR,
- .swapchain = swapchain,
- .timeout = timeout,
- .semaphore = semaphore,
- .fence = fence,
- .deviceMask = 0,
- };
-
- return anv_AcquireNextImage2KHR(device, &acquire_info, pImageIndex);
-}
-
VkResult anv_AcquireNextImage2KHR(
- VkDevice _device,
- const VkAcquireNextImageInfoKHR* pAcquireInfo,
- uint32_t* pImageIndex)
+ VkDevice _device,
+ const VkAcquireNextImageInfoKHR *pAcquireInfo,
+ uint32_t *pImageIndex)
{
- ANV_FROM_HANDLE(anv_device, device, _device);
+ VK_FROM_HANDLE(anv_device, device, _device);
+
+ VkResult result =
+ wsi_common_acquire_next_image2(&device->physical->wsi_device,
+ _device, pAcquireInfo, pImageIndex);
+ if (result == VK_SUCCESS)
+ anv_measure_acquire(device);
- anv_measure_acquire(device);
- return wsi_common_acquire_next_image2(&device->physical->wsi_device,
- _device, pAcquireInfo, pImageIndex);
+ return result;
}
VkResult anv_QueuePresentKHR(
@@ -291,111 +109,26 @@ VkResult anv_QueuePresentKHR(
{
ANV_FROM_HANDLE(anv_queue, queue, _queue);
struct anv_device *device = queue->device;
+ VkResult result;
if (device->debug_frame_desc) {
device->debug_frame_desc->frame_id++;
- if (!device->info.has_llc) {
- intel_clflush_range(device->debug_frame_desc,
- sizeof(*device->debug_frame_desc));
- }
}
- if (device->has_thread_submit &&
- pPresentInfo->waitSemaphoreCount > 0) {
- /* Make sure all of the dependency semaphores have materialized when
- * using a threaded submission.
- */
- VK_MULTIALLOC(ma);
- VK_MULTIALLOC_DECL(&ma, uint64_t, values,
- pPresentInfo->waitSemaphoreCount);
- VK_MULTIALLOC_DECL(&ma, uint32_t, syncobjs,
- pPresentInfo->waitSemaphoreCount);
-
- if (!vk_multialloc_alloc(&ma, &device->vk.alloc,
- VK_SYSTEM_ALLOCATION_SCOPE_COMMAND))
- return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+ if (u_trace_should_process(&device->ds.trace_context))
+ anv_queue_trace(queue, NULL, true /* frame */, false /* begin */);
- uint32_t wait_count = 0;
- for (uint32_t i = 0; i < pPresentInfo->waitSemaphoreCount; i++) {
- ANV_FROM_HANDLE(anv_semaphore, semaphore, pPresentInfo->pWaitSemaphores[i]);
- struct anv_semaphore_impl *impl =
- semaphore->temporary.type != ANV_SEMAPHORE_TYPE_NONE ?
- &semaphore->temporary : &semaphore->permanent;
-
- if (impl->type == ANV_SEMAPHORE_TYPE_DUMMY)
- continue;
- assert(impl->type == ANV_SEMAPHORE_TYPE_DRM_SYNCOBJ);
- syncobjs[wait_count] = impl->syncobj;
- values[wait_count] = 0;
- wait_count++;
- }
-
- int ret = 0;
- if (wait_count > 0) {
- ret =
- anv_gem_syncobj_timeline_wait(device,
- syncobjs, values, wait_count,
- anv_get_absolute_timeout(INT64_MAX),
- true /* wait_all */,
- true /* wait_materialize */);
- }
-
- vk_free(&device->vk.alloc, values);
-
- if (ret)
- return vk_error(VK_ERROR_DEVICE_LOST);
- }
+ result = vk_queue_wait_before_present(&queue->vk, pPresentInfo);
+ if (result != VK_SUCCESS)
+ return result;
- VkResult result = wsi_common_queue_present(&device->physical->wsi_device,
- anv_device_to_handle(queue->device),
- _queue, 0,
- pPresentInfo);
+ result = wsi_common_queue_present(&device->physical->wsi_device,
+ anv_device_to_handle(queue->device),
+ _queue, 0,
+ pPresentInfo);
- for (uint32_t i = 0; i < pPresentInfo->waitSemaphoreCount; i++) {
- ANV_FROM_HANDLE(anv_semaphore, semaphore, pPresentInfo->pWaitSemaphores[i]);
- /* From the Vulkan 1.0.53 spec:
- *
- * "If the import is temporary, the implementation must restore the
- * semaphore to its prior permanent state after submitting the next
- * semaphore wait operation."
- */
- anv_semaphore_reset_temporary(queue->device, semaphore);
- }
+ if (u_trace_should_process(&device->ds.trace_context))
+ anv_queue_trace(queue, NULL, true /* frame */, true /* begin */);
return result;
}
-
-VkResult anv_GetDeviceGroupPresentCapabilitiesKHR(
- VkDevice device,
- VkDeviceGroupPresentCapabilitiesKHR* pCapabilities)
-{
- memset(pCapabilities->presentMask, 0,
- sizeof(pCapabilities->presentMask));
- pCapabilities->presentMask[0] = 0x1;
- pCapabilities->modes = VK_DEVICE_GROUP_PRESENT_MODE_LOCAL_BIT_KHR;
-
- return VK_SUCCESS;
-}
-
-VkResult anv_GetDeviceGroupSurfacePresentModesKHR(
- VkDevice device,
- VkSurfaceKHR surface,
- VkDeviceGroupPresentModeFlagsKHR* pModes)
-{
- *pModes = VK_DEVICE_GROUP_PRESENT_MODE_LOCAL_BIT_KHR;
-
- return VK_SUCCESS;
-}
-
-VkResult anv_GetPhysicalDevicePresentRectanglesKHR(
- VkPhysicalDevice physicalDevice,
- VkSurfaceKHR surface,
- uint32_t* pRectCount,
- VkRect2D* pRects)
-{
- ANV_FROM_HANDLE(anv_physical_device, device, physicalDevice);
-
- return wsi_common_get_present_rectangles(&device->wsi_device,
- surface,
- pRectCount, pRects);
-}
diff --git a/src/intel/vulkan/anv_wsi_display.c b/src/intel/vulkan/anv_wsi_display.c
deleted file mode 100644
index 4bb0453f55f..00000000000
--- a/src/intel/vulkan/anv_wsi_display.c
+++ /dev/null
@@ -1,338 +0,0 @@
-/*
- * Copyright © 2017 Keith Packard
- *
- * Permission to use, copy, modify, distribute, and sell this software and its
- * documentation for any purpose is hereby granted without fee, provided that
- * the above copyright notice appear in all copies and that both that copyright
- * notice and this permission notice appear in supporting documentation, and
- * that the name of the copyright holders not be used in advertising or
- * publicity pertaining to distribution of the software without specific,
- * written prior permission. The copyright holders make no representations
- * about the suitability of this software for any purpose. It is provided "as
- * is" without express or implied warranty.
- *
- * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
- * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO
- * EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY SPECIAL, INDIRECT OR
- * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
- * DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
- * TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
- * OF THIS SOFTWARE.
- */
-
-#include "anv_private.h"
-#include "wsi_common.h"
-#include "vk_util.h"
-#include "wsi_common_display.h"
-
-VkResult
-anv_GetPhysicalDeviceDisplayPropertiesKHR(VkPhysicalDevice physical_device,
- uint32_t *property_count,
- VkDisplayPropertiesKHR *properties)
-{
- ANV_FROM_HANDLE(anv_physical_device, pdevice, physical_device);
-
- return wsi_display_get_physical_device_display_properties(
- physical_device,
- &pdevice->wsi_device,
- property_count,
- properties);
-}
-
-VkResult
-anv_GetPhysicalDeviceDisplayProperties2KHR(
- VkPhysicalDevice physicalDevice,
- uint32_t* pPropertyCount,
- VkDisplayProperties2KHR* pProperties)
-{
- ANV_FROM_HANDLE(anv_physical_device, pdevice, physicalDevice);
-
- return wsi_display_get_physical_device_display_properties2(
- physicalDevice, &pdevice->wsi_device,
- pPropertyCount, pProperties);
-}
-
-VkResult
-anv_GetPhysicalDeviceDisplayPlanePropertiesKHR(
- VkPhysicalDevice physical_device,
- uint32_t *property_count,
- VkDisplayPlanePropertiesKHR *properties)
-{
- ANV_FROM_HANDLE(anv_physical_device, pdevice, physical_device);
-
- return wsi_display_get_physical_device_display_plane_properties(
- physical_device, &pdevice->wsi_device,
- property_count, properties);
-}
-
-VkResult
-anv_GetPhysicalDeviceDisplayPlaneProperties2KHR(
- VkPhysicalDevice physicalDevice,
- uint32_t* pPropertyCount,
- VkDisplayPlaneProperties2KHR* pProperties)
-{
- ANV_FROM_HANDLE(anv_physical_device, pdevice, physicalDevice);
-
- return wsi_display_get_physical_device_display_plane_properties2(
- physicalDevice, &pdevice->wsi_device,
- pPropertyCount, pProperties);
-}
-
-VkResult
-anv_GetDisplayPlaneSupportedDisplaysKHR(VkPhysicalDevice physical_device,
- uint32_t plane_index,
- uint32_t *display_count,
- VkDisplayKHR *displays)
-{
- ANV_FROM_HANDLE(anv_physical_device, pdevice, physical_device);
-
- return wsi_display_get_display_plane_supported_displays(physical_device,
- &pdevice->wsi_device,
- plane_index,
- display_count,
- displays);
-}
-
-
-VkResult
-anv_GetDisplayModePropertiesKHR(VkPhysicalDevice physical_device,
- VkDisplayKHR display,
- uint32_t *property_count,
- VkDisplayModePropertiesKHR *properties)
-{
- ANV_FROM_HANDLE(anv_physical_device, pdevice, physical_device);
-
- return wsi_display_get_display_mode_properties(physical_device,
- &pdevice->wsi_device,
- display,
- property_count,
- properties);
-}
-
-VkResult
-anv_GetDisplayModeProperties2KHR(
- VkPhysicalDevice physicalDevice,
- VkDisplayKHR display,
- uint32_t* pPropertyCount,
- VkDisplayModeProperties2KHR* pProperties)
-{
- ANV_FROM_HANDLE(anv_physical_device, pdevice, physicalDevice);
-
- return wsi_display_get_display_mode_properties2(physicalDevice,
- &pdevice->wsi_device,
- display,
- pPropertyCount,
- pProperties);
-}
-
-VkResult
-anv_CreateDisplayModeKHR(VkPhysicalDevice physical_device,
- VkDisplayKHR display,
- const VkDisplayModeCreateInfoKHR *create_info,
- const VkAllocationCallbacks *allocator,
- VkDisplayModeKHR *mode)
-{
- ANV_FROM_HANDLE(anv_physical_device, pdevice, physical_device);
-
- return wsi_display_create_display_mode(physical_device,
- &pdevice->wsi_device,
- display,
- create_info,
- allocator,
- mode);
-}
-
-VkResult
-anv_GetDisplayPlaneCapabilitiesKHR(VkPhysicalDevice physical_device,
- VkDisplayModeKHR mode_khr,
- uint32_t plane_index,
- VkDisplayPlaneCapabilitiesKHR *capabilities)
-{
- ANV_FROM_HANDLE(anv_physical_device, pdevice, physical_device);
-
- return wsi_get_display_plane_capabilities(physical_device,
- &pdevice->wsi_device,
- mode_khr,
- plane_index,
- capabilities);
-}
-
-VkResult
-anv_GetDisplayPlaneCapabilities2KHR(
- VkPhysicalDevice physicalDevice,
- const VkDisplayPlaneInfo2KHR* pDisplayPlaneInfo,
- VkDisplayPlaneCapabilities2KHR* pCapabilities)
-{
- ANV_FROM_HANDLE(anv_physical_device, pdevice, physicalDevice);
-
- return wsi_get_display_plane_capabilities2(physicalDevice,
- &pdevice->wsi_device,
- pDisplayPlaneInfo,
- pCapabilities);
-}
-
-VkResult
-anv_CreateDisplayPlaneSurfaceKHR(
- VkInstance _instance,
- const VkDisplaySurfaceCreateInfoKHR *create_info,
- const VkAllocationCallbacks *allocator,
- VkSurfaceKHR *surface)
-{
- ANV_FROM_HANDLE(anv_instance, instance, _instance);
- const VkAllocationCallbacks *alloc;
-
- if (allocator)
- alloc = allocator;
- else
- alloc = &instance->vk.alloc;
-
- return wsi_create_display_surface(_instance, alloc, create_info, surface);
-}
-
-VkResult
-anv_ReleaseDisplayEXT(VkPhysicalDevice physical_device,
- VkDisplayKHR display)
-{
- ANV_FROM_HANDLE(anv_physical_device, pdevice, physical_device);
-
- return wsi_release_display(physical_device,
- &pdevice->wsi_device,
- display);
-}
-
-#ifdef VK_USE_PLATFORM_XLIB_XRANDR_EXT
-VkResult
-anv_AcquireXlibDisplayEXT(VkPhysicalDevice physical_device,
- Display *dpy,
- VkDisplayKHR display)
-{
- ANV_FROM_HANDLE(anv_physical_device, pdevice, physical_device);
-
- return wsi_acquire_xlib_display(physical_device,
- &pdevice->wsi_device,
- dpy,
- display);
-}
-
-VkResult
-anv_GetRandROutputDisplayEXT(VkPhysicalDevice physical_device,
- Display *dpy,
- RROutput output,
- VkDisplayKHR *display)
-{
- ANV_FROM_HANDLE(anv_physical_device, pdevice, physical_device);
-
- return wsi_get_randr_output_display(physical_device,
- &pdevice->wsi_device,
- dpy,
- output,
- display);
-}
-#endif /* VK_USE_PLATFORM_XLIB_XRANDR_EXT */
-
-/* VK_EXT_display_control */
-
-VkResult
-anv_DisplayPowerControlEXT(VkDevice _device,
- VkDisplayKHR display,
- const VkDisplayPowerInfoEXT *display_power_info)
-{
- ANV_FROM_HANDLE(anv_device, device, _device);
-
- return wsi_display_power_control(
- _device, &device->physical->wsi_device,
- display, display_power_info);
-}
-
-VkResult
-anv_RegisterDeviceEventEXT(VkDevice _device,
- const VkDeviceEventInfoEXT *device_event_info,
- const VkAllocationCallbacks *allocator,
- VkFence *_fence)
-{
- ANV_FROM_HANDLE(anv_device, device, _device);
- struct anv_fence *fence;
- VkResult ret;
-
- fence = vk_zalloc2(&device->vk.alloc, allocator, sizeof (*fence), 8,
- VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
- if (!fence)
- return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
-
- fence->permanent.type = ANV_FENCE_TYPE_WSI;
-
- ret = wsi_register_device_event(_device,
- &device->physical->wsi_device,
- device_event_info,
- allocator,
- &fence->permanent.fence_wsi,
- -1);
- if (ret == VK_SUCCESS)
- *_fence = anv_fence_to_handle(fence);
- else
- vk_free2(&device->vk.alloc, allocator, fence);
- return ret;
-}
-
-VkResult
-anv_RegisterDisplayEventEXT(VkDevice _device,
- VkDisplayKHR display,
- const VkDisplayEventInfoEXT *display_event_info,
- const VkAllocationCallbacks *allocator,
- VkFence *_fence)
-{
- ANV_FROM_HANDLE(anv_device, device, _device);
- struct anv_fence *fence;
- VkResult ret;
-
- fence = vk_zalloc2(&device->vk.alloc, allocator, sizeof (*fence), 8,
- VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
- if (!fence)
- return VK_ERROR_OUT_OF_HOST_MEMORY;
-
- fence->permanent.type = ANV_FENCE_TYPE_WSI;
-
- ret = wsi_register_display_event(
- _device, &device->physical->wsi_device,
- display, display_event_info, allocator, &fence->permanent.fence_wsi, -1);
-
- if (ret == VK_SUCCESS)
- *_fence = anv_fence_to_handle(fence);
- else
- vk_free2(&device->vk.alloc, allocator, fence);
- return ret;
-}
-
-VkResult
-anv_GetSwapchainCounterEXT(VkDevice _device,
- VkSwapchainKHR swapchain,
- VkSurfaceCounterFlagBitsEXT flag_bits,
- uint64_t *value)
-{
- ANV_FROM_HANDLE(anv_device, device, _device);
-
- return wsi_get_swapchain_counter(
- _device, &device->physical->wsi_device,
- swapchain, flag_bits, value);
-}
-
-VkResult
-anv_AcquireDrmDisplayEXT(VkPhysicalDevice physical_device,
- int32_t drm_fd,
- VkDisplayKHR display)
-{
- ANV_FROM_HANDLE(anv_physical_device, pdevice, physical_device);
-
- return wsi_acquire_drm_display(physical_device, &pdevice->wsi_device, drm_fd, display);
-}
-
-VkResult
-anv_GetDrmDisplayEXT(VkPhysicalDevice physical_device,
- int32_t drm_fd,
- uint32_t connector_id,
- VkDisplayKHR *display)
-{
- ANV_FROM_HANDLE(anv_physical_device, pdevice, physical_device);
-
- return wsi_get_drm_display(physical_device, &pdevice->wsi_device, drm_fd, connector_id, display);
-}
diff --git a/src/intel/vulkan/anv_wsi_x11.c b/src/intel/vulkan/anv_wsi_x11.c
deleted file mode 100644
index 702eb57aafe..00000000000
--- a/src/intel/vulkan/anv_wsi_x11.c
+++ /dev/null
@@ -1,96 +0,0 @@
-/*
- * Copyright © 2015 Intel Corporation
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- */
-
-#include <X11/Xlib-xcb.h>
-#include <X11/xshmfence.h>
-#include <xcb/xcb.h>
-#include <xcb/dri3.h>
-#include <xcb/present.h>
-
-#include "wsi_common_x11.h"
-#include "anv_private.h"
-
-VkBool32 anv_GetPhysicalDeviceXcbPresentationSupportKHR(
- VkPhysicalDevice physicalDevice,
- uint32_t queueFamilyIndex,
- xcb_connection_t* connection,
- xcb_visualid_t visual_id)
-{
- ANV_FROM_HANDLE(anv_physical_device, device, physicalDevice);
-
- return wsi_get_physical_device_xcb_presentation_support(
- &device->wsi_device,
- queueFamilyIndex,
- connection, visual_id);
-}
-
-VkBool32 anv_GetPhysicalDeviceXlibPresentationSupportKHR(
- VkPhysicalDevice physicalDevice,
- uint32_t queueFamilyIndex,
- Display* dpy,
- VisualID visualID)
-{
- ANV_FROM_HANDLE(anv_physical_device, device, physicalDevice);
-
- return wsi_get_physical_device_xcb_presentation_support(
- &device->wsi_device,
- queueFamilyIndex,
- XGetXCBConnection(dpy), visualID);
-}
-
-VkResult anv_CreateXcbSurfaceKHR(
- VkInstance _instance,
- const VkXcbSurfaceCreateInfoKHR* pCreateInfo,
- const VkAllocationCallbacks* pAllocator,
- VkSurfaceKHR* pSurface)
-{
- ANV_FROM_HANDLE(anv_instance, instance, _instance);
- const VkAllocationCallbacks *alloc;
- assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_XCB_SURFACE_CREATE_INFO_KHR);
-
- if (pAllocator)
- alloc = pAllocator;
- else
- alloc = &instance->vk.alloc;
-
- return wsi_create_xcb_surface(alloc, pCreateInfo, pSurface);
-}
-
-VkResult anv_CreateXlibSurfaceKHR(
- VkInstance _instance,
- const VkXlibSurfaceCreateInfoKHR* pCreateInfo,
- const VkAllocationCallbacks* pAllocator,
- VkSurfaceKHR* pSurface)
-{
- ANV_FROM_HANDLE(anv_instance, instance, _instance);
- const VkAllocationCallbacks *alloc;
-
- assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_XLIB_SURFACE_CREATE_INFO_KHR);
-
- if (pAllocator)
- alloc = pAllocator;
- else
- alloc = &instance->vk.alloc;
-
- return wsi_create_xlib_surface(alloc, pCreateInfo, pSurface);
-}
diff --git a/src/intel/vulkan/genX_acceleration_structure.c b/src/intel/vulkan/genX_acceleration_structure.c
new file mode 100644
index 00000000000..db5c34cdcdb
--- /dev/null
+++ b/src/intel/vulkan/genX_acceleration_structure.c
@@ -0,0 +1,1287 @@
+/*
+ * Copyright © 2020 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "anv_private.h"
+
+#include <math.h>
+
+#include "util/u_debug.h"
+#include "util/half_float.h"
+#include "util/u_atomic.h"
+
+#include "genxml/gen_macros.h"
+#include "genxml/genX_pack.h"
+#include "genxml/genX_rt_pack.h"
+
+#include "ds/intel_tracepoints.h"
+
+#if GFX_VERx10 == 125
+#include "grl/grl_structs.h"
+
+/* Wait for the previous dispatches to finish and flush their data port
+ * writes.
+ */
+#define ANV_GRL_FLUSH_FLAGS (ANV_PIPE_END_OF_PIPE_SYNC_BIT | \
+ ANV_PIPE_DATA_CACHE_FLUSH_BIT | \
+ ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT)
+
+static const VkAccelerationStructureGeometryKHR *
+get_geometry(const VkAccelerationStructureBuildGeometryInfoKHR *pInfo,
+ uint32_t index)
+{
+ return pInfo->pGeometries ? &pInfo->pGeometries[index] :
+ pInfo->ppGeometries[index];
+}
+
+static size_t align_transient_size(size_t bytes)
+{
+ return align_uintptr(bytes, 64);
+}
+
+static size_t align_private_size(size_t bytes)
+{
+ return align_uintptr(bytes, 64);
+}
+
+static size_t get_scheduler_size(size_t num_builds)
+{
+ size_t scheduler_size = sizeof(union SchedulerUnion);
+ /* add more memory for qnode creation stage if needed */
+ if (num_builds > QNODE_GLOBAL_ROOT_BUFFER_MIN_ENTRIES_NUM) {
+ scheduler_size += (num_builds - QNODE_GLOBAL_ROOT_BUFFER_MIN_ENTRIES_NUM) * 2 *
+ sizeof(struct QNodeGlobalRootBufferEntry);
+ }
+
+ return align_private_size(scheduler_size);
+}
+
+static size_t
+get_batched_binnedsah_transient_mem_size(size_t num_builds)
+{
+ if (num_builds == 0)
+ return 0;
+ return num_builds * (sizeof(struct SAHBuildBuffersInfo) + sizeof(gpuva_t));
+}
+
+static size_t
+get_batched_binnedsah_private_mem_size(size_t num_builds)
+{
+ if (num_builds == 0)
+ return 0;
+
+ size_t globals_size = align_private_size(num_builds * sizeof(struct SAHBuildGlobals));
+ return globals_size + get_scheduler_size(num_builds);
+}
+
+static uint32_t
+estimate_qbvh6_nodes(const uint32_t N)
+{
+ const uint32_t W = 6;
+ const uint32_t N0 = N / 2 + N % 2; // lowest level with 2 leaves per QBVH6 node
+ const uint32_t N1 = N0 / W + (N0 % W ? 1 : 0); // filled level
+ const uint32_t N2 = N0 / W + (N1 % W ? 1 : 0); // filled level
+ const uint32_t N3 = N0 / W + (N2 % W ? 1 : 0); // filled level
+ const uint32_t N4 = N3; // overestimate remaining nodes
+ return N0 + N1 + N2 + N3 + N4;
+}
+
+/* Estimates the worst-case number of QBVH6 nodes for a top-down BVH
+ * build that is guaranteed to produce subtrees with N >= K primitives,
+ * from each of which a single QBVH6 node is created.
+ */
+static uint32_t
+estimate_qbvh6_nodes_minK(const uint32_t N, uint32_t K)
+{
+ const uint32_t N0 = N / K + (N % K ? 1 : 0); // lowest level of nodes, each with at least K leaves
+ return N0 + estimate_qbvh6_nodes(N0);
+}
+
+static size_t
+estimate_qbvh6_fatleafs(const size_t P)
+{
+ return P;
+}
+
+static size_t
+estimate_qbvh6_nodes_worstcase(const size_t P)
+{
+ const size_t F = estimate_qbvh6_fatleafs(P);
+
+ // Worst case: each inner node has 5 fat-leaf children, so there are
+ // F/5 inner nodes and F fat-leaves.
+ return F + ceil(F/5.0);
+}
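
To make the bound concrete: with one fat-leaf per primitive and, in the worst case, five fat-leaf children per inner node, P primitives cost roughly P + ceil(P / 5) nodes. A quick sanity check (editorial, not driver code):

   /* estimate_qbvh6_nodes_worstcase(1000) == 1000 + ceil(1000 / 5.0) == 1200,
    * so get_gpu_size_estimate() reserves about 1200 * sizeof_InternalNode
    * bytes of inner-node space for a 1000-primitive build. */
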
+
+#define sizeof_PrimRef 32
+#define sizeof_HwInstanceLeaf (GENX(RT_BVH_INSTANCE_LEAF_length) * 4)
+#define sizeof_InternalNode (GENX(RT_BVH_INTERNAL_NODE_length) * 4)
+#define sizeof_Procedural (GENX(RT_BVH_PROCEDURAL_LEAF_length) * 4)
+#define sizeof_Quad (GENX(RT_BVH_QUAD_LEAF_length) * 4)
+
+static struct MKSizeEstimate
+get_gpu_size_estimate(const VkAccelerationStructureBuildGeometryInfoKHR *pInfo,
+ const VkAccelerationStructureBuildRangeInfoKHR *pBuildRangeInfos,
+ const uint32_t *pMaxPrimitiveCounts)
+{
+ uint32_t num_triangles = 0, num_aabbs = 0, num_instances = 0;
+ for (unsigned g = 0; g < pInfo->geometryCount; g++) {
+ const VkAccelerationStructureGeometryKHR *pGeometry =
+ get_geometry(pInfo, g);
+ uint32_t prim_count = pBuildRangeInfos != NULL ?
+ pBuildRangeInfos[g].primitiveCount : pMaxPrimitiveCounts[g];
+
+ switch (pGeometry->geometryType) {
+ case VK_GEOMETRY_TYPE_TRIANGLES_KHR:
+ num_triangles += prim_count;
+ break;
+ case VK_GEOMETRY_TYPE_AABBS_KHR:
+ num_aabbs += prim_count;
+ break;
+ case VK_GEOMETRY_TYPE_INSTANCES_KHR:
+ num_instances += prim_count;
+ break;
+ default:
+ unreachable("Unsupported geometry type");
+ }
+ }
+ const uint32_t num_primitives = num_triangles + num_aabbs + num_instances;
+
+ struct MKSizeEstimate est = {};
+
+ uint64_t size = sizeof(BVHBase);
+ size = align64(size, 64);
+
+ /* Must immediately follow BVHBase because we use a fixed offset to the nodes. */
+ est.node_data_start = size;
+
+ switch (pInfo->type) {
+ case VK_ACCELERATION_STRUCTURE_TYPE_TOP_LEVEL_KHR: {
+ assert(num_triangles == 0 && num_aabbs == 0);
+
+ est.numPrimitives = num_instances;
+ est.numPrimitivesToSplit = 0;
+ est.numBuildPrimitives = est.numPrimitives + est.numPrimitivesToSplit;
+
+ est.min_primitives = est.numPrimitives;
+ est.max_primitives = est.numPrimitives + est.numPrimitivesToSplit;
+
+ unsigned int sizeInnerNodes =
+ (unsigned int) estimate_qbvh6_nodes_worstcase(est.numBuildPrimitives) *
+ sizeof_InternalNode;
+ if (sizeInnerNodes == 0)
+ sizeInnerNodes = sizeof_InternalNode;
+
+ est.max_inner_nodes = sizeInnerNodes / sizeof_InternalNode;
+
+ size += sizeInnerNodes;
+ STATIC_ASSERT(sizeof_InternalNode % 64 == 0);
+
+ est.leaf_data_start = size;
+ size += est.numBuildPrimitives * sizeof_HwInstanceLeaf;
+ STATIC_ASSERT(sizeof_HwInstanceLeaf % 64 == 0);
+
+ est.leaf_data_size = est.numBuildPrimitives * sizeof_HwInstanceLeaf;
+
+ break;
+ }
+
+ case VK_ACCELERATION_STRUCTURE_TYPE_BOTTOM_LEVEL_KHR: {
+ assert(num_instances == 0);
+
+ /* RT: TODO */
+ const float split_factor = 0.0f;
+ uint32_t num_prims_to_split = 0;
+ if (false)
+ num_prims_to_split = num_triangles + (double)split_factor;
+
+ const uint32_t num_build_triangles = num_triangles + num_prims_to_split;
+ const uint32_t num_build_primitives = num_build_triangles + num_aabbs;
+
+ est.numPrimitives = num_primitives;
+ est.numTriangles = num_triangles;
+ est.numProcedurals = num_aabbs;
+ est.numMeshes = pInfo->geometryCount;
+ est.numBuildPrimitives = num_build_primitives;
+ est.numPrimitivesToSplit = num_prims_to_split;
+ est.max_instance_leafs = 0;
+
+ est.min_primitives = (size_t)(num_build_triangles * 0.5f + num_aabbs);
+ est.max_primitives = num_build_triangles + num_aabbs;
+
+ size_t nodeBytes = 0;
+ nodeBytes += estimate_qbvh6_nodes_worstcase(num_build_triangles) * sizeof_InternalNode;
+ nodeBytes += estimate_qbvh6_nodes_worstcase(num_aabbs) * sizeof_InternalNode;
+ if (nodeBytes == 0) // for the case with 0 primitives
+ nodeBytes = sizeof_InternalNode;
+ nodeBytes = MAX2(nodeBytes, 8 * (size_t)num_build_primitives); // for primref_index0/1 buffers
+
+ est.max_inner_nodes = nodeBytes / sizeof_InternalNode;
+
+ size += nodeBytes;
+ STATIC_ASSERT(sizeof_InternalNode % 64 == 0);
+
+ est.leaf_data_start = size;
+ size += num_build_triangles * sizeof_Quad;
+ STATIC_ASSERT(sizeof_Quad % 64 == 0);
+
+ est.procedural_data_start = size;
+ size += num_aabbs * sizeof_Procedural;
+ STATIC_ASSERT(sizeof_Procedural % 64 == 0);
+
+ est.leaf_data_size = num_build_triangles * sizeof_Quad +
+ num_aabbs * sizeof_Procedural;
+
+ if (num_build_primitives == 0)
+ size += MAX2(sizeof_Quad, sizeof_Procedural);
+ break;
+ }
+
+ default:
+ unreachable("Unsupported acceleration structure type");
+ }
+
+ size = align64(size, 64);
+ est.instance_descs_start = size;
+ size += sizeof(struct InstanceDesc) * num_instances;
+
+ est.geo_meta_data_start = size;
+ size += sizeof(struct GeoMetaData) * pInfo->geometryCount;
+ size = align64(size, 64);
+
+ assert(size == align64(size, 64));
+ est.back_pointer_start = size;
+
+ const bool alloc_backpointers = false; /* RT TODO */
+ if (alloc_backpointers) {
+ size += est.max_inner_nodes * sizeof(uint32_t);
+ size = align64(size, 64);
+ }
+
+ assert(size < UINT32_MAX);
+ est.sizeTotal = align64(size, 64);
+
+ return est;
+}
+
+struct scratch_layout {
+ gpuva_t base;
+ uint32_t total_size;
+
+ gpuva_t primrefs;
+ gpuva_t globals;
+ gpuva_t leaf_index_buffers;
+ uint32_t leaf_index_buffer_stride;
+
+ /* new_sah */
+ gpuva_t qnode_buffer;
+ gpuva_t bvh2_buffer;
+};
+
+static size_t
+get_bvh2_size(uint32_t num_primitives)
+{
+ if (num_primitives == 0)
+ return 0;
+ return sizeof(struct BVH2) +
+ (2 * num_primitives - 1) * sizeof(struct BVH2Node);
+}
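
The 2 * N - 1 factor is the usual bound for a binary tree: N leaves imply at most N - 1 internal nodes. For example (editorial note, not driver code):

   /* get_bvh2_size(4) == sizeof(struct BVH2) + 7 * sizeof(struct BVH2Node),
    * i.e. 4 leaf nodes plus at most 3 internal BVH2 nodes. */
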
+
+static struct scratch_layout
+get_gpu_scratch_layout(struct anv_address base,
+ struct MKSizeEstimate est,
+ enum anv_rt_bvh_build_method build_method)
+{
+ struct scratch_layout scratch = {
+ .base = anv_address_physical(base),
+ };
+ gpuva_t current = anv_address_physical(base);
+
+ scratch.globals = current;
+ current += sizeof(struct Globals);
+
+ scratch.primrefs = intel_canonical_address(current);
+ current += est.numBuildPrimitives * sizeof_PrimRef;
+
+ scratch.leaf_index_buffers = intel_canonical_address(current);
+ current += est.numBuildPrimitives * sizeof(uint32_t) * 2;
+ scratch.leaf_index_buffer_stride = sizeof(uint32_t);
+
+ switch (build_method) {
+ case ANV_BVH_BUILD_METHOD_TRIVIAL:
+ break;
+
+ case ANV_BVH_BUILD_METHOD_NEW_SAH: {
+ size_t bvh2_size = get_bvh2_size(est.numBuildPrimitives);
+ if (est.leaf_data_size < bvh2_size) {
+ scratch.bvh2_buffer = intel_canonical_address(current);
+ current += bvh2_size;
+ }
+
+ scratch.qnode_buffer = intel_canonical_address(current);
+ current += 2 * sizeof(dword) * est.max_inner_nodes;
+ break;
+ }
+
+ default:
+ unreachable("invalid build");
+ }
+
+ assert((current - scratch.base) < UINT32_MAX);
+ scratch.total_size = current - scratch.base;
+
+ return scratch;
+}
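
Reading the layout back, scratch memory is consumed front to back: a Globals block, one 32-byte PrimRef per build primitive, two 32-bit leaf-index entries per primitive, and then the buffers specific to the chosen builder. A rough sketch of the trivial-builder total for n_prims build primitives (a sketch based on the sizes used above, not a replacement for total_size):

   size_t trivial_scratch_bytes(uint32_t n_prims)
   {
      return sizeof(struct Globals) +          /* build globals      */
             n_prims * sizeof_PrimRef +        /* 32-byte PrimRefs   */
             n_prims * 2 * sizeof(uint32_t);   /* leaf index buffers */
   }

The NEW_SAH path additionally appends the qnode buffer (2 dwords per worst-case inner node) and, when the BVH buffer's leaf area is too small to be reused, a separate BVH2 buffer.
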
+
+static void
+anv_get_gpu_acceleration_structure_size(
+ UNUSED struct anv_device *device,
+ VkAccelerationStructureBuildTypeKHR buildType,
+ const VkAccelerationStructureBuildGeometryInfoKHR* pBuildInfo,
+ const uint32_t* pMaxPrimitiveCounts,
+ VkAccelerationStructureBuildSizesInfoKHR* pSizeInfo)
+{
+
+ struct MKSizeEstimate est = get_gpu_size_estimate(pBuildInfo, NULL,
+ pMaxPrimitiveCounts);
+ struct scratch_layout scratch = get_gpu_scratch_layout(ANV_NULL_ADDRESS, est,
+ device->bvh_build_method);
+
+ pSizeInfo->accelerationStructureSize = est.sizeTotal;
+ pSizeInfo->buildScratchSize = scratch.total_size;
+ pSizeInfo->updateScratchSize = scratch.total_size; /* TODO */
+}
+
+void
+genX(GetAccelerationStructureBuildSizesKHR)(
+ VkDevice _device,
+ VkAccelerationStructureBuildTypeKHR buildType,
+ const VkAccelerationStructureBuildGeometryInfoKHR* pBuildInfo,
+ const uint32_t* pMaxPrimitiveCounts,
+ VkAccelerationStructureBuildSizesInfoKHR* pSizeInfo)
+{
+ ANV_FROM_HANDLE(anv_device, device, _device);
+ assert(pSizeInfo->sType ==
+ VK_STRUCTURE_TYPE_ACCELERATION_STRUCTURE_BUILD_SIZES_INFO_KHR);
+
+ VkAccelerationStructureBuildSizesInfoKHR gpu_size_info;
+ anv_get_gpu_acceleration_structure_size(device, buildType, pBuildInfo,
+ pMaxPrimitiveCounts,
+ &gpu_size_info);
+
+ pSizeInfo->accelerationStructureSize =
+ gpu_size_info.accelerationStructureSize;
+ pSizeInfo->buildScratchSize = gpu_size_info.buildScratchSize;
+ pSizeInfo->updateScratchSize = gpu_size_info.updateScratchSize;
+}
+
+void
+genX(GetDeviceAccelerationStructureCompatibilityKHR)(
+ VkDevice _device,
+ const VkAccelerationStructureVersionInfoKHR* pVersionInfo,
+ VkAccelerationStructureCompatibilityKHR* pCompatibility)
+{
+ ANV_FROM_HANDLE(anv_device, device, _device);
+
+ if (memcmp(pVersionInfo->pVersionData,
+ device->physical->rt_uuid,
+ sizeof(device->physical->rt_uuid)) == 0) {
+ *pCompatibility = VK_ACCELERATION_STRUCTURE_COMPATIBILITY_COMPATIBLE_KHR;
+ } else {
+ *pCompatibility = VK_ACCELERATION_STRUCTURE_COMPATIBILITY_INCOMPATIBLE_KHR;
+ }
+}
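
For reference, the version data compared here comes from the 2 * VK_UUID_SIZE-byte header of a serialized acceleration structure. A hedged application-side sketch (serialized_header is assumed to hold those header bytes):

   VkAccelerationStructureVersionInfoKHR version_info = {
      .sType = VK_STRUCTURE_TYPE_ACCELERATION_STRUCTURE_VERSION_INFO_KHR,
      .pVersionData = serialized_header,
   };
   VkAccelerationStructureCompatibilityKHR compat;
   vkGetDeviceAccelerationStructureCompatibilityKHR(dev, &version_info, &compat);
   if (compat != VK_ACCELERATION_STRUCTURE_COMPATIBILITY_COMPATIBLE_KHR) {
      /* Rebuild instead of deserializing the saved structure. */
   }
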
+
+static inline uint8_t
+vk_to_grl_GeometryFlags(VkGeometryFlagsKHR flags)
+{
+ uint8_t grl_flags = GEOMETRY_FLAG_NONE;
+ unsigned mask = flags;
+ while (mask) {
+ int i = u_bit_scan(&mask);
+ switch ((VkGeometryFlagBitsKHR)(1u << i)) {
+ case VK_GEOMETRY_OPAQUE_BIT_KHR:
+ grl_flags |= GEOMETRY_FLAG_OPAQUE;
+ break;
+ case VK_GEOMETRY_NO_DUPLICATE_ANY_HIT_INVOCATION_BIT_KHR:
+ grl_flags |= GEOMETRY_FLAG_NO_DUPLICATE_ANYHIT_INVOCATION;
+ break;
+ default:
+ unreachable("Unsupported acceleration structure build flag");
+ }
+ }
+ return grl_flags;
+}
+
+static inline IndexFormat
+vk_to_grl_IndexFormat(VkIndexType type)
+{
+ switch (type) {
+ case VK_INDEX_TYPE_NONE_KHR: return INDEX_FORMAT_NONE;
+ case VK_INDEX_TYPE_UINT8_KHR: unreachable("No UINT8 support yet");
+ case VK_INDEX_TYPE_UINT16: return INDEX_FORMAT_R16_UINT;
+ case VK_INDEX_TYPE_UINT32: return INDEX_FORMAT_R32_UINT;
+ default:
+ unreachable("Unsupported index type");
+ }
+}
+
+static inline VertexFormat
+vk_to_grl_VertexFormat(VkFormat format)
+{
+ switch (format) {
+ case VK_FORMAT_R32G32_SFLOAT: return VERTEX_FORMAT_R32G32_FLOAT;
+ case VK_FORMAT_R32G32B32_SFLOAT: return VERTEX_FORMAT_R32G32B32_FLOAT;
+ case VK_FORMAT_R16G16_SFLOAT: return VERTEX_FORMAT_R16G16_FLOAT;
+ case VK_FORMAT_R16G16B16A16_SFLOAT: return VERTEX_FORMAT_R16G16B16A16_FLOAT;
+ case VK_FORMAT_R16G16_SNORM: return VERTEX_FORMAT_R16G16_SNORM;
+ case VK_FORMAT_R16G16B16A16_SNORM: return VERTEX_FORMAT_R16G16B16A16_SNORM;
+ case VK_FORMAT_R16G16B16A16_UNORM: return VERTEX_FORMAT_R16G16B16A16_UNORM;
+ case VK_FORMAT_R16G16_UNORM: return VERTEX_FORMAT_R16G16_UNORM;
+ /* case VK_FORMAT_R10G10B10A2_UNORM: return VERTEX_FORMAT_R10G10B10A2_UNORM; */
+ case VK_FORMAT_R8G8B8A8_UNORM: return VERTEX_FORMAT_R8G8B8A8_UNORM;
+ case VK_FORMAT_R8G8_UNORM: return VERTEX_FORMAT_R8G8_UNORM;
+ case VK_FORMAT_R8G8B8A8_SNORM: return VERTEX_FORMAT_R8G8B8A8_SNORM;
+ case VK_FORMAT_R8G8_SNORM: return VERTEX_FORMAT_R8G8_SNORM;
+ default:
+ unreachable("Unsupported vertex format");
+ }
+}
+
+static struct Geo
+vk_to_grl_Geo(const VkAccelerationStructureGeometryKHR *pGeometry,
+ uint32_t prim_count,
+ uint32_t transform_offset,
+ uint32_t primitive_offset,
+ uint32_t first_vertex)
+{
+ struct Geo geo = {
+ .Flags = vk_to_grl_GeometryFlags(pGeometry->flags),
+ };
+
+ switch (pGeometry->geometryType) {
+ case VK_GEOMETRY_TYPE_TRIANGLES_KHR: {
+ const VkAccelerationStructureGeometryTrianglesDataKHR *vk_tri =
+ &pGeometry->geometry.triangles;
+
+ geo.Type = GEOMETRY_TYPE_TRIANGLES;
+
+ geo.Desc.Triangles.pTransformBuffer =
+ vk_tri->transformData.deviceAddress;
+ geo.Desc.Triangles.pIndexBuffer =
+ vk_tri->indexData.deviceAddress;
+ geo.Desc.Triangles.pVertexBuffer =
+ vk_tri->vertexData.deviceAddress;
+ geo.Desc.Triangles.VertexBufferByteStride = vk_tri->vertexStride;
+
+ if (geo.Desc.Triangles.pTransformBuffer)
+ geo.Desc.Triangles.pTransformBuffer += transform_offset;
+
+ if (vk_tri->indexType == VK_INDEX_TYPE_NONE_KHR) {
+ geo.Desc.Triangles.IndexCount = 0;
+ geo.Desc.Triangles.VertexCount = prim_count * 3;
+ geo.Desc.Triangles.IndexFormat = INDEX_FORMAT_NONE;
+ geo.Desc.Triangles.pVertexBuffer += primitive_offset;
+ } else {
+ geo.Desc.Triangles.IndexCount = prim_count * 3;
+ geo.Desc.Triangles.VertexCount = vk_tri->maxVertex;
+ geo.Desc.Triangles.IndexFormat =
+ vk_to_grl_IndexFormat(vk_tri->indexType);
+ geo.Desc.Triangles.pIndexBuffer += primitive_offset;
+ }
+
+ geo.Desc.Triangles.VertexFormat =
+ vk_to_grl_VertexFormat(vk_tri->vertexFormat);
+ geo.Desc.Triangles.pVertexBuffer += vk_tri->vertexStride * first_vertex;
+ break;
+ }
+
+ case VK_GEOMETRY_TYPE_AABBS_KHR: {
+ const VkAccelerationStructureGeometryAabbsDataKHR *vk_aabbs =
+ &pGeometry->geometry.aabbs;
+ geo.Type = GEOMETRY_TYPE_PROCEDURAL;
+ geo.Desc.Procedural.pAABBs_GPUVA =
+ vk_aabbs->data.deviceAddress + primitive_offset;
+ geo.Desc.Procedural.AABBByteStride = vk_aabbs->stride;
+ geo.Desc.Procedural.AABBCount = prim_count;
+ break;
+ }
+
+ default:
+ unreachable("Invalid geometry type");
+ }
+
+ return geo;
+}
+
+#include "grl/grl_metakernel_copy.h"
+#include "grl/grl_metakernel_misc.h"
+#include "grl/grl_metakernel_build_primref.h"
+#include "grl/grl_metakernel_new_sah_builder.h"
+#include "grl/grl_metakernel_build_leaf.h"
+
+struct build_state {
+ enum anv_rt_bvh_build_method build_method;
+
+ struct MKSizeEstimate estimate;
+ struct scratch_layout scratch;
+ struct MKBuilderState state;
+
+ struct anv_address bvh_addr;
+
+ size_t geom_size_prefix_sum_buffer;
+ size_t transient_size;
+
+ uint32_t leaf_type;
+ uint32_t leaf_size;
+
+ uint32_t num_geometries;
+ uint32_t num_instances;
+
+ uint64_t instances_addr;
+ bool array_of_instances_ptr;
+
+ const VkAccelerationStructureGeometryKHR *vk_geoms;
+};
+
+static void
+get_binnedsah_scratch_buffers(struct build_state *bs,
+ uint64_t *p_qnode_buffer,
+ uint64_t *p_primref_indices,
+ uint64_t *p_bvh2)
+{
+ if (bs->estimate.numBuildPrimitives == 0)
+ {
+ *p_bvh2 = 0;
+ *p_qnode_buffer = 0;
+ *p_primref_indices = 0;
+ return;
+ }
+
+ size_t bvh2_size = get_bvh2_size(bs->estimate.numBuildPrimitives);
+ if (bs->estimate.leaf_data_size < bvh2_size) {
+ assert(bs->scratch.bvh2_buffer != 0);
+ *p_bvh2 = bs->scratch.bvh2_buffer;
+ } else {
+ *p_bvh2 = intel_canonical_address(bs->state.bvh_buffer +
+ bs->estimate.leaf_data_start);
+ }
+
+ assert(bs->scratch.qnode_buffer != 0);
+ *p_qnode_buffer = bs->scratch.qnode_buffer;
+
+ assert(bs->scratch.leaf_index_buffers != 0);
+ *p_primref_indices = bs->scratch.leaf_index_buffers;
+}
+
+static void
+write_memory(struct anv_cmd_alloc alloc, size_t offset, const void *data, size_t data_len)
+{
+ assert((offset + data_len) < alloc.size);
+ memcpy(alloc.map + offset, data, data_len);
+}
+
+static void
+cmd_build_acceleration_structures(
+ struct anv_cmd_buffer *cmd_buffer,
+ uint32_t infoCount,
+ const VkAccelerationStructureBuildGeometryInfoKHR *pInfos,
+ const VkAccelerationStructureBuildRangeInfoKHR *const *ppBuildRangeInfos,
+ const VkDeviceAddress *pIndirectDeviceAddresses,
+ const uint32_t *pIndirectStrides,
+ const uint32_t *const *ppMaxPrimitiveCounts)
+{
+ struct anv_device *device = cmd_buffer->device;
+ VK_MULTIALLOC(ma);
+
+ struct build_state *builds;
+ vk_multialloc_add(&ma, &builds, struct build_state, infoCount);
+
+ if (!vk_multialloc_zalloc(&ma,
+ &cmd_buffer->device->vk.alloc,
+ VK_SYSTEM_ALLOCATION_SCOPE_COMMAND)) {
+ anv_batch_set_error(&cmd_buffer->batch, VK_ERROR_OUT_OF_HOST_MEMORY);
+ return;
+ }
+
+ trace_intel_begin_as_build(&cmd_buffer->trace);
+
+ /* TODO: Indirect */
+ assert(ppBuildRangeInfos != NULL);
+
+ size_t transient_mem_init_globals_size = 0;
+ size_t transient_mem_init_globals_offset = 0;
+
+ size_t transient_total = 0;
+
+ size_t private_mem_total = 0;
+
+ size_t num_trivial_builds = 0;
+ size_t num_new_sah_builds = 0;
+
+ /* Prepare a bunch of data for the kernels we have to run. */
+ for (uint32_t i = 0; i < infoCount; i++) {
+ struct build_state *bs = &builds[i];
+
+ const VkAccelerationStructureBuildGeometryInfoKHR *pInfo = &pInfos[i];
+ struct anv_address scratch_addr =
+ anv_address_from_u64(pInfo->scratchData.deviceAddress);
+
+ const VkAccelerationStructureBuildRangeInfoKHR *pBuildRangeInfos =
+ ppBuildRangeInfos ? ppBuildRangeInfos[i] : NULL;
+ const uint32_t *pMaxPrimitiveCounts =
+ ppMaxPrimitiveCounts ? ppMaxPrimitiveCounts[i] : NULL;
+
+ ANV_FROM_HANDLE(vk_acceleration_structure, dst_accel,
+ pInfo->dstAccelerationStructure);
+
+ bs->build_method = device->bvh_build_method;
+
+ bs->bvh_addr = anv_address_from_u64(vk_acceleration_structure_get_va(dst_accel));
+
+ bs->estimate = get_gpu_size_estimate(pInfo, pBuildRangeInfos,
+ pMaxPrimitiveCounts);
+ bs->scratch = get_gpu_scratch_layout(scratch_addr, bs->estimate,
+ bs->build_method);
+
+ uint32_t leaf_size, leaf_type;
+
+ switch (pInfo->type) {
+ case VK_ACCELERATION_STRUCTURE_TYPE_TOP_LEVEL_KHR: {
+ assert(pInfo->geometryCount == 1);
+
+ const VkAccelerationStructureGeometryKHR *pGeometry =
+ get_geometry(pInfo, 0);
+ assert(pGeometry->geometryType == VK_GEOMETRY_TYPE_INSTANCES_KHR);
+
+ const VkAccelerationStructureGeometryInstancesDataKHR *instances =
+ &pGeometry->geometry.instances;
+
+ bs->num_instances = pBuildRangeInfos[0].primitiveCount;
+ bs->instances_addr = instances->data.deviceAddress;
+ bs->array_of_instances_ptr = instances->arrayOfPointers;
+ leaf_type = NODE_TYPE_INSTANCE;
+ leaf_size = GENX(RT_BVH_INSTANCE_LEAF_length) * 4;
+ break;
+ }
+
+ case VK_ACCELERATION_STRUCTURE_TYPE_BOTTOM_LEVEL_KHR: {
+ bs->num_geometries = pInfo->geometryCount;
+ leaf_type = NODE_TYPE_QUAD;
+ leaf_size = GENX(RT_BVH_QUAD_LEAF_length) * 4;
+ break;
+ }
+
+ default:
+ unreachable("Unsupported acceleration structure type");
+ }
+
+ size_t geom_struct_size = bs->num_geometries * sizeof(struct Geo);
+ size_t geom_prefix_sum_size = align_uintptr(sizeof(uint32_t) * (bs->num_geometries + 1), 64);
+
+ bs->transient_size = geom_prefix_sum_size + geom_struct_size;
+
+ bs->geom_size_prefix_sum_buffer = transient_total + 0;
+
+ bs->state = (struct MKBuilderState) {
+ .geomDesc_buffer = bs->geom_size_prefix_sum_buffer +
+ geom_prefix_sum_size,
+ .build_primref_buffer = bs->scratch.primrefs,
+ .build_globals = bs->scratch.globals,
+ .bvh_buffer = anv_address_physical(bs->bvh_addr),
+ .leaf_type = leaf_type,
+ .leaf_size = leaf_size,
+ };
+
+ transient_total += bs->transient_size;
+
+ switch (device->bvh_build_method) {
+ case ANV_BVH_BUILD_METHOD_TRIVIAL:
+ num_trivial_builds++;
+ break;
+ case ANV_BVH_BUILD_METHOD_NEW_SAH:
+ num_new_sah_builds++;
+ break;
+ default:
+ unreachable("invalid BVH build method");
+ }
+
+ transient_mem_init_globals_size += sizeof(struct BatchedInitGlobalsData);
+ }
+
+ transient_total = align_transient_size(transient_total);
+ transient_mem_init_globals_offset = transient_total;
+ transient_total += align_transient_size(transient_mem_init_globals_size);
+
+ size_t transient_mem_binnedsah_size = 0;
+ size_t transient_mem_binnedsah_offset = 0;
+ size_t private_mem_binnedsah_size = 0;
+ size_t private_mem_binnedsah_offset = 0;
+
+ transient_mem_binnedsah_size = get_batched_binnedsah_transient_mem_size(num_new_sah_builds);
+ transient_mem_binnedsah_offset = transient_total;
+ transient_total += align_transient_size(transient_mem_binnedsah_size);
+
+ private_mem_binnedsah_size = get_batched_binnedsah_private_mem_size(num_new_sah_builds);
+ private_mem_binnedsah_offset = private_mem_total;
+ private_mem_total += align_private_size(private_mem_binnedsah_size);
+
+ /* Allocate required memory, unless we already have a suitable buffer */
+ struct anv_cmd_alloc private_mem_alloc;
+ if (private_mem_total > cmd_buffer->state.rt.build_priv_mem_size) {
+ private_mem_alloc =
+ anv_cmd_buffer_alloc_space(cmd_buffer, private_mem_total, 64,
+ false /* mapped */);
+ if (anv_cmd_alloc_is_empty(private_mem_alloc)) {
+ anv_batch_set_error(&cmd_buffer->batch, VK_ERROR_OUT_OF_DEVICE_MEMORY);
+ goto error;
+ }
+
+ cmd_buffer->state.rt.build_priv_mem_addr = private_mem_alloc.address;
+ cmd_buffer->state.rt.build_priv_mem_size = private_mem_alloc.size;
+ } else {
+ private_mem_alloc = (struct anv_cmd_alloc) {
+ .address = cmd_buffer->state.rt.build_priv_mem_addr,
+ .map = anv_address_map(cmd_buffer->state.rt.build_priv_mem_addr),
+ .size = cmd_buffer->state.rt.build_priv_mem_size,
+ };
+ }
+
+ struct anv_cmd_alloc transient_mem_alloc =
+ anv_cmd_buffer_alloc_space(cmd_buffer, transient_total, 64,
+ true /* mapped */);
+ if (transient_total > 0 && anv_cmd_alloc_is_empty(transient_mem_alloc)) {
+ anv_batch_set_error(&cmd_buffer->batch, VK_ERROR_OUT_OF_DEVICE_MEMORY);
+ goto error;
+ }
+
+ uint64_t private_base = anv_address_physical(private_mem_alloc.address);
+ uint64_t transient_base = anv_address_physical(transient_mem_alloc.address);
+
+ /* Prepare transient memory */
+ for (uint32_t i = 0; i < infoCount; i++) {
+ struct build_state *bs = &builds[i];
+
+ const VkAccelerationStructureBuildGeometryInfoKHR *pInfo = &pInfos[i];
+
+ const VkAccelerationStructureBuildRangeInfoKHR *pBuildRangeInfos =
+ ppBuildRangeInfos ? ppBuildRangeInfos[i] : NULL;
+
+ struct Geo *geos = transient_mem_alloc.map + bs->state.geomDesc_buffer;
+ uint32_t *prefixes = transient_mem_alloc.map + bs->geom_size_prefix_sum_buffer;
+ uint32_t prefix_sum = 0;
+ for (unsigned g = 0; g < bs->num_geometries; g++) {
+ const VkAccelerationStructureGeometryKHR *pGeometry = get_geometry(pInfo, g);
+ uint32_t prim_count = pBuildRangeInfos[g].primitiveCount;
+ geos[g] = vk_to_grl_Geo(pGeometry, prim_count,
+ pBuildRangeInfos[g].transformOffset,
+ pBuildRangeInfos[g].primitiveOffset,
+ pBuildRangeInfos[g].firstVertex);
+
+ prefixes[g] = prefix_sum;
+ prefix_sum += prim_count;
+ }
+
+ prefixes[bs->num_geometries] = prefix_sum;
+
+ bs->geom_size_prefix_sum_buffer =
+ intel_canonical_address(bs->geom_size_prefix_sum_buffer +
+ transient_base);
+ bs->state.geomDesc_buffer =
+ intel_canonical_address(bs->state.geomDesc_buffer +
+ transient_base);
+
+ struct BatchedInitGlobalsData data = {
+ .p_build_globals = bs->scratch.globals,
+ .p_bvh_buffer = anv_address_physical(bs->bvh_addr),
+
+ .numPrimitives = 0,
+ .numGeometries = bs->num_geometries,
+ .numInstances = bs->num_instances,
+
+ .instance_descs_start = bs->estimate.instance_descs_start,
+ .geo_meta_data_start = bs->estimate.geo_meta_data_start,
+ .node_data_start = bs->estimate.node_data_start,
+ .leaf_data_start = bs->estimate.leaf_data_start,
+ .procedural_data_start = bs->estimate.procedural_data_start,
+ .back_pointer_start = bs->estimate.back_pointer_start,
+ .sizeTotal = bs->estimate.sizeTotal,
+
+ .leafType = bs->state.leaf_type,
+ .leafSize = bs->state.leaf_size,
+ };
+
+ write_memory(transient_mem_alloc,
+ transient_mem_init_globals_offset + i * sizeof(data),
+ &data, sizeof(data));
+ }
+
+ genX(flush_pipeline_select_gpgpu)(cmd_buffer);
+
+ /* Due to the nature of GRL and its heavy use of jumps/predication, we
+ * cannot tell exactly in what order the CFE_STATE commands we insert are
+ * going to be executed, so always use the largest possible size.
+ */
+ genX(cmd_buffer_ensure_cfe_state)(
+ cmd_buffer,
+ cmd_buffer->device->physical->max_grl_scratch_size);
+
+ /* Round 1 : init_globals kernel */
+ genX(grl_misc_batched_init_globals)(
+ cmd_buffer,
+ intel_canonical_address(transient_base +
+ transient_mem_init_globals_offset),
+ infoCount);
+
+ anv_add_pending_pipe_bits(cmd_buffer,
+ ANV_GRL_FLUSH_FLAGS,
+ "building accel struct");
+
+ /* Round 2 : Copy instance/geometry data from the application provided
+ * buffers into the acceleration structures.
+ */
+ for (uint32_t i = 0; i < infoCount; i++) {
+ struct build_state *bs = &builds[i];
+
+ /* Metadata */
+ if (bs->num_instances) {
+ assert(bs->num_geometries == 0);
+
+ const uint64_t copy_size = bs->num_instances * sizeof(InstanceDesc);
+ /* This must be calculated in the same way as
+ * groupCountForGeoMetaDataCopySize.
+ */
+ const uint32_t num_threads = (copy_size >> 8) + 3;
+
+ if (bs->array_of_instances_ptr) {
+ genX(grl_misc_copy_instance_ptrs)(
+ cmd_buffer,
+ anv_address_physical(anv_address_add(bs->bvh_addr,
+ bs->estimate.instance_descs_start)),
+ bs->instances_addr,
+ copy_size, num_threads);
+ } else {
+ genX(grl_misc_copy_instances)(
+ cmd_buffer,
+ anv_address_physical(anv_address_add(bs->bvh_addr,
+ bs->estimate.instance_descs_start)),
+ bs->instances_addr,
+ copy_size, num_threads);
+ }
+ }
+
+ if (bs->num_geometries) {
+ assert(bs->num_instances == 0);
+ const uint64_t copy_size = bs->num_geometries * sizeof(struct GeoMetaData);
+
+ /* This must be calculated in the same way as
+ * groupCountForGeoMetaDataCopySize.
+ */
+ const uint32_t num_threads = (copy_size >> 6) + 1;
+
+ genX(grl_misc_copy_geo_meta_data)(
+ cmd_buffer,
+ anv_address_physical(anv_address_add(bs->bvh_addr,
+ bs->estimate.geo_meta_data_start)),
+ bs->state.geomDesc_buffer,
+ copy_size,
+ num_threads);
+ }
+
+ /* Primrefs */
+ if (bs->num_instances) {
+ if (bs->array_of_instances_ptr) {
+ genX(grl_build_primref_buildPrimirefsFromInstancesArrOfPtrs)(
+ cmd_buffer,
+ bs->instances_addr,
+ PREFIX_MK_SIZE(grl_build_primref, bs->estimate),
+ PREFIX_MK_STATE(grl_build_primref, bs->state),
+ false /* allowUpdate */);
+ } else {
+ genX(grl_build_primref_buildPrimirefsFromInstances)(
+ cmd_buffer,
+ bs->instances_addr,
+ PREFIX_MK_SIZE(grl_build_primref, bs->estimate),
+ PREFIX_MK_STATE(grl_build_primref, bs->state),
+ false /* allowUpdate */);
+ }
+ }
+
+ if (bs->num_geometries) {
+ const VkAccelerationStructureBuildGeometryInfoKHR *pInfo = &pInfos[i];
+ const VkAccelerationStructureBuildRangeInfoKHR *pBuildRangeInfos =
+ ppBuildRangeInfos ? ppBuildRangeInfos[i] : NULL;
+
+ assert(pInfo->geometryCount == bs->num_geometries);
+ for (unsigned g = 0; g < pInfo->geometryCount; g++) {
+ const VkAccelerationStructureGeometryKHR *pGeometry =
+ get_geometry(pInfo, g);
+
+ switch (pGeometry->geometryType) {
+ case VK_GEOMETRY_TYPE_TRIANGLES_KHR:
+ genX(grl_build_primref_primrefs_from_tris)(
+ cmd_buffer,
+ PREFIX_MK_STATE(grl_build_primref, bs->state),
+ PREFIX_MK_SIZE(grl_build_primref, bs->estimate),
+ bs->state.geomDesc_buffer + g * sizeof(struct Geo),
+ g,
+ vk_to_grl_GeometryFlags(pGeometry->flags),
+ /* TODO: Indirect */
+ pBuildRangeInfos[g].primitiveCount);
+ break;
+
+ case VK_GEOMETRY_TYPE_AABBS_KHR:
+ genX(grl_build_primref_primrefs_from_proc)(
+ cmd_buffer,
+ PREFIX_MK_STATE(grl_build_primref, bs->state),
+ PREFIX_MK_SIZE(grl_build_primref, bs->estimate),
+ bs->state.geomDesc_buffer + g * sizeof(struct Geo),
+ g,
+ vk_to_grl_GeometryFlags(pGeometry->flags),
+ /* TODO: Indirect */
+ pBuildRangeInfos[g].primitiveCount);
+ break;
+
+ default:
+ unreachable("Invalid geometry type");
+ }
+ }
+ }
+ }
+
+ anv_add_pending_pipe_bits(cmd_buffer,
+ ANV_GRL_FLUSH_FLAGS,
+ "building accel struct");
+
+ /* Dispatch trivial builds */
+ if (num_trivial_builds) {
+ for (uint32_t i = 0; i < infoCount; i++) {
+ struct build_state *bs = &builds[i];
+
+ if (bs->build_method != ANV_BVH_BUILD_METHOD_TRIVIAL)
+ continue;
+
+ genX(grl_new_sah_builder_single_pass_binsah)(
+ cmd_buffer,
+ bs->scratch.globals,
+ bs->state.bvh_buffer,
+ bs->state.build_primref_buffer,
+ bs->scratch.leaf_index_buffers,
+ false /* alloc_backpointers */);
+ }
+ }
+
+ /* Dispatch new SAH builds */
+ if (num_new_sah_builds) {
+ size_t global_ptrs_offset = transient_mem_binnedsah_offset;
+ size_t buffers_info_offset = transient_mem_binnedsah_offset + sizeof(gpuva_t) * num_new_sah_builds;
+
+ size_t scheduler_offset = private_mem_binnedsah_offset;
+ size_t sah_globals_offset = private_mem_binnedsah_offset + get_scheduler_size(num_new_sah_builds);
+
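+      /* The transient buffer holds the per-build arrays filled in below
+       * (globals pointers and SAHBuildBuffersInfo); the private buffer holds
+       * the scheduler and SAH globals shared across the batched builds.
+       */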
+ struct SAHBuildArgsBatchable args = {
+ .num_builds = infoCount,
+ .p_globals_ptrs = intel_canonical_address(transient_base + global_ptrs_offset),
+ .p_buffers_info = intel_canonical_address(transient_base + buffers_info_offset),
+ .p_scheduler = intel_canonical_address(private_base + scheduler_offset),
+ .p_sah_globals = intel_canonical_address(private_base + sah_globals_offset),
+ .num_max_qnode_global_root_buffer_entries = MAX2(num_new_sah_builds, QNODE_GLOBAL_ROOT_BUFFER_MIN_ENTRIES_NUM),
+ };
+
+ for (uint32_t i = 0; i < infoCount; i++) {
+ struct build_state *bs = &builds[i];
+
+ if (bs->build_method != ANV_BVH_BUILD_METHOD_NEW_SAH)
+ continue;
+
+ uint64_t p_build_primref_index_buffers;
+ uint64_t p_bvh2;
+ uint64_t p_qnode_child_buffer;
+
+ get_binnedsah_scratch_buffers(bs,
+ &p_qnode_child_buffer,
+ &p_build_primref_index_buffers,
+ &p_bvh2);
+
+ struct SAHBuildBuffersInfo buffers = {
+ .p_primref_index_buffers = bs->scratch.leaf_index_buffers,
+ .p_bvh_base = bs->state.bvh_buffer,
+ .p_primrefs_buffer = bs->state.build_primref_buffer,
+ .p_bvh2 = p_bvh2,
+ .p_qnode_root_buffer = p_qnode_child_buffer,
+ .sah_globals_flags = 0,
+ };
+
+ write_memory(transient_mem_alloc, buffers_info_offset, &buffers, sizeof(buffers));
+ buffers_info_offset += sizeof(buffers);
+
+ write_memory(transient_mem_alloc, global_ptrs_offset, &bs->state.build_globals,
+ sizeof(bs->state.build_globals));
+ global_ptrs_offset += sizeof(bs->state.build_globals);
+ }
+
+ genX(grl_new_sah_builder_new_sah_build_batchable)(
+ cmd_buffer, PREFIX_MK_SAH_BUILD_ARGS_BATCHABLE(grl_new_sah_builder, args));
+ }
+
+ if (num_new_sah_builds == 0)
+ anv_add_pending_pipe_bits(cmd_buffer,
+ ANV_GRL_FLUSH_FLAGS,
+ "building accel struct");
+
+ /* Finally write the leaves. */
+ for (uint32_t i = 0; i < infoCount; i++) {
+ struct build_state *bs = &builds[i];
+
+ if (bs->num_instances) {
+ assert(bs->num_geometries == 0);
+ if (bs->array_of_instances_ptr) {
+ genX(grl_leaf_builder_buildLeafDXR_instances_pointers)(cmd_buffer,
+ PREFIX_MK_STATE(grl_leaf_builder, bs->state),
+ bs->scratch.leaf_index_buffers,
+ bs->instances_addr,
+ bs->scratch.leaf_index_buffer_stride,
+ 0 /* offset */,
+ bs->estimate.numBuildPrimitives);
+ } else {
+ genX(grl_leaf_builder_buildLeafDXR_instances)(cmd_buffer,
+ PREFIX_MK_STATE(grl_leaf_builder, bs->state),
+ bs->scratch.leaf_index_buffers,
+ bs->instances_addr,
+ bs->scratch.leaf_index_buffer_stride,
+ 0 /* offset */,
+ bs->estimate.numBuildPrimitives);
+ }
+ }
+
+ if (bs->num_geometries) {
+ assert(bs->num_instances == 0);
+ const uint64_t p_numPrimitives =
+ bs->state.build_globals + offsetof(struct Globals, numPrimitives);
+
+ assert(bs->estimate.numProcedurals == 0 ||
+ bs->estimate.numTriangles == 0);
+ if (bs->estimate.numProcedurals) {
+ genX(grl_leaf_builder_buildLeafDXR_procedurals)(
+ cmd_buffer,
+ PREFIX_MK_STATE(grl_leaf_builder, bs->state),
+ bs->scratch.leaf_index_buffers,
+ bs->scratch.leaf_index_buffer_stride,
+ 0 /* offset */,
+ p_numPrimitives);
+ } else {
+ genX(grl_leaf_builder_buildLeafDXR_quads)(
+ cmd_buffer,
+ PREFIX_MK_STATE(grl_leaf_builder, bs->state),
+ bs->scratch.leaf_index_buffers,
+ bs->scratch.leaf_index_buffer_stride,
+ 0 /* offset */,
+ p_numPrimitives,
+ false /* allow_updates */);
+ }
+ }
+ }
+
+ anv_add_pending_pipe_bits(cmd_buffer,
+ ANV_GRL_FLUSH_FLAGS,
+ "building accel struct");
+
+ trace_intel_end_as_build(&cmd_buffer->trace);
+
+ error:
+ vk_free(&cmd_buffer->device->vk.alloc, builds);
+}
+
+void
+genX(CmdBuildAccelerationStructuresKHR)(
+ VkCommandBuffer commandBuffer,
+ uint32_t infoCount,
+ const VkAccelerationStructureBuildGeometryInfoKHR* pInfos,
+ const VkAccelerationStructureBuildRangeInfoKHR* const* ppBuildRangeInfos)
+{
+ ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+
+ if (anv_batch_has_error(&cmd_buffer->batch))
+ return;
+
+ cmd_build_acceleration_structures(cmd_buffer, infoCount, pInfos,
+ ppBuildRangeInfos, NULL, NULL, NULL);
+}
+
+void
+genX(CmdBuildAccelerationStructuresIndirectKHR)(
+ VkCommandBuffer commandBuffer,
+ uint32_t infoCount,
+ const VkAccelerationStructureBuildGeometryInfoKHR* pInfos,
+ const VkDeviceAddress* pIndirectDeviceAddresses,
+ const uint32_t* pIndirectStrides,
+ const uint32_t* const* ppMaxPrimitiveCounts)
+{
+ unreachable("Unimplemented");
+}
+
+void
+genX(CmdCopyAccelerationStructureKHR)(
+ VkCommandBuffer commandBuffer,
+ const VkCopyAccelerationStructureInfoKHR* pInfo)
+{
+ ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+ ANV_FROM_HANDLE(vk_acceleration_structure, src_accel, pInfo->src);
+ ANV_FROM_HANDLE(vk_acceleration_structure, dst_accel, pInfo->dst);
+
+ assert(pInfo->mode == VK_COPY_ACCELERATION_STRUCTURE_MODE_COMPACT_KHR ||
+ pInfo->mode == VK_COPY_ACCELERATION_STRUCTURE_MODE_CLONE_KHR);
+
+ if (pInfo->mode == VK_COPY_ACCELERATION_STRUCTURE_MODE_CLONE_KHR) {
+ uint64_t src_size_addr =
+ vk_acceleration_structure_get_va(src_accel) +
+ offsetof(struct BVHBase, Meta.allocationSize);
+ genX(grl_copy_clone_indirect)(
+ cmd_buffer,
+ vk_acceleration_structure_get_va(dst_accel),
+ vk_acceleration_structure_get_va(src_accel),
+ src_size_addr);
+ } else {
+ genX(grl_copy_compact)(
+ cmd_buffer,
+ vk_acceleration_structure_get_va(dst_accel),
+ vk_acceleration_structure_get_va(src_accel));
+ }
+
+ anv_add_pending_pipe_bits(cmd_buffer,
+ ANV_PIPE_END_OF_PIPE_SYNC_BIT,
+ "after copy acceleration struct");
+}
+
+void
+genX(CmdCopyAccelerationStructureToMemoryKHR)(
+ VkCommandBuffer commandBuffer,
+ const VkCopyAccelerationStructureToMemoryInfoKHR* pInfo)
+{
+ ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+ ANV_FROM_HANDLE(vk_acceleration_structure, src_accel, pInfo->src);
+ struct anv_device *device = cmd_buffer->device;
+ uint64_t src_size_addr =
+ vk_acceleration_structure_get_va(src_accel) +
+ offsetof(struct BVHBase, Meta.allocationSize);
+
+ assert(pInfo->mode == VK_COPY_ACCELERATION_STRUCTURE_MODE_SERIALIZE_KHR);
+
+ genX(grl_copy_serialize_indirect)(
+ cmd_buffer,
+ pInfo->dst.deviceAddress,
+ vk_acceleration_structure_get_va(src_accel),
+ anv_address_physical(device->rt_uuid_addr),
+ src_size_addr);
+
+ anv_add_pending_pipe_bits(cmd_buffer,
+ ANV_PIPE_END_OF_PIPE_SYNC_BIT,
+ "after copy acceleration struct");
+}
+
+void
+genX(CmdCopyMemoryToAccelerationStructureKHR)(
+ VkCommandBuffer commandBuffer,
+ const VkCopyMemoryToAccelerationStructureInfoKHR* pInfo)
+{
+ ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+ ANV_FROM_HANDLE(vk_acceleration_structure, dst_accel, pInfo->dst);
+
+ assert(pInfo->mode == VK_COPY_ACCELERATION_STRUCTURE_MODE_DESERIALIZE_KHR);
+
+ uint64_t src_size_addr = pInfo->src.deviceAddress +
+ offsetof(struct SerializationHeader, DeserializedSizeInBytes);
+ genX(grl_copy_deserialize_indirect)(
+ cmd_buffer,
+ vk_acceleration_structure_get_va(dst_accel),
+ pInfo->src.deviceAddress,
+ src_size_addr);
+
+ anv_add_pending_pipe_bits(cmd_buffer,
+ ANV_PIPE_END_OF_PIPE_SYNC_BIT,
+ "after copy acceleration struct");
+}
+
+/* TODO: Host commands */
+
+VkResult
+genX(BuildAccelerationStructuresKHR)(
+ VkDevice _device,
+ VkDeferredOperationKHR deferredOperation,
+ uint32_t infoCount,
+ const VkAccelerationStructureBuildGeometryInfoKHR* pInfos,
+ const VkAccelerationStructureBuildRangeInfoKHR* const* ppBuildRangeInfos)
+{
+ ANV_FROM_HANDLE(anv_device, device, _device);
+ unreachable("Unimplemented");
+ return vk_error(device, VK_ERROR_FEATURE_NOT_PRESENT);
+}
+
+VkResult
+genX(CopyAccelerationStructureKHR)(
+ VkDevice _device,
+ VkDeferredOperationKHR deferredOperation,
+ const VkCopyAccelerationStructureInfoKHR* pInfo)
+{
+ ANV_FROM_HANDLE(anv_device, device, _device);
+ unreachable("Unimplemented");
+ return vk_error(device, VK_ERROR_FEATURE_NOT_PRESENT);
+}
+
+VkResult
+genX(CopyAccelerationStructureToMemoryKHR)(
+ VkDevice _device,
+ VkDeferredOperationKHR deferredOperation,
+ const VkCopyAccelerationStructureToMemoryInfoKHR* pInfo)
+{
+ ANV_FROM_HANDLE(anv_device, device, _device);
+ unreachable("Unimplemented");
+ return vk_error(device, VK_ERROR_FEATURE_NOT_PRESENT);
+}
+
+VkResult
+genX(CopyMemoryToAccelerationStructureKHR)(
+ VkDevice _device,
+ VkDeferredOperationKHR deferredOperation,
+ const VkCopyMemoryToAccelerationStructureInfoKHR* pInfo)
+{
+ ANV_FROM_HANDLE(anv_device, device, _device);
+ unreachable("Unimplemented");
+ return vk_error(device, VK_ERROR_FEATURE_NOT_PRESENT);
+}
+
+VkResult
+genX(WriteAccelerationStructuresPropertiesKHR)(
+ VkDevice _device,
+ uint32_t accelerationStructureCount,
+ const VkAccelerationStructureKHR* pAccelerationStructures,
+ VkQueryType queryType,
+ size_t dataSize,
+ void* pData,
+ size_t stride)
+{
+ ANV_FROM_HANDLE(anv_device, device, _device);
+ unreachable("Unimplemented");
+ return vk_error(device, VK_ERROR_FEATURE_NOT_PRESENT);
+}
+
+#endif /* GFX_VERx10 >= 125 */
diff --git a/src/intel/vulkan/genX_blorp_exec.c b/src/intel/vulkan/genX_blorp_exec.c
index ced154f72e1..b9d1902d3b5 100644
--- a/src/intel/vulkan/genX_blorp_exec.c
+++ b/src/intel/vulkan/genX_blorp_exec.c
@@ -26,23 +26,41 @@
#include "anv_private.h"
#include "anv_measure.h"
-/* These are defined in anv_private.h and blorp_genX_exec.h */
+/* These are defined in anv_private.h and blorp_genX_exec_brw.h */
#undef __gen_address_type
#undef __gen_user_data
#undef __gen_combine_address
#include "common/intel_l3_config.h"
-#include "blorp/blorp_genX_exec.h"
+#include "blorp/blorp_genX_exec_brw.h"
+
+#include "ds/intel_tracepoints.h"
static void blorp_measure_start(struct blorp_batch *_batch,
const struct blorp_params *params)
{
struct anv_cmd_buffer *cmd_buffer = _batch->driver_batch;
+ trace_intel_begin_blorp(&cmd_buffer->trace);
anv_measure_snapshot(cmd_buffer,
- params->snapshot_type,
+ blorp_op_to_intel_measure_snapshot(params->op),
NULL, 0);
}
+static void blorp_measure_end(struct blorp_batch *_batch,
+ const struct blorp_params *params)
+{
+ struct anv_cmd_buffer *cmd_buffer = _batch->driver_batch;
+ trace_intel_end_blorp(&cmd_buffer->trace,
+ params->op,
+ params->x1 - params->x0,
+ params->y1 - params->y0,
+ params->num_samples,
+ params->shader_pipeline,
+ params->dst.view.format,
+ params->src.view.format,
+ (_batch->flags & BLORP_BATCH_PREDICATE_ENABLE));
+}
+
static void *
blorp_emit_dwords(struct blorp_batch *batch, unsigned n)
{
@@ -55,10 +73,12 @@ blorp_emit_reloc(struct blorp_batch *batch,
void *location, struct blorp_address address, uint32_t delta)
{
struct anv_cmd_buffer *cmd_buffer = batch->driver_batch;
- assert(cmd_buffer->batch.start <= location &&
- location < cmd_buffer->batch.end);
- return anv_batch_emit_reloc(&cmd_buffer->batch, location,
- address.buffer, address.offset + delta);
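+   /* Everything is pinned at a fixed address, so no relocation is recorded;
+    * just track the BO for residency and return its physical address.
+    */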
+ struct anv_address anv_addr = {
+ .bo = address.buffer,
+ .offset = address.offset,
+ };
+ anv_reloc_list_add_bo(cmd_buffer->batch.relocs, anv_addr.bo);
+ return anv_address_physical(anv_address_add(anv_addr, delta));
}
static void
@@ -66,59 +86,47 @@ blorp_surface_reloc(struct blorp_batch *batch, uint32_t ss_offset,
struct blorp_address address, uint32_t delta)
{
struct anv_cmd_buffer *cmd_buffer = batch->driver_batch;
- VkResult result;
-
- if (ANV_ALWAYS_SOFTPIN) {
- result = anv_reloc_list_add_bo(&cmd_buffer->surface_relocs,
- &cmd_buffer->pool->alloc,
- address.buffer);
- if (unlikely(result != VK_SUCCESS))
- anv_batch_set_error(&cmd_buffer->batch, result);
- return;
- }
- uint64_t address_u64 = 0;
- result = anv_reloc_list_add(&cmd_buffer->surface_relocs,
- &cmd_buffer->pool->alloc,
- ss_offset, address.buffer,
- address.offset + delta,
- &address_u64);
- if (result != VK_SUCCESS)
+ VkResult result = anv_reloc_list_add_bo(&cmd_buffer->surface_relocs,
+ address.buffer);
+ if (unlikely(result != VK_SUCCESS))
anv_batch_set_error(&cmd_buffer->batch, result);
-
- void *dest = anv_block_pool_map(
- &cmd_buffer->device->surface_state_pool.block_pool, ss_offset, 8);
- write_reloc(cmd_buffer->device, dest, address_u64, false);
}
static uint64_t
blorp_get_surface_address(struct blorp_batch *blorp_batch,
struct blorp_address address)
{
- if (ANV_ALWAYS_SOFTPIN) {
- struct anv_address anv_addr = {
- .bo = address.buffer,
- .offset = address.offset,
- };
- return anv_address_physical(anv_addr);
- } else {
- /* We'll let blorp_surface_reloc write the address. */
- return 0;
- }
+ struct anv_address anv_addr = {
+ .bo = address.buffer,
+ .offset = address.offset,
+ };
+ return anv_address_physical(anv_addr);
}
-#if GFX_VER >= 7 && GFX_VER < 10
+#if GFX_VER == 9
static struct blorp_address
blorp_get_surface_base_address(struct blorp_batch *batch)
{
struct anv_cmd_buffer *cmd_buffer = batch->driver_batch;
return (struct blorp_address) {
- .buffer = cmd_buffer->device->surface_state_pool.block_pool.bo,
- .offset = 0,
+ .buffer = cmd_buffer->device->internal_surface_state_pool.block_pool.bo,
+ .offset = -cmd_buffer->device->internal_surface_state_pool.start_offset,
};
}
#endif
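+/* BLORP dynamic states are kept on the device in two flavors (legacy dynamic
+ * state heap and descriptor-buffer heap); return the offset that matches the
+ * command buffer's current descriptor-buffer mode.
+ */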
+static uint32_t
+blorp_get_dynamic_state(struct blorp_batch *batch,
+ enum blorp_dynamic_state name)
+{
+ struct anv_cmd_buffer *cmd_buffer = batch->driver_batch;
+ return (cmd_buffer->state.current_db_mode ==
+ ANV_CMD_DESCRIPTOR_BUFFER_MODE_BUFFER) ?
+ cmd_buffer->device->blorp.dynamic_states[name].db_state.offset :
+ cmd_buffer->device->blorp.dynamic_states[name].state.offset;
+}
+
static void *
blorp_alloc_dynamic_state(struct blorp_batch *batch,
uint32_t size,
@@ -134,7 +142,22 @@ blorp_alloc_dynamic_state(struct blorp_batch *batch,
return state.map;
}
-static void
+UNUSED static void *
+blorp_alloc_general_state(struct blorp_batch *batch,
+ uint32_t size,
+ uint32_t alignment,
+ uint32_t *offset)
+{
+ struct anv_cmd_buffer *cmd_buffer = batch->driver_batch;
+
+ struct anv_state state =
+ anv_cmd_buffer_alloc_general_state(cmd_buffer, size, alignment);
+
+ *offset = state.offset;
+ return state.map;
+}
+
+static bool
blorp_alloc_binding_table(struct blorp_batch *batch, unsigned num_entries,
unsigned state_size, unsigned state_alignment,
uint32_t *bt_offset,
@@ -149,18 +172,30 @@ blorp_alloc_binding_table(struct blorp_batch *batch, unsigned num_entries,
anv_cmd_buffer_alloc_blorp_binding_table(cmd_buffer, num_entries,
&state_offset, &bt_state);
if (result != VK_SUCCESS)
- return;
+ return false;
uint32_t *bt_map = bt_state.map;
*bt_offset = bt_state.offset;
for (unsigned i = 0; i < num_entries; i++) {
struct anv_state surface_state =
- anv_cmd_buffer_alloc_surface_state(cmd_buffer);
+ anv_cmd_buffer_alloc_surface_states(cmd_buffer, 1);
+ if (surface_state.map == NULL)
+ return false;
+
bt_map[i] = surface_state.offset + state_offset;
surface_offsets[i] = surface_state.offset;
surface_maps[i] = surface_state.map;
}
+
+ return true;
+}
+
+static uint32_t
+blorp_binding_table_offset_to_pointer(struct blorp_batch *batch,
+ uint32_t offset)
+{
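+   /* The binding table offsets allocated above are already in the form the
+    * hardware pointer expects, so no conversion is needed.
+    */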
+ return offset;
}
static void *
@@ -169,11 +204,13 @@ blorp_alloc_vertex_buffer(struct blorp_batch *batch, uint32_t size,
{
struct anv_cmd_buffer *cmd_buffer = batch->driver_batch;
struct anv_state vb_state =
- anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, size, 64);
+ anv_cmd_buffer_alloc_temporary_state(cmd_buffer, size, 64);
+ struct anv_address vb_addr =
+ anv_cmd_buffer_temporary_state_address(cmd_buffer, vb_state);
*addr = (struct blorp_address) {
- .buffer = cmd_buffer->device->dynamic_state_pool.block_pool.bo,
- .offset = vb_state.offset,
+ .buffer = vb_addr.bo,
+ .offset = vb_addr.offset,
.mocs = isl_mocs(&cmd_buffer->device->isl_dev,
ISL_SURF_USAGE_VERTEX_BUFFER_BIT, false),
};
@@ -187,6 +224,7 @@ blorp_vf_invalidate_for_vb_48b_transitions(struct blorp_batch *batch,
uint32_t *sizes,
unsigned num_vbs)
{
+#if GFX_VER == 9
struct anv_cmd_buffer *cmd_buffer = batch->driver_batch;
for (unsigned i = 0; i < num_vbs; i++) {
@@ -206,6 +244,7 @@ blorp_vf_invalidate_for_vb_48b_transitions(struct blorp_batch *batch,
*/
genX(cmd_buffer_update_dirty_vbs_for_gfx8_vb_flush)(cmd_buffer, SEQUENTIAL,
(1 << num_vbs) - 1);
+#endif
}
UNUSED static struct blorp_address
@@ -226,6 +265,18 @@ blorp_flush_range(struct blorp_batch *batch, void *start, size_t size)
*/
}
+static void
+blorp_pre_emit_urb_config(struct blorp_batch *blorp_batch,
+ struct intel_urb_config *urb_cfg)
+{
+ struct anv_cmd_buffer *cmd_buffer = blorp_batch->driver_batch;
+ genX(urb_workaround)(cmd_buffer, urb_cfg);
+
+ /* Update urb config. */
+ memcpy(&cmd_buffer->state.gfx.urb_cfg, urb_cfg,
+ sizeof(struct intel_urb_config));
+}
+
static const struct intel_l3_config *
blorp_get_l3_config(struct blorp_batch *batch)
{
@@ -233,17 +284,17 @@ blorp_get_l3_config(struct blorp_batch *batch)
return cmd_buffer->state.current_l3_config;
}
-void
-genX(blorp_exec)(struct blorp_batch *batch,
- const struct blorp_params *params)
+static void
+blorp_exec_on_render(struct blorp_batch *batch,
+ const struct blorp_params *params)
{
+ assert((batch->flags & BLORP_BATCH_USE_COMPUTE) == 0);
+
struct anv_cmd_buffer *cmd_buffer = batch->driver_batch;
+ assert(cmd_buffer->queue_family->queueFlags & VK_QUEUE_GRAPHICS_BIT);
- if (!cmd_buffer->state.current_l3_config) {
- const struct intel_l3_config *cfg =
- intel_get_default_l3_config(&cmd_buffer->device->info);
- genX(cmd_buffer_config_l3)(cmd_buffer, cfg);
- }
+ struct anv_gfx_dynamic_state *hw_state =
+ &cmd_buffer->state.gfx.dyn_state;
const unsigned scale = params->fast_clear_op ? UINT_MAX : 1;
genX(cmd_buffer_emit_hashing_mode)(cmd_buffer, params->x1 - params->x0,
@@ -252,40 +303,52 @@ genX(blorp_exec)(struct blorp_batch *batch,
#if GFX_VER >= 11
/* The PIPE_CONTROL command description says:
*
- * "Whenever a Binding Table Index (BTI) used by a Render Taget Message
+ * "Whenever a Binding Table Index (BTI) used by a Render Target Message
* points to a different RENDER_SURFACE_STATE, SW must issue a Render
* Target Cache Flush by enabling this bit. When render target flush
* is set due to new association of BTI, PS Scoreboard Stall bit must
* be set in this packet."
*/
- anv_add_pending_pipe_bits(cmd_buffer,
- ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT |
- ANV_PIPE_STALL_AT_SCOREBOARD_BIT,
- "before blorp BTI change");
+ if (blorp_uses_bti_rt_writes(batch, params)) {
+ anv_add_pending_pipe_bits(cmd_buffer,
+ ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT |
+ ANV_PIPE_STALL_AT_SCOREBOARD_BIT,
+ "before blorp BTI change");
+ }
#endif
- if (params->depth.enabled &&
- !(batch->flags & BLORP_BATCH_NO_EMIT_DEPTH_STENCIL))
- genX(cmd_buffer_emit_gfx12_depth_wa)(cmd_buffer, &params->depth.surf);
-
-#if GFX_VER == 7
- /* The MI_LOAD/STORE_REGISTER_MEM commands which BLORP uses to implement
- * indirect fast-clear colors can cause GPU hangs if we don't stall first.
- * See genX(cmd_buffer_mi_memcpy) for more details.
- */
- if (params->src.clear_color_addr.buffer ||
- params->dst.clear_color_addr.buffer) {
- anv_add_pending_pipe_bits(cmd_buffer,
- ANV_PIPE_CS_STALL_BIT,
- "before blorp prep fast clear");
+#if GFX_VERx10 >= 125
+ /* Check if blorp ds state matches ours. */
+ if (intel_needs_workaround(cmd_buffer->device->info, 18019816803)) {
+ bool blorp_ds_state = params->depth.enabled || params->stencil.enabled;
+ if (cmd_buffer->state.gfx.ds_write_state != blorp_ds_state) {
+ /* Flag the change in ds_write_state so that the next pipeline use
+ * will trigger a PIPE_CONTROL too.
+ */
+ cmd_buffer->state.gfx.ds_write_state = blorp_ds_state;
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_WA_18019816803);
+
+         /* Add the stall; it will be flushed prior to the blorp operation by
+          * genX(cmd_buffer_apply_pipe_flushes).
+          */
+ anv_add_pending_pipe_bits(cmd_buffer,
+ ANV_PIPE_PSS_STALL_SYNC_BIT,
+ "Wa_18019816803");
+ }
}
#endif
- genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
+ if (params->depth.enabled &&
+ !(batch->flags & BLORP_BATCH_NO_EMIT_DEPTH_STENCIL))
+ genX(cmd_buffer_emit_gfx12_depth_wa)(cmd_buffer, &params->depth.surf);
genX(flush_pipeline_select_3d)(cmd_buffer);
- genX(cmd_buffer_emit_gfx7_depth_flush)(cmd_buffer);
+ /* Wa_14015814527 */
+ genX(apply_task_urb_workaround)(cmd_buffer);
+
+   /* Apply any outstanding flushes in case the pipeline select hasn't. */
+ genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
/* BLORP doesn't do anything fancy with depth such as discards, so we want
* the PMA fix off. Also, off is always the safe option.
@@ -297,19 +360,151 @@ genX(blorp_exec)(struct blorp_batch *batch,
#if GFX_VER >= 11
/* The PIPE_CONTROL command description says:
*
- * "Whenever a Binding Table Index (BTI) used by a Render Taget Message
+ * "Whenever a Binding Table Index (BTI) used by a Render Target Message
* points to a different RENDER_SURFACE_STATE, SW must issue a Render
* Target Cache Flush by enabling this bit. When render target flush
* is set due to new association of BTI, PS Scoreboard Stall bit must
* be set in this packet."
*/
- anv_add_pending_pipe_bits(cmd_buffer,
- ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT |
- ANV_PIPE_STALL_AT_SCOREBOARD_BIT,
- "after blorp BTI change");
+ if (blorp_uses_bti_rt_writes(batch, params)) {
+ anv_add_pending_pipe_bits(cmd_buffer,
+ ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT |
+ ANV_PIPE_STALL_AT_SCOREBOARD_BIT,
+ "after blorp BTI change");
+ }
+#endif
+
+ /* Flag all the instructions emitted by BLORP. */
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_URB);
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_VF_STATISTICS);
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_VF);
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_VF_TOPOLOGY);
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_VERTEX_INPUT);
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_VF_SGVS);
+#if GFX_VER >= 11
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_VF_SGVS_2);
+#endif
+#if GFX_VER >= 12
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_PRIMITIVE_REPLICATION);
#endif
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_VIEWPORT_CC_PTR);
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_STREAMOUT);
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_RASTER);
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_CLIP);
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_SAMPLE_MASK);
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_MULTISAMPLE);
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_SF);
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_SBE);
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_SBE_SWIZ);
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_DEPTH_BOUNDS);
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_WM);
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_WM_DEPTH_STENCIL);
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_VS);
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_HS);
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_DS);
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_TE);
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_GS);
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_PS);
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_PS_EXTRA);
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_BLEND_STATE_PTR);
+ if (batch->blorp->config.use_mesh_shading) {
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_MESH_CONTROL);
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_TASK_CONTROL);
+ }
+ if (params->wm_prog_data) {
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_CC_STATE_PTR);
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_PS_BLEND);
+ }
+
+ anv_cmd_dirty_mask_t dirty = ~(ANV_CMD_DIRTY_INDEX_BUFFER |
+ ANV_CMD_DIRTY_XFB_ENABLE);
cmd_buffer->state.gfx.vb_dirty = ~0;
- cmd_buffer->state.gfx.dirty = ~0;
- cmd_buffer->state.push_constants_dirty = ~0;
+ cmd_buffer->state.gfx.dirty |= dirty;
+ cmd_buffer->state.push_constants_dirty |= VK_SHADER_STAGE_ALL_GRAPHICS;
+}
+
+static void
+blorp_exec_on_compute(struct blorp_batch *batch,
+ const struct blorp_params *params)
+{
+ assert(batch->flags & BLORP_BATCH_USE_COMPUTE);
+
+ struct anv_cmd_buffer *cmd_buffer = batch->driver_batch;
+ assert(cmd_buffer->queue_family->queueFlags & VK_QUEUE_COMPUTE_BIT);
+
+ genX(flush_pipeline_select_gpgpu)(cmd_buffer);
+
+   /* Apply any outstanding flushes in case the pipeline select hasn't. */
+ genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
+
+ blorp_exec(batch, params);
+
+ cmd_buffer->state.descriptors_dirty |= VK_SHADER_STAGE_COMPUTE_BIT;
+ cmd_buffer->state.push_constants_dirty |= VK_SHADER_STAGE_COMPUTE_BIT;
+ cmd_buffer->state.compute.pipeline_dirty = true;
+}
+
+static void
+blorp_exec_on_blitter(struct blorp_batch *batch,
+ const struct blorp_params *params)
+{
+ assert(batch->flags & BLORP_BATCH_USE_BLITTER);
+
+ struct anv_cmd_buffer *cmd_buffer = batch->driver_batch;
+ assert(cmd_buffer->queue_family->queueFlags == VK_QUEUE_TRANSFER_BIT);
+
+ blorp_exec(batch, params);
+}
+
+void
+genX(blorp_exec)(struct blorp_batch *batch,
+ const struct blorp_params *params)
+{
+ struct anv_cmd_buffer *cmd_buffer = batch->driver_batch;
+
+ /* Turn on preemption if it was toggled off. */
+ if (!cmd_buffer->state.gfx.object_preemption)
+ genX(cmd_buffer_set_preemption)(cmd_buffer, true);
+
+ if (!cmd_buffer->state.current_l3_config) {
+ const struct intel_l3_config *cfg =
+ intel_get_default_l3_config(cmd_buffer->device->info);
+ genX(cmd_buffer_config_l3)(cmd_buffer, cfg);
+ }
+
+ if (batch->flags & BLORP_BATCH_USE_BLITTER)
+ blorp_exec_on_blitter(batch, params);
+ else if (batch->flags & BLORP_BATCH_USE_COMPUTE)
+ blorp_exec_on_compute(batch, params);
+ else
+ blorp_exec_on_render(batch, params);
+}
+
+static void
+blorp_emit_pre_draw(struct blorp_batch *batch, const struct blorp_params *params)
+{
+ struct anv_cmd_buffer *cmd_buffer = batch->driver_batch;
+ blorp_measure_start(batch, params);
+ genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device, true);
+}
+
+static void
+blorp_emit_post_draw(struct blorp_batch *batch, const struct blorp_params *params)
+{
+ struct anv_cmd_buffer *cmd_buffer = batch->driver_batch;
+
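+   /* BLORP draws a single 3-vertex RECTLIST, hence the fixed arguments to the
+    * post-3DPRIMITIVE workarounds below.
+    */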
+ genX(batch_emit_post_3dprimitive_was)(&cmd_buffer->batch,
+ cmd_buffer->device,
+ _3DPRIM_RECTLIST,
+ 3);
+
+ genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device, false);
+ blorp_measure_end(batch, params);
+}
+
+void
+genX(blorp_init_dynamic_states)(struct blorp_context *context)
+{
+ blorp_init_dynamic_states(context);
}
diff --git a/src/intel/vulkan/genX_cmd_buffer.c b/src/intel/vulkan/genX_cmd_buffer.c
index 7d3e72f1711..390a8ac2bde 100644
--- a/src/intel/vulkan/genX_cmd_buffer.c
+++ b/src/intel/vulkan/genX_cmd_buffer.c
@@ -26,17 +26,16 @@
#include "anv_private.h"
#include "anv_measure.h"
-#include "vk_format.h"
+#include "vk_render_pass.h"
#include "vk_util.h"
-#include "util/fast_idiv_by_const.h"
#include "common/intel_aux_map.h"
-#include "common/intel_l3_config.h"
#include "genxml/gen_macros.h"
#include "genxml/genX_pack.h"
-#include "genxml/gen_rt_pack.h"
+#include "genxml/genX_rt_pack.h"
+#include "common/intel_genX_state_brw.h"
-#include "nir/nir_xfb_info.h"
+#include "ds/intel_tracepoints.h"
/* We reserve :
* - GPR 14 for secondary command buffer returns
@@ -48,6 +47,8 @@
#define __gen_get_batch_address(b, a) anv_batch_address(b, a)
#include "common/mi_builder.h"
+#include "genX_cmd_draw_generated_flush.h"
+
static void genX(flush_pipeline_select)(struct anv_cmd_buffer *cmd_buffer,
uint32_t pipeline);
@@ -56,11 +57,17 @@ convert_pc_to_bits(struct GENX(PIPE_CONTROL) *pc) {
enum anv_pipe_bits bits = 0;
bits |= (pc->DepthCacheFlushEnable) ? ANV_PIPE_DEPTH_CACHE_FLUSH_BIT : 0;
bits |= (pc->DCFlushEnable) ? ANV_PIPE_DATA_CACHE_FLUSH_BIT : 0;
-#if GFX_VER >= 12
+#if GFX_VERx10 >= 125
+ bits |= (pc->PSSStallSyncEnable) ? ANV_PIPE_PSS_STALL_SYNC_BIT : 0;
+#endif
+#if GFX_VER == 12
bits |= (pc->TileCacheFlushEnable) ? ANV_PIPE_TILE_CACHE_FLUSH_BIT : 0;
+#endif
+#if GFX_VER >= 12
bits |= (pc->HDCPipelineFlushEnable) ? ANV_PIPE_HDC_PIPELINE_FLUSH_BIT : 0;
#endif
bits |= (pc->RenderTargetCacheFlushEnable) ? ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT : 0;
+ bits |= (pc->VFCacheInvalidationEnable) ? ANV_PIPE_VF_CACHE_INVALIDATE_BIT : 0;
bits |= (pc->StateCacheInvalidationEnable) ? ANV_PIPE_STATE_CACHE_INVALIDATE_BIT : 0;
bits |= (pc->ConstantCacheInvalidationEnable) ? ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT : 0;
bits |= (pc->TextureCacheInvalidationEnable) ? ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT : 0;
@@ -68,22 +75,29 @@ convert_pc_to_bits(struct GENX(PIPE_CONTROL) *pc) {
bits |= (pc->StallAtPixelScoreboard) ? ANV_PIPE_STALL_AT_SCOREBOARD_BIT : 0;
bits |= (pc->DepthStallEnable) ? ANV_PIPE_DEPTH_STALL_BIT : 0;
bits |= (pc->CommandStreamerStallEnable) ? ANV_PIPE_CS_STALL_BIT : 0;
+#if GFX_VERx10 == 125
+ bits |= (pc->UntypedDataPortCacheFlushEnable) ? ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT : 0;
+ bits |= (pc->CCSFlushEnable) ? ANV_PIPE_CCS_CACHE_FLUSH_BIT : 0;
+#endif
return bits;
}
-#define anv_debug_dump_pc(pc) \
- if (unlikely(INTEL_DEBUG & DEBUG_PIPE_CONTROL)) { \
- fputs("pc: emit PC=( ", stderr); \
- anv_dump_pipe_bits(convert_pc_to_bits(&(pc))); \
- fprintf(stderr, ") reason: %s\n", __FUNCTION__); \
+#define anv_debug_dump_pc(pc, reason) \
+ if (INTEL_DEBUG(DEBUG_PIPE_CONTROL)) { \
+ fputs("pc: emit PC=( ", stdout); \
+ anv_dump_pipe_bits(convert_pc_to_bits(&(pc)), stdout); \
+ fprintf(stdout, ") reason: %s\n", reason); \
}
void
genX(cmd_buffer_emit_state_base_address)(struct anv_cmd_buffer *cmd_buffer)
{
+ if (anv_cmd_buffer_is_blitter_queue(cmd_buffer) ||
+ anv_cmd_buffer_is_video_queue(cmd_buffer))
+ return;
+
struct anv_device *device = cmd_buffer->device;
- UNUSED const struct intel_device_info *devinfo = &device->info;
- uint32_t mocs = isl_mocs(&device->isl_dev, 0, false);
+ const uint32_t mocs = isl_mocs(&device->isl_dev, 0, false);
/* If we are emitting a new state base address we probably need to re-emit
* binding tables.
@@ -93,33 +107,22 @@ genX(cmd_buffer_emit_state_base_address)(struct anv_cmd_buffer *cmd_buffer)
/* Emit a render target cache flush.
*
* This isn't documented anywhere in the PRM. However, it seems to be
- * necessary prior to changing the surface state base adress. Without
+ * necessary prior to changing the surface state base address. Without
* this, we get GPU hangs when using multi-level command buffers which
* clear depth, reset state base address, and then go render stuff.
*/
- anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
+ genx_batch_emit_pipe_control
+ (&cmd_buffer->batch, cmd_buffer->device->info,
+ cmd_buffer->state.current_pipeline,
#if GFX_VER >= 12
- pc.HDCPipelineFlushEnable = true;
+ ANV_PIPE_HDC_PIPELINE_FLUSH_BIT |
#else
- pc.DCFlushEnable = true;
-#endif
- pc.RenderTargetCacheFlushEnable = true;
- pc.CommandStreamerStallEnable = true;
-#if GFX_VER == 12
- /* Wa_1606662791:
- *
- * Software must program PIPE_CONTROL command with "HDC Pipeline
- * Flush" prior to programming of the below two non-pipeline state :
- * * STATE_BASE_ADDRESS
- * * 3DSTATE_BINDING_TABLE_POOL_ALLOC
- */
- if (devinfo->revision == 0 /* A0 */)
- pc.HDCPipelineFlushEnable = true;
+ ANV_PIPE_DATA_CACHE_FLUSH_BIT |
#endif
- anv_debug_dump_pc(pc);
- }
+ ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT |
+ ANV_PIPE_CS_STALL_BIT);
-#if GFX_VER == 12
+#if INTEL_NEEDS_WA_1607854226
/* Wa_1607854226:
*
* Workaround the non pipelined state not applying in MEDIA/GPGPU pipeline
@@ -129,94 +132,157 @@ genX(cmd_buffer_emit_state_base_address)(struct anv_cmd_buffer *cmd_buffer)
genX(flush_pipeline_select_3d)(cmd_buffer);
#endif
+   /* If no API entry point has selected the current mode yet (this can happen
+    * depending on the first operation recorded in the command buffer), select
+    * BUFFER if EXT_descriptor_buffer is enabled, otherwise LEGACY.
+    */
+ if (cmd_buffer->state.pending_db_mode ==
+ ANV_CMD_DESCRIPTOR_BUFFER_MODE_UNKNOWN) {
+ cmd_buffer->state.pending_db_mode =
+ cmd_buffer->device->vk.enabled_extensions.EXT_descriptor_buffer ?
+ ANV_CMD_DESCRIPTOR_BUFFER_MODE_BUFFER :
+ ANV_CMD_DESCRIPTOR_BUFFER_MODE_LEGACY;
+ }
+
anv_batch_emit(&cmd_buffer->batch, GENX(STATE_BASE_ADDRESS), sba) {
sba.GeneralStateBaseAddress = (struct anv_address) { NULL, 0 };
sba.GeneralStateMOCS = mocs;
+ sba.GeneralStateBufferSize = 0xfffff;
sba.GeneralStateBaseAddressModifyEnable = true;
+ sba.GeneralStateBufferSizeModifyEnable = true;
sba.StatelessDataPortAccessMOCS = mocs;
+#if GFX_VERx10 >= 125
+ sba.SurfaceStateBaseAddress =
+ (struct anv_address) { .offset =
+ device->physical->va.internal_surface_state_pool.addr,
+ };
+#else
sba.SurfaceStateBaseAddress =
anv_cmd_buffer_surface_base_address(cmd_buffer);
+#endif
sba.SurfaceStateMOCS = mocs;
sba.SurfaceStateBaseAddressModifyEnable = true;
- sba.DynamicStateBaseAddress =
- (struct anv_address) { device->dynamic_state_pool.block_pool.bo, 0 };
- sba.DynamicStateMOCS = mocs;
- sba.DynamicStateBaseAddressModifyEnable = true;
-
sba.IndirectObjectBaseAddress = (struct anv_address) { NULL, 0 };
sba.IndirectObjectMOCS = mocs;
+ sba.IndirectObjectBufferSize = 0xfffff;
sba.IndirectObjectBaseAddressModifyEnable = true;
+ sba.IndirectObjectBufferSizeModifyEnable = true;
sba.InstructionBaseAddress =
(struct anv_address) { device->instruction_state_pool.block_pool.bo, 0 };
sba.InstructionMOCS = mocs;
+ sba.InstructionBufferSize =
+ device->physical->va.instruction_state_pool.size / 4096;
sba.InstructionBaseAddressModifyEnable = true;
+ sba.InstructionBuffersizeModifyEnable = true;
+
+#if GFX_VER >= 11
+ sba.BindlessSamplerStateBaseAddress = ANV_NULL_ADDRESS;
+ sba.BindlessSamplerStateBufferSize = 0;
+ sba.BindlessSamplerStateMOCS = mocs;
+ sba.BindlessSamplerStateBaseAddressModifyEnable = true;
+#endif
+
+ if (cmd_buffer->state.pending_db_mode == ANV_CMD_DESCRIPTOR_BUFFER_MODE_BUFFER) {
+ sba.DynamicStateBaseAddress = (struct anv_address) {
+ .offset = device->physical->va.dynamic_state_db_pool.addr,
+ };
+ sba.DynamicStateBufferSize =
+ (device->physical->va.dynamic_state_db_pool.size +
+ device->physical->va.descriptor_buffer_pool.size +
+ device->physical->va.push_descriptor_buffer_pool.size) / 4096;
+ sba.DynamicStateMOCS = mocs;
+ sba.DynamicStateBaseAddressModifyEnable = true;
+ sba.DynamicStateBufferSizeModifyEnable = true;
+
+#if GFX_VERx10 >= 125
+ sba.BindlessSurfaceStateBaseAddress = (struct anv_address) {
+ .offset = device->physical->va.descriptor_buffer_pool.addr,
+ };
+ sba.BindlessSurfaceStateSize =
+ (device->physical->va.descriptor_buffer_pool.size +
+ device->physical->va.push_descriptor_buffer_pool.size) - 1;
+ sba.BindlessSurfaceStateMOCS = mocs;
+ sba.BindlessSurfaceStateBaseAddressModifyEnable = true;
+#else
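+         /* If the application has not bound any descriptor buffer yet, fall
+          * back to the workaround BO so the bindless surface base still points
+          * at valid memory.
+          */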
+ const uint64_t surfaces_addr =
+ cmd_buffer->state.descriptor_buffers.surfaces_address != 0 ?
+ cmd_buffer->state.descriptor_buffers.surfaces_address :
+ anv_address_physical(device->workaround_address);
+ const uint64_t surfaces_size =
+ cmd_buffer->state.descriptor_buffers.surfaces_address != 0 ?
+ MIN2(device->physical->va.descriptor_buffer_pool.size -
+ (cmd_buffer->state.descriptor_buffers.surfaces_address -
+ device->physical->va.descriptor_buffer_pool.addr),
+ anv_physical_device_bindless_heap_size(device->physical, true)) :
+ (device->workaround_bo->size - device->workaround_address.offset);
+ sba.BindlessSurfaceStateBaseAddress = (struct anv_address) {
+ .offset = surfaces_addr,
+ };
+ sba.BindlessSurfaceStateSize = surfaces_size / ANV_SURFACE_STATE_SIZE - 1;
+ sba.BindlessSurfaceStateMOCS = mocs;
+ sba.BindlessSurfaceStateBaseAddressModifyEnable = true;
+#endif /* GFX_VERx10 < 125 */
+ } else if (!device->physical->indirect_descriptors) {
+#if GFX_VERx10 >= 125
+ sba.DynamicStateBaseAddress = (struct anv_address) {
+ .offset = device->physical->va.dynamic_state_pool.addr,
+ };
+ sba.DynamicStateBufferSize =
+ (device->physical->va.dynamic_state_pool.size +
+ device->physical->va.sampler_state_pool.size) / 4096;
+ sba.DynamicStateMOCS = mocs;
+ sba.DynamicStateBaseAddressModifyEnable = true;
+ sba.DynamicStateBufferSizeModifyEnable = true;
-# if (GFX_VER >= 8)
- /* Broadwell requires that we specify a buffer size for a bunch of
- * these fields. However, since we will be growing the BO's live, we
- * just set them all to the maximum.
- */
- sba.GeneralStateBufferSize = 0xfffff;
- sba.IndirectObjectBufferSize = 0xfffff;
- if (anv_use_softpin(device->physical)) {
- /* With softpin, we use fixed addresses so we actually know how big
- * our base addresses are.
- */
- sba.DynamicStateBufferSize = DYNAMIC_STATE_POOL_SIZE / 4096;
- sba.InstructionBufferSize = INSTRUCTION_STATE_POOL_SIZE / 4096;
- } else {
- sba.DynamicStateBufferSize = 0xfffff;
- sba.InstructionBufferSize = 0xfffff;
- }
- sba.GeneralStateBufferSizeModifyEnable = true;
- sba.IndirectObjectBufferSizeModifyEnable = true;
- sba.DynamicStateBufferSizeModifyEnable = true;
- sba.InstructionBuffersizeModifyEnable = true;
-# else
- /* On gfx7, we have upper bounds instead. According to the docs,
- * setting an upper bound of zero means that no bounds checking is
- * performed so, in theory, we should be able to leave them zero.
- * However, border color is broken and the GPU bounds-checks anyway.
- * To avoid this and other potential problems, we may as well set it
- * for everything.
- */
- sba.GeneralStateAccessUpperBound =
- (struct anv_address) { .bo = NULL, .offset = 0xfffff000 };
- sba.GeneralStateAccessUpperBoundModifyEnable = true;
- sba.DynamicStateAccessUpperBound =
- (struct anv_address) { .bo = NULL, .offset = 0xfffff000 };
- sba.DynamicStateAccessUpperBoundModifyEnable = true;
- sba.InstructionAccessUpperBound =
- (struct anv_address) { .bo = NULL, .offset = 0xfffff000 };
- sba.InstructionAccessUpperBoundModifyEnable = true;
-# endif
-# if (GFX_VER >= 9)
- if (anv_use_softpin(device->physical)) {
sba.BindlessSurfaceStateBaseAddress = (struct anv_address) {
- .bo = device->surface_state_pool.block_pool.bo,
- .offset = 0,
+ .offset = device->physical->va.internal_surface_state_pool.addr,
};
- sba.BindlessSurfaceStateSize = (1 << 20) - 1;
+ sba.BindlessSurfaceStateSize =
+ (device->physical->va.internal_surface_state_pool.size +
+ device->physical->va.bindless_surface_state_pool.size) - 1;
+ sba.BindlessSurfaceStateMOCS = mocs;
+ sba.BindlessSurfaceStateBaseAddressModifyEnable = true;
+#else
+ unreachable("Direct descriptor not supported");
+#endif
} else {
- sba.BindlessSurfaceStateBaseAddress = ANV_NULL_ADDRESS;
- sba.BindlessSurfaceStateSize = 0;
+ sba.DynamicStateBaseAddress = (struct anv_address) {
+ .offset = device->physical->va.dynamic_state_pool.addr,
+ };
+ sba.DynamicStateBufferSize =
+ (device->physical->va.dynamic_state_pool.size +
+ device->physical->va.sampler_state_pool.size) / 4096;
+ sba.DynamicStateMOCS = mocs;
+ sba.DynamicStateBaseAddressModifyEnable = true;
+ sba.DynamicStateBufferSizeModifyEnable = true;
+
+ sba.BindlessSurfaceStateBaseAddress =
+ (struct anv_address) { .offset =
+ device->physical->va.bindless_surface_state_pool.addr,
+ };
+ sba.BindlessSurfaceStateSize =
+ anv_physical_device_bindless_heap_size(device->physical, false) /
+ ANV_SURFACE_STATE_SIZE - 1;
+ sba.BindlessSurfaceStateMOCS = mocs;
+ sba.BindlessSurfaceStateBaseAddressModifyEnable = true;
}
- sba.BindlessSurfaceStateMOCS = mocs;
- sba.BindlessSurfaceStateBaseAddressModifyEnable = true;
-# endif
-# if (GFX_VER >= 10)
- sba.BindlessSamplerStateBaseAddress = (struct anv_address) { NULL, 0 };
- sba.BindlessSamplerStateMOCS = mocs;
- sba.BindlessSamplerStateBaseAddressModifyEnable = true;
- sba.BindlessSamplerStateBufferSize = 0;
-# endif
+
+#if GFX_VERx10 >= 125
+ sba.L1CacheControl = L1CC_WB;
+#endif
}
-#if GFX_VER == 12
+ bool db_mode_changed = false;
+ if (cmd_buffer->state.current_db_mode != cmd_buffer->state.pending_db_mode) {
+ cmd_buffer->state.current_db_mode = cmd_buffer->state.pending_db_mode;
+ db_mode_changed = true;
+ }
+
+#if INTEL_NEEDS_WA_1607854226
/* Wa_1607854226:
*
* Put the pipeline back into its current mode.
@@ -225,8 +291,12 @@ genX(cmd_buffer_emit_state_base_address)(struct anv_cmd_buffer *cmd_buffer)
genX(flush_pipeline_select)(cmd_buffer, gfx12_wa_pipeline);
#endif
+#if GFX_VERx10 >= 125
+ genX(cmd_buffer_emit_bt_pool_base_address)(cmd_buffer);
+#endif
+
/* After re-setting the surface state base address, we have to do some
- * cache flusing so that the sampler engine will pick up the new
+ * cache flushing so that the sampler engine will pick up the new
* SURFACE_STATE objects and binding tables. From the Broadwell PRM,
* Shared Function > 3D Sampler > State > State Caching (page 96):
*
@@ -261,332 +331,145 @@ genX(cmd_buffer_emit_state_base_address)(struct anv_cmd_buffer *cmd_buffer)
* sufficient. The theory here is that all of the sampling/rendering
* units cache the binding table in the texture cache. However, we have
* yet to be able to actually confirm this.
+ *
+ * Wa_14013910100:
+ *
+ * "DG2 128/256/512-A/B: S/W must program STATE_BASE_ADDRESS command twice
+ * or program pipe control with Instruction cache invalidate post
+ * STATE_BASE_ADDRESS command"
*/
- anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
- pc.TextureCacheInvalidationEnable = true;
- pc.ConstantCacheInvalidationEnable = true;
- pc.StateCacheInvalidationEnable = true;
- anv_debug_dump_pc(pc);
+ enum anv_pipe_bits bits =
+ ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT |
+ ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT |
+ ANV_PIPE_STATE_CACHE_INVALIDATE_BIT |
+ (intel_needs_workaround(cmd_buffer->device->info, 16013000631) ?
+ ANV_PIPE_INSTRUCTION_CACHE_INVALIDATE_BIT : 0);
+
+#if GFX_VER >= 9 && GFX_VER <= 11
+ /* From the SKL PRM, Vol. 2a, "PIPE_CONTROL",
+ *
+ * "Workaround : “CS Stall” bit in PIPE_CONTROL command must be
+ * always set for GPGPU workloads when “Texture Cache Invalidation
+ * Enable” bit is set".
+ *
+    * This workaround stopped appearing in TGL PRMs.
+ */
+ if (cmd_buffer->state.current_pipeline == GPGPU)
+ bits |= ANV_PIPE_CS_STALL_BIT;
+#endif
+ genx_batch_emit_pipe_control(&cmd_buffer->batch, cmd_buffer->device->info,
+ cmd_buffer->state.current_pipeline,
+ bits);
+
+ assert(cmd_buffer->state.current_db_mode !=
+ ANV_CMD_DESCRIPTOR_BUFFER_MODE_UNKNOWN);
+ if (db_mode_changed) {
+#if GFX_VER == 11 || GFX_VERx10 == 125
+ anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_SLICE_TABLE_STATE_POINTERS), ptr) {
+ ptr.SliceHashStatePointerValid = true;
+ ptr.SliceHashTableStatePointer = cmd_buffer->state.current_db_mode ==
+ ANV_CMD_DESCRIPTOR_BUFFER_MODE_BUFFER ?
+ device->slice_hash_db.offset :
+ device->slice_hash.offset;
+ }
+#endif
+
+ /* Changing the dynamic state location affects all the states having
+ * offset relative to that pointer.
+ */
+ struct anv_gfx_dynamic_state *hw_state = &cmd_buffer->state.gfx.dyn_state;
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_VIEWPORT_SF_CLIP);
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_VIEWPORT_CC);
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_SCISSOR);
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_CC_STATE);
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_BLEND_STATE);
+ if (cmd_buffer->device->vk.enabled_extensions.KHR_fragment_shading_rate) {
+ struct vk_dynamic_graphics_state *dyn =
+ &cmd_buffer->vk.dynamic_graphics_state;
+ BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_FSR);
+ }
+
+#if GFX_VERx10 < 125
+ /* The push constant data for compute shader is an offset in the dynamic
+ * state heap. If we change it, we need to reemit the push constants.
+ */
+ cmd_buffer->state.push_constants_dirty |= VK_SHADER_STAGE_COMPUTE_BIT;
+ cmd_buffer->state.compute.base.push_constants_data_dirty = true;
+#endif
}
}
-static void
-add_surface_reloc(struct anv_cmd_buffer *cmd_buffer,
- struct anv_state state, struct anv_address addr)
+void
+genX(cmd_buffer_emit_bt_pool_base_address)(struct anv_cmd_buffer *cmd_buffer)
{
- VkResult result;
+ if (!anv_cmd_buffer_is_render_or_compute_queue(cmd_buffer))
+ return;
- if (anv_use_softpin(cmd_buffer->device->physical)) {
- result = anv_reloc_list_add_bo(&cmd_buffer->surface_relocs,
- &cmd_buffer->pool->alloc,
- addr.bo);
- } else {
- const struct isl_device *isl_dev = &cmd_buffer->device->isl_dev;
- result = anv_reloc_list_add(&cmd_buffer->surface_relocs,
- &cmd_buffer->pool->alloc,
- state.offset + isl_dev->ss.addr_offset,
- addr.bo, addr.offset, NULL);
+   /* If we are emitting a new binding table pool base address, we probably
+    * need to re-emit binding tables.
+    */
+ cmd_buffer->state.descriptors_dirty |= ~0;
+
+#if GFX_VERx10 >= 125
+ struct anv_device *device = cmd_buffer->device;
+ const uint32_t mocs = isl_mocs(&device->isl_dev, 0, false);
+
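+   /* 3DSTATE_BINDING_TABLE_POOL_ALLOC is non-pipelined: stall the command
+    * streamer before reprogramming it and invalidate the state cache after so
+    * stale binding table pointers are not reused.
+    */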
+ genx_batch_emit_pipe_control(&cmd_buffer->batch,
+ cmd_buffer->device->info,
+ cmd_buffer->state.current_pipeline,
+ ANV_PIPE_CS_STALL_BIT);
+ anv_batch_emit(
+ &cmd_buffer->batch, GENX(3DSTATE_BINDING_TABLE_POOL_ALLOC), btpa) {
+ btpa.BindingTablePoolBaseAddress =
+ anv_cmd_buffer_surface_base_address(cmd_buffer);
+ btpa.BindingTablePoolBufferSize = device->physical->va.binding_table_pool.size / 4096;
+ btpa.MOCS = mocs;
}
+ genx_batch_emit_pipe_control(&cmd_buffer->batch,
+ cmd_buffer->device->info,
+ cmd_buffer->state.current_pipeline,
+ ANV_PIPE_STATE_CACHE_INVALIDATE_BIT);
+#else /* GFX_VERx10 < 125 */
+ genX(cmd_buffer_emit_state_base_address)(cmd_buffer);
+#endif
+}
+
+static void
+add_surface_reloc(struct anv_cmd_buffer *cmd_buffer,
+ struct anv_address addr)
+{
+ VkResult result = anv_reloc_list_add_bo(&cmd_buffer->surface_relocs,
+ addr.bo);
+
if (unlikely(result != VK_SUCCESS))
anv_batch_set_error(&cmd_buffer->batch, result);
}
static void
add_surface_state_relocs(struct anv_cmd_buffer *cmd_buffer,
- struct anv_surface_state state)
+ const struct anv_surface_state *state)
{
- const struct isl_device *isl_dev = &cmd_buffer->device->isl_dev;
-
- assert(!anv_address_is_null(state.address));
- add_surface_reloc(cmd_buffer, state.state, state.address);
+ assert(!anv_address_is_null(state->address));
+ add_surface_reloc(cmd_buffer, state->address);
- if (!anv_address_is_null(state.aux_address)) {
+ if (!anv_address_is_null(state->aux_address)) {
VkResult result =
- anv_reloc_list_add(&cmd_buffer->surface_relocs,
- &cmd_buffer->pool->alloc,
- state.state.offset + isl_dev->ss.aux_addr_offset,
- state.aux_address.bo,
- state.aux_address.offset,
- NULL);
+ anv_reloc_list_add_bo(&cmd_buffer->surface_relocs,
+ state->aux_address.bo);
if (result != VK_SUCCESS)
anv_batch_set_error(&cmd_buffer->batch, result);
}
- if (!anv_address_is_null(state.clear_address)) {
+ if (!anv_address_is_null(state->clear_address)) {
VkResult result =
- anv_reloc_list_add(&cmd_buffer->surface_relocs,
- &cmd_buffer->pool->alloc,
- state.state.offset +
- isl_dev->ss.clear_color_state_offset,
- state.clear_address.bo,
- state.clear_address.offset,
- NULL);
+ anv_reloc_list_add_bo(&cmd_buffer->surface_relocs,
+ state->clear_address.bo);
if (result != VK_SUCCESS)
anv_batch_set_error(&cmd_buffer->batch, result);
}
}
-static bool
-isl_color_value_requires_conversion(union isl_color_value color,
- const struct isl_surf *surf,
- const struct isl_view *view)
-{
- if (surf->format == view->format && isl_swizzle_is_identity(view->swizzle))
- return false;
-
- uint32_t surf_pack[4] = { 0, 0, 0, 0 };
- isl_color_value_pack(&color, surf->format, surf_pack);
-
- uint32_t view_pack[4] = { 0, 0, 0, 0 };
- union isl_color_value swiz_color =
- isl_color_value_swizzle_inv(color, view->swizzle);
- isl_color_value_pack(&swiz_color, view->format, view_pack);
-
- return memcmp(surf_pack, view_pack, sizeof(surf_pack)) != 0;
-}
-
-static bool
-anv_can_fast_clear_color_view(struct anv_device * device,
- struct anv_image_view *iview,
- VkImageLayout layout,
- union isl_color_value clear_color,
- uint32_t num_layers,
- VkRect2D render_area)
-{
- if (iview->planes[0].isl.base_array_layer >=
- anv_image_aux_layers(iview->image, VK_IMAGE_ASPECT_COLOR_BIT,
- iview->planes[0].isl.base_level))
- return false;
-
- /* Start by getting the fast clear type. We use the first subpass
- * layout here because we don't want to fast-clear if the first subpass
- * to use the attachment can't handle fast-clears.
- */
- enum anv_fast_clear_type fast_clear_type =
- anv_layout_to_fast_clear_type(&device->info, iview->image,
- VK_IMAGE_ASPECT_COLOR_BIT,
- layout);
- switch (fast_clear_type) {
- case ANV_FAST_CLEAR_NONE:
- return false;
- case ANV_FAST_CLEAR_DEFAULT_VALUE:
- if (!isl_color_value_is_zero(clear_color, iview->planes[0].isl.format))
- return false;
- break;
- case ANV_FAST_CLEAR_ANY:
- break;
- }
-
- /* Potentially, we could do partial fast-clears but doing so has crazy
- * alignment restrictions. It's easier to just restrict to full size
- * fast clears for now.
- */
- if (render_area.offset.x != 0 ||
- render_area.offset.y != 0 ||
- render_area.extent.width != iview->vk.extent.width ||
- render_area.extent.height != iview->vk.extent.height)
- return false;
-
- /* On Broadwell and earlier, we can only handle 0/1 clear colors */
- if (GFX_VER <= 8 &&
- !isl_color_value_is_zero_one(clear_color, iview->planes[0].isl.format))
- return false;
-
- /* If the clear color is one that would require non-trivial format
- * conversion on resolve, we don't bother with the fast clear. This
- * shouldn't be common as most clear colors are 0/1 and the most common
- * format re-interpretation is for sRGB.
- */
- if (isl_color_value_requires_conversion(clear_color,
- &iview->image->planes[0].primary_surface.isl,
- &iview->planes[0].isl)) {
- anv_perf_warn(device, &iview->vk.base,
- "Cannot fast-clear to colors which would require "
- "format conversion on resolve");
- return false;
- }
-
- /* We only allow fast clears to the first slice of an image (level 0,
- * layer 0) and only for the entire slice. This guarantees us that, at
- * any given time, there is only one clear color on any given image at
- * any given time. At the time of our testing (Jan 17, 2018), there
- * were no known applications which would benefit from fast-clearing
- * more than just the first slice.
- */
- if (iview->planes[0].isl.base_level > 0 ||
- iview->planes[0].isl.base_array_layer > 0) {
- anv_perf_warn(device, &iview->image->vk.base,
- "Rendering with multi-lod or multi-layer framebuffer "
- "with LOAD_OP_LOAD and baseMipLevel > 0 or "
- "baseArrayLayer > 0. Not fast clearing.");
- return false;
- }
-
- if (num_layers > 1) {
- anv_perf_warn(device, &iview->image->vk.base,
- "Rendering to a multi-layer framebuffer with "
- "LOAD_OP_CLEAR. Only fast-clearing the first slice");
- }
-
- return true;
-}
-
-static bool
-anv_can_hiz_clear_ds_view(struct anv_device *device,
- struct anv_image_view *iview,
- VkImageLayout layout,
- VkImageAspectFlags clear_aspects,
- float depth_clear_value,
- VkRect2D render_area)
-{
- /* We don't do any HiZ or depth fast-clears on gfx7 yet */
- if (GFX_VER == 7)
- return false;
-
- /* If we're just clearing stencil, we can always HiZ clear */
- if (!(clear_aspects & VK_IMAGE_ASPECT_DEPTH_BIT))
- return true;
-
- /* We must have depth in order to have HiZ */
- if (!(iview->image->vk.aspects & VK_IMAGE_ASPECT_DEPTH_BIT))
- return false;
-
- const enum isl_aux_usage clear_aux_usage =
- anv_layout_to_aux_usage(&device->info, iview->image,
- VK_IMAGE_ASPECT_DEPTH_BIT,
- VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT,
- layout);
- if (!blorp_can_hiz_clear_depth(&device->info,
- &iview->image->planes[0].primary_surface.isl,
- clear_aux_usage,
- iview->planes[0].isl.base_level,
- iview->planes[0].isl.base_array_layer,
- render_area.offset.x,
- render_area.offset.y,
- render_area.offset.x +
- render_area.extent.width,
- render_area.offset.y +
- render_area.extent.height))
- return false;
-
- if (depth_clear_value != ANV_HZ_FC_VAL)
- return false;
-
- /* Only gfx9+ supports returning ANV_HZ_FC_VAL when sampling a fast-cleared
- * portion of a HiZ buffer. Testing has revealed that Gfx8 only supports
- * returning 0.0f. Gens prior to gfx8 do not support this feature at all.
- */
- if (GFX_VER == 8 && anv_can_sample_with_hiz(&device->info, iview->image))
- return false;
-
- /* If we got here, then we can fast clear */
- return true;
-}
-
-#define READ_ONCE(x) (*(volatile __typeof__(x) *)&(x))
-
-#if GFX_VER == 12
-static void
-anv_image_init_aux_tt(struct anv_cmd_buffer *cmd_buffer,
- const struct anv_image *image,
- VkImageAspectFlagBits aspect,
- uint32_t base_level, uint32_t level_count,
- uint32_t base_layer, uint32_t layer_count)
-{
- const uint32_t plane = anv_image_aspect_to_plane(image, aspect);
-
- const struct anv_surface *surface = &image->planes[plane].primary_surface;
- uint64_t base_address =
- anv_address_physical(anv_image_address(image, &surface->memory_range));
-
- const struct isl_surf *isl_surf = &image->planes[plane].primary_surface.isl;
- uint64_t format_bits = intel_aux_map_format_bits_for_isl_surf(isl_surf);
-
- /* We're about to live-update the AUX-TT. We really don't want anyone else
- * trying to read it while we're doing this. We could probably get away
- * with not having this stall in some cases if we were really careful but
- * it's better to play it safe. Full stall the GPU.
- */
- anv_add_pending_pipe_bits(cmd_buffer,
- ANV_PIPE_END_OF_PIPE_SYNC_BIT,
- "before update AUX-TT");
- genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
-
- struct mi_builder b;
- mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);
-
- for (uint32_t a = 0; a < layer_count; a++) {
- const uint32_t layer = base_layer + a;
-
- uint64_t start_offset_B = UINT64_MAX, end_offset_B = 0;
- for (uint32_t l = 0; l < level_count; l++) {
- const uint32_t level = base_level + l;
-
- uint32_t logical_array_layer, logical_z_offset_px;
- if (image->vk.image_type == VK_IMAGE_TYPE_3D) {
- logical_array_layer = 0;
-
- /* If the given miplevel does not have this layer, then any higher
- * miplevels won't either because miplevels only get smaller the
- * higher the LOD.
- */
- assert(layer < image->vk.extent.depth);
- if (layer >= anv_minify(image->vk.extent.depth, level))
- break;
- logical_z_offset_px = layer;
- } else {
- assert(layer < image->vk.array_layers);
- logical_array_layer = layer;
- logical_z_offset_px = 0;
- }
-
- uint64_t slice_start_offset_B, slice_end_offset_B;
- isl_surf_get_image_range_B_tile(isl_surf, level,
- logical_array_layer,
- logical_z_offset_px,
- &slice_start_offset_B,
- &slice_end_offset_B);
-
- start_offset_B = MIN2(start_offset_B, slice_start_offset_B);
- end_offset_B = MAX2(end_offset_B, slice_end_offset_B);
- }
-
- /* Aux operates 64K at a time */
- start_offset_B = align_down_u64(start_offset_B, 64 * 1024);
- end_offset_B = align_u64(end_offset_B, 64 * 1024);
-
- for (uint64_t offset = start_offset_B;
- offset < end_offset_B; offset += 64 * 1024) {
- uint64_t address = base_address + offset;
-
- uint64_t aux_entry_addr64, *aux_entry_map;
- aux_entry_map = intel_aux_map_get_entry(cmd_buffer->device->aux_map_ctx,
- address, &aux_entry_addr64);
-
- assert(anv_use_softpin(cmd_buffer->device->physical));
- struct anv_address aux_entry_address = {
- .bo = NULL,
- .offset = aux_entry_addr64,
- };
-
- const uint64_t old_aux_entry = READ_ONCE(*aux_entry_map);
- uint64_t new_aux_entry =
- (old_aux_entry & INTEL_AUX_MAP_ADDRESS_MASK) | format_bits;
-
- if (isl_aux_usage_has_ccs(image->planes[plane].aux_usage))
- new_aux_entry |= INTEL_AUX_MAP_ENTRY_VALID_BIT;
-
- mi_store(&b, mi_mem64(aux_entry_address), mi_imm(new_aux_entry));
- }
- }
-
- anv_add_pending_pipe_bits(cmd_buffer,
- ANV_PIPE_AUX_TABLE_INVALIDATE_BIT,
- "after update AUX-TT");
-}
-#endif /* GFX_VER == 12 */
-
/* Transitions a HiZ-enabled depth buffer from one layout to another. Unless
* the initial layout is undefined, the HiZ buffer and depth buffer will
* represent the same data at the end of this operation.
@@ -594,6 +477,7 @@ anv_image_init_aux_tt(struct anv_cmd_buffer *cmd_buffer,
static void
transition_depth_buffer(struct anv_cmd_buffer *cmd_buffer,
const struct anv_image *image,
+ uint32_t base_level, uint32_t level_count,
uint32_t base_layer, uint32_t layer_count,
VkImageLayout initial_layout,
VkImageLayout final_layout,
@@ -604,32 +488,22 @@ transition_depth_buffer(struct anv_cmd_buffer *cmd_buffer,
if (image->planes[depth_plane].aux_usage == ISL_AUX_USAGE_NONE)
return;
-#if GFX_VER == 12
- if ((initial_layout == VK_IMAGE_LAYOUT_UNDEFINED ||
- initial_layout == VK_IMAGE_LAYOUT_PREINITIALIZED) &&
- cmd_buffer->device->physical->has_implicit_ccs &&
- cmd_buffer->device->info.has_aux_map) {
- anv_image_init_aux_tt(cmd_buffer, image, VK_IMAGE_ASPECT_DEPTH_BIT,
- 0, 1, base_layer, layer_count);
- }
-#endif
-
/* If will_full_fast_clear is set, the caller promises to fast-clear the
- * largest portion of the specified range as it can. For depth images,
- * that means the entire image because we don't support multi-LOD HiZ.
+ * largest portion of the specified range as it can.
*/
- assert(image->planes[0].primary_surface.isl.levels == 1);
if (will_full_fast_clear)
return;
const enum isl_aux_state initial_state =
- anv_layout_to_aux_state(&cmd_buffer->device->info, image,
+ anv_layout_to_aux_state(cmd_buffer->device->info, image,
VK_IMAGE_ASPECT_DEPTH_BIT,
- initial_layout);
+ initial_layout,
+ cmd_buffer->queue_family->queueFlags);
const enum isl_aux_state final_state =
- anv_layout_to_aux_state(&cmd_buffer->device->info, image,
+ anv_layout_to_aux_state(cmd_buffer->device->info, image,
VK_IMAGE_ASPECT_DEPTH_BIT,
- final_layout);
+ final_layout,
+ cmd_buffer->queue_family->queueFlags);
const bool initial_depth_valid =
isl_aux_state_has_valid_primary(initial_state);
@@ -642,36 +516,49 @@ transition_depth_buffer(struct anv_cmd_buffer *cmd_buffer,
/* Getting into the pass-through state for Depth is tricky and involves
* both a resolve and an ambiguate. We don't handle that state right now
- * as anv_layout_to_aux_state never returns it. Resolve/ambiguate will
- * trigger depth clears which require tile cache flushes.
+ * as anv_layout_to_aux_state never returns it.
*/
assert(final_state != ISL_AUX_STATE_PASS_THROUGH);
+ enum isl_aux_op hiz_op = ISL_AUX_OP_NONE;
if (final_needs_depth && !initial_depth_valid) {
assert(initial_hiz_valid);
- anv_image_hiz_op(cmd_buffer, image, VK_IMAGE_ASPECT_DEPTH_BIT,
- 0, base_layer, layer_count, ISL_AUX_OP_FULL_RESOLVE);
- anv_add_pending_pipe_bits(cmd_buffer,
- ANV_PIPE_TILE_CACHE_FLUSH_BIT,
- "after depth resolve");
+ hiz_op = ISL_AUX_OP_FULL_RESOLVE;
} else if (final_needs_hiz && !initial_hiz_valid) {
assert(initial_depth_valid);
- anv_image_hiz_op(cmd_buffer, image, VK_IMAGE_ASPECT_DEPTH_BIT,
- 0, base_layer, layer_count, ISL_AUX_OP_AMBIGUATE);
+ hiz_op = ISL_AUX_OP_AMBIGUATE;
+ }
+
+ if (hiz_op != ISL_AUX_OP_NONE) {
+ for (uint32_t l = 0; l < level_count; l++) {
+ const uint32_t level = base_level + l;
+
+ uint32_t aux_layers =
+ anv_image_aux_layers(image, VK_IMAGE_ASPECT_DEPTH_BIT, level);
+ if (base_layer >= aux_layers)
+ break; /* We will only get fewer layers as level increases */
+ uint32_t level_layer_count =
+ MIN2(layer_count, aux_layers - base_layer);
+
+ anv_image_hiz_op(cmd_buffer, image, VK_IMAGE_ASPECT_DEPTH_BIT,
+ l, base_layer, level_layer_count, hiz_op);
+ }
+ }
+
+ /* Additional tile cache flush for MTL:
+ *
+ * https://gitlab.freedesktop.org/mesa/mesa/-/issues/10420
+ * https://gitlab.freedesktop.org/mesa/mesa/-/issues/10530
+ */
+ if (intel_device_info_is_mtl(cmd_buffer->device->info) &&
+ image->planes[depth_plane].aux_usage == ISL_AUX_USAGE_HIZ_CCS &&
+ final_needs_depth && !initial_depth_valid) {
anv_add_pending_pipe_bits(cmd_buffer,
ANV_PIPE_TILE_CACHE_FLUSH_BIT,
- "after hiz resolve");
+ "HIZ-CCS flush");
}
}
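/* Editor's note: illustrative sketch only, not part of this patch. It
 * mirrors the per-level clamping done by the hiz_op loop above: higher
 * mip levels expose fewer aux layers, so the layer range is clamped per
 * level and the loop stops once base_layer falls outside the level.
 */
static inline uint32_t
example_level_layer_count(uint32_t aux_layers,
                          uint32_t base_layer, uint32_t layer_count)
{
   if (base_layer >= aux_layers)
      return 0; /* nothing to do at this or any higher level */
   return MIN2(layer_count, aux_layers - base_layer);
}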
-static inline bool
-vk_image_layout_stencil_write_optimal(VkImageLayout layout)
-{
- return layout == VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL ||
- layout == VK_IMAGE_LAYOUT_DEPTH_READ_ONLY_STENCIL_ATTACHMENT_OPTIMAL ||
- layout == VK_IMAGE_LAYOUT_STENCIL_ATTACHMENT_OPTIMAL_KHR;
-}
-
/* Transitions a HiZ-enabled depth buffer from one layout to another. Unless
* the initial layout is undefined, the HiZ buffer and depth buffer will
* represent the same data at the end of this operation.
@@ -685,35 +572,7 @@ transition_stencil_buffer(struct anv_cmd_buffer *cmd_buffer,
VkImageLayout final_layout,
bool will_full_fast_clear)
{
-#if GFX_VER == 7
- const uint32_t plane =
- anv_image_aspect_to_plane(image, VK_IMAGE_ASPECT_STENCIL_BIT);
-
- /* On gfx7, we have to store a texturable version of the stencil buffer in
- * a shadow whenever VK_IMAGE_USAGE_SAMPLED_BIT is set and copy back and
- * forth at strategic points. Stencil writes are only allowed in following
- * layouts:
- *
- * - VK_IMAGE_LAYOUT_GENERAL
- * - VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL
- * - VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL
- * - VK_IMAGE_LAYOUT_DEPTH_READ_ONLY_STENCIL_ATTACHMENT_OPTIMAL
- * - VK_IMAGE_LAYOUT_STENCIL_ATTACHMENT_OPTIMAL_KHR
- *
- * For general, we have no nice opportunity to transition so we do the copy
- * to the shadow unconditionally at the end of the subpass. For transfer
- * destinations, we can update it as part of the transfer op. For the other
- * layouts, we delay the copy until a transition into some other layout.
- */
- if (anv_surface_is_valid(&image->planes[plane].shadow_surface) &&
- vk_image_layout_stencil_write_optimal(initial_layout) &&
- !vk_image_layout_stencil_write_optimal(final_layout)) {
- anv_image_copy_to_shadow(cmd_buffer, image,
- VK_IMAGE_ASPECT_STENCIL_BIT,
- base_level, level_count,
- base_layer, layer_count);
- }
-#elif GFX_VER == 12
+#if GFX_VER == 12
const uint32_t plane =
anv_image_aspect_to_plane(image, VK_IMAGE_ASPECT_STENCIL_BIT);
if (image->planes[plane].aux_usage == ISL_AUX_USAGE_NONE)
@@ -721,11 +580,7 @@ transition_stencil_buffer(struct anv_cmd_buffer *cmd_buffer,
if ((initial_layout == VK_IMAGE_LAYOUT_UNDEFINED ||
initial_layout == VK_IMAGE_LAYOUT_PREINITIALIZED) &&
- cmd_buffer->device->physical->has_implicit_ccs &&
- cmd_buffer->device->info.has_aux_map) {
- anv_image_init_aux_tt(cmd_buffer, image, VK_IMAGE_ASPECT_STENCIL_BIT,
- base_level, level_count, base_layer, layer_count);
-
+ cmd_buffer->device->info->has_aux_map) {
/* If will_full_fast_clear is set, the caller promises to fast-clear the
* largest portion of the specified range as it can.
*/
@@ -737,8 +592,8 @@ transition_stencil_buffer(struct anv_cmd_buffer *cmd_buffer,
const VkRect2D clear_rect = {
.offset.x = 0,
.offset.y = 0,
- .extent.width = anv_minify(image->vk.extent.width, level),
- .extent.height = anv_minify(image->vk.extent.height, level),
+ .extent.width = u_minify(image->vk.extent.width, level),
+ .extent.height = u_minify(image->vk.extent.height, level),
};
uint32_t aux_layers =
@@ -757,6 +612,17 @@ transition_stencil_buffer(struct anv_cmd_buffer *cmd_buffer,
clear_rect, 0 /* Stencil clear value */);
}
}
+
+ /* Additional tile cache flush for MTL:
+ *
+ * https://gitlab.freedesktop.org/mesa/mesa/-/issues/10420
+ * https://gitlab.freedesktop.org/mesa/mesa/-/issues/10530
+ */
+ if (intel_device_info_is_mtl(cmd_buffer->device->info)) {
+ anv_add_pending_pipe_bits(cmd_buffer,
+ ANV_PIPE_TILE_CACHE_FLUSH_BIT,
+ "HIZ-CCS flush");
+ }
#endif
}
@@ -775,7 +641,7 @@ set_image_compressed_bit(struct anv_cmd_buffer *cmd_buffer,
const uint32_t plane = anv_image_aspect_to_plane(image, aspect);
/* We only have compression tracking for CCS_E */
- if (image->planes[plane].aux_usage != ISL_AUX_USAGE_CCS_E)
+ if (!isl_aux_usage_has_ccs_e(image->planes[plane].aux_usage))
return;
for (uint32_t a = 0; a < layer_count; a++) {
@@ -787,6 +653,22 @@ set_image_compressed_bit(struct anv_cmd_buffer *cmd_buffer,
sdi.ImmediateData = compressed ? UINT32_MAX : 0;
}
}
+
+ /* FCV_CCS_E images are automatically fast cleared to default value at
+ * render time. In order to account for this, anv should set the
+ * appropriate fast clear state for level0/layer0.
+ *
+ * At the moment, tracking the fast clear state for higher levels/layers is
+ * neither supported, nor do we enter a situation where it is a concern.
+ */
+ if (image->planes[plane].aux_usage == ISL_AUX_USAGE_FCV_CCS_E &&
+ base_layer == 0 && level == 0) {
+ anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_DATA_IMM), sdi) {
+ sdi.Address = anv_image_get_fast_clear_type_addr(cmd_buffer->device,
+ image, aspect);
+ sdi.ImmediateData = ANV_FAST_CLEAR_DEFAULT_VALUE;
+ }
+ }
}
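/* Editor's note: illustrative sketch, not part of this patch. It captures
 * the condition used above for the extra FCV write: only FCV_CCS_E images
 * get an implicit default-value fast clear recorded, and only for the
 * first subresource, since anv does not track fast-clear state beyond
 * level 0 / layer 0.
 */
static inline bool
example_fcv_tracks_default_clear(enum isl_aux_usage aux_usage,
                                 uint32_t level, uint32_t base_layer)
{
   return aux_usage == ISL_AUX_USAGE_FCV_CCS_E &&
          level == 0 && base_layer == 0;
}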
static void
@@ -811,7 +693,6 @@ set_image_fast_clear_state(struct anv_cmd_buffer *cmd_buffer,
/* This is only really practical on haswell and above because it requires
* MI math in order to get it correct.
*/
-#if GFX_VERx10 >= 75
static void
anv_cmd_compute_resolve_predicate(struct anv_cmd_buffer *cmd_buffer,
const struct anv_image *image,
@@ -820,12 +701,14 @@ anv_cmd_compute_resolve_predicate(struct anv_cmd_buffer *cmd_buffer,
enum isl_aux_op resolve_op,
enum anv_fast_clear_type fast_clear_supported)
{
+ struct anv_address addr = anv_image_get_fast_clear_type_addr(cmd_buffer->device,
+ image, aspect);
struct mi_builder b;
- mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);
+ mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
+ const uint32_t mocs = anv_mocs_for_address(cmd_buffer->device, &addr);
+ mi_builder_set_mocs(&b, mocs);
- const struct mi_value fast_clear_type =
- mi_mem32(anv_image_get_fast_clear_type_addr(cmd_buffer->device,
- image, aspect));
+ const struct mi_value fast_clear_type = mi_mem32(addr);
if (resolve_op == ISL_AUX_OP_FULL_RESOLVE) {
/* In this case, we're doing a full resolve which means we want the
@@ -892,50 +775,6 @@ anv_cmd_compute_resolve_predicate(struct anv_cmd_buffer *cmd_buffer,
mip.CompareOperation = COMPARE_SRCS_EQUAL;
}
}
-#endif /* GFX_VERx10 >= 75 */
-
-#if GFX_VER <= 8
-static void
-anv_cmd_simple_resolve_predicate(struct anv_cmd_buffer *cmd_buffer,
- const struct anv_image *image,
- VkImageAspectFlagBits aspect,
- uint32_t level, uint32_t array_layer,
- enum isl_aux_op resolve_op,
- enum anv_fast_clear_type fast_clear_supported)
-{
- struct mi_builder b;
- mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);
-
- struct mi_value fast_clear_type_mem =
- mi_mem32(anv_image_get_fast_clear_type_addr(cmd_buffer->device,
- image, aspect));
-
- /* This only works for partial resolves and only when the clear color is
- * all or nothing. On the upside, this emits less command streamer code
- * and works on Ivybridge and Bay Trail.
- */
- assert(resolve_op == ISL_AUX_OP_PARTIAL_RESOLVE);
- assert(fast_clear_supported != ANV_FAST_CLEAR_ANY);
-
- /* We don't support fast clears on anything other than the first slice. */
- if (level > 0 || array_layer > 0)
- return;
-
- /* On gfx8, we don't have a concept of default clear colors because we
- * can't sample from CCS surfaces. It's enough to just load the fast clear
- * state into the predicate register.
- */
- mi_store(&b, mi_reg64(MI_PREDICATE_SRC0), fast_clear_type_mem);
- mi_store(&b, mi_reg64(MI_PREDICATE_SRC1), mi_imm(0));
- mi_store(&b, fast_clear_type_mem, mi_imm(0));
-
- anv_batch_emit(&cmd_buffer->batch, GENX(MI_PREDICATE), mip) {
- mip.LoadOperation = LOAD_LOADINV;
- mip.CombineOperation = COMBINE_SET;
- mip.CompareOperation = COMPARE_SRCS_EQUAL;
- }
-}
-#endif /* GFX_VER <= 8 */
static void
anv_cmd_predicated_ccs_resolve(struct anv_cmd_buffer *cmd_buffer,
@@ -949,15 +788,9 @@ anv_cmd_predicated_ccs_resolve(struct anv_cmd_buffer *cmd_buffer,
{
const uint32_t plane = anv_image_aspect_to_plane(image, aspect);
-#if GFX_VER >= 9
anv_cmd_compute_resolve_predicate(cmd_buffer, image,
aspect, level, array_layer,
resolve_op, fast_clear_supported);
-#else /* GFX_VER <= 8 */
- anv_cmd_simple_resolve_predicate(cmd_buffer, image,
- aspect, level, array_layer,
- resolve_op, fast_clear_supported);
-#endif
/* CCS_D only supports full resolves and BLORP will assert on us if we try
* to do a partial resolve on a CCS_D surface.
@@ -983,16 +816,12 @@ anv_cmd_predicated_mcs_resolve(struct anv_cmd_buffer *cmd_buffer,
assert(aspect == VK_IMAGE_ASPECT_COLOR_BIT);
assert(resolve_op == ISL_AUX_OP_PARTIAL_RESOLVE);
-#if GFX_VERx10 >= 75
anv_cmd_compute_resolve_predicate(cmd_buffer, image,
aspect, 0, array_layer,
resolve_op, fast_clear_supported);
anv_image_mcs_op(cmd_buffer, image, format, swizzle, aspect,
array_layer, 1, resolve_op, NULL, true);
-#else
- unreachable("MCS resolves are unsupported on Ivybridge and Bay Trail");
-#endif
}
void
@@ -1007,13 +836,12 @@ genX(cmd_buffer_mark_image_written)(struct anv_cmd_buffer *cmd_buffer,
/* The aspect must be exactly one of the image aspects. */
assert(util_bitcount(aspect) == 1 && (aspect & image->vk.aspects));
- /* The only compression types with more than just fast-clears are MCS,
- * CCS_E, and HiZ. With HiZ we just trust the layout and don't actually
- * track the current fast-clear and compression state. This leaves us
- * with just MCS and CCS_E.
+ /* Filter out aux usages that don't have any compression tracking.
+ * Note: We only have compression tracking for CCS_E images, but it's
+ * possible for a CCS_E enabled image to have a subresource with a different
+ * aux usage.
*/
- if (aux_usage != ISL_AUX_USAGE_CCS_E &&
- aux_usage != ISL_AUX_USAGE_MCS)
+ if (!isl_aux_usage_has_compression(aux_usage))
return;
set_image_compressed_bit(cmd_buffer, image, aspect,
@@ -1028,117 +856,119 @@ init_fast_clear_color(struct anv_cmd_buffer *cmd_buffer,
assert(cmd_buffer && image);
assert(image->vk.aspects & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV);
- set_image_fast_clear_state(cmd_buffer, image, aspect,
- ANV_FAST_CLEAR_NONE);
-
- /* Initialize the struct fields that are accessed for fast-clears so that
+ /* Initialize the struct fields that are accessed for fast clears so that
* the HW restrictions on the field values are satisfied.
+ *
+ * On generations that do not support indirect clear color natively, we
+ * can just skip initializing the values, because they will be set by
+ * BLORP before actually doing the fast clear.
+ *
+ * For newer generations, we may not be able to skip initialization.
+ * Testing shows that writing to CLEAR_COLOR causes corruption if
+ * the surface is currently being used. So, care must be taken here.
+ * There are two cases that we consider:
+ *
+ * 1. For CCS_E without FCV, we can skip initializing the color-related
+ * fields, just like on the older platforms. Also, DWORDS 6 and 7
+ * are marked MBZ (or have a usable field on gfx11), but we can skip
+ * initializing them because in practice these fields need other
+ * state to be programmed for their values to matter.
+ *
+ * 2. When the FCV optimization is enabled, we must initialize the
+ * color-related fields. Otherwise, the engine might reference their
+ * uninitialized contents before we fill them for a manual fast clear
+ * with BLORP. Although the surface may be in use, no synchronization
+ * is needed before initialization. The only possible clear color we
+ * support in this mode is 0.
*/
- struct anv_address addr =
- anv_image_get_clear_color_addr(cmd_buffer->device, image, aspect);
-
- if (GFX_VER >= 9) {
- const struct isl_device *isl_dev = &cmd_buffer->device->isl_dev;
- const unsigned num_dwords = GFX_VER >= 10 ?
- isl_dev->ss.clear_color_state_size / 4 :
- isl_dev->ss.clear_value_size / 4;
+#if GFX_VER == 12
+ const uint32_t plane = anv_image_aspect_to_plane(image, aspect);
+
+ if (image->planes[plane].aux_usage == ISL_AUX_USAGE_FCV_CCS_E) {
+ assert(!image->planes[plane].can_non_zero_fast_clear);
+ assert(cmd_buffer->device->isl_dev.ss.clear_color_state_size == 32);
+
+ unsigned num_dwords = 6;
+ struct anv_address addr =
+ anv_image_get_clear_color_addr(cmd_buffer->device, image, aspect);
+
for (unsigned i = 0; i < num_dwords; i++) {
anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_DATA_IMM), sdi) {
sdi.Address = addr;
sdi.Address.offset += i * 4;
sdi.ImmediateData = 0;
- }
- }
- } else {
- anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_DATA_IMM), sdi) {
- sdi.Address = addr;
- if (GFX_VERx10 >= 75) {
- /* Pre-SKL, the dword containing the clear values also contains
- * other fields, so we need to initialize those fields to match the
- * values that would be in a color attachment.
- */
- sdi.ImmediateData = ISL_CHANNEL_SELECT_RED << 25 |
- ISL_CHANNEL_SELECT_GREEN << 22 |
- ISL_CHANNEL_SELECT_BLUE << 19 |
- ISL_CHANNEL_SELECT_ALPHA << 16;
- } else if (GFX_VER == 7) {
- /* On IVB, the dword containing the clear values also contains
- * other fields that must be zero or can be zero.
- */
- sdi.ImmediateData = 0;
+ sdi.ForceWriteCompletionCheck = i == (num_dwords - 1);
}
}
}
+#endif
}
/* Copy the fast-clear value dword(s) between a surface state object and an
* image's fast clear state buffer.
*/
-static void
-genX(copy_fast_clear_dwords)(struct anv_cmd_buffer *cmd_buffer,
+void
+genX(load_image_clear_color)(struct anv_cmd_buffer *cmd_buffer,
struct anv_state surface_state,
- const struct anv_image *image,
- VkImageAspectFlagBits aspect,
- bool copy_from_surface_state)
+ const struct anv_image *image)
{
+#if GFX_VER < 10
assert(cmd_buffer && image);
assert(image->vk.aspects & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV);
- struct anv_address ss_clear_addr = {
- .bo = cmd_buffer->device->surface_state_pool.block_pool.bo,
- .offset = surface_state.offset +
- cmd_buffer->device->isl_dev.ss.clear_value_offset,
- };
+ struct anv_address ss_clear_addr =
+ anv_state_pool_state_address(
+ &cmd_buffer->device->internal_surface_state_pool,
+ (struct anv_state) {
+ .offset = surface_state.offset +
+ cmd_buffer->device->isl_dev.ss.clear_value_offset
+ });
const struct anv_address entry_addr =
- anv_image_get_clear_color_addr(cmd_buffer->device, image, aspect);
+ anv_image_get_clear_color_addr(cmd_buffer->device, image,
+ VK_IMAGE_ASPECT_COLOR_BIT);
unsigned copy_size = cmd_buffer->device->isl_dev.ss.clear_value_size;
-#if GFX_VER == 7
- /* On gfx7, the combination of commands used here(MI_LOAD_REGISTER_MEM
- * and MI_STORE_REGISTER_MEM) can cause GPU hangs if any rendering is
- * in-flight when they are issued even if the memory touched is not
- * currently active for rendering. The weird bit is that it is not the
- * MI_LOAD/STORE_REGISTER_MEM commands which hang but rather the in-flight
- * rendering hangs such that the next stalling command after the
- * MI_LOAD/STORE_REGISTER_MEM commands will catch the hang.
+ struct mi_builder b;
+ mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
+
+ mi_memcpy(&b, ss_clear_addr, entry_addr, copy_size);
+
+ /* Updating a surface state object may require that the state cache be
+ * invalidated. From the SKL PRM, Shared Functions -> State -> State
+ * Caching:
*
- * It is unclear exactly why this hang occurs. Both MI commands come with
- * warnings about the 3D pipeline but that doesn't seem to fully explain
- * it. My (Jason's) best theory is that it has something to do with the
- * fact that we're using a GPU state register as our temporary and that
- * something with reading/writing it is causing problems.
+ * Whenever the RENDER_SURFACE_STATE object in memory pointed to by
+ * the Binding Table Pointer (BTP) and Binding Table Index (BTI) is
+ * modified [...], the L1 state cache must be invalidated to ensure
+ * the new surface or sampler state is fetched from system memory.
*
- * In order to work around this issue, we emit a PIPE_CONTROL with the
- * command streamer stall bit set.
+ * In testing, SKL doesn't actually seem to need this, but HSW does.
*/
anv_add_pending_pipe_bits(cmd_buffer,
- ANV_PIPE_CS_STALL_BIT,
- "after copy_fast_clear_dwords. Avoid potential hang");
- genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
+ ANV_PIPE_STATE_CACHE_INVALIDATE_BIT,
+ "after load_image_clear_color surface state update");
#endif
+}
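/* Editor's note: worked breakdown, not part of this patch. The mi_memcpy
 * above copies the clear-value dwords into the surface state object:
 *
 *    dst = internal_surface_state_pool + surface_state.offset +
 *          isl_dev->ss.clear_value_offset
 *    src = the image's fast-clear state buffer (clear color slot)
 *    len = isl_dev->ss.clear_value_size
 *
 * On GFX10+ the surface state references the clear color indirectly, so
 * the whole function compiles away (GFX_VER < 10 guard above).
 */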
- struct mi_builder b;
- mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);
-
- if (copy_from_surface_state) {
- mi_memcpy(&b, entry_addr, ss_clear_addr, copy_size);
- } else {
- mi_memcpy(&b, ss_clear_addr, entry_addr, copy_size);
-
- /* Updating a surface state object may require that the state cache be
- * invalidated. From the SKL PRM, Shared Functions -> State -> State
- * Caching:
- *
- * Whenever the RENDER_SURFACE_STATE object in memory pointed to by
- * the Binding Table Pointer (BTP) and Binding Table Index (BTI) is
- * modified [...], the L1 state cache must be invalidated to ensure
- * the new surface or sampler state is fetched from system memory.
- *
- * In testing, SKL doesn't actually seem to need this, but HSW does.
+void
+genX(set_fast_clear_state)(struct anv_cmd_buffer *cmd_buffer,
+ const struct anv_image *image,
+ const enum isl_format format,
+ union isl_color_value clear_color)
+{
+ if (isl_color_value_is_zero(clear_color, format)) {
+ /* This image has the auxiliary buffer enabled. We can mark the
+ * subresource as not needing a resolve because the clear color
+ * will match what's in every RENDER_SURFACE_STATE object when
+ * it's being used for sampling.
*/
- anv_add_pending_pipe_bits(cmd_buffer,
- ANV_PIPE_STATE_CACHE_INVALIDATE_BIT,
- "after copy_fast_clear_dwords surface state update");
+ set_image_fast_clear_state(cmd_buffer, image,
+ VK_IMAGE_ASPECT_COLOR_BIT,
+ ANV_FAST_CLEAR_DEFAULT_VALUE);
+ } else {
+ set_image_fast_clear_state(cmd_buffer, image,
+ VK_IMAGE_ASPECT_COLOR_BIT,
+ ANV_FAST_CLEAR_ANY);
}
}
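/* Editor's note: illustrative sketch, not part of this patch. It spells
 * out the mapping used above between the requested clear color and the
 * recorded fast-clear state.
 */
static inline enum anv_fast_clear_type
example_fast_clear_state_for_color(bool color_is_zero)
{
   return color_is_zero ? ANV_FAST_CLEAR_DEFAULT_VALUE
                        : ANV_FAST_CLEAR_ANY;
}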
@@ -1161,12 +991,12 @@ transition_color_buffer(struct anv_cmd_buffer *cmd_buffer,
uint32_t base_layer, uint32_t layer_count,
VkImageLayout initial_layout,
VkImageLayout final_layout,
- uint64_t src_queue_family,
- uint64_t dst_queue_family,
+ uint32_t src_queue_family,
+ uint32_t dst_queue_family,
bool will_full_fast_clear)
{
struct anv_device *device = cmd_buffer->device;
- const struct intel_device_info *devinfo = &device->info;
+ const struct intel_device_info *devinfo = device->info;
/* Validate the inputs. */
assert(cmd_buffer);
assert(image && image->vk.aspects & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV);
@@ -1175,13 +1005,16 @@ transition_color_buffer(struct anv_cmd_buffer *cmd_buffer,
layer_count != VK_REMAINING_ARRAY_LAYERS);
/* Ensure the subresource range is valid. */
UNUSED uint64_t last_level_num = base_level + level_count;
- const uint32_t max_depth = anv_minify(image->vk.extent.depth, base_level);
+ const uint32_t max_depth = u_minify(image->vk.extent.depth, base_level);
UNUSED const uint32_t image_layers = MAX2(image->vk.array_layers, max_depth);
assert((uint64_t)base_layer + layer_count <= image_layers);
assert(last_level_num <= image->vk.mip_levels);
- /* The spec disallows these final layouts. */
- assert(final_layout != VK_IMAGE_LAYOUT_UNDEFINED &&
- final_layout != VK_IMAGE_LAYOUT_PREINITIALIZED);
+ /* If there is a layout transfer, the final layout cannot be undefined or
+ * preinitialized (VUID-VkImageMemoryBarrier-newLayout-01198).
+ */
+ assert(initial_layout == final_layout ||
+ (final_layout != VK_IMAGE_LAYOUT_UNDEFINED &&
+ final_layout != VK_IMAGE_LAYOUT_PREINITIALIZED));
const struct isl_drm_modifier_info *isl_mod_info =
image->vk.tiling == VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT
? isl_drm_modifier_get_info(image->vk.drm_format_mod)
@@ -1195,6 +1028,18 @@ transition_color_buffer(struct anv_cmd_buffer *cmd_buffer,
dst_queue_family == VK_QUEUE_FAMILY_FOREIGN_EXT ||
dst_queue_family == VK_QUEUE_FAMILY_EXTERNAL;
+ /* If the queues are external, use the flags of the first queue family
+ * (it should be the most capable one).
+ */
+ const VkQueueFlagBits src_queue_flags =
+ device->physical->queue.families[
+ (src_queue_external || src_queue_family == VK_QUEUE_FAMILY_IGNORED) ?
+ 0 : src_queue_family].queueFlags;
+ const VkQueueFlagBits dst_queue_flags =
+ device->physical->queue.families[
+ (dst_queue_external || dst_queue_family == VK_QUEUE_FAMILY_IGNORED) ?
+ 0 : dst_queue_family].queueFlags;
+
/* Simultaneous acquire and release on external queues is illegal. */
assert(!src_queue_external || !dst_queue_external);
@@ -1202,43 +1047,81 @@ transition_color_buffer(struct anv_cmd_buffer *cmd_buffer,
* image has a DRM format modifier because we store image data in
* a driver-private bo which is inaccessible to the external queue.
*/
- const bool mod_acquire =
+ const bool private_binding_acquire =
src_queue_external &&
- image->vk.tiling == VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT;
+ anv_image_is_externally_shared(image) &&
+ anv_image_has_private_binding(image);
- const bool mod_release =
+ const bool private_binding_release =
dst_queue_external &&
- image->vk.tiling == VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT;
+ anv_image_is_externally_shared(image) &&
+ anv_image_has_private_binding(image);
if (initial_layout == final_layout &&
- !mod_acquire && !mod_release) {
+ !private_binding_acquire && !private_binding_release) {
/* No work is needed. */
return;
}
- const uint32_t plane = anv_image_aspect_to_plane(image, aspect);
-
- if (anv_surface_is_valid(&image->planes[plane].shadow_surface) &&
- final_layout == VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL) {
- /* This surface is a linear compressed image with a tiled shadow surface
- * for texturing. The client is about to use it in READ_ONLY_OPTIMAL so
- * we need to ensure the shadow copy is up-to-date.
- */
- assert(image->vk.tiling != VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT);
- assert(image->vk.aspects == VK_IMAGE_ASPECT_COLOR_BIT);
- assert(image->planes[plane].primary_surface.isl.tiling == ISL_TILING_LINEAR);
- assert(image->planes[plane].shadow_surface.isl.tiling != ISL_TILING_LINEAR);
- assert(isl_format_is_compressed(image->planes[plane].primary_surface.isl.format));
- assert(plane == 0);
- anv_image_copy_to_shadow(cmd_buffer, image,
- VK_IMAGE_ASPECT_COLOR_BIT,
- base_level, level_count,
- base_layer, layer_count);
+ /**
+ * Section 7.7.4 of the Vulkan 1.3.260 spec says:
+ *
+ * If the transfer is via an image memory barrier, and an image layout
+ * transition is desired, then the values of oldLayout and newLayout in the
+ * release operation's memory barrier must be equal to values of oldLayout
+ * and newLayout in the acquire operation's memory barrier. Although the
+ * image layout transition is submitted twice, it will only be executed
+ * once. A layout transition specified in this way happens-after the
+ * release operation and happens-before the acquire operation.
+ *
+ * Because we know we get matching transitions on each queue, we choose to
+ * only do the work on one queue type: RENDER. In the cases where we do
+ * transitions between COMPUTE & TRANSFER, we should have matching
+ * aux/fast_clear values, which would trigger no work in the code below.
+ */
+ if (!(src_queue_external || dst_queue_external) &&
+ src_queue_family != VK_QUEUE_FAMILY_IGNORED &&
+ dst_queue_family != VK_QUEUE_FAMILY_IGNORED &&
+ src_queue_family != dst_queue_family) {
+ enum intel_engine_class src_engine =
+ cmd_buffer->queue_family->engine_class;
+ if (src_engine != INTEL_ENGINE_CLASS_RENDER)
+ return;
}
+ const uint32_t plane = anv_image_aspect_to_plane(image, aspect);
+
if (base_layer >= anv_image_aux_layers(image, aspect, base_level))
return;
+ enum isl_aux_usage initial_aux_usage =
+ anv_layout_to_aux_usage(devinfo, image, aspect, 0,
+ initial_layout, src_queue_flags);
+ enum isl_aux_usage final_aux_usage =
+ anv_layout_to_aux_usage(devinfo, image, aspect, 0,
+ final_layout, dst_queue_flags);
+ enum anv_fast_clear_type initial_fast_clear =
+ anv_layout_to_fast_clear_type(devinfo, image, aspect, initial_layout,
+ src_queue_flags);
+ enum anv_fast_clear_type final_fast_clear =
+ anv_layout_to_fast_clear_type(devinfo, image, aspect, final_layout,
+ dst_queue_flags);
+
+ /* We must override the anv_layout_to_* functions because they are unaware
+ * of acquire/release direction.
+ */
+ if (private_binding_acquire) {
+ initial_aux_usage = isl_drm_modifier_has_aux(isl_mod_info->modifier) ?
+ image->planes[plane].aux_usage : ISL_AUX_USAGE_NONE;
+ initial_fast_clear = isl_mod_info->supports_clear_color ?
+ initial_fast_clear : ANV_FAST_CLEAR_NONE;
+ } else if (private_binding_release) {
+ final_aux_usage = isl_drm_modifier_has_aux(isl_mod_info->modifier) ?
+ image->planes[plane].aux_usage : ISL_AUX_USAGE_NONE;
+ final_fast_clear = isl_mod_info->supports_clear_color ?
+ final_fast_clear : ANV_FAST_CLEAR_NONE;
+ }
+
assert(image->planes[plane].primary_surface.isl.tiling != ISL_TILING_LINEAR);
/* The following layouts are equivalent for non-linear images. */
@@ -1254,8 +1137,43 @@ transition_color_buffer(struct anv_cmd_buffer *cmd_buffer,
* data.
*/
must_init_fast_clear_state = true;
- must_init_aux_surface = true;
- } else if (mod_acquire) {
+
+ if (image->planes[plane].aux_usage == ISL_AUX_USAGE_MCS ||
+ devinfo->has_illegal_ccs_values) {
+
+ must_init_aux_surface = true;
+
+ } else {
+ assert(isl_aux_usage_has_ccs_e(image->planes[plane].aux_usage));
+
+ /* We can start using the CCS immediately without ambiguating. The
+ * two conditions that enable this are:
+ *
+ * 1) The device treats all possible CCS values as legal. In other
+ * words, we can't confuse the hardware with random bits in the
+ * CCS.
+ *
+ * 2) We enable compression on all writable image layouts. The CCS
+ * will receive all writes and will therefore always be in sync
+ * with the main surface.
+ *
+ * If we were to disable compression on some writable layouts, the
+ * CCS could get out of sync with the main surface and the app
+ * could lose the data it wrote previously. For example, this
+ * could happen if an app: transitions from UNDEFINED w/o
+ * ambiguating -> renders with AUX_NONE -> samples with AUX_CCS.
+ *
+ * The second condition is asserted below, but could be moved
+ * elsewhere for more coverage (we're only checking transitions from
+ * an undefined layout).
+ */
+ assert(vk_image_layout_is_read_only(final_layout, aspect) ||
+ (final_aux_usage != ISL_AUX_USAGE_NONE));
+
+ must_init_aux_surface = false;
+ }
+
+ } else if (private_binding_acquire) {
/* The fast clear state lives in a driver-private bo, and therefore the
* external/foreign queue is unaware of it.
*
@@ -1272,18 +1190,14 @@ transition_color_buffer(struct anv_cmd_buffer *cmd_buffer,
ANV_IMAGE_MEMORY_BINDING_PRIVATE);
must_init_fast_clear_state = true;
- if (image->planes[plane].aux_surface.memory_range.binding ==
+ if (anv_image_get_aux_memory_range(image, plane)->binding ==
ANV_IMAGE_MEMORY_BINDING_PRIVATE) {
- assert(isl_mod_info->aux_usage == ISL_AUX_USAGE_NONE);
-
/* The aux surface, like the fast clear state, lives in
* a driver-private bo. We must initialize the aux surface for the
* same reasons we must initialize the fast clear state.
*/
must_init_aux_surface = true;
} else {
- assert(isl_mod_info->aux_usage != ISL_AUX_USAGE_NONE);
-
/* The aux surface, unlike the fast clear state, lives in
* application-visible VkDeviceMemory and is shared with the
* external/foreign queue. Therefore, when we acquire ownership of the
@@ -1294,24 +1208,12 @@ transition_color_buffer(struct anv_cmd_buffer *cmd_buffer,
}
}
-#if GFX_VER == 12
- /* We do not yet support modifiers with aux on gen12. */
- assert(image->vk.tiling != VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT);
-
- if (initial_layout_undefined) {
- if (device->physical->has_implicit_ccs && devinfo->has_aux_map) {
- anv_image_init_aux_tt(cmd_buffer, image, aspect,
- base_level, level_count,
- base_layer, layer_count);
- }
- }
-#else
- assert(!(device->physical->has_implicit_ccs && devinfo->has_aux_map));
-#endif
-
if (must_init_fast_clear_state) {
- if (base_level == 0 && base_layer == 0)
- init_fast_clear_color(cmd_buffer, image, aspect);
+ if (base_level == 0 && base_layer == 0) {
+ set_image_fast_clear_state(cmd_buffer, image, aspect,
+ ANV_FAST_CLEAR_NONE);
+ }
+ init_fast_clear_color(cmd_buffer, image, aspect);
}
if (must_init_aux_surface) {
@@ -1341,14 +1243,15 @@ transition_color_buffer(struct anv_cmd_buffer *cmd_buffer,
*
* For MCS, (2) is never an issue because we don't support multisampled
* storage images. In theory, issue (1) is a problem with MCS but we've
- * never seen it in the wild. For 4x and 16x, all bit patters could, in
- * theory, be interpreted as something but we don't know that all bit
+ * never seen it in the wild. For 4x and 16x, all bit patterns could,
+ * in theory, be interpreted as something but we don't know that all bit
* patterns are actually valid. For 2x and 8x, you could easily end up
* with the MCS referring to an invalid plane because not all bits of
* the MCS value are actually used. Even though we've never seen issues
* in the wild, it's best to play it safe and initialize the MCS. We
- * can use a fast-clear for MCS because we only ever touch from render
- * and texture (no image load store).
+ * could use a fast-clear for MCS because we only ever touch it from render
+ * and texture (no image load store). However, due to WA 14013111325,
+ * we choose to ambiguate MCS as well.
*/
if (image->vk.samples == 1) {
for (uint32_t l = 0; l < level_count; l++) {
@@ -1377,19 +1280,10 @@ transition_color_buffer(struct anv_cmd_buffer *cmd_buffer,
aspect, level, base_layer, level_layer_count,
ISL_AUX_OP_AMBIGUATE, NULL, false);
- if (image->planes[plane].aux_usage == ISL_AUX_USAGE_CCS_E) {
- set_image_compressed_bit(cmd_buffer, image, aspect,
- level, base_layer, level_layer_count,
- false);
- }
+ set_image_compressed_bit(cmd_buffer, image, aspect, level,
+ base_layer, level_layer_count, false);
}
} else {
- if (image->vk.samples == 4 || image->vk.samples == 16) {
- anv_perf_warn(cmd_buffer->device, &image->vk.base,
- "Doing a potentially unnecessary fast-clear to "
- "define an MCS buffer.");
- }
-
/* If will_full_fast_clear is set, the caller promises to fast-clear
* the largest portion of the specified range as it can.
*/
@@ -1401,25 +1295,11 @@ transition_color_buffer(struct anv_cmd_buffer *cmd_buffer,
image->planes[plane].primary_surface.isl.format,
ISL_SWIZZLE_IDENTITY,
aspect, base_layer, layer_count,
- ISL_AUX_OP_FAST_CLEAR, NULL, false);
+ ISL_AUX_OP_AMBIGUATE, NULL, false);
}
return;
}
- enum isl_aux_usage initial_aux_usage =
- anv_layout_to_aux_usage(devinfo, image, aspect, 0, initial_layout);
- enum isl_aux_usage final_aux_usage =
- anv_layout_to_aux_usage(devinfo, image, aspect, 0, final_layout);
-
- /* We must override the anv_layout_to_* functions because they are unaware of
- * acquire/release direction.
- */
- if (mod_acquire) {
- initial_aux_usage = isl_mod_info->aux_usage;
- } else if (mod_release) {
- final_aux_usage = isl_mod_info->aux_usage;
- }
-
/* The current code assumes that there is no mixing of CCS_E and CCS_D.
* We can handle transitions between CCS_D/E to and from NONE. What we
* don't yet handle is switching between CCS_E and CCS_D within a given
@@ -1440,15 +1320,29 @@ transition_color_buffer(struct anv_cmd_buffer *cmd_buffer,
/* If the initial layout supports more fast clear than the final layout
* then we need at least a partial resolve.
*/
- const enum anv_fast_clear_type initial_fast_clear =
- anv_layout_to_fast_clear_type(devinfo, image, aspect, initial_layout);
- const enum anv_fast_clear_type final_fast_clear =
- anv_layout_to_fast_clear_type(devinfo, image, aspect, final_layout);
- if (final_fast_clear < initial_fast_clear)
+ if (final_fast_clear < initial_fast_clear) {
+ /* Partial resolves will actually only occur on layer 0/level 0. This
+ * is generally okay because anv only allows explicit fast clears to
+ * the first subresource.
+ *
+ * The situation is a bit different with FCV_CCS_E. With that aux
+ * usage, implicit fast clears can occur on any layer and level.
+ * anv doesn't track fast clear states for more than the first
+ * subresource, so we need to assert that a layout transition doesn't
+ * attempt to partial resolve the other subresources.
+ *
+ * At the moment, we don't enter such a situation, and partial resolves
+ * for higher level/layer resources shouldn't be a concern.
+ */
+ if (image->planes[plane].aux_usage == ISL_AUX_USAGE_FCV_CCS_E) {
+ assert(base_level == 0 && level_count == 1 &&
+ base_layer == 0 && layer_count == 1);
+ }
resolve_op = ISL_AUX_OP_PARTIAL_RESOLVE;
+ }
- if (initial_aux_usage == ISL_AUX_USAGE_CCS_E &&
- final_aux_usage != ISL_AUX_USAGE_CCS_E)
+ if (isl_aux_usage_has_ccs_e(initial_aux_usage) &&
+ !isl_aux_usage_has_ccs_e(final_aux_usage))
resolve_op = ISL_AUX_OP_FULL_RESOLVE;
if (resolve_op == ISL_AUX_OP_NONE)
@@ -1474,7 +1368,7 @@ transition_color_buffer(struct anv_cmd_buffer *cmd_buffer,
anv_add_pending_pipe_bits(cmd_buffer,
ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT |
ANV_PIPE_END_OF_PIPE_SYNC_BIT,
- "after transition RT");
+ "before transition RT");
for (uint32_t l = 0; l < level_count; l++) {
uint32_t level = base_level + l;
@@ -1525,519 +1419,55 @@ transition_color_buffer(struct anv_cmd_buffer *cmd_buffer,
"after transition RT");
}
-static VkResult
-genX(cmd_buffer_setup_attachments)(struct anv_cmd_buffer *cmd_buffer,
- const struct anv_render_pass *pass,
- const struct anv_framebuffer *framebuffer,
- const VkRenderPassBeginInfo *begin)
-{
- struct anv_cmd_state *state = &cmd_buffer->state;
-
- vk_free(&cmd_buffer->pool->alloc, state->attachments);
-
- if (pass->attachment_count > 0) {
- state->attachments = vk_zalloc(&cmd_buffer->pool->alloc,
- pass->attachment_count *
- sizeof(state->attachments[0]),
- 8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
- if (state->attachments == NULL) {
- /* Propagate VK_ERROR_OUT_OF_HOST_MEMORY to vkEndCommandBuffer */
- return anv_batch_set_error(&cmd_buffer->batch,
- VK_ERROR_OUT_OF_HOST_MEMORY);
- }
- } else {
- state->attachments = NULL;
- }
-
- const VkRenderPassAttachmentBeginInfoKHR *attach_begin =
- vk_find_struct_const(begin, RENDER_PASS_ATTACHMENT_BEGIN_INFO_KHR);
- if (begin && !attach_begin)
- assert(pass->attachment_count == framebuffer->attachment_count);
-
- for (uint32_t i = 0; i < pass->attachment_count; ++i) {
- if (attach_begin && attach_begin->attachmentCount != 0) {
- assert(attach_begin->attachmentCount == pass->attachment_count);
- ANV_FROM_HANDLE(anv_image_view, iview, attach_begin->pAttachments[i]);
- state->attachments[i].image_view = iview;
- } else if (framebuffer && i < framebuffer->attachment_count) {
- state->attachments[i].image_view = framebuffer->attachments[i];
- } else {
- state->attachments[i].image_view = NULL;
- }
- }
-
- if (begin) {
- for (uint32_t i = 0; i < pass->attachment_count; ++i) {
- const struct anv_render_pass_attachment *pass_att = &pass->attachments[i];
- struct anv_attachment_state *att_state = &state->attachments[i];
- VkImageAspectFlags att_aspects = vk_format_aspects(pass_att->format);
- VkImageAspectFlags clear_aspects = 0;
- VkImageAspectFlags load_aspects = 0;
-
- if (att_aspects & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV) {
- /* color attachment */
- if (pass_att->load_op == VK_ATTACHMENT_LOAD_OP_CLEAR) {
- clear_aspects |= VK_IMAGE_ASPECT_COLOR_BIT;
- } else if (pass_att->load_op == VK_ATTACHMENT_LOAD_OP_LOAD) {
- load_aspects |= VK_IMAGE_ASPECT_COLOR_BIT;
- }
- } else {
- /* depthstencil attachment */
- if (att_aspects & VK_IMAGE_ASPECT_DEPTH_BIT) {
- if (pass_att->load_op == VK_ATTACHMENT_LOAD_OP_CLEAR) {
- clear_aspects |= VK_IMAGE_ASPECT_DEPTH_BIT;
- } else if (pass_att->load_op == VK_ATTACHMENT_LOAD_OP_LOAD) {
- load_aspects |= VK_IMAGE_ASPECT_DEPTH_BIT;
- }
- }
- if (att_aspects & VK_IMAGE_ASPECT_STENCIL_BIT) {
- if (pass_att->stencil_load_op == VK_ATTACHMENT_LOAD_OP_CLEAR) {
- clear_aspects |= VK_IMAGE_ASPECT_STENCIL_BIT;
- } else if (pass_att->stencil_load_op == VK_ATTACHMENT_LOAD_OP_LOAD) {
- load_aspects |= VK_IMAGE_ASPECT_STENCIL_BIT;
- }
- }
- }
-
- att_state->current_layout = pass_att->initial_layout;
- att_state->current_stencil_layout = pass_att->stencil_initial_layout;
- att_state->pending_clear_aspects = clear_aspects;
- att_state->pending_load_aspects = load_aspects;
- if (clear_aspects)
- att_state->clear_value = begin->pClearValues[i];
-
- struct anv_image_view *iview = state->attachments[i].image_view;
- anv_assert(iview->vk.format == pass_att->format);
-
- const uint32_t num_layers = iview->planes[0].isl.array_len;
- att_state->pending_clear_views = (1 << num_layers) - 1;
-
- /* This will be initialized after the first subpass transition. */
- att_state->aux_usage = ISL_AUX_USAGE_NONE;
-
- att_state->fast_clear = false;
- if (clear_aspects & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV) {
- assert(clear_aspects == VK_IMAGE_ASPECT_COLOR_BIT);
- att_state->fast_clear =
- anv_can_fast_clear_color_view(cmd_buffer->device, iview,
- pass_att->first_subpass_layout,
- vk_to_isl_color(att_state->clear_value.color),
- framebuffer->layers,
- begin->renderArea);
- } else if (clear_aspects & (VK_IMAGE_ASPECT_DEPTH_BIT |
- VK_IMAGE_ASPECT_STENCIL_BIT)) {
- att_state->fast_clear =
- anv_can_hiz_clear_ds_view(cmd_buffer->device, iview,
- pass_att->first_subpass_layout,
- clear_aspects,
- att_state->clear_value.depthStencil.depth,
- begin->renderArea);
- }
- }
- }
-
- return VK_SUCCESS;
-}
-
-/**
- * Setup anv_cmd_state::attachments for vkCmdBeginRenderPass.
- */
-static VkResult
-genX(cmd_buffer_alloc_att_surf_states)(struct anv_cmd_buffer *cmd_buffer,
- const struct anv_render_pass *pass,
- const struct anv_subpass *subpass)
+static MUST_CHECK VkResult
+anv_cmd_buffer_init_attachments(struct anv_cmd_buffer *cmd_buffer,
+ uint32_t color_att_count)
{
- const struct isl_device *isl_dev = &cmd_buffer->device->isl_dev;
- struct anv_cmd_state *state = &cmd_buffer->state;
+ struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
/* Reserve one for the NULL state. */
- unsigned num_states = 1;
- for (uint32_t i = 0; i < subpass->attachment_count; i++) {
- uint32_t att = subpass->attachments[i].attachment;
- if (att == VK_ATTACHMENT_UNUSED)
- continue;
-
- assert(att < pass->attachment_count);
- if (!vk_format_is_color(pass->attachments[att].format))
- continue;
-
- const VkImageUsageFlagBits att_usage = subpass->attachments[i].usage;
- assert(util_bitcount(att_usage) == 1);
-
- if (att_usage == VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT ||
- att_usage == VK_IMAGE_USAGE_INPUT_ATTACHMENT_BIT)
- num_states++;
- }
-
- const uint32_t ss_stride = align_u32(isl_dev->ss.size, isl_dev->ss.align);
- state->attachment_states =
- anv_state_stream_alloc(&cmd_buffer->surface_state_stream,
- num_states * ss_stride, isl_dev->ss.align);
- if (state->attachment_states.map == NULL) {
- return anv_batch_set_error(&cmd_buffer->batch,
- VK_ERROR_OUT_OF_DEVICE_MEMORY);
- }
+ unsigned num_states = 1 + color_att_count;
+ const struct isl_device *isl_dev = &cmd_buffer->device->isl_dev;
+ const uint32_t ss_stride = align(isl_dev->ss.size, isl_dev->ss.align);
+ gfx->att_states =
+ anv_cmd_buffer_alloc_surface_states(cmd_buffer, num_states);
+ if (gfx->att_states.map == NULL)
+ return VK_ERROR_OUT_OF_DEVICE_MEMORY;
- struct anv_state next_state = state->attachment_states;
+ struct anv_state next_state = gfx->att_states;
next_state.alloc_size = isl_dev->ss.size;
- state->null_surface_state = next_state;
+ gfx->null_surface_state = next_state;
next_state.offset += ss_stride;
next_state.map += ss_stride;
- for (uint32_t i = 0; i < subpass->attachment_count; i++) {
- uint32_t att = subpass->attachments[i].attachment;
- if (att == VK_ATTACHMENT_UNUSED)
- continue;
-
- assert(att < pass->attachment_count);
- if (!vk_format_is_color(pass->attachments[att].format))
- continue;
-
- const VkImageUsageFlagBits att_usage = subpass->attachments[i].usage;
- assert(util_bitcount(att_usage) == 1);
-
- if (att_usage == VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT)
- state->attachments[att].color.state = next_state;
- else if (att_usage == VK_IMAGE_USAGE_INPUT_ATTACHMENT_BIT)
- state->attachments[att].input.state = next_state;
- else
- continue;
-
- state->attachments[att].color.state = next_state;
+ gfx->color_att_count = color_att_count;
+ for (uint32_t i = 0; i < color_att_count; i++) {
+ gfx->color_att[i] = (struct anv_attachment) {
+ .surface_state.state = next_state,
+ };
next_state.offset += ss_stride;
next_state.map += ss_stride;
}
-
- assert(next_state.offset == state->attachment_states.offset +
- state->attachment_states.alloc_size);
+ gfx->depth_att = (struct anv_attachment) { };
+ gfx->stencil_att = (struct anv_attachment) { };
return VK_SUCCESS;
}
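/* Editor's note: illustrative sketch, not part of this patch. The surface
 * states above are carved out of a single allocation at a stride of
 * align(isl_dev->ss.size, isl_dev->ss.align); slot 0 holds the NULL
 * surface state and slot 1 + i holds color attachment i. For an assumed
 * 64-byte state at 64-byte alignment with three color attachments, the
 * offsets are 0, 64, 128 and 192.
 */
static inline uint32_t
example_att_state_offset(uint32_t base_offset, uint32_t ss_stride,
                         uint32_t color_att_index)
{
   return base_offset + (1 + color_att_index) * ss_stride;
}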
-VkResult
-genX(BeginCommandBuffer)(
- VkCommandBuffer commandBuffer,
- const VkCommandBufferBeginInfo* pBeginInfo)
-{
- ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
-
- /* If this is the first vkBeginCommandBuffer, we must *initialize* the
- * command buffer's state. Otherwise, we must *reset* its state. In both
- * cases we reset it.
- *
- * From the Vulkan 1.0 spec:
- *
- * If a command buffer is in the executable state and the command buffer
- * was allocated from a command pool with the
- * VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT flag set, then
- * vkBeginCommandBuffer implicitly resets the command buffer, behaving
- * as if vkResetCommandBuffer had been called with
- * VK_COMMAND_BUFFER_RESET_RELEASE_RESOURCES_BIT not set. It then puts
- * the command buffer in the recording state.
- */
- anv_cmd_buffer_reset(cmd_buffer);
-
- cmd_buffer->usage_flags = pBeginInfo->flags;
-
- /* VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT must be ignored for
- * primary level command buffers.
- *
- * From the Vulkan 1.0 spec:
- *
- * VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT specifies that a
- * secondary command buffer is considered to be entirely inside a render
- * pass. If this is a primary command buffer, then this bit is ignored.
- */
- if (cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY)
- cmd_buffer->usage_flags &= ~VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT;
-
- genX(cmd_buffer_emit_state_base_address)(cmd_buffer);
-
- /* We sometimes store vertex data in the dynamic state buffer for blorp
- * operations and our dynamic state stream may re-use data from previous
- * command buffers. In order to prevent stale cache data, we flush the VF
- * cache. We could do this on every blorp call but that's not really
- * needed as all of the data will get written by the CPU prior to the GPU
- * executing anything. The chances are fairly high that they will use
- * blorp at least once per primary command buffer so it shouldn't be
- * wasted.
- *
- * There is also a workaround on gfx8 which requires us to invalidate the
- * VF cache occasionally. It's easier if we can assume we start with a
- * fresh cache (See also genX(cmd_buffer_set_binding_for_gfx8_vb_flush).)
- */
- anv_add_pending_pipe_bits(cmd_buffer,
- ANV_PIPE_VF_CACHE_INVALIDATE_BIT,
- "new cmd buffer");
-
- /* Re-emit the aux table register in every command buffer. This way we're
- * ensured that we have the table even if this command buffer doesn't
- * initialize any images.
- */
- if (cmd_buffer->device->info.has_aux_map) {
- anv_add_pending_pipe_bits(cmd_buffer,
- ANV_PIPE_AUX_TABLE_INVALIDATE_BIT,
- "new cmd buffer with aux-tt");
- }
-
- /* We send an "Indirect State Pointers Disable" packet at
- * EndCommandBuffer, so all push contant packets are ignored during a
- * context restore. Documentation says after that command, we need to
- * emit push constants again before any rendering operation. So we
- * flag them dirty here to make sure they get emitted.
- */
- cmd_buffer->state.push_constants_dirty |= VK_SHADER_STAGE_ALL_GRAPHICS;
-
- VkResult result = VK_SUCCESS;
- if (cmd_buffer->usage_flags &
- VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT) {
- assert(pBeginInfo->pInheritanceInfo);
- ANV_FROM_HANDLE(anv_render_pass, pass,
- pBeginInfo->pInheritanceInfo->renderPass);
- struct anv_subpass *subpass =
- &pass->subpasses[pBeginInfo->pInheritanceInfo->subpass];
- ANV_FROM_HANDLE(anv_framebuffer, framebuffer,
- pBeginInfo->pInheritanceInfo->framebuffer);
-
- cmd_buffer->state.pass = pass;
- cmd_buffer->state.subpass = subpass;
-
- /* This is optional in the inheritance info. */
- cmd_buffer->state.framebuffer = framebuffer;
-
- result = genX(cmd_buffer_setup_attachments)(cmd_buffer, pass,
- framebuffer, NULL);
- if (result != VK_SUCCESS)
- return result;
-
- result = genX(cmd_buffer_alloc_att_surf_states)(cmd_buffer, pass,
- subpass);
- if (result != VK_SUCCESS)
- return result;
-
- /* Record that HiZ is enabled if we can. */
- if (cmd_buffer->state.framebuffer) {
- const struct anv_image_view * const iview =
- anv_cmd_buffer_get_depth_stencil_view(cmd_buffer);
-
- if (iview) {
- VkImageLayout layout =
- cmd_buffer->state.subpass->depth_stencil_attachment->layout;
-
- enum isl_aux_usage aux_usage =
- anv_layout_to_aux_usage(&cmd_buffer->device->info, iview->image,
- VK_IMAGE_ASPECT_DEPTH_BIT,
- VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT,
- layout);
-
- cmd_buffer->state.hiz_enabled = isl_aux_usage_has_hiz(aux_usage);
- }
- }
-
- cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_RENDER_TARGETS;
- }
-
-#if GFX_VERx10 >= 75
- if (cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY) {
- const VkCommandBufferInheritanceConditionalRenderingInfoEXT *conditional_rendering_info =
- vk_find_struct_const(pBeginInfo->pInheritanceInfo->pNext, COMMAND_BUFFER_INHERITANCE_CONDITIONAL_RENDERING_INFO_EXT);
-
- /* If secondary buffer supports conditional rendering
- * we should emit commands as if conditional rendering is enabled.
- */
- cmd_buffer->state.conditional_render_enabled =
- conditional_rendering_info && conditional_rendering_info->conditionalRenderingEnable;
- }
-#endif
-
- return result;
-}
-
-/* From the PRM, Volume 2a:
- *
- * "Indirect State Pointers Disable
- *
- * At the completion of the post-sync operation associated with this pipe
- * control packet, the indirect state pointers in the hardware are
- * considered invalid; the indirect pointers are not saved in the context.
- * If any new indirect state commands are executed in the command stream
- * while the pipe control is pending, the new indirect state commands are
- * preserved.
- *
- * [DevIVB+]: Using Invalidate State Pointer (ISP) only inhibits context
- * restoring of Push Constant (3DSTATE_CONSTANT_*) commands. Push Constant
- * commands are only considered as Indirect State Pointers. Once ISP is
- * issued in a context, SW must initialize by programming push constant
- * commands for all the shaders (at least to zero length) before attempting
- * any rendering operation for the same context."
- *
- * 3DSTATE_CONSTANT_* packets are restored during a context restore,
- * even though they point to a BO that has been already unreferenced at
- * the end of the previous batch buffer. This has been fine so far since
- * we are protected by these scratch page (every address not covered by
- * a BO should be pointing to the scratch page). But on CNL, it is
- * causing a GPU hang during context restore at the 3DSTATE_CONSTANT_*
- * instruction.
- *
- * The flag "Indirect State Pointers Disable" in PIPE_CONTROL tells the
- * hardware to ignore previous 3DSTATE_CONSTANT_* packets during a
- * context restore, so the mentioned hang doesn't happen. However,
- * software must program push constant commands for all stages prior to
- * rendering anything. So we flag them dirty in BeginCommandBuffer.
- *
- * Finally, we also make sure to stall at pixel scoreboard to make sure the
- * constants have been loaded into the EUs prior to disable the push constants
- * so that it doesn't hang a previous 3DPRIMITIVE.
- */
static void
-emit_isp_disable(struct anv_cmd_buffer *cmd_buffer)
-{
- anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
- pc.StallAtPixelScoreboard = true;
- pc.CommandStreamerStallEnable = true;
- anv_debug_dump_pc(pc);
- }
- anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
- pc.IndirectStatePointersDisable = true;
- pc.CommandStreamerStallEnable = true;
- anv_debug_dump_pc(pc);
- }
-}
-
-VkResult
-genX(EndCommandBuffer)(
- VkCommandBuffer commandBuffer)
+anv_cmd_buffer_reset_rendering(struct anv_cmd_buffer *cmd_buffer)
{
- ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+ struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
- if (anv_batch_has_error(&cmd_buffer->batch))
- return cmd_buffer->batch.status;
+ gfx->render_area = (VkRect2D) { };
+ gfx->layer_count = 0;
+ gfx->samples = 0;
- anv_measure_endcommandbuffer(cmd_buffer);
-
- /* We want every command buffer to start with the PMA fix in a known state,
- * so we disable it at the end of the command buffer.
- */
- genX(cmd_buffer_enable_pma_fix)(cmd_buffer, false);
-
- genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
-
- emit_isp_disable(cmd_buffer);
-
- anv_cmd_buffer_end_batch_buffer(cmd_buffer);
-
- return VK_SUCCESS;
-}
-
-void
-genX(CmdExecuteCommands)(
- VkCommandBuffer commandBuffer,
- uint32_t commandBufferCount,
- const VkCommandBuffer* pCmdBuffers)
-{
- ANV_FROM_HANDLE(anv_cmd_buffer, primary, commandBuffer);
-
- assert(primary->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY);
-
- if (anv_batch_has_error(&primary->batch))
- return;
-
- /* The secondary command buffers will assume that the PMA fix is disabled
- * when they begin executing. Make sure this is true.
- */
- genX(cmd_buffer_enable_pma_fix)(primary, false);
-
- /* The secondary command buffer doesn't know which textures etc. have been
- * flushed prior to their execution. Apply those flushes now.
- */
- genX(cmd_buffer_apply_pipe_flushes)(primary);
-
- for (uint32_t i = 0; i < commandBufferCount; i++) {
- ANV_FROM_HANDLE(anv_cmd_buffer, secondary, pCmdBuffers[i]);
-
- assert(secondary->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY);
- assert(!anv_batch_has_error(&secondary->batch));
-
-#if GFX_VERx10 >= 75
- if (secondary->state.conditional_render_enabled) {
- if (!primary->state.conditional_render_enabled) {
- /* Secondary buffer is constructed as if it will be executed
- * with conditional rendering, we should satisfy this dependency
- * regardless of conditional rendering being enabled in primary.
- */
- struct mi_builder b;
- mi_builder_init(&b, &primary->device->info, &primary->batch);
- mi_store(&b, mi_reg64(ANV_PREDICATE_RESULT_REG),
- mi_imm(UINT64_MAX));
- }
- }
-#endif
-
- if (secondary->usage_flags &
- VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT) {
- /* If we're continuing a render pass from the primary, we need to
- * copy the surface states for the current subpass into the storage
- * we allocated for them in BeginCommandBuffer.
- */
- struct anv_bo *ss_bo =
- primary->device->surface_state_pool.block_pool.bo;
- struct anv_state src_state = primary->state.attachment_states;
- struct anv_state dst_state = secondary->state.attachment_states;
- assert(src_state.alloc_size == dst_state.alloc_size);
-
- genX(cmd_buffer_so_memcpy)(primary,
- (struct anv_address) {
- .bo = ss_bo,
- .offset = dst_state.offset,
- },
- (struct anv_address) {
- .bo = ss_bo,
- .offset = src_state.offset,
- },
- src_state.alloc_size);
- }
-
- anv_cmd_buffer_add_secondary(primary, secondary);
-
- assert(secondary->perf_query_pool == NULL || primary->perf_query_pool == NULL ||
- secondary->perf_query_pool == primary->perf_query_pool);
- if (secondary->perf_query_pool)
- primary->perf_query_pool = secondary->perf_query_pool;
-
-#if GFX_VERx10 == 120
- if (secondary->state.depth_reg_mode != ANV_DEPTH_REG_MODE_UNKNOWN)
- primary->state.depth_reg_mode = secondary->state.depth_reg_mode;
-#endif
- }
-
- /* The secondary isn't counted in our VF cache tracking so we need to
- * invalidate the whole thing.
- */
- if (GFX_VER >= 8 && GFX_VER <= 9) {
- anv_add_pending_pipe_bits(primary,
- ANV_PIPE_CS_STALL_BIT | ANV_PIPE_VF_CACHE_INVALIDATE_BIT,
- "Secondary cmd buffer not tracked in VF cache");
- }
-
- /* The secondary may have selected a different pipeline (3D or compute) and
- * may have changed the current L3$ configuration. Reset our tracking
- * variables to invalid values to ensure that we re-emit these in the case
- * where we do any draws or compute dispatches from the primary after the
- * secondary has returned.
- */
- primary->state.current_pipeline = UINT32_MAX;
- primary->state.current_l3_config = NULL;
- primary->state.current_hash_scale = 0;
-
- /* Each of the secondary command buffers will use its own state base
- * address. We need to re-emit state base address for the primary after
- * all of the secondaries are done.
- *
- * TODO: Maybe we want to make this a dirty bit to avoid extra state base
- * address calls?
- */
- genX(cmd_buffer_emit_state_base_address)(primary);
+ gfx->color_att_count = 0;
+ gfx->depth_att = (struct anv_attachment) { };
+ gfx->stencil_att = (struct anv_attachment) { };
+ gfx->null_surface_state = ANV_STATE_NULL;
}
/**
@@ -2057,7 +1487,7 @@ genX(cmd_buffer_config_l3)(struct anv_cmd_buffer *cmd_buffer,
*/
assert(cfg == cmd_buffer->device->l3_config);
#else
- if (INTEL_DEBUG & DEBUG_L3) {
+ if (INTEL_DEBUG(DEBUG_L3)) {
mesa_logd("L3 config transition: ");
intel_dump_l3_config(cfg, stderr);
}
@@ -2066,12 +1496,10 @@ genX(cmd_buffer_config_l3)(struct anv_cmd_buffer *cmd_buffer,
* while the pipeline is completely drained and the caches are flushed,
* which involves a first PIPE_CONTROL flush which stalls the pipeline...
*/
- anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
- pc.DCFlushEnable = true;
- pc.PostSyncOperation = NoWrite;
- pc.CommandStreamerStallEnable = true;
- anv_debug_dump_pc(pc);
- }
+ genx_batch_emit_pipe_control(&cmd_buffer->batch, cmd_buffer->device->info,
+ cmd_buffer->state.current_pipeline,
+ ANV_PIPE_DATA_CACHE_FLUSH_BIT |
+ ANV_PIPE_CS_STALL_BIT);
/* ...followed by a second pipelined PIPE_CONTROL that initiates
* invalidation of the relevant caches. Note that because RO invalidation
@@ -2087,40 +1515,128 @@ genX(cmd_buffer_config_l3)(struct anv_cmd_buffer *cmd_buffer,
* already guarantee that there is no concurrent GPGPU kernel execution
* (see SKL HSD 2132585).
*/
- anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
- pc.TextureCacheInvalidationEnable = true;
- pc.ConstantCacheInvalidationEnable = true;
- pc.InstructionCacheInvalidateEnable = true;
- pc.StateCacheInvalidationEnable = true;
- pc.PostSyncOperation = NoWrite;
- anv_debug_dump_pc(pc);
- }
+ genx_batch_emit_pipe_control(&cmd_buffer->batch, cmd_buffer->device->info,
+ cmd_buffer->state.current_pipeline,
+ ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT |
+ ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT |
+ ANV_PIPE_INSTRUCTION_CACHE_INVALIDATE_BIT |
+ ANV_PIPE_STATE_CACHE_INVALIDATE_BIT);
/* Now send a third stalling flush to make sure that invalidation is
* complete when the L3 configuration registers are modified.
*/
- anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
- pc.DCFlushEnable = true;
- pc.PostSyncOperation = NoWrite;
- pc.CommandStreamerStallEnable = true;
- anv_debug_dump_pc(pc);
- }
+ genx_batch_emit_pipe_control(&cmd_buffer->batch, cmd_buffer->device->info,
+ cmd_buffer->state.current_pipeline,
+ ANV_PIPE_DATA_CACHE_FLUSH_BIT |
+ ANV_PIPE_CS_STALL_BIT);
genX(emit_l3_config)(&cmd_buffer->batch, cmd_buffer->device, cfg);
#endif /* GFX_VER >= 11 */
cmd_buffer->state.current_l3_config = cfg;
}
-void
-genX(cmd_buffer_apply_pipe_flushes)(struct anv_cmd_buffer *cmd_buffer)
+ALWAYS_INLINE void
+genX(invalidate_aux_map)(struct anv_batch *batch,
+ struct anv_device *device,
+ enum intel_engine_class engine_class,
+ enum anv_pipe_bits bits)
{
- UNUSED const struct intel_device_info *devinfo = &cmd_buffer->device->info;
- enum anv_pipe_bits bits = cmd_buffer->state.pending_pipe_bits;
+#if GFX_VER == 12
+ if ((bits & ANV_PIPE_AUX_TABLE_INVALIDATE_BIT) && device->info->has_aux_map) {
+ uint32_t register_addr = 0;
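+      /* Each engine class has its own CCS aux invalidation register. */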
+ switch (engine_class) {
+ case INTEL_ENGINE_CLASS_COMPUTE:
+ register_addr = GENX(COMPCS0_CCS_AUX_INV_num);
+ break;
+ case INTEL_ENGINE_CLASS_COPY:
+#if GFX_VERx10 >= 125
+ register_addr = GENX(BCS_CCS_AUX_INV_num);
+#endif
+ break;
+ case INTEL_ENGINE_CLASS_VIDEO:
+ register_addr = GENX(VD0_CCS_AUX_INV_num);
+ break;
+ case INTEL_ENGINE_CLASS_RENDER:
+ default:
+ register_addr = GENX(GFX_CCS_AUX_INV_num);
+ break;
+ }
- if (unlikely(cmd_buffer->device->physical->always_flush_cache))
- bits |= ANV_PIPE_FLUSH_BITS | ANV_PIPE_INVALIDATE_BITS;
- else if (bits == 0)
- return;
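+      /* Writing 1 to the selected register kicks off the invalidation. */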
+ anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_IMM), lri) {
+ lri.RegisterOffset = register_addr;
+ lri.DataDWord = 1;
+ }
+
+ /* Wa_16018063123 - emit fast color dummy blit before MI_FLUSH_DW. */
+ if (intel_needs_workaround(device->info, 16018063123) &&
+ engine_class == INTEL_ENGINE_CLASS_COPY) {
+ genX(batch_emit_fast_color_dummy_blit)(batch, device);
+ }
+
+ /* HSD 22012751911: SW Programming sequence when issuing aux invalidation:
+ *
+ * "Poll Aux Invalidation bit once the invalidation is set
+ * (Register 4208 bit 0)"
+ */
+ anv_batch_emit(batch, GENX(MI_SEMAPHORE_WAIT), sem) {
+ sem.CompareOperation = COMPARE_SAD_EQUAL_SDD;
+ sem.WaitMode = PollingMode;
+ sem.RegisterPollMode = true;
+ sem.SemaphoreDataDword = 0x0;
+ sem.SemaphoreAddress =
+ anv_address_from_u64(register_addr);
+ }
+ }
+#else
+ assert(!device->info->has_aux_map);
+#endif
+}
+
+ALWAYS_INLINE enum anv_pipe_bits
+genX(emit_apply_pipe_flushes)(struct anv_batch *batch,
+ struct anv_device *device,
+ uint32_t current_pipeline,
+ enum anv_pipe_bits bits,
+ enum anv_pipe_bits *emitted_flush_bits)
+{
+#if GFX_VER >= 12
+ /* From the TGL PRM, Volume 2a, "PIPE_CONTROL":
+ *
+ * "SW must follow below programming restrictions when programming
+ * PIPE_CONTROL command [for ComputeCS]:
+ * ...
+ * Following bits must not be set when programmed for ComputeCS:
+ * - "Render Target Cache Flush Enable", "Depth Cache Flush Enable"
+ * and "Tile Cache Flush Enable"
+ * - "Depth Stall Enable", Stall at Pixel Scoreboard and
+ * "PSD Sync Enable".
+ * - "OVR Tile 0 Flush", "TBIMR Force Batch Closure",
+ * "AMFS Flush Enable", "VF Cache Invalidation Enable" and
+ * "Global Snapshot Count Reset"."
+ *
+ * XXX: According to spec this should not be a concern for a regular
+ * RCS in GPGPU mode, but during testing it was found that at least
+ * "VF Cache Invalidation Enable" bit is ignored in such case.
+ * This can cause us to miss some important invalidations
+ * (e.g. from CmdPipelineBarriers) and have incoherent data.
+ *
+    * There is also a Wa_1606932921 "RCS is not waking up fixed function clock
+    * when specific 3d related bits are programmed in pipecontrol in
+    * compute mode", which suggests we should not use "RT Cache Flush" in
+    * GPGPU mode.
+    *
+    * The other bits are not confirmed to cause problems, but they are
+    * included here just to be safe, as they're not really relevant in
+    * GPGPU mode and having them doesn't seem to cause any regressions.
+    *
+    * So if we're currently in GPGPU mode, we hide some bits from this
+    * flush and will only flush them once we're able to. The same
+    * applies to GPGPU-only bits.
+ */
+ enum anv_pipe_bits defer_bits = bits &
+      (current_pipeline == GPGPU ? ANV_PIPE_GFX_BITS : ANV_PIPE_GPGPU_BITS);
+
+ bits &= ~defer_bits;
+#endif
/*
* From Sandybridge PRM, volume 2, "1.7.2 End-of-Pipe Synchronization":
@@ -2158,9 +1674,34 @@ genX(cmd_buffer_apply_pipe_flushes)(struct anv_cmd_buffer *cmd_buffer)
* "Driver must ensure that the engine is IDLE but ensure it doesn't
* add extra flushes in the case it knows that the engine is already
* IDLE."
+ *
+ * HSD 22012751911: SW Programming sequence when issuing aux invalidation:
+ *
+ * "Render target Cache Flush + L3 Fabric Flush + State Invalidation + CS Stall"
+ *
+ * Notice we don't set the L3 Fabric Flush here, because we have
+ * ANV_PIPE_END_OF_PIPE_SYNC_BIT which inserts a CS stall. The
+ * PIPE_CONTROL::L3 Fabric Flush documentation says :
+ *
+ * "L3 Fabric Flush will ensure all the pending transactions in the L3
+ * Fabric are flushed to global observation point. HW does implicit L3
+ * Fabric Flush on all stalling flushes (both explicit and implicit)
+ * and on PIPECONTROL having Post Sync Operation enabled."
+ *
+ * Therefore setting L3 Fabric Flush here would be redundant.
*/
- if (GFX_VER == 12 && (bits & ANV_PIPE_AUX_TABLE_INVALIDATE_BIT))
- bits |= ANV_PIPE_NEEDS_END_OF_PIPE_SYNC_BIT;
+ if (GFX_VER == 12 && (bits & ANV_PIPE_AUX_TABLE_INVALIDATE_BIT)) {
+ if (current_pipeline == GPGPU) {
+ bits |= (ANV_PIPE_NEEDS_END_OF_PIPE_SYNC_BIT |
+ ANV_PIPE_DATA_CACHE_FLUSH_BIT |
+                  (GFX_VERx10 == 125 ? ANV_PIPE_CCS_CACHE_FLUSH_BIT : 0));
+ } else if (current_pipeline == _3D) {
+ bits |= (ANV_PIPE_NEEDS_END_OF_PIPE_SYNC_BIT |
+ ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT |
+ ANV_PIPE_STATE_CACHE_INVALIDATE_BIT |
+                  (GFX_VERx10 == 125 ? ANV_PIPE_CCS_CACHE_FLUSH_BIT : 0));
+ }
+ }
/* If we're going to do an invalidate and we have a pending end-of-pipe
* sync that has yet to be resolved, we do the end-of-pipe sync now.
@@ -2169,24 +1710,12 @@ genX(cmd_buffer_apply_pipe_flushes)(struct anv_cmd_buffer *cmd_buffer)
(bits & ANV_PIPE_NEEDS_END_OF_PIPE_SYNC_BIT)) {
bits |= ANV_PIPE_END_OF_PIPE_SYNC_BIT;
bits &= ~ANV_PIPE_NEEDS_END_OF_PIPE_SYNC_BIT;
- }
- /* Wa_1409226450, Wait for EU to be idle before pipe control which
- * invalidates the instruction cache
- */
- if (GFX_VER == 12 && (bits & ANV_PIPE_INSTRUCTION_CACHE_INVALIDATE_BIT))
- bits |= ANV_PIPE_CS_STALL_BIT | ANV_PIPE_STALL_AT_SCOREBOARD_BIT;
-
- if ((GFX_VER >= 8 && GFX_VER <= 9) &&
- (bits & ANV_PIPE_CS_STALL_BIT) &&
- (bits & ANV_PIPE_VF_CACHE_INVALIDATE_BIT)) {
- /* If we are doing a VF cache invalidate AND a CS stall (it must be
- * both) then we can reset our vertex cache tracking.
- */
- memset(cmd_buffer->state.gfx.vb_dirty_ranges, 0,
- sizeof(cmd_buffer->state.gfx.vb_dirty_ranges));
- memset(&cmd_buffer->state.gfx.ib_dirty_range, 0,
- sizeof(cmd_buffer->state.gfx.ib_dirty_range));
+ if (INTEL_DEBUG(DEBUG_PIPE_CONTROL) && bits) {
+ fputs("pc: add ", stderr);
+         anv_dump_pipe_bits(ANV_PIPE_END_OF_PIPE_SYNC_BIT, stderr);
+ fprintf(stderr, "reason: Ensure flushes done before invalidate\n");
+ }
}
/* Project: SKL / Argument: LRI Post Sync Operation [23]
@@ -2197,146 +1726,86 @@ genX(cmd_buffer_apply_pipe_flushes)(struct anv_cmd_buffer *cmd_buffer)
* PIPELINE_SELECT command is set to GPGPU mode of operation)."
*
* The same text exists a few rows below for Post Sync Op.
- *
- * On Gfx12 this is Wa_1607156449.
*/
if (bits & ANV_PIPE_POST_SYNC_BIT) {
- if ((GFX_VER == 9 || (GFX_VER == 12 && devinfo->revision == 0 /* A0 */)) &&
- cmd_buffer->state.current_pipeline == GPGPU)
+ if (GFX_VER == 9 && current_pipeline == GPGPU)
bits |= ANV_PIPE_CS_STALL_BIT;
bits &= ~ANV_PIPE_POST_SYNC_BIT;
}
if (bits & (ANV_PIPE_FLUSH_BITS | ANV_PIPE_STALL_BITS |
ANV_PIPE_END_OF_PIPE_SYNC_BIT)) {
- anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pipe) {
-#if GFX_VER >= 12
- pipe.TileCacheFlushEnable = bits & ANV_PIPE_TILE_CACHE_FLUSH_BIT;
- pipe.HDCPipelineFlushEnable |= bits & ANV_PIPE_HDC_PIPELINE_FLUSH_BIT;
-#else
- /* Flushing HDC pipeline requires DC Flush on earlier HW. */
- pipe.DCFlushEnable |= bits & ANV_PIPE_HDC_PIPELINE_FLUSH_BIT;
-#endif
- pipe.DepthCacheFlushEnable = bits & ANV_PIPE_DEPTH_CACHE_FLUSH_BIT;
- pipe.DCFlushEnable |= bits & ANV_PIPE_DATA_CACHE_FLUSH_BIT;
- pipe.RenderTargetCacheFlushEnable =
- bits & ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT;
+ enum anv_pipe_bits flush_bits =
+ bits & (ANV_PIPE_FLUSH_BITS | ANV_PIPE_STALL_BITS |
+ ANV_PIPE_END_OF_PIPE_SYNC_BIT);
- /* Wa_1409600907: "PIPE_CONTROL with Depth Stall Enable bit must
- * be set with any PIPE_CONTROL with Depth Flush Enable bit set.
- */
-#if GFX_VER >= 12
- pipe.DepthStallEnable =
- pipe.DepthCacheFlushEnable || (bits & ANV_PIPE_DEPTH_STALL_BIT);
-#else
- pipe.DepthStallEnable = bits & ANV_PIPE_DEPTH_STALL_BIT;
+#if GFX_VERx10 >= 125
+ if (current_pipeline != GPGPU) {
+ if (flush_bits & ANV_PIPE_HDC_PIPELINE_FLUSH_BIT)
+ flush_bits |= ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT;
+ } else {
+ if (flush_bits & (ANV_PIPE_HDC_PIPELINE_FLUSH_BIT |
+ ANV_PIPE_DATA_CACHE_FLUSH_BIT))
+ flush_bits |= ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT;
+ }
+
+ /* BSpec 47112: PIPE_CONTROL::Untyped Data-Port Cache Flush:
+ *
+ * "'HDC Pipeline Flush' bit must be set for this bit to take
+ * effect."
+ */
+ if (flush_bits & ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT)
+ flush_bits |= ANV_PIPE_HDC_PIPELINE_FLUSH_BIT;
#endif
- pipe.CommandStreamerStallEnable = bits & ANV_PIPE_CS_STALL_BIT;
- pipe.StallAtPixelScoreboard = bits & ANV_PIPE_STALL_AT_SCOREBOARD_BIT;
+#if GFX_VER < 12
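+      /* Flushing the HDC pipeline requires a DC flush on earlier HW. */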
+ if (flush_bits & ANV_PIPE_HDC_PIPELINE_FLUSH_BIT)
+ flush_bits |= ANV_PIPE_DATA_CACHE_FLUSH_BIT;
+#endif
- /* From Sandybridge PRM, volume 2, "1.7.3.1 Writing a Value to Memory":
- *
- * "The most common action to perform upon reaching a
- * synchronization point is to write a value out to memory. An
- * immediate value (included with the synchronization command) may
- * be written."
- *
- *
- * From Broadwell PRM, volume 7, "End-of-Pipe Synchronization":
- *
- * "In case the data flushed out by the render engine is to be
- * read back in to the render engine in coherent manner, then the
- * render engine has to wait for the fence completion before
- * accessing the flushed data. This can be achieved by following
- * means on various products: PIPE_CONTROL command with CS Stall
- * and the required write caches flushed with Post-Sync-Operation
- * as Write Immediate Data.
- *
- * Example:
- * - Workload-1 (3D/GPGPU/MEDIA)
- * - PIPE_CONTROL (CS Stall, Post-Sync-Operation Write
- * Immediate Data, Required Write Cache Flush bits set)
- * - Workload-2 (Can use the data produce or output by
- * Workload-1)
- */
- if (bits & ANV_PIPE_END_OF_PIPE_SYNC_BIT) {
- pipe.CommandStreamerStallEnable = true;
- pipe.PostSyncOperation = WriteImmediateData;
- pipe.Address = cmd_buffer->device->workaround_address;
- }
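+      /* Default to no post-sync operation; an end-of-pipe sync overrides this
+       * below with an immediate data write.
+       */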
+ uint32_t sync_op = NoWrite;
+ struct anv_address addr = ANV_NULL_ADDRESS;
- /*
- * According to the Broadwell documentation, any PIPE_CONTROL with the
- * "Command Streamer Stall" bit set must also have another bit set,
- * with five different options:
- *
- * - Render Target Cache Flush
- * - Depth Cache Flush
- * - Stall at Pixel Scoreboard
- * - Post-Sync Operation
- * - Depth Stall
- * - DC Flush Enable
- *
- * I chose "Stall at Pixel Scoreboard" since that's what we use in
- * mesa and it seems to work fine. The choice is fairly arbitrary.
- */
- if (pipe.CommandStreamerStallEnable &&
- !pipe.RenderTargetCacheFlushEnable &&
- !pipe.DepthCacheFlushEnable &&
- !pipe.StallAtPixelScoreboard &&
- !pipe.PostSyncOperation &&
- !pipe.DepthStallEnable &&
- !pipe.DCFlushEnable)
- pipe.StallAtPixelScoreboard = true;
- anv_debug_dump_pc(pipe);
+ /* From Sandybridge PRM, volume 2, "1.7.3.1 Writing a Value to Memory":
+ *
+ * "The most common action to perform upon reaching a
+ * synchronization point is to write a value out to memory. An
+ * immediate value (included with the synchronization command) may
+ * be written."
+ *
+ *
+ * From Broadwell PRM, volume 7, "End-of-Pipe Synchronization":
+ *
+ * "In case the data flushed out by the render engine is to be
+ * read back in to the render engine in coherent manner, then the
+ * render engine has to wait for the fence completion before
+ * accessing the flushed data. This can be achieved by following
+ * means on various products: PIPE_CONTROL command with CS Stall
+ * and the required write caches flushed with Post-Sync-Operation
+ * as Write Immediate Data.
+ *
+ * Example:
+ * - Workload-1 (3D/GPGPU/MEDIA)
+ * - PIPE_CONTROL (CS Stall, Post-Sync-Operation Write
+ * Immediate Data, Required Write Cache Flush bits set)
+ * - Workload-2 (Can use the data produce or output by
+ * Workload-1)
+ */
+ if (flush_bits & ANV_PIPE_END_OF_PIPE_SYNC_BIT) {
+ flush_bits |= ANV_PIPE_CS_STALL_BIT;
+ sync_op = WriteImmediateData;
+ addr = device->workaround_address;
}
- /* If a render target flush was emitted, then we can toggle off the bit
- * saying that render target writes are ongoing.
- */
- if (bits & ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT)
- bits &= ~(ANV_PIPE_RENDER_TARGET_BUFFER_WRITES);
+ /* Flush PC. */
+ genx_batch_emit_pipe_control_write(batch, device->info, current_pipeline,
+ sync_op, addr, 0, flush_bits);
- if (GFX_VERx10 == 75) {
- /* Haswell needs addition work-arounds:
- *
- * From Haswell PRM, volume 2, part 1, "End-of-Pipe Synchronization":
- *
- * Option 1:
- * PIPE_CONTROL command with the CS Stall and the required write
- * caches flushed with Post-SyncOperation as Write Immediate Data
- * followed by eight dummy MI_STORE_DATA_IMM (write to scratch
- * spce) commands.
- *
- * Example:
- * - Workload-1
- * - PIPE_CONTROL (CS Stall, Post-Sync-Operation Write
- * Immediate Data, Required Write Cache Flush bits set)
- * - MI_STORE_DATA_IMM (8 times) (Dummy data, Scratch Address)
- * - Workload-2 (Can use the data produce or output by
- * Workload-1)
- *
- * Unfortunately, both the PRMs and the internal docs are a bit
- * out-of-date in this regard. What the windows driver does (and
- * this appears to actually work) is to emit a register read from the
- * memory address written by the pipe control above.
- *
- * What register we load into doesn't matter. We choose an indirect
- * rendering register because we know it always exists and it's one
- * of the first registers the command parser allows us to write. If
- * you don't have command parser support in your kernel (pre-4.2),
- * this will get turned into MI_NOOP and you won't get the
- * workaround. Unfortunately, there's just not much we can do in
- * that case. This register is perfectly safe to write since we
- * always re-load all of the indirect draw registers right before
- * 3DPRIMITIVE when needed anyway.
- */
- anv_batch_emit(&cmd_buffer->batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
- lrm.RegisterAddress = 0x243C; /* GFX7_3DPRIM_START_INSTANCE */
- lrm.MemoryAddress = cmd_buffer->device->workaround_address;
- }
- }
+ /* If the caller wants to know what flushes have been emitted,
+ * provide the bits based off the PIPE_CONTROL programmed bits.
+ */
+ if (emitted_flush_bits != NULL)
+ *emitted_flush_bits = flush_bits;
bits &= ~(ANV_PIPE_FLUSH_BITS | ANV_PIPE_STALL_BITS |
ANV_PIPE_END_OF_PIPE_SYNC_BIT);
@@ -2354,202 +1823,328 @@ genX(cmd_buffer_apply_pipe_flushes)(struct anv_cmd_buffer *cmd_buffer)
* This appears to hang Broadwell, so we restrict it to just gfx9.
*/
if (GFX_VER == 9 && (bits & ANV_PIPE_VF_CACHE_INVALIDATE_BIT))
- anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pipe);
-
- anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pipe) {
- pipe.StateCacheInvalidationEnable =
- bits & ANV_PIPE_STATE_CACHE_INVALIDATE_BIT;
- pipe.ConstantCacheInvalidationEnable =
- bits & ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT;
- pipe.VFCacheInvalidationEnable =
- bits & ANV_PIPE_VF_CACHE_INVALIDATE_BIT;
- pipe.TextureCacheInvalidationEnable =
- bits & ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT;
- pipe.InstructionCacheInvalidateEnable =
- bits & ANV_PIPE_INSTRUCTION_CACHE_INVALIDATE_BIT;
-
- /* From the SKL PRM, Vol. 2a, "PIPE_CONTROL",
- *
- * "When VF Cache Invalidate is set “Post Sync Operation” must be
- * enabled to “Write Immediate Data” or “Write PS Depth Count” or
- * “Write Timestamp”.
- */
- if (GFX_VER == 9 && pipe.VFCacheInvalidationEnable) {
- pipe.PostSyncOperation = WriteImmediateData;
- pipe.Address = cmd_buffer->device->workaround_address;
- }
- anv_debug_dump_pc(pipe);
- }
+ anv_batch_emit(batch, GENX(PIPE_CONTROL), pipe);
-#if GFX_VER == 12
- if ((bits & ANV_PIPE_AUX_TABLE_INVALIDATE_BIT) &&
- cmd_buffer->device->info.has_aux_map) {
- anv_batch_emit(&cmd_buffer->batch, GENX(MI_LOAD_REGISTER_IMM), lri) {
- lri.RegisterOffset = GENX(GFX_CCS_AUX_INV_num);
- lri.DataDWord = 1;
- }
- }
+#if GFX_VER >= 9 && GFX_VER <= 11
+ /* From the SKL PRM, Vol. 2a, "PIPE_CONTROL",
+ *
+ * "Workaround : “CS Stall” bit in PIPE_CONTROL command must be
+ * always set for GPGPU workloads when “Texture Cache
+ * Invalidation Enable” bit is set".
+ *
+ * Workaround stopped appearing in TGL PRMs.
+ */
+ if (current_pipeline == GPGPU &&
+ (bits & ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT))
+ bits |= ANV_PIPE_CS_STALL_BIT;
#endif
+ uint32_t sync_op = NoWrite;
+ struct anv_address addr = ANV_NULL_ADDRESS;
+
+ /* From the SKL PRM, Vol. 2a, "PIPE_CONTROL",
+ *
+ * "When VF Cache Invalidate is set “Post Sync Operation” must be
+ * enabled to “Write Immediate Data” or “Write PS Depth Count” or
+ * “Write Timestamp”.
+ */
+ if (GFX_VER == 9 && (bits & ANV_PIPE_VF_CACHE_INVALIDATE_BIT)) {
+ sync_op = WriteImmediateData;
+ addr = device->workaround_address;
+ }
+
+ /* Invalidate PC. */
+ genx_batch_emit_pipe_control_write(batch, device->info, current_pipeline,
+ sync_op, addr, 0, bits);
+
+ enum intel_engine_class engine_class =
+ current_pipeline == GPGPU ? INTEL_ENGINE_CLASS_COMPUTE :
+ INTEL_ENGINE_CLASS_RENDER;
+ genX(invalidate_aux_map)(batch, device, engine_class, bits);
+
bits &= ~ANV_PIPE_INVALIDATE_BITS;
}
- cmd_buffer->state.pending_pipe_bits = bits;
+#if GFX_VER >= 12
+ bits |= defer_bits;
+#endif
+
+ return bits;
}
-void genX(CmdPipelineBarrier)(
- VkCommandBuffer commandBuffer,
- VkPipelineStageFlags srcStageMask,
- VkPipelineStageFlags destStageMask,
- VkBool32 byRegion,
- uint32_t memoryBarrierCount,
- const VkMemoryBarrier* pMemoryBarriers,
- uint32_t bufferMemoryBarrierCount,
- const VkBufferMemoryBarrier* pBufferMemoryBarriers,
- uint32_t imageMemoryBarrierCount,
- const VkImageMemoryBarrier* pImageMemoryBarriers)
+ALWAYS_INLINE void
+genX(cmd_buffer_apply_pipe_flushes)(struct anv_cmd_buffer *cmd_buffer)
{
- ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
-
- /* XXX: Right now, we're really dumb and just flush whatever categories
- * the app asks for. One of these days we may make this a bit better
- * but right now that's all the hardware allows for in most areas.
+#if INTEL_NEEDS_WA_1508744258
+ /* If we're changing the state of the RHWO optimization, we need to have
+ * sb_stall+cs_stall.
*/
- VkAccessFlags src_flags = 0;
- VkAccessFlags dst_flags = 0;
-
- for (uint32_t i = 0; i < memoryBarrierCount; i++) {
- src_flags |= pMemoryBarriers[i].srcAccessMask;
- dst_flags |= pMemoryBarriers[i].dstAccessMask;
+ const bool rhwo_opt_change =
+ cmd_buffer->state.rhwo_optimization_enabled !=
+ cmd_buffer->state.pending_rhwo_optimization_enabled;
+ if (rhwo_opt_change) {
+ anv_add_pending_pipe_bits(cmd_buffer,
+ ANV_PIPE_STALL_AT_SCOREBOARD_BIT |
+ ANV_PIPE_END_OF_PIPE_SYNC_BIT,
+ "change RHWO optimization");
}
+#endif
- for (uint32_t i = 0; i < bufferMemoryBarrierCount; i++) {
- src_flags |= pBufferMemoryBarriers[i].srcAccessMask;
- dst_flags |= pBufferMemoryBarriers[i].dstAccessMask;
- }
+ enum anv_pipe_bits bits = cmd_buffer->state.pending_pipe_bits;
- for (uint32_t i = 0; i < imageMemoryBarrierCount; i++) {
- src_flags |= pImageMemoryBarriers[i].srcAccessMask;
- dst_flags |= pImageMemoryBarriers[i].dstAccessMask;
- ANV_FROM_HANDLE(anv_image, image, pImageMemoryBarriers[i].image);
- const VkImageSubresourceRange *range =
- &pImageMemoryBarriers[i].subresourceRange;
+ if (unlikely(cmd_buffer->device->physical->always_flush_cache))
+ bits |= ANV_PIPE_FLUSH_BITS | ANV_PIPE_INVALIDATE_BITS;
+ else if (bits == 0)
+ return;
- uint32_t base_layer, layer_count;
- if (image->vk.image_type == VK_IMAGE_TYPE_3D) {
- base_layer = 0;
- layer_count = anv_minify(image->vk.extent.depth, range->baseMipLevel);
- } else {
- base_layer = range->baseArrayLayer;
- layer_count = vk_image_subresource_layer_count(&image->vk, range);
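+   /* PIPE_CONTROL is not supported on the blitter/video engines, so only the
+    * aux map invalidation is handled for those queues.
+    */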
+ if (anv_cmd_buffer_is_blitter_queue(cmd_buffer) ||
+ anv_cmd_buffer_is_video_queue(cmd_buffer)) {
+ if (bits & ANV_PIPE_INVALIDATE_BITS) {
+ genX(invalidate_aux_map)(&cmd_buffer->batch, cmd_buffer->device,
+ cmd_buffer->queue_family->engine_class, bits);
+ bits &= ~ANV_PIPE_INVALIDATE_BITS;
}
- const uint32_t level_count =
- vk_image_subresource_level_count(&image->vk, range);
+ cmd_buffer->state.pending_pipe_bits = bits;
+ return;
+ }
- if (range->aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT) {
- transition_depth_buffer(cmd_buffer, image,
- base_layer, layer_count,
- pImageMemoryBarriers[i].oldLayout,
- pImageMemoryBarriers[i].newLayout,
- false /* will_full_fast_clear */);
- }
+ const bool trace_flush =
+ (bits & (ANV_PIPE_FLUSH_BITS |
+ ANV_PIPE_STALL_BITS |
+ ANV_PIPE_INVALIDATE_BITS |
+ ANV_PIPE_END_OF_PIPE_SYNC_BIT)) != 0;
+ if (trace_flush)
+ trace_intel_begin_stall(&cmd_buffer->trace);
- if (range->aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT) {
- transition_stencil_buffer(cmd_buffer, image,
- range->baseMipLevel, level_count,
- base_layer, layer_count,
- pImageMemoryBarriers[i].oldLayout,
- pImageMemoryBarriers[i].newLayout,
- false /* will_full_fast_clear */);
- }
+ if (GFX_VER == 9 &&
+ (bits & ANV_PIPE_CS_STALL_BIT) &&
+ (bits & ANV_PIPE_VF_CACHE_INVALIDATE_BIT)) {
+ /* If we are doing a VF cache invalidate AND a CS stall (it must be
+ * both) then we can reset our vertex cache tracking.
+ */
+ memset(cmd_buffer->state.gfx.vb_dirty_ranges, 0,
+ sizeof(cmd_buffer->state.gfx.vb_dirty_ranges));
+ memset(&cmd_buffer->state.gfx.ib_dirty_range, 0,
+ sizeof(cmd_buffer->state.gfx.ib_dirty_range));
+ }
- if (range->aspectMask & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV) {
- VkImageAspectFlags color_aspects =
- vk_image_expand_aspect_mask(&image->vk, range->aspectMask);
- anv_foreach_image_aspect_bit(aspect_bit, image, color_aspects) {
- transition_color_buffer(cmd_buffer, image, 1UL << aspect_bit,
- range->baseMipLevel, level_count,
- base_layer, layer_count,
- pImageMemoryBarriers[i].oldLayout,
- pImageMemoryBarriers[i].newLayout,
- pImageMemoryBarriers[i].srcQueueFamilyIndex,
- pImageMemoryBarriers[i].dstQueueFamilyIndex,
- false /* will_full_fast_clear */);
- }
+
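+   /* Keep track of the flushes that actually get emitted so the pending
+    * query bits can be updated accordingly.
+    */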
+ enum anv_pipe_bits emitted_bits = 0;
+ cmd_buffer->state.pending_pipe_bits =
+ genX(emit_apply_pipe_flushes)(&cmd_buffer->batch,
+ cmd_buffer->device,
+ cmd_buffer->state.current_pipeline,
+ bits,
+ &emitted_bits);
+ anv_cmd_buffer_update_pending_query_bits(cmd_buffer, emitted_bits);
+
+#if INTEL_NEEDS_WA_1508744258
+ if (rhwo_opt_change) {
+ anv_batch_write_reg(&cmd_buffer->batch, GENX(COMMON_SLICE_CHICKEN1), c1) {
+ c1.RCCRHWOOptimizationDisable =
+ !cmd_buffer->state.pending_rhwo_optimization_enabled;
+ c1.RCCRHWOOptimizationDisableMask = true;
}
+ cmd_buffer->state.rhwo_optimization_enabled =
+ cmd_buffer->state.pending_rhwo_optimization_enabled;
}
+#endif
- anv_add_pending_pipe_bits(cmd_buffer,
- anv_pipe_flush_bits_for_access_flags(cmd_buffer->device, src_flags) |
- anv_pipe_invalidate_bits_for_access_flags(cmd_buffer->device, dst_flags),
- "pipe barrier");
+ if (trace_flush) {
+ trace_intel_end_stall(&cmd_buffer->trace,
+ bits & ~cmd_buffer->state.pending_pipe_bits,
+ anv_pipe_flush_bit_to_ds_stall_flag,
+ cmd_buffer->state.pc_reasons[0],
+ cmd_buffer->state.pc_reasons[1],
+ cmd_buffer->state.pc_reasons[2],
+ cmd_buffer->state.pc_reasons[3]);
+ cmd_buffer->state.pc_reasons[0] = NULL;
+ cmd_buffer->state.pc_reasons[1] = NULL;
+ cmd_buffer->state.pc_reasons[2] = NULL;
+ cmd_buffer->state.pc_reasons[3] = NULL;
+ cmd_buffer->state.pc_reasons_count = 0;
+ }
}
-static void
-cmd_buffer_alloc_push_constants(struct anv_cmd_buffer *cmd_buffer)
+static inline struct anv_state
+emit_dynamic_buffer_binding_table_entry(struct anv_cmd_buffer *cmd_buffer,
+ struct anv_cmd_pipeline_state *pipe_state,
+ struct anv_pipeline_binding *binding,
+ const struct anv_descriptor *desc)
{
- VkShaderStageFlags stages =
- cmd_buffer->state.gfx.pipeline->active_stages;
+ if (!desc->buffer)
+ return anv_null_surface_state_for_binding_table(cmd_buffer->device);
+
+ /* Compute the offset within the buffer */
+ uint32_t dynamic_offset =
+ pipe_state->dynamic_offsets[
+ binding->set].offsets[binding->dynamic_offset_index];
+ uint64_t offset = desc->offset + dynamic_offset;
+ /* Clamp to the buffer size */
+ offset = MIN2(offset, desc->buffer->vk.size);
+ /* Clamp the range to the buffer size */
+ uint32_t range = MIN2(desc->range, desc->buffer->vk.size - offset);
+
+ /* Align the range for consistency */
+ if (desc->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC)
+ range = align(range, ANV_UBO_ALIGNMENT);
+
+ struct anv_address address =
+ anv_address_add(desc->buffer->address, offset);
+
+ struct anv_state surface_state =
+ anv_cmd_buffer_alloc_surface_states(cmd_buffer, 1);
+ if (surface_state.map == NULL)
+ return ANV_STATE_NULL;
+
+ enum isl_format format =
+ anv_isl_format_for_descriptor_type(cmd_buffer->device,
+ desc->type);
+
+ isl_surf_usage_flags_t usage =
+ anv_isl_usage_for_descriptor_type(desc->type);
+
+ anv_fill_buffer_surface_state(cmd_buffer->device,
+ surface_state.map,
+ format, ISL_SWIZZLE_IDENTITY,
+ usage, address, range, 1);
+
+ return surface_state;
+}
+
+static uint32_t
+emit_indirect_descriptor_binding_table_entry(struct anv_cmd_buffer *cmd_buffer,
+ struct anv_cmd_pipeline_state *pipe_state,
+ struct anv_pipeline_binding *binding,
+ const struct anv_descriptor *desc)
+{
+ struct anv_device *device = cmd_buffer->device;
+ struct anv_state surface_state;
- /* In order to avoid thrash, we assume that vertex and fragment stages
- * always exist. In the rare case where one is missing *and* the other
- * uses push concstants, this may be suboptimal. However, avoiding stalls
- * seems more important.
+ /* Relative offset in the STATE_BASE_ADDRESS::SurfaceStateBaseAddress heap.
+    * Depending on where the descriptor surface state is allocated, the
+    * state can come from either device->internal_surface_state_pool or
+ * device->bindless_surface_state_pool.
*/
- stages |= VK_SHADER_STAGE_FRAGMENT_BIT | VK_SHADER_STAGE_VERTEX_BIT;
+ switch (desc->type) {
+ case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER:
+ case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE:
+ case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT: {
+ if (desc->image_view) {
+ const struct anv_surface_state *sstate =
+ anv_image_view_texture_surface_state(desc->image_view,
+ binding->plane,
+ desc->layout);
+ surface_state = desc->image_view->use_surface_state_stream ?
+ sstate->state :
+ anv_bindless_state_for_binding_table(device, sstate->state);
+ assert(surface_state.alloc_size);
+ } else {
+ surface_state = anv_null_surface_state_for_binding_table(device);
+ }
+ break;
+ }
- if (stages == cmd_buffer->state.gfx.push_constant_stages)
- return;
+ case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE: {
+ if (desc->image_view) {
+ const struct anv_surface_state *sstate =
+ anv_image_view_storage_surface_state(desc->image_view);
+ surface_state = desc->image_view->use_surface_state_stream ?
+ sstate->state :
+ anv_bindless_state_for_binding_table(device, sstate->state);
+ assert(surface_state.alloc_size);
+ } else {
+ surface_state =
+ anv_null_surface_state_for_binding_table(device);
+ }
+ break;
+ }
-#if GFX_VER >= 8
- const unsigned push_constant_kb = 32;
-#elif GFX_VERx10 == 75
- const unsigned push_constant_kb = cmd_buffer->device->info.gt == 3 ? 32 : 16;
-#else
- const unsigned push_constant_kb = 16;
-#endif
+ case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER:
+ case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER:
+ if (desc->set_buffer_view) {
+ surface_state = desc->set_buffer_view->general.state;
+ assert(surface_state.alloc_size);
+ } else {
+ surface_state = anv_null_surface_state_for_binding_table(device);
+ }
+ break;
+
+ case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER:
+ if (desc->buffer_view) {
+ surface_state = anv_bindless_state_for_binding_table(
+ device,
+ desc->buffer_view->general.state);
+ assert(surface_state.alloc_size);
+ } else {
+ surface_state = anv_null_surface_state_for_binding_table(device);
+ }
+ break;
- const unsigned num_stages =
- util_bitcount(stages & VK_SHADER_STAGE_ALL_GRAPHICS);
- unsigned size_per_stage = push_constant_kb / num_stages;
+ case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC:
+ case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC:
+ surface_state =
+ emit_dynamic_buffer_binding_table_entry(cmd_buffer, pipe_state,
+ binding, desc);
+ break;
- /* Broadwell+ and Haswell gt3 require that the push constant sizes be in
- * units of 2KB. Incidentally, these are the same platforms that have
- * 32KB worth of push constant space.
- */
- if (push_constant_kb == 32)
- size_per_stage &= ~1u;
-
- uint32_t kb_used = 0;
- for (int i = MESA_SHADER_VERTEX; i < MESA_SHADER_FRAGMENT; i++) {
- unsigned push_size = (stages & (1 << i)) ? size_per_stage : 0;
- anv_batch_emit(&cmd_buffer->batch,
- GENX(3DSTATE_PUSH_CONSTANT_ALLOC_VS), alloc) {
- alloc._3DCommandSubOpcode = 18 + i;
- alloc.ConstantBufferOffset = (push_size > 0) ? kb_used : 0;
- alloc.ConstantBufferSize = push_size;
+ case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER:
+ if (desc->buffer_view) {
+ surface_state = anv_bindless_state_for_binding_table(
+ device, desc->buffer_view->storage.state);
+ assert(surface_state.alloc_size);
+ } else {
+ surface_state = anv_null_surface_state_for_binding_table(device);
}
- kb_used += push_size;
- }
+ break;
- anv_batch_emit(&cmd_buffer->batch,
- GENX(3DSTATE_PUSH_CONSTANT_ALLOC_PS), alloc) {
- alloc.ConstantBufferOffset = kb_used;
- alloc.ConstantBufferSize = push_constant_kb - kb_used;
+ default:
+ unreachable("Invalid descriptor type");
}
- cmd_buffer->state.gfx.push_constant_stages = stages;
+ return surface_state.offset;
+}
+
+static uint32_t
+emit_direct_descriptor_binding_table_entry(struct anv_cmd_buffer *cmd_buffer,
+ struct anv_cmd_pipeline_state *pipe_state,
+ const struct anv_descriptor_set *set,
+ struct anv_pipeline_binding *binding,
+ const struct anv_descriptor *desc)
+{
+ uint32_t desc_offset;
- /* From the BDW PRM for 3DSTATE_PUSH_CONSTANT_ALLOC_VS:
- *
- * "The 3DSTATE_CONSTANT_VS must be reprogrammed prior to
- * the next 3DPRIMITIVE command after programming the
- * 3DSTATE_PUSH_CONSTANT_ALLOC_VS"
- *
- * Since 3DSTATE_PUSH_CONSTANT_ALLOC_VS is programmed as part of
- * pipeline setup, we need to dirty push constants.
+ /* Relative offset in the STATE_BASE_ADDRESS::SurfaceStateBaseAddress heap.
+    * Depending on where the descriptor surface state is allocated, the
+    * state can come from either device->internal_surface_state_pool or
+ * device->bindless_surface_state_pool.
*/
- cmd_buffer->state.push_constants_dirty |= VK_SHADER_STAGE_ALL_GRAPHICS;
+ switch (desc->type) {
+ case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER:
+ case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE:
+ case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT:
+ case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE:
+ case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER:
+ case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER:
+ case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER:
+ case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER:
+ desc_offset = set->desc_offset + binding->set_offset;
+ break;
+
+ case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC:
+ case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC: {
+ struct anv_state state =
+ emit_dynamic_buffer_binding_table_entry(cmd_buffer, pipe_state,
+ binding, desc);
+ desc_offset = state.offset;
+ break;
+ }
+
+ default:
+ unreachable("Invalid descriptor type");
+ }
+
+ return desc_offset;
}
static VkResult
@@ -2558,7 +2153,6 @@ emit_binding_table(struct anv_cmd_buffer *cmd_buffer,
struct anv_shader_bin *shader,
struct anv_state *bt_state)
{
- struct anv_subpass *subpass = cmd_buffer->state.subpass;
uint32_t state_offset;
struct anv_pipeline_bind_map *map = &shader->bind_map;
@@ -2575,13 +2169,6 @@ emit_binding_table(struct anv_cmd_buffer *cmd_buffer,
if (bt_state->map == NULL)
return VK_ERROR_OUT_OF_DEVICE_MEMORY;
- /* We only need to emit relocs if we're not using softpin. If we are using
- * softpin then we always keep all user-allocated memory objects resident.
- */
- const bool need_client_mem_relocs =
- !anv_use_softpin(cmd_buffer->device->physical);
- struct anv_push_constants *push = &pipe_state->push_constants;
-
for (uint32_t s = 0; s < map->surface_count; s++) {
struct anv_pipeline_binding *binding = &map->surface_to_descriptor[s];
@@ -2595,90 +2182,66 @@ emit_binding_table(struct anv_cmd_buffer *cmd_buffer,
case ANV_DESCRIPTOR_SET_COLOR_ATTACHMENTS:
/* Color attachment binding */
assert(shader->stage == MESA_SHADER_FRAGMENT);
- if (binding->index < subpass->color_count) {
- const unsigned att =
- subpass->color_attachments[binding->index].attachment;
-
- /* From the Vulkan 1.0.46 spec:
- *
- * "If any color or depth/stencil attachments are
- * VK_ATTACHMENT_UNUSED, then no writes occur for those
- * attachments."
- */
- if (att == VK_ATTACHMENT_UNUSED) {
- surface_state = cmd_buffer->state.null_surface_state;
- } else {
- surface_state = cmd_buffer->state.attachments[att].color.state;
- }
+ if (binding->index < cmd_buffer->state.gfx.color_att_count) {
+ const struct anv_attachment *att =
+ &cmd_buffer->state.gfx.color_att[binding->index];
+ surface_state = att->surface_state.state;
} else {
- surface_state = cmd_buffer->state.null_surface_state;
+ surface_state = cmd_buffer->state.gfx.null_surface_state;
}
-
- assert(surface_state.map);
- bt_map[s] = surface_state.offset + state_offset;
- break;
-
- case ANV_DESCRIPTOR_SET_SHADER_CONSTANTS: {
- struct anv_state surface_state =
- anv_cmd_buffer_alloc_surface_state(cmd_buffer);
-
- struct anv_address constant_data = {
- .bo = cmd_buffer->device->instruction_state_pool.block_pool.bo,
- .offset = shader->kernel.offset +
- shader->prog_data->const_data_offset,
- };
- unsigned constant_data_size = shader->prog_data->const_data_size;
-
- const enum isl_format format =
- anv_isl_format_for_descriptor_type(cmd_buffer->device,
- VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER);
- anv_fill_buffer_surface_state(cmd_buffer->device,
- surface_state, format,
- ISL_SURF_USAGE_CONSTANT_BUFFER_BIT,
- constant_data, constant_data_size, 1);
-
assert(surface_state.map);
bt_map[s] = surface_state.offset + state_offset;
- add_surface_reloc(cmd_buffer, surface_state, constant_data);
break;
- }
case ANV_DESCRIPTOR_SET_NUM_WORK_GROUPS: {
/* This is always the first binding for compute shaders */
assert(shader->stage == MESA_SHADER_COMPUTE && s == 0);
struct anv_state surface_state =
- anv_cmd_buffer_alloc_surface_state(cmd_buffer);
+ anv_cmd_buffer_alloc_surface_states(cmd_buffer, 1);
+ if (surface_state.map == NULL)
+ return VK_ERROR_OUT_OF_DEVICE_MEMORY;
const enum isl_format format =
anv_isl_format_for_descriptor_type(cmd_buffer->device,
VK_DESCRIPTOR_TYPE_STORAGE_BUFFER);
- anv_fill_buffer_surface_state(cmd_buffer->device, surface_state,
- format,
+ anv_fill_buffer_surface_state(cmd_buffer->device, surface_state.map,
+ format, ISL_SWIZZLE_IDENTITY,
ISL_SURF_USAGE_CONSTANT_BUFFER_BIT,
cmd_buffer->state.compute.num_workgroups,
12, 1);
assert(surface_state.map);
bt_map[s] = surface_state.offset + state_offset;
- if (need_client_mem_relocs) {
- add_surface_reloc(cmd_buffer, surface_state,
- cmd_buffer->state.compute.num_workgroups);
- }
break;
}
case ANV_DESCRIPTOR_SET_DESCRIPTORS: {
+ struct anv_descriptor_set *set =
+ pipe_state->descriptors[binding->index];
+
+ /* If the shader doesn't access the set buffer, just put the null
+ * surface.
+ */
+ if (set->is_push && !shader->push_desc_info.used_set_buffer) {
+ bt_map[s] = 0;
+ break;
+ }
+
/* This is a descriptor set buffer so the set index is actually
* given by binding->binding. (Yes, that's confusing.)
*/
- struct anv_descriptor_set *set =
- pipe_state->descriptors[binding->index];
- assert(set->desc_mem.alloc_size);
+ assert(set->desc_surface_mem.alloc_size);
assert(set->desc_surface_state.alloc_size);
bt_map[s] = set->desc_surface_state.offset + state_offset;
- add_surface_reloc(cmd_buffer, set->desc_surface_state,
- anv_descriptor_set_address(set));
+ add_surface_reloc(cmd_buffer, anv_descriptor_set_address(set));
+ break;
+ }
+
+ case ANV_DESCRIPTOR_SET_DESCRIPTORS_BUFFER: {
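+         /* Surface state used to access the contents of the descriptor
+          * buffer.
+          */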
+ assert(pipe_state->descriptor_buffers[binding->index].state.alloc_size);
+ bt_map[s] = pipe_state->descriptor_buffers[binding->index].state.offset +
+ state_offset;
break;
}
@@ -2686,6 +2249,7 @@ emit_binding_table(struct anv_cmd_buffer *cmd_buffer,
assert(binding->set < MAX_SETS);
const struct anv_descriptor_set *set =
pipe_state->descriptors[binding->set];
+
if (binding->index >= set->descriptor_count) {
/* From the Vulkan spec section entitled "DescriptorSet and
* Binding Assignment":
@@ -2702,162 +2266,45 @@ emit_binding_table(struct anv_cmd_buffer *cmd_buffer,
assert(binding->index < set->layout->descriptor_count);
continue;
}
- const struct anv_descriptor *desc = &set->descriptors[binding->index];
- switch (desc->type) {
- case VK_DESCRIPTOR_TYPE_SAMPLER:
- /* Nothing for us to do here */
- continue;
+      /* For push descriptors, if the binding is fully promoted to push
+       * constants, just reference the null surface in the binding table.
+       * It's unused and we didn't allocate/pack a surface state for it.
+ */
+ if (set->is_push) {
+ uint32_t desc_idx = set->layout->binding[binding->binding].descriptor_index;
+ assert(desc_idx < MAX_PUSH_DESCRIPTORS);
- case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER:
- case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE: {
- if (desc->image_view) {
- struct anv_surface_state sstate =
- (desc->layout == VK_IMAGE_LAYOUT_GENERAL) ?
- desc->image_view->planes[binding->plane].general_sampler_surface_state :
- desc->image_view->planes[binding->plane].optimal_sampler_surface_state;
- surface_state = sstate.state;
- assert(surface_state.alloc_size);
- if (need_client_mem_relocs)
- add_surface_state_relocs(cmd_buffer, sstate);
- } else {
- surface_state = cmd_buffer->device->null_surface_state;
+ if (shader->push_desc_info.fully_promoted_ubo_descriptors & BITFIELD_BIT(desc_idx)) {
+ surface_state =
+ anv_null_surface_state_for_binding_table(cmd_buffer->device);
+ break;
}
- break;
}
- case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT:
- assert(shader->stage == MESA_SHADER_FRAGMENT);
- assert(desc->image_view != NULL);
- if ((desc->image_view->vk.aspects & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV) == 0) {
- /* For depth and stencil input attachments, we treat it like any
- * old texture that a user may have bound.
- */
- assert(desc->image_view->n_planes == 1);
- struct anv_surface_state sstate =
- (desc->layout == VK_IMAGE_LAYOUT_GENERAL) ?
- desc->image_view->planes[0].general_sampler_surface_state :
- desc->image_view->planes[0].optimal_sampler_surface_state;
- surface_state = sstate.state;
- assert(surface_state.alloc_size);
- if (need_client_mem_relocs)
- add_surface_state_relocs(cmd_buffer, sstate);
- } else {
- /* For color input attachments, we create the surface state at
- * vkBeginRenderPass time so that we can include aux and clear
- * color information.
- */
- assert(binding->input_attachment_index < subpass->input_count);
- const unsigned subpass_att = binding->input_attachment_index;
- const unsigned att = subpass->input_attachments[subpass_att].attachment;
- surface_state = cmd_buffer->state.attachments[att].input.state;
- }
- break;
- case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE: {
- if (desc->image_view) {
- struct anv_surface_state sstate = (binding->write_only)
- ? desc->image_view->planes[binding->plane].writeonly_storage_surface_state
- : desc->image_view->planes[binding->plane].storage_surface_state;
- surface_state = sstate.state;
- assert(surface_state.alloc_size);
- if (surface_state.offset == 0) {
- mesa_loge("Bound a image to a descriptor where the "
- "descriptor does not have NonReadable "
- "set and the image does not have a "
- "corresponding SPIR-V format enum.");
- vk_debug_report(&cmd_buffer->device->physical->instance->vk,
- VK_DEBUG_REPORT_ERROR_BIT_EXT,
- &desc->image_view->vk.base,
- __LINE__, 0, "anv",
- "Bound a image to a descriptor where the "
- "descriptor does not have NonReadable "
- "set and the image does not have a "
- "corresponding SPIR-V format enum.");
- }
- if (surface_state.offset && need_client_mem_relocs)
- add_surface_state_relocs(cmd_buffer, sstate);
- } else {
- surface_state = cmd_buffer->device->null_surface_state;
- }
- break;
+ const struct anv_descriptor *desc = &set->descriptors[binding->index];
+ if (desc->type == VK_DESCRIPTOR_TYPE_ACCELERATION_STRUCTURE_KHR ||
+ desc->type == VK_DESCRIPTOR_TYPE_SAMPLER) {
+ /* Nothing for us to do here */
+ continue;
}
- case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER:
- case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER:
- case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER:
- if (desc->buffer_view) {
- surface_state = desc->buffer_view->surface_state;
- assert(surface_state.alloc_size);
- if (need_client_mem_relocs) {
- add_surface_reloc(cmd_buffer, surface_state,
- desc->buffer_view->address);
- }
- } else {
- surface_state = cmd_buffer->device->null_surface_state;
- }
- break;
-
- case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC:
- case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC: {
- if (desc->buffer) {
- /* Compute the offset within the buffer */
- uint32_t dynamic_offset =
- push->dynamic_offsets[binding->dynamic_offset_index];
- uint64_t offset = desc->offset + dynamic_offset;
- /* Clamp to the buffer size */
- offset = MIN2(offset, desc->buffer->size);
- /* Clamp the range to the buffer size */
- uint32_t range = MIN2(desc->range, desc->buffer->size - offset);
-
- /* Align the range for consistency */
- if (desc->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC)
- range = align_u32(range, ANV_UBO_ALIGNMENT);
-
- struct anv_address address =
- anv_address_add(desc->buffer->address, offset);
-
- surface_state =
- anv_state_stream_alloc(&cmd_buffer->surface_state_stream, 64, 64);
- enum isl_format format =
- anv_isl_format_for_descriptor_type(cmd_buffer->device,
- desc->type);
-
- isl_surf_usage_flags_t usage =
- desc->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC ?
- ISL_SURF_USAGE_CONSTANT_BUFFER_BIT :
- ISL_SURF_USAGE_STORAGE_BIT;
-
- anv_fill_buffer_surface_state(cmd_buffer->device, surface_state,
- format, usage, address, range, 1);
- if (need_client_mem_relocs)
- add_surface_reloc(cmd_buffer, surface_state, address);
- } else {
- surface_state = cmd_buffer->device->null_surface_state;
- }
- break;
+ const struct anv_pipeline *pipeline = pipe_state->pipeline;
+ uint32_t surface_state_offset;
+ if (pipeline->layout.type == ANV_PIPELINE_DESCRIPTOR_SET_LAYOUT_TYPE_INDIRECT) {
+ surface_state_offset =
+ emit_indirect_descriptor_binding_table_entry(cmd_buffer,
+ pipe_state,
+ binding, desc);
+ } else {
+ assert(pipeline->layout.type == ANV_PIPELINE_DESCRIPTOR_SET_LAYOUT_TYPE_DIRECT ||
+ pipeline->layout.type == ANV_PIPELINE_DESCRIPTOR_SET_LAYOUT_TYPE_BUFFER);
+ surface_state_offset =
+ emit_direct_descriptor_binding_table_entry(cmd_buffer, pipe_state,
+ set, binding, desc);
}
- case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER:
- if (desc->buffer_view) {
- surface_state = (binding->write_only)
- ? desc->buffer_view->writeonly_storage_surface_state
- : desc->buffer_view->storage_surface_state;
- assert(surface_state.alloc_size);
- if (need_client_mem_relocs) {
- add_surface_reloc(cmd_buffer, surface_state,
- desc->buffer_view->address);
- }
- } else {
- surface_state = cmd_buffer->device->null_surface_state;
- }
- break;
-
- default:
- assert(!"Invalid descriptor type");
- continue;
- }
- assert(surface_state.map);
- bt_map[s] = surface_state.offset + state_offset;
+ bt_map[s] = surface_state_offset + state_offset;
break;
}
}
@@ -2902,18 +2349,22 @@ emit_samplers(struct anv_cmd_buffer *cmd_buffer,
continue;
memcpy(state->map + (s * 16),
- sampler->state[binding->plane], sizeof(sampler->state[0]));
+ cmd_buffer->state.current_db_mode ==
+ ANV_CMD_DESCRIPTOR_BUFFER_MODE_BUFFER ?
+ sampler->db_state[binding->plane] :
+ sampler->state[binding->plane],
+ sizeof(sampler->state[0]));
}
return VK_SUCCESS;
}
-static uint32_t
-flush_descriptor_sets(struct anv_cmd_buffer *cmd_buffer,
- struct anv_cmd_pipeline_state *pipe_state,
- const VkShaderStageFlags dirty,
- struct anv_shader_bin **shaders,
- uint32_t num_shaders)
+uint32_t
+genX(cmd_buffer_flush_descriptor_sets)(struct anv_cmd_buffer *cmd_buffer,
+ struct anv_cmd_pipeline_state *pipe_state,
+ const VkShaderStageFlags dirty,
+ struct anv_shader_bin **shaders,
+ uint32_t num_shaders)
{
VkShaderStageFlags flushed = 0;
@@ -2949,10 +2400,10 @@ flush_descriptor_sets(struct anv_cmd_buffer *cmd_buffer,
if (result != VK_SUCCESS)
return 0;
- /* Re-emit state base addresses so we get the new surface state base
+ /* Re-emit the BT base address so we get the new surface state base
* address before we start emitting binding tables etc.
*/
- genX(cmd_buffer_emit_state_base_address)(cmd_buffer);
+ genX(cmd_buffer_emit_bt_pool_base_address)(cmd_buffer);
/* Re-emit all active binding tables */
flushed = 0;
@@ -2983,2403 +2434,1789 @@ flush_descriptor_sets(struct anv_cmd_buffer *cmd_buffer,
return flushed;
}
-static void
-cmd_buffer_emit_descriptor_pointers(struct anv_cmd_buffer *cmd_buffer,
- uint32_t stages)
+/* This function generates the surface state used to read the content of the
+ * descriptor buffer.
+ */
+void
+genX(cmd_buffer_emit_push_descriptor_buffer_surface)(struct anv_cmd_buffer *cmd_buffer,
+ struct anv_descriptor_set *set)
{
- static const uint32_t sampler_state_opcodes[] = {
- [MESA_SHADER_VERTEX] = 43,
- [MESA_SHADER_TESS_CTRL] = 44, /* HS */
- [MESA_SHADER_TESS_EVAL] = 45, /* DS */
- [MESA_SHADER_GEOMETRY] = 46,
- [MESA_SHADER_FRAGMENT] = 47,
- [MESA_SHADER_COMPUTE] = 0,
- };
+ assert(set->desc_surface_state.map == NULL);
- static const uint32_t binding_table_opcodes[] = {
- [MESA_SHADER_VERTEX] = 38,
- [MESA_SHADER_TESS_CTRL] = 39,
- [MESA_SHADER_TESS_EVAL] = 40,
- [MESA_SHADER_GEOMETRY] = 41,
- [MESA_SHADER_FRAGMENT] = 42,
- [MESA_SHADER_COMPUTE] = 0,
- };
-
- anv_foreach_stage(s, stages) {
- assert(s < ARRAY_SIZE(binding_table_opcodes));
- assert(binding_table_opcodes[s] > 0);
+ struct anv_descriptor_set_layout *layout = set->layout;
+ enum isl_format format =
+ anv_isl_format_for_descriptor_type(cmd_buffer->device,
+ VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER);
- if (cmd_buffer->state.samplers[s].alloc_size > 0) {
- anv_batch_emit(&cmd_buffer->batch,
- GENX(3DSTATE_SAMPLER_STATE_POINTERS_VS), ssp) {
- ssp._3DCommandSubOpcode = sampler_state_opcodes[s];
- ssp.PointertoVSSamplerState = cmd_buffer->state.samplers[s].offset;
- }
- }
+ set->desc_surface_state =
+ anv_cmd_buffer_alloc_surface_states(cmd_buffer, 1);
+ if (set->desc_surface_state.map == NULL)
+ return;
+ anv_fill_buffer_surface_state(cmd_buffer->device,
+ set->desc_surface_state.map,
+ format, ISL_SWIZZLE_IDENTITY,
+ ISL_SURF_USAGE_CONSTANT_BUFFER_BIT,
+ set->desc_surface_addr,
+ layout->descriptor_buffer_surface_size, 1);
+}
- /* Always emit binding table pointers if we're asked to, since on SKL
- * this is what flushes push constants. */
- anv_batch_emit(&cmd_buffer->batch,
- GENX(3DSTATE_BINDING_TABLE_POINTERS_VS), btp) {
- btp._3DCommandSubOpcode = binding_table_opcodes[s];
- btp.PointertoVSBindingTable = cmd_buffer->state.binding_tables[s].offset;
+/* This function generates surface states used by a pipeline for push
+ * descriptors. This is delayed to the draw/dispatch time to avoid allocation
+ * and surface state generation when a pipeline is not going to use the
+ * binding table to access any push descriptor data.
+ */
+void
+genX(cmd_buffer_emit_push_descriptor_surfaces)(struct anv_cmd_buffer *cmd_buffer,
+ struct anv_descriptor_set *set)
+{
+ while (set->generate_surface_states) {
+ int desc_idx = u_bit_scan(&set->generate_surface_states);
+ struct anv_descriptor *desc = &set->descriptors[desc_idx];
+ struct anv_buffer_view *bview = desc->set_buffer_view;
+
+ if (bview != NULL && bview->general.state.map == NULL) {
+ bview->general.state =
+ anv_cmd_buffer_alloc_surface_states(cmd_buffer, 1);
+ if (bview->general.state.map == NULL)
+ return;
+ anv_descriptor_write_surface_state(cmd_buffer->device, desc,
+ bview->general.state);
}
}
}
-static struct anv_address
-get_push_range_address(struct anv_cmd_buffer *cmd_buffer,
- const struct anv_shader_bin *shader,
- const struct anv_push_range *range)
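+/* Convenience wrapper around genX(batch_emit_pipe_control_write) for the
+ * common case of no post-sync write.
+ */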
+ALWAYS_INLINE void
+genX(batch_emit_pipe_control)(struct anv_batch *batch,
+ const struct intel_device_info *devinfo,
+ uint32_t current_pipeline,
+ enum anv_pipe_bits bits,
+ const char *reason)
{
- struct anv_cmd_graphics_state *gfx_state = &cmd_buffer->state.gfx;
- switch (range->set) {
- case ANV_DESCRIPTOR_SET_DESCRIPTORS: {
- /* This is a descriptor set buffer so the set index is
- * actually given by binding->binding. (Yes, that's
- * confusing.)
- */
- struct anv_descriptor_set *set =
- gfx_state->base.descriptors[range->index];
- return anv_descriptor_set_address(set);
- }
+ genX(batch_emit_pipe_control_write)(batch,
+ devinfo,
+ current_pipeline,
+ NoWrite,
+ ANV_NULL_ADDRESS,
+ 0,
+ bits,
+ reason);
+}
- case ANV_DESCRIPTOR_SET_PUSH_CONSTANTS: {
- if (gfx_state->base.push_constants_state.alloc_size == 0) {
- gfx_state->base.push_constants_state =
- anv_cmd_buffer_gfx_push_constants(cmd_buffer);
- }
- return (struct anv_address) {
- .bo = cmd_buffer->device->dynamic_state_pool.block_pool.bo,
- .offset = gfx_state->base.push_constants_state.offset,
- };
- }
+ALWAYS_INLINE void
+genX(batch_emit_pipe_control_write)(struct anv_batch *batch,
+ const struct intel_device_info *devinfo,
+ uint32_t current_pipeline,
+ uint32_t post_sync_op,
+ struct anv_address address,
+ uint32_t imm_data,
+ enum anv_pipe_bits bits,
+ const char *reason)
+{
+ if ((batch->engine_class == INTEL_ENGINE_CLASS_COPY) ||
+ (batch->engine_class == INTEL_ENGINE_CLASS_VIDEO))
+ unreachable("Trying to emit unsupported PIPE_CONTROL command.");
+
+ /* XXX - insert all workarounds and GFX specific things below. */
- case ANV_DESCRIPTOR_SET_SHADER_CONSTANTS:
- return (struct anv_address) {
- .bo = cmd_buffer->device->instruction_state_pool.block_pool.bo,
- .offset = shader->kernel.offset +
- shader->prog_data->const_data_offset,
+ /* Wa_14014966230: For COMPUTE Workload - Any PIPE_CONTROL command with
+ * POST_SYNC Operation Enabled MUST be preceded by a PIPE_CONTROL
+ * with CS_STALL Bit set (with No POST_SYNC ENABLED)
+ */
+ if (intel_device_info_is_adln(devinfo) &&
+ current_pipeline == GPGPU &&
+ post_sync_op != NoWrite) {
+ anv_batch_emit(batch, GENX(PIPE_CONTROL), pipe) {
+ pipe.CommandStreamerStallEnable = true;
+ anv_debug_dump_pc(pipe, "Wa_14014966230");
};
+ }
- default: {
- assert(range->set < MAX_SETS);
- struct anv_descriptor_set *set =
- gfx_state->base.descriptors[range->set];
- const struct anv_descriptor *desc =
- &set->descriptors[range->index];
+ /* SKL PRMs, Volume 7: 3D-Media-GPGPU, Programming Restrictions for
+ * PIPE_CONTROL, Flush Types:
+ * "Requires stall bit ([20] of DW) set for all GPGPU Workloads."
+ * For newer platforms this is documented in the PIPE_CONTROL instruction
+ * page.
+ */
+ if (current_pipeline == GPGPU &&
+ (bits & ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT))
+ bits |= ANV_PIPE_CS_STALL_BIT;
- if (desc->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER) {
- if (desc->buffer_view)
- return desc->buffer_view->address;
- } else {
- assert(desc->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC);
- if (desc->buffer) {
- const struct anv_push_constants *push =
- &gfx_state->base.push_constants;
- uint32_t dynamic_offset =
- push->dynamic_offsets[range->dynamic_offset_index];
- return anv_address_add(desc->buffer->address,
- desc->offset + dynamic_offset);
- }
- }
+#if INTEL_NEEDS_WA_1409600907
+ /* Wa_1409600907: "PIPE_CONTROL with Depth Stall Enable bit must
+ * be set with any PIPE_CONTROL with Depth Flush Enable bit set.
+ */
+ if (bits & ANV_PIPE_DEPTH_CACHE_FLUSH_BIT)
+ bits |= ANV_PIPE_DEPTH_STALL_BIT;
+#endif
- /* For NULL UBOs, we just return an address in the workaround BO. We do
- * writes to it for workarounds but always at the bottom. The higher
- * bytes should be all zeros.
- */
- assert(range->length * 32 <= 2048);
- return (struct anv_address) {
- .bo = cmd_buffer->device->workaround_bo,
- .offset = 1024,
- };
- }
- }
-}
+ anv_batch_emit(batch, GENX(PIPE_CONTROL), pipe) {
+#if GFX_VERx10 >= 125
+ pipe.UntypedDataPortCacheFlushEnable =
+ bits & ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT;
+ pipe.CCSFlushEnable = bits & ANV_PIPE_CCS_CACHE_FLUSH_BIT;
+#endif
+#if GFX_VER == 12
+ pipe.TileCacheFlushEnable = bits & ANV_PIPE_TILE_CACHE_FLUSH_BIT;
+#endif
+#if GFX_VER > 11
+ pipe.HDCPipelineFlushEnable = bits & ANV_PIPE_HDC_PIPELINE_FLUSH_BIT;
+#endif
+ pipe.DepthCacheFlushEnable = bits & ANV_PIPE_DEPTH_CACHE_FLUSH_BIT;
+ pipe.DCFlushEnable = bits & ANV_PIPE_DATA_CACHE_FLUSH_BIT;
+ pipe.RenderTargetCacheFlushEnable =
+ bits & ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT;
+ pipe.DepthStallEnable = bits & ANV_PIPE_DEPTH_STALL_BIT;
-/** Returns the size in bytes of the bound buffer
- *
- * The range is relative to the start of the buffer, not the start of the
- * range. The returned range may be smaller than
- *
- * (range->start + range->length) * 32;
- */
-static uint32_t
-get_push_range_bound_size(struct anv_cmd_buffer *cmd_buffer,
- const struct anv_shader_bin *shader,
- const struct anv_push_range *range)
-{
- assert(shader->stage != MESA_SHADER_COMPUTE);
- const struct anv_cmd_graphics_state *gfx_state = &cmd_buffer->state.gfx;
- switch (range->set) {
- case ANV_DESCRIPTOR_SET_DESCRIPTORS: {
- struct anv_descriptor_set *set =
- gfx_state->base.descriptors[range->index];
- assert(range->start * 32 < set->desc_mem.alloc_size);
- assert((range->start + range->length) * 32 <= set->desc_mem.alloc_size);
- return set->desc_mem.alloc_size;
- }
-
- case ANV_DESCRIPTOR_SET_PUSH_CONSTANTS:
- return (range->start + range->length) * 32;
-
- case ANV_DESCRIPTOR_SET_SHADER_CONSTANTS:
- return ALIGN(shader->prog_data->const_data_size, ANV_UBO_ALIGNMENT);
-
- default: {
- assert(range->set < MAX_SETS);
- struct anv_descriptor_set *set =
- gfx_state->base.descriptors[range->set];
- const struct anv_descriptor *desc =
- &set->descriptors[range->index];
+ pipe.TLBInvalidate = bits & ANV_PIPE_TLB_INVALIDATE_BIT;
- if (desc->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER) {
- if (!desc->buffer_view)
- return 0;
+#if GFX_VERx10 >= 125
+ pipe.PSSStallSyncEnable = bits & ANV_PIPE_PSS_STALL_SYNC_BIT;
+#endif
+ pipe.CommandStreamerStallEnable = bits & ANV_PIPE_CS_STALL_BIT;
+ pipe.StallAtPixelScoreboard = bits & ANV_PIPE_STALL_AT_SCOREBOARD_BIT;
- if (range->start * 32 > desc->buffer_view->range)
- return 0;
+ pipe.StateCacheInvalidationEnable =
+ bits & ANV_PIPE_STATE_CACHE_INVALIDATE_BIT;
+ pipe.ConstantCacheInvalidationEnable =
+ bits & ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT;
+#if GFX_VER >= 12
+ /* Invalidates the L3 cache part in which index & vertex data is loaded
+ * when VERTEX_BUFFER_STATE::L3BypassDisable is set.
+ */
+ pipe.L3ReadOnlyCacheInvalidationEnable =
+ bits & ANV_PIPE_VF_CACHE_INVALIDATE_BIT;
+#endif
+ pipe.VFCacheInvalidationEnable =
+ bits & ANV_PIPE_VF_CACHE_INVALIDATE_BIT;
+ pipe.TextureCacheInvalidationEnable =
+ bits & ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT;
+ pipe.InstructionCacheInvalidateEnable =
+ bits & ANV_PIPE_INSTRUCTION_CACHE_INVALIDATE_BIT;
- return desc->buffer_view->range;
- } else {
- if (!desc->buffer)
- return 0;
+ pipe.PostSyncOperation = post_sync_op;
+ pipe.Address = address;
+ pipe.DestinationAddressType = DAT_PPGTT;
+ pipe.ImmediateData = imm_data;
- assert(desc->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC);
- /* Compute the offset within the buffer */
- const struct anv_push_constants *push =
- &gfx_state->base.push_constants;
- uint32_t dynamic_offset =
- push->dynamic_offsets[range->dynamic_offset_index];
- uint64_t offset = desc->offset + dynamic_offset;
- /* Clamp to the buffer size */
- offset = MIN2(offset, desc->buffer->size);
- /* Clamp the range to the buffer size */
- uint32_t bound_range = MIN2(desc->range, desc->buffer->size - offset);
-
- /* Align the range for consistency */
- bound_range = align_u32(bound_range, ANV_UBO_ALIGNMENT);
-
- return bound_range;
- }
- }
+ anv_debug_dump_pc(pipe, reason);
}
}
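
The emitter above packs whatever ANV_PIPE_* bits are still pending into a single PIPE_CONTROL. As a rough, standalone sketch of that accumulate-then-apply pattern (hypothetical bit names and a printf in place of the real packet emission, in the same spirit as anv_add_pending_pipe_bits() and genX(cmd_buffer_apply_pipe_flushes)() used elsewhere in this change):

/* Illustrative only: made-up bit names, not the driver's real helpers. */
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

enum pipe_bits {
   PIPE_RT_FLUSH    = 1u << 0,
   PIPE_DEPTH_FLUSH = 1u << 1,
   PIPE_DEPTH_STALL = 1u << 2,
};

struct pending {
   uint32_t bits;
};

static void add_pending(struct pending *p, uint32_t bits, const char *reason)
{
   p->bits |= bits;   /* callers only accumulate; nothing is emitted yet */
   (void)reason;      /* the real driver keeps this string for debug dumps */
}

static void apply_pending(struct pending *p)
{
   uint32_t bits = p->bits;
   /* Mirrors the workaround above: a depth flush implies a depth stall. */
   if (bits & PIPE_DEPTH_FLUSH)
      bits |= PIPE_DEPTH_STALL;
   printf("emit one PIPE_CONTROL covering bits 0x%" PRIx32 "\n", bits);
   p->bits = 0;
}

int main(void)
{
   struct pending p = { 0 };
   add_pending(&p, PIPE_RT_FLUSH, "render target write");
   add_pending(&p, PIPE_DEPTH_FLUSH, "depth attachment reuse");
   apply_pending(&p);   /* one combined packet for everything pending */
   return 0;
}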
-static void
-cmd_buffer_emit_push_constant(struct anv_cmd_buffer *cmd_buffer,
- gl_shader_stage stage,
- struct anv_address *buffers,
- unsigned buffer_count)
+/* Set preemption on/off. */
+void
+genX(batch_set_preemption)(struct anv_batch *batch,
+ const struct intel_device_info *devinfo,
+ uint32_t current_pipeline,
+ bool value)
{
- const struct anv_cmd_graphics_state *gfx_state = &cmd_buffer->state.gfx;
- const struct anv_graphics_pipeline *pipeline = gfx_state->pipeline;
-
- static const uint32_t push_constant_opcodes[] = {
- [MESA_SHADER_VERTEX] = 21,
- [MESA_SHADER_TESS_CTRL] = 25, /* HS */
- [MESA_SHADER_TESS_EVAL] = 26, /* DS */
- [MESA_SHADER_GEOMETRY] = 22,
- [MESA_SHADER_FRAGMENT] = 23,
- [MESA_SHADER_COMPUTE] = 0,
- };
-
- assert(stage < ARRAY_SIZE(push_constant_opcodes));
- assert(push_constant_opcodes[stage] > 0);
-
- anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CONSTANT_VS), c) {
- c._3DCommandSubOpcode = push_constant_opcodes[stage];
+#if GFX_VERx10 >= 120
+ anv_batch_write_reg(batch, GENX(CS_CHICKEN1), cc1) {
+ cc1.DisablePreemptionandHighPriorityPausingdueto3DPRIMITIVECommand = !value;
+ cc1.DisablePreemptionandHighPriorityPausingdueto3DPRIMITIVECommandMask = true;
+ }
- if (anv_pipeline_has_stage(pipeline, stage)) {
- const struct anv_pipeline_bind_map *bind_map =
- &pipeline->shaders[stage]->bind_map;
+ /* Wa_16013994831 - we need to insert CS_STALL and 250 noops. */
+ genx_batch_emit_pipe_control(batch, devinfo, current_pipeline,
+ ANV_PIPE_CS_STALL_BIT);
-#if GFX_VER >= 9
- /* This field exists since Gfx8. However, the Broadwell PRM says:
- *
- * "Constant Buffer Object Control State must be always programmed
- * to zero."
- *
- * This restriction does not exist on any newer platforms.
- *
- * We only have one MOCS field for the whole packet, not one per
- * buffer. We could go out of our way here to walk over all of the
- * buffers and see if any of them are used externally and use the
- * external MOCS. However, the notion that someone would use the
- * same bit of memory for both scanout and a UBO is nuts. Let's not
- * bother and assume it's all internal.
- */
- c.MOCS = isl_mocs(&cmd_buffer->device->isl_dev, 0, false);
+ for (unsigned i = 0; i < 250; i++)
+ anv_batch_emit(batch, GENX(MI_NOOP), noop);
#endif
+}
-#if GFX_VERx10 >= 75
- /* The Skylake PRM contains the following restriction:
- *
- * "The driver must ensure The following case does not occur
- * without a flush to the 3D engine: 3DSTATE_CONSTANT_* with
- * buffer 3 read length equal to zero committed followed by a
- * 3DSTATE_CONSTANT_* with buffer 0 read length not equal to
- * zero committed."
- *
- * To avoid this, we program the buffers in the highest slots.
- * This way, slot 0 is only used if slot 3 is also used.
- */
- assert(buffer_count <= 4);
- const unsigned shift = 4 - buffer_count;
- for (unsigned i = 0; i < buffer_count; i++) {
- const struct anv_push_range *range = &bind_map->push_ranges[i];
-
- /* At this point we only have non-empty ranges */
- assert(range->length > 0);
-
- /* For Ivy Bridge, make sure we only set the first range (actual
- * push constants)
- */
- assert((GFX_VERx10 >= 75) || i == 0);
+void
+genX(cmd_buffer_set_preemption)(struct anv_cmd_buffer *cmd_buffer, bool value)
+{
+#if GFX_VERx10 >= 120
+ if (cmd_buffer->state.gfx.object_preemption == value)
+ return;
- c.ConstantBody.ReadLength[i + shift] = range->length;
- c.ConstantBody.Buffer[i + shift] =
- anv_address_add(buffers[i], range->start * 32);
- }
-#else
- /* For Ivy Bridge, push constants are relative to dynamic state
- * base address and we only ever push actual push constants.
- */
- if (bind_map->push_ranges[0].length > 0) {
- assert(buffer_count == 1);
- assert(bind_map->push_ranges[0].set ==
- ANV_DESCRIPTOR_SET_PUSH_CONSTANTS);
- assert(buffers[0].bo ==
- cmd_buffer->device->dynamic_state_pool.block_pool.bo);
- c.ConstantBody.ReadLength[0] = bind_map->push_ranges[0].length;
- c.ConstantBody.Buffer[0].bo = NULL;
- c.ConstantBody.Buffer[0].offset = buffers[0].offset;
- }
- assert(bind_map->push_ranges[1].length == 0);
- assert(bind_map->push_ranges[2].length == 0);
- assert(bind_map->push_ranges[3].length == 0);
+ genX(batch_set_preemption)(&cmd_buffer->batch, cmd_buffer->device->info,
+ cmd_buffer->state.current_pipeline,
+ value);
+ cmd_buffer->state.gfx.object_preemption = value;
#endif
- }
- }
}
-#if GFX_VER >= 12
-static void
-cmd_buffer_emit_push_constant_all(struct anv_cmd_buffer *cmd_buffer,
- uint32_t shader_mask,
- struct anv_address *buffers,
- uint32_t buffer_count)
+ALWAYS_INLINE static void
+update_descriptor_set_surface_state(struct anv_cmd_buffer *cmd_buffer,
+ struct anv_cmd_pipeline_state *pipe_state,
+ uint32_t set_idx)
{
- if (buffer_count == 0) {
- anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CONSTANT_ALL), c) {
- c.ShaderUpdateEnable = shader_mask;
- c.MOCS = isl_mocs(&cmd_buffer->device->isl_dev, 0, false);
- }
+ if (!pipe_state->descriptor_buffers[set_idx].bound)
return;
- }
- const struct anv_cmd_graphics_state *gfx_state = &cmd_buffer->state.gfx;
- const struct anv_graphics_pipeline *pipeline = gfx_state->pipeline;
-
- static const UNUSED uint32_t push_constant_opcodes[] = {
- [MESA_SHADER_VERTEX] = 21,
- [MESA_SHADER_TESS_CTRL] = 25, /* HS */
- [MESA_SHADER_TESS_EVAL] = 26, /* DS */
- [MESA_SHADER_GEOMETRY] = 22,
- [MESA_SHADER_FRAGMENT] = 23,
- [MESA_SHADER_COMPUTE] = 0,
- };
-
- gl_shader_stage stage = vk_to_mesa_shader_stage(shader_mask);
- assert(stage < ARRAY_SIZE(push_constant_opcodes));
- assert(push_constant_opcodes[stage] > 0);
-
- const struct anv_pipeline_bind_map *bind_map =
- &pipeline->shaders[stage]->bind_map;
-
- uint32_t *dw;
- const uint32_t buffer_mask = (1 << buffer_count) - 1;
- const uint32_t num_dwords = 2 + 2 * buffer_count;
-
- dw = anv_batch_emitn(&cmd_buffer->batch, num_dwords,
- GENX(3DSTATE_CONSTANT_ALL),
- .ShaderUpdateEnable = shader_mask,
- .PointerBufferMask = buffer_mask,
- .MOCS = isl_mocs(&cmd_buffer->device->isl_dev, 0, false));
-
- for (int i = 0; i < buffer_count; i++) {
- const struct anv_push_range *range = &bind_map->push_ranges[i];
- GENX(3DSTATE_CONSTANT_ALL_DATA_pack)(
- &cmd_buffer->batch, dw + 2 + i * 2,
- &(struct GENX(3DSTATE_CONSTANT_ALL_DATA)) {
- .PointerToConstantBuffer =
- anv_address_add(buffers[i], range->start * 32),
- .ConstantBufferReadLength = range->length,
- });
+ const struct anv_physical_device *device = cmd_buffer->device->physical;
+ const int32_t buffer_index =
+ pipe_state->descriptor_buffers[set_idx].buffer_index;
+ const struct anv_va_range *push_va_range =
+ GFX_VERx10 >= 125 ?
+ &device->va.push_descriptor_buffer_pool :
+ &device->va.internal_surface_state_pool;
+ const struct anv_va_range *va_range =
+ buffer_index == -1 ? push_va_range : &device->va.descriptor_buffer_pool;
+ const uint64_t descriptor_set_addr =
+ (buffer_index == -1 ? va_range->addr :
+ cmd_buffer->state.descriptor_buffers.address[buffer_index]) +
+ pipe_state->descriptor_buffers[set_idx].buffer_offset;
+ const uint64_t set_size =
+ MIN2(va_range->size - (descriptor_set_addr - va_range->addr),
+ anv_physical_device_bindless_heap_size(device, true));
+
+ if (descriptor_set_addr != pipe_state->descriptor_buffers[set_idx].address) {
+ pipe_state->descriptor_buffers[set_idx].address = descriptor_set_addr;
+
+ struct anv_state surface_state =
+ anv_cmd_buffer_alloc_surface_states(cmd_buffer, 1);
+ const enum isl_format format =
+ anv_isl_format_for_descriptor_type(cmd_buffer->device,
+ VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER);
+ anv_fill_buffer_surface_state(
+ cmd_buffer->device, surface_state.map,
+ format, ISL_SWIZZLE_IDENTITY,
+ ISL_SURF_USAGE_CONSTANT_BUFFER_BIT,
+ anv_address_from_u64(pipe_state->descriptor_buffers[set_idx].address),
+ set_size, 1);
+
+ pipe_state->descriptor_buffers[set_idx].state = surface_state;
}
}
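
The set_size clamp above bounds the surface both by what remains of the VA range past the set's address and by the bindless heap limit. A minimal standalone sketch of that arithmetic, with made-up addresses and sizes:

/* Hypothetical addresses and sizes, purely to illustrate the clamp above. */
#include <assert.h>
#include <stdint.h>

#define MIN2(a, b) ((a) < (b) ? (a) : (b))

int main(void)
{
   const uint64_t range_addr = 0x100000000ull;            /* pool base */
   const uint64_t range_size = 2ull << 30;                /* 2 GiB VA range */
   const uint64_t heap_limit = 1ull << 30;                /* bindless heap cap */
   const uint64_t set_addr   = range_addr + (3ull << 29); /* 1.5 GiB in */

   /* Surface covers whichever is smaller: the rest of the range or the cap. */
   uint64_t set_size = MIN2(range_size - (set_addr - range_addr), heap_limit);

   assert(set_size == (1ull << 29));   /* only 512 MiB of the range remain */
   return 0;
}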
-#endif
-static void
-cmd_buffer_flush_push_constants(struct anv_cmd_buffer *cmd_buffer,
- VkShaderStageFlags dirty_stages)
+ALWAYS_INLINE static uint32_t
+compute_descriptor_set_surface_offset(const struct anv_cmd_buffer *cmd_buffer,
+ const struct anv_cmd_pipeline_state *pipe_state,
+ const uint32_t set_idx)
{
- VkShaderStageFlags flushed = 0;
- struct anv_cmd_graphics_state *gfx_state = &cmd_buffer->state.gfx;
- const struct anv_graphics_pipeline *pipeline = gfx_state->pipeline;
-
-#if GFX_VER >= 12
- uint32_t nobuffer_stages = 0;
-#endif
-
- /* Compute robust pushed register access mask for each stage. */
- if (cmd_buffer->device->robust_buffer_access) {
- anv_foreach_stage(stage, dirty_stages) {
- if (!anv_pipeline_has_stage(pipeline, stage))
- continue;
-
- const struct anv_shader_bin *shader = pipeline->shaders[stage];
- const struct anv_pipeline_bind_map *bind_map = &shader->bind_map;
- struct anv_push_constants *push = &gfx_state->base.push_constants;
-
- push->push_reg_mask[stage] = 0;
- /* Start of the current range in the shader, relative to the start of
- * push constants in the shader.
- */
- unsigned range_start_reg = 0;
- for (unsigned i = 0; i < 4; i++) {
- const struct anv_push_range *range = &bind_map->push_ranges[i];
- if (range->length == 0)
- continue;
-
- unsigned bound_size =
- get_push_range_bound_size(cmd_buffer, shader, range);
- if (bound_size >= range->start * 32) {
- unsigned bound_regs =
- MIN2(DIV_ROUND_UP(bound_size, 32) - range->start,
- range->length);
- assert(range_start_reg + bound_regs <= 64);
- push->push_reg_mask[stage] |= BITFIELD64_RANGE(range_start_reg,
- bound_regs);
- }
+ const struct anv_physical_device *device = cmd_buffer->device->physical;
- cmd_buffer->state.push_constants_dirty |=
- mesa_to_vk_shader_stage(stage);
+ if (device->uses_ex_bso) {
+ int32_t buffer_index =
+ pipe_state->descriptor_buffers[set_idx].buffer_index;
+ uint64_t buffer_address =
+ buffer_index == -1 ?
+ device->va.push_descriptor_buffer_pool.addr :
+ cmd_buffer->state.descriptor_buffers.address[buffer_index];
- range_start_reg += range->length;
- }
- }
+ return (buffer_address - device->va.descriptor_buffer_pool.addr) +
+ pipe_state->descriptor_buffers[set_idx].buffer_offset;
}
- /* Resets the push constant state so that we allocate a new one if
- * needed.
- */
- gfx_state->base.push_constants_state = ANV_STATE_NULL;
-
- anv_foreach_stage(stage, dirty_stages) {
- unsigned buffer_count = 0;
- flushed |= mesa_to_vk_shader_stage(stage);
- UNUSED uint32_t max_push_range = 0;
-
- struct anv_address buffers[4] = {};
- if (anv_pipeline_has_stage(pipeline, stage)) {
- const struct anv_shader_bin *shader = pipeline->shaders[stage];
- const struct anv_pipeline_bind_map *bind_map = &shader->bind_map;
-
- /* We have to gather buffer addresses as a second step because the
- * loop above puts data into the push constant area and the call to
- * get_push_range_address is what locks our push constants and copies
- * them into the actual GPU buffer. If we did the two loops at the
- * same time, we'd risk only having some of the sizes in the push
- * constant buffer when we did the copy.
- */
- for (unsigned i = 0; i < 4; i++) {
- const struct anv_push_range *range = &bind_map->push_ranges[i];
- if (range->length == 0)
- break;
+ return pipe_state->descriptor_buffers[set_idx].buffer_offset << 6;
+}
- buffers[i] = get_push_range_address(cmd_buffer, shader, range);
- max_push_range = MAX2(max_push_range, range->length);
- buffer_count++;
- }
+ALWAYS_INLINE static uint32_t
+compute_descriptor_set_sampler_offset(const struct anv_cmd_buffer *cmd_buffer,
+ const struct anv_cmd_pipeline_state *pipe_state,
+ const uint32_t set_idx)
+{
+ const struct anv_physical_device *device = cmd_buffer->device->physical;
+ int32_t buffer_index =
+ pipe_state->descriptor_buffers[set_idx].buffer_index;
+ uint64_t buffer_address =
+ buffer_index == -1 ?
+ device->va.push_descriptor_buffer_pool.addr :
+ cmd_buffer->state.descriptor_buffers.address[buffer_index];
+
+ return (buffer_address - device->va.dynamic_state_db_pool.addr) +
+ pipe_state->descriptor_buffers[set_idx].buffer_offset;
+}
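
Both offset helpers above boil down to "backing buffer address minus the relevant pool base, plus the set's offset inside the buffer". A small sketch of that computation with hypothetical pool and buffer addresses:

/* Hypothetical pool base and binding, only to show the subtraction above. */
#include <assert.h>
#include <stdint.h>

int main(void)
{
   const uint64_t pool_base      = 0x200000000ull;  /* e.g. a *_pool.addr */
   const uint64_t buffer_address = 0x200040000ull;  /* bound descriptor buffer */
   const uint32_t buffer_offset  = 0x1000;          /* set offset in the buffer */

   /* 32-bit offset relative to the pool base, later consumed by the shader. */
   uint32_t set_offset = (uint32_t)(buffer_address - pool_base) + buffer_offset;

   assert(set_offset == 0x41000);
   return 0;
}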
- /* We have at most 4 buffers but they should be tightly packed */
- for (unsigned i = buffer_count; i < 4; i++)
- assert(bind_map->push_ranges[i].length == 0);
- }
+void
+genX(flush_descriptor_buffers)(struct anv_cmd_buffer *cmd_buffer,
+ struct anv_cmd_pipeline_state *pipe_state)
+{
+ /* On Gfx12.5+ the STATE_BASE_ADDRESS BindlessSurfaceStateBaseAddress &
+ * DynamicStateBaseAddress are fixed. So as long as we stay in one
+ * descriptor buffer mode, there is no need to switch.
+ */
+#if GFX_VERx10 >= 125
+ if (cmd_buffer->state.current_db_mode !=
+ cmd_buffer->state.pending_db_mode)
+ genX(cmd_buffer_emit_state_base_address)(cmd_buffer);
+#else
+ if (cmd_buffer->state.descriptor_buffers.dirty)
+ genX(cmd_buffer_emit_state_base_address)(cmd_buffer);
+#endif
-#if GFX_VER >= 12
- /* If this stage doesn't have any push constants, emit it later in a
- * single CONSTANT_ALL packet.
- */
- if (buffer_count == 0) {
- nobuffer_stages |= 1 << stage;
- continue;
+ assert(cmd_buffer->state.current_db_mode !=
+ ANV_CMD_DESCRIPTOR_BUFFER_MODE_UNKNOWN);
+ if (cmd_buffer->state.current_db_mode == ANV_CMD_DESCRIPTOR_BUFFER_MODE_BUFFER &&
+ (cmd_buffer->state.descriptor_buffers.dirty ||
+ (pipe_state->pipeline->active_stages &
+ cmd_buffer->state.descriptor_buffers.offsets_dirty) != 0)) {
+ struct anv_push_constants *push_constants =
+ &pipe_state->push_constants;
+ for (uint32_t i = 0; i < ARRAY_SIZE(push_constants->desc_surface_offsets); i++) {
+ update_descriptor_set_surface_state(cmd_buffer, pipe_state, i);
+
+ push_constants->desc_surface_offsets[i] =
+ compute_descriptor_set_surface_offset(cmd_buffer, pipe_state, i);
+ push_constants->desc_sampler_offsets[i] =
+ compute_descriptor_set_sampler_offset(cmd_buffer, pipe_state, i);
}
- /* The Constant Buffer Read Length field from 3DSTATE_CONSTANT_ALL
- * contains only 5 bits, so we can only use it for buffers smaller than
- * 32.
- */
- if (max_push_range < 32) {
- cmd_buffer_emit_push_constant_all(cmd_buffer, 1 << stage,
- buffers, buffer_count);
- continue;
- }
+#if GFX_VERx10 < 125
+ struct anv_device *device = cmd_buffer->device;
+ push_constants->surfaces_base_offset =
+ (cmd_buffer->state.descriptor_buffers.surfaces_address -
+ device->physical->va.descriptor_buffer_pool.addr);
#endif
- cmd_buffer_emit_push_constant(cmd_buffer, stage, buffers, buffer_count);
+ cmd_buffer->state.push_constants_dirty |=
+ (cmd_buffer->state.descriptor_buffers.offsets_dirty &
+ pipe_state->pipeline->active_stages);
+ pipe_state->push_constants_data_dirty = true;
+ cmd_buffer->state.descriptor_buffers.offsets_dirty &=
+ ~pipe_state->pipeline->active_stages;
}
-#if GFX_VER >= 12
- if (nobuffer_stages)
- cmd_buffer_emit_push_constant_all(cmd_buffer, nobuffer_stages, NULL, 0);
-#endif
-
- cmd_buffer->state.push_constants_dirty &= ~flushed;
+ cmd_buffer->state.descriptor_buffers.dirty = false;
}
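
The tail of flush_descriptor_buffers above only marks push constants dirty for the stages the current pipeline actually uses, and keeps the remaining stages pending. A standalone sketch of that mask bookkeeping, using made-up stage bits:

/* Made-up stage bits; only the mask bookkeeping mirrors the code above. */
#include <assert.h>
#include <stdint.h>

enum { STAGE_VS = 1u << 0, STAGE_FS = 1u << 1, STAGE_CS = 1u << 2 };

int main(void)
{
   uint32_t offsets_dirty = STAGE_VS | STAGE_FS | STAGE_CS; /* all touched */
   uint32_t push_dirty    = 0;
   const uint32_t active  = STAGE_VS | STAGE_FS;            /* graphics only */

   /* Flag push constants dirty for the stages this pipeline covers... */
   push_dirty |= offsets_dirty & active;
   /* ...and keep the remaining stages pending for a later flush. */
   offsets_dirty &= ~active;

   assert(push_dirty == (STAGE_VS | STAGE_FS));
   assert(offsets_dirty == STAGE_CS);
   return 0;
}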
-static void
-cmd_buffer_emit_clip(struct anv_cmd_buffer *cmd_buffer)
+void
+genX(cmd_buffer_begin_companion)(struct anv_cmd_buffer *cmd_buffer,
+ VkCommandBufferLevel level)
{
- const uint32_t clip_states =
-#if GFX_VER <= 7
- ANV_CMD_DIRTY_DYNAMIC_FRONT_FACE |
- ANV_CMD_DIRTY_DYNAMIC_CULL_MODE |
-#endif
- ANV_CMD_DIRTY_DYNAMIC_PRIMITIVE_TOPOLOGY |
- ANV_CMD_DIRTY_DYNAMIC_VIEWPORT |
- ANV_CMD_DIRTY_PIPELINE;
+ cmd_buffer->vk.level = level;
+ cmd_buffer->is_companion_rcs_cmd_buffer = true;
- if ((cmd_buffer->state.gfx.dirty & clip_states) == 0)
- return;
+ trace_intel_begin_cmd_buffer(&cmd_buffer->trace);
- /* Take dynamic primitive topology in to account with
- * 3DSTATE_CLIP::ViewportXYClipTestEnable
+#if GFX_VER >= 12
+ /* Reenable prefetching at the beginning of secondary command buffers. We
+ * do this so that the patched return instruction is not prefetched before
+ * its patching has completed.
*/
- bool xy_clip_test_enable = 0;
-
- if (cmd_buffer->state.gfx.pipeline->dynamic_states &
- ANV_CMD_DIRTY_DYNAMIC_PRIMITIVE_TOPOLOGY) {
- VkPrimitiveTopology primitive_topology =
- cmd_buffer->state.gfx.dynamic.primitive_topology;
-
- VkPolygonMode dynamic_raster_mode =
- genX(raster_polygon_mode)(cmd_buffer->state.gfx.pipeline,
- primitive_topology);
-
- xy_clip_test_enable = (dynamic_raster_mode == VK_POLYGON_MODE_FILL);
+ if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY) {
+ anv_batch_emit(&cmd_buffer->batch, GENX(MI_ARB_CHECK), arb) {
+ arb.PreParserDisableMask = true;
+ arb.PreParserDisable = false;
+ }
}
-
-#if GFX_VER <= 7
- const struct anv_dynamic_state *d = &cmd_buffer->state.gfx.dynamic;
-#endif
- struct GENX(3DSTATE_CLIP) clip = {
- GENX(3DSTATE_CLIP_header),
-#if GFX_VER <= 7
- .FrontWinding = genX(vk_to_intel_front_face)[d->front_face],
- .CullMode = genX(vk_to_intel_cullmode)[d->cull_mode],
#endif
- .ViewportXYClipTestEnable = xy_clip_test_enable,
- };
- uint32_t dwords[GENX(3DSTATE_CLIP_length)];
- struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
- const struct brw_vue_prog_data *last =
- anv_pipeline_get_last_vue_prog_data(pipeline);
- if (last->vue_map.slots_valid & VARYING_BIT_VIEWPORT) {
- clip.MaximumVPIndex =
- cmd_buffer->state.gfx.dynamic.viewport.count > 0 ?
- cmd_buffer->state.gfx.dynamic.viewport.count - 1 : 0;
- }
+ /* A companion command buffer is only used for blorp commands atm, so
+ * default to the legacy mode.
+ */
+ cmd_buffer->state.current_db_mode = ANV_CMD_DESCRIPTOR_BUFFER_MODE_LEGACY;
+ genX(cmd_buffer_emit_bt_pool_base_address)(cmd_buffer);
- GENX(3DSTATE_CLIP_pack)(NULL, dwords, &clip);
- anv_batch_emit_merge(&cmd_buffer->batch, dwords,
- pipeline->gfx7.clip);
+ /* Re-emit the aux table register in every command buffer. This way we're
+ * ensured that we have the table even if this command buffer doesn't
+ * initialize any images.
+ */
+ if (cmd_buffer->device->info->has_aux_map) {
+ anv_add_pending_pipe_bits(cmd_buffer,
+ ANV_PIPE_AUX_TABLE_INVALIDATE_BIT,
+ "new cmd buffer with aux-tt");
+ }
}
static void
-cmd_buffer_emit_streamout(struct anv_cmd_buffer *cmd_buffer)
+genX(cmd_buffer_set_protected_memory)(struct anv_cmd_buffer *cmd_buffer,
+ bool enabled)
{
- const struct anv_dynamic_state *d = &cmd_buffer->state.gfx.dynamic;
- struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
-
-#if GFX_VER == 7
-# define streamout_state_dw pipeline->gfx7.streamout_state
+#if GFX_VER >= 12
+ if (enabled) {
+ anv_batch_emit(&cmd_buffer->batch, GENX(MI_SET_APPID), appid) {
+ /* Default value for single session. */
+ appid.ProtectedMemoryApplicationID = cmd_buffer->device->protected_session_id;
+ appid.ProtectedMemoryApplicationIDType = DISPLAY_APP;
+ }
+ }
+ anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
+ pc.PipeControlFlushEnable = true;
+ pc.DCFlushEnable = true;
+ pc.RenderTargetCacheFlushEnable = true;
+ pc.CommandStreamerStallEnable = true;
+ if (enabled)
+ pc.ProtectedMemoryEnable = true;
+ else
+ pc.ProtectedMemoryDisable = true;
+ }
#else
-# define streamout_state_dw pipeline->gfx8.streamout_state
+ unreachable("Protected content not supported");
#endif
-
- uint32_t dwords[GENX(3DSTATE_STREAMOUT_length)];
-
- struct GENX(3DSTATE_STREAMOUT) so = {
- GENX(3DSTATE_STREAMOUT_header),
- .RenderingDisable = d->raster_discard,
- };
- GENX(3DSTATE_STREAMOUT_pack)(NULL, dwords, &so);
- anv_batch_emit_merge(&cmd_buffer->batch, dwords, streamout_state_dw);
}
-void
-genX(cmd_buffer_flush_state)(struct anv_cmd_buffer *cmd_buffer)
+VkResult
+genX(BeginCommandBuffer)(
+ VkCommandBuffer commandBuffer,
+ const VkCommandBufferBeginInfo* pBeginInfo)
{
- struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
- uint32_t *p;
-
- assert((pipeline->active_stages & VK_SHADER_STAGE_COMPUTE_BIT) == 0);
-
- genX(cmd_buffer_config_l3)(cmd_buffer, pipeline->base.l3_config);
-
- genX(cmd_buffer_emit_hashing_mode)(cmd_buffer, UINT_MAX, UINT_MAX, 1);
-
- genX(flush_pipeline_select_3d)(cmd_buffer);
+ ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+ VkResult result;
- /* Apply any pending pipeline flushes we may have. We want to apply them
- * now because, if any of those flushes are for things like push constants,
- * the GPU will read the state at weird times.
+ /* If this is the first vkBeginCommandBuffer, we must *initialize* the
+ * command buffer's state. Otherwise, we must *reset* its state. In both
+ * cases we reset it.
+ *
+ * From the Vulkan 1.0 spec:
+ *
+ * If a command buffer is in the executable state and the command buffer
+ * was allocated from a command pool with the
+ * VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT flag set, then
+ * vkBeginCommandBuffer implicitly resets the command buffer, behaving
+ * as if vkResetCommandBuffer had been called with
+ * VK_COMMAND_BUFFER_RESET_RELEASE_RESOURCES_BIT not set. It then puts
+ * the command buffer in the recording state.
*/
- genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
-
- uint32_t vb_emit = cmd_buffer->state.gfx.vb_dirty & pipeline->vb_used;
- if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_PIPELINE)
- vb_emit |= pipeline->vb_used;
+ anv_cmd_buffer_reset(&cmd_buffer->vk, 0);
+ anv_cmd_buffer_reset_rendering(cmd_buffer);
- if (vb_emit) {
- const uint32_t num_buffers = __builtin_popcount(vb_emit);
- const uint32_t num_dwords = 1 + num_buffers * 4;
-
- p = anv_batch_emitn(&cmd_buffer->batch, num_dwords,
- GENX(3DSTATE_VERTEX_BUFFERS));
- uint32_t i = 0;
- u_foreach_bit(vb, vb_emit) {
- struct anv_buffer *buffer = cmd_buffer->state.vertex_bindings[vb].buffer;
- uint32_t offset = cmd_buffer->state.vertex_bindings[vb].offset;
-
- /* If dynamic, use stride/size from vertex binding, otherwise use
- * stride/size that was setup in the pipeline object.
- */
- bool dynamic_stride = cmd_buffer->state.gfx.dynamic.dyn_vbo_stride;
- bool dynamic_size = cmd_buffer->state.gfx.dynamic.dyn_vbo_size;
-
- struct GENX(VERTEX_BUFFER_STATE) state;
- if (buffer) {
- uint32_t stride = dynamic_stride ?
- cmd_buffer->state.vertex_bindings[vb].stride : pipeline->vb[vb].stride;
- /* From the Vulkan spec (vkCmdBindVertexBuffers2EXT):
- *
- * "If pname:pSizes is not NULL then pname:pSizes[i] specifies
- * the bound size of the vertex buffer starting from the corresponding
- * elements of pname:pBuffers[i] plus pname:pOffsets[i]."
- */
- UNUSED uint32_t size = dynamic_size ?
- cmd_buffer->state.vertex_bindings[vb].size : buffer->size - offset;
+ cmd_buffer->usage_flags = pBeginInfo->flags;
- state = (struct GENX(VERTEX_BUFFER_STATE)) {
- .VertexBufferIndex = vb,
+ /* VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT must be ignored for
+ * primary level command buffers.
+ *
+ * From the Vulkan 1.0 spec:
+ *
+ * VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT specifies that a
+ * secondary command buffer is considered to be entirely inside a render
+ * pass. If this is a primary command buffer, then this bit is ignored.
+ */
+ if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY)
+ cmd_buffer->usage_flags &= ~VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT;
- .MOCS = anv_mocs(cmd_buffer->device, buffer->address.bo,
- ISL_SURF_USAGE_VERTEX_BUFFER_BIT),
-#if GFX_VER <= 7
- .BufferAccessType = pipeline->vb[vb].instanced ? INSTANCEDATA : VERTEXDATA,
- .InstanceDataStepRate = pipeline->vb[vb].instance_divisor,
-#endif
- .AddressModifyEnable = true,
- .BufferPitch = stride,
- .BufferStartingAddress = anv_address_add(buffer->address, offset),
- .NullVertexBuffer = offset >= buffer->size,
#if GFX_VER >= 12
- .L3BypassDisable = true,
-#endif
-
-#if GFX_VER >= 8
- .BufferSize = size,
-#else
- /* XXX: to handle dynamic offset for older gens we might want
- * to modify Endaddress, but there are issues when doing so:
- *
- * https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/7439
- */
- .EndAddress = anv_address_add(buffer->address, buffer->size - 1),
-#endif
- };
- } else {
- state = (struct GENX(VERTEX_BUFFER_STATE)) {
- .VertexBufferIndex = vb,
- .NullVertexBuffer = true,
- };
- }
-
-#if GFX_VER >= 8 && GFX_VER <= 9
- genX(cmd_buffer_set_binding_for_gfx8_vb_flush)(cmd_buffer, vb,
- state.BufferStartingAddress,
- state.BufferSize);
-#endif
-
- GENX(VERTEX_BUFFER_STATE_pack)(&cmd_buffer->batch, &p[1 + i * 4], &state);
- i++;
+ /* Reenable prefetching at the beginning of secondary command buffers. We
+ * do this so that the patched return instruction is not prefetched before
+ * its patching has completed.
+ */
+ if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY) {
+ anv_batch_emit(&cmd_buffer->batch, GENX(MI_ARB_CHECK), arb) {
+ arb.PreParserDisableMask = true;
+ arb.PreParserDisable = false;
}
}
+#endif
- cmd_buffer->state.gfx.vb_dirty &= ~vb_emit;
+ /* Assume the viewport has already been set in primary command buffers. */
+ if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY)
+ cmd_buffer->state.gfx.viewport_set = true;
- uint32_t descriptors_dirty = cmd_buffer->state.descriptors_dirty &
- pipeline->active_stages;
- if (!cmd_buffer->state.gfx.dirty && !descriptors_dirty &&
- !cmd_buffer->state.push_constants_dirty)
- return;
+ trace_intel_begin_cmd_buffer(&cmd_buffer->trace);
- if ((cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_XFB_ENABLE) ||
- (GFX_VER == 7 && (cmd_buffer->state.gfx.dirty &
- ANV_CMD_DIRTY_PIPELINE))) {
- /* We don't need any per-buffer dirty tracking because you're not
- * allowed to bind different XFB buffers while XFB is enabled.
+ if (anv_cmd_buffer_is_video_queue(cmd_buffer) ||
+ anv_cmd_buffer_is_blitter_queue(cmd_buffer)) {
+ /* Re-emit the aux table register in every command buffer. This way we're
+ * ensured that we have the table even if this command buffer doesn't
+ * initialize any images.
*/
- for (unsigned idx = 0; idx < MAX_XFB_BUFFERS; idx++) {
- struct anv_xfb_binding *xfb = &cmd_buffer->state.xfb_bindings[idx];
- anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_SO_BUFFER), sob) {
-#if GFX_VER < 12
- sob.SOBufferIndex = idx;
-#else
- sob._3DCommandOpcode = 0;
- sob._3DCommandSubOpcode = SO_BUFFER_INDEX_0_CMD + idx;
-#endif
-
- if (cmd_buffer->state.xfb_enabled && xfb->buffer && xfb->size != 0) {
- sob.MOCS = anv_mocs(cmd_buffer->device, xfb->buffer->address.bo, 0);
- sob.SurfaceBaseAddress = anv_address_add(xfb->buffer->address,
- xfb->offset);
-#if GFX_VER >= 8
- sob.SOBufferEnable = true;
- sob.StreamOffsetWriteEnable = false;
- /* Size is in DWords - 1 */
- sob.SurfaceSize = DIV_ROUND_UP(xfb->size, 4) - 1;
-#else
- /* We don't have SOBufferEnable in 3DSTATE_SO_BUFFER on Gfx7 so
- * we trust in SurfaceEndAddress = SurfaceBaseAddress = 0 (the
- * default for an empty SO_BUFFER packet) to disable them.
- */
- sob.SurfacePitch = pipeline->gfx7.xfb_bo_pitch[idx];
- sob.SurfaceEndAddress = anv_address_add(xfb->buffer->address,
- xfb->offset + xfb->size);
-#endif
- }
- }
- }
-
- /* CNL and later require a CS stall after 3DSTATE_SO_BUFFER */
- if (GFX_VER >= 10) {
+ if (cmd_buffer->device->info->has_aux_map) {
anv_add_pending_pipe_bits(cmd_buffer,
- ANV_PIPE_CS_STALL_BIT,
- "after 3DSTATE_SO_BUFFER call");
+ ANV_PIPE_AUX_TABLE_INVALIDATE_BIT,
+ "new cmd buffer with aux-tt");
}
+ return VK_SUCCESS;
}
- if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_PIPELINE) {
- anv_batch_emit_batch(&cmd_buffer->batch, &pipeline->base.batch);
-
- /* Remove from dynamic state emission all of stuff that is baked into
- * the pipeline.
- */
- cmd_buffer->state.gfx.dirty &= ~pipeline->static_state_mask;
-
- /* If the pipeline changed, we may need to re-allocate push constant
- * space in the URB.
- */
- cmd_buffer_alloc_push_constants(cmd_buffer);
- }
-
- if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_PIPELINE)
- cmd_buffer->state.gfx.primitive_topology = pipeline->topology;
+#if GFX_VER >= 12
+ if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY &&
+ cmd_buffer->vk.pool->flags & VK_COMMAND_POOL_CREATE_PROTECTED_BIT)
+ genX(cmd_buffer_set_protected_memory)(cmd_buffer, true);
+#endif
-#if GFX_VER <= 7
- if (cmd_buffer->state.descriptors_dirty & VK_SHADER_STAGE_VERTEX_BIT ||
- cmd_buffer->state.push_constants_dirty & VK_SHADER_STAGE_VERTEX_BIT) {
- /* From the IVB PRM Vol. 2, Part 1, Section 3.2.1:
- *
- * "A PIPE_CONTROL with Post-Sync Operation set to 1h and a depth
- * stall needs to be sent just prior to any 3DSTATE_VS,
- * 3DSTATE_URB_VS, 3DSTATE_CONSTANT_VS,
- * 3DSTATE_BINDING_TABLE_POINTER_VS,
- * 3DSTATE_SAMPLER_STATE_POINTER_VS command. Only one
- * PIPE_CONTROL needs to be sent before any combination of VS
- * associated 3DSTATE."
- */
- anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
- pc.DepthStallEnable = true;
- pc.PostSyncOperation = WriteImmediateData;
- pc.Address = cmd_buffer->device->workaround_address;
- anv_debug_dump_pc(pc);
- }
+ if (cmd_buffer->device->vk.enabled_extensions.EXT_descriptor_buffer) {
+ genX(cmd_buffer_emit_state_base_address)(cmd_buffer);
+ } else {
+ cmd_buffer->state.current_db_mode = ANV_CMD_DESCRIPTOR_BUFFER_MODE_LEGACY;
+ genX(cmd_buffer_emit_bt_pool_base_address)(cmd_buffer);
}
-#endif
- /* Render targets live in the same binding table as fragment descriptors */
- if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_RENDER_TARGETS)
- descriptors_dirty |= VK_SHADER_STAGE_FRAGMENT_BIT;
+ /* We sometimes store vertex data in the dynamic state buffer for blorp
+ * operations and our dynamic state stream may re-use data from previous
+ * command buffers. In order to prevent stale cache data, we flush the VF
+ * cache. We could do this on every blorp call but that's not really
+ * needed as all of the data will get written by the CPU prior to the GPU
+ * executing anything. The chances are fairly high that they will use
+ * blorp at least once per primary command buffer so it shouldn't be
+ * wasted.
+ *
+ * There is also a workaround on gfx8 which requires us to invalidate the
+ * VF cache occasionally. It's easier if we can assume we start with a
+ * fresh cache (See also genX(cmd_buffer_set_binding_for_gfx8_vb_flush).)
+ */
+ anv_add_pending_pipe_bits(cmd_buffer,
+ ANV_PIPE_VF_CACHE_INVALIDATE_BIT,
+ "new cmd buffer");
- /* We emit the binding tables and sampler tables first, then emit push
- * constants and then finally emit binding table and sampler table
- * pointers. It has to happen in this order, since emitting the binding
- * tables may change the push constants (in case of storage images). After
- * emitting push constants, on SKL+ we have to emit the corresponding
- * 3DSTATE_BINDING_TABLE_POINTER_* for the push constants to take effect.
+ /* Re-emit the aux table register in every command buffer. This way we're
+ * ensured that we have the table even if this command buffer doesn't
+ * initialize any images.
*/
- uint32_t dirty = 0;
- if (descriptors_dirty) {
- dirty = flush_descriptor_sets(cmd_buffer,
- &cmd_buffer->state.gfx.base,
- descriptors_dirty,
- pipeline->shaders,
- ARRAY_SIZE(pipeline->shaders));
- cmd_buffer->state.descriptors_dirty &= ~dirty;
- }
-
- if (dirty || cmd_buffer->state.push_constants_dirty) {
- /* Because we're pushing UBOs, we have to push whenever either
- * descriptors or push constants is dirty.
- */
- dirty |= cmd_buffer->state.push_constants_dirty;
- dirty &= ANV_STAGE_MASK & VK_SHADER_STAGE_ALL_GRAPHICS;
- cmd_buffer_flush_push_constants(cmd_buffer, dirty);
+ if (cmd_buffer->device->info->has_aux_map) {
+ anv_add_pending_pipe_bits(cmd_buffer,
+ ANV_PIPE_AUX_TABLE_INVALIDATE_BIT,
+ "new cmd buffer with aux-tt");
}
- if (dirty)
- cmd_buffer_emit_descriptor_pointers(cmd_buffer, dirty);
-
- cmd_buffer_emit_clip(cmd_buffer);
+ /* We send an "Indirect State Pointers Disable" packet at
+ * EndCommandBuffer, so all push constant packets are ignored during a
+ * context restore. Documentation says after that command, we need to
+ * emit push constants again before any rendering operation. So we
+ * flag them dirty here to make sure they get emitted.
+ */
+ cmd_buffer->state.push_constants_dirty |= VK_SHADER_STAGE_ALL_GRAPHICS;
+ cmd_buffer->state.gfx.base.push_constants_data_dirty = true;
- if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_DYNAMIC_RASTERIZER_DISCARD_ENABLE)
- cmd_buffer_emit_streamout(cmd_buffer);
+ if (cmd_buffer->usage_flags &
+ VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT) {
+ struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
+
+ char gcbiar_data[VK_GCBIARR_DATA_SIZE(MAX_RTS)];
+ const VkRenderingInfo *resume_info =
+ vk_get_command_buffer_inheritance_as_rendering_resume(cmd_buffer->vk.level,
+ pBeginInfo,
+ gcbiar_data);
+ if (resume_info != NULL) {
+ genX(CmdBeginRendering)(commandBuffer, resume_info);
+ } else {
+ const VkCommandBufferInheritanceRenderingInfo *inheritance_info =
+ vk_get_command_buffer_inheritance_rendering_info(cmd_buffer->vk.level,
+ pBeginInfo);
+ assert(inheritance_info);
+
+ gfx->rendering_flags = inheritance_info->flags;
+ gfx->render_area = (VkRect2D) { };
+ gfx->layer_count = 0;
+ gfx->samples = inheritance_info->rasterizationSamples;
+ gfx->view_mask = inheritance_info->viewMask;
+
+ uint32_t color_att_count = inheritance_info->colorAttachmentCount;
+ result = anv_cmd_buffer_init_attachments(cmd_buffer, color_att_count);
+ if (result != VK_SUCCESS)
+ return result;
+
+ for (uint32_t i = 0; i < color_att_count; i++) {
+ gfx->color_att[i].vk_format =
+ inheritance_info->pColorAttachmentFormats[i];
+ }
+ gfx->depth_att.vk_format =
+ inheritance_info->depthAttachmentFormat;
+ gfx->stencil_att.vk_format =
+ inheritance_info->stencilAttachmentFormat;
- if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_DYNAMIC_VIEWPORT)
- gfx8_cmd_buffer_emit_viewport(cmd_buffer);
+ anv_cmd_graphic_state_update_has_uint_rt(gfx);
- if (cmd_buffer->state.gfx.dirty & (ANV_CMD_DIRTY_DYNAMIC_VIEWPORT |
- ANV_CMD_DIRTY_PIPELINE)) {
- gfx8_cmd_buffer_emit_depth_viewport(cmd_buffer,
- pipeline->depth_clamp_enable);
+ cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_RENDER_AREA |
+ ANV_CMD_DIRTY_RENDER_TARGETS;
+ }
}
- if (cmd_buffer->state.gfx.dirty & (ANV_CMD_DIRTY_DYNAMIC_SCISSOR |
- ANV_CMD_DIRTY_RENDER_TARGETS))
- gfx7_cmd_buffer_emit_scissor(cmd_buffer);
+ /* Emit the sample pattern at the beginning of the batch because the
+ * default locations emitted at the device initialization might have been
+ * changed by a previous command buffer.
+ *
+ * Do not change that when we're continuing a previous renderpass.
+ */
+ if (cmd_buffer->device->vk.enabled_extensions.EXT_sample_locations &&
+ !(cmd_buffer->usage_flags & VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT))
+ genX(emit_sample_pattern)(&cmd_buffer->batch, NULL);
- genX(cmd_buffer_flush_dynamic_state)(cmd_buffer);
-}
+ if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY) {
+ const VkCommandBufferInheritanceConditionalRenderingInfoEXT *conditional_rendering_info =
+ vk_find_struct_const(pBeginInfo->pInheritanceInfo->pNext, COMMAND_BUFFER_INHERITANCE_CONDITIONAL_RENDERING_INFO_EXT);
-static void
-emit_vertex_bo(struct anv_cmd_buffer *cmd_buffer,
- struct anv_address addr,
- uint32_t size, uint32_t index)
-{
- uint32_t *p = anv_batch_emitn(&cmd_buffer->batch, 5,
- GENX(3DSTATE_VERTEX_BUFFERS));
-
- GENX(VERTEX_BUFFER_STATE_pack)(&cmd_buffer->batch, p + 1,
- &(struct GENX(VERTEX_BUFFER_STATE)) {
- .VertexBufferIndex = index,
- .AddressModifyEnable = true,
- .BufferPitch = 0,
- .MOCS = addr.bo ? anv_mocs(cmd_buffer->device, addr.bo,
- ISL_SURF_USAGE_VERTEX_BUFFER_BIT) : 0,
- .NullVertexBuffer = size == 0,
-#if GFX_VER >= 12
- .L3BypassDisable = true,
-#endif
-#if (GFX_VER >= 8)
- .BufferStartingAddress = addr,
- .BufferSize = size
-#else
- .BufferStartingAddress = addr,
- .EndAddress = anv_address_add(addr, size),
-#endif
- });
+ /* If secondary buffer supports conditional rendering
+ * we should emit commands as if conditional rendering is enabled.
+ */
+ cmd_buffer->state.conditional_render_enabled =
+ conditional_rendering_info && conditional_rendering_info->conditionalRenderingEnable;
- genX(cmd_buffer_set_binding_for_gfx8_vb_flush)(cmd_buffer,
- index, addr, size);
-}
+ if (pBeginInfo->pInheritanceInfo->occlusionQueryEnable) {
+ cmd_buffer->state.gfx.n_occlusion_queries = 1;
+ cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_OCCLUSION_QUERY_ACTIVE;
+ }
+ }
-static void
-emit_base_vertex_instance_bo(struct anv_cmd_buffer *cmd_buffer,
- struct anv_address addr)
-{
- emit_vertex_bo(cmd_buffer, addr, addr.bo ? 8 : 0, ANV_SVGS_VB_INDEX);
+ return VK_SUCCESS;
}
+/* From the PRM, Volume 2a:
+ *
+ * "Indirect State Pointers Disable
+ *
+ * At the completion of the post-sync operation associated with this pipe
+ * control packet, the indirect state pointers in the hardware are
+ * considered invalid; the indirect pointers are not saved in the context.
+ * If any new indirect state commands are executed in the command stream
+ * while the pipe control is pending, the new indirect state commands are
+ * preserved.
+ *
+ * [DevIVB+]: Using Invalidate State Pointer (ISP) only inhibits context
+ * restoring of Push Constant (3DSTATE_CONSTANT_*) commands. Push Constant
+ * commands are only considered as Indirect State Pointers. Once ISP is
+ * issued in a context, SW must initialize by programming push constant
+ * commands for all the shaders (at least to zero length) before attempting
+ * any rendering operation for the same context."
+ *
+ * 3DSTATE_CONSTANT_* packets are restored during a context restore,
+ * even though they point to a BO that has been already unreferenced at
+ * the end of the previous batch buffer. This has been fine so far since
+ * we are protected by the scratch page (every address not covered by
+ * a BO should be pointing to the scratch page). But on CNL, it is
+ * causing a GPU hang during context restore at the 3DSTATE_CONSTANT_*
+ * instruction.
+ *
+ * The flag "Indirect State Pointers Disable" in PIPE_CONTROL tells the
+ * hardware to ignore previous 3DSTATE_CONSTANT_* packets during a
+ * context restore, so the mentioned hang doesn't happen. However,
+ * software must program push constant commands for all stages prior to
+ * rendering anything. So we flag them dirty in BeginCommandBuffer.
+ *
+ * Finally, we also make sure to stall at pixel scoreboard to make sure the
+ * constants have been loaded into the EUs prior to disabling the push constants
+ * so that it doesn't hang a previous 3DPRIMITIVE.
+ */
static void
-emit_base_vertex_instance(struct anv_cmd_buffer *cmd_buffer,
- uint32_t base_vertex, uint32_t base_instance)
+emit_isp_disable(struct anv_cmd_buffer *cmd_buffer)
{
- if (base_vertex == 0 && base_instance == 0) {
- emit_base_vertex_instance_bo(cmd_buffer, ANV_NULL_ADDRESS);
- } else {
- struct anv_state id_state =
- anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, 8, 4);
-
- ((uint32_t *)id_state.map)[0] = base_vertex;
- ((uint32_t *)id_state.map)[1] = base_instance;
-
- struct anv_address addr = {
- .bo = cmd_buffer->device->dynamic_state_pool.block_pool.bo,
- .offset = id_state.offset,
- };
-
- emit_base_vertex_instance_bo(cmd_buffer, addr);
+ genx_batch_emit_pipe_control(&cmd_buffer->batch,
+ cmd_buffer->device->info,
+ cmd_buffer->state.current_pipeline,
+ ANV_PIPE_CS_STALL_BIT |
+ ANV_PIPE_STALL_AT_SCOREBOARD_BIT);
+ anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
+ pc.IndirectStatePointersDisable = true;
+ pc.CommandStreamerStallEnable = true;
+ anv_debug_dump_pc(pc, __func__);
}
}
-static void
-emit_draw_index(struct anv_cmd_buffer *cmd_buffer, uint32_t draw_index)
+static VkResult
+end_command_buffer(struct anv_cmd_buffer *cmd_buffer)
{
- struct anv_state state =
- anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, 4, 4);
+ if (anv_batch_has_error(&cmd_buffer->batch))
+ return cmd_buffer->batch.status;
- ((uint32_t *)state.map)[0] = draw_index;
+ anv_measure_endcommandbuffer(cmd_buffer);
- struct anv_address addr = {
- .bo = cmd_buffer->device->dynamic_state_pool.block_pool.bo,
- .offset = state.offset,
- };
+ if (anv_cmd_buffer_is_video_queue(cmd_buffer) ||
+ anv_cmd_buffer_is_blitter_queue(cmd_buffer)) {
+ trace_intel_end_cmd_buffer(&cmd_buffer->trace, cmd_buffer->vk.level);
+ genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
+ anv_cmd_buffer_end_batch_buffer(cmd_buffer);
+ return VK_SUCCESS;
+ }
- emit_vertex_bo(cmd_buffer, addr, 4, ANV_DRAWID_VB_INDEX);
-}
+ /* Flush query clears using blorp so that secondary query writes do not
+ * race with the clear.
+ */
+ if (cmd_buffer->state.queries.clear_bits) {
+ anv_add_pending_pipe_bits(cmd_buffer,
+ ANV_PIPE_QUERY_BITS(cmd_buffer->state.queries.clear_bits),
+ "query clear flush prior command buffer end");
+ }
-static void
-update_dirty_vbs_for_gfx8_vb_flush(struct anv_cmd_buffer *cmd_buffer,
- uint32_t access_type)
-{
- struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
- const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
-
- uint64_t vb_used = pipeline->vb_used;
- if (vs_prog_data->uses_firstvertex ||
- vs_prog_data->uses_baseinstance)
- vb_used |= 1ull << ANV_SVGS_VB_INDEX;
- if (vs_prog_data->uses_drawid)
- vb_used |= 1ull << ANV_DRAWID_VB_INDEX;
-
- genX(cmd_buffer_update_dirty_vbs_for_gfx8_vb_flush)(cmd_buffer,
- access_type == RANDOM,
- vb_used);
-}
+ genX(cmd_buffer_flush_generated_draws)(cmd_buffer);
-ALWAYS_INLINE static void
-cmd_buffer_emit_vertex_constants_and_flush(struct anv_cmd_buffer *cmd_buffer,
- const struct brw_vs_prog_data *vs_prog_data,
- uint32_t base_vertex,
- uint32_t base_instance,
- uint32_t draw_id,
- bool force_flush)
-{
- bool emitted = false;
- if (vs_prog_data->uses_firstvertex ||
- vs_prog_data->uses_baseinstance) {
- emit_base_vertex_instance(cmd_buffer, base_vertex, base_instance);
- emitted = true;
- }
- if (vs_prog_data->uses_drawid) {
- emit_draw_index(cmd_buffer, draw_id);
- emitted = true;
- }
- /* Emitting draw index or vertex index BOs may result in needing
- * additional VF cache flushes.
+ /* Turn on object level preemption if it is disabled to have it in known
+ * state at the beginning of new command buffer.
*/
- if (emitted || force_flush)
- genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
-}
-
-void genX(CmdDraw)(
- VkCommandBuffer commandBuffer,
- uint32_t vertexCount,
- uint32_t instanceCount,
- uint32_t firstVertex,
- uint32_t firstInstance)
-{
- ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
- struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
- const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
+ if (!cmd_buffer->state.gfx.object_preemption)
+ genX(cmd_buffer_set_preemption)(cmd_buffer, true);
- if (anv_batch_has_error(&cmd_buffer->batch))
- return;
+ /* We want every command buffer to start with the PMA fix in a known state,
+ * so we disable it at the end of the command buffer.
+ */
+ genX(cmd_buffer_enable_pma_fix)(cmd_buffer, false);
- const uint32_t count = (vertexCount *
- instanceCount *
- (pipeline->use_primitive_replication ?
- 1 : anv_subpass_view_count(cmd_buffer->state.subpass)));
- anv_measure_snapshot(cmd_buffer,
- INTEL_SNAPSHOT_DRAW,
- "draw", count);
+ /* Wa_14015814527
+ *
+ * Apply task URB workaround at the end of a primary or secondary cmd_buffer.
+ */
+ genX(apply_task_urb_workaround)(cmd_buffer);
- genX(cmd_buffer_flush_state)(cmd_buffer);
+ genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
- if (cmd_buffer->state.conditional_render_enabled)
- genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
+ emit_isp_disable(cmd_buffer);
- cmd_buffer_emit_vertex_constants_and_flush(cmd_buffer, vs_prog_data,
- firstVertex, firstInstance, 0,
- true);
+#if GFX_VER >= 12
+ if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY &&
+ cmd_buffer->vk.pool->flags & VK_COMMAND_POOL_CREATE_PROTECTED_BIT)
+ genX(cmd_buffer_set_protected_memory)(cmd_buffer, false);
+#endif
- /* Our implementation of VK_KHR_multiview uses instancing to draw the
- * different views. We need to multiply instanceCount by the view count.
- */
- if (!pipeline->use_primitive_replication)
- instanceCount *= anv_subpass_view_count(cmd_buffer->state.subpass);
+ trace_intel_end_cmd_buffer(&cmd_buffer->trace, cmd_buffer->vk.level);
- anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
- prim.PredicateEnable = cmd_buffer->state.conditional_render_enabled;
- prim.VertexAccessType = SEQUENTIAL;
- prim.PrimitiveTopologyType = cmd_buffer->state.gfx.primitive_topology;
- prim.VertexCountPerInstance = vertexCount;
- prim.StartVertexLocation = firstVertex;
- prim.InstanceCount = instanceCount;
- prim.StartInstanceLocation = firstInstance;
- prim.BaseVertexLocation = 0;
- }
+ anv_cmd_buffer_end_batch_buffer(cmd_buffer);
- update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, SEQUENTIAL);
+ return VK_SUCCESS;
}
-void genX(CmdDrawMultiEXT)(
- VkCommandBuffer commandBuffer,
- uint32_t drawCount,
- const VkMultiDrawInfoEXT *pVertexInfo,
- uint32_t instanceCount,
- uint32_t firstInstance,
- uint32_t stride)
+VkResult
+genX(EndCommandBuffer)(
+ VkCommandBuffer commandBuffer)
{
ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
- struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
- const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
- if (anv_batch_has_error(&cmd_buffer->batch))
- return;
-
- const uint32_t count = (drawCount *
- instanceCount *
- (pipeline->use_primitive_replication ?
- 1 : anv_subpass_view_count(cmd_buffer->state.subpass)));
- anv_measure_snapshot(cmd_buffer,
- INTEL_SNAPSHOT_DRAW,
- "draw_multi", count);
-
- genX(cmd_buffer_flush_state)(cmd_buffer);
-
- if (cmd_buffer->state.conditional_render_enabled)
- genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
+ VkResult status = end_command_buffer(cmd_buffer);
+ if (status != VK_SUCCESS)
+ return status;
- /* Our implementation of VK_KHR_multiview uses instancing to draw the
- * different views. We need to multiply instanceCount by the view count.
+ /* If there was MSAA access over the compute/transfer queue, a companion
+ * RCS command buffer was recorded, so end it properly as well.
*/
- if (!pipeline->use_primitive_replication)
- instanceCount *= anv_subpass_view_count(cmd_buffer->state.subpass);
-
- uint32_t i = 0;
- vk_foreach_multi_draw(draw, i, pVertexInfo, drawCount, stride) {
- cmd_buffer_emit_vertex_constants_and_flush(cmd_buffer, vs_prog_data,
- draw->firstVertex,
- firstInstance, i, !i);
-
- anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
- prim.PredicateEnable = cmd_buffer->state.conditional_render_enabled;
- prim.VertexAccessType = SEQUENTIAL;
- prim.PrimitiveTopologyType = cmd_buffer->state.gfx.primitive_topology;
- prim.VertexCountPerInstance = draw->vertexCount;
- prim.StartVertexLocation = draw->firstVertex;
- prim.InstanceCount = instanceCount;
- prim.StartInstanceLocation = firstInstance;
- prim.BaseVertexLocation = 0;
- }
+ if (cmd_buffer->companion_rcs_cmd_buffer) {
+ assert(anv_cmd_buffer_is_compute_queue(cmd_buffer) ||
+ anv_cmd_buffer_is_blitter_queue(cmd_buffer));
+ status = end_command_buffer(cmd_buffer->companion_rcs_cmd_buffer);
}
- update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, SEQUENTIAL);
+ ANV_RMV(cmd_buffer_create, cmd_buffer->device, cmd_buffer);
+
+ return status;
}
-void genX(CmdDrawIndexed)(
- VkCommandBuffer commandBuffer,
- uint32_t indexCount,
- uint32_t instanceCount,
- uint32_t firstIndex,
- int32_t vertexOffset,
- uint32_t firstInstance)
+static void
+cmd_buffer_emit_copy_ts_buffer(struct u_trace_context *utctx,
+ void *cmdstream,
+ void *ts_from, uint32_t from_offset,
+ void *ts_to, uint32_t to_offset,
+ uint32_t count)
{
- ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
- struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
- const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
-
- if (anv_batch_has_error(&cmd_buffer->batch))
- return;
-
- const uint32_t count = (indexCount *
- instanceCount *
- (pipeline->use_primitive_replication ?
- 1 : anv_subpass_view_count(cmd_buffer->state.subpass)));
- anv_measure_snapshot(cmd_buffer,
- INTEL_SNAPSHOT_DRAW,
- "draw indexed",
- count);
-
- genX(cmd_buffer_flush_state)(cmd_buffer);
-
- if (cmd_buffer->state.conditional_render_enabled)
- genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
-
- cmd_buffer_emit_vertex_constants_and_flush(cmd_buffer, vs_prog_data, vertexOffset, firstInstance, 0, true);
-
- /* Our implementation of VK_KHR_multiview uses instancing to draw the
- * different views. We need to multiply instanceCount by the view count.
- */
- if (!pipeline->use_primitive_replication)
- instanceCount *= anv_subpass_view_count(cmd_buffer->state.subpass);
-
- anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
- prim.PredicateEnable = cmd_buffer->state.conditional_render_enabled;
- prim.VertexAccessType = RANDOM;
- prim.PrimitiveTopologyType = cmd_buffer->state.gfx.primitive_topology;
- prim.VertexCountPerInstance = indexCount;
- prim.StartVertexLocation = firstIndex;
- prim.InstanceCount = instanceCount;
- prim.StartInstanceLocation = firstInstance;
- prim.BaseVertexLocation = vertexOffset;
- }
-
- update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, RANDOM);
+ struct anv_memcpy_state *memcpy_state = cmdstream;
+ struct anv_address from_addr = (struct anv_address) {
+ .bo = ts_from, .offset = from_offset * sizeof(uint64_t) };
+ struct anv_address to_addr = (struct anv_address) {
+ .bo = ts_to, .offset = to_offset * sizeof(uint64_t) };
+
+ genX(emit_so_memcpy)(memcpy_state, to_addr, from_addr,
+ count * sizeof(uint64_t));
}
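
The u_trace copy callback above works in 64-bit timestamp slots and converts indices to byte offsets before the GPU memcpy. A CPU-side sketch of the same index-to-byte conversion, with made-up data:

/* CPU-side stand-in for the GPU copy: same index-to-byte conversion. */
#include <assert.h>
#include <stdint.h>
#include <string.h>

static void copy_timestamps(uint64_t *to, uint32_t to_idx,
                            const uint64_t *from, uint32_t from_idx,
                            uint32_t count)
{
   memcpy((char *)to + to_idx * sizeof(uint64_t),
          (const char *)from + from_idx * sizeof(uint64_t),
          count * sizeof(uint64_t));
}

int main(void)
{
   uint64_t src[4] = { 10, 20, 30, 40 };
   uint64_t dst[4] = { 0, 0, 0, 0 };

   copy_timestamps(dst, 1, src, 2, 2);   /* dst[1..2] = src[2..3] */
   assert(dst[1] == 30 && dst[2] == 40);
   return 0;
}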
-void genX(CmdDrawMultiIndexedEXT)(
+void
+genX(CmdExecuteCommands)(
VkCommandBuffer commandBuffer,
- uint32_t drawCount,
- const VkMultiDrawIndexedInfoEXT *pIndexInfo,
- uint32_t instanceCount,
- uint32_t firstInstance,
- uint32_t stride,
- const int32_t *pVertexOffset)
+ uint32_t commandBufferCount,
+ const VkCommandBuffer* pCmdBuffers)
{
- ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
- struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
- const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
+ ANV_FROM_HANDLE(anv_cmd_buffer, container, commandBuffer);
- if (anv_batch_has_error(&cmd_buffer->batch))
+ struct anv_device *device = container->device;
+
+ if (anv_batch_has_error(&container->batch))
return;
- const uint32_t count = (drawCount *
- instanceCount *
- (pipeline->use_primitive_replication ?
- 1 : anv_subpass_view_count(cmd_buffer->state.subpass)));
- anv_measure_snapshot(cmd_buffer,
- INTEL_SNAPSHOT_DRAW,
- "draw indexed_multi",
- count);
+ /* The secondary command buffers will assume that the PMA fix is disabled
+ * when they begin executing. Make sure this is true.
+ */
+ genX(cmd_buffer_enable_pma_fix)(container, false);
- genX(cmd_buffer_flush_state)(cmd_buffer);
+ /* Turn on preemption in case it was toggled off. */
+ if (!container->state.gfx.object_preemption)
+ genX(cmd_buffer_set_preemption)(container, true);
- if (cmd_buffer->state.conditional_render_enabled)
- genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
+ /* Wa_14015814527
+ *
+ * Apply task URB workaround before secondary cmd buffers.
+ */
+ genX(apply_task_urb_workaround)(container);
- /* Our implementation of VK_KHR_multiview uses instancing to draw the
- * different views. We need to multiply instanceCount by the view count.
+ /* Flush query clears using blorp so that secondary query writes do not
+ * race with the clear.
*/
- if (!pipeline->use_primitive_replication)
- instanceCount *= anv_subpass_view_count(cmd_buffer->state.subpass);
-
- uint32_t i = 0;
- if (pVertexOffset) {
- if (vs_prog_data->uses_drawid) {
- bool emitted = true;
- if (vs_prog_data->uses_firstvertex ||
- vs_prog_data->uses_baseinstance) {
- emit_base_vertex_instance(cmd_buffer, *pVertexOffset, firstInstance);
- emitted = true;
- }
- vk_foreach_multi_draw_indexed(draw, i, pIndexInfo, drawCount, stride) {
- if (vs_prog_data->uses_drawid) {
- emit_draw_index(cmd_buffer, i);
- emitted = true;
- }
- /* Emitting draw index or vertex index BOs may result in needing
- * additional VF cache flushes.
- */
- if (emitted)
- genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
-
- anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
- prim.PredicateEnable = cmd_buffer->state.conditional_render_enabled;
- prim.VertexAccessType = RANDOM;
- prim.PrimitiveTopologyType = cmd_buffer->state.gfx.primitive_topology;
- prim.VertexCountPerInstance = draw->indexCount;
- prim.StartVertexLocation = draw->firstIndex;
- prim.InstanceCount = instanceCount;
- prim.StartInstanceLocation = firstInstance;
- prim.BaseVertexLocation = *pVertexOffset;
- }
- emitted = false;
- }
- } else {
- if (vs_prog_data->uses_firstvertex ||
- vs_prog_data->uses_baseinstance) {
- emit_base_vertex_instance(cmd_buffer, *pVertexOffset, firstInstance);
- /* Emitting draw index or vertex index BOs may result in needing
- * additional VF cache flushes.
- */
- genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
- }
- vk_foreach_multi_draw_indexed(draw, i, pIndexInfo, drawCount, stride) {
- anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
- prim.PredicateEnable = cmd_buffer->state.conditional_render_enabled;
- prim.VertexAccessType = RANDOM;
- prim.PrimitiveTopologyType = cmd_buffer->state.gfx.primitive_topology;
- prim.VertexCountPerInstance = draw->indexCount;
- prim.StartVertexLocation = draw->firstIndex;
- prim.InstanceCount = instanceCount;
- prim.StartInstanceLocation = firstInstance;
- prim.BaseVertexLocation = *pVertexOffset;
- }
- }
- }
- } else {
- vk_foreach_multi_draw_indexed(draw, i, pIndexInfo, drawCount, stride) {
- cmd_buffer_emit_vertex_constants_and_flush(cmd_buffer, vs_prog_data,
- draw->vertexOffset,
- firstInstance, i, i != 0);
-
- anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
- prim.PredicateEnable = cmd_buffer->state.conditional_render_enabled;
- prim.VertexAccessType = RANDOM;
- prim.PrimitiveTopologyType = cmd_buffer->state.gfx.primitive_topology;
- prim.VertexCountPerInstance = draw->indexCount;
- prim.StartVertexLocation = draw->firstIndex;
- prim.InstanceCount = instanceCount;
- prim.StartInstanceLocation = firstInstance;
- prim.BaseVertexLocation = draw->vertexOffset;
- }
- }
+ if (container->state.queries.clear_bits) {
+ anv_add_pending_pipe_bits(container,
+ ANV_PIPE_QUERY_BITS(container->state.queries.clear_bits),
+ "query clear flush prior to secondary buffer");
}
- update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, RANDOM);
-}
-
-/* Auto-Draw / Indirect Registers */
-#define GFX7_3DPRIM_END_OFFSET 0x2420
-#define GFX7_3DPRIM_START_VERTEX 0x2430
-#define GFX7_3DPRIM_VERTEX_COUNT 0x2434
-#define GFX7_3DPRIM_INSTANCE_COUNT 0x2438
-#define GFX7_3DPRIM_START_INSTANCE 0x243C
-#define GFX7_3DPRIM_BASE_VERTEX 0x2440
-
-void genX(CmdDrawIndirectByteCountEXT)(
- VkCommandBuffer commandBuffer,
- uint32_t instanceCount,
- uint32_t firstInstance,
- VkBuffer counterBuffer,
- VkDeviceSize counterBufferOffset,
- uint32_t counterOffset,
- uint32_t vertexStride)
-{
-#if GFX_VERx10 >= 75
- ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
- ANV_FROM_HANDLE(anv_buffer, counter_buffer, counterBuffer);
- struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
- const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
-
- /* firstVertex is always zero for this draw function */
- const uint32_t firstVertex = 0;
-
- if (anv_batch_has_error(&cmd_buffer->batch))
- return;
-
- anv_measure_snapshot(cmd_buffer,
- INTEL_SNAPSHOT_DRAW,
- "draw indirect byte count",
- instanceCount);
+ /* The secondary command buffers don't know which textures etc. have been
+ * flushed prior to their execution. Apply those flushes now.
+ */
+ genX(cmd_buffer_apply_pipe_flushes)(container);
- genX(cmd_buffer_flush_state)(cmd_buffer);
+ genX(cmd_buffer_flush_generated_draws)(container);
- if (vs_prog_data->uses_firstvertex ||
- vs_prog_data->uses_baseinstance)
- emit_base_vertex_instance(cmd_buffer, firstVertex, firstInstance);
- if (vs_prog_data->uses_drawid)
- emit_draw_index(cmd_buffer, 0);
+ UNUSED enum anv_cmd_descriptor_buffer_mode db_mode =
+ container->state.current_db_mode;
- /* Emitting draw index or vertex index BOs may result in needing
- * additional VF cache flushes.
+ /* Do a first pass to copy the surface state content of the render targets
+ * if needed.
*/
- genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
+ bool need_surface_state_copy = false;
+ for (uint32_t i = 0; i < commandBufferCount; i++) {
+ ANV_FROM_HANDLE(anv_cmd_buffer, secondary, pCmdBuffers[i]);
- /* Our implementation of VK_KHR_multiview uses instancing to draw the
- * different views. We need to multiply instanceCount by the view count.
- */
- if (!pipeline->use_primitive_replication)
- instanceCount *= anv_subpass_view_count(cmd_buffer->state.subpass);
+ if (secondary->usage_flags &
+ VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT) {
+ need_surface_state_copy = true;
+ break;
+ }
+ }
- struct mi_builder b;
- mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);
- struct mi_value count =
- mi_mem32(anv_address_add(counter_buffer->address,
- counterBufferOffset));
- if (counterOffset)
- count = mi_isub(&b, count, mi_imm(counterOffset));
- count = mi_udiv32_imm(&b, count, vertexStride);
- mi_store(&b, mi_reg32(GFX7_3DPRIM_VERTEX_COUNT), count);
-
- mi_store(&b, mi_reg32(GFX7_3DPRIM_START_VERTEX), mi_imm(firstVertex));
- mi_store(&b, mi_reg32(GFX7_3DPRIM_INSTANCE_COUNT), mi_imm(instanceCount));
- mi_store(&b, mi_reg32(GFX7_3DPRIM_START_INSTANCE), mi_imm(firstInstance));
- mi_store(&b, mi_reg32(GFX7_3DPRIM_BASE_VERTEX), mi_imm(0));
-
- anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
- prim.IndirectParameterEnable = true;
- prim.VertexAccessType = SEQUENTIAL;
- prim.PrimitiveTopologyType = cmd_buffer->state.gfx.primitive_topology;
- }
-
- update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, SEQUENTIAL);
-#endif /* GFX_VERx10 >= 75 */
-}
+ if (need_surface_state_copy) {
+ if (container->vk.pool->flags & VK_COMMAND_POOL_CREATE_PROTECTED_BIT)
+ genX(cmd_buffer_set_protected_memory)(container, false);
-static void
-load_indirect_parameters(struct anv_cmd_buffer *cmd_buffer,
- struct anv_address addr,
- bool indexed)
-{
- struct mi_builder b;
- mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);
+ /* The memcpy will take care of the 3D preemption requirements. */
+ struct anv_memcpy_state memcpy_state;
+ genX(emit_so_memcpy_init)(&memcpy_state, device, &container->batch);
- mi_store(&b, mi_reg32(GFX7_3DPRIM_VERTEX_COUNT),
- mi_mem32(anv_address_add(addr, 0)));
+ for (uint32_t i = 0; i < commandBufferCount; i++) {
+ ANV_FROM_HANDLE(anv_cmd_buffer, secondary, pCmdBuffers[i]);
- struct mi_value instance_count = mi_mem32(anv_address_add(addr, 4));
- unsigned view_count = anv_subpass_view_count(cmd_buffer->state.subpass);
- if (view_count > 1) {
-#if GFX_VERx10 >= 75
- instance_count = mi_imul_imm(&b, instance_count, view_count);
-#else
- anv_finishme("Multiview + indirect draw requires MI_MATH; "
- "MI_MATH is not supported on Ivy Bridge");
-#endif
- }
- mi_store(&b, mi_reg32(GFX7_3DPRIM_INSTANCE_COUNT), instance_count);
+ assert(secondary->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY);
+ assert(!anv_batch_has_error(&secondary->batch));
- mi_store(&b, mi_reg32(GFX7_3DPRIM_START_VERTEX),
- mi_mem32(anv_address_add(addr, 8)));
+ if (secondary->usage_flags &
+ VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT) {
+ /* If we're continuing a render pass from the container, we need
+ * to copy the surface states for the current subpass into the
+ * storage we allocated for them in BeginCommandBuffer.
+ */
+ struct anv_state src_state = container->state.gfx.att_states;
+ struct anv_state dst_state = secondary->state.gfx.att_states;
+ assert(src_state.alloc_size == dst_state.alloc_size);
+
+ genX(emit_so_memcpy)(
+ &memcpy_state,
+ anv_state_pool_state_address(&device->internal_surface_state_pool,
+ dst_state),
+ anv_state_pool_state_address(&device->internal_surface_state_pool,
+ src_state),
+ src_state.alloc_size);
+ }
+ }
+ genX(emit_so_memcpy_fini)(&memcpy_state);
- if (indexed) {
- mi_store(&b, mi_reg32(GFX7_3DPRIM_BASE_VERTEX),
- mi_mem32(anv_address_add(addr, 12)));
- mi_store(&b, mi_reg32(GFX7_3DPRIM_START_INSTANCE),
- mi_mem32(anv_address_add(addr, 16)));
- } else {
- mi_store(&b, mi_reg32(GFX7_3DPRIM_START_INSTANCE),
- mi_mem32(anv_address_add(addr, 12)));
- mi_store(&b, mi_reg32(GFX7_3DPRIM_BASE_VERTEX), mi_imm(0));
+ if (container->vk.pool->flags & VK_COMMAND_POOL_CREATE_PROTECTED_BIT)
+ genX(cmd_buffer_set_protected_memory)(container, true);
}
-}
-
-void genX(CmdDrawIndirect)(
- VkCommandBuffer commandBuffer,
- VkBuffer _buffer,
- VkDeviceSize offset,
- uint32_t drawCount,
- uint32_t stride)
-{
- ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
- ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
- struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
- const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
- if (anv_batch_has_error(&cmd_buffer->batch))
- return;
+ /* Ensure preemption is enabled (assumed for all secondaries) */
+ genX(cmd_buffer_set_preemption)(container, true);
- genX(cmd_buffer_flush_state)(cmd_buffer);
+ for (uint32_t i = 0; i < commandBufferCount; i++) {
+ ANV_FROM_HANDLE(anv_cmd_buffer, secondary, pCmdBuffers[i]);
- if (cmd_buffer->state.conditional_render_enabled)
- genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
+ assert(secondary->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY);
+ assert(!anv_batch_has_error(&secondary->batch));
- for (uint32_t i = 0; i < drawCount; i++) {
- struct anv_address draw = anv_address_add(buffer->address, offset);
+ if (secondary->state.conditional_render_enabled) {
+ if (!container->state.conditional_render_enabled) {
+ /* The secondary buffer is constructed as if it will be executed
+ * with conditional rendering, so we should satisfy this dependency
+ * regardless of whether conditional rendering is enabled in the
+ * container.
+ */
+ struct mi_builder b;
+ mi_builder_init(&b, device->info, &container->batch);
+ mi_store(&b, mi_reg64(ANV_PREDICATE_RESULT_REG),
+ mi_imm(UINT64_MAX));
+ }
+ }
- if (vs_prog_data->uses_firstvertex ||
- vs_prog_data->uses_baseinstance)
- emit_base_vertex_instance_bo(cmd_buffer, anv_address_add(draw, 8));
- if (vs_prog_data->uses_drawid)
- emit_draw_index(cmd_buffer, i);
+ anv_cmd_buffer_add_secondary(container, secondary);
- /* Emitting draw index or vertex index BOs may result in needing
- * additional VF cache flushes.
+ /* If the secondary has a valid companion RCS command buffer, add it to
+ * the container's RCS command buffer for execution.
*/
- genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
-
- load_indirect_parameters(cmd_buffer, draw, false);
+ if (secondary->companion_rcs_cmd_buffer != NULL) {
+ VkResult result = anv_cmd_buffer_ensure_rcs_companion(container);
+ if (result != VK_SUCCESS) {
+ anv_batch_set_error(&container->batch, result);
+ return;
+ }
- anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
- prim.IndirectParameterEnable = true;
- prim.PredicateEnable = cmd_buffer->state.conditional_render_enabled;
- prim.VertexAccessType = SEQUENTIAL;
- prim.PrimitiveTopologyType = cmd_buffer->state.gfx.primitive_topology;
+ anv_cmd_buffer_add_secondary(container->companion_rcs_cmd_buffer,
+ secondary->companion_rcs_cmd_buffer);
}
- update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, SEQUENTIAL);
+ assert(secondary->perf_query_pool == NULL || container->perf_query_pool == NULL ||
+ secondary->perf_query_pool == container->perf_query_pool);
+ if (secondary->perf_query_pool)
+ container->perf_query_pool = secondary->perf_query_pool;
- offset += stride;
- }
-}
+#if INTEL_NEEDS_WA_1808121037
+ if (secondary->state.depth_reg_mode != ANV_DEPTH_REG_MODE_UNKNOWN)
+ container->state.depth_reg_mode = secondary->state.depth_reg_mode;
+#endif
-void genX(CmdDrawIndexedIndirect)(
- VkCommandBuffer commandBuffer,
- VkBuffer _buffer,
- VkDeviceSize offset,
- uint32_t drawCount,
- uint32_t stride)
-{
- ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
- ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
- struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
- const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
+ container->state.gfx.viewport_set |= secondary->state.gfx.viewport_set;
- if (anv_batch_has_error(&cmd_buffer->batch))
- return;
+ db_mode = secondary->state.current_db_mode;
+ }
+
+ /* The secondary isn't counted in our VF cache tracking so we need to
+ * invalidate the whole thing.
+ */
+ if (GFX_VER == 9) {
+ anv_add_pending_pipe_bits(container,
+ ANV_PIPE_CS_STALL_BIT | ANV_PIPE_VF_CACHE_INVALIDATE_BIT,
+ "Secondary cmd buffer not tracked in VF cache");
+ }
- genX(cmd_buffer_flush_state)(cmd_buffer);
+#if INTEL_WA_16014538804_GFX_VER
+ if (anv_cmd_buffer_is_render_queue(container) &&
+ intel_needs_workaround(device->info, 16014538804))
+ anv_batch_emit(&container->batch, GENX(PIPE_CONTROL), pc);
+#endif
- if (cmd_buffer->state.conditional_render_enabled)
- genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
+ /* The secondary may have selected a different pipeline (3D or compute) and
+ * may have changed the current L3$ configuration. Reset our tracking
+ * variables to invalid values to ensure that we re-emit these in the case
+ * where we do any draws or compute dispatches from the container after the
+ * secondary has returned.
+ */
+ container->state.current_pipeline = UINT32_MAX;
+ container->state.current_l3_config = NULL;
+ container->state.current_hash_scale = 0;
+ container->state.gfx.push_constant_stages = 0;
+ container->state.gfx.ds_write_state = false;
- for (uint32_t i = 0; i < drawCount; i++) {
- struct anv_address draw = anv_address_add(buffer->address, offset);
+ memset(&container->state.gfx.urb_cfg, 0, sizeof(struct intel_urb_config));
- /* TODO: We need to stomp base vertex to 0 somehow */
- if (vs_prog_data->uses_firstvertex ||
- vs_prog_data->uses_baseinstance)
- emit_base_vertex_instance_bo(cmd_buffer, anv_address_add(draw, 12));
- if (vs_prog_data->uses_drawid)
- emit_draw_index(cmd_buffer, i);
+ /* Reemit all GFX instructions in container */
+ memcpy(container->state.gfx.dyn_state.dirty,
+ device->gfx_dirty_state,
+ sizeof(container->state.gfx.dyn_state.dirty));
+ if (container->device->vk.enabled_extensions.KHR_fragment_shading_rate) {
+ /* Also recompute the CPS_STATE offset */
+ struct vk_dynamic_graphics_state *dyn =
+ &container->vk.dynamic_graphics_state;
+ BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_FSR);
+ }
- /* Emitting draw index or vertex index BOs may result in needing
- * additional VF cache flushes.
+ /* Each of the secondary command buffers will use its own state base
+ * address. We need to re-emit state base address for the container after
+ * all of the secondaries are done.
+ */
+ if (container->device->vk.enabled_extensions.EXT_descriptor_buffer) {
+#if GFX_VERx10 >= 125
+ /* If the last secondary had a different mode, reemit the last pending
+ * mode. Otherwise, we can do a lighter binding table pool update.
*/
- genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
+ if (db_mode != container->state.current_db_mode) {
+ container->state.current_db_mode = db_mode;
+ genX(cmd_buffer_emit_state_base_address)(container);
+ } else {
+ genX(cmd_buffer_emit_bt_pool_base_address)(container);
+ }
+#else
+ genX(cmd_buffer_emit_state_base_address)(container);
+#endif
+ } else {
+ genX(cmd_buffer_emit_bt_pool_base_address)(container);
+ }
- load_indirect_parameters(cmd_buffer, draw, true);
+ /* Copy utrace timestamp buffers from the secondaries into the container */
+ if (u_trace_enabled(&device->ds.trace_context)) {
+ trace_intel_begin_trace_copy(&container->trace);
- anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
- prim.IndirectParameterEnable = true;
- prim.PredicateEnable = cmd_buffer->state.conditional_render_enabled;
- prim.VertexAccessType = RANDOM;
- prim.PrimitiveTopologyType = cmd_buffer->state.gfx.primitive_topology;
+ struct anv_memcpy_state memcpy_state;
+ genX(emit_so_memcpy_init)(&memcpy_state, device, &container->batch);
+ uint32_t num_traces = 0;
+ for (uint32_t i = 0; i < commandBufferCount; i++) {
+ ANV_FROM_HANDLE(anv_cmd_buffer, secondary, pCmdBuffers[i]);
+
+ num_traces += secondary->trace.num_traces;
+ u_trace_clone_append(u_trace_begin_iterator(&secondary->trace),
+ u_trace_end_iterator(&secondary->trace),
+ &container->trace,
+ &memcpy_state,
+ cmd_buffer_emit_copy_ts_buffer);
}
+ genX(emit_so_memcpy_fini)(&memcpy_state);
- update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, RANDOM);
+ trace_intel_end_trace_copy(&container->trace, num_traces);
- offset += stride;
+ /* Memcpy is done using the 3D pipeline. */
+ container->state.current_pipeline = _3D;
}
}
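
For reference, a minimal application-side sketch of the recording pattern the code above handles (illustrative only; primary, secondary, render_pass and framebuffer are assumed, pre-existing handles). Recording the secondary with RENDER_PASS_CONTINUE_BIT is what triggers the surface-state copy pass, and executing any secondary resets the container's pipeline/L3/state-base-address tracking:

   VkCommandBufferInheritanceInfo inherit = {
      .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_INHERITANCE_INFO,
      .renderPass = render_pass,   /* assumed handle */
      .subpass = 0,
      .framebuffer = framebuffer,  /* assumed handle */
   };
   VkCommandBufferBeginInfo sec_begin = {
      .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO,
      .flags = VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT,
      .pInheritanceInfo = &inherit,
   };
   vkBeginCommandBuffer(secondary, &sec_begin);
   /* ... record draws ... */
   vkEndCommandBuffer(secondary);

   /* In the primary ("container"), inside a render pass begun with
    * VK_SUBPASS_CONTENTS_SECONDARY_COMMAND_BUFFERS:
    */
   vkCmdExecuteCommands(primary, 1, &secondary);
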
-static struct mi_value
-prepare_for_draw_count_predicate(struct anv_cmd_buffer *cmd_buffer,
- struct mi_builder *b,
- struct anv_buffer *count_buffer,
- uint64_t countBufferOffset)
+static inline enum anv_pipe_bits
+anv_pipe_flush_bits_for_access_flags(struct anv_cmd_buffer *cmd_buffer,
+ VkAccessFlags2 flags)
{
- struct anv_address count_address =
- anv_address_add(count_buffer->address, countBufferOffset);
-
- struct mi_value ret = mi_imm(0);
-
- if (cmd_buffer->state.conditional_render_enabled) {
-#if GFX_VERx10 >= 75
- ret = mi_new_gpr(b);
- mi_store(b, mi_value_ref(b, ret), mi_mem32(count_address));
-#endif
- } else {
- /* Upload the current draw count from the draw parameters buffer to
- * MI_PREDICATE_SRC0.
- */
- mi_store(b, mi_reg64(MI_PREDICATE_SRC0), mi_mem32(count_address));
- mi_store(b, mi_reg32(MI_PREDICATE_SRC1 + 4), mi_imm(0));
+ enum anv_pipe_bits pipe_bits = 0;
+
+ u_foreach_bit64(b, flags) {
+ switch ((VkAccessFlags2)BITFIELD64_BIT(b)) {
+ case VK_ACCESS_2_SHADER_WRITE_BIT:
+ case VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT:
+ case VK_ACCESS_2_ACCELERATION_STRUCTURE_WRITE_BIT_KHR:
+ /* We're transitioning a buffer that was previously used as write
+ * destination through the data port. To make its content available
+ * to future operations, flush the hdc pipeline.
+ */
+ pipe_bits |= ANV_PIPE_HDC_PIPELINE_FLUSH_BIT;
+ pipe_bits |= ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT;
+ break;
+ case VK_ACCESS_2_COLOR_ATTACHMENT_WRITE_BIT:
+ /* We're transitioning a buffer that was previously used as render
+ * target. To make its content available to future operations, flush
+ * the render target cache.
+ */
+ pipe_bits |= ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT;
+ break;
+ case VK_ACCESS_2_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT:
+ /* We're transitioning a buffer that was previously used as depth
+ * buffer. To make its content available to future operations, flush
+ * the depth cache.
+ */
+ pipe_bits |= ANV_PIPE_DEPTH_CACHE_FLUSH_BIT;
+ break;
+ case VK_ACCESS_2_TRANSFER_WRITE_BIT:
+ /* We're transitioning a buffer that was previously used as a
+ * transfer write destination. Generic write operations include color
+ * & depth operations as well as buffer operations like:
+ * - vkCmdClearColorImage()
+ * - vkCmdClearDepthStencilImage()
+ * - vkCmdBlitImage()
+ * - vkCmdCopy*(), vkCmdUpdate*(), vkCmdFill*()
+ *
+ * Most of these operations are implemented using Blorp which writes
+ * through the render target cache or the depth cache on the graphics
+ * queue. On the compute queue, the writes are done through the data
+ * port.
+ */
+ if (anv_cmd_buffer_is_compute_queue(cmd_buffer)) {
+ pipe_bits |= ANV_PIPE_HDC_PIPELINE_FLUSH_BIT;
+ pipe_bits |= ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT;
+ } else {
+ /* We can use the data port when trying to stay in compute mode on
+ * the RCS.
+ */
+ pipe_bits |= ANV_PIPE_HDC_PIPELINE_FLUSH_BIT;
+ pipe_bits |= ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT;
+ /* Most operations are done through RT/depth writes */
+ pipe_bits |= ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT;
+ pipe_bits |= ANV_PIPE_DEPTH_CACHE_FLUSH_BIT;
+ }
+ break;
+ case VK_ACCESS_2_MEMORY_WRITE_BIT:
+ /* We're transitioning a buffer for generic write operations. Flush
+ * all the caches.
+ */
+ pipe_bits |= ANV_PIPE_FLUSH_BITS;
+ break;
+ case VK_ACCESS_2_HOST_WRITE_BIT:
+ /* We're transitioning a buffer for access by the CPU. Invalidate
+ * all the caches. Since the data and tile caches don't have an
+ * invalidate operation, we are forced to flush those as well.
+ */
+ pipe_bits |= ANV_PIPE_FLUSH_BITS;
+ pipe_bits |= ANV_PIPE_INVALIDATE_BITS;
+ break;
+ case VK_ACCESS_2_TRANSFORM_FEEDBACK_WRITE_BIT_EXT:
+ case VK_ACCESS_2_TRANSFORM_FEEDBACK_COUNTER_WRITE_BIT_EXT:
+ /* We're transitioning a buffer written either from the VS stage or
+ * from the command streamer (see CmdEndTransformFeedbackEXT), so we
+ * just need to stall the CS.
+ *
+ * Streamout writes apparently bypass L3, so in order to make them
+ * visible to the destination, we need to invalidate the other
+ * caches.
+ */
+ pipe_bits |= ANV_PIPE_CS_STALL_BIT | ANV_PIPE_INVALIDATE_BITS;
+ break;
+ default:
+ break; /* Nothing to do */
+ }
}
- return ret;
+ return pipe_bits;
}
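
A worked example of the mapping above (a sketch, assuming a graphics-queue command buffer so the non-compute branch of the TRANSFER_WRITE case applies):

   VkAccessFlags2 src = VK_ACCESS_2_TRANSFER_WRITE_BIT |
                        VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT;
   enum anv_pipe_bits bits =
      anv_pipe_flush_bits_for_access_flags(cmd_buffer, src);
   /* bits == ANV_PIPE_HDC_PIPELINE_FLUSH_BIT |
    *         ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT |
    *         ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT |
    *         ANV_PIPE_DEPTH_CACHE_FLUSH_BIT
    */
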
-static void
-emit_draw_count_predicate(struct anv_cmd_buffer *cmd_buffer,
- struct mi_builder *b,
- uint32_t draw_index)
+static inline enum anv_pipe_bits
+anv_pipe_invalidate_bits_for_access_flags(struct anv_cmd_buffer *cmd_buffer,
+ VkAccessFlags2 flags)
{
- /* Upload the index of the current primitive to MI_PREDICATE_SRC1. */
- mi_store(b, mi_reg32(MI_PREDICATE_SRC1), mi_imm(draw_index));
-
- if (draw_index == 0) {
- anv_batch_emit(&cmd_buffer->batch, GENX(MI_PREDICATE), mip) {
- mip.LoadOperation = LOAD_LOADINV;
- mip.CombineOperation = COMBINE_SET;
- mip.CompareOperation = COMPARE_SRCS_EQUAL;
- }
- } else {
- /* While draw_index < draw_count the predicate's result will be
- * (draw_index == draw_count) ^ TRUE = TRUE
- * When draw_index == draw_count the result is
- * (TRUE) ^ TRUE = FALSE
- * After this all results will be:
- * (FALSE) ^ FALSE = FALSE
- */
- anv_batch_emit(&cmd_buffer->batch, GENX(MI_PREDICATE), mip) {
- mip.LoadOperation = LOAD_LOAD;
- mip.CombineOperation = COMBINE_XOR;
- mip.CompareOperation = COMPARE_SRCS_EQUAL;
+ struct anv_device *device = cmd_buffer->device;
+ enum anv_pipe_bits pipe_bits = 0;
+
+ u_foreach_bit64(b, flags) {
+ switch ((VkAccessFlags2)BITFIELD64_BIT(b)) {
+ case VK_ACCESS_2_INDIRECT_COMMAND_READ_BIT:
+ /* Indirect draw commands take a buffer as input that we're going to
+ * read from the command streamer to load some of the HW registers
+ * (see genX_cmd_buffer.c:load_indirect_parameters). This requires a
+ * command streamer stall so that all the cache flushes have
+ * completed before the command streamer loads from memory.
+ */
+ pipe_bits |= ANV_PIPE_CS_STALL_BIT;
+ /* Indirect draw commands also set gl_BaseVertex & gl_BaseIndex
+ * through a vertex buffer, so invalidate that cache.
+ */
+ pipe_bits |= ANV_PIPE_VF_CACHE_INVALIDATE_BIT;
+ /* For CmdDispatchIndirect, we also load gl_NumWorkGroups through a
+ * UBO from the buffer, so we need to invalidate constant cache.
+ */
+ pipe_bits |= ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT;
+ pipe_bits |= ANV_PIPE_DATA_CACHE_FLUSH_BIT;
+ /* A tile cache flush is needed for CmdDispatchIndirect since the
+ * command streamer and vertex fetch aren't L3 coherent.
+ */
+ pipe_bits |= ANV_PIPE_TILE_CACHE_FLUSH_BIT;
+ break;
+ case VK_ACCESS_2_INDEX_READ_BIT:
+ case VK_ACCESS_2_VERTEX_ATTRIBUTE_READ_BIT:
+ /* We're transitioning a buffer to be used as input for vkCmdDraw*
+ * commands, so we invalidate the VF cache to make sure there is no
+ * stale data when we start rendering.
+ */
+ pipe_bits |= ANV_PIPE_VF_CACHE_INVALIDATE_BIT;
+ break;
+ case VK_ACCESS_2_UNIFORM_READ_BIT:
+ case VK_ACCESS_2_SHADER_BINDING_TABLE_READ_BIT_KHR:
+ /* We're transitioning a buffer to be used as uniform data. Because
+ * uniforms are accessed through the data port & sampler, we need to
+ * invalidate the texture cache (sampler) & constant cache (data
+ * port) to avoid stale data.
+ */
+ pipe_bits |= ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT;
+ if (device->physical->compiler->indirect_ubos_use_sampler) {
+ pipe_bits |= ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT;
+ } else {
+ pipe_bits |= ANV_PIPE_HDC_PIPELINE_FLUSH_BIT;
+ pipe_bits |= ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT;
+ }
+ break;
+ case VK_ACCESS_2_INPUT_ATTACHMENT_READ_BIT:
+ case VK_ACCESS_2_TRANSFER_READ_BIT:
+ case VK_ACCESS_2_SHADER_SAMPLED_READ_BIT:
+ /* Transitioning a buffer to be read through the sampler, so
+ * invalidate the texture cache; we don't want any stale data.
+ */
+ pipe_bits |= ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT;
+ break;
+ case VK_ACCESS_2_SHADER_READ_BIT:
+ /* Same as VK_ACCESS_2_UNIFORM_READ_BIT and
+ * VK_ACCESS_2_SHADER_SAMPLED_READ_BIT cases above
+ */
+ pipe_bits |= ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT |
+ ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT;
+ if (!device->physical->compiler->indirect_ubos_use_sampler) {
+ pipe_bits |= ANV_PIPE_HDC_PIPELINE_FLUSH_BIT;
+ pipe_bits |= ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT;
+ }
+ break;
+ case VK_ACCESS_2_MEMORY_READ_BIT:
+ /* Transitioning a buffer for generic read, invalidate all the
+ * caches.
+ */
+ pipe_bits |= ANV_PIPE_INVALIDATE_BITS;
+ break;
+ case VK_ACCESS_2_MEMORY_WRITE_BIT:
+ /* Generic write, make sure all previously written things land in
+ * memory.
+ */
+ pipe_bits |= ANV_PIPE_FLUSH_BITS;
+ break;
+ case VK_ACCESS_2_CONDITIONAL_RENDERING_READ_BIT_EXT:
+ case VK_ACCESS_2_TRANSFORM_FEEDBACK_COUNTER_READ_BIT_EXT:
+ /* Transitioning a buffer for conditional rendering or transform
+ * feedback. We'll load the content of this buffer into HW registers
+ * using the command streamer, so we need to stall the command
+ * streamer to make sure any in-flight flush operations have
+ * completed.
+ */
+ pipe_bits |= ANV_PIPE_CS_STALL_BIT;
+ pipe_bits |= ANV_PIPE_TILE_CACHE_FLUSH_BIT;
+ pipe_bits |= ANV_PIPE_DATA_CACHE_FLUSH_BIT;
+ break;
+ case VK_ACCESS_2_HOST_READ_BIT:
+ /* We're transitioning a buffer that was written by the CPU. Flush
+ * all the caches.
+ */
+ pipe_bits |= ANV_PIPE_FLUSH_BITS;
+ break;
+ case VK_ACCESS_2_TRANSFORM_FEEDBACK_WRITE_BIT_EXT:
+ /* We're transitioning a buffer to be written by the streamout fixed
+ * function. This one is apparently not L3 coherent, so we need a
+ * tile cache flush to make sure any previous write is not going to
+ * create WaW hazards.
+ */
+ pipe_bits |= ANV_PIPE_DATA_CACHE_FLUSH_BIT;
+ pipe_bits |= ANV_PIPE_TILE_CACHE_FLUSH_BIT;
+ break;
+ case VK_ACCESS_2_SHADER_STORAGE_READ_BIT:
+ /* VK_ACCESS_2_SHADER_STORAGE_READ_BIT specifies read access to a
+ * storage buffer, physical storage buffer, storage texel buffer, or
+ * storage image in any shader pipeline stage.
+ *
+ * Any storage buffers or images written to must be invalidated and
+ * flushed before the shader can access them.
+ *
+ * Both HDC & Untyped flushes also do invalidation, which is why we
+ * use them here on Gfx12+.
+ *
+ * Gfx11 and prior don't have HDC. Only Data cache flush is available
+ * and it only operates on the written cache lines.
+ */
+ if (device->info->ver >= 12) {
+ pipe_bits |= ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT;
+ pipe_bits |= ANV_PIPE_HDC_PIPELINE_FLUSH_BIT;
+ }
+ break;
+ case VK_ACCESS_2_DESCRIPTOR_BUFFER_READ_BIT_EXT:
+ pipe_bits |= ANV_PIPE_STATE_CACHE_INVALIDATE_BIT;
+ break;
+ default:
+ break; /* Nothing to do */
}
}
+
+ return pipe_bits;
}
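
And the corresponding worked example for the invalidate side (again a sketch; for this particular flag the result does not depend on the device):

   enum anv_pipe_bits bits =
      anv_pipe_invalidate_bits_for_access_flags(
         cmd_buffer, VK_ACCESS_2_INDIRECT_COMMAND_READ_BIT);
   /* bits == ANV_PIPE_CS_STALL_BIT |
    *         ANV_PIPE_VF_CACHE_INVALIDATE_BIT |
    *         ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT |
    *         ANV_PIPE_DATA_CACHE_FLUSH_BIT |
    *         ANV_PIPE_TILE_CACHE_FLUSH_BIT
    */
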
-#if GFX_VERx10 >= 75
-static void
-emit_draw_count_predicate_with_conditional_render(
- struct anv_cmd_buffer *cmd_buffer,
- struct mi_builder *b,
- uint32_t draw_index,
- struct mi_value max)
+static inline bool
+stage_is_shader(const VkPipelineStageFlags2 stage)
{
- struct mi_value pred = mi_ult(b, mi_imm(draw_index), max);
- pred = mi_iand(b, pred, mi_reg64(ANV_PREDICATE_RESULT_REG));
-
-#if GFX_VER >= 8
- mi_store(b, mi_reg32(MI_PREDICATE_RESULT), pred);
-#else
- /* MI_PREDICATE_RESULT is not whitelisted in i915 command parser
- * so we emit MI_PREDICATE to set it.
- */
-
- mi_store(b, mi_reg64(MI_PREDICATE_SRC0), pred);
- mi_store(b, mi_reg64(MI_PREDICATE_SRC1), mi_imm(0));
+ return (stage & (VK_PIPELINE_STAGE_2_VERTEX_SHADER_BIT |
+ VK_PIPELINE_STAGE_2_TESSELLATION_CONTROL_SHADER_BIT |
+ VK_PIPELINE_STAGE_2_TESSELLATION_EVALUATION_SHADER_BIT |
+ VK_PIPELINE_STAGE_2_GEOMETRY_SHADER_BIT |
+ VK_PIPELINE_STAGE_2_FRAGMENT_SHADER_BIT |
+ VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT |
+ VK_PIPELINE_STAGE_2_ALL_GRAPHICS_BIT |
+ VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT |
+ VK_PIPELINE_STAGE_2_RAY_TRACING_SHADER_BIT_KHR |
+ VK_PIPELINE_STAGE_2_TASK_SHADER_BIT_EXT |
+ VK_PIPELINE_STAGE_2_MESH_SHADER_BIT_EXT));
+}
- anv_batch_emit(&cmd_buffer->batch, GENX(MI_PREDICATE), mip) {
- mip.LoadOperation = LOAD_LOADINV;
- mip.CombineOperation = COMBINE_SET;
- mip.CompareOperation = COMPARE_SRCS_EQUAL;
- }
-#endif
+static inline bool
+stage_is_transfer(const VkPipelineStageFlags2 stage)
+{
+ return (stage & (VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT |
+ VK_PIPELINE_STAGE_2_ALL_TRANSFER_BIT));
}
-#endif
-static void
-emit_draw_count_predicate_cond(struct anv_cmd_buffer *cmd_buffer,
- struct mi_builder *b,
- uint32_t draw_index,
- struct mi_value max)
+static inline bool
+stage_is_video(const VkPipelineStageFlags2 stage)
{
-#if GFX_VERx10 >= 75
- if (cmd_buffer->state.conditional_render_enabled) {
- emit_draw_count_predicate_with_conditional_render(
- cmd_buffer, b, draw_index, mi_value_ref(b, max));
- } else {
- emit_draw_count_predicate(cmd_buffer, b, draw_index);
- }
-#else
- emit_draw_count_predicate(cmd_buffer, b, draw_index);
+ return (stage & (VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT |
+#ifdef VK_ENABLE_BETA_EXTENSIONS
+ VK_PIPELINE_STAGE_2_VIDEO_ENCODE_BIT_KHR |
#endif
+ VK_PIPELINE_STAGE_2_VIDEO_DECODE_BIT_KHR));
}
-void genX(CmdDrawIndirectCount)(
- VkCommandBuffer commandBuffer,
- VkBuffer _buffer,
- VkDeviceSize offset,
- VkBuffer _countBuffer,
- VkDeviceSize countBufferOffset,
- uint32_t maxDrawCount,
- uint32_t stride)
+static inline bool
+mask_is_shader_write(const VkAccessFlags2 access)
{
- ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
- ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
- ANV_FROM_HANDLE(anv_buffer, count_buffer, _countBuffer);
- struct anv_cmd_state *cmd_state = &cmd_buffer->state;
- struct anv_graphics_pipeline *pipeline = cmd_state->gfx.pipeline;
- const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
-
- if (anv_batch_has_error(&cmd_buffer->batch))
- return;
-
- genX(cmd_buffer_flush_state)(cmd_buffer);
-
- struct mi_builder b;
- mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);
- struct mi_value max =
- prepare_for_draw_count_predicate(cmd_buffer, &b,
- count_buffer, countBufferOffset);
-
- for (uint32_t i = 0; i < maxDrawCount; i++) {
- struct anv_address draw = anv_address_add(buffer->address, offset);
-
- emit_draw_count_predicate_cond(cmd_buffer, &b, i, max);
-
- if (vs_prog_data->uses_firstvertex ||
- vs_prog_data->uses_baseinstance)
- emit_base_vertex_instance_bo(cmd_buffer, anv_address_add(draw, 8));
- if (vs_prog_data->uses_drawid)
- emit_draw_index(cmd_buffer, i);
-
- /* Emitting draw index or vertex index BOs may result in needing
- * additional VF cache flushes.
- */
- genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
-
- load_indirect_parameters(cmd_buffer, draw, false);
-
- anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
- prim.IndirectParameterEnable = true;
- prim.PredicateEnable = true;
- prim.VertexAccessType = SEQUENTIAL;
- prim.PrimitiveTopologyType = cmd_buffer->state.gfx.primitive_topology;
- }
-
- update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, SEQUENTIAL);
-
- offset += stride;
- }
-
- mi_value_unref(&b, max);
+ return (access & (VK_ACCESS_2_SHADER_WRITE_BIT |
+ VK_ACCESS_2_MEMORY_WRITE_BIT |
+ VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT));
}
-void genX(CmdDrawIndexedIndirectCount)(
- VkCommandBuffer commandBuffer,
- VkBuffer _buffer,
- VkDeviceSize offset,
- VkBuffer _countBuffer,
- VkDeviceSize countBufferOffset,
- uint32_t maxDrawCount,
- uint32_t stride)
+static inline bool
+mask_is_write(const VkAccessFlags2 access)
{
- ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
- ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
- ANV_FROM_HANDLE(anv_buffer, count_buffer, _countBuffer);
- struct anv_cmd_state *cmd_state = &cmd_buffer->state;
- struct anv_graphics_pipeline *pipeline = cmd_state->gfx.pipeline;
- const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
-
- if (anv_batch_has_error(&cmd_buffer->batch))
- return;
-
- genX(cmd_buffer_flush_state)(cmd_buffer);
+ return access & (VK_ACCESS_2_SHADER_WRITE_BIT |
+ VK_ACCESS_2_COLOR_ATTACHMENT_WRITE_BIT |
+ VK_ACCESS_2_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT |
+ VK_ACCESS_2_TRANSFER_WRITE_BIT |
+ VK_ACCESS_2_HOST_WRITE_BIT |
+ VK_ACCESS_2_MEMORY_WRITE_BIT |
+ VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT |
+ VK_ACCESS_2_VIDEO_DECODE_WRITE_BIT_KHR |
+#ifdef VK_ENABLE_BETA_EXTENSIONS
+ VK_ACCESS_2_VIDEO_ENCODE_WRITE_BIT_KHR |
+#endif
+ VK_ACCESS_2_TRANSFORM_FEEDBACK_WRITE_BIT_EXT |
+ VK_ACCESS_2_TRANSFORM_FEEDBACK_COUNTER_WRITE_BIT_EXT |
+ VK_ACCESS_2_COMMAND_PREPROCESS_WRITE_BIT_NV |
+ VK_ACCESS_2_ACCELERATION_STRUCTURE_WRITE_BIT_KHR |
+ VK_ACCESS_2_MICROMAP_WRITE_BIT_EXT |
+ VK_ACCESS_2_OPTICAL_FLOW_WRITE_BIT_NV);
+}
- struct mi_builder b;
- mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);
- struct mi_value max =
- prepare_for_draw_count_predicate(cmd_buffer, &b,
- count_buffer, countBufferOffset);
+static inline bool
+mask_is_transfer_write(const VkAccessFlags2 access)
+{
+ return access & (VK_ACCESS_2_TRANSFER_WRITE_BIT |
+ VK_ACCESS_2_MEMORY_WRITE_BIT);
+}
- for (uint32_t i = 0; i < maxDrawCount; i++) {
- struct anv_address draw = anv_address_add(buffer->address, offset);
+static void
+cmd_buffer_barrier_video(struct anv_cmd_buffer *cmd_buffer,
+ const VkDependencyInfo *dep_info)
+{
+ assert(anv_cmd_buffer_is_video_queue(cmd_buffer));
- emit_draw_count_predicate_cond(cmd_buffer, &b, i, max);
+ bool flush_llc = false;
+ bool flush_ccs = false;
+ for (uint32_t i = 0; i < dep_info->imageMemoryBarrierCount; i++) {
+ const VkImageMemoryBarrier2 *img_barrier =
+ &dep_info->pImageMemoryBarriers[i];
- /* TODO: We need to stomp base vertex to 0 somehow */
- if (vs_prog_data->uses_firstvertex ||
- vs_prog_data->uses_baseinstance)
- emit_base_vertex_instance_bo(cmd_buffer, anv_address_add(draw, 12));
- if (vs_prog_data->uses_drawid)
- emit_draw_index(cmd_buffer, i);
+ ANV_FROM_HANDLE(anv_image, image, img_barrier->image);
+ const VkImageSubresourceRange *range = &img_barrier->subresourceRange;
- /* Emitting draw index or vertex index BOs may result in needing
- * additional VF cache flushes.
+ /* If srcQueueFamilyIndex is not equal to dstQueueFamilyIndex, this
+ * memory barrier defines a queue family ownership transfer.
*/
- genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
+ if (img_barrier->srcQueueFamilyIndex != img_barrier->dstQueueFamilyIndex)
+ flush_llc = true;
- load_indirect_parameters(cmd_buffer, draw, true);
-
- anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
- prim.IndirectParameterEnable = true;
- prim.PredicateEnable = true;
- prim.VertexAccessType = RANDOM;
- prim.PrimitiveTopologyType = cmd_buffer->state.gfx.primitive_topology;
+ VkImageAspectFlags img_aspects =
+ vk_image_expand_aspect_mask(&image->vk, range->aspectMask);
+ anv_foreach_image_aspect_bit(aspect_bit, image, img_aspects) {
+ const uint32_t plane =
+ anv_image_aspect_to_plane(image, 1UL << aspect_bit);
+ if (isl_aux_usage_has_ccs(image->planes[plane].aux_usage)) {
+ flush_ccs = true;
+ }
}
-
- update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, RANDOM);
-
- offset += stride;
}
- mi_value_unref(&b, max);
-}
-
-void genX(CmdBeginTransformFeedbackEXT)(
- VkCommandBuffer commandBuffer,
- uint32_t firstCounterBuffer,
- uint32_t counterBufferCount,
- const VkBuffer* pCounterBuffers,
- const VkDeviceSize* pCounterBufferOffsets)
-{
- ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
-
- assert(firstCounterBuffer < MAX_XFB_BUFFERS);
- assert(counterBufferCount <= MAX_XFB_BUFFERS);
- assert(firstCounterBuffer + counterBufferCount <= MAX_XFB_BUFFERS);
-
- /* From the SKL PRM Vol. 2c, SO_WRITE_OFFSET:
- *
- * "Ssoftware must ensure that no HW stream output operations can be in
- * process or otherwise pending at the point that the MI_LOAD/STORE
- * commands are processed. This will likely require a pipeline flush."
- */
- anv_add_pending_pipe_bits(cmd_buffer,
- ANV_PIPE_CS_STALL_BIT,
- "begin transform feedback");
- genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
-
- for (uint32_t idx = 0; idx < MAX_XFB_BUFFERS; idx++) {
- /* If we have a counter buffer, this is a resume so we need to load the
- * value into the streamout offset register. Otherwise, this is a begin
- * and we need to reset it to zero.
+ for (uint32_t i = 0; i < dep_info->bufferMemoryBarrierCount; i++) {
+ /* Flush the cache if something is written by the video operations and
+ * used by any stage other than the video encode/decode stages, or if
+ * srcQueueFamilyIndex is not equal to dstQueueFamilyIndex (in which
+ * case this memory barrier defines a queue family ownership transfer).
*/
- if (pCounterBuffers &&
- idx >= firstCounterBuffer &&
- idx - firstCounterBuffer < counterBufferCount &&
- pCounterBuffers[idx - firstCounterBuffer] != VK_NULL_HANDLE) {
- uint32_t cb_idx = idx - firstCounterBuffer;
- ANV_FROM_HANDLE(anv_buffer, counter_buffer, pCounterBuffers[cb_idx]);
- uint64_t offset = pCounterBufferOffsets ?
- pCounterBufferOffsets[cb_idx] : 0;
-
- anv_batch_emit(&cmd_buffer->batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
- lrm.RegisterAddress = GENX(SO_WRITE_OFFSET0_num) + idx * 4;
- lrm.MemoryAddress = anv_address_add(counter_buffer->address,
- offset);
- }
- } else {
- anv_batch_emit(&cmd_buffer->batch, GENX(MI_LOAD_REGISTER_IMM), lri) {
- lri.RegisterOffset = GENX(SO_WRITE_OFFSET0_num) + idx * 4;
- lri.DataDWord = 0;
- }
+ if ((stage_is_video(dep_info->pBufferMemoryBarriers[i].srcStageMask) &&
+ mask_is_write(dep_info->pBufferMemoryBarriers[i].srcAccessMask) &&
+ !stage_is_video(dep_info->pBufferMemoryBarriers[i].dstStageMask)) ||
+ (dep_info->pBufferMemoryBarriers[i].srcQueueFamilyIndex !=
+ dep_info->pBufferMemoryBarriers[i].dstQueueFamilyIndex)) {
+ flush_llc = true;
+ break;
}
}
- cmd_buffer->state.xfb_enabled = true;
- cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_XFB_ENABLE;
-}
-
-void genX(CmdEndTransformFeedbackEXT)(
- VkCommandBuffer commandBuffer,
- uint32_t firstCounterBuffer,
- uint32_t counterBufferCount,
- const VkBuffer* pCounterBuffers,
- const VkDeviceSize* pCounterBufferOffsets)
-{
- ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
-
- assert(firstCounterBuffer < MAX_XFB_BUFFERS);
- assert(counterBufferCount <= MAX_XFB_BUFFERS);
- assert(firstCounterBuffer + counterBufferCount <= MAX_XFB_BUFFERS);
-
- /* From the SKL PRM Vol. 2c, SO_WRITE_OFFSET:
- *
- * "Ssoftware must ensure that no HW stream output operations can be in
- * process or otherwise pending at the point that the MI_LOAD/STORE
- * commands are processed. This will likely require a pipeline flush."
- */
- anv_add_pending_pipe_bits(cmd_buffer,
- ANV_PIPE_CS_STALL_BIT,
- "end transform feedback");
- genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
-
- for (uint32_t cb_idx = 0; cb_idx < counterBufferCount; cb_idx++) {
- unsigned idx = firstCounterBuffer + cb_idx;
-
- /* If we have a counter buffer, this is a resume so we need to load the
- * value into the streamout offset register. Otherwise, this is a begin
- * and we need to reset it to zero.
+ for (uint32_t i = 0; i < dep_info->memoryBarrierCount; i++) {
+ /* Flush the cache if something is written by the video operations and
+ * used by any stage other than the video encode/decode stages.
*/
- if (pCounterBuffers &&
- cb_idx < counterBufferCount &&
- pCounterBuffers[cb_idx] != VK_NULL_HANDLE) {
- ANV_FROM_HANDLE(anv_buffer, counter_buffer, pCounterBuffers[cb_idx]);
- uint64_t offset = pCounterBufferOffsets ?
- pCounterBufferOffsets[cb_idx] : 0;
-
- anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_REGISTER_MEM), srm) {
- srm.MemoryAddress = anv_address_add(counter_buffer->address,
- offset);
- srm.RegisterAddress = GENX(SO_WRITE_OFFSET0_num) + idx * 4;
- }
+ if (stage_is_video(dep_info->pMemoryBarriers[i].srcStageMask) &&
+ mask_is_write(dep_info->pMemoryBarriers[i].srcAccessMask) &&
+ !stage_is_video(dep_info->pMemoryBarriers[i].dstStageMask)) {
+ flush_llc = true;
+ break;
}
}
- cmd_buffer->state.xfb_enabled = false;
- cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_XFB_ENABLE;
+ if (flush_ccs || flush_llc) {
+ anv_batch_emit(&cmd_buffer->batch, GENX(MI_FLUSH_DW), fd) {
+#if GFX_VERx10 >= 125
+ fd.FlushCCS = flush_ccs;
+#endif
+#if GFX_VER >= 12
+ /* Using this bit on Gfx9 triggers a GPU hang.
+ * This is undocumented behavior. Gfx12 seems fine.
+ * TODO: check Gfx11
+ */
+ fd.FlushLLC = flush_llc;
+#endif
+ }
+ }
}
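
As an illustration of the flush_llc path above, an application-side ownership-release barrier on the video queue might look like this (a sketch; video_qfi, gfx_qfi, decoded_image and video_cmd_buffer are assumed indices/handles):

   VkImageMemoryBarrier2 release = {
      .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER_2,
      .srcStageMask = VK_PIPELINE_STAGE_2_VIDEO_DECODE_BIT_KHR,
      .srcAccessMask = VK_ACCESS_2_VIDEO_DECODE_WRITE_BIT_KHR,
      .dstStageMask = VK_PIPELINE_STAGE_2_NONE,
      .dstAccessMask = 0,
      .oldLayout = VK_IMAGE_LAYOUT_VIDEO_DECODE_DST_KHR,
      .newLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL,
      .srcQueueFamilyIndex = video_qfi,  /* != dstQueueFamilyIndex => flush_llc */
      .dstQueueFamilyIndex = gfx_qfi,
      .image = decoded_image,
      .subresourceRange = { VK_IMAGE_ASPECT_COLOR_BIT, 0, 1, 0, 1 },
   };
   VkDependencyInfo dep = {
      .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
      .imageMemoryBarrierCount = 1,
      .pImageMemoryBarriers = &release,
   };
   vkCmdPipelineBarrier2(video_cmd_buffer, &dep);
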
-void
-genX(cmd_buffer_flush_compute_state)(struct anv_cmd_buffer *cmd_buffer)
+static void
+cmd_buffer_barrier_blitter(struct anv_cmd_buffer *cmd_buffer,
+ const VkDependencyInfo *dep_info)
{
- struct anv_cmd_compute_state *comp_state = &cmd_buffer->state.compute;
- struct anv_compute_pipeline *pipeline = comp_state->pipeline;
-
- assert(pipeline->cs);
+#if GFX_VERx10 >= 125
+ assert(anv_cmd_buffer_is_blitter_queue(cmd_buffer));
- genX(cmd_buffer_config_l3)(cmd_buffer, pipeline->base.l3_config);
+ /* The blitter requires an MI_FLUSH_DW command when a buffer transitions
+ * from being a destination to a source.
+ */
+ bool flush_llc = false;
+ bool flush_ccs = false;
+ for (uint32_t i = 0; i < dep_info->imageMemoryBarrierCount; i++) {
+ const VkImageMemoryBarrier2 *img_barrier =
+ &dep_info->pImageMemoryBarriers[i];
- genX(flush_pipeline_select_gpgpu)(cmd_buffer);
+ ANV_FROM_HANDLE(anv_image, image, img_barrier->image);
+ const VkImageSubresourceRange *range = &img_barrier->subresourceRange;
- /* Apply any pending pipeline flushes we may have. We want to apply them
- * now because, if any of those flushes are for things like push constants,
- * the GPU will read the state at weird times.
- */
- genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
+ /* If srcQueueFamilyIndex is not equal to dstQueueFamilyIndex, this
+ * memory barrier defines a queue family ownership transfer.
+ */
+ if (img_barrier->srcQueueFamilyIndex != img_barrier->dstQueueFamilyIndex)
+ flush_llc = true;
- if (cmd_buffer->state.compute.pipeline_dirty) {
- /* From the Sky Lake PRM Vol 2a, MEDIA_VFE_STATE:
- *
- * "A stalling PIPE_CONTROL is required before MEDIA_VFE_STATE unless
- * the only bits that are changed are scoreboard related: Scoreboard
- * Enable, Scoreboard Type, Scoreboard Mask, Scoreboard * Delta. For
- * these scoreboard related states, a MEDIA_STATE_FLUSH is
- * sufficient."
+ /* Flush the cache if a transfer command reads the output of a previous
+ * transfer command. Ideally we should just wait for completion, but
+ * for now just flush the cache to make the data visible.
*/
- anv_add_pending_pipe_bits(cmd_buffer,
- ANV_PIPE_CS_STALL_BIT,
- "flush compute state");
- genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
+ if ((img_barrier->oldLayout == VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL ||
+ img_barrier->oldLayout == VK_IMAGE_LAYOUT_GENERAL) &&
+ (img_barrier->newLayout == VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL ||
+ img_barrier->newLayout == VK_IMAGE_LAYOUT_GENERAL)) {
+ flush_llc = true;
+ }
- anv_batch_emit_batch(&cmd_buffer->batch, &pipeline->base.batch);
+ VkImageAspectFlags img_aspects =
+ vk_image_expand_aspect_mask(&image->vk, range->aspectMask);
+ anv_foreach_image_aspect_bit(aspect_bit, image, img_aspects) {
+ const uint32_t plane =
+ anv_image_aspect_to_plane(image, 1UL << aspect_bit);
+ if (isl_aux_usage_has_ccs(image->planes[plane].aux_usage)) {
+ flush_ccs = true;
+ }
+ }
+ }
- /* The workgroup size of the pipeline affects our push constant layout
- * so flag push constants as dirty if we change the pipeline.
+ for (uint32_t i = 0; i < dep_info->bufferMemoryBarrierCount; i++) {
+ /* Flush the cache if something is written by the transfer command and
+ * used by any stage other than the transfer stage, or if
+ * srcQueueFamilyIndex is not equal to dstQueueFamilyIndex (in which
+ * case this memory barrier defines a queue family ownership transfer).
*/
- cmd_buffer->state.push_constants_dirty |= VK_SHADER_STAGE_COMPUTE_BIT;
+ if ((stage_is_transfer(dep_info->pBufferMemoryBarriers[i].srcStageMask) &&
+ mask_is_write(dep_info->pBufferMemoryBarriers[i].srcAccessMask)) ||
+ (dep_info->pBufferMemoryBarriers[i].srcQueueFamilyIndex !=
+ dep_info->pBufferMemoryBarriers[i].dstQueueFamilyIndex)) {
+ flush_llc = true;
+ break;
+ }
}
- if ((cmd_buffer->state.descriptors_dirty & VK_SHADER_STAGE_COMPUTE_BIT) ||
- cmd_buffer->state.compute.pipeline_dirty) {
- flush_descriptor_sets(cmd_buffer,
- &cmd_buffer->state.compute.base,
- VK_SHADER_STAGE_COMPUTE_BIT,
- &pipeline->cs, 1);
- cmd_buffer->state.descriptors_dirty &= ~VK_SHADER_STAGE_COMPUTE_BIT;
-
-#if GFX_VERx10 < 125
- uint32_t iface_desc_data_dw[GENX(INTERFACE_DESCRIPTOR_DATA_length)];
- struct GENX(INTERFACE_DESCRIPTOR_DATA) desc = {
- .BindingTablePointer =
- cmd_buffer->state.binding_tables[MESA_SHADER_COMPUTE].offset,
- .SamplerStatePointer =
- cmd_buffer->state.samplers[MESA_SHADER_COMPUTE].offset,
- };
- GENX(INTERFACE_DESCRIPTOR_DATA_pack)(NULL, iface_desc_data_dw, &desc);
-
- struct anv_state state =
- anv_cmd_buffer_merge_dynamic(cmd_buffer, iface_desc_data_dw,
- pipeline->interface_descriptor_data,
- GENX(INTERFACE_DESCRIPTOR_DATA_length),
- 64);
-
- uint32_t size = GENX(INTERFACE_DESCRIPTOR_DATA_length) * sizeof(uint32_t);
- anv_batch_emit(&cmd_buffer->batch,
- GENX(MEDIA_INTERFACE_DESCRIPTOR_LOAD), mid) {
- mid.InterfaceDescriptorTotalLength = size;
- mid.InterfaceDescriptorDataStartAddress = state.offset;
+ for (uint32_t i = 0; i < dep_info->memoryBarrierCount; i++) {
+ /* Flush the cache if something is written by the transfer command and
+ * used by any stage other than the transfer stage.
+ */
+ if (stage_is_transfer(dep_info->pMemoryBarriers[i].srcStageMask) &&
+ mask_is_write(dep_info->pMemoryBarriers[i].srcAccessMask)) {
+ flush_llc = true;
+ break;
}
-#endif
}
- if (cmd_buffer->state.push_constants_dirty & VK_SHADER_STAGE_COMPUTE_BIT) {
- comp_state->push_data =
- anv_cmd_buffer_cs_push_constants(cmd_buffer);
-
-#if GFX_VERx10 < 125
- if (comp_state->push_data.alloc_size) {
- anv_batch_emit(&cmd_buffer->batch, GENX(MEDIA_CURBE_LOAD), curbe) {
- curbe.CURBETotalDataLength = comp_state->push_data.alloc_size;
- curbe.CURBEDataStartAddress = comp_state->push_data.offset;
- }
+ if (flush_ccs || flush_llc) {
+ /* Wa_16018063123 - emit fast color dummy blit before MI_FLUSH_DW. */
+ if (intel_needs_workaround(cmd_buffer->device->info, 16018063123)) {
+ genX(batch_emit_fast_color_dummy_blit)(&cmd_buffer->batch,
+ cmd_buffer->device);
+ }
+ anv_batch_emit(&cmd_buffer->batch, GENX(MI_FLUSH_DW), fd) {
+ fd.FlushCCS = flush_ccs;
+ fd.FlushLLC = flush_llc;
}
-#endif
-
- cmd_buffer->state.push_constants_dirty &= ~VK_SHADER_STAGE_COMPUTE_BIT;
}
-
- cmd_buffer->state.compute.pipeline_dirty = false;
-
- genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
+#endif
}
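
The destination-to-source transition mentioned above, as an application would express it on a transfer-only (blitter) queue (a sketch; blitter_cmd_buffer and staging_image are assumed handles). The TRANSFER_DST_OPTIMAL to TRANSFER_SRC_OPTIMAL layout pair is what makes the loop above set flush_llc and emit MI_FLUSH_DW:

   VkImageMemoryBarrier2 barrier = {
      .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER_2,
      .srcStageMask = VK_PIPELINE_STAGE_2_COPY_BIT,
      .srcAccessMask = VK_ACCESS_2_TRANSFER_WRITE_BIT,
      .dstStageMask = VK_PIPELINE_STAGE_2_COPY_BIT,
      .dstAccessMask = VK_ACCESS_2_TRANSFER_READ_BIT,
      .oldLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL,
      .newLayout = VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL,
      .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
      .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
      .image = staging_image,
      .subresourceRange = { VK_IMAGE_ASPECT_COLOR_BIT, 0, 1, 0, 1 },
   };
   VkDependencyInfo dep = {
      .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
      .imageMemoryBarrierCount = 1,
      .pImageMemoryBarriers = &barrier,
   };
   vkCmdPipelineBarrier2(blitter_cmd_buffer, &dep);
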
-#if GFX_VER == 7
-
-static VkResult
-verify_cmd_parser(const struct anv_device *device,
- int required_version,
- const char *function)
+static inline bool
+cmd_buffer_has_pending_copy_query(struct anv_cmd_buffer *cmd_buffer)
{
- if (device->physical->cmd_parser_version < required_version) {
- return vk_errorf(device, &device->physical->vk.base,
- VK_ERROR_FEATURE_NOT_PRESENT,
- "cmd parser version %d is required for %s",
- required_version, function);
- } else {
- return VK_SUCCESS;
- }
+ /* Query copies are only written with dataport, so we only need to check
+ * that flag.
+ */
+ return (cmd_buffer->state.queries.buffer_write_bits &
+ ANV_QUERY_WRITES_DATA_FLUSH) != 0;
}
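
A sketch of the sequence this predicate exists for (cmd, query_pool and results_buffer are assumed handles; the assumption is that the query copy leaves ANV_QUERY_WRITES_DATA_FLUSH pending, since, per the comment above, query copies write through the dataport). The transfer-write barrier afterwards is what sets flush_query_copies in cmd_buffer_barrier() below:

   vkCmdCopyQueryPoolResults(cmd, query_pool, 0, 1,
                             results_buffer, 0, 8,
                             VK_QUERY_RESULT_64_BIT);

   VkMemoryBarrier2 mem_barrier = {
      .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER_2,
      .srcStageMask = VK_PIPELINE_STAGE_2_ALL_TRANSFER_BIT,
      .srcAccessMask = VK_ACCESS_2_TRANSFER_WRITE_BIT,
      .dstStageMask = VK_PIPELINE_STAGE_2_VERTEX_SHADER_BIT,
      .dstAccessMask = VK_ACCESS_2_SHADER_READ_BIT,
   };
   VkDependencyInfo dep = {
      .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
      .memoryBarrierCount = 1,
      .pMemoryBarriers = &mem_barrier,
   };
   vkCmdPipelineBarrier2(cmd, &dep);
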
-#endif
-
static void
-anv_cmd_buffer_push_base_group_id(struct anv_cmd_buffer *cmd_buffer,
- uint32_t baseGroupX,
- uint32_t baseGroupY,
- uint32_t baseGroupZ)
+cmd_buffer_barrier(struct anv_cmd_buffer *cmd_buffer,
+ const VkDependencyInfo *dep_info,
+ const char *reason)
{
- if (anv_batch_has_error(&cmd_buffer->batch))
+ if (anv_cmd_buffer_is_video_queue(cmd_buffer)) {
+ cmd_buffer_barrier_video(cmd_buffer, dep_info);
return;
-
- struct anv_push_constants *push =
- &cmd_buffer->state.compute.base.push_constants;
- if (push->cs.base_work_group_id[0] != baseGroupX ||
- push->cs.base_work_group_id[1] != baseGroupY ||
- push->cs.base_work_group_id[2] != baseGroupZ) {
- push->cs.base_work_group_id[0] = baseGroupX;
- push->cs.base_work_group_id[1] = baseGroupY;
- push->cs.base_work_group_id[2] = baseGroupZ;
-
- cmd_buffer->state.push_constants_dirty |= VK_SHADER_STAGE_COMPUTE_BIT;
}
-}
-void genX(CmdDispatch)(
- VkCommandBuffer commandBuffer,
- uint32_t x,
- uint32_t y,
- uint32_t z)
-{
- genX(CmdDispatchBase)(commandBuffer, 0, 0, 0, x, y, z);
-}
-
-#if GFX_VERx10 >= 125
-
-static inline void
-emit_compute_walker(struct anv_cmd_buffer *cmd_buffer,
- const struct anv_compute_pipeline *pipeline, bool indirect,
- const struct brw_cs_prog_data *prog_data,
- uint32_t groupCountX, uint32_t groupCountY,
- uint32_t groupCountZ)
-{
- struct anv_cmd_compute_state *comp_state = &cmd_buffer->state.compute;
- const struct anv_shader_bin *cs_bin = pipeline->cs;
- bool predicate = cmd_buffer->state.conditional_render_enabled;
-
- const struct intel_device_info *devinfo = &pipeline->base.device->info;
- const struct brw_cs_dispatch_info dispatch =
- brw_cs_get_dispatch_info(devinfo, prog_data, NULL);
-
- anv_batch_emit(&cmd_buffer->batch, GENX(COMPUTE_WALKER), cw) {
- cw.IndirectParameterEnable = indirect;
- cw.PredicateEnable = predicate;
- cw.SIMDSize = dispatch.simd_size / 16;
- cw.IndirectDataStartAddress = comp_state->push_data.offset;
- cw.IndirectDataLength = comp_state->push_data.alloc_size;
- cw.LocalXMaximum = prog_data->local_size[0] - 1;
- cw.LocalYMaximum = prog_data->local_size[1] - 1;
- cw.LocalZMaximum = prog_data->local_size[2] - 1;
- cw.ThreadGroupIDXDimension = groupCountX;
- cw.ThreadGroupIDYDimension = groupCountY;
- cw.ThreadGroupIDZDimension = groupCountZ;
- cw.ExecutionMask = dispatch.right_mask;
-
- cw.InterfaceDescriptor = (struct GENX(INTERFACE_DESCRIPTOR_DATA)) {
- .KernelStartPointer = cs_bin->kernel.offset,
- .SamplerStatePointer =
- cmd_buffer->state.samplers[MESA_SHADER_COMPUTE].offset,
- .BindingTablePointer =
- cmd_buffer->state.binding_tables[MESA_SHADER_COMPUTE].offset,
- .BindingTableEntryCount =
- 1 + MIN2(pipeline->cs->bind_map.surface_count, 30),
- .NumberofThreadsinGPGPUThreadGroup = dispatch.threads,
- .SharedLocalMemorySize = encode_slm_size(GFX_VER,
- prog_data->base.total_shared),
- .BarrierEnable = prog_data->uses_barrier,
- };
+ if (anv_cmd_buffer_is_blitter_queue(cmd_buffer)) {
+ cmd_buffer_barrier_blitter(cmd_buffer, dep_info);
+ return;
}
-}
-#else /* #if GFX_VERx10 >= 125 */
-
-static inline void
-emit_gpgpu_walker(struct anv_cmd_buffer *cmd_buffer,
- const struct anv_compute_pipeline *pipeline, bool indirect,
- const struct brw_cs_prog_data *prog_data,
- uint32_t groupCountX, uint32_t groupCountY,
- uint32_t groupCountZ)
-{
- bool predicate = (GFX_VER <= 7 && indirect) ||
- cmd_buffer->state.conditional_render_enabled;
-
- const struct intel_device_info *devinfo = &pipeline->base.device->info;
- const struct brw_cs_dispatch_info dispatch =
- brw_cs_get_dispatch_info(devinfo, prog_data, NULL);
-
- anv_batch_emit(&cmd_buffer->batch, GENX(GPGPU_WALKER), ggw) {
- ggw.IndirectParameterEnable = indirect;
- ggw.PredicateEnable = predicate;
- ggw.SIMDSize = dispatch.simd_size / 16;
- ggw.ThreadDepthCounterMaximum = 0;
- ggw.ThreadHeightCounterMaximum = 0;
- ggw.ThreadWidthCounterMaximum = dispatch.threads - 1;
- ggw.ThreadGroupIDXDimension = groupCountX;
- ggw.ThreadGroupIDYDimension = groupCountY;
- ggw.ThreadGroupIDZDimension = groupCountZ;
- ggw.RightExecutionMask = dispatch.right_mask;
- ggw.BottomExecutionMask = 0xffffffff;
- }
-
- anv_batch_emit(&cmd_buffer->batch, GENX(MEDIA_STATE_FLUSH), msf);
-}
-
-#endif /* #if GFX_VERx10 >= 125 */
-
-static inline void
-emit_cs_walker(struct anv_cmd_buffer *cmd_buffer,
- const struct anv_compute_pipeline *pipeline, bool indirect,
- const struct brw_cs_prog_data *prog_data,
- uint32_t groupCountX, uint32_t groupCountY,
- uint32_t groupCountZ)
-{
-#if GFX_VERx10 >= 125
- emit_compute_walker(cmd_buffer, pipeline, indirect, prog_data, groupCountX,
- groupCountY, groupCountZ);
-#else
- emit_gpgpu_walker(cmd_buffer, pipeline, indirect, prog_data, groupCountX,
- groupCountY, groupCountZ);
-#endif
-}
+ struct anv_device *device = cmd_buffer->device;
-void genX(CmdDispatchBase)(
- VkCommandBuffer commandBuffer,
- uint32_t baseGroupX,
- uint32_t baseGroupY,
- uint32_t baseGroupZ,
- uint32_t groupCountX,
- uint32_t groupCountY,
- uint32_t groupCountZ)
-{
- ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
- struct anv_compute_pipeline *pipeline = cmd_buffer->state.compute.pipeline;
- const struct brw_cs_prog_data *prog_data = get_cs_prog_data(pipeline);
+ /* XXX: Right now, we're really dumb and just flush whatever categories
+ * the app asks for. One of these days we may make this a bit better
+ * but right now that's all the hardware allows for in most areas.
+ */
+ VkAccessFlags2 src_flags = 0;
+ VkAccessFlags2 dst_flags = 0;
- anv_cmd_buffer_push_base_group_id(cmd_buffer, baseGroupX,
- baseGroupY, baseGroupZ);
+ bool apply_sparse_flushes = false;
+ bool flush_query_copies = false;
- if (anv_batch_has_error(&cmd_buffer->batch))
- return;
+ for (uint32_t i = 0; i < dep_info->memoryBarrierCount; i++) {
+ src_flags |= dep_info->pMemoryBarriers[i].srcAccessMask;
+ dst_flags |= dep_info->pMemoryBarriers[i].dstAccessMask;
- anv_measure_snapshot(cmd_buffer,
- INTEL_SNAPSHOT_COMPUTE,
- "compute",
- groupCountX * groupCountY * groupCountZ *
- prog_data->local_size[0] * prog_data->local_size[1] *
- prog_data->local_size[2]);
+ /* Shader writes to buffers that could then be written by a transfer
+ * command (including queries).
+ */
+ if (stage_is_shader(dep_info->pMemoryBarriers[i].srcStageMask) &&
+ mask_is_shader_write(dep_info->pMemoryBarriers[i].srcAccessMask) &&
+ stage_is_transfer(dep_info->pMemoryBarriers[i].dstStageMask)) {
+ cmd_buffer->state.queries.buffer_write_bits |=
+ ANV_QUERY_COMPUTE_WRITES_PENDING_BITS;
+ }
- if (prog_data->uses_num_work_groups) {
- struct anv_state state =
- anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, 12, 4);
- uint32_t *sizes = state.map;
- sizes[0] = groupCountX;
- sizes[1] = groupCountY;
- sizes[2] = groupCountZ;
- cmd_buffer->state.compute.num_workgroups = (struct anv_address) {
- .bo = cmd_buffer->device->dynamic_state_pool.block_pool.bo,
- .offset = state.offset,
- };
+ if (stage_is_transfer(dep_info->pMemoryBarriers[i].srcStageMask) &&
+ mask_is_transfer_write(dep_info->pMemoryBarriers[i].srcAccessMask) &&
+ cmd_buffer_has_pending_copy_query(cmd_buffer))
+ flush_query_copies = true;
- /* The num_workgroups buffer goes in the binding table */
- cmd_buffer->state.descriptors_dirty |= VK_SHADER_STAGE_COMPUTE_BIT;
+ /* There's no way of knowing if this memory barrier is related to sparse
+ * buffers! This is pretty horrible.
+ */
+ if (mask_is_write(src_flags) &&
+ p_atomic_read(&device->num_sparse_resources) > 0)
+ apply_sparse_flushes = true;
}
- genX(cmd_buffer_flush_compute_state)(cmd_buffer);
-
- if (cmd_buffer->state.conditional_render_enabled)
- genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
-
- emit_cs_walker(cmd_buffer, pipeline, false, prog_data, groupCountX,
- groupCountY, groupCountZ);
-}
-
-#define GPGPU_DISPATCHDIMX 0x2500
-#define GPGPU_DISPATCHDIMY 0x2504
-#define GPGPU_DISPATCHDIMZ 0x2508
+ for (uint32_t i = 0; i < dep_info->bufferMemoryBarrierCount; i++) {
+ const VkBufferMemoryBarrier2 *buf_barrier =
+ &dep_info->pBufferMemoryBarriers[i];
+ ANV_FROM_HANDLE(anv_buffer, buffer, buf_barrier->buffer);
-void genX(CmdDispatchIndirect)(
- VkCommandBuffer commandBuffer,
- VkBuffer _buffer,
- VkDeviceSize offset)
-{
- ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
- ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
- struct anv_compute_pipeline *pipeline = cmd_buffer->state.compute.pipeline;
- const struct brw_cs_prog_data *prog_data = get_cs_prog_data(pipeline);
- struct anv_address addr = anv_address_add(buffer->address, offset);
- UNUSED struct anv_batch *batch = &cmd_buffer->batch;
-
- anv_cmd_buffer_push_base_group_id(cmd_buffer, 0, 0, 0);
-
-#if GFX_VER == 7
- /* Linux 4.4 added command parser version 5 which allows the GPGPU
- * indirect dispatch registers to be written.
- */
- if (verify_cmd_parser(cmd_buffer->device, 5,
- "vkCmdDispatchIndirect") != VK_SUCCESS)
- return;
-#endif
+ src_flags |= buf_barrier->srcAccessMask;
+ dst_flags |= buf_barrier->dstAccessMask;
- anv_measure_snapshot(cmd_buffer,
- INTEL_SNAPSHOT_COMPUTE,
- "compute indirect",
- 0);
+ /* Shader writes to buffers that could then be written by a transfer
+ * command (including queries).
+ */
+ if (stage_is_shader(buf_barrier->srcStageMask) &&
+ mask_is_shader_write(buf_barrier->srcAccessMask) &&
+ stage_is_transfer(buf_barrier->dstStageMask)) {
+ cmd_buffer->state.queries.buffer_write_bits |=
+ ANV_QUERY_COMPUTE_WRITES_PENDING_BITS;
+ }
- if (prog_data->uses_num_work_groups) {
- cmd_buffer->state.compute.num_workgroups = addr;
+ if (stage_is_transfer(buf_barrier->srcStageMask) &&
+ mask_is_transfer_write(buf_barrier->srcAccessMask) &&
+ cmd_buffer_has_pending_copy_query(cmd_buffer))
+ flush_query_copies = true;
- /* The num_workgroups buffer goes in the binding table */
- cmd_buffer->state.descriptors_dirty |= VK_SHADER_STAGE_COMPUTE_BIT;
+ if (anv_buffer_is_sparse(buffer) && mask_is_write(src_flags))
+ apply_sparse_flushes = true;
}
- genX(cmd_buffer_flush_compute_state)(cmd_buffer);
-
- struct mi_builder b;
- mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);
-
- struct mi_value size_x = mi_mem32(anv_address_add(addr, 0));
- struct mi_value size_y = mi_mem32(anv_address_add(addr, 4));
- struct mi_value size_z = mi_mem32(anv_address_add(addr, 8));
+ for (uint32_t i = 0; i < dep_info->imageMemoryBarrierCount; i++) {
+ const VkImageMemoryBarrier2 *img_barrier =
+ &dep_info->pImageMemoryBarriers[i];
- mi_store(&b, mi_reg32(GPGPU_DISPATCHDIMX), size_x);
- mi_store(&b, mi_reg32(GPGPU_DISPATCHDIMY), size_y);
- mi_store(&b, mi_reg32(GPGPU_DISPATCHDIMZ), size_z);
+ src_flags |= img_barrier->srcAccessMask;
+ dst_flags |= img_barrier->dstAccessMask;
-#if GFX_VER <= 7
- /* predicate = (compute_dispatch_indirect_x_size == 0); */
- mi_store(&b, mi_reg64(MI_PREDICATE_SRC0), size_x);
- mi_store(&b, mi_reg64(MI_PREDICATE_SRC1), mi_imm(0));
- anv_batch_emit(batch, GENX(MI_PREDICATE), mip) {
- mip.LoadOperation = LOAD_LOAD;
- mip.CombineOperation = COMBINE_SET;
- mip.CompareOperation = COMPARE_SRCS_EQUAL;
- }
+ ANV_FROM_HANDLE(anv_image, image, img_barrier->image);
+ const VkImageSubresourceRange *range = &img_barrier->subresourceRange;
- /* predicate |= (compute_dispatch_indirect_y_size == 0); */
- mi_store(&b, mi_reg32(MI_PREDICATE_SRC0), size_y);
- anv_batch_emit(batch, GENX(MI_PREDICATE), mip) {
- mip.LoadOperation = LOAD_LOAD;
- mip.CombineOperation = COMBINE_OR;
- mip.CompareOperation = COMPARE_SRCS_EQUAL;
- }
+ uint32_t base_layer, layer_count;
+ if (image->vk.image_type == VK_IMAGE_TYPE_3D) {
+ base_layer = 0;
+ layer_count = u_minify(image->vk.extent.depth, range->baseMipLevel);
+ } else {
+ base_layer = range->baseArrayLayer;
+ layer_count = vk_image_subresource_layer_count(&image->vk, range);
+ }
+ const uint32_t level_count =
+ vk_image_subresource_level_count(&image->vk, range);
- /* predicate |= (compute_dispatch_indirect_z_size == 0); */
- mi_store(&b, mi_reg32(MI_PREDICATE_SRC0), size_z);
- anv_batch_emit(batch, GENX(MI_PREDICATE), mip) {
- mip.LoadOperation = LOAD_LOAD;
- mip.CombineOperation = COMBINE_OR;
- mip.CompareOperation = COMPARE_SRCS_EQUAL;
- }
+ VkImageLayout old_layout = img_barrier->oldLayout;
+ VkImageLayout new_layout = img_barrier->newLayout;
- /* predicate = !predicate; */
- anv_batch_emit(batch, GENX(MI_PREDICATE), mip) {
- mip.LoadOperation = LOAD_LOADINV;
- mip.CombineOperation = COMBINE_OR;
- mip.CompareOperation = COMPARE_FALSE;
- }
-
-#if GFX_VERx10 == 75
- if (cmd_buffer->state.conditional_render_enabled) {
- /* predicate &= !(conditional_rendering_predicate == 0); */
- mi_store(&b, mi_reg32(MI_PREDICATE_SRC0),
- mi_reg32(ANV_PREDICATE_RESULT_REG));
- anv_batch_emit(batch, GENX(MI_PREDICATE), mip) {
- mip.LoadOperation = LOAD_LOADINV;
- mip.CombineOperation = COMBINE_AND;
- mip.CompareOperation = COMPARE_SRCS_EQUAL;
+ /* If we're inside a render pass, the runtime might have converted some
+ * layouts from GENERAL to FEEDBACK_LOOP. Check if that's the case and
+ * reconvert back to the original layout so that application barriers
+ * within the render pass operate with consistent layouts.
+ */
+ if (!cmd_buffer->vk.runtime_rp_barrier &&
+ cmd_buffer->vk.render_pass != NULL) {
+ assert(anv_cmd_graphics_state_has_image_as_attachment(&cmd_buffer->state.gfx,
+ image));
+ VkImageLayout subpass_att_layout, subpass_stencil_att_layout;
+
+ vk_command_buffer_get_attachment_layout(
+ &cmd_buffer->vk, &image->vk,
+ &subpass_att_layout, &subpass_stencil_att_layout);
+
+ old_layout = subpass_att_layout;
+ new_layout = subpass_att_layout;
}
- }
-#endif
-#else /* GFX_VER > 7 */
- if (cmd_buffer->state.conditional_render_enabled)
- genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
-#endif
+ if (range->aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT) {
+ transition_depth_buffer(cmd_buffer, image,
+ range->baseMipLevel, level_count,
+ base_layer, layer_count,
+ old_layout, new_layout,
+ false /* will_full_fast_clear */);
+ }
- emit_cs_walker(cmd_buffer, pipeline, true, prog_data, 0, 0, 0);
-}
+ if (range->aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT) {
+ transition_stencil_buffer(cmd_buffer, image,
+ range->baseMipLevel, level_count,
+ base_layer, layer_count,
+ old_layout, new_layout,
+ false /* will_full_fast_clear */);
+ }
-#if GFX_VERx10 >= 125
-static void
-calc_local_trace_size(uint8_t local_shift[3], const uint32_t global[3])
-{
- unsigned total_shift = 0;
- memset(local_shift, 0, 3);
-
- bool progress;
- do {
- progress = false;
- for (unsigned i = 0; i < 3; i++) {
- assert(global[i] > 0);
- if ((1 << local_shift[i]) < global[i]) {
- progress = true;
- local_shift[i]++;
- total_shift++;
+ if (range->aspectMask & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV) {
+ VkImageAspectFlags color_aspects =
+ vk_image_expand_aspect_mask(&image->vk, range->aspectMask);
+ anv_foreach_image_aspect_bit(aspect_bit, image, color_aspects) {
+ transition_color_buffer(cmd_buffer, image, 1UL << aspect_bit,
+ range->baseMipLevel, level_count,
+ base_layer, layer_count,
+ old_layout, new_layout,
+ img_barrier->srcQueueFamilyIndex,
+ img_barrier->dstQueueFamilyIndex,
+ false /* will_full_fast_clear */);
}
-
- if (total_shift == 3)
- return;
}
- } while(progress);
-
- /* Assign whatever's left to x */
- local_shift[0] += 3 - total_shift;
-}
-
-static struct GFX_RT_SHADER_TABLE
-vk_sdar_to_shader_table(const VkStridedDeviceAddressRegionKHR *region)
-{
- return (struct GFX_RT_SHADER_TABLE) {
- .BaseAddress = anv_address_from_u64(region->deviceAddress),
- .Stride = region->stride,
- };
-}
-
-static void
-cmd_buffer_trace_rays(struct anv_cmd_buffer *cmd_buffer,
- const VkStridedDeviceAddressRegionKHR *raygen_sbt,
- const VkStridedDeviceAddressRegionKHR *miss_sbt,
- const VkStridedDeviceAddressRegionKHR *hit_sbt,
- const VkStridedDeviceAddressRegionKHR *callable_sbt,
- bool is_indirect,
- uint32_t launch_width,
- uint32_t launch_height,
- uint32_t launch_depth,
- uint64_t launch_size_addr)
-{
- struct anv_cmd_ray_tracing_state *rt = &cmd_buffer->state.rt;
- struct anv_ray_tracing_pipeline *pipeline = rt->pipeline;
-
- if (anv_batch_has_error(&cmd_buffer->batch))
- return;
-
- /* If we have a known degenerate launch size, just bail */
- if (!is_indirect &&
- (launch_width == 0 || launch_height == 0 || launch_depth == 0))
- return;
-
- genX(cmd_buffer_config_l3)(cmd_buffer, pipeline->base.l3_config);
- genX(flush_pipeline_select_gpgpu)(cmd_buffer);
-
- cmd_buffer->state.rt.pipeline_dirty = false;
-
- genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
-
- /* Add these to the reloc list as they're internal buffers that don't
- * actually have relocs to pick them up manually.
- *
- * TODO(RT): This is a bit of a hack
- */
- anv_reloc_list_add_bo(cmd_buffer->batch.relocs,
- cmd_buffer->batch.alloc,
- rt->scratch.bo);
-
- /* Allocate and set up our RT_DISPATCH_GLOBALS */
- struct anv_state rtdg_state =
- anv_cmd_buffer_alloc_dynamic_state(cmd_buffer,
- BRW_RT_PUSH_CONST_OFFSET +
- sizeof(struct anv_push_constants),
- 64);
-
- struct GFX_RT_DISPATCH_GLOBALS rtdg = {
- .MemBaseAddress = (struct anv_address) {
- .bo = rt->scratch.bo,
- .offset = rt->scratch.layout.ray_stack_start,
- },
- .CallStackHandler =
- anv_shader_bin_get_bsr(cmd_buffer->device->rt_trivial_return, 0),
- .AsyncRTStackSize = rt->scratch.layout.ray_stack_stride / 64,
- .NumDSSRTStacks = rt->scratch.layout.stack_ids_per_dss,
- .MaxBVHLevels = BRW_RT_MAX_BVH_LEVELS,
- .Flags = RT_DEPTH_TEST_LESS_EQUAL,
- .HitGroupTable = vk_sdar_to_shader_table(hit_sbt),
- .MissGroupTable = vk_sdar_to_shader_table(miss_sbt),
- .SWStackSize = rt->scratch.layout.sw_stack_size / 64,
- .LaunchWidth = launch_width,
- .LaunchHeight = launch_height,
- .LaunchDepth = launch_depth,
- .CallableGroupTable = vk_sdar_to_shader_table(callable_sbt),
- };
- GFX_RT_DISPATCH_GLOBALS_pack(NULL, rtdg_state.map, &rtdg);
- /* Push constants go after the RT_DISPATCH_GLOBALS */
- assert(GFX_RT_DISPATCH_GLOBALS_length * 4 <= BRW_RT_PUSH_CONST_OFFSET);
- memcpy(rtdg_state.map + BRW_RT_PUSH_CONST_OFFSET,
- &cmd_buffer->state.rt.base.push_constants,
- sizeof(struct anv_push_constants));
-
- struct anv_address rtdg_addr = {
- .bo = cmd_buffer->device->dynamic_state_pool.block_pool.bo,
- .offset = rtdg_state.offset,
- };
-
- uint8_t local_size_log2[3];
- uint32_t global_size[3] = {};
- if (is_indirect) {
- /* Pick a local size that's probably ok. We assume most TraceRays calls
- * will use a two-dimensional dispatch size. Worst case, our initial
- * dispatch will be a little slower than it has to be.
+ /* Mark image as compressed if the destination layout has untracked
+ * writes to the aux surface.
*/
- local_size_log2[0] = 2;
- local_size_log2[1] = 1;
- local_size_log2[2] = 0;
+ VkImageAspectFlags aspects =
+ vk_image_expand_aspect_mask(&image->vk, range->aspectMask);
+ anv_foreach_image_aspect_bit(aspect_bit, image, aspects) {
+ VkImageAspectFlagBits aspect = 1UL << aspect_bit;
+ if (anv_layout_has_untracked_aux_writes(
+ device->info,
+ image, aspect,
+ img_barrier->newLayout,
+ cmd_buffer->queue_family->queueFlags)) {
+ for (uint32_t l = 0; l < level_count; l++) {
+ const uint32_t level = range->baseMipLevel + l;
+ const uint32_t aux_layers =
+ anv_image_aux_layers(image, aspect, level);
+
+ if (base_layer >= aux_layers)
+ break; /* We will only get fewer layers as level increases */
+
+ uint32_t level_layer_count =
+ MIN2(layer_count, aux_layers - base_layer);
- struct mi_builder b;
- mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);
+ set_image_compressed_bit(cmd_buffer, image, aspect,
+ level,
+ base_layer, level_layer_count,
+ true);
+ }
+ }
+ }
- struct mi_value launch_size[3] = {
- mi_mem32(anv_address_from_u64(launch_size_addr + 0)),
- mi_mem32(anv_address_from_u64(launch_size_addr + 4)),
- mi_mem32(anv_address_from_u64(launch_size_addr + 8)),
- };
+ if (anv_image_is_sparse(image) && mask_is_write(src_flags))
+ apply_sparse_flushes = true;
+ }
- /* Store the original launch size into RT_DISPATCH_GLOBALS
- *
- * TODO: Pull values from genX_bits.h once RT_DISPATCH_GLOBALS gets
- * moved into a genX version.
- */
- mi_store(&b, mi_mem32(anv_address_add(rtdg_addr, 52)),
- mi_value_ref(&b, launch_size[0]));
- mi_store(&b, mi_mem32(anv_address_add(rtdg_addr, 56)),
- mi_value_ref(&b, launch_size[1]));
- mi_store(&b, mi_mem32(anv_address_add(rtdg_addr, 60)),
- mi_value_ref(&b, launch_size[2]));
-
- /* Compute the global dispatch size */
- for (unsigned i = 0; i < 3; i++) {
- if (local_size_log2[i] == 0)
- continue;
+ enum anv_pipe_bits bits =
+ anv_pipe_flush_bits_for_access_flags(cmd_buffer, src_flags) |
+ anv_pipe_invalidate_bits_for_access_flags(cmd_buffer, dst_flags);
- /* global_size = DIV_ROUND_UP(launch_size, local_size)
- *
- * Fortunately for us MI_ALU math is 64-bit and , mi_ushr32_imm
- * has the semantics of shifting the enture 64-bit value and taking
- * the bottom 32 so we don't have to worry about roll-over.
- */
- uint32_t local_size = 1 << local_size_log2[i];
- launch_size[i] = mi_iadd(&b, launch_size[i],
- mi_imm(local_size - 1));
- launch_size[i] = mi_ushr32_imm(&b, launch_size[i],
- local_size_log2[i]);
- }
-
- mi_store(&b, mi_reg32(GPGPU_DISPATCHDIMX), launch_size[0]);
- mi_store(&b, mi_reg32(GPGPU_DISPATCHDIMY), launch_size[1]);
- mi_store(&b, mi_reg32(GPGPU_DISPATCHDIMZ), launch_size[2]);
- } else {
- uint32_t launch_size[3] = { launch_width, launch_height, launch_depth };
- calc_local_trace_size(local_size_log2, launch_size);
+ /* Our HW implementation of the sparse feature lives in the GAM unit
+ * (interface between all the GPU caches and external memory). As a result
+ * writes to NULL bound images & buffers that should be ignored are
+    * actually still visible in the caches. The only way to make NULL-bound
+    * regions correctly return 0s is to evict the caches so that they are
+    * repopulated with 0s.
+ */
+ if (apply_sparse_flushes)
+ bits |= ANV_PIPE_FLUSH_BITS;
- for (unsigned i = 0; i < 3; i++) {
- /* We have to be a bit careful here because DIV_ROUND_UP adds to the
- * numerator value may overflow. Cast to uint64_t to avoid this.
- */
- uint32_t local_size = 1 << local_size_log2[i];
- global_size[i] = DIV_ROUND_UP((uint64_t)launch_size[i], local_size);
- }
+ /* Copies from query pools are executed with a shader writing through the
+ * dataport.
+ */
+ if (flush_query_copies) {
+ bits |= (GFX_VER >= 12 ?
+ ANV_PIPE_HDC_PIPELINE_FLUSH_BIT : ANV_PIPE_DATA_CACHE_FLUSH_BIT);
}
- anv_batch_emit(&cmd_buffer->batch, GENX(COMPUTE_WALKER), cw) {
- cw.IndirectParameterEnable = is_indirect;
- cw.PredicateEnable = false;
- cw.SIMDSize = SIMD8;
- cw.LocalXMaximum = (1 << local_size_log2[0]) - 1;
- cw.LocalYMaximum = (1 << local_size_log2[1]) - 1;
- cw.LocalZMaximum = (1 << local_size_log2[2]) - 1;
- cw.ThreadGroupIDXDimension = global_size[0];
- cw.ThreadGroupIDYDimension = global_size[1];
- cw.ThreadGroupIDZDimension = global_size[2];
- cw.ExecutionMask = 0xff;
- cw.EmitInlineParameter = true;
-
- const gl_shader_stage s = MESA_SHADER_RAYGEN;
- struct anv_device *device = cmd_buffer->device;
- struct anv_state *surfaces = &cmd_buffer->state.binding_tables[s];
- struct anv_state *samplers = &cmd_buffer->state.samplers[s];
- cw.InterfaceDescriptor = (struct GENX(INTERFACE_DESCRIPTOR_DATA)) {
- .KernelStartPointer = device->rt_trampoline->kernel.offset,
- .SamplerStatePointer = samplers->offset,
- /* i965: DIV_ROUND_UP(CLAMP(stage_state->sampler_count, 0, 16), 4), */
- .SamplerCount = 0,
- .BindingTablePointer = surfaces->offset,
- .NumberofThreadsinGPGPUThreadGroup = 1,
- .BTDMode = true,
- };
+ if (dst_flags & VK_ACCESS_INDIRECT_COMMAND_READ_BIT)
+ genX(cmd_buffer_flush_generated_draws)(cmd_buffer);
- struct brw_rt_raygen_trampoline_params trampoline_params = {
- .rt_disp_globals_addr = anv_address_physical(rtdg_addr),
- .raygen_bsr_addr = raygen_sbt->deviceAddress,
- .is_indirect = is_indirect,
- .local_group_size_log2 = {
- local_size_log2[0],
- local_size_log2[1],
- local_size_log2[2],
- },
- };
- STATIC_ASSERT(sizeof(trampoline_params) == 32);
- memcpy(cw.InlineData, &trampoline_params, sizeof(trampoline_params));
- }
+ anv_add_pending_pipe_bits(cmd_buffer, bits, reason);
}
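The barrier handling above first accumulates the access masks across every buffer and image barrier and only then translates the union into pipe-control bits, so a VkDependencyInfo carrying many barriers still produces a single combined flush. A self-contained sketch of that accumulate-then-translate shape; the bit values, struct, and helper names are illustrative stand-ins for the ANV_PIPE_* flags and the anv_pipe_*_bits_for_access_flags() helpers, not the driver's actual definitions:

#include <stdint.h>

/* Illustrative pipe bits, standing in for the ANV_PIPE_* flags. */
#define FLUSH_RENDER_TARGET   (1u << 0)
#define FLUSH_DEPTH           (1u << 1)
#define INVALIDATE_TEXTURE    (1u << 2)
#define INVALIDATE_CONSTANT   (1u << 3)

struct barrier {
   uint64_t src_access;   /* VkAccessFlags2-style write mask */
   uint64_t dst_access;   /* VkAccessFlags2-style read mask */
};

/* Hypothetical stand-ins for anv_pipe_flush_bits_for_access_flags() and
 * anv_pipe_invalidate_bits_for_access_flags(). */
static uint32_t flush_bits_for_writes(uint64_t src)
{
   return src ? (FLUSH_RENDER_TARGET | FLUSH_DEPTH) : 0;
}

static uint32_t invalidate_bits_for_reads(uint64_t dst)
{
   return dst ? (INVALIDATE_TEXTURE | INVALIDATE_CONSTANT) : 0;
}

static uint32_t
accumulate_barrier_bits(const struct barrier *barriers, unsigned count)
{
   uint64_t src = 0, dst = 0;

   /* One pass over every barrier: only the union of access masks matters. */
   for (unsigned i = 0; i < count; i++) {
      src |= barriers[i].src_access;
      dst |= barriers[i].dst_access;
   }

   /* Translate the combined masks into one set of pending bits, flushed
    * once by the caller (cf. anv_add_pending_pipe_bits() above). */
   return flush_bits_for_writes(src) | invalidate_bits_for_reads(dst);
}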
-void
-genX(CmdTraceRaysKHR)(
+void genX(CmdPipelineBarrier2)(
VkCommandBuffer commandBuffer,
- const VkStridedDeviceAddressRegionKHR* pRaygenShaderBindingTable,
- const VkStridedDeviceAddressRegionKHR* pMissShaderBindingTable,
- const VkStridedDeviceAddressRegionKHR* pHitShaderBindingTable,
- const VkStridedDeviceAddressRegionKHR* pCallableShaderBindingTable,
- uint32_t width,
- uint32_t height,
- uint32_t depth)
+ const VkDependencyInfo* pDependencyInfo)
{
ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
- cmd_buffer_trace_rays(cmd_buffer,
- pRaygenShaderBindingTable,
- pMissShaderBindingTable,
- pHitShaderBindingTable,
- pCallableShaderBindingTable,
- false /* is_indirect */,
- width, height, depth,
- 0 /* launch_size_addr */);
+ cmd_buffer_barrier(cmd_buffer, pDependencyInfo, "pipe barrier");
}
void
-genX(CmdTraceRaysIndirectKHR)(
- VkCommandBuffer commandBuffer,
- const VkStridedDeviceAddressRegionKHR* pRaygenShaderBindingTable,
- const VkStridedDeviceAddressRegionKHR* pMissShaderBindingTable,
- const VkStridedDeviceAddressRegionKHR* pHitShaderBindingTable,
- const VkStridedDeviceAddressRegionKHR* pCallableShaderBindingTable,
- VkDeviceAddress indirectDeviceAddress)
+genX(batch_emit_breakpoint)(struct anv_batch *batch,
+ struct anv_device *device,
+ bool emit_before_draw)
{
- ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+   /* Only bump the draw call count once, on the pre-draw breakpoint. */
+ uint32_t draw_count = emit_before_draw ?
+ p_atomic_inc_return(&device->draw_call_count) :
+ p_atomic_read(&device->draw_call_count);
+
+ if (((draw_count == intel_debug_bkp_before_draw_count &&
+ emit_before_draw) ||
+ (draw_count == intel_debug_bkp_after_draw_count &&
+ !emit_before_draw))) {
+ struct anv_address wait_addr =
+ anv_state_pool_state_address(&device->dynamic_state_pool,
+ device->breakpoint);
+
+ anv_batch_emit(batch, GENX(MI_SEMAPHORE_WAIT), sem) {
+ sem.WaitMode = PollingMode;
+ sem.CompareOperation = COMPARE_SAD_EQUAL_SDD;
+ sem.SemaphoreDataDword = 0x1;
+ sem.SemaphoreAddress = wait_addr;
+ };
+ }
+}
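The counter handling is easy to misread: only the pre-draw call increments device->draw_call_count (p_atomic_inc_return), while the post-draw call merely reads it back, so both breakpoints for a given draw compare against the same draw index. A standalone illustration of that pairing using C11 atomics; the breakpoint thresholds are made-up values, not the INTEL_DEBUG defaults:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static atomic_uint_fast32_t draw_call_count;

/* Stand-ins for intel_debug_bkp_before_draw_count / _after_draw_count. */
static const uint32_t bkp_before_draw_count = 3;
static const uint32_t bkp_after_draw_count  = 3;

static void
emit_breakpoint(bool before_draw)
{
   /* Pre-draw bumps the counter; post-draw re-reads the same value, so a
    * single draw is identified by one number on both sides. */
   uint32_t draw = before_draw ?
      (uint32_t)atomic_fetch_add(&draw_call_count, 1) + 1 :
      (uint32_t)atomic_load(&draw_call_count);

   if ((before_draw && draw == bkp_before_draw_count) ||
       (!before_draw && draw == bkp_after_draw_count))
      printf("%s draw %u: would emit MI_SEMAPHORE_WAIT\n",
             before_draw ? "before" : "after", draw);
}

int main(void)
{
   for (int i = 0; i < 4; i++) {
      emit_breakpoint(true);    /* before the draw */
      /* ... the draw itself ... */
      emit_breakpoint(false);   /* after the draw */
   }
   return 0;
}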
- cmd_buffer_trace_rays(cmd_buffer,
- pRaygenShaderBindingTable,
- pMissShaderBindingTable,
- pHitShaderBindingTable,
- pCallableShaderBindingTable,
- true /* is_indirect */,
- 0, 0, 0, /* width, height, depth, */
- indirectDeviceAddress);
+/* Only emit PIPELINE_SELECT; for the whole mode switch, including the
+ * required flushes, use flush_pipeline_select().
+ */
+void
+genX(emit_pipeline_select)(struct anv_batch *batch, uint32_t pipeline,
+ const struct anv_device *device)
+{
+ /* Bspec 55860: Xe2+ no longer requires PIPELINE_SELECT */
+#if GFX_VER < 20
+ anv_batch_emit(batch, GENX(PIPELINE_SELECT), ps) {
+ ps.MaskBits = GFX_VERx10 >= 125 ? 0x93 : GFX_VER >= 12 ? 0x13 : 0x3;
+#if GFX_VER == 12
+ ps.MediaSamplerDOPClockGateEnable = true;
+#endif
+ ps.PipelineSelection = pipeline;
+#if GFX_VERx10 == 125
+ /* It might still be better to only enable this when the compute
+ * pipeline will have DPAS instructions.
+ */
+ ps.SystolicModeEnable = pipeline == GPGPU &&
+ device->vk.enabled_extensions.KHR_cooperative_matrix &&
+ device->vk.enabled_features.cooperativeMatrix;
+#endif
+ }
+#endif /* if GFX_VER < 20 */
}
-#endif /* GFX_VERx10 >= 125 */
static void
genX(flush_pipeline_select)(struct anv_cmd_buffer *cmd_buffer,
uint32_t pipeline)
{
- UNUSED const struct intel_device_info *devinfo = &cmd_buffer->device->info;
+ UNUSED const struct intel_device_info *devinfo = cmd_buffer->device->info;
if (cmd_buffer->state.current_pipeline == pipeline)
return;
-#if GFX_VER >= 8 && GFX_VER < 10
+#if GFX_VER >= 20
+ /* While PIPELINE_SELECT is not needed on Xe2+, our current assumption
+ * is that the pipelined flushes in the 3D pipeline are not getting
+ * synchronized with the compute dispatches (and vice versa). So we need
+    * a CS_STALL prior to the next set of commands to ensure the flushes
+    * have completed.
+ *
+ * The new RESOURCE_BARRIER instruction has support for synchronizing
+ * 3D/Compute and once we switch to that we should be able to get rid of
+ * this CS_STALL.
+ */
+ anv_add_pending_pipe_bits(cmd_buffer, ANV_PIPE_CS_STALL_BIT, "pipeline switch stall");
+
+ /* Since we are not stalling/flushing caches explicitly while switching
+ * between the pipelines, we need to apply data dependency flushes recorded
+ * previously on the resource.
+ */
+ genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
+#else
+
+#if GFX_VER == 9
/* From the Broadwell PRM, Volume 2a: Instructions, PIPELINE_SELECT:
*
* Software must clear the COLOR_CALC_STATE Valid field in
@@ -5393,6 +4230,96 @@ genX(flush_pipeline_select)(struct anv_cmd_buffer *cmd_buffer,
anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CC_STATE_POINTERS), t);
#endif
+#if GFX_VERx10 == 120
+ /* Undocumented workaround to force the re-emission of
+ * MEDIA_INTERFACE_DESCRIPTOR_LOAD when switching from 3D to Compute
+    * pipeline without rebinding a pipeline:
+ * vkCmdBindPipeline(COMPUTE, cs_pipeline);
+ * vkCmdDispatch(...);
+ * vkCmdBindPipeline(GRAPHICS, gfx_pipeline);
+ * vkCmdDraw(...);
+ * vkCmdDispatch(...);
+ */
+ if (pipeline == _3D)
+ cmd_buffer->state.compute.pipeline_dirty = true;
+#endif
+
+ /* We apparently cannot flush the tile cache (color/depth) from the GPGPU
+ * pipeline. That means query clears will not be visible to query
+ * copy/write. So we need to flush it before going to GPGPU mode.
+ */
+ if (cmd_buffer->state.current_pipeline == _3D &&
+ cmd_buffer->state.queries.clear_bits) {
+ anv_add_pending_pipe_bits(cmd_buffer,
+ ANV_PIPE_QUERY_BITS(cmd_buffer->state.queries.clear_bits),
+ "query clear flush prior to GPGPU");
+ }
+
+   /* Flush and invalidate bits needed prior to PIPELINE_SELECT. */
+ enum anv_pipe_bits bits = 0;
+
+#if GFX_VER >= 12
+ /* From Tigerlake PRM, Volume 2a, PIPELINE_SELECT:
+ *
+ * "Software must ensure Render Cache, Depth Cache and HDC Pipeline flush
+ * are flushed through a stalling PIPE_CONTROL command prior to
+ * programming of PIPELINE_SELECT command transitioning Pipeline Select
+ * from 3D to GPGPU/Media.
+ * Software must ensure HDC Pipeline flush and Generic Media State Clear
+ * is issued through a stalling PIPE_CONTROL command prior to programming
+ * of PIPELINE_SELECT command transitioning Pipeline Select from
+ * GPGPU/Media to 3D."
+ *
+ * Note: Issuing PIPE_CONTROL_MEDIA_STATE_CLEAR causes GPU hangs, probably
+ * because PIPE was not in MEDIA mode?!
+ */
+ bits |= ANV_PIPE_CS_STALL_BIT | ANV_PIPE_HDC_PIPELINE_FLUSH_BIT;
+
+ if (cmd_buffer->state.current_pipeline == _3D) {
+ bits |= ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT |
+ ANV_PIPE_DEPTH_CACHE_FLUSH_BIT;
+ } else {
+ bits |= ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT;
+ }
+#else
+ /* From "BXML » GT » MI » vol1a GPU Overview » [Instruction]
+ * PIPELINE_SELECT [DevBWR+]":
+ *
+ * Project: DEVSNB+
+ *
+ * Software must ensure all the write caches are flushed through a
+ * stalling PIPE_CONTROL command followed by another PIPE_CONTROL
+ * command to invalidate read only caches prior to programming
+ * MI_PIPELINE_SELECT command to change the Pipeline Select Mode.
+ *
+ * Note the cmd_buffer_apply_pipe_flushes will split this into two
+ * PIPE_CONTROLs.
+ */
+ bits |= ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT |
+ ANV_PIPE_DEPTH_CACHE_FLUSH_BIT |
+ ANV_PIPE_HDC_PIPELINE_FLUSH_BIT |
+ ANV_PIPE_CS_STALL_BIT |
+ ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT |
+ ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT |
+ ANV_PIPE_STATE_CACHE_INVALIDATE_BIT |
+ ANV_PIPE_INSTRUCTION_CACHE_INVALIDATE_BIT |
+ ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT;
+#endif
+
+ /* Wa_16013063087 - State Cache Invalidate must be issued prior to
+ * PIPELINE_SELECT when switching from 3D to Compute.
+ *
+    * SW must do this by programming a PIPECONTROL with “CS Stall” followed by
+    * a PIPECONTROL with the State Cache Invalidate bit set.
+ *
+ */
+ if (cmd_buffer->state.current_pipeline == _3D && pipeline == GPGPU &&
+ intel_needs_workaround(cmd_buffer->device->info, 16013063087))
+ bits |= ANV_PIPE_STATE_CACHE_INVALIDATE_BIT;
+
+ anv_add_pending_pipe_bits(cmd_buffer, bits, "flush/invalidate PIPELINE_SELECT");
+ genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
+
#if GFX_VER == 9
if (pipeline == _3D) {
/* There is a mid-object preemption workaround which requires you to
@@ -5400,6 +4327,13 @@ genX(flush_pipeline_select)(struct anv_cmd_buffer *cmd_buffer,
* even without preemption, we have issues with geometry flickering when
* GPGPU and 3D are back-to-back and this seems to fix it. We don't
* really know why.
+ *
+ * Also, from the Sky Lake PRM Vol 2a, MEDIA_VFE_STATE:
+ *
+ * "A stalling PIPE_CONTROL is required before MEDIA_VFE_STATE unless
+ * the only bits that are changed are scoreboard related ..."
+ *
+ * This is satisfied by applying pre-PIPELINE_SELECT pipe flushes above.
*/
anv_batch_emit(&cmd_buffer->batch, GENX(MEDIA_VFE_STATE), vfe) {
vfe.MaximumNumberofThreads =
@@ -5417,54 +4351,10 @@ genX(flush_pipeline_select)(struct anv_cmd_buffer *cmd_buffer,
}
#endif
- /* From "BXML » GT » MI » vol1a GPU Overview » [Instruction]
- * PIPELINE_SELECT [DevBWR+]":
- *
- * Project: DEVSNB+
- *
- * Software must ensure all the write caches are flushed through a
- * stalling PIPE_CONTROL command followed by another PIPE_CONTROL
- * command to invalidate read only caches prior to programming
- * MI_PIPELINE_SELECT command to change the Pipeline Select Mode.
- */
- anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
- pc.RenderTargetCacheFlushEnable = true;
- pc.DepthCacheFlushEnable = true;
-#if GFX_VER >= 12
- pc.HDCPipelineFlushEnable = true;
-#else
- pc.DCFlushEnable = true;
-#endif
- pc.PostSyncOperation = NoWrite;
- pc.CommandStreamerStallEnable = true;
-#if GFX_VER >= 12
- /* Wa_1409600907: "PIPE_CONTROL with Depth Stall Enable bit must be
- * set with any PIPE_CONTROL with Depth Flush Enable bit set.
- */
- pc.DepthStallEnable = true;
-#endif
- anv_debug_dump_pc(pc);
- }
-
- anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
- pc.TextureCacheInvalidationEnable = true;
- pc.ConstantCacheInvalidationEnable = true;
- pc.StateCacheInvalidationEnable = true;
- pc.InstructionCacheInvalidateEnable = true;
- pc.PostSyncOperation = NoWrite;
- anv_debug_dump_pc(pc);
- }
-
- anv_batch_emit(&cmd_buffer->batch, GENX(PIPELINE_SELECT), ps) {
-#if GFX_VER >= 9
- ps.MaskBits = GFX_VER >= 12 ? 0x13 : 3;
- ps.MediaSamplerDOPClockGateEnable = GFX_VER >= 12;
-#endif
- ps.PipelineSelection = pipeline;
- }
+ genX(emit_pipeline_select)(&cmd_buffer->batch, pipeline, cmd_buffer->device);
#if GFX_VER == 9
- if (devinfo->is_geminilake) {
+ if (devinfo->platform == INTEL_PLATFORM_GLK) {
/* Project: DevGLK
*
* "This chicken bit works around a hardware issue with barrier logic
@@ -5479,7 +4369,7 @@ genX(flush_pipeline_select)(struct anv_cmd_buffer *cmd_buffer,
}
}
#endif
-
+#endif /* else of if GFX_VER >= 20 */
cmd_buffer->state.current_pipeline = pipeline;
}
@@ -5496,54 +4386,20 @@ genX(flush_pipeline_select_gpgpu)(struct anv_cmd_buffer *cmd_buffer)
}
void
-genX(cmd_buffer_emit_gfx7_depth_flush)(struct anv_cmd_buffer *cmd_buffer)
-{
- if (GFX_VER >= 8)
- return;
-
- /* From the Haswell PRM, documentation for 3DSTATE_DEPTH_BUFFER:
- *
- * "Restriction: Prior to changing Depth/Stencil Buffer state (i.e., any
- * combination of 3DSTATE_DEPTH_BUFFER, 3DSTATE_CLEAR_PARAMS,
- * 3DSTATE_STENCIL_BUFFER, 3DSTATE_HIER_DEPTH_BUFFER) SW must first
- * issue a pipelined depth stall (PIPE_CONTROL with Depth Stall bit
- * set), followed by a pipelined depth cache flush (PIPE_CONTROL with
- * Depth Flush Bit set, followed by another pipelined depth stall
- * (PIPE_CONTROL with Depth Stall Bit set), unless SW can otherwise
- * guarantee that the pipeline from WM onwards is already flushed (e.g.,
- * via a preceding MI_FLUSH)."
- */
- anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pipe) {
- pipe.DepthStallEnable = true;
- anv_debug_dump_pc(pipe);
- }
- anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pipe) {
- pipe.DepthCacheFlushEnable = true;
-#if GFX_VER >= 12
- pipe.TileCacheFlushEnable = true;
-#endif
- anv_debug_dump_pc(pipe);
- }
- anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pipe) {
- pipe.DepthStallEnable = true;
- anv_debug_dump_pc(pipe);
- }
-}
-
-void
genX(cmd_buffer_emit_gfx12_depth_wa)(struct anv_cmd_buffer *cmd_buffer,
const struct isl_surf *surf)
{
-#if GFX_VERx10 == 120
- const bool fmt_is_d16 = surf->format == ISL_FORMAT_R16_UNORM;
+#if INTEL_NEEDS_WA_1808121037
+ const bool is_d16_1x_msaa = surf->format == ISL_FORMAT_R16_UNORM &&
+ surf->samples == 1;
switch (cmd_buffer->state.depth_reg_mode) {
case ANV_DEPTH_REG_MODE_HW_DEFAULT:
- if (!fmt_is_d16)
+ if (!is_d16_1x_msaa)
return;
break;
- case ANV_DEPTH_REG_MODE_D16:
- if (fmt_is_d16)
+ case ANV_DEPTH_REG_MODE_D16_1X_MSAA:
+ if (is_d16_1x_msaa)
return;
break;
case ANV_DEPTH_REG_MODE_UNKNOWN:
@@ -5558,33 +4414,26 @@ genX(cmd_buffer_emit_gfx12_depth_wa)(struct anv_cmd_buffer *cmd_buffer,
ANV_PIPE_DEPTH_CACHE_FLUSH_BIT |
ANV_PIPE_DEPTH_STALL_BIT |
ANV_PIPE_END_OF_PIPE_SYNC_BIT,
- "Workaround: Stop pipeline for 14010455700");
+ "Workaround: Stop pipeline for 1808121037");
genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
- /* Wa_14010455700
+ /* Wa_1808121037
*
* To avoid sporadic corruptions “Set 0x7010[9] when Depth Buffer
* Surface Format is D16_UNORM , surface type is not NULL & 1X_MSAA”.
*/
anv_batch_write_reg(&cmd_buffer->batch, GENX(COMMON_SLICE_CHICKEN1), reg) {
- reg.HIZPlaneOptimizationdisablebit = fmt_is_d16 && surf->samples == 1;
+ reg.HIZPlaneOptimizationdisablebit = is_d16_1x_msaa;
reg.HIZPlaneOptimizationdisablebitMask = true;
}
- /* Wa_1806527549
- *
- * Set HIZ_CHICKEN (7018h) bit 13 = 1 when depth buffer is D16_UNORM.
- */
- anv_batch_write_reg(&cmd_buffer->batch, GENX(HIZ_CHICKEN), reg) {
- reg.HZDepthTestLEGEOptimizationDisable = fmt_is_d16;
- reg.HZDepthTestLEGEOptimizationDisableMask = true;
- }
-
cmd_buffer->state.depth_reg_mode =
- fmt_is_d16 ? ANV_DEPTH_REG_MODE_D16 : ANV_DEPTH_REG_MODE_HW_DEFAULT;
+ is_d16_1x_msaa ? ANV_DEPTH_REG_MODE_D16_1X_MSAA :
+ ANV_DEPTH_REG_MODE_HW_DEFAULT;
#endif
}
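The early returns above are just a cached-state check: the chicken register only needs reprogramming when the D16 1X-MSAA requirement changes, or when the register contents are unknown (e.g. at the start of a batch). A minimal sketch of that decision, with illustrative enum names mirroring ANV_DEPTH_REG_MODE_*:

#include <stdbool.h>

enum depth_reg_mode {
   DEPTH_REG_MODE_UNKNOWN,      /* register state not tracked yet */
   DEPTH_REG_MODE_HW_DEFAULT,   /* chicken bit left at its default */
   DEPTH_REG_MODE_D16_1X_MSAA,  /* chicken bit set for D16 1X MSAA */
};

/* Returns true when COMMON_SLICE_CHICKEN1 must be (re)programmed for the
 * depth surface currently being bound. */
static bool
depth_wa_needs_reprogram(enum depth_reg_mode current, bool is_d16_1x_msaa)
{
   switch (current) {
   case DEPTH_REG_MODE_HW_DEFAULT:
      return is_d16_1x_msaa;        /* default set, but the WA is needed */
   case DEPTH_REG_MODE_D16_1X_MSAA:
      return !is_d16_1x_msaa;       /* WA set, but no longer needed */
   case DEPTH_REG_MODE_UNKNOWN:
   default:
      return true;                  /* always program at least once */
   }
}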
+#if GFX_VER == 9
/* From the Skylake PRM, 3DSTATE_VERTEX_BUFFERS:
*
* "The VF cache needs to be invalidated before binding and then using
@@ -5618,8 +4467,7 @@ genX(cmd_buffer_set_binding_for_gfx8_vb_flush)(struct anv_cmd_buffer *cmd_buffer
struct anv_address vb_address,
uint32_t vb_size)
{
- if (GFX_VER < 8 || GFX_VER > 9 ||
- !anv_use_softpin(cmd_buffer->device->physical))
+ if (GFX_VER > 9)
return;
struct anv_vb_cache_range *bound, *dirty;
@@ -5634,28 +4482,9 @@ genX(cmd_buffer_set_binding_for_gfx8_vb_flush)(struct anv_cmd_buffer *cmd_buffer
dirty = &cmd_buffer->state.gfx.vb_dirty_ranges[vb_index];
}
- if (vb_size == 0) {
- bound->start = 0;
- bound->end = 0;
- return;
- }
-
- assert(vb_address.bo && (vb_address.bo->flags & EXEC_OBJECT_PINNED));
- bound->start = intel_48b_address(anv_address_physical(vb_address));
- bound->end = bound->start + vb_size;
- assert(bound->end > bound->start); /* No overflow */
-
- /* Align everything to a cache line */
- bound->start &= ~(64ull - 1ull);
- bound->end = align_u64(bound->end, 64);
-
- /* Compute the dirty range */
- dirty->start = MIN2(dirty->start, bound->start);
- dirty->end = MAX2(dirty->end, bound->end);
-
- /* If our range is larger than 32 bits, we have to flush */
- assert(bound->end - bound->start <= (1ull << 32));
- if (dirty->end - dirty->start > (1ull << 32)) {
+ if (anv_gfx8_9_vb_cache_range_needs_workaround(bound, dirty,
+ vb_address,
+ vb_size)) {
anv_add_pending_pipe_bits(cmd_buffer,
ANV_PIPE_CS_STALL_BIT |
ANV_PIPE_VF_CACHE_INVALIDATE_BIT,
@@ -5668,19 +4497,12 @@ genX(cmd_buffer_update_dirty_vbs_for_gfx8_vb_flush)(struct anv_cmd_buffer *cmd_b
uint32_t access_type,
uint64_t vb_used)
{
- if (GFX_VER < 8 || GFX_VER > 9 ||
- !anv_use_softpin(cmd_buffer->device->physical))
- return;
-
if (access_type == RANDOM) {
/* We have an index buffer */
struct anv_vb_cache_range *bound = &cmd_buffer->state.gfx.ib_bound_range;
struct anv_vb_cache_range *dirty = &cmd_buffer->state.gfx.ib_dirty_range;
- if (bound->end > bound->start) {
- dirty->start = MIN2(dirty->start, bound->start);
- dirty->end = MAX2(dirty->end, bound->end);
- }
+ anv_merge_vb_cache_range(dirty, bound);
}
uint64_t mask = vb_used;
@@ -5694,12 +4516,10 @@ genX(cmd_buffer_update_dirty_vbs_for_gfx8_vb_flush)(struct anv_cmd_buffer *cmd_b
bound = &cmd_buffer->state.gfx.vb_bound_ranges[i];
dirty = &cmd_buffer->state.gfx.vb_dirty_ranges[i];
- if (bound->end > bound->start) {
- dirty->start = MIN2(dirty->start, bound->start);
- dirty->end = MAX2(dirty->end, bound->end);
- }
+ anv_merge_vb_cache_range(dirty, bound);
}
}
+#endif /* GFX_VER == 9 */
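For reference, the inline logic removed above (64-byte alignment of the bound range, merging it into the dirty range, and the 4 GiB overflow check) factors out into helpers roughly shaped like the sketch below; the names and standalone types are illustrative, and the real helpers live in the anv headers:

#include <stdbool.h>
#include <stdint.h>

struct vb_cache_range {
   uint64_t start;
   uint64_t end;
};

/* Roughly what anv_merge_vb_cache_range() does: grow the dirty range to
 * cover the currently bound range, if any. */
static void
merge_vb_cache_range(struct vb_cache_range *dirty,
                     const struct vb_cache_range *bound)
{
   if (bound->end <= bound->start)
      return;
   if (bound->start < dirty->start)
      dirty->start = bound->start;
   if (bound->end > dirty->end)
      dirty->end = bound->end;
}

/* Roughly the check the removed code performed inline: record the newly
 * bound range aligned to 64-byte cache lines, fold it into the dirty range
 * (which starts out as {UINT64_MAX, 0}), and report whether the dirty span
 * now exceeds 32 bits, in which case the VF cache must be invalidated. */
static bool
vb_cache_range_needs_workaround(struct vb_cache_range *bound,
                                struct vb_cache_range *dirty,
                                uint64_t address, uint64_t size)
{
   if (size == 0) {
      bound->start = bound->end = 0;
      return false;
   }

   bound->start = address & ~63ull;
   bound->end = (address + size + 63) & ~63ull;

   merge_vb_cache_range(dirty, bound);

   return dirty->end - dirty->start > (1ull << 32);
}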
/**
* Update the pixel hashing modes that determine the balancing of PS threads
@@ -5724,7 +4544,7 @@ genX(cmd_buffer_emit_hashing_mode)(struct anv_cmd_buffer *cmd_buffer,
unsigned scale)
{
#if GFX_VER == 9
- const struct intel_device_info *devinfo = &cmd_buffer->device->info;
+ const struct intel_device_info *devinfo = cmd_buffer->device->info;
const unsigned slice_hashing[] = {
/* Because all Gfx9 platforms with more than one slice require
* three-way subslice hashing, a single "normal" 16x16 slice hashing
@@ -5796,25 +4616,39 @@ static void
cmd_buffer_emit_depth_stencil(struct anv_cmd_buffer *cmd_buffer)
{
struct anv_device *device = cmd_buffer->device;
- const struct anv_image_view *iview =
- anv_cmd_buffer_get_depth_stencil_view(cmd_buffer);
- const struct anv_image *image = iview ? iview->image : NULL;
-
- /* FIXME: Width and Height are wrong */
-
- genX(cmd_buffer_emit_gfx7_depth_flush)(cmd_buffer);
+ struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
uint32_t *dw = anv_batch_emit_dwords(&cmd_buffer->batch,
device->isl_dev.ds.size / 4);
if (dw == NULL)
return;
- struct isl_depth_stencil_hiz_emit_info info = { };
+ struct isl_view isl_view = {};
+ struct isl_depth_stencil_hiz_emit_info info = {
+ .view = &isl_view,
+ .mocs = anv_mocs(device, NULL, ISL_SURF_USAGE_DEPTH_BIT),
+ };
+
+ if (gfx->depth_att.iview != NULL) {
+ isl_view = gfx->depth_att.iview->planes[0].isl;
+ } else if (gfx->stencil_att.iview != NULL) {
+ isl_view = gfx->stencil_att.iview->planes[0].isl;
+ }
- if (iview)
- info.view = &iview->planes[0].isl;
+ if (gfx->view_mask) {
+ assert(isl_view.array_len == 0 ||
+ isl_view.array_len >= util_last_bit(gfx->view_mask));
+ isl_view.array_len = util_last_bit(gfx->view_mask);
+ } else {
+ assert(isl_view.array_len == 0 ||
+ isl_view.array_len >= util_last_bit(gfx->layer_count));
+ isl_view.array_len = gfx->layer_count;
+ }
+
+ if (gfx->depth_att.iview != NULL) {
+ const struct anv_image_view *iview = gfx->depth_att.iview;
+ const struct anv_image *image = iview->image;
- if (image && (image->vk.aspects & VK_IMAGE_ASPECT_DEPTH_BIT)) {
const uint32_t depth_plane =
anv_image_aspect_to_plane(image, VK_IMAGE_ASPECT_DEPTH_BIT);
const struct anv_surface *depth_surface =
@@ -5822,18 +4656,14 @@ cmd_buffer_emit_depth_stencil(struct anv_cmd_buffer *cmd_buffer)
const struct anv_address depth_address =
anv_image_address(image, &depth_surface->memory_range);
- info.depth_surf = &depth_surface->isl;
+ anv_reloc_list_add_bo(cmd_buffer->batch.relocs, depth_address.bo);
- info.depth_address =
- anv_batch_emit_reloc(&cmd_buffer->batch,
- dw + device->isl_dev.ds.depth_offset / 4,
- depth_address.bo, depth_address.offset);
+ info.depth_surf = &depth_surface->isl;
+ info.depth_address = anv_address_physical(depth_address);
info.mocs =
anv_mocs(device, depth_address.bo, ISL_SURF_USAGE_DEPTH_BIT);
- const uint32_t ds =
- cmd_buffer->state.subpass->depth_stencil_attachment->attachment;
- info.hiz_usage = cmd_buffer->state.attachments[ds].aux_usage;
+ info.hiz_usage = gfx->depth_att.aux_usage;
if (info.hiz_usage != ISL_AUX_USAGE_NONE) {
assert(isl_aux_usage_has_hiz(info.hiz_usage));
@@ -5842,18 +4672,19 @@ cmd_buffer_emit_depth_stencil(struct anv_cmd_buffer *cmd_buffer)
const struct anv_address hiz_address =
anv_image_address(image, &hiz_surface->memory_range);
- info.hiz_surf = &hiz_surface->isl;
+ anv_reloc_list_add_bo(cmd_buffer->batch.relocs, hiz_address.bo);
- info.hiz_address =
- anv_batch_emit_reloc(&cmd_buffer->batch,
- dw + device->isl_dev.ds.hiz_offset / 4,
- hiz_address.bo, hiz_address.offset);
+ info.hiz_surf = &hiz_surface->isl;
+ info.hiz_address = anv_address_physical(hiz_address);
info.depth_clear_value = ANV_HZ_FC_VAL;
}
}
- if (image && (image->vk.aspects & VK_IMAGE_ASPECT_STENCIL_BIT)) {
+ if (gfx->stencil_att.iview != NULL) {
+ const struct anv_image_view *iview = gfx->stencil_att.iview;
+ const struct anv_image *image = iview->image;
+
const uint32_t stencil_plane =
anv_image_aspect_to_plane(image, VK_IMAGE_ASPECT_STENCIL_BIT);
const struct anv_surface *stencil_surface =
@@ -5861,555 +4692,684 @@ cmd_buffer_emit_depth_stencil(struct anv_cmd_buffer *cmd_buffer)
const struct anv_address stencil_address =
anv_image_address(image, &stencil_surface->memory_range);
+ anv_reloc_list_add_bo(cmd_buffer->batch.relocs, stencil_address.bo);
+
info.stencil_surf = &stencil_surface->isl;
info.stencil_aux_usage = image->planes[stencil_plane].aux_usage;
- info.stencil_address =
- anv_batch_emit_reloc(&cmd_buffer->batch,
- dw + device->isl_dev.ds.stencil_offset / 4,
- stencil_address.bo, stencil_address.offset);
+ info.stencil_address = anv_address_physical(stencil_address);
info.mocs =
anv_mocs(device, stencil_address.bo, ISL_SURF_USAGE_STENCIL_BIT);
}
isl_emit_depth_stencil_hiz_s(&device->isl_dev, dw, &info);
+ /* Wa_14016712196:
+ * Emit depth flush after state that sends implicit depth flush.
+ */
+ if (intel_needs_workaround(cmd_buffer->device->info, 14016712196)) {
+ genx_batch_emit_pipe_control(&cmd_buffer->batch,
+ cmd_buffer->device->info,
+ cmd_buffer->state.current_pipeline,
+ ANV_PIPE_DEPTH_CACHE_FLUSH_BIT);
+ }
+
if (info.depth_surf)
genX(cmd_buffer_emit_gfx12_depth_wa)(cmd_buffer, info.depth_surf);
- if (GFX_VER >= 12) {
+ if (GFX_VER >= 11) {
cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_POST_SYNC_BIT;
genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
- /* Wa_1408224581
- *
- * Workaround: Gfx12LP Astep only An additional pipe control with
- * post-sync = store dword operation would be required.( w/a is to
- * have an additional pipe control after the stencil state whenever
- * the surface state bits of this state is changing).
- */
- anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
- pc.PostSyncOperation = WriteImmediateData;
- pc.Address = cmd_buffer->device->workaround_address;
+ if (intel_needs_workaround(cmd_buffer->device->info, 1408224581) ||
+ intel_needs_workaround(cmd_buffer->device->info, 14014097488)) {
+ /* Wa_1408224581
+ *
+ * Workaround: Gfx12LP Astep only An additional pipe control with
+ * post-sync = store dword operation would be required.( w/a is to
+ * have an additional pipe control after the stencil state whenever
+ * the surface state bits of this state is changing).
+ *
+ * This also seems sufficient to handle Wa_14014097488.
+ */
+ genx_batch_emit_pipe_control_write
+ (&cmd_buffer->batch, cmd_buffer->device->info,
+ cmd_buffer->state.current_pipeline, WriteImmediateData,
+ cmd_buffer->device->workaround_address, 0, 0);
}
}
cmd_buffer->state.hiz_enabled = isl_aux_usage_has_hiz(info.hiz_usage);
}
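The isl_view.array_len selection earlier in this function (and again in CmdBeginRendering below) encodes a simple rule: with multiview enabled, the depth/stencil view must span every view index that can be written, which is the 1-based index of the highest set bit in the view mask; otherwise the plain layer count is used. A standalone illustration of the util_last_bit() choice used above:

#include <stdint.h>

/* E.g. view_mask = 0b0101 -> views 0 and 2 -> array_len = 3.  Without
 * multiview the render pass layer count is used directly.  The loop is an
 * open-coded equivalent of Mesa's util_last_bit(). */
static uint32_t
required_array_len(uint32_t view_mask, uint32_t layer_count)
{
   if (view_mask == 0)
      return layer_count;

   uint32_t last_bit = 0;
   while (view_mask) {
      last_bit++;
      view_mask >>= 1;
   }
   return last_bit;
}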
-/**
- * This ANDs the view mask of the current subpass with the pending clear
- * views in the attachment to get the mask of views active in the subpass
- * that still need to be cleared.
- */
-static inline uint32_t
-get_multiview_subpass_clear_mask(const struct anv_cmd_state *cmd_state,
- const struct anv_attachment_state *att_state)
-{
- return cmd_state->subpass->view_mask & att_state->pending_clear_views;
-}
-
-static inline bool
-do_first_layer_clear(const struct anv_cmd_state *cmd_state,
- const struct anv_attachment_state *att_state)
+static void
+cmd_buffer_emit_cps_control_buffer(struct anv_cmd_buffer *cmd_buffer,
+ const struct anv_image_view *fsr_iview)
{
- if (!cmd_state->subpass->view_mask)
- return true;
+#if GFX_VERx10 >= 125
+ struct anv_device *device = cmd_buffer->device;
- uint32_t pending_clear_mask =
- get_multiview_subpass_clear_mask(cmd_state, att_state);
+ if (!device->vk.enabled_extensions.KHR_fragment_shading_rate)
+ return;
- return pending_clear_mask & 1;
-}
+ uint32_t *dw = anv_batch_emit_dwords(&cmd_buffer->batch,
+ device->isl_dev.cpb.size / 4);
+ if (dw == NULL)
+ return;
-static inline bool
-current_subpass_is_last_for_attachment(const struct anv_cmd_state *cmd_state,
- uint32_t att_idx)
-{
- const uint32_t last_subpass_idx =
- cmd_state->pass->attachments[att_idx].last_subpass_idx;
- const struct anv_subpass *last_subpass =
- &cmd_state->pass->subpasses[last_subpass_idx];
- return last_subpass == cmd_state->subpass;
-}
+ struct isl_cpb_emit_info info = { };
-static void
-cmd_buffer_begin_subpass(struct anv_cmd_buffer *cmd_buffer,
- uint32_t subpass_id)
-{
- struct anv_cmd_state *cmd_state = &cmd_buffer->state;
- struct anv_render_pass *pass = cmd_state->pass;
- struct anv_subpass *subpass = &pass->subpasses[subpass_id];
- cmd_state->subpass = subpass;
-
- cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_RENDER_TARGETS;
-
- /* Our implementation of VK_KHR_multiview uses instancing to draw the
- * different views. If the client asks for instancing, we need to use the
- * Instance Data Step Rate to ensure that we repeat the client's
- * per-instance data once for each view. Since this bit is in
- * VERTEX_BUFFER_STATE on gfx7, we need to dirty vertex buffers at the top
- * of each subpass.
- */
- if (GFX_VER == 7)
- cmd_buffer->state.gfx.vb_dirty |= ~0;
+ if (fsr_iview) {
+ const struct anv_image_binding *binding = &fsr_iview->image->bindings[0];
- /* It is possible to start a render pass with an old pipeline. Because the
- * render pass and subpass index are both baked into the pipeline, this is
- * highly unlikely. In order to do so, it requires that you have a render
- * pass with a single subpass and that you use that render pass twice
- * back-to-back and use the same pipeline at the start of the second render
- * pass as at the end of the first. In order to avoid unpredictable issues
- * with this edge case, we just dirty the pipeline at the start of every
- * subpass.
- */
- cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_PIPELINE;
+ anv_reloc_list_add_bo(cmd_buffer->batch.relocs, binding->address.bo);
- /* Accumulate any subpass flushes that need to happen before the subpass */
- anv_add_pending_pipe_bits(cmd_buffer,
- cmd_buffer->state.pass->subpass_flushes[subpass_id],
- "begin subpass deps/attachments");
+ struct anv_address addr =
+ anv_address_add(binding->address, binding->memory_range.offset);
- VkRect2D render_area = cmd_buffer->state.render_area;
- struct anv_framebuffer *fb = cmd_buffer->state.framebuffer;
+ info.view = &fsr_iview->planes[0].isl;
+ info.surf = &fsr_iview->image->planes[0].primary_surface.isl;
+ info.address = anv_address_physical(addr);
+ info.mocs =
+ anv_mocs(device, fsr_iview->image->bindings[0].address.bo,
+ ISL_SURF_USAGE_CPB_BIT);
+ }
- bool is_multiview = subpass->view_mask != 0;
+ isl_emit_cpb_control_s(&device->isl_dev, dw, &info);
- for (uint32_t i = 0; i < subpass->attachment_count; ++i) {
- const uint32_t a = subpass->attachments[i].attachment;
- if (a == VK_ATTACHMENT_UNUSED)
- continue;
+ /* Wa_14016712196:
+ * Emit depth flush after state that sends implicit depth flush.
+ */
+ if (intel_needs_workaround(cmd_buffer->device->info, 14016712196)) {
+ genx_batch_emit_pipe_control(&cmd_buffer->batch,
+ cmd_buffer->device->info,
+ cmd_buffer->state.current_pipeline,
+ ANV_PIPE_DEPTH_CACHE_FLUSH_BIT);
+ }
+#endif /* GFX_VERx10 >= 125 */
+}
- assert(a < cmd_state->pass->attachment_count);
- struct anv_attachment_state *att_state = &cmd_state->attachments[a];
+static VkImageLayout
+attachment_initial_layout(const VkRenderingAttachmentInfo *att)
+{
+ const VkRenderingAttachmentInitialLayoutInfoMESA *layout_info =
+ vk_find_struct_const(att->pNext,
+ RENDERING_ATTACHMENT_INITIAL_LAYOUT_INFO_MESA);
+ if (layout_info != NULL)
+ return layout_info->initialLayout;
- struct anv_image_view *iview = cmd_state->attachments[a].image_view;
- const struct anv_image *image = iview->image;
+ return att->imageLayout;
+}
- VkImageLayout target_layout = subpass->attachments[i].layout;
- VkImageLayout target_stencil_layout =
- subpass->attachments[i].stencil_layout;
+void genX(CmdBeginRendering)(
+ VkCommandBuffer commandBuffer,
+ const VkRenderingInfo* pRenderingInfo)
+{
+ ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+ struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
+ VkResult result;
- uint32_t level = iview->planes[0].isl.base_level;
- uint32_t width = anv_minify(iview->image->vk.extent.width, level);
- uint32_t height = anv_minify(iview->image->vk.extent.height, level);
- bool full_surface_draw =
- render_area.offset.x == 0 && render_area.offset.y == 0 &&
- render_area.extent.width == width &&
- render_area.extent.height == height;
+ if (!anv_cmd_buffer_is_render_queue(cmd_buffer)) {
+ assert(!"Trying to start a render pass on non-render queue!");
+ anv_batch_set_error(&cmd_buffer->batch, VK_ERROR_UNKNOWN);
+ return;
+ }
- uint32_t base_layer, layer_count;
- if (image->vk.image_type == VK_IMAGE_TYPE_3D) {
- base_layer = 0;
- layer_count = anv_minify(iview->image->vk.extent.depth, level);
- } else {
- base_layer = iview->planes[0].isl.base_array_layer;
- layer_count = fb->layers;
- }
+ anv_measure_beginrenderpass(cmd_buffer);
+ trace_intel_begin_render_pass(&cmd_buffer->trace);
- if (image->vk.aspects & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV) {
- bool will_full_fast_clear =
- (att_state->pending_clear_aspects & VK_IMAGE_ASPECT_COLOR_BIT) &&
- att_state->fast_clear && full_surface_draw;
-
- assert(image->vk.aspects == VK_IMAGE_ASPECT_COLOR_BIT);
- transition_color_buffer(cmd_buffer, image, VK_IMAGE_ASPECT_COLOR_BIT,
- level, 1, base_layer, layer_count,
- att_state->current_layout, target_layout,
- VK_QUEUE_FAMILY_IGNORED,
- VK_QUEUE_FAMILY_IGNORED,
- will_full_fast_clear);
- att_state->aux_usage =
- anv_layout_to_aux_usage(&cmd_buffer->device->info, image,
- VK_IMAGE_ASPECT_COLOR_BIT,
- VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT,
- target_layout);
- }
+ gfx->rendering_flags = pRenderingInfo->flags;
+ gfx->view_mask = pRenderingInfo->viewMask;
+ gfx->layer_count = pRenderingInfo->layerCount;
+ gfx->samples = 0;
- if (image->vk.aspects & VK_IMAGE_ASPECT_DEPTH_BIT) {
- bool will_full_fast_clear =
- (att_state->pending_clear_aspects & VK_IMAGE_ASPECT_DEPTH_BIT) &&
- att_state->fast_clear && full_surface_draw;
+ if (gfx->render_area.offset.x != pRenderingInfo->renderArea.offset.x ||
+ gfx->render_area.offset.y != pRenderingInfo->renderArea.offset.y ||
+ gfx->render_area.extent.width != pRenderingInfo->renderArea.extent.width ||
+ gfx->render_area.extent.height != pRenderingInfo->renderArea.extent.height) {
+ gfx->render_area = pRenderingInfo->renderArea;
+ gfx->dirty |= ANV_CMD_DIRTY_RENDER_AREA;
+ }
- transition_depth_buffer(cmd_buffer, image,
- base_layer, layer_count,
- att_state->current_layout, target_layout,
- will_full_fast_clear);
- att_state->aux_usage =
- anv_layout_to_aux_usage(&cmd_buffer->device->info, image,
- VK_IMAGE_ASPECT_DEPTH_BIT,
- VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT,
- target_layout);
- }
+ const bool is_multiview = gfx->view_mask != 0;
+ const VkRect2D render_area = gfx->render_area;
+ const uint32_t layers =
+ is_multiview ? util_last_bit(gfx->view_mask) : gfx->layer_count;
- if (image->vk.aspects & VK_IMAGE_ASPECT_STENCIL_BIT) {
- bool will_full_fast_clear =
- (att_state->pending_clear_aspects & VK_IMAGE_ASPECT_STENCIL_BIT) &&
- att_state->fast_clear && full_surface_draw;
+ /* The framebuffer size is at least large enough to contain the render
+ * area. Because a zero renderArea is possible, we MAX with 1.
+ */
+ struct isl_extent3d fb_size = {
+ .w = MAX2(1, render_area.offset.x + render_area.extent.width),
+ .h = MAX2(1, render_area.offset.y + render_area.extent.height),
+ .d = layers,
+ };
- transition_stencil_buffer(cmd_buffer, image,
- level, 1, base_layer, layer_count,
- att_state->current_stencil_layout,
- target_stencil_layout,
- will_full_fast_clear);
- }
- att_state->current_layout = target_layout;
- att_state->current_stencil_layout = target_stencil_layout;
+ const uint32_t color_att_count = pRenderingInfo->colorAttachmentCount;
+ result = anv_cmd_buffer_init_attachments(cmd_buffer, color_att_count);
+ if (result != VK_SUCCESS)
+ return;
- if (att_state->pending_clear_aspects & VK_IMAGE_ASPECT_COLOR_BIT) {
- assert(att_state->pending_clear_aspects == VK_IMAGE_ASPECT_COLOR_BIT);
+ genX(flush_pipeline_select_3d)(cmd_buffer);
- /* Multi-planar images are not supported as attachments */
- assert(image->vk.aspects == VK_IMAGE_ASPECT_COLOR_BIT);
- assert(image->n_planes == 1);
+ for (uint32_t i = 0; i < gfx->color_att_count; i++) {
+ if (pRenderingInfo->pColorAttachments[i].imageView == VK_NULL_HANDLE)
+ continue;
- uint32_t base_clear_layer = iview->planes[0].isl.base_array_layer;
- uint32_t clear_layer_count = fb->layers;
+ const VkRenderingAttachmentInfo *att =
+ &pRenderingInfo->pColorAttachments[i];
+ ANV_FROM_HANDLE(anv_image_view, iview, att->imageView);
+ const VkImageLayout initial_layout = attachment_initial_layout(att);
+
+ assert(render_area.offset.x + render_area.extent.width <=
+ iview->vk.extent.width);
+ assert(render_area.offset.y + render_area.extent.height <=
+ iview->vk.extent.height);
+ assert(layers <= iview->vk.layer_count);
+
+ fb_size.w = MAX2(fb_size.w, iview->vk.extent.width);
+ fb_size.h = MAX2(fb_size.h, iview->vk.extent.height);
+
+ assert(gfx->samples == 0 || gfx->samples == iview->vk.image->samples);
+ gfx->samples |= iview->vk.image->samples;
+
+ enum isl_aux_usage aux_usage =
+ anv_layout_to_aux_usage(cmd_buffer->device->info,
+ iview->image,
+ VK_IMAGE_ASPECT_COLOR_BIT,
+ VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT,
+ att->imageLayout,
+ cmd_buffer->queue_family->queueFlags);
+
+ union isl_color_value fast_clear_color = { .u32 = { 0, } };
+
+ if (att->loadOp == VK_ATTACHMENT_LOAD_OP_CLEAR &&
+ !(gfx->rendering_flags & VK_RENDERING_RESUMING_BIT)) {
+ const union isl_color_value clear_color =
+ vk_to_isl_color_with_format(att->clearValue.color,
+ iview->planes[0].isl.format);
+
+ /* We only support fast-clears on the first layer */
+ const bool fast_clear =
+ (!is_multiview || (gfx->view_mask & 1)) &&
+ anv_can_fast_clear_color_view(cmd_buffer->device, iview,
+ att->imageLayout, clear_color,
+ layers, render_area,
+ cmd_buffer->queue_family->queueFlags);
+
+ if (att->imageLayout != initial_layout) {
+ assert(render_area.offset.x == 0 && render_area.offset.y == 0 &&
+ render_area.extent.width == iview->vk.extent.width &&
+ render_area.extent.height == iview->vk.extent.height);
+ if (is_multiview) {
+ u_foreach_bit(view, gfx->view_mask) {
+ transition_color_buffer(cmd_buffer, iview->image,
+ VK_IMAGE_ASPECT_COLOR_BIT,
+ iview->vk.base_mip_level, 1,
+ iview->vk.base_array_layer + view,
+ 1, /* layer_count */
+ initial_layout, att->imageLayout,
+ VK_QUEUE_FAMILY_IGNORED,
+ VK_QUEUE_FAMILY_IGNORED,
+ fast_clear);
+ }
+ } else {
+ transition_color_buffer(cmd_buffer, iview->image,
+ VK_IMAGE_ASPECT_COLOR_BIT,
+ iview->vk.base_mip_level, 1,
+ iview->vk.base_array_layer,
+ gfx->layer_count,
+ initial_layout, att->imageLayout,
+ VK_QUEUE_FAMILY_IGNORED,
+ VK_QUEUE_FAMILY_IGNORED,
+ fast_clear);
+ }
+ }
- if (att_state->fast_clear &&
- do_first_layer_clear(cmd_state, att_state)) {
+ uint32_t clear_view_mask = pRenderingInfo->viewMask;
+ uint32_t base_clear_layer = iview->vk.base_array_layer;
+ uint32_t clear_layer_count = gfx->layer_count;
+ if (fast_clear) {
/* We only support fast-clears on the first layer */
- assert(level == 0 && base_layer == 0);
+ assert(iview->vk.base_mip_level == 0 &&
+ iview->vk.base_array_layer == 0);
+
+ fast_clear_color = clear_color;
- union isl_color_value clear_color = {};
- anv_clear_color_from_att_state(&clear_color, att_state, iview);
if (iview->image->vk.samples == 1) {
- anv_image_ccs_op(cmd_buffer, image,
+ anv_image_ccs_op(cmd_buffer, iview->image,
iview->planes[0].isl.format,
iview->planes[0].isl.swizzle,
VK_IMAGE_ASPECT_COLOR_BIT,
0, 0, 1, ISL_AUX_OP_FAST_CLEAR,
- &clear_color,
+ &fast_clear_color,
false);
} else {
- anv_image_mcs_op(cmd_buffer, image,
+ anv_image_mcs_op(cmd_buffer, iview->image,
iview->planes[0].isl.format,
iview->planes[0].isl.swizzle,
VK_IMAGE_ASPECT_COLOR_BIT,
0, 1, ISL_AUX_OP_FAST_CLEAR,
- &clear_color,
+ &fast_clear_color,
false);
}
+ clear_view_mask &= ~1u;
base_clear_layer++;
clear_layer_count--;
- if (is_multiview)
- att_state->pending_clear_views &= ~1;
-
- if (isl_color_value_is_zero(clear_color,
- iview->planes[0].isl.format)) {
- /* This image has the auxiliary buffer enabled. We can mark the
- * subresource as not needing a resolve because the clear color
- * will match what's in every RENDER_SURFACE_STATE object when
- * it's being used for sampling.
- */
- set_image_fast_clear_state(cmd_buffer, iview->image,
- VK_IMAGE_ASPECT_COLOR_BIT,
- ANV_FAST_CLEAR_DEFAULT_VALUE);
- } else {
- set_image_fast_clear_state(cmd_buffer, iview->image,
- VK_IMAGE_ASPECT_COLOR_BIT,
- ANV_FAST_CLEAR_ANY);
- }
+
+ genX(set_fast_clear_state)(cmd_buffer, iview->image,
+ iview->planes[0].isl.format,
+ clear_color);
}
- /* From the VkFramebufferCreateInfo spec:
- *
- * "If the render pass uses multiview, then layers must be one and each
- * attachment requires a number of layers that is greater than the
- * maximum bit index set in the view mask in the subpasses in which it
- * is used."
- *
- * So if multiview is active we ignore the number of layers in the
- * framebuffer and instead we honor the view mask from the subpass.
- */
if (is_multiview) {
- assert(image->n_planes == 1);
- uint32_t pending_clear_mask =
- get_multiview_subpass_clear_mask(cmd_state, att_state);
-
- u_foreach_bit(layer_idx, pending_clear_mask) {
- uint32_t layer =
- iview->planes[0].isl.base_array_layer + layer_idx;
-
- anv_image_clear_color(cmd_buffer, image,
+ u_foreach_bit(view, clear_view_mask) {
+ anv_image_clear_color(cmd_buffer, iview->image,
VK_IMAGE_ASPECT_COLOR_BIT,
- att_state->aux_usage,
+ aux_usage,
iview->planes[0].isl.format,
iview->planes[0].isl.swizzle,
- level, layer, 1,
- render_area,
- vk_to_isl_color(att_state->clear_value.color));
+ iview->vk.base_mip_level,
+ iview->vk.base_array_layer + view, 1,
+ render_area, clear_color);
}
-
- att_state->pending_clear_views &= ~pending_clear_mask;
- } else if (clear_layer_count > 0) {
- assert(image->n_planes == 1);
- anv_image_clear_color(cmd_buffer, image, VK_IMAGE_ASPECT_COLOR_BIT,
- att_state->aux_usage,
+ } else {
+ anv_image_clear_color(cmd_buffer, iview->image,
+ VK_IMAGE_ASPECT_COLOR_BIT,
+ aux_usage,
iview->planes[0].isl.format,
iview->planes[0].isl.swizzle,
- level, base_clear_layer, clear_layer_count,
- render_area,
- vk_to_isl_color(att_state->clear_value.color));
+ iview->vk.base_mip_level,
+ base_clear_layer, clear_layer_count,
+ render_area, clear_color);
+ }
+ } else {
+ /* If not LOAD_OP_CLEAR, we shouldn't have a layout transition. */
+ assert(att->imageLayout == initial_layout);
+ }
+
+ gfx->color_att[i].vk_format = iview->vk.format;
+ gfx->color_att[i].iview = iview;
+ gfx->color_att[i].layout = att->imageLayout;
+ gfx->color_att[i].aux_usage = aux_usage;
+
+ struct isl_view isl_view = iview->planes[0].isl;
+ if (pRenderingInfo->viewMask) {
+ assert(isl_view.array_len >= util_last_bit(pRenderingInfo->viewMask));
+ isl_view.array_len = util_last_bit(pRenderingInfo->viewMask);
+ } else {
+ assert(isl_view.array_len >= pRenderingInfo->layerCount);
+ isl_view.array_len = pRenderingInfo->layerCount;
+ }
+
+ anv_image_fill_surface_state(cmd_buffer->device,
+ iview->image,
+ VK_IMAGE_ASPECT_COLOR_BIT,
+ &isl_view,
+ ISL_SURF_USAGE_RENDER_TARGET_BIT,
+ aux_usage, &fast_clear_color,
+ 0, /* anv_image_view_state_flags */
+ &gfx->color_att[i].surface_state);
+
+ add_surface_state_relocs(cmd_buffer, &gfx->color_att[i].surface_state);
+
+ if (GFX_VER < 10 &&
+ (att->loadOp == VK_ATTACHMENT_LOAD_OP_LOAD ||
+ (gfx->rendering_flags & VK_RENDERING_RESUMING_BIT)) &&
+ iview->image->planes[0].aux_usage != ISL_AUX_USAGE_NONE &&
+ iview->planes[0].isl.base_level == 0 &&
+ iview->planes[0].isl.base_array_layer == 0) {
+ genX(load_image_clear_color)(cmd_buffer,
+ gfx->color_att[i].surface_state.state,
+ iview->image);
+ }
+
+ if (att->resolveMode != VK_RESOLVE_MODE_NONE) {
+ gfx->color_att[i].resolve_mode = att->resolveMode;
+ gfx->color_att[i].resolve_iview =
+ anv_image_view_from_handle(att->resolveImageView);
+ gfx->color_att[i].resolve_layout = att->resolveImageLayout;
+ }
+ }
+
+ anv_cmd_graphic_state_update_has_uint_rt(gfx);
+
+ const struct anv_image_view *fsr_iview = NULL;
+ const VkRenderingFragmentShadingRateAttachmentInfoKHR *fsr_att =
+ vk_find_struct_const(pRenderingInfo->pNext,
+ RENDERING_FRAGMENT_SHADING_RATE_ATTACHMENT_INFO_KHR);
+ if (fsr_att != NULL && fsr_att->imageView != VK_NULL_HANDLE) {
+ fsr_iview = anv_image_view_from_handle(fsr_att->imageView);
+ /* imageLayout and shadingRateAttachmentTexelSize are ignored */
+ }
+
+ const struct anv_image_view *ds_iview = NULL;
+ const VkRenderingAttachmentInfo *d_att = pRenderingInfo->pDepthAttachment;
+ const VkRenderingAttachmentInfo *s_att = pRenderingInfo->pStencilAttachment;
+ if ((d_att != NULL && d_att->imageView != VK_NULL_HANDLE) ||
+ (s_att != NULL && s_att->imageView != VK_NULL_HANDLE)) {
+ const struct anv_image_view *d_iview = NULL, *s_iview = NULL;
+ VkImageLayout depth_layout = VK_IMAGE_LAYOUT_UNDEFINED;
+ VkImageLayout stencil_layout = VK_IMAGE_LAYOUT_UNDEFINED;
+ VkImageLayout initial_depth_layout = VK_IMAGE_LAYOUT_UNDEFINED;
+ VkImageLayout initial_stencil_layout = VK_IMAGE_LAYOUT_UNDEFINED;
+ enum isl_aux_usage depth_aux_usage = ISL_AUX_USAGE_NONE;
+ enum isl_aux_usage stencil_aux_usage = ISL_AUX_USAGE_NONE;
+ float depth_clear_value = 0;
+ uint32_t stencil_clear_value = 0;
+
+ if (d_att != NULL && d_att->imageView != VK_NULL_HANDLE) {
+ d_iview = anv_image_view_from_handle(d_att->imageView);
+ initial_depth_layout = attachment_initial_layout(d_att);
+ depth_layout = d_att->imageLayout;
+ depth_aux_usage =
+ anv_layout_to_aux_usage(cmd_buffer->device->info,
+ d_iview->image,
+ VK_IMAGE_ASPECT_DEPTH_BIT,
+ VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT,
+ depth_layout,
+ cmd_buffer->queue_family->queueFlags);
+ depth_clear_value = d_att->clearValue.depthStencil.depth;
+ }
+
+ if (s_att != NULL && s_att->imageView != VK_NULL_HANDLE) {
+ s_iview = anv_image_view_from_handle(s_att->imageView);
+ initial_stencil_layout = attachment_initial_layout(s_att);
+ stencil_layout = s_att->imageLayout;
+ stencil_aux_usage =
+ anv_layout_to_aux_usage(cmd_buffer->device->info,
+ s_iview->image,
+ VK_IMAGE_ASPECT_STENCIL_BIT,
+ VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT,
+ stencil_layout,
+ cmd_buffer->queue_family->queueFlags);
+ stencil_clear_value = s_att->clearValue.depthStencil.stencil;
+ }
+
+ assert(s_iview == NULL || d_iview == NULL || s_iview == d_iview);
+ ds_iview = d_iview != NULL ? d_iview : s_iview;
+ assert(ds_iview != NULL);
+
+ assert(render_area.offset.x + render_area.extent.width <=
+ ds_iview->vk.extent.width);
+ assert(render_area.offset.y + render_area.extent.height <=
+ ds_iview->vk.extent.height);
+ assert(layers <= ds_iview->vk.layer_count);
+
+ fb_size.w = MAX2(fb_size.w, ds_iview->vk.extent.width);
+ fb_size.h = MAX2(fb_size.h, ds_iview->vk.extent.height);
+
+ assert(gfx->samples == 0 || gfx->samples == ds_iview->vk.image->samples);
+ gfx->samples |= ds_iview->vk.image->samples;
+
+ VkImageAspectFlags clear_aspects = 0;
+ if (d_iview != NULL && d_att->loadOp == VK_ATTACHMENT_LOAD_OP_CLEAR &&
+ !(gfx->rendering_flags & VK_RENDERING_RESUMING_BIT))
+ clear_aspects |= VK_IMAGE_ASPECT_DEPTH_BIT;
+ if (s_iview != NULL && s_att->loadOp == VK_ATTACHMENT_LOAD_OP_CLEAR &&
+ !(gfx->rendering_flags & VK_RENDERING_RESUMING_BIT))
+ clear_aspects |= VK_IMAGE_ASPECT_STENCIL_BIT;
+
+ if (clear_aspects != 0) {
+ const bool hiz_clear =
+ anv_can_hiz_clear_ds_view(cmd_buffer->device, d_iview,
+ depth_layout, clear_aspects,
+ depth_clear_value,
+ render_area,
+ cmd_buffer->queue_family->queueFlags);
+
+ if (depth_layout != initial_depth_layout) {
+ assert(render_area.offset.x == 0 && render_area.offset.y == 0 &&
+ render_area.extent.width == d_iview->vk.extent.width &&
+ render_area.extent.height == d_iview->vk.extent.height);
+
+ if (is_multiview) {
+ u_foreach_bit(view, gfx->view_mask) {
+ transition_depth_buffer(cmd_buffer, d_iview->image,
+ d_iview->vk.base_mip_level, 1,
+ d_iview->vk.base_array_layer + view,
+ 1 /* layer_count */,
+ initial_depth_layout, depth_layout,
+ hiz_clear);
+ }
+ } else {
+ transition_depth_buffer(cmd_buffer, d_iview->image,
+ d_iview->vk.base_mip_level, 1,
+ d_iview->vk.base_array_layer,
+ gfx->layer_count,
+ initial_depth_layout, depth_layout,
+ hiz_clear);
+ }
}
- } else if (att_state->pending_clear_aspects & (VK_IMAGE_ASPECT_DEPTH_BIT |
- VK_IMAGE_ASPECT_STENCIL_BIT)) {
- if (att_state->fast_clear &&
- (att_state->pending_clear_aspects & VK_IMAGE_ASPECT_DEPTH_BIT)) {
- /* We currently only support HiZ for single-LOD images */
- assert(isl_aux_usage_has_hiz(iview->image->planes[0].aux_usage));
- assert(iview->planes[0].isl.base_level == 0);
- assert(iview->planes[0].isl.levels == 1);
+
+ if (stencil_layout != initial_stencil_layout) {
+ assert(render_area.offset.x == 0 && render_area.offset.y == 0 &&
+ render_area.extent.width == s_iview->vk.extent.width &&
+ render_area.extent.height == s_iview->vk.extent.height);
+
+ if (is_multiview) {
+ u_foreach_bit(view, gfx->view_mask) {
+ transition_stencil_buffer(cmd_buffer, s_iview->image,
+ s_iview->vk.base_mip_level, 1,
+ s_iview->vk.base_array_layer + view,
+ 1 /* layer_count */,
+ initial_stencil_layout,
+ stencil_layout,
+ hiz_clear);
+ }
+ } else {
+ transition_stencil_buffer(cmd_buffer, s_iview->image,
+ s_iview->vk.base_mip_level, 1,
+ s_iview->vk.base_array_layer,
+ gfx->layer_count,
+ initial_stencil_layout,
+ stencil_layout,
+ hiz_clear);
+ }
}
if (is_multiview) {
- uint32_t pending_clear_mask =
- get_multiview_subpass_clear_mask(cmd_state, att_state);
-
- u_foreach_bit(layer_idx, pending_clear_mask) {
- uint32_t layer =
- iview->planes[0].isl.base_array_layer + layer_idx;
-
- if (att_state->fast_clear) {
- anv_image_hiz_clear(cmd_buffer, image,
- att_state->pending_clear_aspects,
- level, layer, 1, render_area,
- att_state->clear_value.depthStencil.stencil);
+ u_foreach_bit(view, gfx->view_mask) {
+ uint32_t level = ds_iview->vk.base_mip_level;
+ uint32_t layer = ds_iview->vk.base_array_layer + view;
+
+ if (hiz_clear) {
+ anv_image_hiz_clear(cmd_buffer, ds_iview->image,
+ clear_aspects,
+ level, layer, 1,
+ render_area,
+ stencil_clear_value);
} else {
- anv_image_clear_depth_stencil(cmd_buffer, image,
- att_state->pending_clear_aspects,
- att_state->aux_usage,
- level, layer, 1, render_area,
- att_state->clear_value.depthStencil.depth,
- att_state->clear_value.depthStencil.stencil);
+ anv_image_clear_depth_stencil(cmd_buffer, ds_iview->image,
+ clear_aspects,
+ depth_aux_usage,
+ level, layer, 1,
+ render_area,
+ depth_clear_value,
+ stencil_clear_value);
}
}
-
- att_state->pending_clear_views &= ~pending_clear_mask;
} else {
- if (att_state->fast_clear) {
- anv_image_hiz_clear(cmd_buffer, image,
- att_state->pending_clear_aspects,
+ uint32_t level = ds_iview->vk.base_mip_level;
+ uint32_t base_layer = ds_iview->vk.base_array_layer;
+ uint32_t layer_count = gfx->layer_count;
+
+ if (hiz_clear) {
+ anv_image_hiz_clear(cmd_buffer, ds_iview->image,
+ clear_aspects,
level, base_layer, layer_count,
render_area,
- att_state->clear_value.depthStencil.stencil);
+ stencil_clear_value);
} else {
- anv_image_clear_depth_stencil(cmd_buffer, image,
- att_state->pending_clear_aspects,
- att_state->aux_usage,
+ anv_image_clear_depth_stencil(cmd_buffer, ds_iview->image,
+ clear_aspects,
+ depth_aux_usage,
level, base_layer, layer_count,
render_area,
- att_state->clear_value.depthStencil.depth,
- att_state->clear_value.depthStencil.stencil);
+ depth_clear_value,
+ stencil_clear_value);
}
}
- } else {
- assert(att_state->pending_clear_aspects == 0);
+ } else {
+ /* If not LOAD_OP_CLEAR, we shouldn't have a layout transition. */
+ assert(depth_layout == initial_depth_layout);
+ assert(stencil_layout == initial_stencil_layout);
}
- /* If multiview is enabled, then we are only done clearing when we no
- * longer have pending layers to clear, or when we have processed the
- * last subpass that uses this attachment.
- */
- if (!is_multiview ||
- att_state->pending_clear_views == 0 ||
- current_subpass_is_last_for_attachment(cmd_state, a)) {
- att_state->pending_clear_aspects = 0;
+ if (d_iview != NULL) {
+ gfx->depth_att.vk_format = d_iview->vk.format;
+ gfx->depth_att.iview = d_iview;
+ gfx->depth_att.layout = depth_layout;
+ gfx->depth_att.aux_usage = depth_aux_usage;
+ if (d_att != NULL && d_att->resolveMode != VK_RESOLVE_MODE_NONE) {
+ assert(d_att->resolveImageView != VK_NULL_HANDLE);
+ gfx->depth_att.resolve_mode = d_att->resolveMode;
+ gfx->depth_att.resolve_iview =
+ anv_image_view_from_handle(d_att->resolveImageView);
+ gfx->depth_att.resolve_layout = d_att->resolveImageLayout;
+ }
}
- att_state->pending_load_aspects = 0;
+ if (s_iview != NULL) {
+ gfx->stencil_att.vk_format = s_iview->vk.format;
+ gfx->stencil_att.iview = s_iview;
+ gfx->stencil_att.layout = stencil_layout;
+ gfx->stencil_att.aux_usage = stencil_aux_usage;
+ if (s_att->resolveMode != VK_RESOLVE_MODE_NONE) {
+ assert(s_att->resolveImageView != VK_NULL_HANDLE);
+ gfx->stencil_att.resolve_mode = s_att->resolveMode;
+ gfx->stencil_att.resolve_iview =
+ anv_image_view_from_handle(s_att->resolveImageView);
+ gfx->stencil_att.resolve_layout = s_att->resolveImageLayout;
+ }
+ }
}
- /* We've transitioned all our images possibly fast clearing them. Now we
- * can fill out the surface states that we will use as render targets
- * during actual subpass rendering.
- */
- VkResult result = genX(cmd_buffer_alloc_att_surf_states)(cmd_buffer,
- pass, subpass);
- if (result != VK_SUCCESS)
- return;
-
+ /* Finally, now that we know the right size, set up the null surface */
+ assert(util_bitcount(gfx->samples) <= 1);
isl_null_fill_state(&cmd_buffer->device->isl_dev,
- cmd_state->null_surface_state.map,
- .size = isl_extent3d(fb->width, fb->height, fb->layers));
+ gfx->null_surface_state.map,
+ .size = fb_size);
- for (uint32_t i = 0; i < subpass->attachment_count; ++i) {
- const uint32_t att = subpass->attachments[i].attachment;
- if (att == VK_ATTACHMENT_UNUSED)
+ for (uint32_t i = 0; i < gfx->color_att_count; i++) {
+ if (pRenderingInfo->pColorAttachments[i].imageView != VK_NULL_HANDLE)
continue;
- assert(att < cmd_state->pass->attachment_count);
- struct anv_render_pass_attachment *pass_att = &pass->attachments[att];
- struct anv_attachment_state *att_state = &cmd_state->attachments[att];
- struct anv_image_view *iview = att_state->image_view;
-
- if (!vk_format_is_color(pass_att->format))
- continue;
-
- const VkImageUsageFlagBits att_usage = subpass->attachments[i].usage;
- assert(util_bitcount(att_usage) == 1);
-
- struct anv_surface_state *surface_state;
- isl_surf_usage_flags_t isl_surf_usage;
- enum isl_aux_usage isl_aux_usage;
- if (att_usage == VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT) {
- surface_state = &att_state->color;
- isl_surf_usage = ISL_SURF_USAGE_RENDER_TARGET_BIT;
- isl_aux_usage = att_state->aux_usage;
- } else if (att_usage == VK_IMAGE_USAGE_INPUT_ATTACHMENT_BIT) {
- surface_state = &att_state->input;
- isl_surf_usage = ISL_SURF_USAGE_TEXTURE_BIT;
- isl_aux_usage =
- anv_layout_to_aux_usage(&cmd_buffer->device->info, iview->image,
- VK_IMAGE_ASPECT_COLOR_BIT,
- VK_IMAGE_USAGE_INPUT_ATTACHMENT_BIT,
- att_state->current_layout);
- } else {
- continue;
- }
-
- /* We had better have a surface state when we get here */
- assert(surface_state->state.map);
+ isl_null_fill_state(&cmd_buffer->device->isl_dev,
+ gfx->color_att[i].surface_state.state.map,
+ .size = fb_size);
+ }
- union isl_color_value clear_color = { .u32 = { 0, } };
- if (pass_att->load_op == VK_ATTACHMENT_LOAD_OP_CLEAR &&
- att_state->fast_clear)
- anv_clear_color_from_att_state(&clear_color, att_state, iview);
+ /****** We can now start emitting code to begin the render pass ******/
- anv_image_fill_surface_state(cmd_buffer->device,
- iview->image,
- VK_IMAGE_ASPECT_COLOR_BIT,
- &iview->planes[0].isl,
- isl_surf_usage,
- isl_aux_usage,
- &clear_color,
- 0,
- surface_state,
- NULL);
+ gfx->dirty |= ANV_CMD_DIRTY_RENDER_TARGETS;
- add_surface_state_relocs(cmd_buffer, *surface_state);
+ /* It is possible to start a render pass with an old pipeline. Because the
+ * render pass and subpass index are both baked into the pipeline, this is
+ * highly unlikely. Doing so requires a render pass with a single subpass,
+ * used twice back-to-back, with the same pipeline bound at the end of the
+ * first render pass and at the start of the second. To avoid unpredictable
+ * issues with this edge case, we just dirty the pipeline at the start of
+ * every subpass.
+ */
+ gfx->dirty |= ANV_CMD_DIRTY_PIPELINE;
- if (GFX_VER < 10 &&
- pass_att->load_op == VK_ATTACHMENT_LOAD_OP_LOAD &&
- iview->image->planes[0].aux_usage != ISL_AUX_USAGE_NONE &&
- iview->planes[0].isl.base_level == 0 &&
- iview->planes[0].isl.base_array_layer == 0) {
- genX(copy_fast_clear_dwords)(cmd_buffer, surface_state->state,
- iview->image,
- VK_IMAGE_ASPECT_COLOR_BIT,
- false /* copy to ss */);
+#if GFX_VER >= 11
+ bool has_color_att = false;
+ for (uint32_t i = 0; i < gfx->color_att_count; i++) {
+ if (pRenderingInfo->pColorAttachments[i].imageView != VK_NULL_HANDLE) {
+ has_color_att = true;
+ break;
}
}
-
-#if GFX_VER >= 11
- /* The PIPE_CONTROL command description says:
- *
- * "Whenever a Binding Table Index (BTI) used by a Render Taget Message
- * points to a different RENDER_SURFACE_STATE, SW must issue a Render
- * Target Cache Flush by enabling this bit. When render target flush
- * is set due to new association of BTI, PS Scoreboard Stall bit must
- * be set in this packet."
- */
- anv_add_pending_pipe_bits(cmd_buffer,
- ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT |
- ANV_PIPE_STALL_AT_SCOREBOARD_BIT,
- "change RT");
+ if (has_color_att) {
+ /* The PIPE_CONTROL command description says:
+ *
+ * "Whenever a Binding Table Index (BTI) used by a Render Target Message
+ * points to a different RENDER_SURFACE_STATE, SW must issue a Render
+ * Target Cache Flush by enabling this bit. When render target flush
+ * is set due to new association of BTI, PS Scoreboard Stall bit must
+ * be set in this packet."
+ *
+ * We assume that a new BeginRendering is always changing the RTs, which
+ * may not be true and may cause excessive flushing. We can trivially skip
+ * the flush when there are no RTs at all (depth-only rendering), though.
+ */
+ anv_add_pending_pipe_bits(cmd_buffer,
+ ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT |
+ ANV_PIPE_STALL_AT_SCOREBOARD_BIT,
+ "change RT");
+ }
#endif
cmd_buffer_emit_depth_stencil(cmd_buffer);
-}
-static enum blorp_filter
-vk_to_blorp_resolve_mode(VkResolveModeFlagBitsKHR vk_mode)
-{
- switch (vk_mode) {
- case VK_RESOLVE_MODE_SAMPLE_ZERO_BIT_KHR:
- return BLORP_FILTER_SAMPLE_0;
- case VK_RESOLVE_MODE_AVERAGE_BIT_KHR:
- return BLORP_FILTER_AVERAGE;
- case VK_RESOLVE_MODE_MIN_BIT_KHR:
- return BLORP_FILTER_MIN_SAMPLE;
- case VK_RESOLVE_MODE_MAX_BIT_KHR:
- return BLORP_FILTER_MAX_SAMPLE;
- default:
- return BLORP_FILTER_NONE;
- }
+ cmd_buffer_emit_cps_control_buffer(cmd_buffer, fsr_iview);
}
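
For reference, a minimal application-side sketch of the call this path services: a depth-only dynamic-rendering begin with a cleared, resolved depth attachment, which is what populates d_att and the resolve fields consumed above. This is illustrative only and not part of the patch; the handle names (cmdbuf, depth_view, resolve_view) are placeholders.

#include <vulkan/vulkan.h>

static void
begin_depth_only_rendering(VkCommandBuffer cmdbuf,
                           VkImageView depth_view,   /* placeholder handle */
                           VkImageView resolve_view, /* placeholder handle */
                           VkExtent2D extent)
{
   /* Cleared, multisampled depth attachment resolved at the end of the pass. */
   const VkRenderingAttachmentInfo depth_att = {
      .sType = VK_STRUCTURE_TYPE_RENDERING_ATTACHMENT_INFO,
      .imageView = depth_view,
      .imageLayout = VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL,
      .resolveMode = VK_RESOLVE_MODE_SAMPLE_ZERO_BIT,
      .resolveImageView = resolve_view,
      .resolveImageLayout = VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL,
      .loadOp = VK_ATTACHMENT_LOAD_OP_CLEAR,
      .storeOp = VK_ATTACHMENT_STORE_OP_STORE,
      .clearValue = { .depthStencil = { .depth = 1.0f, .stencil = 0 } },
   };
   const VkRenderingInfo info = {
      .sType = VK_STRUCTURE_TYPE_RENDERING_INFO,
      .renderArea = { .offset = { 0, 0 }, .extent = extent },
      .layerCount = 1,
      .colorAttachmentCount = 0, /* depth-only: no RT cache flush needed */
      .pDepthAttachment = &depth_att,
   };
   vkCmdBeginRendering(cmdbuf, &info);
   /* ... draws ... */
   vkCmdEndRendering(cmdbuf);
}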
static void
-cmd_buffer_end_subpass(struct anv_cmd_buffer *cmd_buffer)
+cmd_buffer_mark_attachment_written(struct anv_cmd_buffer *cmd_buffer,
+ struct anv_attachment *att,
+ VkImageAspectFlagBits aspect)
{
- struct anv_cmd_state *cmd_state = &cmd_buffer->state;
- struct anv_subpass *subpass = cmd_state->subpass;
- uint32_t subpass_id = anv_get_subpass_id(&cmd_buffer->state);
- struct anv_framebuffer *fb = cmd_buffer->state.framebuffer;
+ struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
+ const struct anv_image_view *iview = att->iview;
- /* We are done with the previous subpass and all rendering directly to that
- * subpass is now complete. Zero out all the surface states so we don't
- * accidentally use them between now and the next subpass.
- */
- for (uint32_t i = 0; i < cmd_state->pass->attachment_count; ++i) {
- memset(&cmd_state->attachments[i].color, 0,
- sizeof(cmd_state->attachments[i].color));
- memset(&cmd_state->attachments[i].input, 0,
- sizeof(cmd_state->attachments[i].input));
- }
- cmd_state->null_surface_state = ANV_STATE_NULL;
- cmd_state->attachment_states = ANV_STATE_NULL;
-
- for (uint32_t i = 0; i < subpass->attachment_count; ++i) {
- const uint32_t a = subpass->attachments[i].attachment;
- if (a == VK_ATTACHMENT_UNUSED)
- continue;
+ if (iview == NULL)
+ return;
- assert(a < cmd_state->pass->attachment_count);
- struct anv_attachment_state *att_state = &cmd_state->attachments[a];
- struct anv_image_view *iview = att_state->image_view;
+ if (gfx->view_mask == 0) {
+ genX(cmd_buffer_mark_image_written)(cmd_buffer, iview->image,
+ aspect, att->aux_usage,
+ iview->planes[0].isl.base_level,
+ iview->planes[0].isl.base_array_layer,
+ gfx->layer_count);
+ } else {
+ uint32_t res_view_mask = gfx->view_mask;
+ while (res_view_mask) {
+ int i = u_bit_scan(&res_view_mask);
+
+ const uint32_t level = iview->planes[0].isl.base_level;
+ const uint32_t layer = iview->planes[0].isl.base_array_layer + i;
- assert(util_bitcount(subpass->attachments[i].usage) == 1);
- if (subpass->attachments[i].usage ==
- VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT) {
- /* We assume that if we're ending a subpass, we did do some rendering
- * so we may end up with compressed data.
- */
genX(cmd_buffer_mark_image_written)(cmd_buffer, iview->image,
- VK_IMAGE_ASPECT_COLOR_BIT,
- att_state->aux_usage,
- iview->planes[0].isl.base_level,
- iview->planes[0].isl.base_array_layer,
- fb->layers);
- } else if (subpass->attachments[i].usage ==
- VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT) {
- /* We may be writing depth or stencil so we need to mark the surface.
- * Unfortunately, there's no way to know at this point whether the
- * depth or stencil tests used will actually write to the surface.
- *
- * Even though stencil may be plane 1, it always shares a base_level
- * with depth.
- */
- const struct isl_view *ds_view = &iview->planes[0].isl;
- if (iview->vk.aspects & VK_IMAGE_ASPECT_DEPTH_BIT) {
- genX(cmd_buffer_mark_image_written)(cmd_buffer, iview->image,
- VK_IMAGE_ASPECT_DEPTH_BIT,
- att_state->aux_usage,
- ds_view->base_level,
- ds_view->base_array_layer,
- fb->layers);
- }
- if (iview->vk.aspects & VK_IMAGE_ASPECT_STENCIL_BIT) {
- /* Even though stencil may be plane 1, it always shares a
- * base_level with depth.
- */
- genX(cmd_buffer_mark_image_written)(cmd_buffer, iview->image,
- VK_IMAGE_ASPECT_STENCIL_BIT,
- ISL_AUX_USAGE_NONE,
- ds_view->base_level,
- ds_view->base_array_layer,
- fb->layers);
- }
+ aspect, att->aux_usage,
+ level, layer, 1);
}
}
+}
+
+void genX(CmdEndRendering)(
+ VkCommandBuffer commandBuffer)
+{
+ ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+ struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
+
+ if (anv_batch_has_error(&cmd_buffer->batch))
+ return;
+
+ const bool is_multiview = gfx->view_mask != 0;
+ const uint32_t layers =
+ is_multiview ? util_last_bit(gfx->view_mask) : gfx->layer_count;
+
+ bool has_color_resolve = false;
+ for (uint32_t i = 0; i < gfx->color_att_count; i++) {
+ cmd_buffer_mark_attachment_written(cmd_buffer, &gfx->color_att[i],
+ VK_IMAGE_ASPECT_COLOR_BIT);
- if (subpass->has_color_resolve) {
+ /* Stash this off for later */
+ if (gfx->color_att[i].resolve_mode != VK_RESOLVE_MODE_NONE &&
+ !(gfx->rendering_flags & VK_RENDERING_SUSPENDING_BIT))
+ has_color_resolve = true;
+ }
+
+ cmd_buffer_mark_attachment_written(cmd_buffer, &gfx->depth_att,
+ VK_IMAGE_ASPECT_DEPTH_BIT);
+
+ cmd_buffer_mark_attachment_written(cmd_buffer, &gfx->stencil_att,
+ VK_IMAGE_ASPECT_STENCIL_BIT);
+
+ if (has_color_resolve) {
/* We are about to do some MSAA resolves. We need to flush so that the
* result of writes to the MSAA color attachments show up in the sampler
* when we blit to the single-sampled resolve target.
@@ -6418,58 +5378,11 @@ cmd_buffer_end_subpass(struct anv_cmd_buffer *cmd_buffer)
ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT |
ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT,
"MSAA resolve");
-
- for (uint32_t i = 0; i < subpass->color_count; ++i) {
- uint32_t src_att = subpass->color_attachments[i].attachment;
- uint32_t dst_att = subpass->resolve_attachments[i].attachment;
-
- if (dst_att == VK_ATTACHMENT_UNUSED)
- continue;
-
- assert(src_att < cmd_buffer->state.pass->attachment_count);
- assert(dst_att < cmd_buffer->state.pass->attachment_count);
-
- if (cmd_buffer->state.attachments[dst_att].pending_clear_aspects) {
- /* From the Vulkan 1.0 spec:
- *
- * If the first use of an attachment in a render pass is as a
- * resolve attachment, then the loadOp is effectively ignored
- * as the resolve is guaranteed to overwrite all pixels in the
- * render area.
- */
- cmd_buffer->state.attachments[dst_att].pending_clear_aspects = 0;
- }
-
- struct anv_image_view *src_iview = cmd_state->attachments[src_att].image_view;
- struct anv_image_view *dst_iview = cmd_state->attachments[dst_att].image_view;
-
- const VkRect2D render_area = cmd_buffer->state.render_area;
-
- enum isl_aux_usage src_aux_usage =
- cmd_buffer->state.attachments[src_att].aux_usage;
- enum isl_aux_usage dst_aux_usage =
- cmd_buffer->state.attachments[dst_att].aux_usage;
-
- assert(src_iview->vk.aspects == VK_IMAGE_ASPECT_COLOR_BIT &&
- dst_iview->vk.aspects == VK_IMAGE_ASPECT_COLOR_BIT);
-
- anv_image_msaa_resolve(cmd_buffer,
- src_iview->image, src_aux_usage,
- src_iview->planes[0].isl.base_level,
- src_iview->planes[0].isl.base_array_layer,
- dst_iview->image, dst_aux_usage,
- dst_iview->planes[0].isl.base_level,
- dst_iview->planes[0].isl.base_array_layer,
- VK_IMAGE_ASPECT_COLOR_BIT,
- render_area.offset.x, render_area.offset.y,
- render_area.offset.x, render_area.offset.y,
- render_area.extent.width,
- render_area.extent.height,
- fb->layers, BLORP_FILTER_NONE);
- }
}
- if (subpass->ds_resolve_attachment) {
+ if (!(gfx->rendering_flags & VK_RENDERING_SUSPENDING_BIT) &&
+ (gfx->depth_att.resolve_mode != VK_RESOLVE_MODE_NONE ||
+ gfx->stencil_att.resolve_mode != VK_RESOLVE_MODE_NONE)) {
/* We are about to do some MSAA resolves. We need to flush so that the
* result of writes to the MSAA depth attachments show up in the sampler
* when we blit to the single-sampled resolve target.
@@ -6478,313 +5391,71 @@ cmd_buffer_end_subpass(struct anv_cmd_buffer *cmd_buffer)
ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT |
ANV_PIPE_DEPTH_CACHE_FLUSH_BIT,
"MSAA resolve");
-
- uint32_t src_att = subpass->depth_stencil_attachment->attachment;
- uint32_t dst_att = subpass->ds_resolve_attachment->attachment;
-
- assert(src_att < cmd_buffer->state.pass->attachment_count);
- assert(dst_att < cmd_buffer->state.pass->attachment_count);
-
- if (cmd_buffer->state.attachments[dst_att].pending_clear_aspects) {
- /* From the Vulkan 1.0 spec:
- *
- * If the first use of an attachment in a render pass is as a
- * resolve attachment, then the loadOp is effectively ignored
- * as the resolve is guaranteed to overwrite all pixels in the
- * render area.
- */
- cmd_buffer->state.attachments[dst_att].pending_clear_aspects = 0;
- }
-
- struct anv_image_view *src_iview = cmd_state->attachments[src_att].image_view;
- struct anv_image_view *dst_iview = cmd_state->attachments[dst_att].image_view;
-
- const VkRect2D render_area = cmd_buffer->state.render_area;
-
- struct anv_attachment_state *src_state =
- &cmd_state->attachments[src_att];
- struct anv_attachment_state *dst_state =
- &cmd_state->attachments[dst_att];
-
- if ((src_iview->image->vk.aspects & VK_IMAGE_ASPECT_DEPTH_BIT) &&
- subpass->depth_resolve_mode != VK_RESOLVE_MODE_NONE_KHR) {
-
- /* MSAA resolves sample from the source attachment. Transition the
- * depth attachment first to get rid of any HiZ that we may not be
- * able to handle.
- */
- transition_depth_buffer(cmd_buffer, src_iview->image,
- src_iview->planes[0].isl.base_array_layer,
- fb->layers,
- src_state->current_layout,
- VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL,
- false /* will_full_fast_clear */);
- src_state->aux_usage =
- anv_layout_to_aux_usage(&cmd_buffer->device->info, src_iview->image,
- VK_IMAGE_ASPECT_DEPTH_BIT,
- VK_IMAGE_USAGE_TRANSFER_SRC_BIT,
- VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL);
- src_state->current_layout = VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL;
-
- /* MSAA resolves write to the resolve attachment as if it were any
- * other transfer op. Transition the resolve attachment accordingly.
- */
- VkImageLayout dst_initial_layout = dst_state->current_layout;
-
- /* If our render area is the entire size of the image, we're going to
- * blow it all away so we can claim the initial layout is UNDEFINED
- * and we'll get a HiZ ambiguate instead of a resolve.
- */
- if (dst_iview->image->vk.image_type != VK_IMAGE_TYPE_3D &&
- render_area.offset.x == 0 && render_area.offset.y == 0 &&
- render_area.extent.width == dst_iview->vk.extent.width &&
- render_area.extent.height == dst_iview->vk.extent.height)
- dst_initial_layout = VK_IMAGE_LAYOUT_UNDEFINED;
-
- transition_depth_buffer(cmd_buffer, dst_iview->image,
- dst_iview->planes[0].isl.base_array_layer,
- fb->layers,
- dst_initial_layout,
- VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL,
- false /* will_full_fast_clear */);
- dst_state->aux_usage =
- anv_layout_to_aux_usage(&cmd_buffer->device->info, dst_iview->image,
- VK_IMAGE_ASPECT_DEPTH_BIT,
- VK_IMAGE_USAGE_TRANSFER_DST_BIT,
- VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL);
- dst_state->current_layout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL;
-
- enum blorp_filter filter =
- vk_to_blorp_resolve_mode(subpass->depth_resolve_mode);
-
- anv_image_msaa_resolve(cmd_buffer,
- src_iview->image, src_state->aux_usage,
- src_iview->planes[0].isl.base_level,
- src_iview->planes[0].isl.base_array_layer,
- dst_iview->image, dst_state->aux_usage,
- dst_iview->planes[0].isl.base_level,
- dst_iview->planes[0].isl.base_array_layer,
- VK_IMAGE_ASPECT_DEPTH_BIT,
- render_area.offset.x, render_area.offset.y,
- render_area.offset.x, render_area.offset.y,
- render_area.extent.width,
- render_area.extent.height,
- fb->layers, filter);
- }
-
- if ((src_iview->image->vk.aspects & VK_IMAGE_ASPECT_STENCIL_BIT) &&
- subpass->stencil_resolve_mode != VK_RESOLVE_MODE_NONE_KHR) {
-
- src_state->current_stencil_layout = VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL;
- dst_state->current_stencil_layout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL;
-
- enum isl_aux_usage src_aux_usage = ISL_AUX_USAGE_NONE;
- const uint32_t plane =
- anv_image_aspect_to_plane(dst_iview->image, VK_IMAGE_ASPECT_STENCIL_BIT);
- enum isl_aux_usage dst_aux_usage =
- dst_iview->image->planes[plane].aux_usage;
-
- enum blorp_filter filter =
- vk_to_blorp_resolve_mode(subpass->stencil_resolve_mode);
-
- anv_image_msaa_resolve(cmd_buffer,
- src_iview->image, src_aux_usage,
- src_iview->planes[0].isl.base_level,
- src_iview->planes[0].isl.base_array_layer,
- dst_iview->image, dst_aux_usage,
- dst_iview->planes[0].isl.base_level,
- dst_iview->planes[0].isl.base_array_layer,
- VK_IMAGE_ASPECT_STENCIL_BIT,
- render_area.offset.x, render_area.offset.y,
- render_area.offset.x, render_area.offset.y,
- render_area.extent.width,
- render_area.extent.height,
- fb->layers, filter);
- }
- }
-
-#if GFX_VER == 7
- /* On gfx7, we have to store a texturable version of the stencil buffer in
- * a shadow whenever VK_IMAGE_USAGE_SAMPLED_BIT is set and copy back and
- * forth at strategic points. Stencil writes are only allowed in following
- * layouts:
- *
- * - VK_IMAGE_LAYOUT_GENERAL
- * - VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL
- * - VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL
- * - VK_IMAGE_LAYOUT_DEPTH_READ_ONLY_STENCIL_ATTACHMENT_OPTIMAL
- * - VK_IMAGE_LAYOUT_STENCIL_ATTACHMENT_OPTIMAL_KHR
- *
- * For general, we have no nice opportunity to transition so we do the copy
- * to the shadow unconditionally at the end of the subpass. For transfer
- * destinations, we can update it as part of the transfer op. For the other
- * layouts, we delay the copy until a transition into some other layout.
- */
- if (subpass->depth_stencil_attachment) {
- uint32_t a = subpass->depth_stencil_attachment->attachment;
- assert(a != VK_ATTACHMENT_UNUSED);
-
- struct anv_attachment_state *att_state = &cmd_state->attachments[a];
- struct anv_image_view *iview = cmd_state->attachments[a].image_view;;
- const struct anv_image *image = iview->image;
-
- if (image->vk.aspects & VK_IMAGE_ASPECT_STENCIL_BIT) {
- const uint32_t plane =
- anv_image_aspect_to_plane(image, VK_IMAGE_ASPECT_STENCIL_BIT);
-
- if (anv_surface_is_valid(&image->planes[plane].shadow_surface) &&
- att_state->current_stencil_layout == VK_IMAGE_LAYOUT_GENERAL) {
- assert(image->vk.aspects & VK_IMAGE_ASPECT_STENCIL_BIT);
- anv_image_copy_to_shadow(cmd_buffer, image,
- VK_IMAGE_ASPECT_STENCIL_BIT,
- iview->planes[plane].isl.base_level, 1,
- iview->planes[plane].isl.base_array_layer,
- fb->layers);
- }
- }
}
-#endif /* GFX_VER == 7 */
- for (uint32_t i = 0; i < subpass->attachment_count; ++i) {
- const uint32_t a = subpass->attachments[i].attachment;
- if (a == VK_ATTACHMENT_UNUSED)
+ for (uint32_t i = 0; i < gfx->color_att_count; i++) {
+ const struct anv_attachment *att = &gfx->color_att[i];
+ if (att->resolve_mode == VK_RESOLVE_MODE_NONE ||
+ (gfx->rendering_flags & VK_RENDERING_SUSPENDING_BIT))
continue;
- if (cmd_state->pass->attachments[a].last_subpass_idx != subpass_id)
- continue;
-
- assert(a < cmd_state->pass->attachment_count);
- struct anv_attachment_state *att_state = &cmd_state->attachments[a];
- struct anv_image_view *iview = cmd_state->attachments[a].image_view;
- const struct anv_image *image = iview->image;
-
- /* Transition the image into the final layout for this render pass */
- VkImageLayout target_layout =
- cmd_state->pass->attachments[a].final_layout;
- VkImageLayout target_stencil_layout =
- cmd_state->pass->attachments[a].stencil_final_layout;
-
- uint32_t base_layer, layer_count;
- if (image->vk.image_type == VK_IMAGE_TYPE_3D) {
- base_layer = 0;
- layer_count = anv_minify(iview->image->vk.extent.depth,
- iview->planes[0].isl.base_level);
- } else {
- base_layer = iview->planes[0].isl.base_array_layer;
- layer_count = fb->layers;
- }
-
- if (image->vk.aspects & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV) {
- assert(image->vk.aspects == VK_IMAGE_ASPECT_COLOR_BIT);
- transition_color_buffer(cmd_buffer, image, VK_IMAGE_ASPECT_COLOR_BIT,
- iview->planes[0].isl.base_level, 1,
- base_layer, layer_count,
- att_state->current_layout, target_layout,
- VK_QUEUE_FAMILY_IGNORED,
- VK_QUEUE_FAMILY_IGNORED,
- false /* will_full_fast_clear */);
- }
-
- if (image->vk.aspects & VK_IMAGE_ASPECT_DEPTH_BIT) {
- transition_depth_buffer(cmd_buffer, image,
- base_layer, layer_count,
- att_state->current_layout, target_layout,
- false /* will_full_fast_clear */);
- }
-
- if (image->vk.aspects & VK_IMAGE_ASPECT_STENCIL_BIT) {
- transition_stencil_buffer(cmd_buffer, image,
- iview->planes[0].isl.base_level, 1,
- base_layer, layer_count,
- att_state->current_stencil_layout,
- target_stencil_layout,
- false /* will_full_fast_clear */);
- }
+ anv_attachment_msaa_resolve(cmd_buffer, att, att->layout,
+ VK_IMAGE_ASPECT_COLOR_BIT);
}
- /* Accumulate any subpass flushes that need to happen after the subpass.
- * Yes, they do get accumulated twice in the NextSubpass case but since
- * genX_CmdNextSubpass just calls end/begin back-to-back, we just end up
- * ORing the bits in twice so it's harmless.
- */
- anv_add_pending_pipe_bits(cmd_buffer,
- cmd_buffer->state.pass->subpass_flushes[subpass_id + 1],
- "end subpass deps/attachments");
-}
-
-void genX(CmdBeginRenderPass2)(
- VkCommandBuffer commandBuffer,
- const VkRenderPassBeginInfo* pRenderPassBeginInfo,
- const VkSubpassBeginInfoKHR* pSubpassBeginInfo)
-{
- ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
- ANV_FROM_HANDLE(anv_render_pass, pass, pRenderPassBeginInfo->renderPass);
- ANV_FROM_HANDLE(anv_framebuffer, framebuffer, pRenderPassBeginInfo->framebuffer);
- VkResult result;
-
- cmd_buffer->state.framebuffer = framebuffer;
- cmd_buffer->state.pass = pass;
- cmd_buffer->state.render_area = pRenderPassBeginInfo->renderArea;
+ if (gfx->depth_att.resolve_mode != VK_RESOLVE_MODE_NONE &&
+ !(gfx->rendering_flags & VK_RENDERING_SUSPENDING_BIT)) {
+ const struct anv_image_view *src_iview = gfx->depth_att.iview;
- anv_measure_beginrenderpass(cmd_buffer);
-
- result = genX(cmd_buffer_setup_attachments)(cmd_buffer, pass,
- framebuffer,
- pRenderPassBeginInfo);
- if (result != VK_SUCCESS) {
- assert(anv_batch_has_error(&cmd_buffer->batch));
- return;
+ /* MSAA resolves sample from the source attachment. Transition the
+ * depth attachment first to get rid of any HiZ that we may not be
+ * able to handle.
+ */
+ transition_depth_buffer(cmd_buffer, src_iview->image, 0, 1,
+ src_iview->planes[0].isl.base_array_layer,
+ layers,
+ gfx->depth_att.layout,
+ VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL,
+ false /* will_full_fast_clear */);
+
+ anv_attachment_msaa_resolve(cmd_buffer, &gfx->depth_att,
+ VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL,
+ VK_IMAGE_ASPECT_DEPTH_BIT);
+
+ /* Transition the source back to the original layout. This seems a bit
+ * inefficient but, since HiZ resolves aren't destructive, going from
+ * less HiZ to more is generally a no-op.
+ */
+ transition_depth_buffer(cmd_buffer, src_iview->image, 0, 1,
+ src_iview->planes[0].isl.base_array_layer,
+ layers,
+ VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL,
+ gfx->depth_att.layout,
+ false /* will_full_fast_clear */);
}
- genX(flush_pipeline_select_3d)(cmd_buffer);
-
- cmd_buffer_begin_subpass(cmd_buffer, 0);
-}
-
-void genX(CmdNextSubpass2)(
- VkCommandBuffer commandBuffer,
- const VkSubpassBeginInfoKHR* pSubpassBeginInfo,
- const VkSubpassEndInfoKHR* pSubpassEndInfo)
-{
- ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
-
- if (anv_batch_has_error(&cmd_buffer->batch))
- return;
-
- assert(cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY);
-
- uint32_t prev_subpass = anv_get_subpass_id(&cmd_buffer->state);
- cmd_buffer_end_subpass(cmd_buffer);
- cmd_buffer_begin_subpass(cmd_buffer, prev_subpass + 1);
-}
-
-void genX(CmdEndRenderPass2)(
- VkCommandBuffer commandBuffer,
- const VkSubpassEndInfoKHR* pSubpassEndInfo)
-{
- ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
-
- if (anv_batch_has_error(&cmd_buffer->batch))
- return;
+ if (gfx->stencil_att.resolve_mode != VK_RESOLVE_MODE_NONE &&
+ !(gfx->rendering_flags & VK_RENDERING_SUSPENDING_BIT)) {
+ anv_attachment_msaa_resolve(cmd_buffer, &gfx->stencil_att,
+ gfx->stencil_att.layout,
+ VK_IMAGE_ASPECT_STENCIL_BIT);
+ }
- cmd_buffer_end_subpass(cmd_buffer);
- cmd_buffer->state.hiz_enabled = false;
+ trace_intel_end_render_pass(&cmd_buffer->trace,
+ gfx->render_area.extent.width,
+ gfx->render_area.extent.height,
+ gfx->color_att_count,
+ gfx->samples);
- /* Remove references to render pass specific state. This enables us to
- * detect whether or not we're in a renderpass.
- */
- cmd_buffer->state.framebuffer = NULL;
- cmd_buffer->state.pass = NULL;
- cmd_buffer->state.subpass = NULL;
+ anv_cmd_buffer_reset_rendering(cmd_buffer);
}
void
genX(cmd_emit_conditional_render_predicate)(struct anv_cmd_buffer *cmd_buffer)
{
-#if GFX_VERx10 >= 75
struct mi_builder b;
- mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);
+ mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
mi_store(&b, mi_reg64(MI_PREDICATE_SRC0),
mi_reg32(ANV_PREDICATE_RESULT_REG));
@@ -6795,10 +5466,8 @@ genX(cmd_emit_conditional_render_predicate)(struct anv_cmd_buffer *cmd_buffer)
mip.CombineOperation = COMBINE_SET;
mip.CompareOperation = COMPARE_SRCS_EQUAL;
}
-#endif
}
-#if GFX_VERx10 >= 75
void genX(CmdBeginConditionalRenderingEXT)(
VkCommandBuffer commandBuffer,
const VkConditionalRenderingBeginInfoEXT* pConditionalRenderingBegin)
@@ -6817,7 +5486,9 @@ void genX(CmdBeginConditionalRenderingEXT)(
genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
struct mi_builder b;
- mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);
+ mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
+ const uint32_t mocs = anv_mocs_for_address(cmd_buffer->device, &value_address);
+ mi_builder_set_mocs(&b, mocs);
/* Section 19.4 of the Vulkan 1.1.85 spec says:
*
@@ -6849,121 +5520,162 @@ void genX(CmdEndConditionalRenderingEXT)(
cmd_state->conditional_render_enabled = false;
}
-#endif
-/* Set of stage bits for which are pipelined, i.e. they get queued by the
- * command streamer for later execution.
+/* Set of stage bits that are pipelined, i.e. they get queued
+ * by the command streamer for later execution.
*/
#define ANV_PIPELINE_STAGE_PIPELINED_BITS \
- (VK_PIPELINE_STAGE_VERTEX_INPUT_BIT | \
- VK_PIPELINE_STAGE_VERTEX_SHADER_BIT | \
- VK_PIPELINE_STAGE_TESSELLATION_CONTROL_SHADER_BIT | \
- VK_PIPELINE_STAGE_TESSELLATION_EVALUATION_SHADER_BIT | \
- VK_PIPELINE_STAGE_GEOMETRY_SHADER_BIT | \
- VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT | \
- VK_PIPELINE_STAGE_EARLY_FRAGMENT_TESTS_BIT | \
- VK_PIPELINE_STAGE_LATE_FRAGMENT_TESTS_BIT | \
- VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT | \
- VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT | \
- VK_PIPELINE_STAGE_TRANSFER_BIT | \
- VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT | \
- VK_PIPELINE_STAGE_ALL_GRAPHICS_BIT | \
- VK_PIPELINE_STAGE_ALL_COMMANDS_BIT)
-
-void genX(CmdSetEvent)(
+ ~(VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT | \
+ VK_PIPELINE_STAGE_2_DRAW_INDIRECT_BIT | \
+ VK_PIPELINE_STAGE_2_HOST_BIT | \
+ VK_PIPELINE_STAGE_2_CONDITIONAL_RENDERING_BIT_EXT)
+
+void genX(CmdSetEvent2)(
VkCommandBuffer commandBuffer,
VkEvent _event,
- VkPipelineStageFlags stageMask)
+ const VkDependencyInfo* pDependencyInfo)
{
ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
ANV_FROM_HANDLE(anv_event, event, _event);
+ if (anv_cmd_buffer_is_video_queue(cmd_buffer)) {
+ anv_batch_emit(&cmd_buffer->batch, GENX(MI_FLUSH_DW), flush) {
+ flush.PostSyncOperation = WriteImmediateData;
+ flush.Address = anv_state_pool_state_address(
+ &cmd_buffer->device->dynamic_state_pool,
+ event->state);
+ flush.ImmediateData = VK_EVENT_SET;
+ }
+ return;
+ }
+
+ VkPipelineStageFlags2 src_stages = 0;
+
+ for (uint32_t i = 0; i < pDependencyInfo->memoryBarrierCount; i++)
+ src_stages |= pDependencyInfo->pMemoryBarriers[i].srcStageMask;
+ for (uint32_t i = 0; i < pDependencyInfo->bufferMemoryBarrierCount; i++)
+ src_stages |= pDependencyInfo->pBufferMemoryBarriers[i].srcStageMask;
+ for (uint32_t i = 0; i < pDependencyInfo->imageMemoryBarrierCount; i++)
+ src_stages |= pDependencyInfo->pImageMemoryBarriers[i].srcStageMask;
+
cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_POST_SYNC_BIT;
genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
- anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
- if (stageMask & ANV_PIPELINE_STAGE_PIPELINED_BITS) {
- pc.StallAtPixelScoreboard = true;
- pc.CommandStreamerStallEnable = true;
- }
-
- pc.DestinationAddressType = DAT_PPGTT,
- pc.PostSyncOperation = WriteImmediateData,
- pc.Address = (struct anv_address) {
- cmd_buffer->device->dynamic_state_pool.block_pool.bo,
- event->state.offset
- };
- pc.ImmediateData = VK_EVENT_SET;
- anv_debug_dump_pc(pc);
- }
+ enum anv_pipe_bits pc_bits = 0;
+ if (src_stages & ANV_PIPELINE_STAGE_PIPELINED_BITS) {
+ pc_bits |= ANV_PIPE_STALL_AT_SCOREBOARD_BIT;
+ pc_bits |= ANV_PIPE_CS_STALL_BIT;
+ }
+
+ genx_batch_emit_pipe_control_write
+ (&cmd_buffer->batch, cmd_buffer->device->info,
+ cmd_buffer->state.current_pipeline, WriteImmediateData,
+ anv_state_pool_state_address(&cmd_buffer->device->dynamic_state_pool,
+ event->state),
+ VK_EVENT_SET, pc_bits);
}
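
The srcStageMask accumulation above mirrors what arrives from the API: the application sets the event with a full VkDependencyInfo, and only the source halves of its barriers determine whether the post-sync write needs CS/scoreboard stalls. A hedged usage sketch, not part of the patch; cmdbuf and event are placeholder handles.

#include <vulkan/vulkan.h>

static void
set_event_after_copy(VkCommandBuffer cmdbuf, VkEvent event /* placeholder */)
{
   const VkMemoryBarrier2 barrier = {
      .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER_2,
      .srcStageMask = VK_PIPELINE_STAGE_2_COPY_BIT, /* pipelined stage */
      .srcAccessMask = VK_ACCESS_2_TRANSFER_WRITE_BIT,
      .dstStageMask = VK_PIPELINE_STAGE_2_FRAGMENT_SHADER_BIT,
      .dstAccessMask = VK_ACCESS_2_SHADER_READ_BIT,
   };
   const VkDependencyInfo dep = {
      .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
      .memoryBarrierCount = 1,
      .pMemoryBarriers = &barrier,
   };
   /* The driver ORs srcStageMask across every barrier array in dep; a
    * pipelined source stage (like COPY here) makes it add CS stall +
    * stall-at-scoreboard to the PIPE_CONTROL that writes VK_EVENT_SET.
    */
   vkCmdSetEvent2(cmdbuf, event, &dep);
}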
-void genX(CmdResetEvent)(
+void genX(CmdResetEvent2)(
VkCommandBuffer commandBuffer,
VkEvent _event,
- VkPipelineStageFlags stageMask)
+ VkPipelineStageFlags2 stageMask)
{
ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
ANV_FROM_HANDLE(anv_event, event, _event);
+ if (anv_cmd_buffer_is_video_queue(cmd_buffer)) {
+ anv_batch_emit(&cmd_buffer->batch, GENX(MI_FLUSH_DW), flush) {
+ flush.PostSyncOperation = WriteImmediateData;
+ flush.Address = anv_state_pool_state_address(
+ &cmd_buffer->device->dynamic_state_pool,
+ event->state);
+ flush.ImmediateData = VK_EVENT_RESET;
+ }
+ return;
+ }
+
cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_POST_SYNC_BIT;
genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
- anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
- if (stageMask & ANV_PIPELINE_STAGE_PIPELINED_BITS) {
- pc.StallAtPixelScoreboard = true;
- pc.CommandStreamerStallEnable = true;
- }
-
- pc.DestinationAddressType = DAT_PPGTT;
- pc.PostSyncOperation = WriteImmediateData;
- pc.Address = (struct anv_address) {
- cmd_buffer->device->dynamic_state_pool.block_pool.bo,
- event->state.offset
- };
- pc.ImmediateData = VK_EVENT_RESET;
- anv_debug_dump_pc(pc);
- }
+ enum anv_pipe_bits pc_bits = 0;
+ if (stageMask & ANV_PIPELINE_STAGE_PIPELINED_BITS) {
+ pc_bits |= ANV_PIPE_STALL_AT_SCOREBOARD_BIT;
+ pc_bits |= ANV_PIPE_CS_STALL_BIT;
+ }
+
+ genx_batch_emit_pipe_control_write
+ (&cmd_buffer->batch, cmd_buffer->device->info,
+ cmd_buffer->state.current_pipeline, WriteImmediateData,
+ anv_state_pool_state_address(&cmd_buffer->device->dynamic_state_pool,
+ event->state),
+ VK_EVENT_RESET,
+ pc_bits);
}
-void genX(CmdWaitEvents)(
+void genX(CmdWaitEvents2)(
VkCommandBuffer commandBuffer,
uint32_t eventCount,
const VkEvent* pEvents,
- VkPipelineStageFlags srcStageMask,
- VkPipelineStageFlags destStageMask,
- uint32_t memoryBarrierCount,
- const VkMemoryBarrier* pMemoryBarriers,
- uint32_t bufferMemoryBarrierCount,
- const VkBufferMemoryBarrier* pBufferMemoryBarriers,
- uint32_t imageMemoryBarrierCount,
- const VkImageMemoryBarrier* pImageMemoryBarriers)
+ const VkDependencyInfo* pDependencyInfos)
{
-#if GFX_VER >= 8
ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
for (uint32_t i = 0; i < eventCount; i++) {
ANV_FROM_HANDLE(anv_event, event, pEvents[i]);
anv_batch_emit(&cmd_buffer->batch, GENX(MI_SEMAPHORE_WAIT), sem) {
- sem.WaitMode = PollingMode,
- sem.CompareOperation = COMPARE_SAD_EQUAL_SDD,
- sem.SemaphoreDataDword = VK_EVENT_SET,
- sem.SemaphoreAddress = (struct anv_address) {
- cmd_buffer->device->dynamic_state_pool.block_pool.bo,
- event->state.offset
- };
+ sem.WaitMode = PollingMode;
+ sem.CompareOperation = COMPARE_SAD_EQUAL_SDD;
+ sem.SemaphoreDataDword = VK_EVENT_SET;
+ sem.SemaphoreAddress = anv_state_pool_state_address(
+ &cmd_buffer->device->dynamic_state_pool,
+ event->state);
}
}
-#else
- anv_finishme("Implement events on gfx7");
-#endif
- genX(CmdPipelineBarrier)(commandBuffer, srcStageMask, destStageMask,
- false, /* byRegion */
- memoryBarrierCount, pMemoryBarriers,
- bufferMemoryBarrierCount, pBufferMemoryBarriers,
- imageMemoryBarrierCount, pImageMemoryBarriers);
+ cmd_buffer_barrier(cmd_buffer, pDependencyInfos, "wait event");
+}
+
+static uint32_t vk_to_intel_index_type(VkIndexType type)
+{
+ switch (type) {
+ case VK_INDEX_TYPE_UINT8_KHR:
+ return INDEX_BYTE;
+ case VK_INDEX_TYPE_UINT16:
+ return INDEX_WORD;
+ case VK_INDEX_TYPE_UINT32:
+ return INDEX_DWORD;
+ default:
+ unreachable("invalid index type");
+ }
+}
+
+void genX(CmdBindIndexBuffer2KHR)(
+ VkCommandBuffer commandBuffer,
+ VkBuffer _buffer,
+ VkDeviceSize offset,
+ VkDeviceSize size,
+ VkIndexType indexType)
+{
+ ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+ ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
+
+ uint32_t restart_index = vk_index_to_restart(indexType);
+ if (cmd_buffer->state.gfx.restart_index != restart_index) {
+ cmd_buffer->state.gfx.restart_index = restart_index;
+ cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_RESTART_INDEX;
+ }
+
+ uint32_t index_type = vk_to_intel_index_type(indexType);
+ if (cmd_buffer->state.gfx.index_buffer != buffer ||
+ cmd_buffer->state.gfx.index_type != index_type ||
+ cmd_buffer->state.gfx.index_offset != offset) {
+ cmd_buffer->state.gfx.index_buffer = buffer;
+ cmd_buffer->state.gfx.index_type = vk_to_intel_index_type(indexType);
+ cmd_buffer->state.gfx.index_offset = offset;
+ cmd_buffer->state.gfx.index_size = buffer ? vk_buffer_range(&buffer->vk, offset, size) : 0;
+ cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_INDEX_BUFFER;
+ }
}
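
For context, the size parameter comes from the VK_KHR_maintenance5 bind entry point; vk_buffer_range() clamps it against the buffer, so VK_WHOLE_SIZE works as well. A small illustrative sketch, not part of the patch; the handles are placeholders.

#include <vulkan/vulkan.h>

static void
bind_index_subrange(VkCommandBuffer cmdbuf, VkBuffer index_buffer /* placeholder */)
{
   /* Bind only the first 6 KiB of the buffer as 16-bit indices. The driver
    * records the clamped range as gfx.index_size and only flags
    * ANV_CMD_DIRTY_INDEX_BUFFER when something actually changed.
    */
   vkCmdBindIndexBuffer2KHR(cmdbuf, index_buffer,
                            0,        /* offset */
                            6 * 1024, /* size, or VK_WHOLE_SIZE */
                            VK_INDEX_TYPE_UINT16);
}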
VkResult genX(CmdSetPerformanceOverrideINTEL)(
@@ -6974,21 +5686,12 @@ VkResult genX(CmdSetPerformanceOverrideINTEL)(
switch (pOverrideInfo->type) {
case VK_PERFORMANCE_OVERRIDE_TYPE_NULL_HARDWARE_INTEL: {
-#if GFX_VER >= 9
anv_batch_write_reg(&cmd_buffer->batch, GENX(CS_DEBUG_MODE2), csdm2) {
csdm2._3DRenderingInstructionDisable = pOverrideInfo->enable;
csdm2.MediaInstructionDisable = pOverrideInfo->enable;
csdm2._3DRenderingInstructionDisableMask = true;
csdm2.MediaInstructionDisableMask = true;
}
-#else
- anv_batch_write_reg(&cmd_buffer->batch, GENX(INSTPM), instpm) {
- instpm._3DRenderingInstructionDisable = pOverrideInfo->enable;
- instpm.MediaInstructionDisable = pOverrideInfo->enable;
- instpm._3DRenderingInstructionDisableMask = true;
- instpm.MediaInstructionDisableMask = true;
- }
-#endif
break;
}
@@ -7019,13 +5722,495 @@ VkResult genX(CmdSetPerformanceStreamMarkerINTEL)(
return VK_SUCCESS;
}
+#define TIMESTAMP 0x2358
+
void genX(cmd_emit_timestamp)(struct anv_batch *batch,
- struct anv_bo *bo,
- uint32_t offset) {
- anv_batch_emit(batch, GENX(PIPE_CONTROL), pc) {
- pc.CommandStreamerStallEnable = true;
- pc.PostSyncOperation = WriteTimestamp;
- pc.Address = (struct anv_address) {bo, offset};
- anv_debug_dump_pc(pc);
+ struct anv_device *device,
+ struct anv_address addr,
+ enum anv_timestamp_capture_type type,
+ void *data) {
+ /* Make sure the ANV_TIMESTAMP_CAPTURE_AT_CS_STALL and
+ * ANV_TIMESTAMP_REWRITE_COMPUTE_WALKER capture types are not used on the
+ * transfer or video queues.
+ */
+ if ((batch->engine_class == INTEL_ENGINE_CLASS_COPY) ||
+ (batch->engine_class == INTEL_ENGINE_CLASS_VIDEO)) {
+ assert(type != ANV_TIMESTAMP_CAPTURE_AT_CS_STALL &&
+ type != ANV_TIMESTAMP_REWRITE_COMPUTE_WALKER);
}
+
+ switch (type) {
+ case ANV_TIMESTAMP_CAPTURE_TOP_OF_PIPE: {
+ struct mi_builder b;
+ mi_builder_init(&b, device->info, batch);
+ mi_store(&b, mi_mem64(addr), mi_reg64(TIMESTAMP));
+ break;
+ }
+
+ case ANV_TIMESTAMP_CAPTURE_END_OF_PIPE: {
+ if ((batch->engine_class == INTEL_ENGINE_CLASS_COPY) ||
+ (batch->engine_class == INTEL_ENGINE_CLASS_VIDEO)) {
+ /* Wa_16018063123 - emit fast color dummy blit before MI_FLUSH_DW. */
+ if (intel_needs_workaround(device->info, 16018063123))
+ genX(batch_emit_fast_color_dummy_blit)(batch, device);
+ anv_batch_emit(batch, GENX(MI_FLUSH_DW), fd) {
+ fd.PostSyncOperation = WriteTimestamp;
+ fd.Address = addr;
+ }
+ } else {
+ genx_batch_emit_pipe_control_write(batch, device->info, 0,
+ WriteTimestamp, addr, 0, 0);
+ }
+ break;
+ }
+
+ case ANV_TIMESTAMP_CAPTURE_AT_CS_STALL:
+ genx_batch_emit_pipe_control_write
+ (batch, device->info, 0, WriteTimestamp, addr, 0,
+ ANV_PIPE_CS_STALL_BIT);
+ break;
+
+#if GFX_VERx10 >= 125
+ case ANV_TIMESTAMP_REWRITE_COMPUTE_WALKER: {
+ uint32_t dwords[GENX(COMPUTE_WALKER_length)];
+
+ GENX(COMPUTE_WALKER_pack)(batch, dwords, &(struct GENX(COMPUTE_WALKER)) {
+ .PostSync = (struct GENX(POSTSYNC_DATA)) {
+ .Operation = WriteTimestamp,
+ .DestinationAddress = addr,
+ .MOCS = anv_mocs(device, NULL, 0),
+ },
+ });
+
+ for (uint32_t i = 0; i < ARRAY_SIZE(dwords); i++)
+ ((uint32_t *)data)[i] |= dwords[i];
+ break;
+ }
+
+ case ANV_TIMESTAMP_REWRITE_INDIRECT_DISPATCH: {
+ uint32_t dwords[GENX(EXECUTE_INDIRECT_DISPATCH_length)];
+
+ GENX(EXECUTE_INDIRECT_DISPATCH_pack)
+ (batch, dwords, &(struct GENX(EXECUTE_INDIRECT_DISPATCH)) {
+ .MOCS = anv_mocs(device, NULL, 0),
+ .COMPUTE_WALKER_BODY = {
+ .PostSync = (struct GENX(POSTSYNC_DATA)) {
+ .Operation = WriteTimestamp,
+ .DestinationAddress = addr,
+ .MOCS = anv_mocs(device, NULL, 0),
+ },
+ }
+ });
+
+ for (uint32_t i = 0; i < ARRAY_SIZE(dwords); i++)
+ ((uint32_t *)data)[i] |= dwords[i];
+ break;
+ }
+#endif
+
+ default:
+ unreachable("invalid");
+ }
+}
+
+void genX(batch_emit_secondary_call)(struct anv_batch *batch,
+ struct anv_address secondary_addr,
+ struct anv_address secondary_return_addr)
+{
+ /* Emit a write to change the return address of the secondary */
+ uint64_t *write_return_addr =
+ anv_batch_emitn(batch,
+ GENX(MI_STORE_DATA_IMM_length) + 1 /* QWord write */,
+ GENX(MI_STORE_DATA_IMM),
+#if GFX_VER >= 12
+ .ForceWriteCompletionCheck = true,
+#endif
+ .Address = secondary_return_addr) +
+ GENX(MI_STORE_DATA_IMM_ImmediateData_start) / 8;
+
+#if GFX_VER >= 12
+ /* Disable prefetcher before jumping into a secondary */
+ anv_batch_emit(batch, GENX(MI_ARB_CHECK), arb) {
+ arb.PreParserDisableMask = true;
+ arb.PreParserDisable = true;
+ }
+#endif
+
+ /* Jump into the secondary */
+ anv_batch_emit(batch, GENX(MI_BATCH_BUFFER_START), bbs) {
+ bbs.AddressSpaceIndicator = ASI_PPGTT;
+ bbs.SecondLevelBatchBuffer = Firstlevelbatch;
+ bbs.BatchBufferStartAddress = secondary_addr;
+ }
+
+ /* Replace the return address written by the MI_STORE_DATA_IMM above with
+ * the primary's current batch address (immediately after the jump).
+ */
+ *write_return_addr =
+ anv_address_physical(anv_batch_current_address(batch));
+}
+
+void *
+genX(batch_emit_return)(struct anv_batch *batch)
+{
+ return anv_batch_emitn(batch,
+ GENX(MI_BATCH_BUFFER_START_length),
+ GENX(MI_BATCH_BUFFER_START),
+ .AddressSpaceIndicator = ASI_PPGTT,
+ .SecondLevelBatchBuffer = Firstlevelbatch);
+}
+
+void
+genX(batch_emit_post_3dprimitive_was)(struct anv_batch *batch,
+ const struct anv_device *device,
+ uint32_t primitive_topology,
+ uint32_t vertex_count)
+{
+#if INTEL_WA_22014412737_GFX_VER || INTEL_WA_16014538804_GFX_VER
+ if (intel_needs_workaround(device->info, 22014412737) &&
+ (primitive_topology == _3DPRIM_POINTLIST ||
+ primitive_topology == _3DPRIM_LINELIST ||
+ primitive_topology == _3DPRIM_LINESTRIP ||
+ primitive_topology == _3DPRIM_LINELIST_ADJ ||
+ primitive_topology == _3DPRIM_LINESTRIP_ADJ ||
+ primitive_topology == _3DPRIM_LINELOOP ||
+ primitive_topology == _3DPRIM_POINTLIST_BF ||
+ primitive_topology == _3DPRIM_LINESTRIP_CONT ||
+ primitive_topology == _3DPRIM_LINESTRIP_BF ||
+ primitive_topology == _3DPRIM_LINESTRIP_CONT_BF) &&
+ (vertex_count == 1 || vertex_count == 2)) {
+ genx_batch_emit_pipe_control_write
+ (batch, device->info, 0, WriteImmediateData,
+ device->workaround_address, 0, 0);
+
+ /* Reset counter because we just emitted a PC */
+ batch->num_3d_primitives_emitted = 0;
+ } else if (intel_needs_workaround(device->info, 16014538804)) {
+ batch->num_3d_primitives_emitted++;
+ /* Wa_16014538804:
+ * After every 3 3D_PRIMITIVE commands,
+ * at least 1 PIPE_CONTROL must be inserted.
+ */
+ if (batch->num_3d_primitives_emitted == 3) {
+ anv_batch_emit(batch, GENX(PIPE_CONTROL), pc);
+ batch->num_3d_primitives_emitted = 0;
+ }
+ }
+#endif
+}
+
+/* Wa_16018063123 */
+ALWAYS_INLINE void
+genX(batch_emit_fast_color_dummy_blit)(struct anv_batch *batch,
+ struct anv_device *device)
+{
+#if GFX_VERx10 >= 125
+ anv_batch_emit(batch, GENX(XY_FAST_COLOR_BLT), blt) {
+ blt.DestinationBaseAddress = device->workaround_address;
+ blt.DestinationMOCS = device->isl_dev.mocs.blitter_dst;
+ blt.DestinationPitch = 63;
+ blt.DestinationX2 = 1;
+ blt.DestinationY2 = 4;
+ blt.DestinationSurfaceWidth = 1;
+ blt.DestinationSurfaceHeight = 4;
+ blt.DestinationSurfaceType = XY_SURFTYPE_2D;
+ blt.DestinationSurfaceQPitch = 4;
+ blt.DestinationTiling = XY_TILE_LINEAR;
+ }
+#endif
+}
+
+void
+genX(urb_workaround)(struct anv_cmd_buffer *cmd_buffer,
+ const struct intel_urb_config *urb_cfg)
+{
+#if INTEL_NEEDS_WA_16014912113
+ const struct intel_urb_config *current =
+ &cmd_buffer->state.gfx.urb_cfg;
+ if (intel_urb_setup_changed(urb_cfg, current, MESA_SHADER_TESS_EVAL) &&
+ current->size[0] != 0) {
+ for (int i = 0; i <= MESA_SHADER_GEOMETRY; i++) {
+ anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_URB_VS), urb) {
+ urb._3DCommandSubOpcode += i;
+ urb.VSURBStartingAddress = current->start[i];
+ urb.VSURBEntryAllocationSize = current->size[i] - 1;
+ urb.VSNumberofURBEntries = i == 0 ? 256 : 0;
+ }
+ }
+ anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
+ pc.HDCPipelineFlushEnable = true;
+ }
+ }
+#endif
+}
+
+struct anv_state
+genX(cmd_buffer_begin_companion_rcs_syncpoint)(
+ struct anv_cmd_buffer *cmd_buffer)
+{
+#if GFX_VERx10 >= 125
+ const struct intel_device_info *info = cmd_buffer->device->info;
+ struct anv_state syncpoint =
+ anv_cmd_buffer_alloc_temporary_state(cmd_buffer, 2 * sizeof(uint32_t), 4);
+ struct anv_address xcs_wait_addr =
+ anv_cmd_buffer_temporary_state_address(cmd_buffer, syncpoint);
+ struct anv_address rcs_wait_addr = anv_address_add(xcs_wait_addr, 4);
+
+ /* Reset the sync point */
+ memset(syncpoint.map, 0, 2 * sizeof(uint32_t));
+
+ struct mi_builder b;
+
+ /* On CCS:
+ * - flush all caches & invalidate
+ * - unblock RCS
+ * - wait on RCS to complete
+ * - clear the value we waited on
+ */
+
+ if (anv_cmd_buffer_is_compute_queue(cmd_buffer)) {
+ anv_add_pending_pipe_bits(cmd_buffer, ANV_PIPE_FLUSH_BITS |
+ ANV_PIPE_INVALIDATE_BITS |
+ ANV_PIPE_STALL_BITS,
+ "post main cmd buffer invalidate");
+ genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
+ } else if (anv_cmd_buffer_is_blitter_queue(cmd_buffer)) {
+ /* Wa_16018063123 - emit fast color dummy blit before MI_FLUSH_DW. */
+ if (intel_needs_workaround(cmd_buffer->device->info, 16018063123)) {
+ genX(batch_emit_fast_color_dummy_blit)(&cmd_buffer->batch,
+ cmd_buffer->device);
+ }
+ anv_batch_emit(&cmd_buffer->batch, GENX(MI_FLUSH_DW), fd) {
+ fd.FlushCCS = true; /* Maybe handle Flush LLC */
+ }
+ }
+
+ {
+ mi_builder_init(&b, info, &cmd_buffer->batch);
+ mi_store(&b, mi_mem32(rcs_wait_addr), mi_imm(0x1));
+ anv_batch_emit(&cmd_buffer->batch, GENX(MI_SEMAPHORE_WAIT), sem) {
+ sem.WaitMode = PollingMode;
+ sem.CompareOperation = COMPARE_SAD_EQUAL_SDD;
+ sem.SemaphoreDataDword = 0x1;
+ sem.SemaphoreAddress = xcs_wait_addr;
+ }
+ /* Make sure to reset the semaphore in case the command buffer is run
+ * multiple times.
+ */
+ mi_store(&b, mi_mem32(xcs_wait_addr), mi_imm(0x0));
+ }
+
+ /* On RCS:
+ * - wait on CCS signal
+ * - clear the value we waited on
+ */
+ {
+ mi_builder_init(&b, info, &cmd_buffer->companion_rcs_cmd_buffer->batch);
+ anv_batch_emit(&cmd_buffer->companion_rcs_cmd_buffer->batch,
+ GENX(MI_SEMAPHORE_WAIT),
+ sem) {
+ sem.WaitMode = PollingMode;
+ sem.CompareOperation = COMPARE_SAD_EQUAL_SDD;
+ sem.SemaphoreDataDword = 0x1;
+ sem.SemaphoreAddress = rcs_wait_addr;
+ }
+ /* Make sure to reset the semaphore in case the command buffer is run
+ * multiple times.
+ */
+ mi_store(&b, mi_mem32(rcs_wait_addr), mi_imm(0x0));
+ }
+
+ return syncpoint;
+#else
+ unreachable("Not implemented");
+#endif
+}
+
+void
+genX(cmd_buffer_end_companion_rcs_syncpoint)(struct anv_cmd_buffer *cmd_buffer,
+ struct anv_state syncpoint)
+{
+#if GFX_VERx10 >= 125
+ struct anv_address xcs_wait_addr =
+ anv_cmd_buffer_temporary_state_address(cmd_buffer, syncpoint);
+
+ struct mi_builder b;
+
+ /* On RCS:
+ * - flush all caches & invalidate
+ * - unblock the CCS
+ */
+ anv_add_pending_pipe_bits(cmd_buffer->companion_rcs_cmd_buffer,
+ ANV_PIPE_FLUSH_BITS |
+ ANV_PIPE_INVALIDATE_BITS |
+ ANV_PIPE_STALL_BITS,
+ "post rcs flush");
+ genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer->companion_rcs_cmd_buffer);
+
+ mi_builder_init(&b, cmd_buffer->device->info,
+ &cmd_buffer->companion_rcs_cmd_buffer->batch);
+ mi_store(&b, mi_mem32(xcs_wait_addr), mi_imm(0x1));
+#else
+ unreachable("Not implemented");
+#endif
+}
+
+VkResult
+genX(write_trtt_entries)(struct anv_trtt_submission *submit)
+{
+#if GFX_VER >= 12
+ const struct intel_device_info *devinfo =
+ submit->sparse->queue->device->info;
+
+ size_t batch_size = submit->l3l2_binds_len * 20 +
+ submit->l1_binds_len * 16 +
+ GENX(PIPE_CONTROL_length) * sizeof(uint32_t) + 8;
+ STACK_ARRAY(uint32_t, cmds, batch_size);
+ struct anv_batch batch = {
+ .start = cmds,
+ .next = cmds,
+ .end = (void *)cmds + batch_size,
+ };
+
+ /* BSpec says:
+ * "DWord Length programmed must not exceed 0x3FE."
+ * For a single dword write the programmed length is 2, and for a single
+ * qword it's 3. This is the value actually written to the register field,
+ * so it does not include the length bias.
+ */
+ uint32_t dword_write_len = 2;
+ uint32_t qword_write_len = 3;
+ uint32_t max_dword_extra_writes = 0x3FE - dword_write_len;
+ uint32_t max_qword_extra_writes = (0x3FE - qword_write_len) / 2;
+
+ /* What makes the code below quite complicated is the fact that we can
+ * write multiple values with MI_STORE_DATA_IMM as long as the writes go to
+ * contiguous addresses.
+ */
+
+ for (int i = 0; i < submit->l3l2_binds_len; i++) {
+ int extra_writes = 0;
+ for (int j = i + 1;
+ j < submit->l3l2_binds_len &&
+ extra_writes <= max_qword_extra_writes;
+ j++) {
+ if (submit->l3l2_binds[i].pte_addr + (j - i) * 8 ==
+ submit->l3l2_binds[j].pte_addr) {
+ extra_writes++;
+ } else {
+ break;
+ }
+ }
+ bool is_last_write = submit->l1_binds_len == 0 &&
+ i + extra_writes + 1 == submit->l3l2_binds_len;
+
+ uint32_t total_len = GENX(MI_STORE_DATA_IMM_length_bias) +
+ qword_write_len + (extra_writes * 2);
+ uint32_t *dw;
+ dw = anv_batch_emitn(&batch, total_len, GENX(MI_STORE_DATA_IMM),
+ .ForceWriteCompletionCheck = is_last_write,
+ .StoreQword = true,
+ .Address = anv_address_from_u64(submit->l3l2_binds[i].pte_addr),
+ );
+ dw += 3;
+ for (int j = 0; j < extra_writes + 1; j++) {
+ uint64_t entry_addr_64b = submit->l3l2_binds[i + j].entry_addr;
+ *dw = entry_addr_64b & 0xFFFFFFFF;
+ dw++;
+ *dw = (entry_addr_64b >> 32) & 0xFFFFFFFF;
+ dw++;
+ }
+ assert(dw == batch.next);
+
+ i += extra_writes;
+ }
+
+ for (int i = 0; i < submit->l1_binds_len; i++) {
+ int extra_writes = 0;
+ for (int j = i + 1;
+ j < submit->l1_binds_len && extra_writes <= max_dword_extra_writes;
+ j++) {
+ if (submit->l1_binds[i].pte_addr + (j - i) * 4 ==
+ submit->l1_binds[j].pte_addr) {
+ extra_writes++;
+ } else {
+ break;
+ }
+ }
+
+ bool is_last_write = i + extra_writes + 1 == submit->l1_binds_len;
+
+ uint32_t total_len = GENX(MI_STORE_DATA_IMM_length_bias) +
+ dword_write_len + extra_writes;
+ uint32_t *dw;
+ dw = anv_batch_emitn(&batch, total_len, GENX(MI_STORE_DATA_IMM),
+ .ForceWriteCompletionCheck = is_last_write,
+ .Address = anv_address_from_u64(submit->l1_binds[i].pte_addr),
+ );
+ dw += 3;
+ for (int j = 0; j < extra_writes + 1; j++) {
+ *dw = (submit->l1_binds[i + j].entry_addr >> 16) & 0xFFFFFFFF;
+ dw++;
+ }
+ assert(dw == batch.next);
+
+ i += extra_writes;
+ }
+
+ genx_batch_emit_pipe_control(&batch, devinfo, _3D,
+ ANV_PIPE_CS_STALL_BIT |
+ ANV_PIPE_TLB_INVALIDATE_BIT);
+
+ anv_batch_emit(&batch, GENX(MI_BATCH_BUFFER_END), bbe);
+
+ assert(batch.next <= batch.end);
+
+ VkResult result = anv_queue_submit_trtt_batch(submit->sparse, &batch);
+ STACK_ARRAY_FINISH(cmds);
+
+ return result;
+
+#endif
+ return VK_SUCCESS;
+}
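
A standalone sketch (illustrative types and names, not driver API) of the coalescing rule the two loops above apply: consecutive PTEs that are contiguous in memory ride in one MI_STORE_DATA_IMM, bounded by the 0x3FE DWord Length limit, i.e. at most 0x3FE - 2 = 1020 extra dword writes or (0x3FE - 3) / 2 = 509 extra qword writes.

#include <stdint.h>

struct pte_bind { uint64_t pte_addr; uint64_t entry_addr; };

/* Count how many binds following binds[i] can share its MI_STORE_DATA_IMM:
 * their PTE addresses must be contiguous (stride = entry size in bytes) and
 * the total must stay under the max_extra cap derived from the 0x3FE limit.
 */
static int
count_extra_writes(const struct pte_bind *binds, int len, int i,
                   uint64_t stride, int max_extra)
{
   int extra = 0;
   for (int j = i + 1; j < len && extra < max_extra; j++) {
      if (binds[i].pte_addr + (uint64_t)(j - i) * stride != binds[j].pte_addr)
         break;
      extra++;
   }
   return extra;
}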
+
+void
+genX(CmdWriteBufferMarker2AMD)(VkCommandBuffer commandBuffer,
+ VkPipelineStageFlags2 stage,
+ VkBuffer dstBuffer,
+ VkDeviceSize dstOffset,
+ uint32_t marker)
+{
+ ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+ ANV_FROM_HANDLE(anv_buffer, buffer, dstBuffer);
+
+ /* The barriers inserted by the application to make dstBuffer writable
+ * should already have the L1/L2 cache flushes. On platforms where the
+ * command streamer is not coherent with L3, we need an additional set of
+ * cache flushes.
+ */
+ enum anv_pipe_bits bits =
+ (ANV_DEVINFO_HAS_COHERENT_L3_CS(cmd_buffer->device->info) ? 0 :
+ (ANV_PIPE_DATA_CACHE_FLUSH_BIT | ANV_PIPE_TILE_CACHE_FLUSH_BIT)) |
+ ANV_PIPE_END_OF_PIPE_SYNC_BIT;
+
+ trace_intel_begin_write_buffer_marker(&cmd_buffer->trace);
+
+ anv_add_pending_pipe_bits(cmd_buffer, bits, "write buffer marker");
+ genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
+
+ struct mi_builder b;
+ mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
+
+ /* Emitting a PIPE_CONTROL with Post-Sync Op = Write Immediate Data
+ * would be the logical way to implement this extension, as it could
+ * do a pipelined marker write. Unfortunately, it requires writing
+ * whole 64-bit QWords, and VK_AMD_buffer_marker requires writing a
+ * 32-bit value. MI_STORE_DATA_IMM is the only good way to do that,
+ * and unfortunately it requires stalling.
+ */
+ mi_store(&b, mi_mem32(anv_address_add(buffer->address, dstOffset)),
+ mi_imm(marker));
+
+ trace_intel_end_write_buffer_marker(&cmd_buffer->trace);
}
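
A hedged sketch of the application-side call implemented above, e.g. dropping breadcrumb markers around suspect work; illustrative only, with placeholder command-buffer and buffer handles.

#include <vulkan/vulkan.h>

static void
write_breadcrumb(VkCommandBuffer cmdbuf, VkBuffer marker_buf, /* placeholders */
                 uint32_t pass_index)
{
   /* Needs VK_AMD_buffer_marker (the *2AMD entry point comes with
    * synchronization2). On anv this lands as an MI_STORE_DATA_IMM after a
    * pipe flush, so the 32-bit write is not pipelined.
    */
   vkCmdWriteBufferMarker2AMD(cmdbuf,
                              VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT,
                              marker_buf,
                              pass_index * sizeof(uint32_t), /* dstOffset */
                              0xDEAD0000u | pass_index);     /* marker */
}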
diff --git a/src/intel/vulkan/genX_cmd_compute.c b/src/intel/vulkan/genX_cmd_compute.c
new file mode 100644
index 00000000000..7f05139e43f
--- /dev/null
+++ b/src/intel/vulkan/genX_cmd_compute.c
@@ -0,0 +1,1168 @@
+/*
+ * Copyright © 2015 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <assert.h>
+#include <stdbool.h>
+
+#include "anv_private.h"
+#include "anv_measure.h"
+#include "vk_render_pass.h"
+#include "vk_util.h"
+
+#include "common/intel_aux_map.h"
+#include "genxml/gen_macros.h"
+#include "genxml/genX_pack.h"
+#include "genxml/genX_rt_pack.h"
+#include "common/intel_genX_state_brw.h"
+
+#include "ds/intel_tracepoints.h"
+
+/* We reserve :
+ * - GPR 14 for secondary command buffer returns
+ * - GPR 15 for conditional rendering
+ */
+#define MI_BUILDER_NUM_ALLOC_GPRS 14
+#define __gen_get_batch_dwords anv_batch_emit_dwords
+#define __gen_address_offset anv_address_add
+#define __gen_get_batch_address(b, a) anv_batch_address(b, a)
+#include "common/mi_builder.h"
+
+void
+genX(cmd_buffer_ensure_cfe_state)(struct anv_cmd_buffer *cmd_buffer,
+ uint32_t total_scratch)
+{
+#if GFX_VERx10 >= 125
+ assert(cmd_buffer->state.current_pipeline == GPGPU);
+
+ struct anv_cmd_compute_state *comp_state = &cmd_buffer->state.compute;
+
+ if (total_scratch <= comp_state->scratch_size)
+ return;
+
+ const struct intel_device_info *devinfo = cmd_buffer->device->info;
+ anv_batch_emit(&cmd_buffer->batch, GENX(CFE_STATE), cfe) {
+ cfe.MaximumNumberofThreads =
+ devinfo->max_cs_threads * devinfo->subslice_total;
+
+ uint32_t scratch_surf = 0xffffffff;
+ if (total_scratch > 0) {
+ struct anv_bo *scratch_bo =
+ anv_scratch_pool_alloc(cmd_buffer->device,
+ &cmd_buffer->device->scratch_pool,
+ MESA_SHADER_COMPUTE,
+ total_scratch);
+ anv_reloc_list_add_bo(cmd_buffer->batch.relocs,
+ scratch_bo);
+ scratch_surf =
+ anv_scratch_pool_get_surf(cmd_buffer->device,
+ &cmd_buffer->device->scratch_pool,
+ total_scratch);
+ cfe.ScratchSpaceBuffer = scratch_surf >> 4;
+ }
+
+ cfe.OverDispatchControl = 2; /* 50% overdispatch */
+ }
+
+ comp_state->scratch_size = total_scratch;
+#else
+ unreachable("Invalid call");
+#endif
+}
+
+static void
+genX(cmd_buffer_flush_compute_state)(struct anv_cmd_buffer *cmd_buffer)
+{
+ struct anv_cmd_compute_state *comp_state = &cmd_buffer->state.compute;
+ struct anv_compute_pipeline *pipeline =
+ anv_pipeline_to_compute(comp_state->base.pipeline);
+ const UNUSED struct intel_device_info *devinfo = cmd_buffer->device->info;
+
+ assert(pipeline->cs);
+
+ genX(cmd_buffer_config_l3)(cmd_buffer, pipeline->base.l3_config);
+
+ genX(flush_descriptor_buffers)(cmd_buffer, &comp_state->base);
+
+ genX(flush_pipeline_select_gpgpu)(cmd_buffer);
+
+ /* Apply any pending pipeline flushes we may have. We want to apply them
+ * now because, if any of those flushes are for things like push constants,
+ * the GPU will read the state at weird times.
+ */
+ genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
+
+ if (cmd_buffer->state.compute.pipeline_dirty) {
+#if GFX_VERx10 < 125
+ /* From the Sky Lake PRM Vol 2a, MEDIA_VFE_STATE:
+ *
+ * "A stalling PIPE_CONTROL is required before MEDIA_VFE_STATE unless
+ * the only bits that are changed are scoreboard related: Scoreboard
+ * Enable, Scoreboard Type, Scoreboard Mask, Scoreboard * Delta. For
+ * these scoreboard related states, a MEDIA_STATE_FLUSH is
+ * sufficient."
+ */
+ anv_add_pending_pipe_bits(cmd_buffer,
+ ANV_PIPE_CS_STALL_BIT,
+ "flush compute state");
+ genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
+#endif
+
+ anv_batch_emit_batch(&cmd_buffer->batch, &pipeline->base.batch);
+
+#if GFX_VERx10 >= 125
+ const struct brw_cs_prog_data *prog_data = get_cs_prog_data(pipeline);
+ genX(cmd_buffer_ensure_cfe_state)(cmd_buffer, prog_data->base.total_scratch);
+#endif
+
+ /* The workgroup size of the pipeline affects our push constant layout
+ * so flag push constants as dirty if we change the pipeline.
+ */
+ cmd_buffer->state.push_constants_dirty |= VK_SHADER_STAGE_COMPUTE_BIT;
+ comp_state->base.push_constants_data_dirty = true;
+ }
+
+ cmd_buffer->state.descriptors_dirty |=
+ genX(cmd_buffer_flush_push_descriptors)(cmd_buffer,
+ &cmd_buffer->state.compute.base,
+ &pipeline->base);
+
+ if ((cmd_buffer->state.descriptors_dirty & VK_SHADER_STAGE_COMPUTE_BIT) ||
+ cmd_buffer->state.compute.pipeline_dirty) {
+ genX(cmd_buffer_flush_descriptor_sets)(cmd_buffer,
+ &cmd_buffer->state.compute.base,
+ VK_SHADER_STAGE_COMPUTE_BIT,
+ &pipeline->cs, 1);
+ cmd_buffer->state.descriptors_dirty &= ~VK_SHADER_STAGE_COMPUTE_BIT;
+
+#if GFX_VERx10 < 125
+ uint32_t iface_desc_data_dw[GENX(INTERFACE_DESCRIPTOR_DATA_length)];
+ struct GENX(INTERFACE_DESCRIPTOR_DATA) desc = {
+ .BindingTablePointer =
+ cmd_buffer->state.binding_tables[MESA_SHADER_COMPUTE].offset,
+ .SamplerStatePointer =
+ cmd_buffer->state.samplers[MESA_SHADER_COMPUTE].offset,
+ };
+ GENX(INTERFACE_DESCRIPTOR_DATA_pack)(NULL, iface_desc_data_dw, &desc);
+
+ struct anv_state state =
+ anv_cmd_buffer_merge_dynamic(cmd_buffer, iface_desc_data_dw,
+ pipeline->interface_descriptor_data,
+ GENX(INTERFACE_DESCRIPTOR_DATA_length),
+ 64);
+
+ uint32_t size = GENX(INTERFACE_DESCRIPTOR_DATA_length) * sizeof(uint32_t);
+ anv_batch_emit(&cmd_buffer->batch,
+ GENX(MEDIA_INTERFACE_DESCRIPTOR_LOAD), mid) {
+ mid.InterfaceDescriptorTotalLength = size;
+ mid.InterfaceDescriptorDataStartAddress = state.offset;
+ }
+#endif
+ }
+
+ if (cmd_buffer->state.push_constants_dirty & VK_SHADER_STAGE_COMPUTE_BIT) {
+
+ if (comp_state->push_data.alloc_size == 0 ||
+ comp_state->base.push_constants_data_dirty) {
+ comp_state->push_data =
+ anv_cmd_buffer_cs_push_constants(cmd_buffer);
+ comp_state->base.push_constants_data_dirty = false;
+ }
+
+#if GFX_VERx10 < 125
+ if (comp_state->push_data.alloc_size) {
+ anv_batch_emit(&cmd_buffer->batch, GENX(MEDIA_CURBE_LOAD), curbe) {
+ curbe.CURBETotalDataLength = comp_state->push_data.alloc_size;
+ curbe.CURBEDataStartAddress = comp_state->push_data.offset;
+ }
+ }
+#endif
+
+ cmd_buffer->state.push_constants_dirty &= ~VK_SHADER_STAGE_COMPUTE_BIT;
+ }
+
+ cmd_buffer->state.compute.pipeline_dirty = false;
+
+ genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
+}
+
+static void
+anv_cmd_buffer_push_base_group_id(struct anv_cmd_buffer *cmd_buffer,
+ uint32_t baseGroupX,
+ uint32_t baseGroupY,
+ uint32_t baseGroupZ)
+{
+ if (anv_batch_has_error(&cmd_buffer->batch))
+ return;
+
+ struct anv_push_constants *push =
+ &cmd_buffer->state.compute.base.push_constants;
+ if (push->cs.base_work_group_id[0] != baseGroupX ||
+ push->cs.base_work_group_id[1] != baseGroupY ||
+ push->cs.base_work_group_id[2] != baseGroupZ) {
+ push->cs.base_work_group_id[0] = baseGroupX;
+ push->cs.base_work_group_id[1] = baseGroupY;
+ push->cs.base_work_group_id[2] = baseGroupZ;
+
+ cmd_buffer->state.push_constants_dirty |= VK_SHADER_STAGE_COMPUTE_BIT;
+ cmd_buffer->state.compute.base.push_constants_data_dirty = true;
+ }
+}
+
+#define GPGPU_DISPATCHDIMX 0x2500
+#define GPGPU_DISPATCHDIMY 0x2504
+#define GPGPU_DISPATCHDIMZ 0x2508
+
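+/* compute_load_indirect_params() below copies an application-supplied
+ * dispatch size into the GPGPU_DISPATCHDIM* registers that the walker
+ * reads when IndirectParameterEnable is set. The three dwords read from
+ * indirect_addr follow the VkDispatchIndirectCommand layout:
+ *
+ *    typedef struct VkDispatchIndirectCommand {
+ *       uint32_t x;   // byte offset 0
+ *       uint32_t y;   // byte offset 4
+ *       uint32_t z;   // byte offset 8
+ *    } VkDispatchIndirectCommand;
+ *
+ * compute_store_indirect_params() does the reverse, writing the registers
+ * back out to memory (used by cmd_buffer_dispatch_kernel() to fill the
+ * num_work_groups sysvals when no direct size is provided).
+ */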
+static void
+compute_load_indirect_params(struct anv_cmd_buffer *cmd_buffer,
+ const struct anv_address indirect_addr)
+{
+ struct mi_builder b;
+ mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
+
+ struct mi_value size_x = mi_mem32(anv_address_add(indirect_addr, 0));
+ struct mi_value size_y = mi_mem32(anv_address_add(indirect_addr, 4));
+ struct mi_value size_z = mi_mem32(anv_address_add(indirect_addr, 8));
+
+ mi_store(&b, mi_reg32(GPGPU_DISPATCHDIMX), size_x);
+ mi_store(&b, mi_reg32(GPGPU_DISPATCHDIMY), size_y);
+ mi_store(&b, mi_reg32(GPGPU_DISPATCHDIMZ), size_z);
+}
+
+static void
+compute_store_indirect_params(struct anv_cmd_buffer *cmd_buffer,
+ const struct anv_address indirect_addr)
+{
+ struct mi_builder b;
+ mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
+
+ struct mi_value size_x = mi_mem32(anv_address_add(indirect_addr, 0));
+ struct mi_value size_y = mi_mem32(anv_address_add(indirect_addr, 4));
+ struct mi_value size_z = mi_mem32(anv_address_add(indirect_addr, 8));
+
+ mi_store(&b, size_x, mi_reg32(GPGPU_DISPATCHDIMX));
+ mi_store(&b, size_y, mi_reg32(GPGPU_DISPATCHDIMY));
+ mi_store(&b, size_z, mi_reg32(GPGPU_DISPATCHDIMZ));
+}
+
+
+#if GFX_VERx10 >= 125
+
+static inline struct GENX(INTERFACE_DESCRIPTOR_DATA)
+get_interface_descriptor_data(struct anv_cmd_buffer *cmd_buffer,
+ const struct anv_shader_bin *shader,
+ const struct brw_cs_prog_data *prog_data,
+ const struct intel_cs_dispatch_info *dispatch)
+{
+ const struct intel_device_info *devinfo = cmd_buffer->device->info;
+
+ return (struct GENX(INTERFACE_DESCRIPTOR_DATA)) {
+ .KernelStartPointer = shader->kernel.offset,
+ .SamplerStatePointer = cmd_buffer->state.samplers[MESA_SHADER_COMPUTE].offset,
+ .BindingTablePointer = cmd_buffer->state.binding_tables[MESA_SHADER_COMPUTE].offset,
+ /* Typically set to 0 to avoid prefetching on every thread dispatch. */
+ .BindingTableEntryCount = devinfo->verx10 == 125 ?
+ 0 : 1 + MIN2(shader->bind_map.surface_count, 30),
+ .NumberofThreadsinGPGPUThreadGroup = dispatch->threads,
+ .SharedLocalMemorySize = encode_slm_size(GFX_VER, prog_data->base.total_shared),
+ .PreferredSLMAllocationSize = preferred_slm_allocation_size(devinfo),
+ .NumberOfBarriers = prog_data->uses_barrier,
+ };
+}
+
+static inline void
+emit_indirect_compute_walker(struct anv_cmd_buffer *cmd_buffer,
+ const struct anv_shader_bin *shader,
+ const struct brw_cs_prog_data *prog_data,
+ struct anv_address indirect_addr)
+{
+ const struct intel_device_info *devinfo = cmd_buffer->device->info;
+ assert(devinfo->has_indirect_unroll);
+
+ struct anv_cmd_compute_state *comp_state = &cmd_buffer->state.compute;
+ bool predicate = cmd_buffer->state.conditional_render_enabled;
+
+ const struct intel_cs_dispatch_info dispatch =
+ brw_cs_get_dispatch_info(devinfo, prog_data, NULL);
+ const int dispatch_size = dispatch.simd_size / 16;
+
+ struct GENX(COMPUTE_WALKER_BODY) body = {
+ .SIMDSize = dispatch_size,
+ .MessageSIMD = dispatch_size,
+ .IndirectDataStartAddress = comp_state->push_data.offset,
+ .IndirectDataLength = comp_state->push_data.alloc_size,
+ .LocalXMaximum = prog_data->local_size[0] - 1,
+ .LocalYMaximum = prog_data->local_size[1] - 1,
+ .LocalZMaximum = prog_data->local_size[2] - 1,
+ .ExecutionMask = dispatch.right_mask,
+ .PostSync.MOCS = anv_mocs(cmd_buffer->device, NULL, 0),
+ .InterfaceDescriptor =
+ get_interface_descriptor_data(cmd_buffer, shader, prog_data,
+ &dispatch),
+ };
+
+ cmd_buffer->last_indirect_dispatch =
+ anv_batch_emitn(
+ &cmd_buffer->batch,
+ GENX(EXECUTE_INDIRECT_DISPATCH_length),
+ GENX(EXECUTE_INDIRECT_DISPATCH),
+ .PredicateEnable = predicate,
+ .MaxCount = 1,
+ .COMPUTE_WALKER_BODY = body,
+ .ArgumentBufferStartAddress = indirect_addr,
+ .MOCS = anv_mocs(cmd_buffer->device,
+ indirect_addr.bo, 0),
+ );
+}
+
+static inline void
+emit_compute_walker(struct anv_cmd_buffer *cmd_buffer,
+ const struct anv_compute_pipeline *pipeline, bool indirect,
+ const struct brw_cs_prog_data *prog_data,
+ uint32_t groupCountX, uint32_t groupCountY,
+ uint32_t groupCountZ)
+{
+ const struct anv_cmd_compute_state *comp_state = &cmd_buffer->state.compute;
+ const bool predicate = cmd_buffer->state.conditional_render_enabled;
+
+ const struct intel_device_info *devinfo = pipeline->base.device->info;
+ const struct intel_cs_dispatch_info dispatch =
+ brw_cs_get_dispatch_info(devinfo, prog_data, NULL);
+
+ cmd_buffer->last_compute_walker =
+ anv_batch_emitn(
+ &cmd_buffer->batch,
+ GENX(COMPUTE_WALKER_length),
+ GENX(COMPUTE_WALKER),
+ .IndirectParameterEnable = indirect,
+ .PredicateEnable = predicate,
+ .SIMDSize = dispatch.simd_size / 16,
+ .MessageSIMD = dispatch.simd_size / 16,
+ .IndirectDataStartAddress = comp_state->push_data.offset,
+ .IndirectDataLength = comp_state->push_data.alloc_size,
+#if GFX_VERx10 == 125
+ .SystolicModeEnable = prog_data->uses_systolic,
+#endif
+ .GenerateLocalID = prog_data->generate_local_id != 0,
+ .EmitLocal = prog_data->generate_local_id,
+ .WalkOrder = prog_data->walk_order,
+ .TileLayout = prog_data->walk_order == INTEL_WALK_ORDER_YXZ ?
+ TileY32bpe : Linear,
+ .LocalXMaximum = prog_data->local_size[0] - 1,
+ .LocalYMaximum = prog_data->local_size[1] - 1,
+ .LocalZMaximum = prog_data->local_size[2] - 1,
+ .ThreadGroupIDXDimension = groupCountX,
+ .ThreadGroupIDYDimension = groupCountY,
+ .ThreadGroupIDZDimension = groupCountZ,
+ .ExecutionMask = dispatch.right_mask,
+ .PostSync = {
+ .MOCS = anv_mocs(pipeline->base.device, NULL, 0),
+ },
+ .InterfaceDescriptor =
+ get_interface_descriptor_data(cmd_buffer, pipeline->cs,
+ prog_data, &dispatch),
+ );
+}
+
+#else /* #if GFX_VERx10 >= 125 */
+
+static inline void
+emit_gpgpu_walker(struct anv_cmd_buffer *cmd_buffer,
+ const struct anv_compute_pipeline *pipeline, bool indirect,
+ const struct brw_cs_prog_data *prog_data,
+ uint32_t groupCountX, uint32_t groupCountY,
+ uint32_t groupCountZ)
+{
+ const bool predicate = cmd_buffer->state.conditional_render_enabled;
+
+ const struct intel_device_info *devinfo = pipeline->base.device->info;
+ const struct intel_cs_dispatch_info dispatch =
+ brw_cs_get_dispatch_info(devinfo, prog_data, NULL);
+
+ anv_batch_emit(&cmd_buffer->batch, GENX(GPGPU_WALKER), ggw) {
+ ggw.IndirectParameterEnable = indirect;
+ ggw.PredicateEnable = predicate;
+ ggw.SIMDSize = dispatch.simd_size / 16;
+ ggw.ThreadDepthCounterMaximum = 0;
+ ggw.ThreadHeightCounterMaximum = 0;
+ ggw.ThreadWidthCounterMaximum = dispatch.threads - 1;
+ ggw.ThreadGroupIDXDimension = groupCountX;
+ ggw.ThreadGroupIDYDimension = groupCountY;
+ ggw.ThreadGroupIDZDimension = groupCountZ;
+ ggw.RightExecutionMask = dispatch.right_mask;
+ ggw.BottomExecutionMask = 0xffffffff;
+ }
+
+ anv_batch_emit(&cmd_buffer->batch, GENX(MEDIA_STATE_FLUSH), msf);
+}
+
+#endif /* #if GFX_VERx10 >= 125 */
+
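+/* Shared entry point for the direct and indirect dispatch paths below.
+ * Roughly:
+ *
+ *    if (indirect && has_indirect_unroll)   // Gfx12.5+ parts only
+ *       EXECUTE_INDIRECT_DISPATCH, the HW fetches the arguments itself
+ *    else if (indirect)
+ *       load GPGPU_DISPATCHDIM* from memory, then emit the walker with
+ *       IndirectParameterEnable = true
+ *    else
+ *       emit the walker with the explicit group counts
+ *
+ * where "the walker" is COMPUTE_WALKER on Gfx12.5+ and GPGPU_WALKER on
+ * earlier generations.
+ */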
+static inline void
+emit_cs_walker(struct anv_cmd_buffer *cmd_buffer,
+ const struct anv_compute_pipeline *pipeline,
+ const struct brw_cs_prog_data *prog_data,
+ struct anv_address indirect_addr,
+ uint32_t groupCountX, uint32_t groupCountY, uint32_t groupCountZ)
+{
+ bool is_indirect = !anv_address_is_null(indirect_addr);
+
+#if GFX_VERx10 >= 125
+ if (is_indirect && cmd_buffer->device->info->has_indirect_unroll) {
+ emit_indirect_compute_walker(cmd_buffer, pipeline->cs, prog_data,
+ indirect_addr);
+ return;
+ }
+#endif
+
+ if (is_indirect)
+ compute_load_indirect_params(cmd_buffer, indirect_addr);
+
+#if GFX_VERx10 >= 125
+ emit_compute_walker(cmd_buffer, pipeline, is_indirect, prog_data,
+ groupCountX, groupCountY, groupCountZ);
+#else
+ emit_gpgpu_walker(cmd_buffer, pipeline, is_indirect, prog_data,
+ groupCountX, groupCountY, groupCountZ);
+#endif
+}
+
+void genX(CmdDispatchBase)(
+ VkCommandBuffer commandBuffer,
+ uint32_t baseGroupX,
+ uint32_t baseGroupY,
+ uint32_t baseGroupZ,
+ uint32_t groupCountX,
+ uint32_t groupCountY,
+ uint32_t groupCountZ)
+{
+ ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+ struct anv_compute_pipeline *pipeline =
+ anv_pipeline_to_compute(cmd_buffer->state.compute.base.pipeline);
+ const struct brw_cs_prog_data *prog_data = get_cs_prog_data(pipeline);
+
+ anv_cmd_buffer_push_base_group_id(cmd_buffer, baseGroupX,
+ baseGroupY, baseGroupZ);
+
+ if (anv_batch_has_error(&cmd_buffer->batch))
+ return;
+
+ anv_measure_snapshot(cmd_buffer,
+ INTEL_SNAPSHOT_COMPUTE,
+ "compute",
+ groupCountX * groupCountY * groupCountZ *
+ prog_data->local_size[0] * prog_data->local_size[1] *
+ prog_data->local_size[2]);
+
+ trace_intel_begin_compute(&cmd_buffer->trace);
+
+ if (prog_data->uses_num_work_groups) {
+ struct anv_state state =
+ anv_cmd_buffer_alloc_temporary_state(cmd_buffer, 12, 4);
+ uint32_t *sizes = state.map;
+ sizes[0] = groupCountX;
+ sizes[1] = groupCountY;
+ sizes[2] = groupCountZ;
+ cmd_buffer->state.compute.num_workgroups =
+ anv_cmd_buffer_temporary_state_address(cmd_buffer, state);
+
+ /* The num_workgroups buffer goes in the binding table */
+ cmd_buffer->state.descriptors_dirty |= VK_SHADER_STAGE_COMPUTE_BIT;
+ }
+
+ genX(cmd_buffer_flush_compute_state)(cmd_buffer);
+
+ if (cmd_buffer->state.conditional_render_enabled)
+ genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
+
+ emit_cs_walker(cmd_buffer, pipeline, prog_data,
+ ANV_NULL_ADDRESS /* no indirect data */,
+ groupCountX, groupCountY, groupCountZ);
+
+ trace_intel_end_compute(&cmd_buffer->trace,
+ groupCountX, groupCountY, groupCountZ);
+}
+
+void genX(CmdDispatchIndirect)(
+ VkCommandBuffer commandBuffer,
+ VkBuffer _buffer,
+ VkDeviceSize offset)
+{
+ ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+ ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
+ struct anv_compute_pipeline *pipeline =
+ anv_pipeline_to_compute(cmd_buffer->state.compute.base.pipeline);
+ const struct brw_cs_prog_data *prog_data = get_cs_prog_data(pipeline);
+ struct anv_address addr = anv_address_add(buffer->address, offset);
+ UNUSED struct anv_batch *batch = &cmd_buffer->batch;
+
+ anv_cmd_buffer_push_base_group_id(cmd_buffer, 0, 0, 0);
+
+ anv_measure_snapshot(cmd_buffer,
+ INTEL_SNAPSHOT_COMPUTE,
+ "compute indirect",
+ 0);
+ trace_intel_begin_compute(&cmd_buffer->trace);
+
+ if (prog_data->uses_num_work_groups) {
+ cmd_buffer->state.compute.num_workgroups = addr;
+
+ /* The num_workgroups buffer goes in the binding table */
+ cmd_buffer->state.descriptors_dirty |= VK_SHADER_STAGE_COMPUTE_BIT;
+ }
+
+ genX(cmd_buffer_flush_compute_state)(cmd_buffer);
+
+ if (cmd_buffer->state.conditional_render_enabled)
+ genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
+
+ emit_cs_walker(cmd_buffer, pipeline, prog_data, addr, 0, 0, 0);
+
+ trace_intel_end_compute(&cmd_buffer->trace, 0, 0, 0);
+}
+
+struct anv_address
+genX(cmd_buffer_ray_query_globals)(struct anv_cmd_buffer *cmd_buffer)
+{
+#if GFX_VERx10 >= 125
+ struct anv_device *device = cmd_buffer->device;
+
+ struct anv_state state =
+ anv_cmd_buffer_alloc_temporary_state(cmd_buffer,
+ BRW_RT_DISPATCH_GLOBALS_SIZE, 64);
+ struct brw_rt_scratch_layout layout;
+ uint32_t stack_ids_per_dss = 2048; /* TODO: can we use a lower value in
+ * some cases?
+ */
+ brw_rt_compute_scratch_layout(&layout, device->info,
+ stack_ids_per_dss, 1 << 10);
+
+ const struct GENX(RT_DISPATCH_GLOBALS) rtdg = {
+ .MemBaseAddress = (struct anv_address) {
+ /* The ray query HW computes offsets from the top of the buffer, so
+ * point the address at the end of the buffer.
+ */
+ .bo = device->ray_query_bo,
+ .offset = device->ray_query_bo->size
+ },
+ .AsyncRTStackSize = layout.ray_stack_stride / 64,
+ .NumDSSRTStacks = layout.stack_ids_per_dss,
+ .MaxBVHLevels = BRW_RT_MAX_BVH_LEVELS,
+ .Flags = RT_DEPTH_TEST_LESS_EQUAL,
+ .ResumeShaderTable = (struct anv_address) {
+ .bo = cmd_buffer->state.ray_query_shadow_bo,
+ },
+ };
+ GENX(RT_DISPATCH_GLOBALS_pack)(NULL, state.map, &rtdg);
+
+ return anv_cmd_buffer_temporary_state_address(cmd_buffer, state);
+#else
+ unreachable("Not supported");
+#endif
+}
+
+#if GFX_VERx10 >= 125
+void
+genX(cmd_buffer_dispatch_kernel)(struct anv_cmd_buffer *cmd_buffer,
+ struct anv_kernel *kernel,
+ const uint32_t *global_size,
+ uint32_t arg_count,
+ const struct anv_kernel_arg *args)
+{
+ const struct intel_device_info *devinfo = cmd_buffer->device->info;
+ const struct brw_cs_prog_data *cs_prog_data =
+ brw_cs_prog_data_const(kernel->bin->prog_data);
+
+ genX(cmd_buffer_config_l3)(cmd_buffer, kernel->l3_config);
+
+ genX(flush_pipeline_select_gpgpu)(cmd_buffer);
+
+ /* Apply any pending pipeline flushes we may have. We want to apply them
+ * now because, if any of those flushes are for things like push constants,
+ * the GPU will read the state at weird times.
+ */
+ genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
+
+ uint32_t indirect_data_size = sizeof(struct brw_kernel_sysvals);
+ indirect_data_size += kernel->bin->bind_map.kernel_args_size;
+ indirect_data_size = ALIGN(indirect_data_size, 64);
+ struct anv_state indirect_data =
+ anv_cmd_buffer_alloc_general_state(cmd_buffer,
+ indirect_data_size, 64);
+ memset(indirect_data.map, 0, indirect_data.alloc_size);
+
+ struct brw_kernel_sysvals sysvals = {};
+ if (global_size != NULL) {
+ for (unsigned i = 0; i < 3; i++)
+ sysvals.num_work_groups[i] = global_size[i];
+ memcpy(indirect_data.map, &sysvals, sizeof(sysvals));
+ } else {
+ struct anv_address sysvals_addr = {
+ .bo = NULL, /* General state buffer is always 0. */
+ .offset = indirect_data.offset,
+ };
+
+ compute_store_indirect_params(cmd_buffer, sysvals_addr);
+ }
+
+ void *args_map = indirect_data.map + sizeof(sysvals);
+ for (unsigned i = 0; i < kernel->bin->bind_map.kernel_arg_count; i++) {
+ struct brw_kernel_arg_desc *arg_desc =
+ &kernel->bin->bind_map.kernel_args[i];
+ assert(i < arg_count);
+ const struct anv_kernel_arg *arg = &args[i];
+ if (arg->is_ptr) {
+ memcpy(args_map + arg_desc->offset, arg->ptr, arg_desc->size);
+ } else {
+ assert(arg_desc->size <= sizeof(arg->u64));
+ memcpy(args_map + arg_desc->offset, &arg->u64, arg_desc->size);
+ }
+ }
+
+ struct intel_cs_dispatch_info dispatch =
+ brw_cs_get_dispatch_info(devinfo, cs_prog_data, NULL);
+
+ anv_batch_emit(&cmd_buffer->batch, GENX(COMPUTE_WALKER), cw) {
+ cw.PredicateEnable = false;
+ cw.SIMDSize = dispatch.simd_size / 16;
+ cw.MessageSIMD = dispatch.simd_size / 16;
+ cw.IndirectDataStartAddress = indirect_data.offset;
+ cw.IndirectDataLength = indirect_data.alloc_size;
+ cw.LocalXMaximum = cs_prog_data->local_size[0] - 1;
+ cw.LocalYMaximum = cs_prog_data->local_size[1] - 1;
+ cw.LocalZMaximum = cs_prog_data->local_size[2] - 1;
+ cw.ExecutionMask = dispatch.right_mask;
+ cw.PostSync.MOCS = cmd_buffer->device->isl_dev.mocs.internal;
+
+ if (global_size != NULL) {
+ cw.ThreadGroupIDXDimension = global_size[0];
+ cw.ThreadGroupIDYDimension = global_size[1];
+ cw.ThreadGroupIDZDimension = global_size[2];
+ } else {
+ cw.IndirectParameterEnable = true;
+ }
+
+ cw.InterfaceDescriptor =
+ get_interface_descriptor_data(cmd_buffer,
+ kernel->bin,
+ cs_prog_data,
+ &dispatch);
+ }
+
+ /* We just blew away the compute pipeline state */
+ cmd_buffer->state.compute.pipeline_dirty = true;
+}
+
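+/* Spread at most three doublings (a local group of at most 2^3 = 8
+ * invocations) round-robin across the dimensions that are still smaller
+ * than their global size; whatever is left over goes to X. A couple of
+ * worked examples:
+ *
+ *    global = { 1920, 1080, 1 }  ->  local_shift = { 2, 1, 0 }  (4x2x1)
+ *    global = { 1, 1, 1 }        ->  local_shift = { 3, 0, 0 }  (8x1x1)
+ *
+ * The caller then launches DIV_ROUND_UP(global[i], 1 << local_shift[i])
+ * thread groups per dimension, e.g. { 480, 540, 1 } in the first case.
+ */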
+static void
+calc_local_trace_size(uint8_t local_shift[3], const uint32_t global[3])
+{
+ unsigned total_shift = 0;
+ memset(local_shift, 0, 3);
+
+ bool progress;
+ do {
+ progress = false;
+ for (unsigned i = 0; i < 3; i++) {
+ assert(global[i] > 0);
+ if ((1 << local_shift[i]) < global[i]) {
+ progress = true;
+ local_shift[i]++;
+ total_shift++;
+ }
+
+ if (total_shift == 3)
+ return;
+ }
+ } while(progress);
+
+ /* Assign whatever's left to x */
+ local_shift[0] += 3 - total_shift;
+}
+
+static struct GENX(RT_SHADER_TABLE)
+vk_sdar_to_shader_table(const VkStridedDeviceAddressRegionKHR *region)
+{
+ return (struct GENX(RT_SHADER_TABLE)) {
+ .BaseAddress = anv_address_from_u64(region->deviceAddress),
+ .Stride = region->stride,
+ };
+}
+
+struct trace_params {
+ /* If is_sbt_indirect, use indirect_sbts_addr to build RT_DISPATCH_GLOBALS
+ * with mi_builder.
+ */
+ bool is_sbt_indirect;
+ const VkStridedDeviceAddressRegionKHR *raygen_sbt;
+ const VkStridedDeviceAddressRegionKHR *miss_sbt;
+ const VkStridedDeviceAddressRegionKHR *hit_sbt;
+ const VkStridedDeviceAddressRegionKHR *callable_sbt;
+
+ /* A pointer to a VkTraceRaysIndirectCommand2KHR structure */
+ uint64_t indirect_sbts_addr;
+
+ /* If is_launch_size_indirect, use launch_size_addr to program the dispatch size. */
+ bool is_launch_size_indirect;
+ uint32_t launch_size[3];
+
+ /* A pointer to a uint32_t[3] */
+ uint64_t launch_size_addr;
+};
+
+static struct anv_state
+cmd_buffer_emit_rt_dispatch_globals(struct anv_cmd_buffer *cmd_buffer,
+ struct trace_params *params)
+{
+ assert(!params->is_sbt_indirect);
+ assert(params->miss_sbt != NULL);
+ assert(params->hit_sbt != NULL);
+ assert(params->callable_sbt != NULL);
+
+ struct anv_cmd_ray_tracing_state *rt = &cmd_buffer->state.rt;
+
+ struct anv_state rtdg_state =
+ anv_cmd_buffer_alloc_temporary_state(cmd_buffer,
+ BRW_RT_PUSH_CONST_OFFSET +
+ sizeof(struct anv_push_constants),
+ 64);
+
+ struct GENX(RT_DISPATCH_GLOBALS) rtdg = {
+ .MemBaseAddress = (struct anv_address) {
+ .bo = rt->scratch.bo,
+ .offset = rt->scratch.layout.ray_stack_start,
+ },
+ .CallStackHandler = anv_shader_bin_get_bsr(
+ cmd_buffer->device->rt_trivial_return, 0),
+ .AsyncRTStackSize = rt->scratch.layout.ray_stack_stride / 64,
+ .NumDSSRTStacks = rt->scratch.layout.stack_ids_per_dss,
+ .MaxBVHLevels = BRW_RT_MAX_BVH_LEVELS,
+ .Flags = RT_DEPTH_TEST_LESS_EQUAL,
+ .HitGroupTable = vk_sdar_to_shader_table(params->hit_sbt),
+ .MissGroupTable = vk_sdar_to_shader_table(params->miss_sbt),
+ .SWStackSize = rt->scratch.layout.sw_stack_size / 64,
+ .LaunchWidth = params->launch_size[0],
+ .LaunchHeight = params->launch_size[1],
+ .LaunchDepth = params->launch_size[2],
+ .CallableGroupTable = vk_sdar_to_shader_table(params->callable_sbt),
+ };
+ GENX(RT_DISPATCH_GLOBALS_pack)(NULL, rtdg_state.map, &rtdg);
+
+ return rtdg_state;
+}
+
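+/* Build a 64-bit shader table entry in place: the SBT base address masked
+ * to its low 49 bits, OR'ed with the stride (read as 32 bits) shifted into
+ * the top word. A worked example with illustrative values,
+ * addr = 0x123456700000 and stride = 0x40:
+ *
+ *    (0x123456700000 & (BITFIELD64_BIT(49) - 1)) | (0x40ull << 48)
+ *       = 0x0040123456700000
+ *
+ * which is presumably the same packing RT_DISPATCH_GLOBALS_pack() produces
+ * for an RT_SHADER_TABLE in the non-indirect path above.
+ */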
+static struct mi_value
+mi_build_sbt_entry(struct mi_builder *b,
+ uint64_t addr_field_addr,
+ uint64_t stride_field_addr)
+{
+ return mi_ior(b,
+ mi_iand(b, mi_mem64(anv_address_from_u64(addr_field_addr)),
+ mi_imm(BITFIELD64_BIT(49) - 1)),
+ mi_ishl_imm(b, mi_mem32(anv_address_from_u64(stride_field_addr)),
+ 48));
+}
+
+static struct anv_state
+cmd_buffer_emit_rt_dispatch_globals_indirect(struct anv_cmd_buffer *cmd_buffer,
+ struct trace_params *params)
+{
+ struct anv_cmd_ray_tracing_state *rt = &cmd_buffer->state.rt;
+
+ struct anv_state rtdg_state =
+ anv_cmd_buffer_alloc_temporary_state(cmd_buffer,
+ BRW_RT_PUSH_CONST_OFFSET +
+ sizeof(struct anv_push_constants),
+ 64);
+
+ struct GENX(RT_DISPATCH_GLOBALS) rtdg = {
+ .MemBaseAddress = (struct anv_address) {
+ .bo = rt->scratch.bo,
+ .offset = rt->scratch.layout.ray_stack_start,
+ },
+ .CallStackHandler = anv_shader_bin_get_bsr(
+ cmd_buffer->device->rt_trivial_return, 0),
+ .AsyncRTStackSize = rt->scratch.layout.ray_stack_stride / 64,
+ .NumDSSRTStacks = rt->scratch.layout.stack_ids_per_dss,
+ .MaxBVHLevels = BRW_RT_MAX_BVH_LEVELS,
+ .Flags = RT_DEPTH_TEST_LESS_EQUAL,
+ .SWStackSize = rt->scratch.layout.sw_stack_size / 64,
+ };
+ GENX(RT_DISPATCH_GLOBALS_pack)(NULL, rtdg_state.map, &rtdg);
+
+ struct anv_address rtdg_addr =
+ anv_cmd_buffer_temporary_state_address(cmd_buffer, rtdg_state);
+
+ struct mi_builder b;
+ mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
+ const uint32_t mocs = anv_mocs_for_address(cmd_buffer->device, &rtdg_addr);
+ mi_builder_set_mocs(&b, mocs);
+
+ /* Fill the MissGroupTable, HitGroupTable & CallableGroupTable fields of
+ * RT_DISPATCH_GLOBALS using the mi_builder.
+ */
+ mi_store(&b,
+ mi_mem64(
+ anv_address_add(
+ rtdg_addr,
+ GENX(RT_DISPATCH_GLOBALS_MissGroupTable_start) / 8)),
+ mi_build_sbt_entry(&b,
+ params->indirect_sbts_addr +
+ offsetof(VkTraceRaysIndirectCommand2KHR,
+ missShaderBindingTableAddress),
+ params->indirect_sbts_addr +
+ offsetof(VkTraceRaysIndirectCommand2KHR,
+ missShaderBindingTableStride)));
+ mi_store(&b,
+ mi_mem64(
+ anv_address_add(
+ rtdg_addr,
+ GENX(RT_DISPATCH_GLOBALS_HitGroupTable_start) / 8)),
+ mi_build_sbt_entry(&b,
+ params->indirect_sbts_addr +
+ offsetof(VkTraceRaysIndirectCommand2KHR,
+ hitShaderBindingTableAddress),
+ params->indirect_sbts_addr +
+ offsetof(VkTraceRaysIndirectCommand2KHR,
+ hitShaderBindingTableStride)));
+ mi_store(&b,
+ mi_mem64(
+ anv_address_add(
+ rtdg_addr,
+ GENX(RT_DISPATCH_GLOBALS_CallableGroupTable_start) / 8)),
+ mi_build_sbt_entry(&b,
+ params->indirect_sbts_addr +
+ offsetof(VkTraceRaysIndirectCommand2KHR,
+ callableShaderBindingTableAddress),
+ params->indirect_sbts_addr +
+ offsetof(VkTraceRaysIndirectCommand2KHR,
+ callableShaderBindingTableStride)));
+
+ return rtdg_state;
+}
+
+static void
+cmd_buffer_trace_rays(struct anv_cmd_buffer *cmd_buffer,
+ struct trace_params *params)
+{
+ struct anv_device *device = cmd_buffer->device;
+ struct anv_cmd_ray_tracing_state *rt = &cmd_buffer->state.rt;
+ struct anv_ray_tracing_pipeline *pipeline =
+ anv_pipeline_to_ray_tracing(rt->base.pipeline);
+
+ if (anv_batch_has_error(&cmd_buffer->batch))
+ return;
+
+ /* If we have a known degenerate launch size, just bail */
+ if (!params->is_launch_size_indirect &&
+ (params->launch_size[0] == 0 ||
+ params->launch_size[1] == 0 ||
+ params->launch_size[2] == 0))
+ return;
+
+ trace_intel_begin_rays(&cmd_buffer->trace);
+
+ genX(cmd_buffer_config_l3)(cmd_buffer, pipeline->base.l3_config);
+
+ genX(flush_descriptor_buffers)(cmd_buffer, &rt->base);
+
+ genX(flush_pipeline_select_gpgpu)(cmd_buffer);
+
+ cmd_buffer->state.rt.pipeline_dirty = false;
+
+ genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
+
+ genX(cmd_buffer_flush_push_descriptors)(cmd_buffer,
+ &cmd_buffer->state.rt.base,
+ &pipeline->base);
+
+ /* Add these to the reloc list manually as they're internal buffers that
+ * don't have any relocs that would otherwise pick them up.
+ *
+ * TODO(RT): This is a bit of a hack
+ */
+ anv_reloc_list_add_bo(cmd_buffer->batch.relocs,
+ rt->scratch.bo);
+ anv_reloc_list_add_bo(cmd_buffer->batch.relocs,
+ cmd_buffer->device->btd_fifo_bo);
+
+ /* Allocate and set up our RT_DISPATCH_GLOBALS */
+ struct anv_state rtdg_state =
+ params->is_sbt_indirect ?
+ cmd_buffer_emit_rt_dispatch_globals_indirect(cmd_buffer, params) :
+ cmd_buffer_emit_rt_dispatch_globals(cmd_buffer, params);
+
+ assert(rtdg_state.alloc_size >= (BRW_RT_PUSH_CONST_OFFSET +
+ sizeof(struct anv_push_constants)));
+ assert(GENX(RT_DISPATCH_GLOBALS_length) * 4 <= BRW_RT_PUSH_CONST_OFFSET);
+ /* Push constants go after the RT_DISPATCH_GLOBALS */
+ memcpy(rtdg_state.map + BRW_RT_PUSH_CONST_OFFSET,
+ &cmd_buffer->state.rt.base.push_constants,
+ sizeof(struct anv_push_constants));
+
+ struct anv_address rtdg_addr =
+ anv_cmd_buffer_temporary_state_address(cmd_buffer, rtdg_state);
+
+ uint8_t local_size_log2[3];
+ uint32_t global_size[3] = {};
+ if (params->is_launch_size_indirect) {
+ /* Pick a local size that's probably ok. We assume most TraceRays calls
+ * will use a two-dimensional dispatch size. Worst case, our initial
+ * dispatch will be a little slower than it has to be.
+ */
+ local_size_log2[0] = 2;
+ local_size_log2[1] = 1;
+ local_size_log2[2] = 0;
+
+ struct mi_builder b;
+ mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
+ const uint32_t mocs = anv_mocs_for_address(cmd_buffer->device, &rtdg_addr);
+ mi_builder_set_mocs(&b, mocs);
+
+ struct mi_value launch_size[3] = {
+ mi_mem32(anv_address_from_u64(params->launch_size_addr + 0)),
+ mi_mem32(anv_address_from_u64(params->launch_size_addr + 4)),
+ mi_mem32(anv_address_from_u64(params->launch_size_addr + 8)),
+ };
+
+ /* Store the original launch size into RT_DISPATCH_GLOBALS */
+ mi_store(&b, mi_mem32(anv_address_add(rtdg_addr,
+ GENX(RT_DISPATCH_GLOBALS_LaunchWidth_start) / 8)),
+ mi_value_ref(&b, launch_size[0]));
+ mi_store(&b, mi_mem32(anv_address_add(rtdg_addr,
+ GENX(RT_DISPATCH_GLOBALS_LaunchHeight_start) / 8)),
+ mi_value_ref(&b, launch_size[1]));
+ mi_store(&b, mi_mem32(anv_address_add(rtdg_addr,
+ GENX(RT_DISPATCH_GLOBALS_LaunchDepth_start) / 8)),
+ mi_value_ref(&b, launch_size[2]));
+
+ /* Compute the global dispatch size */
+ for (unsigned i = 0; i < 3; i++) {
+ if (local_size_log2[i] == 0)
+ continue;
+
+ /* global_size = DIV_ROUND_UP(launch_size, local_size)
+ *
+ * Fortunately for us, MI_ALU math is 64-bit and mi_ushr32_imm has the
+ * semantics of shifting the entire 64-bit value and taking the bottom
+ * 32 bits, so we don't have to worry about roll-over.
+ */
+ uint32_t local_size = 1 << local_size_log2[i];
+ launch_size[i] = mi_iadd(&b, launch_size[i],
+ mi_imm(local_size - 1));
+ launch_size[i] = mi_ushr32_imm(&b, launch_size[i],
+ local_size_log2[i]);
+ }
+
+ mi_store(&b, mi_reg32(GPGPU_DISPATCHDIMX), launch_size[0]);
+ mi_store(&b, mi_reg32(GPGPU_DISPATCHDIMY), launch_size[1]);
+ mi_store(&b, mi_reg32(GPGPU_DISPATCHDIMZ), launch_size[2]);
+
+ } else {
+ calc_local_trace_size(local_size_log2, params->launch_size);
+
+ for (unsigned i = 0; i < 3; i++) {
+ /* We have to be a bit careful here because DIV_ROUND_UP adds to the
+ * numerator, which may overflow. Cast to uint64_t to avoid this.
+ */
+ uint32_t local_size = 1 << local_size_log2[i];
+ global_size[i] = DIV_ROUND_UP((uint64_t)params->launch_size[i], local_size);
+ }
+ }
+
+#if GFX_VERx10 == 125
+ /* Wa_14014427904 - We need additional invalidate/flush when
+ * emitting NP state commands with ATS-M in compute mode.
+ */
+ if (intel_device_info_is_atsm(device->info) &&
+ cmd_buffer->queue_family->engine_class == INTEL_ENGINE_CLASS_COMPUTE) {
+ genx_batch_emit_pipe_control(&cmd_buffer->batch,
+ cmd_buffer->device->info,
+ cmd_buffer->state.current_pipeline,
+ ANV_PIPE_CS_STALL_BIT |
+ ANV_PIPE_STATE_CACHE_INVALIDATE_BIT |
+ ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT |
+ ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT |
+ ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT |
+ ANV_PIPE_INSTRUCTION_CACHE_INVALIDATE_BIT |
+ ANV_PIPE_HDC_PIPELINE_FLUSH_BIT);
+ }
+#endif
+
+ anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_BTD), btd) {
+ /* TODO: This is the timeout after which the bucketed thread dispatcher
+ * will kick off a wave of threads. We go with the lowest value
+ * for now. It could be tweaked on a per application basis
+ * (drirc).
+ */
+ btd.DispatchTimeoutCounter = _64clocks;
+ /* BSpec 43851: "This field must be programmed to 6h i.e. memory backed
+ * buffer must be 128KB."
+ */
+ btd.PerDSSMemoryBackedBufferSize = 6;
+ btd.MemoryBackedBufferBasePointer = (struct anv_address) { .bo = device->btd_fifo_bo };
+ if (pipeline->base.scratch_size > 0) {
+ struct anv_bo *scratch_bo =
+ anv_scratch_pool_alloc(device,
+ &device->scratch_pool,
+ MESA_SHADER_COMPUTE,
+ pipeline->base.scratch_size);
+ anv_reloc_list_add_bo(cmd_buffer->batch.relocs,
+ scratch_bo);
+ uint32_t scratch_surf =
+ anv_scratch_pool_get_surf(cmd_buffer->device,
+ &device->scratch_pool,
+ pipeline->base.scratch_size);
+ btd.ScratchSpaceBuffer = scratch_surf >> 4;
+ }
+ }
+
+ genX(cmd_buffer_ensure_cfe_state)(cmd_buffer, pipeline->base.scratch_size);
+
+ const struct brw_cs_prog_data *cs_prog_data =
+ brw_cs_prog_data_const(device->rt_trampoline->prog_data);
+ struct intel_cs_dispatch_info dispatch =
+ brw_cs_get_dispatch_info(device->info, cs_prog_data, NULL);
+
+ anv_batch_emit(&cmd_buffer->batch, GENX(COMPUTE_WALKER), cw) {
+ cw.IndirectParameterEnable = params->is_launch_size_indirect;
+ cw.PredicateEnable = cmd_buffer->state.conditional_render_enabled;
+ cw.SIMDSize = dispatch.simd_size / 16;
+ cw.MessageSIMD = dispatch.simd_size / 16;
+ cw.LocalXMaximum = (1 << local_size_log2[0]) - 1;
+ cw.LocalYMaximum = (1 << local_size_log2[1]) - 1;
+ cw.LocalZMaximum = (1 << local_size_log2[2]) - 1;
+ cw.ThreadGroupIDXDimension = global_size[0];
+ cw.ThreadGroupIDYDimension = global_size[1];
+ cw.ThreadGroupIDZDimension = global_size[2];
+ cw.ExecutionMask = 0xff;
+ cw.EmitInlineParameter = true;
+ cw.PostSync.MOCS = anv_mocs(pipeline->base.device, NULL, 0);
+
+ const gl_shader_stage s = MESA_SHADER_RAYGEN;
+ struct anv_device *device = cmd_buffer->device;
+ struct anv_state *surfaces = &cmd_buffer->state.binding_tables[s];
+ struct anv_state *samplers = &cmd_buffer->state.samplers[s];
+ cw.InterfaceDescriptor = (struct GENX(INTERFACE_DESCRIPTOR_DATA)) {
+ .KernelStartPointer = device->rt_trampoline->kernel.offset,
+ .SamplerStatePointer = samplers->offset,
+ /* i965: DIV_ROUND_UP(CLAMP(stage_state->sampler_count, 0, 16), 4), */
+ .SamplerCount = 0,
+ .BindingTablePointer = surfaces->offset,
+ .NumberofThreadsinGPGPUThreadGroup = 1,
+ .BTDMode = true,
+ };
+
+ struct brw_rt_raygen_trampoline_params trampoline_params = {
+ .rt_disp_globals_addr = anv_address_physical(rtdg_addr),
+ .raygen_bsr_addr =
+ params->is_sbt_indirect ?
+ (params->indirect_sbts_addr +
+ offsetof(VkTraceRaysIndirectCommand2KHR,
+ raygenShaderRecordAddress)) :
+ params->raygen_sbt->deviceAddress,
+ .is_indirect = params->is_sbt_indirect,
+ .local_group_size_log2 = {
+ local_size_log2[0],
+ local_size_log2[1],
+ local_size_log2[2],
+ },
+ };
+ STATIC_ASSERT(sizeof(trampoline_params) == 32);
+ memcpy(cw.InlineData, &trampoline_params, sizeof(trampoline_params));
+ }
+
+ trace_intel_end_rays(&cmd_buffer->trace,
+ params->launch_size[0],
+ params->launch_size[1],
+ params->launch_size[2]);
+}
+
+void
+genX(CmdTraceRaysKHR)(
+ VkCommandBuffer commandBuffer,
+ const VkStridedDeviceAddressRegionKHR* pRaygenShaderBindingTable,
+ const VkStridedDeviceAddressRegionKHR* pMissShaderBindingTable,
+ const VkStridedDeviceAddressRegionKHR* pHitShaderBindingTable,
+ const VkStridedDeviceAddressRegionKHR* pCallableShaderBindingTable,
+ uint32_t width,
+ uint32_t height,
+ uint32_t depth)
+{
+ ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+ struct trace_params params = {
+ .is_sbt_indirect = false,
+ .raygen_sbt = pRaygenShaderBindingTable,
+ .miss_sbt = pMissShaderBindingTable,
+ .hit_sbt = pHitShaderBindingTable,
+ .callable_sbt = pCallableShaderBindingTable,
+ .is_launch_size_indirect = false,
+ .launch_size = {
+ width,
+ height,
+ depth,
+ },
+ };
+
+ cmd_buffer_trace_rays(cmd_buffer, &params);
+}
+
+void
+genX(CmdTraceRaysIndirectKHR)(
+ VkCommandBuffer commandBuffer,
+ const VkStridedDeviceAddressRegionKHR* pRaygenShaderBindingTable,
+ const VkStridedDeviceAddressRegionKHR* pMissShaderBindingTable,
+ const VkStridedDeviceAddressRegionKHR* pHitShaderBindingTable,
+ const VkStridedDeviceAddressRegionKHR* pCallableShaderBindingTable,
+ VkDeviceAddress indirectDeviceAddress)
+{
+ ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+ struct trace_params params = {
+ .is_sbt_indirect = false,
+ .raygen_sbt = pRaygenShaderBindingTable,
+ .miss_sbt = pMissShaderBindingTable,
+ .hit_sbt = pHitShaderBindingTable,
+ .callable_sbt = pCallableShaderBindingTable,
+ .is_launch_size_indirect = true,
+ .launch_size_addr = indirectDeviceAddress,
+ };
+
+ cmd_buffer_trace_rays(cmd_buffer, &params);
+}
+
+void
+genX(CmdTraceRaysIndirect2KHR)(
+ VkCommandBuffer commandBuffer,
+ VkDeviceAddress indirectDeviceAddress)
+{
+ ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+ struct trace_params params = {
+ .is_sbt_indirect = true,
+ .indirect_sbts_addr = indirectDeviceAddress,
+ .is_launch_size_indirect = true,
+ .launch_size_addr = indirectDeviceAddress +
+ offsetof(VkTraceRaysIndirectCommand2KHR, width),
+ };
+
+ cmd_buffer_trace_rays(cmd_buffer, &params);
+}
+
+#endif /* GFX_VERx10 >= 125 */
diff --git a/src/intel/vulkan/genX_cmd_draw.c b/src/intel/vulkan/genX_cmd_draw.c
new file mode 100644
index 00000000000..64a806659b6
--- /dev/null
+++ b/src/intel/vulkan/genX_cmd_draw.c
@@ -0,0 +1,2330 @@
+/*
+ * Copyright © 2015 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <assert.h>
+#include <stdbool.h>
+
+#include "anv_private.h"
+#include "anv_measure.h"
+#include "vk_render_pass.h"
+#include "vk_util.h"
+
+#include "common/intel_aux_map.h"
+#include "genxml/gen_macros.h"
+#include "genxml/genX_pack.h"
+#include "genxml/genX_rt_pack.h"
+#include "common/intel_genX_state_brw.h"
+
+#include "ds/intel_tracepoints.h"
+
+/* We reserve:
+ * - GPR 14 for secondary command buffer returns
+ * - GPR 15 for conditional rendering
+ */
+#define MI_BUILDER_NUM_ALLOC_GPRS 14
+#define __gen_get_batch_dwords anv_batch_emit_dwords
+#define __gen_address_offset anv_address_add
+#define __gen_get_batch_address(b, a) anv_batch_address(b, a)
+#include "common/mi_builder.h"
+
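+/* Carve the push constant URB space into per-stage allocations. A worked
+ * example for a plain VS+FS pipeline on a part with 16KB of push constant
+ * space: size_per_stage = 16 / 2 = 8KB, so the loop below programs
+ *
+ *    3DSTATE_PUSH_CONSTANT_ALLOC_VS: offset 0KB, size 8KB
+ *    3DSTATE_PUSH_CONSTANT_ALLOC_{HS,DS,GS}: size 0
+ *    3DSTATE_PUSH_CONSTANT_ALLOC_PS: offset 8KB, size 8KB
+ *
+ * On 32KB parts, size_per_stage is additionally rounded down to a multiple
+ * of 2KB (the "&= ~1u" below).
+ */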
+static void
+cmd_buffer_alloc_gfx_push_constants(struct anv_cmd_buffer *cmd_buffer)
+{
+ struct anv_graphics_pipeline *pipeline =
+ anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
+ VkShaderStageFlags stages = pipeline->base.base.active_stages;
+
+ /* In order to avoid thrashing, we assume that vertex and fragment stages
+ * always exist. In the rare case where one is missing *and* the other
+ * uses push constants, this may be suboptimal. However, avoiding stalls
+ * seems more important.
+ */
+ stages |= VK_SHADER_STAGE_FRAGMENT_BIT;
+ if (anv_pipeline_is_primitive(pipeline))
+ stages |= VK_SHADER_STAGE_VERTEX_BIT;
+
+ if (stages == cmd_buffer->state.gfx.push_constant_stages)
+ return;
+
+ unsigned push_constant_kb;
+
+ const struct intel_device_info *devinfo = cmd_buffer->device->info;
+ if (anv_pipeline_is_mesh(pipeline))
+ push_constant_kb = devinfo->mesh_max_constant_urb_size_kb;
+ else
+ push_constant_kb = devinfo->max_constant_urb_size_kb;
+
+ const unsigned num_stages =
+ util_bitcount(stages & VK_SHADER_STAGE_ALL_GRAPHICS);
+ unsigned size_per_stage = push_constant_kb / num_stages;
+
+ /* Broadwell+ and Haswell gt3 require that the push constant sizes be in
+ * units of 2KB. Incidentally, these are the same platforms that have
+ * 32KB worth of push constant space.
+ */
+ if (push_constant_kb == 32)
+ size_per_stage &= ~1u;
+
+ uint32_t kb_used = 0;
+ for (int i = MESA_SHADER_VERTEX; i < MESA_SHADER_FRAGMENT; i++) {
+ const unsigned push_size = (stages & (1 << i)) ? size_per_stage : 0;
+ anv_batch_emit(&cmd_buffer->batch,
+ GENX(3DSTATE_PUSH_CONSTANT_ALLOC_VS), alloc) {
+ alloc._3DCommandSubOpcode = 18 + i;
+ alloc.ConstantBufferOffset = (push_size > 0) ? kb_used : 0;
+ alloc.ConstantBufferSize = push_size;
+ }
+ kb_used += push_size;
+ }
+
+ anv_batch_emit(&cmd_buffer->batch,
+ GENX(3DSTATE_PUSH_CONSTANT_ALLOC_PS), alloc) {
+ alloc.ConstantBufferOffset = kb_used;
+ alloc.ConstantBufferSize = push_constant_kb - kb_used;
+ }
+
+#if GFX_VERx10 == 125
+ /* DG2: Wa_22011440098
+ * MTL: Wa_18022330953
+ *
+ * In 3D mode, after programming push constant alloc command immediately
+ * program push constant command(ZERO length) without any commit between
+ * them.
+ */
+ anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CONSTANT_ALL), c) {
+ /* Update empty push constants for all stages (bitmask = 11111b) */
+ c.ShaderUpdateEnable = 0x1f;
+ c.MOCS = anv_mocs(cmd_buffer->device, NULL, 0);
+ }
+#endif
+
+ cmd_buffer->state.gfx.push_constant_stages = stages;
+
+ /* From the BDW PRM for 3DSTATE_PUSH_CONSTANT_ALLOC_VS:
+ *
+ * "The 3DSTATE_CONSTANT_VS must be reprogrammed prior to
+ * the next 3DPRIMITIVE command after programming the
+ * 3DSTATE_PUSH_CONSTANT_ALLOC_VS"
+ *
+ * Since 3DSTATE_PUSH_CONSTANT_ALLOC_VS is programmed as part of
+ * pipeline setup, we need to dirty push constants.
+ */
+ cmd_buffer->state.push_constants_dirty |= stages;
+}
+
+static void
+cmd_buffer_emit_descriptor_pointers(struct anv_cmd_buffer *cmd_buffer,
+ uint32_t stages)
+{
+ static const uint32_t sampler_state_opcodes[] = {
+ [MESA_SHADER_VERTEX] = 43,
+ [MESA_SHADER_TESS_CTRL] = 44, /* HS */
+ [MESA_SHADER_TESS_EVAL] = 45, /* DS */
+ [MESA_SHADER_GEOMETRY] = 46,
+ [MESA_SHADER_FRAGMENT] = 47,
+ };
+
+ static const uint32_t binding_table_opcodes[] = {
+ [MESA_SHADER_VERTEX] = 38,
+ [MESA_SHADER_TESS_CTRL] = 39,
+ [MESA_SHADER_TESS_EVAL] = 40,
+ [MESA_SHADER_GEOMETRY] = 41,
+ [MESA_SHADER_FRAGMENT] = 42,
+ };
+
+ anv_foreach_stage(s, stages) {
+ assert(s < ARRAY_SIZE(binding_table_opcodes));
+
+ if (cmd_buffer->state.samplers[s].alloc_size > 0) {
+ anv_batch_emit(&cmd_buffer->batch,
+ GENX(3DSTATE_SAMPLER_STATE_POINTERS_VS), ssp) {
+ ssp._3DCommandSubOpcode = sampler_state_opcodes[s];
+ ssp.PointertoVSSamplerState = cmd_buffer->state.samplers[s].offset;
+ }
+ }
+
+ /* Always emit binding table pointers if we're asked to, since on SKL
+ * this is what flushes push constants. */
+ anv_batch_emit(&cmd_buffer->batch,
+ GENX(3DSTATE_BINDING_TABLE_POINTERS_VS), btp) {
+ btp._3DCommandSubOpcode = binding_table_opcodes[s];
+ btp.PointertoVSBindingTable = cmd_buffer->state.binding_tables[s].offset;
+ }
+ }
+}
+
+static struct anv_address
+get_push_range_address(struct anv_cmd_buffer *cmd_buffer,
+ const struct anv_shader_bin *shader,
+ const struct anv_push_range *range)
+{
+ struct anv_cmd_graphics_state *gfx_state = &cmd_buffer->state.gfx;
+ switch (range->set) {
+ case ANV_DESCRIPTOR_SET_DESCRIPTORS: {
+ /* This is a descriptor set buffer so the set index is
+ * actually given by binding->binding. (Yes, that's
+ * confusing.)
+ */
+ struct anv_descriptor_set *set =
+ gfx_state->base.descriptors[range->index];
+ return anv_descriptor_set_address(set);
+ }
+
+ case ANV_DESCRIPTOR_SET_DESCRIPTORS_BUFFER: {
+ return anv_address_from_u64(
+ anv_cmd_buffer_descriptor_buffer_address(
+ cmd_buffer,
+ gfx_state->base.descriptor_buffers[range->index].buffer_index) +
+ gfx_state->base.descriptor_buffers[range->index].buffer_offset);
+ }
+
+ case ANV_DESCRIPTOR_SET_PUSH_CONSTANTS: {
+ if (gfx_state->base.push_constants_state.alloc_size == 0) {
+ gfx_state->base.push_constants_state =
+ anv_cmd_buffer_gfx_push_constants(cmd_buffer);
+ }
+ return anv_cmd_buffer_temporary_state_address(
+ cmd_buffer, gfx_state->base.push_constants_state);
+ }
+
+ default: {
+ assert(range->set < MAX_SETS);
+ struct anv_descriptor_set *set =
+ gfx_state->base.descriptors[range->set];
+ const struct anv_descriptor *desc =
+ &set->descriptors[range->index];
+
+ if (desc->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER) {
+ if (desc->buffer) {
+ return anv_address_add(desc->buffer->address,
+ desc->offset);
+ }
+ } else {
+ assert(desc->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC);
+ if (desc->buffer) {
+ const struct anv_cmd_pipeline_state *pipe_state = &gfx_state->base;
+ uint32_t dynamic_offset =
+ pipe_state->dynamic_offsets[
+ range->set].offsets[range->dynamic_offset_index];
+ return anv_address_add(desc->buffer->address,
+ desc->offset + dynamic_offset);
+ }
+ }
+
+ /* For NULL UBOs, we just return an address in the workaround BO. We do
+ * writes to it for workarounds but always at the bottom. The higher
+ * bytes should be all zeros.
+ */
+ assert(range->length * 32 <= 2048);
+ return (struct anv_address) {
+ .bo = cmd_buffer->device->workaround_bo,
+ .offset = 1024,
+ };
+ }
+ }
+}
+
+
+/** Returns the size in bytes of the bound buffer
+ *
+ * The range is relative to the start of the buffer, not the start of the
+ * range. The returned range may be smaller than
+ *
+ * (range->start + range->length) * 32;
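+ *
+ * As a worked illustration of the dynamic-UBO clamp below (assuming
+ * ANV_UBO_ALIGNMENT is 64): desc->offset = 256, dynamic_offset = 512,
+ * desc->range = 1024 and a 1500-byte buffer give
+ *
+ *    offset      = 256 + 512 = 768
+ *    bound_range = MIN2(1024, 1500 - 768) = 732
+ *    aligned     = align(732, 64) = 768
+ *
+ * i.e. the size reported here is aligned up and may extend slightly past
+ * the end of the buffer.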
+ */
+static uint32_t
+get_push_range_bound_size(struct anv_cmd_buffer *cmd_buffer,
+ const struct anv_shader_bin *shader,
+ const struct anv_push_range *range)
+{
+ assert(shader->stage != MESA_SHADER_COMPUTE);
+ const struct anv_cmd_graphics_state *gfx_state = &cmd_buffer->state.gfx;
+ switch (range->set) {
+ case ANV_DESCRIPTOR_SET_DESCRIPTORS: {
+ struct anv_descriptor_set *set =
+ gfx_state->base.descriptors[range->index];
+ struct anv_state state = set->desc_surface_mem;
+ assert(range->start * 32 < state.alloc_size);
+ assert((range->start + range->length) * 32 <= state.alloc_size);
+ return state.alloc_size;
+ }
+
+ case ANV_DESCRIPTOR_SET_DESCRIPTORS_BUFFER:
+ return gfx_state->base.pipeline->layout.set[
+ range->index].layout->descriptor_buffer_surface_size;
+
+ case ANV_DESCRIPTOR_SET_PUSH_CONSTANTS:
+ return (range->start + range->length) * 32;
+
+ default: {
+ assert(range->set < MAX_SETS);
+ struct anv_descriptor_set *set =
+ gfx_state->base.descriptors[range->set];
+ const struct anv_descriptor *desc =
+ &set->descriptors[range->index];
+
+ if (desc->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER) {
+ /* Here we promote a UBO to a binding table entry so that we can avoid
+ * a layer of indirection. We use the descriptor set's internally
+ * allocated surface state to fill the binding table entry.
+ */
+ if (!desc->buffer)
+ return 0;
+
+ if (range->start * 32 > desc->bind_range)
+ return 0;
+
+ return desc->bind_range;
+ } else {
+ if (!desc->buffer)
+ return 0;
+
+ assert(desc->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC);
+ /* Compute the offset within the buffer */
+ const struct anv_cmd_pipeline_state *pipe_state = &gfx_state->base;
+ uint32_t dynamic_offset =
+ pipe_state->dynamic_offsets[
+ range->set].offsets[range->dynamic_offset_index];
+ uint64_t offset = desc->offset + dynamic_offset;
+ /* Clamp to the buffer size */
+ offset = MIN2(offset, desc->buffer->vk.size);
+ /* Clamp the range to the buffer size */
+ uint32_t bound_range = MIN2(desc->range, desc->buffer->vk.size - offset);
+
+ /* Align the range for consistency */
+ bound_range = align(bound_range, ANV_UBO_ALIGNMENT);
+
+ return bound_range;
+ }
+ }
+ }
+}
+
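+/* Emit 3DSTATE_CONSTANT_XS for a single stage. Note that the ranges are
+ * packed into the highest constant buffer slots (see the Skylake PRM quote
+ * in the body): with buffer_count == 2, shift = 4 - 2 = 2, so ranges 0 and
+ * 1 land in ConstantBody slots 2 and 3, and slot 0 is only ever used when
+ * all four ranges are present.
+ */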
+static void
+cmd_buffer_emit_push_constant(struct anv_cmd_buffer *cmd_buffer,
+ gl_shader_stage stage,
+ struct anv_address *buffers,
+ unsigned buffer_count)
+{
+ const struct anv_cmd_graphics_state *gfx_state = &cmd_buffer->state.gfx;
+ const struct anv_graphics_pipeline *pipeline =
+ anv_pipeline_to_graphics(gfx_state->base.pipeline);
+
+ static const uint32_t push_constant_opcodes[] = {
+ [MESA_SHADER_VERTEX] = 21,
+ [MESA_SHADER_TESS_CTRL] = 25, /* HS */
+ [MESA_SHADER_TESS_EVAL] = 26, /* DS */
+ [MESA_SHADER_GEOMETRY] = 22,
+ [MESA_SHADER_FRAGMENT] = 23,
+ };
+
+ assert(stage < ARRAY_SIZE(push_constant_opcodes));
+
+ UNUSED uint32_t mocs = anv_mocs(cmd_buffer->device, NULL, 0);
+
+ anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CONSTANT_VS), c) {
+ c._3DCommandSubOpcode = push_constant_opcodes[stage];
+
+ /* Set MOCS.
+ *
+ * We only have one MOCS field for the whole packet, not one per
+ * buffer. We could go out of our way here to walk over all of
+ * the buffers and see if any of them are used externally and use
+ * the external MOCS. However, the notion that someone would use
+ * the same bit of memory for both scanout and a UBO is nuts.
+ *
+ * Let's not bother and assume it's all internal.
+ */
+ c.MOCS = mocs;
+
+ if (anv_pipeline_has_stage(pipeline, stage)) {
+ const struct anv_pipeline_bind_map *bind_map =
+ &pipeline->base.shaders[stage]->bind_map;
+
+ /* The Skylake PRM contains the following restriction:
+ *
+ * "The driver must ensure The following case does not occur
+ * without a flush to the 3D engine: 3DSTATE_CONSTANT_* with
+ * buffer 3 read length equal to zero committed followed by a
+ * 3DSTATE_CONSTANT_* with buffer 0 read length not equal to
+ * zero committed."
+ *
+ * To avoid this, we program the buffers in the highest slots.
+ * This way, slot 0 is only used if slot 3 is also used.
+ */
+ assert(buffer_count <= 4);
+ const unsigned shift = 4 - buffer_count;
+ for (unsigned i = 0; i < buffer_count; i++) {
+ const struct anv_push_range *range = &bind_map->push_ranges[i];
+
+ /* At this point we only have non-empty ranges */
+ assert(range->length > 0);
+
+ c.ConstantBody.ReadLength[i + shift] = range->length;
+ c.ConstantBody.Buffer[i + shift] =
+ anv_address_add(buffers[i], range->start * 32);
+ }
+ }
+ }
+}
+
+#if GFX_VER >= 12
+static void
+cmd_buffer_emit_push_constant_all(struct anv_cmd_buffer *cmd_buffer,
+ uint32_t shader_mask,
+ struct anv_address *buffers,
+ uint32_t buffer_count)
+{
+ if (buffer_count == 0) {
+ anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CONSTANT_ALL), c) {
+ c.ShaderUpdateEnable = shader_mask;
+ c.MOCS = isl_mocs(&cmd_buffer->device->isl_dev, 0, false);
+ }
+ return;
+ }
+
+ const struct anv_cmd_graphics_state *gfx_state = &cmd_buffer->state.gfx;
+ const struct anv_graphics_pipeline *pipeline =
+ anv_pipeline_to_graphics(gfx_state->base.pipeline);
+
+ gl_shader_stage stage = vk_to_mesa_shader_stage(shader_mask);
+
+ const struct anv_pipeline_bind_map *bind_map =
+ &pipeline->base.shaders[stage]->bind_map;
+
+ uint32_t *dw;
+ const uint32_t buffer_mask = (1 << buffer_count) - 1;
+ const uint32_t num_dwords = 2 + 2 * buffer_count;
+
+ dw = anv_batch_emitn(&cmd_buffer->batch, num_dwords,
+ GENX(3DSTATE_CONSTANT_ALL),
+ .ShaderUpdateEnable = shader_mask,
+ .PointerBufferMask = buffer_mask,
+ .MOCS = isl_mocs(&cmd_buffer->device->isl_dev, 0, false));
+
+ for (int i = 0; i < buffer_count; i++) {
+ const struct anv_push_range *range = &bind_map->push_ranges[i];
+ GENX(3DSTATE_CONSTANT_ALL_DATA_pack)(
+ &cmd_buffer->batch, dw + 2 + i * 2,
+ &(struct GENX(3DSTATE_CONSTANT_ALL_DATA)) {
+ .PointerToConstantBuffer =
+ anv_address_add(buffers[i], range->start * 32),
+ .ConstantBufferReadLength = range->length,
+ });
+ }
+}
+#endif
+
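+/* Flush dirty push constants for the graphics stages. The first loop below
+ * rebuilds push_reg_mask[stage] for robustness: only the registers actually
+ * backed by the bound buffer are marked valid. A worked example for a
+ * single range with start = 2 and length = 4 whose bound size is 96 bytes:
+ *
+ *    bound_regs = MIN2(DIV_ROUND_UP(96, 32) - 2, 4) = 1
+ *
+ * so only the first of the four registers in that range is set in the mask.
+ */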
+static void
+cmd_buffer_flush_gfx_push_constants(struct anv_cmd_buffer *cmd_buffer,
+ VkShaderStageFlags dirty_stages)
+{
+ VkShaderStageFlags flushed = 0;
+ struct anv_cmd_graphics_state *gfx_state = &cmd_buffer->state.gfx;
+ const struct anv_graphics_pipeline *pipeline =
+ anv_pipeline_to_graphics(gfx_state->base.pipeline);
+
+#if GFX_VER >= 12
+ uint32_t nobuffer_stages = 0;
+#endif
+
+ /* Compute robust pushed register access mask for each stage. */
+ anv_foreach_stage(stage, dirty_stages) {
+ if (!anv_pipeline_has_stage(pipeline, stage))
+ continue;
+
+ const struct anv_shader_bin *shader = pipeline->base.shaders[stage];
+ if (shader->prog_data->zero_push_reg) {
+ const struct anv_pipeline_bind_map *bind_map = &shader->bind_map;
+ struct anv_push_constants *push = &gfx_state->base.push_constants;
+
+ push->push_reg_mask[stage] = 0;
+ /* Start of the current range in the shader, relative to the start of
+ * push constants in the shader.
+ */
+ unsigned range_start_reg = 0;
+ for (unsigned i = 0; i < 4; i++) {
+ const struct anv_push_range *range = &bind_map->push_ranges[i];
+ if (range->length == 0)
+ continue;
+
+ unsigned bound_size =
+ get_push_range_bound_size(cmd_buffer, shader, range);
+ if (bound_size >= range->start * 32) {
+ unsigned bound_regs =
+ MIN2(DIV_ROUND_UP(bound_size, 32) - range->start,
+ range->length);
+ assert(range_start_reg + bound_regs <= 64);
+ push->push_reg_mask[stage] |= BITFIELD64_RANGE(range_start_reg,
+ bound_regs);
+ }
+
+ cmd_buffer->state.push_constants_dirty |=
+ mesa_to_vk_shader_stage(stage);
+
+ range_start_reg += range->length;
+ }
+ }
+ }
+
+ /* Setting NULL resets the push constant state so that we allocate a new one
+ * if needed. If the push constant data is not dirty, get_push_range_address
+ * can re-use the existing allocation.
+ */
+ if (gfx_state->base.push_constants_data_dirty)
+ gfx_state->base.push_constants_state = ANV_STATE_NULL;
+
+ anv_foreach_stage(stage, dirty_stages) {
+ unsigned buffer_count = 0;
+ flushed |= mesa_to_vk_shader_stage(stage);
+ UNUSED uint32_t max_push_range = 0;
+
+ struct anv_address buffers[4] = {};
+ if (anv_pipeline_has_stage(pipeline, stage)) {
+ const struct anv_shader_bin *shader = pipeline->base.shaders[stage];
+ const struct anv_pipeline_bind_map *bind_map = &shader->bind_map;
+
+ /* We have to gather buffer addresses as a second step because the
+ * loop above puts data into the push constant area and the call to
+ * get_push_range_address is what locks our push constants and copies
+ * them into the actual GPU buffer. If we did the two loops at the
+ * same time, we'd risk only having some of the sizes in the push
+ * constant buffer when we did the copy.
+ */
+ for (unsigned i = 0; i < 4; i++) {
+ const struct anv_push_range *range = &bind_map->push_ranges[i];
+ if (range->length == 0)
+ break;
+
+ buffers[i] = get_push_range_address(cmd_buffer, shader, range);
+ max_push_range = MAX2(max_push_range, range->length);
+ buffer_count++;
+ }
+
+ /* We have at most 4 buffers but they should be tightly packed */
+ for (unsigned i = buffer_count; i < 4; i++)
+ assert(bind_map->push_ranges[i].length == 0);
+ }
+
+#if GFX_VER >= 12
+ /* If this stage doesn't have any push constants, emit it later in a
+ * single CONSTANT_ALL packet.
+ */
+ if (buffer_count == 0) {
+ nobuffer_stages |= 1 << stage;
+ continue;
+ }
+
+ /* The Constant Buffer Read Length field from 3DSTATE_CONSTANT_ALL
+ * contains only 5 bits, so we can only use it for buffers smaller than
+ * 32 registers (1KB).
+ *
+ * According to Wa_16011448509, Gfx12.0 misinterprets some address bits
+ * in 3DSTATE_CONSTANT_ALL. It should still be safe to use the command
+ * for disabling stages, where all address bits are zero. However, we
+ * can't safely use it for general buffers with arbitrary addresses.
+ * Just fall back to the individual 3DSTATE_CONSTANT_XS commands in that
+ * case.
+ */
+ if (max_push_range < 32 && GFX_VERx10 > 120) {
+ cmd_buffer_emit_push_constant_all(cmd_buffer, 1 << stage,
+ buffers, buffer_count);
+ continue;
+ }
+#endif
+
+ cmd_buffer_emit_push_constant(cmd_buffer, stage, buffers, buffer_count);
+ }
+
+#if GFX_VER >= 12
+ if (nobuffer_stages)
+ /* Wa_16011448509: all address bits are zero */
+ cmd_buffer_emit_push_constant_all(cmd_buffer, nobuffer_stages, NULL, 0);
+#endif
+
+ cmd_buffer->state.push_constants_dirty &= ~flushed;
+ gfx_state->base.push_constants_data_dirty = false;
+}
+
+#if GFX_VERx10 >= 125
+static void
+cmd_buffer_flush_mesh_inline_data(struct anv_cmd_buffer *cmd_buffer,
+ VkShaderStageFlags dirty_stages)
+{
+ struct anv_cmd_graphics_state *gfx_state = &cmd_buffer->state.gfx;
+ const struct anv_graphics_pipeline *pipeline =
+ anv_pipeline_to_graphics(gfx_state->base.pipeline);
+
+ if (dirty_stages & VK_SHADER_STAGE_TASK_BIT_EXT &&
+ anv_pipeline_has_stage(pipeline, MESA_SHADER_TASK)) {
+
+ const struct anv_shader_bin *shader = pipeline->base.shaders[MESA_SHADER_TASK];
+ const struct anv_pipeline_bind_map *bind_map = &shader->bind_map;
+
+ anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_TASK_SHADER_DATA), data) {
+ const struct anv_push_range *range = &bind_map->push_ranges[0];
+ if (range->length > 0) {
+ struct anv_address buffer =
+ get_push_range_address(cmd_buffer, shader, range);
+
+ uint64_t addr = anv_address_physical(buffer);
+ data.InlineData[0] = addr & 0xffffffff;
+ data.InlineData[1] = addr >> 32;
+
+ memcpy(&data.InlineData[BRW_TASK_MESH_PUSH_CONSTANTS_START_DW],
+ cmd_buffer->state.gfx.base.push_constants.client_data,
+ BRW_TASK_MESH_PUSH_CONSTANTS_SIZE_DW * 4);
+ }
+ }
+ }
+
+ if (dirty_stages & VK_SHADER_STAGE_MESH_BIT_EXT &&
+ anv_pipeline_has_stage(pipeline, MESA_SHADER_MESH)) {
+
+ const struct anv_shader_bin *shader = pipeline->base.shaders[MESA_SHADER_MESH];
+ const struct anv_pipeline_bind_map *bind_map = &shader->bind_map;
+
+ anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_MESH_SHADER_DATA), data) {
+ const struct anv_push_range *range = &bind_map->push_ranges[0];
+ if (range->length > 0) {
+ struct anv_address buffer =
+ get_push_range_address(cmd_buffer, shader, range);
+
+ uint64_t addr = anv_address_physical(buffer);
+ data.InlineData[0] = addr & 0xffffffff;
+ data.InlineData[1] = addr >> 32;
+
+ memcpy(&data.InlineData[BRW_TASK_MESH_PUSH_CONSTANTS_START_DW],
+ cmd_buffer->state.gfx.base.push_constants.client_data,
+ BRW_TASK_MESH_PUSH_CONSTANTS_SIZE_DW * 4);
+ }
+ }
+ }
+
+ cmd_buffer->state.push_constants_dirty &= ~dirty_stages;
+}
+#endif
+
+ALWAYS_INLINE static void
+genX(emit_hs)(struct anv_cmd_buffer *cmd_buffer)
+{
+ struct anv_graphics_pipeline *pipeline =
+ anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
+ if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_EVAL))
+ return;
+
+ anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline, final.hs);
+}
+
+ALWAYS_INLINE static void
+genX(emit_ds)(struct anv_cmd_buffer *cmd_buffer)
+{
+#if INTEL_NEEDS_WA_22018402687
+ /* Wa_22018402687:
+ * In any 3D enabled context, just before any Tessellation enabled draw
+ * call (3D Primitive), re-send the last programmed 3DSTATE_DS again.
+ * This will make sure that the 3DSTATE_INT generated just before the
+ * draw call will have TDS dirty which will make sure TDS will launch the
+ * state thread before the draw call.
+ *
+ * This fixes a hang resulting from running anything using tessellation
+ * after a switch away from the mesh pipeline.
+ * We don't need to track said switch, as it matters at the HW level, and
+ * can be triggered even across processes, so we apply the Wa at all times.
+ */
+ struct anv_graphics_pipeline *pipeline =
+ anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
+ if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_EVAL))
+ return;
+
+ anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline, final.ds);
+#endif
+}
+
+ALWAYS_INLINE static void
+genX(cmd_buffer_flush_gfx_state)(struct anv_cmd_buffer *cmd_buffer)
+{
+ struct anv_graphics_pipeline *pipeline =
+ anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
+ const struct vk_dynamic_graphics_state *dyn =
+ &cmd_buffer->vk.dynamic_graphics_state;
+ uint32_t *p;
+
+ assert((pipeline->base.base.active_stages & VK_SHADER_STAGE_COMPUTE_BIT) == 0);
+
+ genX(cmd_buffer_config_l3)(cmd_buffer, pipeline->base.base.l3_config);
+
+ genX(cmd_buffer_emit_hashing_mode)(cmd_buffer, UINT_MAX, UINT_MAX, 1);
+
+ genX(flush_descriptor_buffers)(cmd_buffer, &cmd_buffer->state.gfx.base);
+
+ genX(flush_pipeline_select_3d)(cmd_buffer);
+
+ /* Wa_14015814527
+ *
+ * Apply task URB workaround when switching from task to primitive.
+ */
+ if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_PIPELINE) {
+ if (anv_pipeline_is_primitive(pipeline)) {
+ genX(apply_task_urb_workaround)(cmd_buffer);
+ } else if (anv_pipeline_has_stage(pipeline, MESA_SHADER_TASK)) {
+ cmd_buffer->state.gfx.used_task_shader = true;
+ }
+ }
+
+ /* Apply any pending pipeline flushes we may have. We want to apply them
+ * now because, if any of those flushes are for things like push constants,
+ * the GPU will read the state at weird times.
+ */
+ genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
+
+ /* Check what vertex buffers have been rebound against the set of bindings
+ * being used by the current set of vertex attributes.
+ */
+ uint32_t vb_emit = cmd_buffer->state.gfx.vb_dirty & dyn->vi->bindings_valid;
+   /* If the pipeline changed, we have to consider all the valid bindings. */
+ if ((cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_PIPELINE) ||
+ BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VI_BINDING_STRIDES))
+ vb_emit |= dyn->vi->bindings_valid;
+
+ if (vb_emit) {
+ const uint32_t num_buffers = __builtin_popcount(vb_emit);
+ const uint32_t num_dwords = 1 + num_buffers * 4;
+
+ p = anv_batch_emitn(&cmd_buffer->batch, num_dwords,
+ GENX(3DSTATE_VERTEX_BUFFERS));
+ uint32_t i = 0;
+ u_foreach_bit(vb, vb_emit) {
+ struct anv_buffer *buffer = cmd_buffer->state.vertex_bindings[vb].buffer;
+ uint32_t offset = cmd_buffer->state.vertex_bindings[vb].offset;
+
+ struct GENX(VERTEX_BUFFER_STATE) state;
+ if (buffer) {
+ uint32_t stride = dyn->vi_binding_strides[vb];
+ UNUSED uint32_t size = cmd_buffer->state.vertex_bindings[vb].size;
+
+ state = (struct GENX(VERTEX_BUFFER_STATE)) {
+ .VertexBufferIndex = vb,
+
+ .MOCS = anv_mocs(cmd_buffer->device, buffer->address.bo,
+ ISL_SURF_USAGE_VERTEX_BUFFER_BIT),
+ .AddressModifyEnable = true,
+ .BufferPitch = stride,
+ .BufferStartingAddress = anv_address_add(buffer->address, offset),
+ .NullVertexBuffer = offset >= buffer->vk.size,
+#if GFX_VER >= 12
+ .L3BypassDisable = true,
+#endif
+
+ .BufferSize = size,
+ };
+ } else {
+ state = (struct GENX(VERTEX_BUFFER_STATE)) {
+ .VertexBufferIndex = vb,
+ .NullVertexBuffer = true,
+ .MOCS = anv_mocs(cmd_buffer->device, NULL,
+ ISL_SURF_USAGE_VERTEX_BUFFER_BIT),
+ };
+ }
+
+#if GFX_VER == 9
+ genX(cmd_buffer_set_binding_for_gfx8_vb_flush)(cmd_buffer, vb,
+ state.BufferStartingAddress,
+ state.BufferSize);
+#endif
+
+ GENX(VERTEX_BUFFER_STATE_pack)(&cmd_buffer->batch, &p[1 + i * 4], &state);
+ i++;
+ }
+ }
+
+ cmd_buffer->state.gfx.vb_dirty &= ~vb_emit;
+
+ const bool any_dynamic_state_dirty =
+ vk_dynamic_graphics_state_any_dirty(dyn);
+ uint32_t descriptors_dirty = cmd_buffer->state.descriptors_dirty &
+ pipeline->base.base.active_stages;
+
+ descriptors_dirty |=
+ genX(cmd_buffer_flush_push_descriptors)(cmd_buffer,
+ &cmd_buffer->state.gfx.base,
+ &pipeline->base.base);
+
+ /* Wa_1306463417, Wa_16011107343 - Send HS state for every primitive. */
+ if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_PIPELINE ||
+ (INTEL_NEEDS_WA_1306463417 || INTEL_NEEDS_WA_16011107343)) {
+ genX(emit_hs)(cmd_buffer);
+ }
+
+ if (!cmd_buffer->state.gfx.dirty && !descriptors_dirty &&
+ !any_dynamic_state_dirty &&
+ ((cmd_buffer->state.push_constants_dirty &
+ (VK_SHADER_STAGE_ALL_GRAPHICS |
+ VK_SHADER_STAGE_TASK_BIT_EXT |
+ VK_SHADER_STAGE_MESH_BIT_EXT)) == 0))
+ return;
+
+ if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_XFB_ENABLE) {
+ /* Wa_16011411144:
+ *
+ * SW must insert a PIPE_CONTROL cmd before and after the
+ * 3dstate_so_buffer_index_0/1/2/3 states to ensure so_buffer_index_*
+ * state is not combined with other state changes.
+ */
+ if (intel_needs_workaround(cmd_buffer->device->info, 16011411144)) {
+ anv_add_pending_pipe_bits(cmd_buffer,
+ ANV_PIPE_CS_STALL_BIT,
+ "before SO_BUFFER change WA");
+ genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
+ }
+
+ /* We don't need any per-buffer dirty tracking because you're not
+ * allowed to bind different XFB buffers while XFB is enabled.
+ */
+ for (unsigned idx = 0; idx < MAX_XFB_BUFFERS; idx++) {
+ struct anv_xfb_binding *xfb = &cmd_buffer->state.xfb_bindings[idx];
+ anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_SO_BUFFER), sob) {
+#if GFX_VER < 12
+ sob.SOBufferIndex = idx;
+#else
+ sob._3DCommandOpcode = 0;
+ sob._3DCommandSubOpcode = SO_BUFFER_INDEX_0_CMD + idx;
+#endif
+
+ if (cmd_buffer->state.xfb_enabled && xfb->buffer && xfb->size != 0) {
+ sob.MOCS = anv_mocs(cmd_buffer->device, xfb->buffer->address.bo,
+ ISL_SURF_USAGE_STREAM_OUT_BIT);
+ sob.SurfaceBaseAddress = anv_address_add(xfb->buffer->address,
+ xfb->offset);
+ sob.SOBufferEnable = true;
+ sob.StreamOffsetWriteEnable = false;
+ /* Size is in DWords - 1 */
+ sob.SurfaceSize = DIV_ROUND_UP(xfb->size, 4) - 1;
+ } else {
+ sob.MOCS = anv_mocs(cmd_buffer->device, NULL, 0);
+ }
+ }
+ }
+
+ if (intel_needs_workaround(cmd_buffer->device->info, 16011411144)) {
+ /* Wa_16011411144: also CS_STALL after touching SO_BUFFER change */
+ anv_add_pending_pipe_bits(cmd_buffer,
+ ANV_PIPE_CS_STALL_BIT,
+ "after SO_BUFFER change WA");
+ genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
+ } else if (GFX_VER >= 10) {
+ /* CNL and later require a CS stall after 3DSTATE_SO_BUFFER */
+ anv_add_pending_pipe_bits(cmd_buffer,
+ ANV_PIPE_CS_STALL_BIT,
+ "after 3DSTATE_SO_BUFFER call");
+ }
+ }
+
+ /* Flush the runtime state into the HW state tracking */
+ if (cmd_buffer->state.gfx.dirty || any_dynamic_state_dirty)
+ genX(cmd_buffer_flush_gfx_runtime_state)(cmd_buffer);
+
+   /* Flush the HW state into the command buffer */
+ if (!BITSET_IS_EMPTY(cmd_buffer->state.gfx.dyn_state.dirty))
+ genX(cmd_buffer_flush_gfx_hw_state)(cmd_buffer);
+
+ /* If the pipeline changed, we may need to re-allocate push constant space
+ * in the URB.
+ */
+ if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_PIPELINE) {
+ cmd_buffer_alloc_gfx_push_constants(cmd_buffer);
+
+ /* Also add the relocations (scratch buffers) */
+ VkResult result = anv_reloc_list_append(cmd_buffer->batch.relocs,
+ pipeline->base.base.batch.relocs);
+ if (result != VK_SUCCESS) {
+ anv_batch_set_error(&cmd_buffer->batch, result);
+ return;
+ }
+ }
+
+ /* Render targets live in the same binding table as fragment descriptors */
+ if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_RENDER_TARGETS)
+ descriptors_dirty |= VK_SHADER_STAGE_FRAGMENT_BIT;
+
+ /* We emit the binding tables and sampler tables first, then emit push
+ * constants and then finally emit binding table and sampler table
+ * pointers. It has to happen in this order, since emitting the binding
+ * tables may change the push constants (in case of storage images). After
+ * emitting push constants, on SKL+ we have to emit the corresponding
+ * 3DSTATE_BINDING_TABLE_POINTER_* for the push constants to take effect.
+ */
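+   /* Concretely, the order below is: genX(cmd_buffer_flush_descriptor_sets)
+    * emits the binding/sampler tables, cmd_buffer_flush_gfx_push_constants()
+    * (and cmd_buffer_flush_mesh_inline_data() where supported) then push the
+    * constants, and cmd_buffer_emit_descriptor_pointers() finally emits the
+    * pointer packets for whatever became dirty.
+    */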
+ uint32_t dirty = 0;
+ if (descriptors_dirty) {
+ dirty = genX(cmd_buffer_flush_descriptor_sets)(
+ cmd_buffer,
+ &cmd_buffer->state.gfx.base,
+ descriptors_dirty,
+ pipeline->base.shaders,
+ ARRAY_SIZE(pipeline->base.shaders));
+ cmd_buffer->state.descriptors_dirty &= ~dirty;
+ }
+
+ if (dirty || cmd_buffer->state.push_constants_dirty) {
+ /* Because we're pushing UBOs, we have to push whenever either
+ * descriptors or push constants is dirty.
+ */
+ dirty |= cmd_buffer->state.push_constants_dirty &
+ pipeline->base.base.active_stages;
+ cmd_buffer_flush_gfx_push_constants(cmd_buffer,
+ dirty & VK_SHADER_STAGE_ALL_GRAPHICS);
+#if GFX_VERx10 >= 125
+ cmd_buffer_flush_mesh_inline_data(
+ cmd_buffer, dirty & (VK_SHADER_STAGE_TASK_BIT_EXT |
+ VK_SHADER_STAGE_MESH_BIT_EXT));
+#endif
+ }
+
+ if (dirty & VK_SHADER_STAGE_ALL_GRAPHICS) {
+ cmd_buffer_emit_descriptor_pointers(cmd_buffer,
+ dirty & VK_SHADER_STAGE_ALL_GRAPHICS);
+ }
+
+ /* When we're done, there is no more dirty gfx state. */
+ cmd_buffer->state.gfx.dirty = 0;
+}
+
+ALWAYS_INLINE static bool
+anv_use_generated_draws(const struct anv_cmd_buffer *cmd_buffer, uint32_t count)
+{
+ const struct anv_device *device = cmd_buffer->device;
+ const struct anv_graphics_pipeline *pipeline =
+ anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
+
+ /* We cannot generate readable commands in protected mode. */
+ if (cmd_buffer->vk.pool->flags & VK_COMMAND_POOL_CREATE_PROTECTED_BIT)
+ return false;
+
+ /* Limit generated draws to pipelines without HS stage. This makes things
+ * simpler for implementing Wa_1306463417, Wa_16011107343.
+ */
+ if ((INTEL_NEEDS_WA_1306463417 || INTEL_NEEDS_WA_16011107343) &&
+ anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_CTRL))
+ return false;
+
+ return count >= device->physical->instance->generated_indirect_threshold;
+}
+
+#include "genX_cmd_draw_helpers.h"
+#include "genX_cmd_draw_generated_indirect.h"
+
+#if GFX_VER >= 11
+#define _3DPRIMITIVE_DIRECT GENX(3DPRIMITIVE_EXTENDED)
+#else
+#define _3DPRIMITIVE_DIRECT GENX(3DPRIMITIVE)
+#endif
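+
+/* On Gfx11+ direct draws go through 3DPRIMITIVE_EXTENDED so that the
+ * extended parameters (see the GEN11_3DPRIM_XP* comment further down) can be
+ * supplied inline with the draw, rather than through the separate base
+ * vertex/instance uploads done by cmd_buffer_emit_vertex_constants_and_flush()
+ * on the pre-Gfx11 paths.
+ */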
+
+void genX(CmdDraw)(
+ VkCommandBuffer commandBuffer,
+ uint32_t vertexCount,
+ uint32_t instanceCount,
+ uint32_t firstVertex,
+ uint32_t firstInstance)
+{
+ ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+ struct anv_graphics_pipeline *pipeline =
+ anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
+
+ if (anv_batch_has_error(&cmd_buffer->batch))
+ return;
+
+ const uint32_t count =
+ vertexCount * instanceCount * pipeline->instance_multiplier;
+ anv_measure_snapshot(cmd_buffer,
+ INTEL_SNAPSHOT_DRAW,
+ "draw", count);
+ trace_intel_begin_draw(&cmd_buffer->trace);
+
+ /* Select pipeline here to allow
+ * cmd_buffer_emit_vertex_constants_and_flush() without flushing before
+ * cmd_buffer_flush_gfx_state().
+ */
+ genX(flush_pipeline_select_3d)(cmd_buffer);
+
+ if (cmd_buffer->state.conditional_render_enabled)
+ genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
+
+#if GFX_VER < 11
+ cmd_buffer_emit_vertex_constants_and_flush(cmd_buffer,
+ get_vs_prog_data(pipeline),
+ firstVertex, firstInstance, 0,
+ false /* force_flush */);
+#endif
+
+ genX(cmd_buffer_flush_gfx_state)(cmd_buffer);
+ genX(emit_ds)(cmd_buffer);
+
+ genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device, true);
+
+ anv_batch_emit(&cmd_buffer->batch, _3DPRIMITIVE_DIRECT, prim) {
+ prim.PredicateEnable = cmd_buffer->state.conditional_render_enabled;
+#if GFX_VERx10 >= 125
+ prim.TBIMREnable = cmd_buffer->state.gfx.dyn_state.use_tbimr;
+#endif
+ prim.VertexAccessType = SEQUENTIAL;
+ prim.VertexCountPerInstance = vertexCount;
+ prim.StartVertexLocation = firstVertex;
+ prim.InstanceCount = instanceCount *
+ pipeline->instance_multiplier;
+ prim.StartInstanceLocation = firstInstance;
+ prim.BaseVertexLocation = 0;
+#if GFX_VER >= 11
+ prim.ExtendedParametersPresent = true;
+ prim.ExtendedParameter0 = firstVertex;
+ prim.ExtendedParameter1 = firstInstance;
+ prim.ExtendedParameter2 = 0;
+#endif
+ }
+
+ genX(batch_emit_post_3dprimitive_was)(&cmd_buffer->batch,
+ cmd_buffer->device,
+ cmd_buffer->state.gfx.primitive_topology,
+ vertexCount);
+ genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device, false);
+
+ update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, SEQUENTIAL);
+
+ trace_intel_end_draw(&cmd_buffer->trace, count);
+}
+
+void genX(CmdDrawMultiEXT)(
+ VkCommandBuffer commandBuffer,
+ uint32_t drawCount,
+ const VkMultiDrawInfoEXT *pVertexInfo,
+ uint32_t instanceCount,
+ uint32_t firstInstance,
+ uint32_t stride)
+{
+ ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+ UNUSED struct anv_graphics_pipeline *pipeline =
+ anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
+
+ if (anv_batch_has_error(&cmd_buffer->batch))
+ return;
+
+ genX(cmd_buffer_flush_gfx_state)(cmd_buffer);
+
+ if (cmd_buffer->state.conditional_render_enabled)
+ genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
+
+ uint32_t i = 0;
+#if GFX_VER < 11
+ vk_foreach_multi_draw(draw, i, pVertexInfo, drawCount, stride) {
+ cmd_buffer_emit_vertex_constants_and_flush(cmd_buffer,
+ get_vs_prog_data(pipeline),
+ draw->firstVertex,
+ firstInstance, i, !i);
+
+ const uint32_t count =
+ draw->vertexCount * instanceCount * pipeline->instance_multiplier;
+ anv_measure_snapshot(cmd_buffer,
+ INTEL_SNAPSHOT_DRAW,
+ "draw multi", count);
+ trace_intel_begin_draw_multi(&cmd_buffer->trace);
+
+ genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device, true);
+
+ anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
+ prim.PredicateEnable = cmd_buffer->state.conditional_render_enabled;
+ prim.VertexAccessType = SEQUENTIAL;
+ prim.VertexCountPerInstance = draw->vertexCount;
+ prim.StartVertexLocation = draw->firstVertex;
+ prim.InstanceCount = instanceCount *
+ pipeline->instance_multiplier;
+ prim.StartInstanceLocation = firstInstance;
+ prim.BaseVertexLocation = 0;
+ }
+
+ genX(batch_emit_post_3dprimitive_was)(&cmd_buffer->batch,
+ cmd_buffer->device,
+ cmd_buffer->state.gfx.primitive_topology,
+ drawCount == 0 ? 0 :
+ pVertexInfo[drawCount - 1].vertexCount);
+
+ genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device, false);
+ trace_intel_end_draw_multi(&cmd_buffer->trace, count);
+ }
+#else
+ vk_foreach_multi_draw(draw, i, pVertexInfo, drawCount, stride) {
+
+ /* Wa_1306463417, Wa_16011107343 - Send HS state for every primitive,
+ * first one was handled by cmd_buffer_flush_gfx_state.
+ */
+ if (i && (INTEL_NEEDS_WA_1306463417 || INTEL_NEEDS_WA_16011107343))
+ genX(emit_hs)(cmd_buffer);
+ genX(emit_ds)(cmd_buffer);
+
+ const uint32_t count = draw->vertexCount * instanceCount;
+ anv_measure_snapshot(cmd_buffer,
+ INTEL_SNAPSHOT_DRAW,
+ "draw multi", count);
+ trace_intel_begin_draw_multi(&cmd_buffer->trace);
+
+ genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device, true);
+
+ anv_batch_emit(&cmd_buffer->batch, _3DPRIMITIVE_DIRECT, prim) {
+#if GFX_VERx10 >= 125
+ prim.TBIMREnable = cmd_buffer->state.gfx.dyn_state.use_tbimr;
+#endif
+ prim.PredicateEnable = cmd_buffer->state.conditional_render_enabled;
+ prim.VertexAccessType = SEQUENTIAL;
+ prim.VertexCountPerInstance = draw->vertexCount;
+ prim.StartVertexLocation = draw->firstVertex;
+ prim.InstanceCount = instanceCount;
+ prim.StartInstanceLocation = firstInstance;
+ prim.BaseVertexLocation = 0;
+ prim.ExtendedParametersPresent = true;
+ prim.ExtendedParameter0 = draw->firstVertex;
+ prim.ExtendedParameter1 = firstInstance;
+ prim.ExtendedParameter2 = i;
+ }
+
+ genX(batch_emit_post_3dprimitive_was)(&cmd_buffer->batch,
+ cmd_buffer->device,
+ cmd_buffer->state.gfx.primitive_topology,
+ drawCount == 0 ? 0 :
+ pVertexInfo[drawCount - 1].vertexCount);
+
+ genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device, false);
+ trace_intel_end_draw_multi(&cmd_buffer->trace, count);
+ }
+#endif
+
+ update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, SEQUENTIAL);
+}
+
+void genX(CmdDrawIndexed)(
+ VkCommandBuffer commandBuffer,
+ uint32_t indexCount,
+ uint32_t instanceCount,
+ uint32_t firstIndex,
+ int32_t vertexOffset,
+ uint32_t firstInstance)
+{
+ ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+ struct anv_graphics_pipeline *pipeline =
+ anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
+
+ if (anv_batch_has_error(&cmd_buffer->batch))
+ return;
+
+ const uint32_t count =
+ indexCount * instanceCount * pipeline->instance_multiplier;
+ anv_measure_snapshot(cmd_buffer,
+ INTEL_SNAPSHOT_DRAW,
+ "draw indexed",
+ count);
+ trace_intel_begin_draw_indexed(&cmd_buffer->trace);
+
+ /* Select pipeline here to allow
+ * cmd_buffer_emit_vertex_constants_and_flush() without flushing before
+ * cmd_buffer_flush_gfx_state().
+ */
+ genX(flush_pipeline_select_3d)(cmd_buffer);
+
+ if (cmd_buffer->state.conditional_render_enabled)
+ genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
+
+#if GFX_VER < 11
+ const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
+ cmd_buffer_emit_vertex_constants_and_flush(cmd_buffer, vs_prog_data,
+ vertexOffset, firstInstance,
+ 0, false /* force_flush */);
+#endif
+
+ genX(cmd_buffer_flush_gfx_state)(cmd_buffer);
+ genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device, true);
+
+ anv_batch_emit(&cmd_buffer->batch, _3DPRIMITIVE_DIRECT, prim) {
+ prim.PredicateEnable = cmd_buffer->state.conditional_render_enabled;
+#if GFX_VERx10 >= 125
+ prim.TBIMREnable = cmd_buffer->state.gfx.dyn_state.use_tbimr;
+#endif
+ prim.VertexAccessType = RANDOM;
+ prim.VertexCountPerInstance = indexCount;
+ prim.StartVertexLocation = firstIndex;
+ prim.InstanceCount = instanceCount *
+ pipeline->instance_multiplier;
+ prim.StartInstanceLocation = firstInstance;
+ prim.BaseVertexLocation = vertexOffset;
+#if GFX_VER >= 11
+ prim.ExtendedParametersPresent = true;
+ prim.ExtendedParameter0 = vertexOffset;
+ prim.ExtendedParameter1 = firstInstance;
+ prim.ExtendedParameter2 = 0;
+#endif
+ }
+
+ genX(batch_emit_post_3dprimitive_was)(&cmd_buffer->batch,
+ cmd_buffer->device,
+ cmd_buffer->state.gfx.primitive_topology,
+ indexCount);
+ genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device, false);
+
+ update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, RANDOM);
+
+ trace_intel_end_draw_indexed(&cmd_buffer->trace, count);
+}
+
+void genX(CmdDrawMultiIndexedEXT)(
+ VkCommandBuffer commandBuffer,
+ uint32_t drawCount,
+ const VkMultiDrawIndexedInfoEXT *pIndexInfo,
+ uint32_t instanceCount,
+ uint32_t firstInstance,
+ uint32_t stride,
+ const int32_t *pVertexOffset)
+{
+ ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+ struct anv_graphics_pipeline *pipeline =
+ anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
+
+ if (anv_batch_has_error(&cmd_buffer->batch))
+ return;
+
+ genX(cmd_buffer_flush_gfx_state)(cmd_buffer);
+
+ if (cmd_buffer->state.conditional_render_enabled)
+ genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
+
+ uint32_t i = 0;
+#if GFX_VER < 11
+ const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
+ if (pVertexOffset) {
+ if (vs_prog_data->uses_drawid) {
+ bool emitted = true;
+ if (vs_prog_data->uses_firstvertex ||
+ vs_prog_data->uses_baseinstance) {
+ emit_base_vertex_instance(cmd_buffer, *pVertexOffset, firstInstance);
+ emitted = true;
+ }
+ vk_foreach_multi_draw_indexed(draw, i, pIndexInfo, drawCount, stride) {
+ if (vs_prog_data->uses_drawid) {
+ emit_draw_index(cmd_buffer, i);
+ emitted = true;
+ }
+ /* Emitting draw index or vertex index BOs may result in needing
+ * additional VF cache flushes.
+ */
+ if (emitted)
+ genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
+
+ const uint32_t count =
+ draw->indexCount * instanceCount * pipeline->instance_multiplier;
+ anv_measure_snapshot(cmd_buffer,
+ INTEL_SNAPSHOT_DRAW,
+ "draw indexed multi",
+ count);
+ trace_intel_begin_draw_indexed_multi(&cmd_buffer->trace);
+ genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device,
+ true);
+
+ anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
+ prim.PredicateEnable = cmd_buffer->state.conditional_render_enabled;
+ prim.VertexAccessType = RANDOM;
+ prim.VertexCountPerInstance = draw->indexCount;
+ prim.StartVertexLocation = draw->firstIndex;
+ prim.InstanceCount = instanceCount *
+ pipeline->instance_multiplier;
+ prim.StartInstanceLocation = firstInstance;
+ prim.BaseVertexLocation = *pVertexOffset;
+ }
+
+ genX(batch_emit_post_3dprimitive_was)(&cmd_buffer->batch,
+ cmd_buffer->device,
+ cmd_buffer->state.gfx.primitive_topology,
+ drawCount == 0 ? 0 :
+ pIndexInfo[drawCount - 1].indexCount);
+
+ genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device,
+ false);
+ trace_intel_end_draw_indexed_multi(&cmd_buffer->trace, count);
+ emitted = false;
+ }
+ } else {
+ if (vs_prog_data->uses_firstvertex ||
+ vs_prog_data->uses_baseinstance) {
+ emit_base_vertex_instance(cmd_buffer, *pVertexOffset, firstInstance);
+ /* Emitting draw index or vertex index BOs may result in needing
+ * additional VF cache flushes.
+ */
+ genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
+ }
+ vk_foreach_multi_draw_indexed(draw, i, pIndexInfo, drawCount, stride) {
+ const uint32_t count =
+ draw->indexCount * instanceCount * pipeline->instance_multiplier;
+ anv_measure_snapshot(cmd_buffer,
+ INTEL_SNAPSHOT_DRAW,
+ "draw indexed multi",
+ count);
+ trace_intel_begin_draw_indexed_multi(&cmd_buffer->trace);
+ genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device,
+ true);
+
+ anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
+ prim.PredicateEnable = cmd_buffer->state.conditional_render_enabled;
+ prim.VertexAccessType = RANDOM;
+ prim.VertexCountPerInstance = draw->indexCount;
+ prim.StartVertexLocation = draw->firstIndex;
+ prim.InstanceCount = instanceCount *
+ pipeline->instance_multiplier;
+ prim.StartInstanceLocation = firstInstance;
+ prim.BaseVertexLocation = *pVertexOffset;
+ }
+
+ genX(batch_emit_post_3dprimitive_was)(&cmd_buffer->batch,
+ cmd_buffer->device,
+ cmd_buffer->state.gfx.primitive_topology,
+ drawCount == 0 ? 0 :
+ pIndexInfo[drawCount - 1].indexCount);
+
+ genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device,
+ false);
+ trace_intel_end_draw_indexed_multi(&cmd_buffer->trace, count);
+ }
+ }
+ } else {
+ vk_foreach_multi_draw_indexed(draw, i, pIndexInfo, drawCount, stride) {
+ cmd_buffer_emit_vertex_constants_and_flush(cmd_buffer, vs_prog_data,
+ draw->vertexOffset,
+ firstInstance, i, i != 0);
+
+ const uint32_t count =
+ draw->indexCount * instanceCount * pipeline->instance_multiplier;
+ anv_measure_snapshot(cmd_buffer,
+ INTEL_SNAPSHOT_DRAW,
+ "draw indexed multi",
+ count);
+ trace_intel_begin_draw_indexed_multi(&cmd_buffer->trace);
+ genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device, true);
+
+ anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
+ prim.PredicateEnable = cmd_buffer->state.conditional_render_enabled;
+ prim.VertexAccessType = RANDOM;
+ prim.VertexCountPerInstance = draw->indexCount;
+ prim.StartVertexLocation = draw->firstIndex;
+ prim.InstanceCount = instanceCount *
+ pipeline->instance_multiplier;
+ prim.StartInstanceLocation = firstInstance;
+ prim.BaseVertexLocation = draw->vertexOffset;
+ }
+
+ genX(batch_emit_post_3dprimitive_was)(&cmd_buffer->batch,
+ cmd_buffer->device,
+ cmd_buffer->state.gfx.primitive_topology,
+ drawCount == 0 ? 0 :
+ pIndexInfo[drawCount - 1].indexCount);
+
+ genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device, false);
+ trace_intel_end_draw_indexed_multi(&cmd_buffer->trace, count);
+ }
+ }
+#else
+ vk_foreach_multi_draw_indexed(draw, i, pIndexInfo, drawCount, stride) {
+
+ /* Wa_1306463417, Wa_16011107343 - Send HS state for every primitive,
+ * first one was handled by cmd_buffer_flush_gfx_state.
+ */
+ if (i && (INTEL_NEEDS_WA_1306463417 || INTEL_NEEDS_WA_16011107343))
+ genX(emit_hs)(cmd_buffer);
+ genX(emit_ds)(cmd_buffer);
+
+ const uint32_t count =
+ draw->indexCount * instanceCount * pipeline->instance_multiplier;
+ anv_measure_snapshot(cmd_buffer,
+ INTEL_SNAPSHOT_DRAW,
+ "draw indexed multi",
+ count);
+ trace_intel_begin_draw_indexed_multi(&cmd_buffer->trace);
+ genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device, true);
+
+ anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE_EXTENDED), prim) {
+#if GFX_VERx10 >= 125
+ prim.TBIMREnable = cmd_buffer->state.gfx.dyn_state.use_tbimr;
+#endif
+ prim.PredicateEnable = cmd_buffer->state.conditional_render_enabled;
+ prim.VertexAccessType = RANDOM;
+ prim.VertexCountPerInstance = draw->indexCount;
+ prim.StartVertexLocation = draw->firstIndex;
+ prim.InstanceCount = instanceCount *
+ pipeline->instance_multiplier;
+ prim.StartInstanceLocation = firstInstance;
+ prim.BaseVertexLocation = pVertexOffset ? *pVertexOffset : draw->vertexOffset;
+ prim.ExtendedParametersPresent = true;
+ prim.ExtendedParameter0 = pVertexOffset ? *pVertexOffset : draw->vertexOffset;
+ prim.ExtendedParameter1 = firstInstance;
+ prim.ExtendedParameter2 = i;
+ }
+
+ genX(batch_emit_post_3dprimitive_was)(&cmd_buffer->batch,
+ cmd_buffer->device,
+ cmd_buffer->state.gfx.primitive_topology,
+ drawCount == 0 ? 0 :
+ pIndexInfo[drawCount - 1].indexCount);
+
+ genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device, false);
+ trace_intel_end_draw_indexed_multi(&cmd_buffer->trace, count);
+ }
+#endif
+
+ update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, RANDOM);
+}
+
+/* Auto-Draw / Indirect Registers */
+#define GFX7_3DPRIM_END_OFFSET 0x2420
+#define GFX7_3DPRIM_START_VERTEX 0x2430
+#define GFX7_3DPRIM_VERTEX_COUNT 0x2434
+#define GFX7_3DPRIM_INSTANCE_COUNT 0x2438
+#define GFX7_3DPRIM_START_INSTANCE 0x243C
+#define GFX7_3DPRIM_BASE_VERTEX 0x2440
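+
+/* These are the MMIO registers that 3DPRIMITIVE reads its parameters from
+ * when IndirectParameterEnable is set; the indirect draw paths below load
+ * them with MI commands (see load_indirect_parameters()) instead of encoding
+ * the values in the command itself.
+ */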
+
+/* On Gen11+, we have three custom "extended parameters" which we can use to
+ * provide extra system-generated values to shaders. Our assignment of these
+ * is arbitrary; we choose to assign them as follows:
+ *
+ * gl_BaseVertex = XP0
+ * gl_BaseInstance = XP1
+ * gl_DrawID = XP2
+ *
+ * For gl_BaseInstance, we never actually have to set up the value because we
+ * can just program 3DSTATE_VF_SGVS_2 to load it implicitly. We can also do
+ * that for gl_BaseVertex but it does the wrong thing for indexed draws.
+ */
+#define GEN11_3DPRIM_XP0 0x2690
+#define GEN11_3DPRIM_XP1 0x2694
+#define GEN11_3DPRIM_XP2 0x2698
+#define GEN11_3DPRIM_XP_BASE_VERTEX GEN11_3DPRIM_XP0
+#define GEN11_3DPRIM_XP_BASE_INSTANCE GEN11_3DPRIM_XP1
+#define GEN11_3DPRIM_XP_DRAW_ID GEN11_3DPRIM_XP2
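+
+/* For example, genX(CmdDrawIndirectByteCountEXT) below programs
+ * GEN11_3DPRIM_XP_BASE_VERTEX and GEN11_3DPRIM_XP_DRAW_ID with MI stores
+ * before its 3DPRIMITIVE, while direct draws such as genX(CmdDraw) supply
+ * the same values inline through ExtendedParameter0-2.
+ */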
+
+void genX(CmdDrawIndirectByteCountEXT)(
+ VkCommandBuffer commandBuffer,
+ uint32_t instanceCount,
+ uint32_t firstInstance,
+ VkBuffer counterBuffer,
+ VkDeviceSize counterBufferOffset,
+ uint32_t counterOffset,
+ uint32_t vertexStride)
+{
+ ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+ ANV_FROM_HANDLE(anv_buffer, counter_buffer, counterBuffer);
+ struct anv_graphics_pipeline *pipeline =
+ anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
+
+ /* firstVertex is always zero for this draw function */
+ const uint32_t firstVertex = 0;
+
+ if (anv_batch_has_error(&cmd_buffer->batch))
+ return;
+
+ anv_measure_snapshot(cmd_buffer,
+ INTEL_SNAPSHOT_DRAW,
+ "draw indirect byte count",
+ instanceCount * pipeline->instance_multiplier);
+ trace_intel_begin_draw_indirect_byte_count(&cmd_buffer->trace);
+
+ /* Select pipeline here to allow
+ * cmd_buffer_emit_vertex_constants_and_flush() without flushing before
+ * emit_base_vertex_instance() & emit_draw_index().
+ */
+ genX(flush_pipeline_select_3d)(cmd_buffer);
+
+ if (cmd_buffer->state.conditional_render_enabled)
+ genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
+
+#if GFX_VER < 11
+ const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
+ if (vs_prog_data->uses_firstvertex ||
+ vs_prog_data->uses_baseinstance)
+ emit_base_vertex_instance(cmd_buffer, firstVertex, firstInstance);
+ if (vs_prog_data->uses_drawid)
+ emit_draw_index(cmd_buffer, 0);
+#endif
+
+ genX(cmd_buffer_flush_gfx_state)(cmd_buffer);
+
+ struct mi_builder b;
+ mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
+ const uint32_t mocs = anv_mocs_for_address(cmd_buffer->device, &counter_buffer->address);
+ mi_builder_set_mocs(&b, mocs);
+ struct mi_value count =
+ mi_mem32(anv_address_add(counter_buffer->address,
+ counterBufferOffset));
+ if (counterOffset)
+ count = mi_isub(&b, count, mi_imm(counterOffset));
+ count = mi_udiv32_imm(&b, count, vertexStride);
+ mi_store(&b, mi_reg32(GFX7_3DPRIM_VERTEX_COUNT), count);
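+   /* i.e. vertexCount = (counter value - counterOffset) / vertexStride,
+    * turning the byte offset written by transform feedback back into a
+    * vertex count.
+    */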
+
+ mi_store(&b, mi_reg32(GFX7_3DPRIM_START_VERTEX), mi_imm(firstVertex));
+ mi_store(&b, mi_reg32(GFX7_3DPRIM_INSTANCE_COUNT),
+ mi_imm(instanceCount * pipeline->instance_multiplier));
+ mi_store(&b, mi_reg32(GFX7_3DPRIM_START_INSTANCE), mi_imm(firstInstance));
+ mi_store(&b, mi_reg32(GFX7_3DPRIM_BASE_VERTEX), mi_imm(0));
+
+#if GFX_VER >= 11
+ mi_store(&b, mi_reg32(GEN11_3DPRIM_XP_BASE_VERTEX),
+ mi_imm(firstVertex));
+ /* GEN11_3DPRIM_XP_BASE_INSTANCE is implicit */
+ mi_store(&b, mi_reg32(GEN11_3DPRIM_XP_DRAW_ID), mi_imm(0));
+#endif
+
+ genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device, true);
+ anv_batch_emit(&cmd_buffer->batch, _3DPRIMITIVE_DIRECT, prim) {
+#if GFX_VERx10 >= 125
+ prim.TBIMREnable = cmd_buffer->state.gfx.dyn_state.use_tbimr;
+#endif
+ prim.IndirectParameterEnable = true;
+ prim.PredicateEnable = cmd_buffer->state.conditional_render_enabled;
+ prim.VertexAccessType = SEQUENTIAL;
+#if GFX_VER >= 11
+ prim.ExtendedParametersPresent = true;
+#endif
+ }
+
+ genX(batch_emit_post_3dprimitive_was)(&cmd_buffer->batch,
+ cmd_buffer->device,
+ cmd_buffer->state.gfx.primitive_topology,
+ 1);
+ genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device, false);
+
+ update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, SEQUENTIAL);
+
+ trace_intel_end_draw_indirect_byte_count(&cmd_buffer->trace,
+ instanceCount * pipeline->instance_multiplier);
+}
+
+static void
+load_indirect_parameters(struct anv_cmd_buffer *cmd_buffer,
+ struct anv_address addr,
+ bool indexed,
+ uint32_t draw_id)
+{
+ struct anv_graphics_pipeline *pipeline =
+ anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
+
+ struct mi_builder b;
+ mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
+ const uint32_t mocs = anv_mocs_for_address(cmd_buffer->device, &addr);
+ mi_builder_set_mocs(&b, mocs);
+
+ mi_store(&b, mi_reg32(GFX7_3DPRIM_VERTEX_COUNT),
+ mi_mem32(anv_address_add(addr, 0)));
+
+ struct mi_value instance_count = mi_mem32(anv_address_add(addr, 4));
+ if (pipeline->instance_multiplier > 1) {
+ instance_count = mi_imul_imm(&b, instance_count,
+ pipeline->instance_multiplier);
+ }
+ mi_store(&b, mi_reg32(GFX7_3DPRIM_INSTANCE_COUNT), instance_count);
+
+ mi_store(&b, mi_reg32(GFX7_3DPRIM_START_VERTEX),
+ mi_mem32(anv_address_add(addr, 8)));
+
+ if (indexed) {
+ mi_store(&b, mi_reg32(GFX7_3DPRIM_BASE_VERTEX),
+ mi_mem32(anv_address_add(addr, 12)));
+ mi_store(&b, mi_reg32(GFX7_3DPRIM_START_INSTANCE),
+ mi_mem32(anv_address_add(addr, 16)));
+#if GFX_VER >= 11
+ mi_store(&b, mi_reg32(GEN11_3DPRIM_XP_BASE_VERTEX),
+ mi_mem32(anv_address_add(addr, 12)));
+ /* GEN11_3DPRIM_XP_BASE_INSTANCE is implicit */
+#endif
+ } else {
+ mi_store(&b, mi_reg32(GFX7_3DPRIM_START_INSTANCE),
+ mi_mem32(anv_address_add(addr, 12)));
+ mi_store(&b, mi_reg32(GFX7_3DPRIM_BASE_VERTEX), mi_imm(0));
+#if GFX_VER >= 11
+ mi_store(&b, mi_reg32(GEN11_3DPRIM_XP_BASE_VERTEX),
+ mi_mem32(anv_address_add(addr, 8)));
+ /* GEN11_3DPRIM_XP_BASE_INSTANCE is implicit */
+#endif
+ }
+
+#if GFX_VER >= 11
+ mi_store(&b, mi_reg32(GEN11_3DPRIM_XP_DRAW_ID),
+ mi_imm(draw_id));
+#endif
+}
+
+static bool
+execute_indirect_draw_supported(struct anv_cmd_buffer *cmd_buffer)
+{
+#if GFX_VERx10 >= 125
+ const struct intel_device_info *devinfo = cmd_buffer->device->info;
+ struct anv_graphics_pipeline *pipeline =
+ anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
+ const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
+ const bool is_multiview = pipeline->instance_multiplier > 1;
+
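+   /* EXECUTE_INDIRECT_DRAW consumes the application's indirect parameters
+    * as-is, so presumably we can only use it when nothing needs patching:
+    * no multiview instance multiplication and no firstvertex/baseinstance/
+    * drawid values to provide to the shader.
+    */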
+ return (devinfo->has_indirect_unroll &&
+ !is_multiview &&
+ !vs_prog_data->uses_firstvertex &&
+ !vs_prog_data->uses_baseinstance &&
+ !vs_prog_data->uses_drawid);
+#else
+ return false;
+#endif
+}
+
+static void
+emit_indirect_draws(struct anv_cmd_buffer *cmd_buffer,
+ struct anv_address indirect_data_addr,
+ uint32_t indirect_data_stride,
+ uint32_t draw_count,
+ bool indexed)
+{
+#if GFX_VER < 11
+ struct anv_graphics_pipeline *pipeline =
+ anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
+ const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
+#endif
+ UNUSED const struct intel_device_info *devinfo = cmd_buffer->device->info;
+ UNUSED const bool aligned_stride =
+ (indirect_data_stride == 0 ||
+ (!indexed && indirect_data_stride == sizeof(VkDrawIndirectCommand)) ||
+ (indexed && indirect_data_stride == sizeof(VkDrawIndexedIndirectCommand)));
+ UNUSED const bool execute_indirect_supported =
+ execute_indirect_draw_supported(cmd_buffer);
+
+ genX(cmd_buffer_flush_gfx_state)(cmd_buffer);
+
+ if (cmd_buffer->state.conditional_render_enabled)
+ genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
+
+ uint32_t offset = 0;
+ for (uint32_t i = 0; i < draw_count; i++) {
+ struct anv_address draw = anv_address_add(indirect_data_addr, offset);
+
+#if GFX_VER < 11
+ /* TODO: We need to stomp base vertex to 0 somehow */
+
+ /* With sequential draws, we're dealing with the VkDrawIndirectCommand
+ * structure data. We want to load VkDrawIndirectCommand::firstVertex at
+ * offset 8 in the structure.
+ *
+ * With indexed draws, we're dealing with VkDrawIndexedIndirectCommand.
+       * We want the VkDrawIndexedIndirectCommand::vertexOffset field at
+       * offset 12 in the structure.
+ */
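+      /* For reference, the Vulkan layouts are:
+       *   VkDrawIndirectCommand        = { vertexCount, instanceCount,
+       *                                    firstVertex, firstInstance }
+       *   VkDrawIndexedIndirectCommand = { indexCount, instanceCount,
+       *                                    firstIndex, vertexOffset,
+       *                                    firstInstance }
+       * with each field 4 bytes, hence offsets 8 and 12 above.
+       */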
+ if (vs_prog_data->uses_firstvertex ||
+ vs_prog_data->uses_baseinstance) {
+ emit_base_vertex_instance_bo(cmd_buffer,
+ anv_address_add(draw, indexed ? 12 : 8));
+ }
+ if (vs_prog_data->uses_drawid)
+ emit_draw_index(cmd_buffer, i);
+#endif
+
+ /* Emitting draw index or vertex index BOs may result in needing
+ * additional VF cache flushes.
+ */
+ genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
+
+ /* Wa_1306463417, Wa_16011107343 - Send HS state for every primitive,
+ * first one was handled by cmd_buffer_flush_gfx_state.
+ */
+ if (i && (INTEL_NEEDS_WA_1306463417 || INTEL_NEEDS_WA_16011107343))
+ genX(emit_hs)(cmd_buffer);
+ genX(emit_ds)(cmd_buffer);
+
+ if (execute_indirect_supported) {
+#if GFX_VERx10 >= 125
+ genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device, true);
+ anv_batch_emit(&cmd_buffer->batch, GENX(EXECUTE_INDIRECT_DRAW), ind) {
+ ind.ArgumentFormat = indexed ? DRAWINDEXED : DRAW;
+ ind.TBIMREnabled = cmd_buffer->state.gfx.dyn_state.use_tbimr;
+ ind.PredicateEnable =
+ cmd_buffer->state.conditional_render_enabled;
+ ind.MaxCount = aligned_stride ? draw_count : 1;
+ ind.ArgumentBufferStartAddress = draw;
+ ind.MOCS =
+ anv_mocs(cmd_buffer->device, draw.bo, 0);
+ }
+ /* If all the indirect structures are aligned, then we can let the HW
+ * do the unrolling and we only need one instruction. Otherwise we
+ * need to emit one instruction per draw, but we're still avoiding
+ * the register loads with MI commands.
+ */
+ if (aligned_stride)
+ break;
+#else
+ unreachable("EXECUTE_INDIRECT_DRAW instruction expectation mismatch");
+#endif
+ } else {
+ load_indirect_parameters(cmd_buffer, draw, indexed, i);
+
+ genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device, true);
+ anv_batch_emit(&cmd_buffer->batch, _3DPRIMITIVE_DIRECT, prim) {
+#if GFX_VERx10 >= 125
+ prim.TBIMREnable = cmd_buffer->state.gfx.dyn_state.use_tbimr;
+#endif
+ prim.IndirectParameterEnable = true;
+ prim.PredicateEnable = cmd_buffer->state.conditional_render_enabled;
+ prim.VertexAccessType = indexed ? RANDOM : SEQUENTIAL;
+#if GFX_VER >= 11
+ prim.ExtendedParametersPresent = true;
+#endif
+ }
+ }
+
+ genX(batch_emit_post_3dprimitive_was)(&cmd_buffer->batch,
+ cmd_buffer->device,
+ cmd_buffer->state.gfx.primitive_topology,
+ 1);
+
+ genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device, false);
+
+ update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer,
+ indexed ? RANDOM : SEQUENTIAL);
+
+ offset += indirect_data_stride;
+ }
+}
+
+void genX(CmdDrawIndirect)(
+ VkCommandBuffer commandBuffer,
+ VkBuffer _buffer,
+ VkDeviceSize offset,
+ uint32_t drawCount,
+ uint32_t stride)
+{
+ ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+ ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
+
+ if (anv_batch_has_error(&cmd_buffer->batch))
+ return;
+
+ anv_measure_snapshot(cmd_buffer,
+ INTEL_SNAPSHOT_DRAW,
+ "draw indirect",
+ drawCount);
+ trace_intel_begin_draw_indirect(&cmd_buffer->trace);
+
+ if (anv_use_generated_draws(cmd_buffer, drawCount)) {
+ genX(cmd_buffer_emit_indirect_generated_draws)(
+ cmd_buffer,
+ anv_address_add(buffer->address, offset),
+ MAX2(stride, sizeof(VkDrawIndirectCommand)),
+ ANV_NULL_ADDRESS /* count_addr */,
+ drawCount,
+ false /* indexed */);
+ } else {
+ emit_indirect_draws(cmd_buffer,
+ anv_address_add(buffer->address, offset),
+ stride, drawCount, false /* indexed */);
+ }
+
+ trace_intel_end_draw_indirect(&cmd_buffer->trace, drawCount);
+}
+
+void genX(CmdDrawIndexedIndirect)(
+ VkCommandBuffer commandBuffer,
+ VkBuffer _buffer,
+ VkDeviceSize offset,
+ uint32_t drawCount,
+ uint32_t stride)
+{
+ ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+ ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
+
+ if (anv_batch_has_error(&cmd_buffer->batch))
+ return;
+
+ anv_measure_snapshot(cmd_buffer,
+ INTEL_SNAPSHOT_DRAW,
+ "draw indexed indirect",
+ drawCount);
+ trace_intel_begin_draw_indexed_indirect(&cmd_buffer->trace);
+
+ if (anv_use_generated_draws(cmd_buffer, drawCount)) {
+ genX(cmd_buffer_emit_indirect_generated_draws)(
+ cmd_buffer,
+ anv_address_add(buffer->address, offset),
+ MAX2(stride, sizeof(VkDrawIndexedIndirectCommand)),
+ ANV_NULL_ADDRESS /* count_addr */,
+ drawCount,
+ true /* indexed */);
+ } else {
+ emit_indirect_draws(cmd_buffer,
+ anv_address_add(buffer->address, offset),
+ stride, drawCount, true /* indexed */);
+ }
+
+ trace_intel_end_draw_indexed_indirect(&cmd_buffer->trace, drawCount);
+}
+
+#define MI_PREDICATE_SRC0 0x2400
+#define MI_PREDICATE_SRC1 0x2408
+#define MI_PREDICATE_RESULT 0x2418
+
+static struct mi_value
+prepare_for_draw_count_predicate(struct anv_cmd_buffer *cmd_buffer,
+ struct mi_builder *b,
+ struct anv_address count_address)
+{
+ struct mi_value ret = mi_imm(0);
+
+ if (cmd_buffer->state.conditional_render_enabled) {
+ ret = mi_new_gpr(b);
+ mi_store(b, mi_value_ref(b, ret), mi_mem32(count_address));
+ } else {
+      /* Upload the current draw count from the count buffer to
+       * MI_PREDICATE_SRC0.
+ */
+ mi_store(b, mi_reg64(MI_PREDICATE_SRC0), mi_mem32(count_address));
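+      /* emit_draw_count_predicate() only rewrites the low DWord of
+       * MI_PREDICATE_SRC1 per draw, so clear the high DWord once here.
+       */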
+ mi_store(b, mi_reg32(MI_PREDICATE_SRC1 + 4), mi_imm(0));
+ }
+
+ return ret;
+}
+
+static void
+emit_draw_count_predicate(struct anv_cmd_buffer *cmd_buffer,
+ struct mi_builder *b,
+ uint32_t draw_index)
+{
+ /* Upload the index of the current primitive to MI_PREDICATE_SRC1. */
+ mi_store(b, mi_reg32(MI_PREDICATE_SRC1), mi_imm(draw_index));
+
+ if (draw_index == 0) {
+ anv_batch_emit(&cmd_buffer->batch, GENX(MI_PREDICATE), mip) {
+ mip.LoadOperation = LOAD_LOADINV;
+ mip.CombineOperation = COMBINE_SET;
+ mip.CompareOperation = COMPARE_SRCS_EQUAL;
+ }
+ } else {
+ /* While draw_index < draw_count the predicate's result will be
+ * (draw_index == draw_count) ^ TRUE = TRUE
+ * When draw_index == draw_count the result is
+ * (TRUE) ^ TRUE = FALSE
+ * After this all results will be:
+ * (FALSE) ^ FALSE = FALSE
+ */
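+       /* For example, with a draw count of 2 loaded into SRC0:
+        *   draw_index 0: LOADINV -> !(2 == 0)         = TRUE  (executes)
+        *   draw_index 1: XOR     ->  (2 == 1) ^ TRUE  = TRUE  (executes)
+        *   draw_index 2: XOR     ->  (2 == 2) ^ TRUE  = FALSE (skipped)
+        *   draw_index 3: XOR     ->  (2 == 3) ^ FALSE = FALSE (skipped)
+        */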
+ anv_batch_emit(&cmd_buffer->batch, GENX(MI_PREDICATE), mip) {
+ mip.LoadOperation = LOAD_LOAD;
+ mip.CombineOperation = COMBINE_XOR;
+ mip.CompareOperation = COMPARE_SRCS_EQUAL;
+ }
+ }
+}
+
+static void
+emit_draw_count_predicate_with_conditional_render(
+ struct anv_cmd_buffer *cmd_buffer,
+ struct mi_builder *b,
+ uint32_t draw_index,
+ struct mi_value max)
+{
+ struct mi_value pred = mi_ult(b, mi_imm(draw_index), max);
+ pred = mi_iand(b, pred, mi_reg64(ANV_PREDICATE_RESULT_REG));
+
+ mi_store(b, mi_reg32(MI_PREDICATE_RESULT), pred);
+}
+
+static void
+emit_draw_count_predicate_cond(struct anv_cmd_buffer *cmd_buffer,
+ struct mi_builder *b,
+ uint32_t draw_index,
+ struct mi_value max)
+{
+ if (cmd_buffer->state.conditional_render_enabled) {
+ emit_draw_count_predicate_with_conditional_render(
+ cmd_buffer, b, draw_index, mi_value_ref(b, max));
+ } else {
+ emit_draw_count_predicate(cmd_buffer, b, draw_index);
+ }
+}
+
+static void
+emit_indirect_count_draws(struct anv_cmd_buffer *cmd_buffer,
+ struct anv_address indirect_data_addr,
+ uint64_t indirect_data_stride,
+ struct anv_address draw_count_addr,
+ uint32_t max_draw_count,
+ bool indexed)
+{
+#if GFX_VER < 11
+ struct anv_graphics_pipeline *pipeline =
+ anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
+ const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
+#endif
+
+ genX(cmd_buffer_flush_gfx_state)(cmd_buffer);
+
+ struct mi_builder b;
+ mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
+ const uint32_t mocs = anv_mocs_for_address(cmd_buffer->device, &draw_count_addr);
+ mi_builder_set_mocs(&b, mocs);
+ struct mi_value max =
+ prepare_for_draw_count_predicate(cmd_buffer, &b, draw_count_addr);
+
+ for (uint32_t i = 0; i < max_draw_count; i++) {
+ struct anv_address draw =
+ anv_address_add(indirect_data_addr, i * indirect_data_stride);
+
+ emit_draw_count_predicate_cond(cmd_buffer, &b, i, max);
+
+#if GFX_VER < 11
+ if (vs_prog_data->uses_firstvertex ||
+ vs_prog_data->uses_baseinstance) {
+ emit_base_vertex_instance_bo(cmd_buffer,
+ anv_address_add(draw, indexed ? 12 : 8));
+ }
+ if (vs_prog_data->uses_drawid)
+ emit_draw_index(cmd_buffer, i);
+
+ /* Emitting draw index or vertex index BOs may result in needing
+ * additional VF cache flushes.
+ */
+ genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
+#endif
+
+ load_indirect_parameters(cmd_buffer, draw, indexed, i);
+
+ /* Wa_1306463417, Wa_16011107343 - Send HS state for every primitive,
+ * first one was handled by cmd_buffer_flush_gfx_state.
+ */
+ if (i && (INTEL_NEEDS_WA_1306463417 || INTEL_NEEDS_WA_16011107343))
+ genX(emit_hs)(cmd_buffer);
+ genX(emit_ds)(cmd_buffer);
+
+ genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device, true);
+ anv_batch_emit(&cmd_buffer->batch, _3DPRIMITIVE_DIRECT, prim) {
+#if GFX_VERx10 >= 125
+ prim.TBIMREnable = cmd_buffer->state.gfx.dyn_state.use_tbimr;
+#endif
+ prim.IndirectParameterEnable = true;
+ prim.PredicateEnable = true;
+ prim.VertexAccessType = indexed ? RANDOM : SEQUENTIAL;
+#if GFX_VER >= 11
+ prim.ExtendedParametersPresent = true;
+#endif
+ }
+
+ genX(batch_emit_post_3dprimitive_was)(&cmd_buffer->batch,
+ cmd_buffer->device,
+ cmd_buffer->state.gfx.primitive_topology,
+ 1);
+ genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device, false);
+
+ update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, SEQUENTIAL);
+ }
+
+ mi_value_unref(&b, max);
+}
+
+void genX(CmdDrawIndirectCount)(
+ VkCommandBuffer commandBuffer,
+ VkBuffer _buffer,
+ VkDeviceSize offset,
+ VkBuffer _countBuffer,
+ VkDeviceSize countBufferOffset,
+ uint32_t maxDrawCount,
+ uint32_t stride)
+{
+ ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+ ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
+ ANV_FROM_HANDLE(anv_buffer, count_buffer, _countBuffer);
+
+ if (anv_batch_has_error(&cmd_buffer->batch))
+ return;
+
+ anv_measure_snapshot(cmd_buffer,
+ INTEL_SNAPSHOT_DRAW,
+ "draw indirect count",
+ 0);
+ trace_intel_begin_draw_indirect_count(&cmd_buffer->trace);
+
+ struct anv_address indirect_data_address =
+ anv_address_add(buffer->address, offset);
+ struct anv_address count_address =
+ anv_address_add(count_buffer->address, countBufferOffset);
+ stride = MAX2(stride, sizeof(VkDrawIndirectCommand));
+
+ if (anv_use_generated_draws(cmd_buffer, maxDrawCount)) {
+ genX(cmd_buffer_emit_indirect_generated_draws)(
+ cmd_buffer,
+ indirect_data_address,
+ stride,
+ count_address,
+ maxDrawCount,
+ false /* indexed */);
+ } else {
+ emit_indirect_count_draws(cmd_buffer,
+ indirect_data_address,
+ stride,
+ count_address,
+ maxDrawCount,
+ false /* indexed */);
+ }
+
+ trace_intel_end_draw_indirect_count(&cmd_buffer->trace, maxDrawCount);
+}
+
+void genX(CmdDrawIndexedIndirectCount)(
+ VkCommandBuffer commandBuffer,
+ VkBuffer _buffer,
+ VkDeviceSize offset,
+ VkBuffer _countBuffer,
+ VkDeviceSize countBufferOffset,
+ uint32_t maxDrawCount,
+ uint32_t stride)
+{
+ ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+ ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
+ ANV_FROM_HANDLE(anv_buffer, count_buffer, _countBuffer);
+
+ if (anv_batch_has_error(&cmd_buffer->batch))
+ return;
+
+ anv_measure_snapshot(cmd_buffer,
+ INTEL_SNAPSHOT_DRAW,
+ "draw indexed indirect count",
+ 0);
+ trace_intel_begin_draw_indexed_indirect_count(&cmd_buffer->trace);
+
+ struct anv_address indirect_data_address =
+ anv_address_add(buffer->address, offset);
+ struct anv_address count_address =
+ anv_address_add(count_buffer->address, countBufferOffset);
+ stride = MAX2(stride, sizeof(VkDrawIndexedIndirectCommand));
+
+ if (anv_use_generated_draws(cmd_buffer, maxDrawCount)) {
+ genX(cmd_buffer_emit_indirect_generated_draws)(
+ cmd_buffer,
+ indirect_data_address,
+ stride,
+ count_address,
+ maxDrawCount,
+ true /* indexed */);
+ } else {
+ emit_indirect_count_draws(cmd_buffer,
+ indirect_data_address,
+ stride,
+ count_address,
+ maxDrawCount,
+ true /* indexed */);
+ }
+
+ trace_intel_end_draw_indexed_indirect_count(&cmd_buffer->trace, maxDrawCount);
+}
+
+void genX(CmdBeginTransformFeedbackEXT)(
+ VkCommandBuffer commandBuffer,
+ uint32_t firstCounterBuffer,
+ uint32_t counterBufferCount,
+ const VkBuffer* pCounterBuffers,
+ const VkDeviceSize* pCounterBufferOffsets)
+{
+ ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+
+ assert(firstCounterBuffer < MAX_XFB_BUFFERS);
+ assert(counterBufferCount <= MAX_XFB_BUFFERS);
+ assert(firstCounterBuffer + counterBufferCount <= MAX_XFB_BUFFERS);
+
+ trace_intel_begin_xfb(&cmd_buffer->trace);
+
+ /* From the SKL PRM Vol. 2c, SO_WRITE_OFFSET:
+ *
+    * "Software must ensure that no HW stream output operations can be in
+ * process or otherwise pending at the point that the MI_LOAD/STORE
+ * commands are processed. This will likely require a pipeline flush."
+ */
+ anv_add_pending_pipe_bits(cmd_buffer,
+ ANV_PIPE_CS_STALL_BIT,
+ "begin transform feedback");
+ genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
+
+ for (uint32_t idx = 0; idx < MAX_XFB_BUFFERS; idx++) {
+ /* If we have a counter buffer, this is a resume so we need to load the
+ * value into the streamout offset register. Otherwise, this is a begin
+ * and we need to reset it to zero.
+ */
+ if (pCounterBuffers &&
+ idx >= firstCounterBuffer &&
+ idx - firstCounterBuffer < counterBufferCount &&
+ pCounterBuffers[idx - firstCounterBuffer] != VK_NULL_HANDLE) {
+ uint32_t cb_idx = idx - firstCounterBuffer;
+ ANV_FROM_HANDLE(anv_buffer, counter_buffer, pCounterBuffers[cb_idx]);
+ uint64_t offset = pCounterBufferOffsets ?
+ pCounterBufferOffsets[cb_idx] : 0;
+
+ anv_batch_emit(&cmd_buffer->batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
+ lrm.RegisterAddress = GENX(SO_WRITE_OFFSET0_num) + idx * 4;
+ lrm.MemoryAddress = anv_address_add(counter_buffer->address,
+ offset);
+ }
+ } else {
+ anv_batch_emit(&cmd_buffer->batch, GENX(MI_LOAD_REGISTER_IMM), lri) {
+ lri.RegisterOffset = GENX(SO_WRITE_OFFSET0_num) + idx * 4;
+ lri.DataDWord = 0;
+ }
+ }
+ }
+
+ cmd_buffer->state.xfb_enabled = true;
+ cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_XFB_ENABLE;
+}
+
+void genX(CmdEndTransformFeedbackEXT)(
+ VkCommandBuffer commandBuffer,
+ uint32_t firstCounterBuffer,
+ uint32_t counterBufferCount,
+ const VkBuffer* pCounterBuffers,
+ const VkDeviceSize* pCounterBufferOffsets)
+{
+ ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+
+ assert(firstCounterBuffer < MAX_XFB_BUFFERS);
+ assert(counterBufferCount <= MAX_XFB_BUFFERS);
+ assert(firstCounterBuffer + counterBufferCount <= MAX_XFB_BUFFERS);
+
+ /* From the SKL PRM Vol. 2c, SO_WRITE_OFFSET:
+ *
+    * "Software must ensure that no HW stream output operations can be in
+ * process or otherwise pending at the point that the MI_LOAD/STORE
+ * commands are processed. This will likely require a pipeline flush."
+ */
+ anv_add_pending_pipe_bits(cmd_buffer,
+ ANV_PIPE_CS_STALL_BIT,
+ "end transform feedback");
+ genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
+
+ for (uint32_t cb_idx = 0; cb_idx < counterBufferCount; cb_idx++) {
+ unsigned idx = firstCounterBuffer + cb_idx;
+
+      /* If we have a counter buffer, this is a pause, so we need to store
+       * the current value of the streamout offset register into it so that
+       * a later resume can reload it. Otherwise, there is nothing to save.
+ */
+ if (pCounterBuffers &&
+ cb_idx < counterBufferCount &&
+ pCounterBuffers[cb_idx] != VK_NULL_HANDLE) {
+ ANV_FROM_HANDLE(anv_buffer, counter_buffer, pCounterBuffers[cb_idx]);
+ uint64_t offset = pCounterBufferOffsets ?
+ pCounterBufferOffsets[cb_idx] : 0;
+
+ anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_REGISTER_MEM), srm) {
+ srm.MemoryAddress = anv_address_add(counter_buffer->address,
+ offset);
+ srm.RegisterAddress = GENX(SO_WRITE_OFFSET0_num) + idx * 4;
+ }
+ }
+ }
+
+ trace_intel_end_xfb(&cmd_buffer->trace);
+
+ cmd_buffer->state.xfb_enabled = false;
+ cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_XFB_ENABLE;
+}
+
+#if GFX_VERx10 >= 125
+
+void
+genX(CmdDrawMeshTasksEXT)(
+ VkCommandBuffer commandBuffer,
+ uint32_t x,
+ uint32_t y,
+ uint32_t z)
+{
+ ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+
+ if (anv_batch_has_error(&cmd_buffer->batch))
+ return;
+
+ anv_measure_snapshot(cmd_buffer,
+ INTEL_SNAPSHOT_DRAW,
+ "draw mesh", x * y * z);
+
+ trace_intel_begin_draw_mesh(&cmd_buffer->trace);
+
+ /* TODO(mesh): Check if this is not emitting more packets than we need. */
+ genX(cmd_buffer_flush_gfx_state)(cmd_buffer);
+
+ if (cmd_buffer->state.conditional_render_enabled)
+ genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
+
+ anv_batch_emit(&cmd_buffer->batch, GENX(3DMESH_3D), m) {
+ m.PredicateEnable = cmd_buffer->state.conditional_render_enabled;
+ m.ThreadGroupCountX = x;
+ m.ThreadGroupCountY = y;
+ m.ThreadGroupCountZ = z;
+ }
+
+ trace_intel_end_draw_mesh(&cmd_buffer->trace, x, y, z);
+}
+
+#define GFX125_3DMESH_TG_COUNT 0x26F0
+#define GFX10_3DPRIM_XP(n) (0x2690 + (n) * 4) /* n = { 0, 1, 2 } */
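+
+/* mesh_load_indirect_parameters_3dmesh_3d() below fills these from a
+ * VkDrawMeshTasksIndirectCommandEXT: groupCountX into GFX125_3DMESH_TG_COUNT,
+ * groupCountY/Z into XP(1)/XP(2), and optionally the draw id into XP(0).
+ */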
+
+static void
+mesh_load_indirect_parameters_3dmesh_3d(struct anv_cmd_buffer *cmd_buffer,
+ struct mi_builder *b,
+ struct anv_address addr,
+ bool emit_xp0,
+ uint32_t xp0)
+{
+ const size_t groupCountXOff = offsetof(VkDrawMeshTasksIndirectCommandEXT, groupCountX);
+ const size_t groupCountYOff = offsetof(VkDrawMeshTasksIndirectCommandEXT, groupCountY);
+ const size_t groupCountZOff = offsetof(VkDrawMeshTasksIndirectCommandEXT, groupCountZ);
+
+ mi_store(b, mi_reg32(GFX125_3DMESH_TG_COUNT),
+ mi_mem32(anv_address_add(addr, groupCountXOff)));
+
+ mi_store(b, mi_reg32(GFX10_3DPRIM_XP(1)),
+ mi_mem32(anv_address_add(addr, groupCountYOff)));
+
+ mi_store(b, mi_reg32(GFX10_3DPRIM_XP(2)),
+ mi_mem32(anv_address_add(addr, groupCountZOff)));
+
+ if (emit_xp0)
+ mi_store(b, mi_reg32(GFX10_3DPRIM_XP(0)), mi_imm(xp0));
+}
+
+static void
+emit_indirect_3dmesh_3d(struct anv_batch *batch,
+ bool predicate_enable,
+ bool uses_drawid)
+{
+ uint32_t len = GENX(3DMESH_3D_length) + uses_drawid;
+ uint32_t *dw = anv_batch_emitn(batch, len, GENX(3DMESH_3D),
+ .PredicateEnable = predicate_enable,
+ .IndirectParameterEnable = true,
+ .ExtendedParameter0Present = uses_drawid);
+ if (uses_drawid)
+ dw[len - 1] = 0;
+}
+
+void
+genX(CmdDrawMeshTasksIndirectEXT)(
+ VkCommandBuffer commandBuffer,
+ VkBuffer _buffer,
+ VkDeviceSize offset,
+ uint32_t drawCount,
+ uint32_t stride)
+{
+ ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+ ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
+ struct anv_graphics_pipeline *pipeline =
+ anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
+ const struct brw_task_prog_data *task_prog_data = get_task_prog_data(pipeline);
+ const struct brw_mesh_prog_data *mesh_prog_data = get_mesh_prog_data(pipeline);
+ struct anv_cmd_state *cmd_state = &cmd_buffer->state;
+
+ if (anv_batch_has_error(&cmd_buffer->batch))
+ return;
+
+ anv_measure_snapshot(cmd_buffer,
+ INTEL_SNAPSHOT_DRAW,
+ "draw mesh indirect", drawCount);
+
+ trace_intel_begin_draw_mesh_indirect(&cmd_buffer->trace);
+
+ genX(cmd_buffer_flush_gfx_state)(cmd_buffer);
+
+ if (cmd_state->conditional_render_enabled)
+ genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
+
+ bool uses_drawid = (task_prog_data && task_prog_data->uses_drawid) ||
+ mesh_prog_data->uses_drawid;
+ struct mi_builder b;
+ mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
+
+ for (uint32_t i = 0; i < drawCount; i++) {
+ struct anv_address draw = anv_address_add(buffer->address, offset);
+
+ mesh_load_indirect_parameters_3dmesh_3d(cmd_buffer, &b, draw, uses_drawid, i);
+
+ emit_indirect_3dmesh_3d(&cmd_buffer->batch,
+ cmd_state->conditional_render_enabled, uses_drawid);
+
+ offset += stride;
+ }
+
+ trace_intel_end_draw_mesh_indirect(&cmd_buffer->trace, drawCount);
+}
+
+void
+genX(CmdDrawMeshTasksIndirectCountEXT)(
+ VkCommandBuffer commandBuffer,
+ VkBuffer _buffer,
+ VkDeviceSize offset,
+ VkBuffer _countBuffer,
+ VkDeviceSize countBufferOffset,
+ uint32_t maxDrawCount,
+ uint32_t stride)
+{
+ ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+ ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
+ ANV_FROM_HANDLE(anv_buffer, count_buffer, _countBuffer);
+ struct anv_graphics_pipeline *pipeline =
+ anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
+ const struct brw_task_prog_data *task_prog_data = get_task_prog_data(pipeline);
+ const struct brw_mesh_prog_data *mesh_prog_data = get_mesh_prog_data(pipeline);
+
+ if (anv_batch_has_error(&cmd_buffer->batch))
+ return;
+
+ anv_measure_snapshot(cmd_buffer,
+ INTEL_SNAPSHOT_DRAW,
+ "draw mesh indirect count", 0);
+
+ trace_intel_begin_draw_mesh_indirect_count(&cmd_buffer->trace);
+
+ genX(cmd_buffer_flush_gfx_state)(cmd_buffer);
+
+ bool uses_drawid = (task_prog_data && task_prog_data->uses_drawid) ||
+ mesh_prog_data->uses_drawid;
+
+ struct mi_builder b;
+ mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
+ const uint32_t mocs = anv_mocs_for_address(cmd_buffer->device, &count_buffer->address);
+ mi_builder_set_mocs(&b, mocs);
+
+ struct mi_value max =
+ prepare_for_draw_count_predicate(
+ cmd_buffer, &b,
+ anv_address_add(count_buffer->address, countBufferOffset));
+
+ for (uint32_t i = 0; i < maxDrawCount; i++) {
+ struct anv_address draw = anv_address_add(buffer->address, offset);
+
+ emit_draw_count_predicate_cond(cmd_buffer, &b, i, max);
+
+ mesh_load_indirect_parameters_3dmesh_3d(cmd_buffer, &b, draw, uses_drawid, i);
+
+ emit_indirect_3dmesh_3d(&cmd_buffer->batch, true, uses_drawid);
+
+ offset += stride;
+ }
+
+ trace_intel_end_draw_mesh_indirect_count(&cmd_buffer->trace, maxDrawCount);
+}
+
+#endif /* GFX_VERx10 >= 125 */
diff --git a/src/intel/vulkan/genX_cmd_draw_generated_flush.h b/src/intel/vulkan/genX_cmd_draw_generated_flush.h
new file mode 100644
index 00000000000..2240d1e1918
--- /dev/null
+++ b/src/intel/vulkan/genX_cmd_draw_generated_flush.h
@@ -0,0 +1,79 @@
+/*
+ * Copyright © 2024 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef GENX_CMD_DRAW_GENERATED_FLUSH_H
+#define GENX_CMD_DRAW_GENERATED_FLUSH_H
+
+#include <assert.h>
+#include <stdbool.h>
+
+#include "util/macros.h"
+
+#include "common/intel_genX_state_brw.h"
+
+#include "anv_private.h"
+
+static void
+genX(cmd_buffer_flush_generated_draws)(struct anv_cmd_buffer *cmd_buffer)
+{
+ if (!anv_cmd_buffer_is_render_queue(cmd_buffer))
+ return;
+
+ /* No return address setup means we don't have to do anything */
+ if (anv_address_is_null(cmd_buffer->generation.return_addr))
+ return;
+
+ struct anv_batch *batch = &cmd_buffer->generation.batch;
+
+ /* Wait for all the generation vertex shader invocations to finish generating the commands. */
+ genX(emit_apply_pipe_flushes)(batch,
+ cmd_buffer->device,
+ _3D,
+#if GFX_VER == 9
+ ANV_PIPE_VF_CACHE_INVALIDATE_BIT |
+#endif
+ ANV_PIPE_DATA_CACHE_FLUSH_BIT |
+ ANV_PIPE_CS_STALL_BIT,
+ NULL /* emitted_bits */);
+
+#if GFX_VER >= 12
+ anv_batch_emit(batch, GENX(MI_ARB_CHECK), arb) {
+ arb.PreParserDisableMask = true;
+ arb.PreParserDisable = true;
+ }
+#else
+ /* Prior to Gfx12 we cannot disable the CS prefetch, but it doesn't matter
+ * as the prefetch shouldn't follow the MI_BATCH_BUFFER_START.
+ */
+#endif
+
+ /* Return to the main batch. */
+ anv_batch_emit(batch, GENX(MI_BATCH_BUFFER_START), bbs) {
+ bbs.AddressSpaceIndicator = ASI_PPGTT;
+ bbs.BatchBufferStartAddress = cmd_buffer->generation.return_addr;
+ }
+
+ cmd_buffer->generation.return_addr = ANV_NULL_ADDRESS;
+}
+
+#endif /* GENX_CMD_DRAW_GENERATED_FLUSH_H */
diff --git a/src/intel/vulkan/genX_cmd_draw_generated_indirect.h b/src/intel/vulkan/genX_cmd_draw_generated_indirect.h
new file mode 100644
index 00000000000..0db4cffb297
--- /dev/null
+++ b/src/intel/vulkan/genX_cmd_draw_generated_indirect.h
@@ -0,0 +1,656 @@
+/*
+ * Copyright © 2022 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef GENX_CMD_DRAW_GENERATED_INDIRECT_H
+#define GENX_CMD_DRAW_GENERATED_INDIRECT_H
+
+#include <assert.h>
+#include <stdbool.h>
+
+#include "util/macros.h"
+
+#include "common/intel_genX_state_brw.h"
+
+#include "anv_private.h"
+#include "anv_internal_kernels.h"
+
+/* This is the maximum number of items a fragment shader can generate due to
+ * the viewport size.
+ */
+#define MAX_GENERATED_DRAW_COUNT (8192 * 8192)
+
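+/* Maximum number of draw commands kept resident in the generation ring
+ * buffer at once (ring buffer mode only, see
+ * genX(cmd_buffer_emit_indirect_generated_draws_inring)).
+ */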
+#define MAX_RING_BO_ITEMS (8192)
+
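+/* Dispatch the generation shader to write item_count draw commands (starting
+ * at item_base) into generated_cmds_addr, based on the application provided
+ * indirect data. Returns the push constant state so the caller can later
+ * patch fields such as end_addr.
+ */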
+static struct anv_state
+genX(cmd_buffer_emit_generate_draws)(struct anv_cmd_buffer *cmd_buffer,
+ struct anv_simple_shader *simple_state,
+ struct anv_address generated_cmds_addr,
+ uint32_t generated_cmd_stride,
+ struct anv_address indirect_data_addr,
+ uint32_t indirect_data_stride,
+ struct anv_address draw_id_addr,
+ uint32_t item_base,
+ uint32_t item_count,
+ struct anv_address count_addr,
+ uint32_t max_count,
+ bool indexed,
+ uint32_t ring_count)
+{
+ struct anv_device *device = cmd_buffer->device;
+
+ struct anv_state push_data_state =
+ genX(simple_shader_alloc_push)(simple_state,
+ sizeof(struct anv_gen_indirect_params));
+ if (push_data_state.map == NULL)
+ return ANV_STATE_NULL;
+
+ struct anv_graphics_pipeline *pipeline =
+ anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
+ const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
+ const bool use_tbimr = cmd_buffer->state.gfx.dyn_state.use_tbimr;
+
+ struct anv_address draw_count_addr;
+ if (anv_address_is_null(count_addr)) {
+ draw_count_addr = anv_address_add(
+ genX(simple_shader_push_state_address)(simple_state, push_data_state),
+ offsetof(struct anv_gen_indirect_params, draw_count));
+ } else {
+ draw_count_addr = count_addr;
+ }
+
+ struct anv_gen_indirect_params *push_data = push_data_state.map;
+ *push_data = (struct anv_gen_indirect_params) {
+ .draw_id_addr = anv_address_physical(draw_id_addr),
+ .indirect_data_addr = anv_address_physical(indirect_data_addr),
+ .indirect_data_stride = indirect_data_stride,
+ .flags = (use_tbimr ? ANV_GENERATED_FLAG_TBIMR : 0) |
+ (indexed ? ANV_GENERATED_FLAG_INDEXED : 0) |
+ (cmd_buffer->state.conditional_render_enabled ?
+ ANV_GENERATED_FLAG_PREDICATED : 0) |
+ ((vs_prog_data->uses_firstvertex ||
+ vs_prog_data->uses_baseinstance) ?
+ ANV_GENERATED_FLAG_BASE : 0) |
+ (vs_prog_data->uses_drawid ? ANV_GENERATED_FLAG_DRAWID : 0) |
+ (anv_mocs(device, indirect_data_addr.bo,
+ ISL_SURF_USAGE_VERTEX_BUFFER_BIT) << 8) |
+ (!anv_address_is_null(count_addr) ?
+ ANV_GENERATED_FLAG_COUNT : 0) |
+ (ring_count != 0 ? ANV_GENERATED_FLAG_RING_MODE : 0) |
+ ((generated_cmd_stride / 4) << 16),
+ .draw_base = item_base,
+ .max_draw_count = max_count,
+ .ring_count = ring_count,
+ .instance_multiplier = pipeline->instance_multiplier,
+ .draw_count = anv_address_is_null(count_addr) ? max_count : 0,
+ .generated_cmds_addr = anv_address_physical(generated_cmds_addr),
+ .draw_count_addr = anv_address_physical(draw_count_addr),
+ };
+
+ genX(emit_simple_shader_dispatch)(simple_state, item_count, push_data_state);
+
+ return push_data_state;
+}
+
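+/* Jump from the main batch into the generation batch, record the return
+ * address and set up the internal generation shader state. The main batch is
+ * resumed later by genX(cmd_buffer_flush_generated_draws).
+ */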
+static void
+genX(cmd_buffer_emit_indirect_generated_draws_init)(struct anv_cmd_buffer *cmd_buffer)
+{
+ anv_batch_emit_ensure_space(&cmd_buffer->generation.batch, 4);
+
+ trace_intel_begin_generate_draws(&cmd_buffer->trace);
+
+ anv_batch_emit(&cmd_buffer->batch, GENX(MI_BATCH_BUFFER_START), bbs) {
+ bbs.AddressSpaceIndicator = ASI_PPGTT;
+ bbs.BatchBufferStartAddress =
+ anv_batch_current_address(&cmd_buffer->generation.batch);
+ }
+
+ cmd_buffer->generation.return_addr = anv_batch_current_address(&cmd_buffer->batch);
+
+#if GFX_VER >= 12
+ anv_batch_emit(&cmd_buffer->batch, GENX(MI_ARB_CHECK), arb) {
+ arb.PreParserDisableMask = true;
+ arb.PreParserDisable = false;
+ }
+#endif
+
+ trace_intel_end_generate_draws(&cmd_buffer->trace);
+
+ struct anv_shader_bin *gen_kernel;
+ VkResult ret =
+ anv_device_get_internal_shader(
+ cmd_buffer->device,
+ ANV_INTERNAL_KERNEL_GENERATED_DRAWS,
+ &gen_kernel);
+ if (ret != VK_SUCCESS) {
+ anv_batch_set_error(&cmd_buffer->batch, ret);
+ return;
+ }
+
+ struct anv_device *device = cmd_buffer->device;
+ struct anv_simple_shader *state = &cmd_buffer->generation.shader_state;
+ *state = (struct anv_simple_shader) {
+ .device = device,
+ .cmd_buffer = cmd_buffer,
+ .dynamic_state_stream = &cmd_buffer->dynamic_state_stream,
+ .general_state_stream = &cmd_buffer->general_state_stream,
+ .batch = &cmd_buffer->generation.batch,
+ .kernel = gen_kernel,
+ .l3_config = device->internal_kernels_l3_config,
+ .urb_cfg = &cmd_buffer->state.gfx.urb_cfg,
+ };
+
+ genX(emit_simple_shader_init)(state);
+}
+
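+/* On Gfx11+ the draw id is passed through the extended 3DPRIMITIVE
+ * parameters, so no separate buffer is needed. On earlier gens, allocate
+ * dynamic state to hold one 32-bit draw id per draw.
+ */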
+static struct anv_address
+genX(cmd_buffer_get_draw_id_addr)(struct anv_cmd_buffer *cmd_buffer,
+ uint32_t draw_id_count)
+{
+#if GFX_VER >= 11
+ return ANV_NULL_ADDRESS;
+#else
+ struct anv_graphics_pipeline *pipeline =
+ anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
+ const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
+ if (!vs_prog_data->uses_drawid)
+ return ANV_NULL_ADDRESS;
+
+ struct anv_state draw_id_state =
+ anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, 4 * draw_id_count, 4);
+ return anv_state_pool_state_address(&cmd_buffer->device->dynamic_state_pool,
+ draw_id_state);
+#endif
+}
+
+static uint32_t
+genX(cmd_buffer_get_generated_draw_stride)(struct anv_cmd_buffer *cmd_buffer)
+{
+ /* With the extended parameters in 3DPRIMITIVE on Gfx11+ we can emit
+ * everything in a single packet. Prior to Gfx11, we also need to emit a
+ * couple of VERTEX_BUFFER_STATE entries.
+ */
+#if GFX_VER >= 11
+ return 4 * GENX(3DPRIMITIVE_EXTENDED_length);
+#else
+ struct anv_graphics_pipeline *pipeline =
+ anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
+ const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
+
+ uint32_t len = 0;
+
+ if (vs_prog_data->uses_firstvertex ||
+ vs_prog_data->uses_baseinstance ||
+ vs_prog_data->uses_drawid) {
+ len += 4; /* 3DSTATE_VERTEX_BUFFERS */
+
+ if (vs_prog_data->uses_firstvertex ||
+ vs_prog_data->uses_baseinstance)
+ len += 4 * GENX(VERTEX_BUFFER_STATE_length);
+
+ if (vs_prog_data->uses_drawid)
+ len += 4 * GENX(VERTEX_BUFFER_STATE_length);
+ }
+
+ return len + 4 * GENX(3DPRIMITIVE_length);
+#endif
+}
+
+static void
+genX(cmd_buffer_rewrite_forward_end_addr)(struct anv_cmd_buffer *cmd_buffer,
+ struct anv_gen_indirect_params *params)
+{
+ /* We don't know the end_addr until we have emitted all the generation
+ * draws. Go and edit the address of all the push parameters.
+ */
+ uint64_t end_addr =
+ anv_address_physical(anv_batch_current_address(&cmd_buffer->batch));
+ while (params != NULL) {
+ params->end_addr = end_addr;
+ params = params->prev;
+ }
+}
+
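+/* Generate the draw commands directly into the main command buffer. Batch
+ * space for all max_draw_count 3DPRIMITIVEs is reserved up front and filled
+ * by the generation shader.
+ */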
+static void
+genX(cmd_buffer_emit_indirect_generated_draws_inplace)(struct anv_cmd_buffer *cmd_buffer,
+ struct anv_address indirect_data_addr,
+ uint32_t indirect_data_stride,
+ struct anv_address count_addr,
+ uint32_t max_draw_count,
+ bool indexed)
+{
+ const bool start_generation_batch =
+ anv_address_is_null(cmd_buffer->generation.return_addr);
+
+ genX(flush_pipeline_select_3d)(cmd_buffer);
+
+ struct anv_address draw_id_addr =
+ genX(cmd_buffer_get_draw_id_addr)(cmd_buffer, max_draw_count);
+
+#if GFX_VER == 9
+ /* Mark the VB-0 as using the entire dynamic state pool area, but only for
+ * the draw call starting the generation batch. All the following ones will
+ * use the same area.
+ */
+ if (start_generation_batch) {
+ struct anv_device *device = cmd_buffer->device;
+ genX(cmd_buffer_set_binding_for_gfx8_vb_flush)(
+ cmd_buffer, 0,
+ (struct anv_address) {
+ .offset = device->physical->va.dynamic_state_pool.addr,
+ },
+ device->physical->va.dynamic_state_pool.size);
+ }
+
+ struct anv_graphics_pipeline *pipeline =
+ anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
+ const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
+
+ if (vs_prog_data->uses_baseinstance ||
+ vs_prog_data->uses_firstvertex) {
+ /* We're using the indirect buffer directly to source base instance &
+ * first vertex values. Mark the entire area as used.
+ */
+ genX(cmd_buffer_set_binding_for_gfx8_vb_flush)(cmd_buffer, ANV_SVGS_VB_INDEX,
+ indirect_data_addr,
+ indirect_data_stride * max_draw_count);
+ }
+
+ if (vs_prog_data->uses_drawid) {
+ /* Mark the whole draw id buffer as used. */
+ genX(cmd_buffer_set_binding_for_gfx8_vb_flush)(cmd_buffer, ANV_SVGS_VB_INDEX,
+ draw_id_addr,
+ sizeof(uint32_t) * max_draw_count);
+ }
+#endif
+
+ /* Apply the pipeline flush here so the indirect data is available for the
+ * generation shader.
+ */
+ genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
+
+ if (start_generation_batch)
+ genX(cmd_buffer_emit_indirect_generated_draws_init)(cmd_buffer);
+
+ if (cmd_buffer->state.conditional_render_enabled)
+ genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
+
+ /* Emit the 3D state in the main batch. */
+ genX(cmd_buffer_flush_gfx_state)(cmd_buffer);
+
+ const uint32_t draw_cmd_stride =
+ genX(cmd_buffer_get_generated_draw_stride)(cmd_buffer);
+
+ struct anv_gen_indirect_params *last_params = NULL;
+ uint32_t item_base = 0;
+ while (item_base < max_draw_count) {
+ const uint32_t item_count = MIN2(max_draw_count - item_base,
+ MAX_GENERATED_DRAW_COUNT);
+ const uint32_t draw_cmd_size = item_count * draw_cmd_stride;
+
+ /* Ensure we have enough contiguous space for all the draws so that the
+ * compute shader can edit all the 3DPRIMITIVEs from a single base
+ * address.
+ *
+ * TODO: we might have to split this if the amount of space required is too
+ * large (at 1MB?).
+ */
+ VkResult result = anv_batch_emit_ensure_space(&cmd_buffer->batch,
+ draw_cmd_size);
+ if (result != VK_SUCCESS)
+ return;
+
+ struct anv_state params_state =
+ genX(cmd_buffer_emit_generate_draws)(
+ cmd_buffer,
+ &cmd_buffer->generation.shader_state,
+ anv_batch_current_address(&cmd_buffer->batch),
+ draw_cmd_stride,
+ indirect_data_addr,
+ indirect_data_stride,
+ anv_address_add(draw_id_addr, 4 * item_base),
+ item_base,
+ item_count,
+ count_addr,
+ max_draw_count,
+ indexed,
+ 0 /* ring_count */);
+ struct anv_gen_indirect_params *params = params_state.map;
+ if (params == NULL)
+ return;
+
+ anv_batch_advance(&cmd_buffer->batch, draw_cmd_size);
+
+ item_base += item_count;
+
+ params->prev = last_params;
+ last_params = params;
+ }
+
+ genX(cmd_buffer_rewrite_forward_end_addr)(cmd_buffer, last_params);
+
+#if GFX_VER == 9
+ update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, indexed ? RANDOM : SEQUENTIAL);
+#endif
+}
+
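+/* Generate the draw commands into a fixed size ring buffer BO. The main
+ * batch jumps into the ring, and when more than MAX_RING_BO_ITEMS draws are
+ * needed, the ring jumps back so another set of commands can be generated.
+ */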
+static void
+genX(cmd_buffer_emit_indirect_generated_draws_inring)(struct anv_cmd_buffer *cmd_buffer,
+ struct anv_address indirect_data_addr,
+ uint32_t indirect_data_stride,
+ struct anv_address count_addr,
+ uint32_t max_draw_count,
+ bool indexed)
+{
+ struct anv_device *device = cmd_buffer->device;
+
+ genX(flush_pipeline_select_3d)(cmd_buffer);
+
+ const uint32_t draw_cmd_stride =
+ genX(cmd_buffer_get_generated_draw_stride)(cmd_buffer);
+
+ if (cmd_buffer->generation.ring_bo == NULL) {
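+ /* Size the ring buffer for MAX_RING_BO_ITEMS draw commands plus the
+ * prefetch resume MI_ARB_CHECK (Gfx12+), the draw id slots (Gfx9) and
+ * the trailing MI_BATCH_BUFFER_START (see the layout description below).
+ */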
+ const uint32_t bo_size = align(
+#if GFX_VER >= 12
+ GENX(MI_ARB_CHECK_length) * 4 +
+#endif
+ draw_cmd_stride * MAX_RING_BO_ITEMS +
+#if GFX_VER == 9
+ 4 * MAX_RING_BO_ITEMS +
+#endif
+ GENX(MI_BATCH_BUFFER_START_length) * 4,
+ 4096);
+ VkResult result = anv_bo_pool_alloc(&device->batch_bo_pool, bo_size,
+ &cmd_buffer->generation.ring_bo);
+ if (result != VK_SUCCESS) {
+ anv_batch_set_error(&cmd_buffer->batch, result);
+ return;
+ }
+ }
+
+ /* How many items will be generated by each iteration of the generation
+ * shader dispatch.
+ */
+ const uint32_t ring_count = MIN2(MAX_RING_BO_ITEMS, max_draw_count);
+
+ /* The ring bo has the following layout:
+ *
+ * --------------------------------------------------
+ * | MI_ARB_CHECK to resume CS prefetch (Gfx12+) |
+ * |------------------------------------------------|
+ * | ring_count * 3DPRIMITIVE |
+ * |------------------------------------------------|
+ * | jump instruction (either back to generate more |
+ * | commands or to the next set of commands) |
+ * |------------------------------------------------|
+ * | draw ids (only used on Gfx9) |
+ * --------------------------------------------------
+ */
+
+ struct anv_address draw_id_addr = (struct anv_address) {
+ .bo = cmd_buffer->generation.ring_bo,
+ .offset = ring_count * draw_cmd_stride +
+ GENX(MI_BATCH_BUFFER_START_length) * 4,
+ };
+
+ struct anv_address draw_cmds_addr = (struct anv_address) {
+ .bo = cmd_buffer->generation.ring_bo,
+#if GFX_VER >= 12
+ .offset = GENX(MI_ARB_CHECK_length) * 4,
+#endif
+ };
+
+#if GFX_VER >= 12
+ struct GENX(MI_ARB_CHECK) resume_prefetch = {
+ .PreParserDisableMask = true,
+ .PreParserDisable = false,
+ };
+ GENX(MI_ARB_CHECK_pack)(NULL, cmd_buffer->generation.ring_bo->map,
+ &resume_prefetch);
+#endif
+
+#if GFX_VER == 9
+ /* Mark the VB-0 as using the entire ring_bo, but only for the draw call
+ * starting the generation batch. All the following ones will use the same
+ * area.
+ */
+ genX(cmd_buffer_set_binding_for_gfx8_vb_flush)(
+ cmd_buffer, 0,
+ (struct anv_address) {
+ .bo = cmd_buffer->generation.ring_bo,
+ },
+ cmd_buffer->generation.ring_bo->size);
+
+ struct anv_graphics_pipeline *pipeline =
+ anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
+ const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
+
+ if (vs_prog_data->uses_baseinstance ||
+ vs_prog_data->uses_firstvertex) {
+ /* We're using the indirect buffer directly to source base instance &
+ * first vertex values. Mark the entire area as used.
+ */
+ genX(cmd_buffer_set_binding_for_gfx8_vb_flush)(cmd_buffer, ANV_SVGS_VB_INDEX,
+ indirect_data_addr,
+ indirect_data_stride * max_draw_count);
+ }
+
+ if (vs_prog_data->uses_drawid) {
+ /* Mark the whole draw id buffer as used. */
+ genX(cmd_buffer_set_binding_for_gfx8_vb_flush)(cmd_buffer, ANV_SVGS_VB_INDEX,
+ draw_id_addr,
+ sizeof(uint32_t) * max_draw_count);
+ }
+#endif
+
+ /* Apply the pipeline flush here so the indirect data is available for the
+ * generation shader.
+ */
+ genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
+
+ trace_intel_begin_generate_draws(&cmd_buffer->trace);
+
+ /***
+ * This is the location the commands emitted below jump back to when more
+ * draws need to be generated.
+ */
+ struct anv_address gen_addr = anv_batch_current_address(&cmd_buffer->batch);
+
+ struct anv_shader_bin *gen_kernel;
+ VkResult ret =
+ anv_device_get_internal_shader(
+ cmd_buffer->device,
+ ANV_INTERNAL_KERNEL_GENERATED_DRAWS,
+ &gen_kernel);
+ if (ret != VK_SUCCESS) {
+ anv_batch_set_error(&cmd_buffer->batch, ret);
+ return;
+ }
+
+ struct anv_simple_shader simple_state = (struct anv_simple_shader) {
+ .device = device,
+ .cmd_buffer = cmd_buffer,
+ .dynamic_state_stream = &cmd_buffer->dynamic_state_stream,
+ .general_state_stream = &cmd_buffer->general_state_stream,
+ .batch = &cmd_buffer->batch,
+ .kernel = gen_kernel,
+ .l3_config = device->internal_kernels_l3_config,
+ .urb_cfg = &cmd_buffer->state.gfx.urb_cfg,
+ };
+ genX(emit_simple_shader_init)(&simple_state);
+
+ struct anv_state params_state =
+ genX(cmd_buffer_emit_generate_draws)(
+ cmd_buffer,
+ &simple_state,
+ draw_cmds_addr,
+ draw_cmd_stride,
+ indirect_data_addr,
+ indirect_data_stride,
+ draw_id_addr,
+ 0 /* item_base */,
+ MIN2(MAX_RING_BO_ITEMS, max_draw_count) /* item_count */,
+ count_addr,
+ max_draw_count,
+ indexed,
+ ring_count);
+ struct anv_gen_indirect_params *params = params_state.map;
+
+ anv_add_pending_pipe_bits(cmd_buffer,
+#if GFX_VER == 9
+ ANV_PIPE_VF_CACHE_INVALIDATE_BIT |
+#endif
+ ANV_PIPE_DATA_CACHE_FLUSH_BIT |
+ ANV_PIPE_CS_STALL_BIT,
+ "after generation flush");
+
+ trace_intel_end_generate_draws(&cmd_buffer->trace);
+
+ if (cmd_buffer->state.conditional_render_enabled)
+ genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
+
+ /* Emit the 3D state in the main batch. */
+ genX(cmd_buffer_flush_gfx_state)(cmd_buffer);
+
+ if (max_draw_count > 0) {
+#if GFX_VER >= 12
+ /* Disable the CS prefetch before jumping into the ring buffer. Prior to
+ * Gfx12 we cannot disable it, but it doesn't matter as the prefetch
+ * shouldn't follow the MI_BATCH_BUFFER_START anyway.
+ */
+ anv_batch_emit(&cmd_buffer->batch, GENX(MI_ARB_CHECK), arb) {
+ arb.PreParserDisableMask = true;
+ arb.PreParserDisable = true;
+ }
+#endif
+
+ /* Jump into the ring buffer. */
+ anv_batch_emit(&cmd_buffer->batch, GENX(MI_BATCH_BUFFER_START), bbs) {
+ bbs.AddressSpaceIndicator = ASI_PPGTT;
+ bbs.BatchBufferStartAddress = (struct anv_address) {
+ .bo = cmd_buffer->generation.ring_bo,
+ };
+ }
+
+ /***
+ * This is the location the ring buffer jumps back to when it needs to
+ * generate more draw calls. We do the following:
+ * - wait for the draws in the ring buffer to complete (CS stall) so we're
+ * sure the push constant data we're about to edit is no longer read
+ * - increment the base draw number by the number of draws executed in
+ * the ring
+ * - invalidate the constant cache since
+ * anv_gen_indirect_params::draw_base was updated
+ * - jump back to the generation shader
+ */
+ struct anv_address inc_addr =
+ anv_batch_current_address(&cmd_buffer->batch);
+
+ anv_add_pending_pipe_bits(cmd_buffer,
+ ANV_PIPE_STALL_AT_SCOREBOARD_BIT |
+ ANV_PIPE_CS_STALL_BIT,
+ "after generated draws batch");
+ genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
+
+ struct mi_builder b;
+ mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
+
+ struct anv_address draw_base_addr = anv_address_add(
+ genX(simple_shader_push_state_address)(
+ &simple_state, params_state),
+ offsetof(struct anv_gen_indirect_params, draw_base));
+
+ const uint32_t mocs = anv_mocs_for_address(cmd_buffer->device,
+ &draw_base_addr);
+ mi_builder_set_mocs(&b, mocs);
+
+ mi_store(&b, mi_mem32(draw_base_addr),
+ mi_iadd(&b, mi_mem32(draw_base_addr),
+ mi_imm(ring_count)));
+
+ anv_add_pending_pipe_bits(cmd_buffer,
+ ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT,
+ "after generated draws batch increment");
+ genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
+
+ anv_batch_emit(&cmd_buffer->batch, GENX(MI_BATCH_BUFFER_START), bbs) {
+ bbs.AddressSpaceIndicator = ASI_PPGTT;
+ bbs.BatchBufferStartAddress = gen_addr;
+ }
+
+ /***
+ * This is the location the ring buffer jumps to once all the draw calls
+ * have executed.
+ */
+ struct anv_address end_addr = anv_batch_current_address(&cmd_buffer->batch);
+
+ /* Reset the draw_base field in case we ever replay the command buffer. */
+ mi_store(&b, mi_mem32(draw_base_addr), mi_imm(0));
+
+ anv_add_pending_pipe_bits(cmd_buffer,
+ ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT,
+ "after generated draws end");
+
+ params->gen_addr = anv_address_physical(inc_addr);
+ params->end_addr = anv_address_physical(end_addr);
+ }
+}
+
+static void
+genX(cmd_buffer_emit_indirect_generated_draws)(struct anv_cmd_buffer *cmd_buffer,
+ struct anv_address indirect_data_addr,
+ uint32_t indirect_data_stride,
+ struct anv_address count_addr,
+ uint32_t max_draw_count,
+ bool indexed)
+{
+ /* In order to have the vertex fetch gather the data we need a non-zero
+ * stride. The application is allowed to pass a 0 stride when draw_count
+ * is 1, but we need a correct value for
+ * VERTEX_BUFFER_STATE::BufferPitch, so the caller must have set it
+ * correctly:
+ *
+ * Vulkan spec, vkCmdDrawIndirect:
+ *
+ * "If drawCount is less than or equal to one, stride is ignored."
+ */
+ assert(indirect_data_stride > 0);
+
+ const bool use_ring_buffer = max_draw_count >=
+ cmd_buffer->device->physical->instance->generated_indirect_ring_threshold;
+ if (use_ring_buffer) {
+ genX(cmd_buffer_emit_indirect_generated_draws_inring)(cmd_buffer,
+ indirect_data_addr,
+ indirect_data_stride,
+ count_addr,
+ max_draw_count,
+ indexed);
+ } else {
+ genX(cmd_buffer_emit_indirect_generated_draws_inplace)(cmd_buffer,
+ indirect_data_addr,
+ indirect_data_stride,
+ count_addr,
+ max_draw_count,
+ indexed);
+ }
+}
+
+#endif /* GENX_CMD_DRAW_GENERATED_INDIRECT_H */
diff --git a/src/intel/vulkan/genX_cmd_draw_helpers.h b/src/intel/vulkan/genX_cmd_draw_helpers.h
new file mode 100644
index 00000000000..2c370909ef1
--- /dev/null
+++ b/src/intel/vulkan/genX_cmd_draw_helpers.h
@@ -0,0 +1,153 @@
+/*
+ * Copyright © 2022 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef GENX_CMD_DRAW_HELPERS_H
+#define GENX_CMD_DRAW_HELPERS_H
+
+#include <assert.h>
+#include <stdbool.h>
+
+#include "anv_private.h"
+
+#if GFX_VER < 11
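+/* Bind a buffer at vertex buffer slot index with a zero pitch. This is how
+ * per-draw values (base vertex/instance, draw id) are sourced as vertex
+ * attributes on gens without extended 3DPRIMITIVE parameters.
+ */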
+static void
+emit_vertex_bo(struct anv_cmd_buffer *cmd_buffer,
+ struct anv_address addr,
+ uint32_t size, uint32_t index)
+{
+ uint32_t *p = anv_batch_emitn(&cmd_buffer->batch, 5,
+ GENX(3DSTATE_VERTEX_BUFFERS));
+
+ GENX(VERTEX_BUFFER_STATE_pack)(&cmd_buffer->batch, p + 1,
+ &(struct GENX(VERTEX_BUFFER_STATE)) {
+ .VertexBufferIndex = index,
+ .AddressModifyEnable = true,
+ .BufferPitch = 0,
+ .MOCS = anv_mocs(cmd_buffer->device, addr.bo,
+ ISL_SURF_USAGE_VERTEX_BUFFER_BIT),
+ .NullVertexBuffer = size == 0,
+ .BufferStartingAddress = addr,
+ .BufferSize = size
+ });
+
+#if GFX_VER == 9
+ genX(cmd_buffer_set_binding_for_gfx8_vb_flush)(cmd_buffer,
+ index, addr, size);
+#endif
+}
+
+static void
+emit_base_vertex_instance_bo(struct anv_cmd_buffer *cmd_buffer,
+ struct anv_address addr)
+{
+ emit_vertex_bo(cmd_buffer, addr, addr.bo ? 8 : 0, ANV_SVGS_VB_INDEX);
+}
+
+static void
+emit_base_vertex_instance(struct anv_cmd_buffer *cmd_buffer,
+ uint32_t base_vertex, uint32_t base_instance)
+{
+ if (base_vertex == 0 && base_instance == 0) {
+ emit_base_vertex_instance_bo(cmd_buffer, ANV_NULL_ADDRESS);
+ return;
+ }
+
+ struct anv_state id_state =
+ anv_cmd_buffer_alloc_temporary_state(cmd_buffer, 8, 4);
+
+ ((uint32_t *)id_state.map)[0] = base_vertex;
+ ((uint32_t *)id_state.map)[1] = base_instance;
+
+ struct anv_address addr =
+ anv_cmd_buffer_temporary_state_address(cmd_buffer, id_state);
+
+ emit_base_vertex_instance_bo(cmd_buffer, addr);
+}
+
+static void
+emit_draw_index(struct anv_cmd_buffer *cmd_buffer, uint32_t draw_index)
+{
+ struct anv_state state =
+ anv_cmd_buffer_alloc_temporary_state(cmd_buffer, 4, 4);
+
+ ((uint32_t *)state.map)[0] = draw_index;
+
+ struct anv_address addr =
+ anv_cmd_buffer_temporary_state_address(cmd_buffer, state);
+
+ emit_vertex_bo(cmd_buffer, addr, 4, ANV_DRAWID_VB_INDEX);
+}
+#endif /* GFX_VER < 11 */
+
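+/* On Gfx9, record which vertex buffer bindings the draw used (including the
+ * internal SVGS & draw id bindings) so the VF cache flush workaround can
+ * track them.
+ */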
+static void
+update_dirty_vbs_for_gfx8_vb_flush(struct anv_cmd_buffer *cmd_buffer,
+ uint32_t access_type)
+{
+#if GFX_VER == 9
+ const struct vk_dynamic_graphics_state *dyn =
+ &cmd_buffer->vk.dynamic_graphics_state;
+ struct anv_graphics_pipeline *pipeline =
+ anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
+ const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
+
+ uint64_t vb_used = dyn->vi->bindings_valid;
+ if (vs_prog_data->uses_firstvertex ||
+ vs_prog_data->uses_baseinstance)
+ vb_used |= 1ull << ANV_SVGS_VB_INDEX;
+ if (vs_prog_data->uses_drawid)
+ vb_used |= 1ull << ANV_DRAWID_VB_INDEX;
+
+ genX(cmd_buffer_update_dirty_vbs_for_gfx8_vb_flush)(cmd_buffer,
+ access_type,
+ vb_used);
+#endif
+}
+
+#if GFX_VER < 11
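+/* Upload the base vertex/instance and draw id vertex buffers required by the
+ * current vertex shader and apply any pending pipe flushes before the draw.
+ */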
+ALWAYS_INLINE static void
+cmd_buffer_emit_vertex_constants_and_flush(struct anv_cmd_buffer *cmd_buffer,
+ const struct brw_vs_prog_data *vs_prog_data,
+ uint32_t base_vertex,
+ uint32_t base_instance,
+ uint32_t draw_id,
+ bool force_flush)
+{
+ bool emitted = false;
+ if (vs_prog_data->uses_firstvertex ||
+ vs_prog_data->uses_baseinstance) {
+ emit_base_vertex_instance(cmd_buffer, base_vertex, base_instance);
+ emitted = true;
+ }
+ if (vs_prog_data->uses_drawid) {
+ emit_draw_index(cmd_buffer, draw_id);
+ emitted = true;
+ }
+ /* Emitting draw index or vertex index BOs may result in needing
+ * additional VF cache flushes.
+ */
+ if (emitted || force_flush)
+ genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
+}
+#endif
+
+#endif /* GENX_CMD_DRAW_HELPERS_H */
diff --git a/src/intel/vulkan/genX_cmd_video.c b/src/intel/vulkan/genX_cmd_video.c
new file mode 100644
index 00000000000..e7e94f16f25
--- /dev/null
+++ b/src/intel/vulkan/genX_cmd_video.c
@@ -0,0 +1,1195 @@
+/*
+ * Copyright © 2021 Red Hat
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "anv_private.h"
+
+#include "genxml/gen_macros.h"
+#include "genxml/genX_pack.h"
+
+#include "util/vl_zscan_data.h"
+
+void
+genX(CmdBeginVideoCodingKHR)(VkCommandBuffer commandBuffer,
+ const VkVideoBeginCodingInfoKHR *pBeginInfo)
+{
+ ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+ ANV_FROM_HANDLE(anv_video_session, vid, pBeginInfo->videoSession);
+ ANV_FROM_HANDLE(anv_video_session_params, params, pBeginInfo->videoSessionParameters);
+
+ cmd_buffer->video.vid = vid;
+ cmd_buffer->video.params = params;
+}
+
+void
+genX(CmdControlVideoCodingKHR)(VkCommandBuffer commandBuffer,
+ const VkVideoCodingControlInfoKHR *pCodingControlInfo)
+{
+ ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+
+ if (pCodingControlInfo->flags & VK_VIDEO_CODING_CONTROL_RESET_BIT_KHR) {
+ anv_batch_emit(&cmd_buffer->batch, GENX(MI_FLUSH_DW), flush) {
+ flush.VideoPipelineCacheInvalidate = 1;
+ }
+ }
+}
+
+void
+genX(CmdEndVideoCodingKHR)(VkCommandBuffer commandBuffer,
+ const VkVideoEndCodingInfoKHR *pEndCodingInfo)
+{
+ ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+
+ cmd_buffer->video.vid = NULL;
+ cmd_buffer->video.params = NULL;
+}
+
+/*
+ * The default scan order of scaling lists is up-right-diagonal
+ * according to the spec, but the device requires raster order,
+ * so convert the passed scaling lists here.
+ */
+static void
+anv_h265_matrix_from_uprightdiagonal(StdVideoH265ScalingLists *out_sl,
+ const StdVideoH265ScalingLists *sl)
+{
+ uint8_t i, j;
+
+ for (i = 0; i < 6; i++) {
+ for (j = 0; j < STD_VIDEO_H265_SCALING_LIST_4X4_NUM_ELEMENTS; j++)
+ out_sl->ScalingList4x4[i][vl_zscan_h265_up_right_diagonal_16[j]] =
+ sl->ScalingList4x4[i][j];
+
+ for (j = 0; j < STD_VIDEO_H265_SCALING_LIST_8X8_NUM_ELEMENTS; j++)
+ out_sl->ScalingList8x8[i][vl_zscan_h265_up_right_diagonal[j]] =
+ sl->ScalingList8x8[i][j];
+
+ for (j = 0; j < STD_VIDEO_H265_SCALING_LIST_16X16_NUM_ELEMENTS; j++)
+ out_sl->ScalingList16x16[i][vl_zscan_h265_up_right_diagonal[j]] =
+ sl->ScalingList16x16[i][j];
+ }
+
+ for (i = 0; i < STD_VIDEO_H265_SCALING_LIST_32X32_NUM_LISTS; i++) {
+ for (j = 0; j < STD_VIDEO_H265_SCALING_LIST_32X32_NUM_ELEMENTS; j++)
+ out_sl->ScalingList32x32[i][vl_zscan_h265_up_right_diagonal[j]] =
+ sl->ScalingList32x32[i][j];
+ }
+}
+
+static void
+scaling_list(struct anv_cmd_buffer *cmd_buffer,
+ const StdVideoH265ScalingLists *scaling_list)
+{
+ StdVideoH265ScalingLists out_sl = {0, };
+
+ anv_h265_matrix_from_uprightdiagonal(&out_sl, scaling_list);
+
+ /* 4x4, 8x8, 16x16, 32x32 */
+ for (uint8_t size = 0; size < 4; size++) {
+ /* Intra, Inter */
+ for (uint8_t pred = 0; pred < 2; pred++) {
+ /* Y, Cb, Cr */
+ for (uint8_t color = 0; color < 3; color++) {
+ if (size == 3 && color > 0)
+ continue;
+
+ anv_batch_emit(&cmd_buffer->batch, GENX(HCP_QM_STATE), qm) {
+ qm.SizeID = size;
+ qm.PredictionType = pred;
+ qm.ColorComponent = color;
+
+ qm.DCCoefficient = size > 1 ?
+ (size == 2 ? scaling_list->ScalingListDCCoef16x16[3 * pred + color] :
+ scaling_list->ScalingListDCCoef32x32[pred]) : 0;
+
+ if (size == 0) {
+ for (uint8_t i = 0; i < 4; i++)
+ for (uint8_t j = 0; j < 4; j++)
+ qm.QuantizerMatrix8x8[4 * i + j] =
+ out_sl.ScalingList4x4[3 * pred + color][4 * i + j];
+ } else if (size == 1) {
+ for (uint8_t i = 0; i < 8; i++)
+ for (uint8_t j = 0; j < 8; j++)
+ qm.QuantizerMatrix8x8[8 * i + j] =
+ out_sl.ScalingList8x8[3 * pred + color][8 * i + j];
+ } else if (size == 2) {
+ for (uint8_t i = 0; i < 8; i++)
+ for (uint8_t j = 0; j < 8; j++)
+ qm.QuantizerMatrix8x8[8 * i + j] =
+ out_sl.ScalingList16x16[3 * pred + color][8 * i + j];
+ } else if (size == 3) {
+ for (uint8_t i = 0; i < 8; i++)
+ for (uint8_t j = 0; j < 8; j++)
+ qm.QuantizerMatrix8x8[8 * i + j] =
+ out_sl.ScalingList32x32[pred][8 * i + j];
+ }
+ }
+ }
+ }
+ }
+}
+
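+/* Emit the HCP commands to decode a single H.265 frame: pipe mode & surface
+ * setup, buffer addresses, scaling lists, picture state, tile state and
+ * per-slice state.
+ */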
+static void
+anv_h265_decode_video(struct anv_cmd_buffer *cmd_buffer,
+ const VkVideoDecodeInfoKHR *frame_info)
+{
+ ANV_FROM_HANDLE(anv_buffer, src_buffer, frame_info->srcBuffer);
+ struct anv_video_session *vid = cmd_buffer->video.vid;
+ struct anv_video_session_params *params = cmd_buffer->video.params;
+
+ const struct VkVideoDecodeH265PictureInfoKHR *h265_pic_info =
+ vk_find_struct_const(frame_info->pNext, VIDEO_DECODE_H265_PICTURE_INFO_KHR);
+
+ const StdVideoH265SequenceParameterSet *sps =
+ vk_video_find_h265_dec_std_sps(&params->vk, h265_pic_info->pStdPictureInfo->pps_seq_parameter_set_id);
+ const StdVideoH265PictureParameterSet *pps =
+ vk_video_find_h265_dec_std_pps(&params->vk, h265_pic_info->pStdPictureInfo->pps_pic_parameter_set_id);
+
+ struct vk_video_h265_reference ref_slots[2][8] = { 0 };
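+ /* dpb_idx maps a reference slot index to its position in the reference
+ * picture list programmed in HCP_PIPE_BUF_ADDR_STATE below.
+ */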
+ uint8_t dpb_idx[ANV_VIDEO_H265_MAX_NUM_REF_FRAME] = { 0,};
+ bool is_10bit = sps->bit_depth_chroma_minus8 || sps->bit_depth_luma_minus8;
+
+ anv_batch_emit(&cmd_buffer->batch, GENX(MI_FLUSH_DW), flush) {
+ flush.VideoPipelineCacheInvalidate = 1;
+ };
+
+#if GFX_VER >= 12
+ anv_batch_emit(&cmd_buffer->batch, GENX(MI_FORCE_WAKEUP), wake) {
+ wake.HEVCPowerWellControl = 1;
+ wake.MaskBits = 768;
+ }
+
+ anv_batch_emit(&cmd_buffer->batch, GENX(VD_CONTROL_STATE), cs) {
+ cs.PipelineInitialization = true;
+ }
+
+ anv_batch_emit(&cmd_buffer->batch, GENX(MFX_WAIT), mfx) {
+ mfx.MFXSyncControlFlag = 1;
+ }
+#endif
+
+ anv_batch_emit(&cmd_buffer->batch, GENX(HCP_PIPE_MODE_SELECT), sel) {
+ sel.CodecSelect = Decode;
+ sel.CodecStandardSelect = HEVC;
+ }
+
+#if GFX_VER >= 12
+ anv_batch_emit(&cmd_buffer->batch, GENX(MFX_WAIT), mfx) {
+ mfx.MFXSyncControlFlag = 1;
+ }
+#endif
+
+ const struct anv_image_view *iv =
+ anv_image_view_from_handle(frame_info->dstPictureResource.imageViewBinding);
+ const struct anv_image *img = iv->image;
+
+ anv_batch_emit(&cmd_buffer->batch, GENX(HCP_SURFACE_STATE), ss) {
+ ss.SurfacePitch = img->planes[0].primary_surface.isl.row_pitch_B - 1;
+ ss.SurfaceID = HCP_CurrentDecodedPicture;
+ ss.SurfaceFormat = is_10bit ? P010 : PLANAR_420_8;
+
+ ss.YOffsetforUCb = img->planes[1].primary_surface.memory_range.offset /
+ img->planes[0].primary_surface.isl.row_pitch_B;
+
+#if GFX_VER >= 11
+ ss.DefaultAlphaValue = 0xffff;
+#endif
+ }
+
+#if GFX_VER >= 12
+ /* On Gfx12, the reference surface seemingly needs the same state as the decode surface. */
+ anv_batch_emit(&cmd_buffer->batch, GENX(HCP_SURFACE_STATE), ss) {
+ ss.SurfacePitch = img->planes[0].primary_surface.isl.row_pitch_B - 1;
+ ss.SurfaceID = HCP_ReferencePicture;
+ ss.SurfaceFormat = is_10bit ? P010 : PLANAR_420_8;
+
+ ss.YOffsetforUCb = img->planes[1].primary_surface.memory_range.offset /
+ img->planes[0].primary_surface.isl.row_pitch_B;
+
+ ss.DefaultAlphaValue = 0xffff;
+ }
+#endif
+
+ anv_batch_emit(&cmd_buffer->batch, GENX(HCP_PIPE_BUF_ADDR_STATE), buf) {
+ buf.DecodedPictureAddress =
+ anv_image_address(img, &img->planes[0].primary_surface.memory_range);
+
+ buf.DecodedPictureMemoryAddressAttributes = (struct GENX(MEMORYADDRESSATTRIBUTES)) {
+ .MOCS = anv_mocs(cmd_buffer->device, buf.DecodedPictureAddress.bo, 0),
+ };
+
+ buf.DeblockingFilterLineBufferAddress = (struct anv_address) {
+ vid->vid_mem[ANV_VID_MEM_H265_DEBLOCK_FILTER_ROW_STORE_LINE].mem->bo,
+ vid->vid_mem[ANV_VID_MEM_H265_DEBLOCK_FILTER_ROW_STORE_LINE].offset
+ };
+
+ buf.DeblockingFilterLineBufferMemoryAddressAttributes = (struct GENX(MEMORYADDRESSATTRIBUTES)) {
+ .MOCS = anv_mocs(cmd_buffer->device, buf.DeblockingFilterLineBufferAddress.bo, 0),
+ };
+
+ buf.DeblockingFilterTileLineBufferAddress = (struct anv_address) {
+ vid->vid_mem[ANV_VID_MEM_H265_DEBLOCK_FILTER_ROW_STORE_TILE_LINE].mem->bo,
+ vid->vid_mem[ANV_VID_MEM_H265_DEBLOCK_FILTER_ROW_STORE_TILE_LINE].offset
+ };
+
+ buf.DeblockingFilterTileLineBufferMemoryAddressAttributes = (struct GENX(MEMORYADDRESSATTRIBUTES)) {
+ .MOCS = anv_mocs(cmd_buffer->device, buf.DeblockingFilterTileLineBufferAddress.bo, 0),
+ };
+
+ buf.DeblockingFilterTileColumnBufferAddress = (struct anv_address) {
+ vid->vid_mem[ANV_VID_MEM_H265_DEBLOCK_FILTER_ROW_STORE_TILE_COLUMN].mem->bo,
+ vid->vid_mem[ANV_VID_MEM_H265_DEBLOCK_FILTER_ROW_STORE_TILE_COLUMN].offset
+ };
+
+ buf.DeblockingFilterTileColumnBufferMemoryAddressAttributes = (struct GENX(MEMORYADDRESSATTRIBUTES)) {
+ .MOCS = anv_mocs(cmd_buffer->device, buf.DeblockingFilterTileColumnBufferAddress.bo, 0),
+ };
+
+ buf.MetadataLineBufferAddress = (struct anv_address) {
+ vid->vid_mem[ANV_VID_MEM_H265_METADATA_LINE].mem->bo,
+ vid->vid_mem[ANV_VID_MEM_H265_METADATA_LINE].offset
+ };
+
+ buf.MetadataLineBufferMemoryAddressAttributes = (struct GENX(MEMORYADDRESSATTRIBUTES)) {
+ .MOCS = anv_mocs(cmd_buffer->device, buf.MetadataLineBufferAddress.bo, 0),
+ };
+
+ buf.MetadataTileLineBufferAddress = (struct anv_address) {
+ vid->vid_mem[ANV_VID_MEM_H265_METADATA_TILE_LINE].mem->bo,
+ vid->vid_mem[ANV_VID_MEM_H265_METADATA_TILE_LINE].offset
+ };
+
+ buf.MetadataTileLineBufferMemoryAddressAttributes = (struct GENX(MEMORYADDRESSATTRIBUTES)) {
+ .MOCS = anv_mocs(cmd_buffer->device, buf.MetadataTileLineBufferAddress.bo, 0),
+ };
+
+ buf.MetadataTileColumnBufferAddress = (struct anv_address) {
+ vid->vid_mem[ANV_VID_MEM_H265_METADATA_TILE_COLUMN].mem->bo,
+ vid->vid_mem[ANV_VID_MEM_H265_METADATA_TILE_COLUMN].offset
+ };
+
+ buf.MetadataTileColumnBufferMemoryAddressAttributes = (struct GENX(MEMORYADDRESSATTRIBUTES)) {
+ .MOCS = anv_mocs(cmd_buffer->device, buf.MetadataTileColumnBufferAddress.bo, 0),
+ };
+
+ buf.SAOLineBufferAddress = (struct anv_address) {
+ vid->vid_mem[ANV_VID_MEM_H265_SAO_LINE].mem->bo,
+ vid->vid_mem[ANV_VID_MEM_H265_SAO_LINE].offset
+ };
+
+ buf.SAOLineBufferMemoryAddressAttributes = (struct GENX(MEMORYADDRESSATTRIBUTES)) {
+ .MOCS = anv_mocs(cmd_buffer->device, buf.SAOLineBufferAddress.bo, 0),
+ };
+
+ buf.SAOTileLineBufferAddress = (struct anv_address) {
+ vid->vid_mem[ANV_VID_MEM_H265_SAO_TILE_LINE].mem->bo,
+ vid->vid_mem[ANV_VID_MEM_H265_SAO_TILE_LINE].offset
+ };
+
+ buf.SAOTileLineBufferMemoryAddressAttributes = (struct GENX(MEMORYADDRESSATTRIBUTES)) {
+ .MOCS = anv_mocs(cmd_buffer->device, buf.SAOTileLineBufferAddress.bo, 0),
+ };
+
+ buf.SAOTileColumnBufferAddress = (struct anv_address) {
+ vid->vid_mem[ANV_VID_MEM_H265_SAO_TILE_COLUMN].mem->bo,
+ vid->vid_mem[ANV_VID_MEM_H265_SAO_TILE_COLUMN].offset
+ };
+
+ buf.SAOTileColumnBufferMemoryAddressAttributes = (struct GENX(MEMORYADDRESSATTRIBUTES)) {
+ .MOCS = anv_mocs(cmd_buffer->device, buf.SAOTileColumnBufferAddress.bo, 0),
+ };
+
+ buf.CurrentMVTemporalBufferAddress = anv_image_address(img, &img->vid_dmv_top_surface);
+
+ buf.CurrentMVTemporalBufferMemoryAddressAttributes = (struct GENX(MEMORYADDRESSATTRIBUTES)) {
+ .MOCS = anv_mocs(cmd_buffer->device, buf.CurrentMVTemporalBufferAddress.bo, 0),
+ };
+
+ for (unsigned i = 0; i < frame_info->referenceSlotCount; i++) {
+ const struct anv_image_view *ref_iv =
+ anv_image_view_from_handle(frame_info->pReferenceSlots[i].pPictureResource->imageViewBinding);
+ int slot_idx = frame_info->pReferenceSlots[i].slotIndex;
+
+ assert(slot_idx < ANV_VIDEO_H265_MAX_NUM_REF_FRAME);
+ dpb_idx[slot_idx] = i;
+
+ buf.ReferencePictureAddress[i] =
+ anv_image_address(ref_iv->image, &ref_iv->image->planes[0].primary_surface.memory_range);
+ }
+
+ buf.ReferencePictureMemoryAddressAttributes = (struct GENX(MEMORYADDRESSATTRIBUTES)) {
+ .MOCS = anv_mocs(cmd_buffer->device, NULL, 0),
+ };
+
+ buf.OriginalUncompressedPictureSourceMemoryAddressAttributes = (struct GENX(MEMORYADDRESSATTRIBUTES)) {
+ .MOCS = anv_mocs(cmd_buffer->device, NULL, 0),
+ };
+
+ buf.StreamOutDataDestinationMemoryAddressAttributes = (struct GENX(MEMORYADDRESSATTRIBUTES)) {
+ .MOCS = anv_mocs(cmd_buffer->device, NULL, 0),
+ };
+
+ buf.DecodedPictureStatusBufferMemoryAddressAttributes = (struct GENX(MEMORYADDRESSATTRIBUTES)) {
+ .MOCS = anv_mocs(cmd_buffer->device, NULL, 0),
+ };
+
+ buf.LCUILDBStreamOutBufferMemoryAddressAttributes = (struct GENX(MEMORYADDRESSATTRIBUTES)) {
+ .MOCS = anv_mocs(cmd_buffer->device, NULL, 0),
+ };
+
+ for (unsigned i = 0; i < frame_info->referenceSlotCount; i++) {
+ const struct anv_image_view *ref_iv =
+ anv_image_view_from_handle(frame_info->pReferenceSlots[i].pPictureResource->imageViewBinding);
+
+ buf.CollocatedMVTemporalBufferAddress[i] =
+ anv_image_address(ref_iv->image, &ref_iv->image->vid_dmv_top_surface);
+ }
+
+ buf.CollocatedMVTemporalBufferMemoryAddressAttributes = (struct GENX(MEMORYADDRESSATTRIBUTES)) {
+ .MOCS = anv_mocs(cmd_buffer->device, buf.CollocatedMVTemporalBufferAddress[0].bo, 0),
+ };
+
+ buf.VP9ProbabilityBufferMemoryAddressAttributes = (struct GENX(MEMORYADDRESSATTRIBUTES)) {
+ .MOCS = anv_mocs(cmd_buffer->device, NULL, 0),
+ };
+
+ buf.VP9SegmentIDBufferMemoryAddressAttributes = (struct GENX(MEMORYADDRESSATTRIBUTES)) {
+ .MOCS = anv_mocs(cmd_buffer->device, NULL, 0),
+ };
+
+ buf.VP9HVDLineRowStoreBufferMemoryAddressAttributes = (struct GENX(MEMORYADDRESSATTRIBUTES)) {
+ .MOCS = anv_mocs(cmd_buffer->device, NULL, 0),
+ };
+
+ buf.VP9HVDTileRowStoreBufferMemoryAddressAttributes = (struct GENX(MEMORYADDRESSATTRIBUTES)) {
+ .MOCS = anv_mocs(cmd_buffer->device, NULL, 0),
+ };
+#if GFX_VER >= 11
+ buf.SAOStreamOutDataDestinationBufferMemoryAddressAttributes = (struct GENX(MEMORYADDRESSATTRIBUTES)) {
+ .MOCS = anv_mocs(cmd_buffer->device, NULL, 0),
+ };
+ buf.FrameStatisticsStreamOutDataDestinationBufferMemoryAddressAttributes = (struct GENX(MEMORYADDRESSATTRIBUTES)) {
+ .MOCS = anv_mocs(cmd_buffer->device, NULL, 0),
+ };
+ buf.SSESourcePixelRowStoreBufferMemoryAddressAttributesReadWrite = (struct GENX(MEMORYADDRESSATTRIBUTES)) {
+ .MOCS = anv_mocs(cmd_buffer->device, NULL, 0),
+ };
+ buf.HCPScalabilitySliceStateBufferMemoryAddressAttributes = (struct GENX(MEMORYADDRESSATTRIBUTES)) {
+ .MOCS = anv_mocs(cmd_buffer->device, NULL, 0),
+ };
+ buf.HCPScalabilityCABACDecodedSyntaxElementsBufferMemoryAddressAttributes = (struct GENX(MEMORYADDRESSATTRIBUTES)) {
+ .MOCS = anv_mocs(cmd_buffer->device, NULL, 0),
+ };
+ buf.MVUpperRightColumnStoreBufferMemoryAddressAttributes = (struct GENX(MEMORYADDRESSATTRIBUTES)) {
+ .MOCS = anv_mocs(cmd_buffer->device, NULL, 0),
+ };
+ buf.IntraPredictionUpperRightColumnStoreBufferMemoryAddressAttributes = (struct GENX(MEMORYADDRESSATTRIBUTES)) {
+ .MOCS = anv_mocs(cmd_buffer->device, NULL, 0),
+ };
+ buf.IntraPredictionLeftReconColumnStoreBufferMemoryAddressAttributes = (struct GENX(MEMORYADDRESSATTRIBUTES)) {
+ .MOCS = anv_mocs(cmd_buffer->device, NULL, 0),
+ };
+#endif
+ }
+
+ anv_batch_emit(&cmd_buffer->batch, GENX(HCP_IND_OBJ_BASE_ADDR_STATE), indirect) {
+ indirect.HCPIndirectBitstreamObjectBaseAddress =
+ anv_address_add(src_buffer->address, frame_info->srcBufferOffset & ~4095);
+
+ indirect.HCPIndirectBitstreamObjectMemoryAddressAttributes = (struct GENX(MEMORYADDRESSATTRIBUTES)) {
+ .MOCS = anv_mocs(cmd_buffer->device, src_buffer->address.bo, 0),
+ };
+
+ indirect.HCPIndirectBitstreamObjectAccessUpperBound =
+ anv_address_add(src_buffer->address, align64(frame_info->srcBufferRange, 4096));
+
+ indirect.HCPIndirectCUObjectMemoryAddressAttributes = (struct GENX(MEMORYADDRESSATTRIBUTES)) {
+ .MOCS = anv_mocs(cmd_buffer->device, NULL, 0),
+ };
+
+ indirect.HCPPAKBSEObjectMemoryAddressAttributes = (struct GENX(MEMORYADDRESSATTRIBUTES)) {
+ .MOCS = anv_mocs(cmd_buffer->device, NULL, 0),
+ };
+
+#if GFX_VER >= 11
+ indirect.HCPVP9PAKCompressedHeaderSyntaxStreamInMemoryAddressAttributes = (struct GENX(MEMORYADDRESSATTRIBUTES)) {
+ .MOCS = anv_mocs(cmd_buffer->device, NULL, 0),
+ };
+ indirect.HCPVP9PAKProbabilityCounterStreamOutMemoryAddressAttributes = (struct GENX(MEMORYADDRESSATTRIBUTES)) {
+ .MOCS = anv_mocs(cmd_buffer->device, NULL, 0),
+ };
+ indirect.HCPVP9PAKProbabilityDeltasStreamInMemoryAddressAttributes = (struct GENX(MEMORYADDRESSATTRIBUTES)) {
+ .MOCS = anv_mocs(cmd_buffer->device, NULL, 0),
+ };
+ indirect.HCPVP9PAKTileRecordStreamOutMemoryAddressAttributes = (struct GENX(MEMORYADDRESSATTRIBUTES)) {
+ .MOCS = anv_mocs(cmd_buffer->device, NULL, 0),
+ };
+ indirect.HCPVP9PAKCULevelStatisticStreamOutMemoryAddressAttributes = (struct GENX(MEMORYADDRESSATTRIBUTES)) {
+ .MOCS = anv_mocs(cmd_buffer->device, NULL, 0),
+ };
+#endif
+ }
+
+ if (sps->flags.scaling_list_enabled_flag) {
+ if (pps->flags.pps_scaling_list_data_present_flag) {
+ scaling_list(cmd_buffer, pps->pScalingLists);
+ } else if (sps->flags.sps_scaling_list_data_present_flag) {
+ scaling_list(cmd_buffer, sps->pScalingLists);
+ }
+ } else {
+ for (uint8_t size = 0; size < 4; size++) {
+ for (uint8_t pred = 0; pred < 2; pred++) {
+ for (uint8_t color = 0; color < 3; color++) {
+
+ if (size == 3 && color > 0)
+ continue;
+
+ anv_batch_emit(&cmd_buffer->batch, GENX(HCP_QM_STATE), qm) {
+ qm.SizeID = size;
+ qm.PredictionType = pred;
+ qm.ColorComponent = color;
+ qm.DCCoefficient = (size > 1) ? 16 : 0;
+ unsigned len = (size == 0) ? 16 : 64;
+
+ for (uint8_t q = 0; q < len; q++)
+ qm.QuantizerMatrix8x8[q] = 0x10;
+ }
+ }
+ }
+ }
+ }
+
+ anv_batch_emit(&cmd_buffer->batch, GENX(HCP_PIC_STATE), pic) {
+ pic.FrameWidthInMinimumCodingBlockSize =
+ sps->pic_width_in_luma_samples / (1 << (sps->log2_min_luma_coding_block_size_minus3 + 3)) - 1;
+ pic.FrameHeightInMinimumCodingBlockSize =
+ sps->pic_height_in_luma_samples / (1 << (sps->log2_min_luma_coding_block_size_minus3 + 3)) - 1;
+
+ pic.MinCUSize = sps->log2_min_luma_coding_block_size_minus3 & 0x3;
+ pic.LCUSize = (sps->log2_diff_max_min_luma_coding_block_size +
+ sps->log2_min_luma_coding_block_size_minus3) & 0x3;
+
+ pic.MinTUSize = sps->log2_min_luma_transform_block_size_minus2 & 0x3;
+ pic.MaxTUSize = (sps->log2_diff_max_min_luma_transform_block_size + sps->log2_min_luma_transform_block_size_minus2) & 0x3;
+ pic.MinPCMSize = sps->log2_min_pcm_luma_coding_block_size_minus3 & 0x3;
+ pic.MaxPCMSize = (sps->log2_diff_max_min_pcm_luma_coding_block_size + sps->log2_min_pcm_luma_coding_block_size_minus3) & 0x3;
+
+#if GFX_VER >= 11
+ pic.Log2SAOOffsetScaleLuma = pps->log2_sao_offset_scale_luma;
+ pic.Log2SAOOffsetScaleChroma = pps->log2_sao_offset_scale_chroma;
+ pic.ChromaQPOffsetListLength = pps->chroma_qp_offset_list_len_minus1;
+ pic.DiffCUChromaQPOffsetDepth = pps->diff_cu_chroma_qp_offset_depth;
+ pic.ChromaQPOffsetListEnable = pps->flags.chroma_qp_offset_list_enabled_flag;
+ pic.ChromaSubsampling = sps->chroma_format_idc;
+
+ pic.HighPrecisionOffsetsEnable = sps->flags.high_precision_offsets_enabled_flag;
+ pic.Log2MaxTransformSkipSize = pps->log2_max_transform_skip_block_size_minus2 + 2;
+ pic.CrossComponentPredictionEnable = pps->flags.cross_component_prediction_enabled_flag;
+ pic.CABACBypassAlignmentEnable = sps->flags.cabac_bypass_alignment_enabled_flag;
+ pic.PersistentRiceAdaptationEnable = sps->flags.persistent_rice_adaptation_enabled_flag;
+ pic.IntraSmoothingDisable = sps->flags.intra_smoothing_disabled_flag;
+ pic.ExplicitRDPCMEnable = sps->flags.explicit_rdpcm_enabled_flag;
+ pic.ImplicitRDPCMEnable = sps->flags.implicit_rdpcm_enabled_flag;
+ pic.TransformSkipContextEnable = sps->flags.transform_skip_context_enabled_flag;
+ pic.TransformSkipRotationEnable = sps->flags.transform_skip_rotation_enabled_flag;
+ pic.SPSRangeExtensionEnable = sps->flags.sps_range_extension_flag;
+#endif
+
+ pic.CollocatedPictureIsISlice = false;
+ pic.CurrentPictureIsISlice = false;
+ pic.SampleAdaptiveOffsetEnable = sps->flags.sample_adaptive_offset_enabled_flag;
+ pic.PCMEnable = sps->flags.pcm_enabled_flag;
+ pic.CUQPDeltaEnable = pps->flags.cu_qp_delta_enabled_flag;
+ pic.MaxDQPDepth = pps->diff_cu_qp_delta_depth;
+ pic.PCMLoopFilterDisable = sps->flags.pcm_loop_filter_disabled_flag;
+ pic.ConstrainedIntraPrediction = pps->flags.constrained_intra_pred_flag;
+ pic.Log2ParallelMergeLevel = pps->log2_parallel_merge_level_minus2;
+ pic.SignDataHiding = pps->flags.sign_data_hiding_enabled_flag;
+ pic.LoopFilterEnable = pps->flags.loop_filter_across_tiles_enabled_flag;
+ pic.EntropyCodingSyncEnable = pps->flags.entropy_coding_sync_enabled_flag;
+ pic.TilingEnable = pps->flags.tiles_enabled_flag;
+ pic.WeightedBiPredicationEnable = pps->flags.weighted_bipred_flag;
+ pic.WeightedPredicationEnable = pps->flags.weighted_pred_flag;
+ pic.FieldPic = 0;
+ pic.TopField = true;
+ pic.TransformSkipEnable = pps->flags.transform_skip_enabled_flag;
+ pic.AMPEnable = sps->flags.amp_enabled_flag;
+ pic.TransquantBypassEnable = pps->flags.transquant_bypass_enabled_flag;
+ pic.StrongIntraSmoothingEnable = sps->flags.strong_intra_smoothing_enabled_flag;
+ pic.CUPacketStructure = 0;
+
+ pic.PictureCbQPOffset = pps->pps_cb_qp_offset;
+ pic.PictureCrQPOffset = pps->pps_cr_qp_offset;
+ pic.IntraMaxTransformHierarchyDepth = sps->max_transform_hierarchy_depth_intra;
+ pic.InterMaxTransformHierarchyDepth = sps->max_transform_hierarchy_depth_inter;
+ pic.ChromaPCMSampleBitDepth = sps->pcm_sample_bit_depth_chroma_minus1 & 0xf;
+ pic.LumaPCMSampleBitDepth = sps->pcm_sample_bit_depth_luma_minus1 & 0xf;
+
+ pic.ChromaBitDepth = sps->bit_depth_chroma_minus8;
+ pic.LumaBitDepth = sps->bit_depth_luma_minus8;
+
+#if GFX_VER >= 11
+ pic.CbQPOffsetList0 = pps->cb_qp_offset_list[0];
+ pic.CbQPOffsetList1 = pps->cb_qp_offset_list[1];
+ pic.CbQPOffsetList2 = pps->cb_qp_offset_list[2];
+ pic.CbQPOffsetList3 = pps->cb_qp_offset_list[3];
+ pic.CbQPOffsetList4 = pps->cb_qp_offset_list[4];
+ pic.CbQPOffsetList5 = pps->cb_qp_offset_list[5];
+
+ pic.CrQPOffsetList0 = pps->cr_qp_offset_list[0];
+ pic.CrQPOffsetList1 = pps->cr_qp_offset_list[1];
+ pic.CrQPOffsetList2 = pps->cr_qp_offset_list[2];
+ pic.CrQPOffsetList3 = pps->cr_qp_offset_list[3];
+ pic.CrQPOffsetList4 = pps->cr_qp_offset_list[4];
+ pic.CrQPOffsetList5 = pps->cr_qp_offset_list[5];
+#endif
+ }
+
+ if (pps->flags.tiles_enabled_flag) {
+ int cum = 0;
+ anv_batch_emit(&cmd_buffer->batch, GENX(HCP_TILE_STATE), tile) {
+ tile.NumberofTileColumns = pps->num_tile_columns_minus1;
+ tile.NumberofTileRows = pps->num_tile_rows_minus1;
+ for (unsigned i = 0; i < 5; i++) {
+ tile.ColumnPosition[i].CtbPos0i = cum;
+ if ((4 * i) == pps->num_tile_columns_minus1)
+ break;
+
+ cum += pps->column_width_minus1[4 * i] + 1;
+ tile.ColumnPosition[i].CtbPos1i = cum;
+
+ if ((4 * i + 1) == pps->num_tile_columns_minus1)
+ break;
+ cum += pps->column_width_minus1[4 * i + 1] + 1;
+ tile.ColumnPosition[i].CtbPos2i = cum;
+
+ if ((4 * i + 2) == pps->num_tile_columns_minus1)
+ break;
+ cum += pps->column_width_minus1[4 * i + 2] + 1;
+ tile.ColumnPosition[i].CtbPos3i = cum;
+
+ if ((4 * i + 3) >= MIN2(pps->num_tile_columns_minus1,
+ ARRAY_SIZE(pps->column_width_minus1)))
+ break;
+
+ cum += pps->column_width_minus1[4 * i + 3] + 1;
+ }
+
+ cum = 0;
+
+ for (unsigned i = 0; i < 5; i++) {
+ tile.Rowposition[i].CtbPos0i = cum;
+ if ((4 * i) == pps->num_tile_rows_minus1)
+ break;
+
+ cum += pps->row_height_minus1[4 * i] + 1;
+ tile.Rowposition[i].CtbPos1i = cum;
+
+ if ((4 * i + 1) == pps->num_tile_rows_minus1)
+ break;
+ cum += pps->row_height_minus1[4 * i + 1] + 1;
+ tile.Rowposition[i].CtbPos2i = cum;
+
+ if ((4 * i + 2) == pps->num_tile_rows_minus1)
+ break;
+ cum += pps->row_height_minus1[4 * i + 2] + 1;
+ tile.Rowposition[i].CtbPos3i = cum;
+
+ if ((4 * i + 3) == pps->num_tile_rows_minus1)
+ break;
+
+ cum += pps->row_height_minus1[4 * i + 3] + 1;
+ }
+
+ if (pps->num_tile_rows_minus1 == 20) {
+ tile.Rowposition[5].CtbPos0i = cum;
+ cum += pps->row_height_minus1[20] + 1;
+ tile.Rowposition[5].CtbPos1i = cum;
+ }
+ }
+ }
+
+ /* Slice parsing */
+ uint32_t last_slice = h265_pic_info->sliceSegmentCount - 1;
+ void *slice_map;
+ VkResult result =
+ anv_device_map_bo(cmd_buffer->device,
+ src_buffer->address.bo,
+ src_buffer->address.offset,
+ frame_info->srcBufferRange + frame_info->srcBufferOffset,
+ NULL /* placed_addr */,
+ &slice_map);
+ if (result != VK_SUCCESS) {
+ anv_batch_set_error(&cmd_buffer->batch, result);
+ return;
+ }
+
+ slice_map += frame_info->srcBufferOffset;
+
+ struct vk_video_h265_slice_params slice_params[h265_pic_info->sliceSegmentCount];
+
+ /* All slices are parsed in advance to collect the necessary information. */
+ for (unsigned s = 0; s < h265_pic_info->sliceSegmentCount; s++) {
+ uint32_t current_offset = h265_pic_info->pSliceSegmentOffsets[s];
+ void *map = slice_map + current_offset;
+ uint32_t slice_size = 0;
+
+ if (s == last_slice)
+ slice_size = frame_info->srcBufferRange - current_offset;
+ else
+ slice_size = h265_pic_info->pSliceSegmentOffsets[s + 1] - current_offset;
+
+ vk_video_parse_h265_slice_header(frame_info, h265_pic_info, sps, pps, map, slice_size, &slice_params[s]);
+ vk_fill_video_h265_reference_info(frame_info, h265_pic_info, &slice_params[s], ref_slots);
+ }
+
+ anv_device_unmap_bo(cmd_buffer->device, src_buffer->address.bo,
+ slice_map, frame_info->srcBufferRange,
+ false /* replace */);
+
+ for (unsigned s = 0; s < h265_pic_info->sliceSegmentCount; s++) {
+ uint32_t ctb_size = 1 << (sps->log2_diff_max_min_luma_coding_block_size +
+ sps->log2_min_luma_coding_block_size_minus3 + 3);
+ uint32_t pic_width_in_min_cbs_y = sps->pic_width_in_luma_samples /
+ (1 << (sps->log2_min_luma_coding_block_size_minus3 + 3));
+ uint32_t width_in_pix = (1 << (sps->log2_min_luma_coding_block_size_minus3 + 3)) *
+ pic_width_in_min_cbs_y;
+ uint32_t ctb_w = DIV_ROUND_UP(width_in_pix, ctb_size);
+ bool is_last = (s == last_slice);
+ int slice_qp = (slice_params[s].slice_qp_delta + pps->init_qp_minus26 + 26) & 0x3f;
+
+ anv_batch_emit(&cmd_buffer->batch, GENX(HCP_SLICE_STATE), slice) {
+ slice.SliceHorizontalPosition = slice_params[s].slice_segment_address % ctb_w;
+ slice.SliceVerticalPosition = slice_params[s].slice_segment_address / ctb_w;
+
+ if (is_last) {
+ slice.NextSliceHorizontalPosition = 0;
+ slice.NextSliceVerticalPosition = 0;
+ } else {
+ slice.NextSliceHorizontalPosition = (slice_params[s + 1].slice_segment_address) % ctb_w;
+ slice.NextSliceVerticalPosition = (slice_params[s + 1].slice_segment_address) / ctb_w;
+ }
+
+ slice.SliceType = slice_params[s].slice_type;
+ slice.LastSlice = is_last;
+ slice.DependentSlice = slice_params[s].dependent_slice_segment;
+ slice.SliceTemporalMVPEnable = slice_params[s].temporal_mvp_enable;
+ slice.SliceQP = abs(slice_qp);
+ slice.SliceQPSign = slice_qp >= 0 ? 0 : 1;
+ slice.SliceCbQPOffset = slice_params[s].slice_cb_qp_offset;
+ slice.SliceCrQPOffset = slice_params[s].slice_cr_qp_offset;
+ slice.SliceHeaderDisableDeblockingFilter = pps->flags.deblocking_filter_override_enabled_flag ?
+ slice_params[s].disable_deblocking_filter_idc : pps->flags.pps_deblocking_filter_disabled_flag;
+ slice.SliceTCOffsetDiv2 = slice_params[s].tc_offset_div2;
+ slice.SliceBetaOffsetDiv2 = slice_params[s].beta_offset_div2;
+ slice.SliceLoopFilterEnable = slice_params[s].loop_filter_across_slices_enable;
+ slice.SliceSAOChroma = slice_params[s].sao_chroma_flag;
+ slice.SliceSAOLuma = slice_params[s].sao_luma_flag;
+ slice.MVDL1Zero = slice_params[s].mvd_l1_zero_flag;
+
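+ /* A slice is low-delay only when none of its reference pictures, in either
+  * list, has a POC greater than the current picture's.
+  */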
+ uint8_t low_delay = true;
+
+ if (slice_params[s].slice_type == STD_VIDEO_H265_SLICE_TYPE_I) {
+ low_delay = false;
+ } else {
+ for (unsigned i = 0; i < slice_params[s].num_ref_idx_l0_active; i++) {
+ int slot_idx = ref_slots[0][i].slot_index;
+
+ if (vk_video_h265_poc_by_slot(frame_info, slot_idx) >
+ h265_pic_info->pStdPictureInfo->PicOrderCntVal) {
+ low_delay = false;
+ break;
+ }
+ }
+
+ for (unsigned i = 0; i < slice_params[s].num_ref_idx_l1_active; i++) {
+ int slot_idx = ref_slots[1][i].slot_index;
+ if (vk_video_h265_poc_by_slot(frame_info, slot_idx) >
+ h265_pic_info->pStdPictureInfo->PicOrderCntVal) {
+ low_delay = false;
+ break;
+ }
+ }
+ }
+
+ slice.LowDelay = low_delay;
+ slice.CollocatedFromL0 = slice_params[s].collocated_list == 0 ? true : false;
+ slice.Log2WeightDenominatorChroma = slice_params[s].luma_log2_weight_denom +
+ (slice_params[s].chroma_log2_weight_denom - slice_params[s].luma_log2_weight_denom);
+ slice.Log2WeightDenominatorLuma = slice_params[s].luma_log2_weight_denom;
+ slice.CABACInit = slice_params[s].cabac_init_idc;
+ slice.MaxMergeIndex = slice_params[s].max_num_merge_cand - 1;
+ slice.CollocatedMVTemporalBufferIndex =
+ dpb_idx[ref_slots[slice_params[s].collocated_list][slice_params[s].collocated_ref_idx].slot_index];
+ assert(slice.CollocatedMVTemporalBufferIndex < ANV_VIDEO_H265_HCP_NUM_REF_FRAME);
+
+ slice.SliceHeaderLength = slice_params[s].slice_data_bytes_offset;
+ slice.CABACZeroWordInsertionEnable = false;
+ slice.EmulationByteSliceInsertEnable = false;
+ slice.TailInsertionPresent = false;
+ slice.SliceDataInsertionPresent = false;
+ slice.HeaderInsertionPresent = false;
+
+ slice.IndirectPAKBSEDataStartOffset = 0;
+ slice.TransformSkipLambda = 0;
+ slice.TransformSkipNumberofNonZeroCoeffsFactor0 = 0;
+ slice.TransformSkipNumberofZeroCoeffsFactor0 = 0;
+ slice.TransformSkipNumberofNonZeroCoeffsFactor1 = 0;
+ slice.TransformSkipNumberofZeroCoeffsFactor1 = 0;
+
+#if GFX_VER >= 12
+ slice.OriginalSliceStartCtbX = slice_params[s].slice_segment_address % ctb_w;
+ slice.OriginalSliceStartCtbY = slice_params[s].slice_segment_address / ctb_w;
+#endif
+ }
+
+ if (slice_params[s].slice_type != STD_VIDEO_H265_SLICE_TYPE_I) {
+ anv_batch_emit(&cmd_buffer->batch, GENX(HCP_REF_IDX_STATE), ref) {
+ ref.ReferencePictureListSelect = 0;
+ ref.NumberofReferenceIndexesActive = slice_params[s].num_ref_idx_l0_active - 1;
+
+ for (unsigned i = 0; i < ref.NumberofReferenceIndexesActive + 1; i++) {
+ int slot_idx = ref_slots[0][i].slot_index;
+ unsigned poc = ref_slots[0][i].pic_order_cnt;
+ int32_t diff_poc = h265_pic_info->pStdPictureInfo->PicOrderCntVal - poc;
+
+ assert(dpb_idx[slot_idx] < ANV_VIDEO_H265_HCP_NUM_REF_FRAME);
+
+ ref.ReferenceListEntry[i].ListEntry = dpb_idx[slot_idx];
+ ref.ReferenceListEntry[i].ReferencePicturetbValue = CLAMP(diff_poc, -128, 127) & 0xff;
+ ref.ReferenceListEntry[i].TopField = true;
+ }
+ }
+ }
+
+ if (slice_params[s].slice_type == STD_VIDEO_H265_SLICE_TYPE_B) {
+ anv_batch_emit(&cmd_buffer->batch, GENX(HCP_REF_IDX_STATE), ref) {
+ ref.ReferencePictureListSelect = 1;
+ ref.NumberofReferenceIndexesActive = slice_params[s].num_ref_idx_l1_active - 1;
+
+ for (unsigned i = 0; i < ref.NumberofReferenceIndexesActive + 1; i++) {
+ int slot_idx = ref_slots[1][i].slot_index;
+ unsigned poc = ref_slots[1][i].pic_order_cnt;
+ int32_t diff_poc = h265_pic_info->pStdPictureInfo->PicOrderCntVal - poc;
+
+ assert(dpb_idx[slot_idx] < ANV_VIDEO_H265_HCP_NUM_REF_FRAME);
+
+ ref.ReferenceListEntry[i].ListEntry = dpb_idx[slot_idx];
+ ref.ReferenceListEntry[i].ReferencePicturetbValue = CLAMP(diff_poc, -128, 127) & 0xff;
+ ref.ReferenceListEntry[i].TopField = true;
+ }
+ }
+ }
+
+ if ((pps->flags.weighted_pred_flag && (slice_params[s].slice_type == STD_VIDEO_H265_SLICE_TYPE_P)) ||
+ (pps->flags.weighted_bipred_flag && (slice_params[s].slice_type == STD_VIDEO_H265_SLICE_TYPE_B))) {
+ anv_batch_emit(&cmd_buffer->batch, GENX(HCP_WEIGHTOFFSET_STATE), w) {
+ w.ReferencePictureListSelect = 0;
+
+ for (unsigned i = 0; i < ANV_VIDEO_H265_MAX_NUM_REF_FRAME; i++) {
+ w.LumaOffsets[i].DeltaLumaWeightLX = slice_params[s].delta_luma_weight_l0[i] & 0xff;
+ w.LumaOffsets[i].LumaOffsetLX = slice_params[s].luma_offset_l0[i] & 0xff;
+ w.ChromaOffsets[i].DeltaChromaWeightLX0 = slice_params[s].delta_chroma_weight_l0[i][0] & 0xff;
+ w.ChromaOffsets[i].ChromaOffsetLX0 = slice_params[s].chroma_offset_l0[i][0] & 0xff;
+ w.ChromaOffsets[i].DeltaChromaWeightLX1 = slice_params[s].delta_chroma_weight_l0[i][1] & 0xff;
+ w.ChromaOffsets[i].ChromaOffsetLX1 = slice_params[s].chroma_offset_l0[i][1] & 0xff;
+ }
+ }
+
+ if (slice_params[s].slice_type == STD_VIDEO_H265_SLICE_TYPE_B) {
+ anv_batch_emit(&cmd_buffer->batch, GENX(HCP_WEIGHTOFFSET_STATE), w) {
+ w.ReferencePictureListSelect = 1;
+
+ for (unsigned i = 0; i < ANV_VIDEO_H265_MAX_NUM_REF_FRAME; i++) {
+ w.LumaOffsets[i].DeltaLumaWeightLX = slice_params[s].delta_luma_weight_l1[i] & 0xff;
+ w.LumaOffsets[i].LumaOffsetLX = slice_params[s].luma_offset_l1[i] & 0xff;
+ w.ChromaOffsets[i].DeltaChromaWeightLX0 = slice_params[s].delta_chroma_weight_l1[i][0] & 0xff;
+ w.ChromaOffsets[i].DeltaChromaWeightLX1 = slice_params[s].delta_chroma_weight_l1[i][1] & 0xff;
+ w.ChromaOffsets[i].ChromaOffsetLX0 = slice_params[s].chroma_offset_l1[i][0] & 0xff;
+ w.ChromaOffsets[i].ChromaOffsetLX1 = slice_params[s].chroma_offset_l1[i][1] & 0xff;
+ }
+ }
+ }
+ }
+
+ uint32_t buffer_offset = frame_info->srcBufferOffset & 4095;
+
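+ /* As in the H.264 path, the indirect bitstream base is 4 KiB aligned, so
+  * the low offset bits are added back per slice, and decoding starts after
+  * the 3-byte header.
+  */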
+ anv_batch_emit(&cmd_buffer->batch, GENX(HCP_BSD_OBJECT), bsd) {
+ bsd.IndirectBSDDataLength = slice_params[s].slice_size - 3;
+ bsd.IndirectBSDDataStartAddress = buffer_offset + h265_pic_info->pSliceSegmentOffsets[s] + 3;
+ }
+ }
+
+#if GFX_VER >= 12
+ anv_batch_emit(&cmd_buffer->batch, GENX(VD_CONTROL_STATE), cs) {
+ cs.MemoryImplicitFlush = true;
+ }
+#endif
+
+ anv_batch_emit(&cmd_buffer->batch, GENX(VD_PIPELINE_FLUSH), flush) {
+ flush.HEVCPipelineDone = true;
+ flush.HEVCPipelineCommandFlush = true;
+ flush.VDCommandMessageParserDone = true;
+ }
+}
+
+static void
+anv_h264_decode_video(struct anv_cmd_buffer *cmd_buffer,
+ const VkVideoDecodeInfoKHR *frame_info)
+{
+ ANV_FROM_HANDLE(anv_buffer, src_buffer, frame_info->srcBuffer);
+ struct anv_video_session *vid = cmd_buffer->video.vid;
+ struct anv_video_session_params *params = cmd_buffer->video.params;
+ const struct VkVideoDecodeH264PictureInfoKHR *h264_pic_info =
+ vk_find_struct_const(frame_info->pNext, VIDEO_DECODE_H264_PICTURE_INFO_KHR);
+ const StdVideoH264SequenceParameterSet *sps = vk_video_find_h264_dec_std_sps(&params->vk, h264_pic_info->pStdPictureInfo->seq_parameter_set_id);
+ const StdVideoH264PictureParameterSet *pps = vk_video_find_h264_dec_std_pps(&params->vk, h264_pic_info->pStdPictureInfo->pic_parameter_set_id);
+
+ anv_batch_emit(&cmd_buffer->batch, GENX(MI_FLUSH_DW), flush) {
+ flush.DWordLength = 2;
+ flush.VideoPipelineCacheInvalidate = 1;
+ };
+
+#if GFX_VER >= 12
+ anv_batch_emit(&cmd_buffer->batch, GENX(MI_FORCE_WAKEUP), wake) {
+ wake.MFXPowerWellControl = 1;
+ wake.MaskBits = 768;
+ }
+
+ anv_batch_emit(&cmd_buffer->batch, GENX(MFX_WAIT), mfx) {
+ mfx.MFXSyncControlFlag = 1;
+ }
+#endif
+
+ anv_batch_emit(&cmd_buffer->batch, GENX(MFX_PIPE_MODE_SELECT), sel) {
+ sel.StandardSelect = SS_AVC;
+ sel.CodecSelect = Decode;
+ sel.DecoderShortFormatMode = ShortFormatDriverInterface;
+ sel.DecoderModeSelect = VLDMode; // Hardcoded
+
+ sel.PreDeblockingOutputEnable = 0;
+ sel.PostDeblockingOutputEnable = 1;
+ }
+
+#if GFX_VER >= 12
+ anv_batch_emit(&cmd_buffer->batch, GENX(MFX_WAIT), mfx) {
+ mfx.MFXSyncControlFlag = 1;
+ }
+#endif
+
+ const struct anv_image_view *iv = anv_image_view_from_handle(frame_info->dstPictureResource.imageViewBinding);
+ const struct anv_image *img = iv->image;
+ anv_batch_emit(&cmd_buffer->batch, GENX(MFX_SURFACE_STATE), ss) {
+ ss.Width = img->vk.extent.width - 1;
+ ss.Height = img->vk.extent.height - 1;
+ ss.SurfaceFormat = PLANAR_420_8; // assert on this?
+ ss.InterleaveChroma = 1;
+ ss.SurfacePitch = img->planes[0].primary_surface.isl.row_pitch_B - 1;
+ ss.TiledSurface = img->planes[0].primary_surface.isl.tiling != ISL_TILING_LINEAR;
+ ss.TileWalk = TW_YMAJOR;
+
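+ /* Offset of the interleaved UV plane, expressed in rows from the start of
+  * the Y plane.
+  */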
+ ss.YOffsetforUCb = ss.YOffsetforVCr =
+ img->planes[1].primary_surface.memory_range.offset / img->planes[0].primary_surface.isl.row_pitch_B;
+ }
+
+ anv_batch_emit(&cmd_buffer->batch, GENX(MFX_PIPE_BUF_ADDR_STATE), buf) {
+ bool use_pre_deblock = false;
+ if (use_pre_deblock) {
+ buf.PreDeblockingDestinationAddress = anv_image_address(img,
+ &img->planes[0].primary_surface.memory_range);
+ } else {
+ buf.PostDeblockingDestinationAddress = anv_image_address(img,
+ &img->planes[0].primary_surface.memory_range);
+ }
+ buf.PreDeblockingDestinationAttributes = (struct GENX(MEMORYADDRESSATTRIBUTES)) {
+ .MOCS = anv_mocs(cmd_buffer->device, buf.PreDeblockingDestinationAddress.bo, 0),
+ };
+ buf.PostDeblockingDestinationAttributes = (struct GENX(MEMORYADDRESSATTRIBUTES)) {
+ .MOCS = anv_mocs(cmd_buffer->device, buf.PostDeblockingDestinationAddress.bo, 0),
+ };
+
+ buf.IntraRowStoreScratchBufferAddress = (struct anv_address) { vid->vid_mem[ANV_VID_MEM_H264_INTRA_ROW_STORE].mem->bo, vid->vid_mem[ANV_VID_MEM_H264_INTRA_ROW_STORE].offset };
+ buf.IntraRowStoreScratchBufferAttributes = (struct GENX(MEMORYADDRESSATTRIBUTES)) {
+ .MOCS = anv_mocs(cmd_buffer->device, buf.IntraRowStoreScratchBufferAddress.bo, 0),
+ };
+ buf.DeblockingFilterRowStoreScratchAddress = (struct anv_address) { vid->vid_mem[ANV_VID_MEM_H264_DEBLOCK_FILTER_ROW_STORE].mem->bo, vid->vid_mem[ANV_VID_MEM_H264_DEBLOCK_FILTER_ROW_STORE].offset };
+ buf.DeblockingFilterRowStoreScratchAttributes = (struct GENX(MEMORYADDRESSATTRIBUTES)) {
+ .MOCS = anv_mocs(cmd_buffer->device, buf.DeblockingFilterRowStoreScratchAddress.bo, 0),
+ };
+ buf.MBStatusBufferAttributes = (struct GENX(MEMORYADDRESSATTRIBUTES)) {
+ .MOCS = anv_mocs(cmd_buffer->device, NULL, 0),
+ };
+ buf.MBILDBStreamOutBufferAttributes = (struct GENX(MEMORYADDRESSATTRIBUTES)) {
+ .MOCS = anv_mocs(cmd_buffer->device, NULL, 0),
+ };
+ buf.SecondMBILDBStreamOutBufferAttributes = (struct GENX(MEMORYADDRESSATTRIBUTES)) {
+ .MOCS = anv_mocs(cmd_buffer->device, NULL, 0),
+ };
+ buf.ScaledReferenceSurfaceAttributes = (struct GENX(MEMORYADDRESSATTRIBUTES)) {
+ .MOCS = anv_mocs(cmd_buffer->device, NULL, 0),
+ };
+ buf.OriginalUncompressedPictureSourceAttributes = (struct GENX(MEMORYADDRESSATTRIBUTES)) {
+ .MOCS = anv_mocs(cmd_buffer->device, NULL, 0),
+ };
+ buf.StreamOutDataDestinationAttributes = (struct GENX(MEMORYADDRESSATTRIBUTES)) {
+ .MOCS = anv_mocs(cmd_buffer->device, NULL, 0),
+ };
+
+ struct anv_bo *ref_bo = NULL;
+ for (unsigned i = 0; i < frame_info->referenceSlotCount; i++) {
+ const struct anv_image_view *ref_iv = anv_image_view_from_handle(frame_info->pReferenceSlots[i].pPictureResource->imageViewBinding);
+ int idx = frame_info->pReferenceSlots[i].slotIndex;
+ buf.ReferencePictureAddress[idx] = anv_image_address(ref_iv->image,
+ &ref_iv->image->planes[0].primary_surface.memory_range);
+
+ if (i == 0) {
+ ref_bo = ref_iv->image->bindings[0].address.bo;
+ }
+ }
+ buf.ReferencePictureAttributes = (struct GENX(MEMORYADDRESSATTRIBUTES)) {
+ .MOCS = anv_mocs(cmd_buffer->device, ref_bo, 0),
+ };
+ }
+
+ anv_batch_emit(&cmd_buffer->batch, GENX(MFX_IND_OBJ_BASE_ADDR_STATE), index_obj) {
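+ /* The indirect bitstream base is programmed at 4 KiB granularity; the
+  * dropped low bits are added back to each slice's start address below.
+  */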
+ index_obj.MFXIndirectBitstreamObjectAddress = anv_address_add(src_buffer->address,
+ frame_info->srcBufferOffset & ~4095);
+ index_obj.MFXIndirectBitstreamObjectAttributes = (struct GENX(MEMORYADDRESSATTRIBUTES)) {
+ .MOCS = anv_mocs(cmd_buffer->device, src_buffer->address.bo, 0),
+ };
+ index_obj.MFXIndirectMVObjectAttributes = (struct GENX(MEMORYADDRESSATTRIBUTES)) {
+ .MOCS = anv_mocs(cmd_buffer->device, NULL, 0),
+ };
+ index_obj.MFDIndirectITCOEFFObjectAttributes = (struct GENX(MEMORYADDRESSATTRIBUTES)) {
+ .MOCS = anv_mocs(cmd_buffer->device, NULL, 0),
+ };
+ index_obj.MFDIndirectITDBLKObjectAttributes = (struct GENX(MEMORYADDRESSATTRIBUTES)) {
+ .MOCS = anv_mocs(cmd_buffer->device, NULL, 0),
+ };
+ index_obj.MFCIndirectPAKBSEObjectAttributes = (struct GENX(MEMORYADDRESSATTRIBUTES)) {
+ .MOCS = anv_mocs(cmd_buffer->device, NULL, 0),
+ };
+ }
+
+ anv_batch_emit(&cmd_buffer->batch, GENX(MFX_BSP_BUF_BASE_ADDR_STATE), bsp) {
+ bsp.BSDMPCRowStoreScratchBufferAddress = (struct anv_address) { vid->vid_mem[ANV_VID_MEM_H264_BSD_MPC_ROW_SCRATCH].mem->bo,
+ vid->vid_mem[ANV_VID_MEM_H264_BSD_MPC_ROW_SCRATCH].offset };
+
+ bsp.BSDMPCRowStoreScratchBufferAttributes = (struct GENX(MEMORYADDRESSATTRIBUTES)) {
+ .MOCS = anv_mocs(cmd_buffer->device, bsp.BSDMPCRowStoreScratchBufferAddress.bo, 0),
+ };
+ bsp.MPRRowStoreScratchBufferAddress = (struct anv_address) { vid->vid_mem[ANV_VID_MEM_H264_MPR_ROW_SCRATCH].mem->bo,
+ vid->vid_mem[ANV_VID_MEM_H264_MPR_ROW_SCRATCH].offset };
+
+ bsp.MPRRowStoreScratchBufferAttributes = (struct GENX(MEMORYADDRESSATTRIBUTES)) {
+ .MOCS = anv_mocs(cmd_buffer->device, bsp.MPRRowStoreScratchBufferAddress.bo, 0),
+ };
+ bsp.BitplaneReadBufferAttributes = (struct GENX(MEMORYADDRESSATTRIBUTES)) {
+ .MOCS = anv_mocs(cmd_buffer->device, NULL, 0),
+ };
+ }
+
+ anv_batch_emit(&cmd_buffer->batch, GENX(MFD_AVC_DPB_STATE), avc_dpb) {
+ for (unsigned i = 0; i < frame_info->referenceSlotCount; i++) {
+ const struct VkVideoDecodeH264DpbSlotInfoKHR *dpb_slot =
+ vk_find_struct_const(frame_info->pReferenceSlots[i].pNext, VIDEO_DECODE_H264_DPB_SLOT_INFO_KHR);
+ const StdVideoDecodeH264ReferenceInfo *ref_info = dpb_slot->pStdReferenceInfo;
+ int idx = frame_info->pReferenceSlots[i].slotIndex;
+ avc_dpb.NonExistingFrame[idx] = ref_info->flags.is_non_existing;
+ avc_dpb.LongTermFrame[idx] = ref_info->flags.used_for_long_term_reference;
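+ /* UsedforReference: bit 0 = top field, bit 1 = bottom field; a frame
+  * reference sets both.
+  */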
+ if (!ref_info->flags.top_field_flag && !ref_info->flags.bottom_field_flag)
+ avc_dpb.UsedforReference[idx] = 3;
+ else
+ avc_dpb.UsedforReference[idx] = ref_info->flags.top_field_flag | (ref_info->flags.bottom_field_flag << 1);
+ avc_dpb.LTSTFrameNumberList[idx] = ref_info->FrameNum;
+ }
+ }
+
+ anv_batch_emit(&cmd_buffer->batch, GENX(MFD_AVC_PICID_STATE), picid) {
+ picid.PictureIDRemappingDisable = true;
+ }
+
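+ /* Picture height in macroblocks: when frame_mbs_only_flag is unset, a map
+  * unit covers a field pair, so the map-unit count is doubled.
+  */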
+ uint32_t pic_height = sps->pic_height_in_map_units_minus1 + 1;
+ if (!sps->flags.frame_mbs_only_flag)
+ pic_height *= 2;
+ anv_batch_emit(&cmd_buffer->batch, GENX(MFX_AVC_IMG_STATE), avc_img) {
+ avc_img.FrameWidth = sps->pic_width_in_mbs_minus1;
+ avc_img.FrameHeight = pic_height - 1;
+ avc_img.FrameSize = (sps->pic_width_in_mbs_minus1 + 1) * pic_height;
+
+ if (!h264_pic_info->pStdPictureInfo->flags.field_pic_flag)
+ avc_img.ImageStructure = FramePicture;
+ else if (h264_pic_info->pStdPictureInfo->flags.bottom_field_flag)
+ avc_img.ImageStructure = BottomFieldPicture;
+ else
+ avc_img.ImageStructure = TopFieldPicture;
+
+ avc_img.WeightedBiPredictionIDC = pps->weighted_bipred_idc;
+ avc_img.WeightedPredictionEnable = pps->flags.weighted_pred_flag;
+ avc_img.FirstChromaQPOffset = pps->chroma_qp_index_offset;
+ avc_img.SecondChromaQPOffset = pps->second_chroma_qp_index_offset;
+ avc_img.FieldPicture = h264_pic_info->pStdPictureInfo->flags.field_pic_flag;
+ avc_img.MBAFFMode = (sps->flags.mb_adaptive_frame_field_flag &&
+ !h264_pic_info->pStdPictureInfo->flags.field_pic_flag);
+ avc_img.FrameMBOnly = sps->flags.frame_mbs_only_flag;
+ avc_img._8x8IDCTTransformMode = pps->flags.transform_8x8_mode_flag;
+ avc_img.Direct8x8Inference = sps->flags.direct_8x8_inference_flag;
+ avc_img.ConstrainedIntraPrediction = pps->flags.constrained_intra_pred_flag;
+ avc_img.NonReferencePicture = !h264_pic_info->pStdPictureInfo->flags.is_reference;
+ avc_img.EntropyCodingSyncEnable = pps->flags.entropy_coding_mode_flag;
+ avc_img.ChromaFormatIDC = sps->chroma_format_idc;
+ avc_img.TrellisQuantizationChromaDisable = true;
+ avc_img.NumberofReferenceFrames = frame_info->referenceSlotCount;
+ avc_img.NumberofActiveReferencePicturesfromL0 = pps->num_ref_idx_l0_default_active_minus1 + 1;
+ avc_img.NumberofActiveReferencePicturesfromL1 = pps->num_ref_idx_l1_default_active_minus1 + 1;
+ avc_img.InitialQPValue = pps->pic_init_qp_minus26;
+ avc_img.PicOrderPresent = pps->flags.bottom_field_pic_order_in_frame_present_flag;
+ avc_img.DeltaPicOrderAlwaysZero = sps->flags.delta_pic_order_always_zero_flag;
+ avc_img.PicOrderCountType = sps->pic_order_cnt_type;
+ avc_img.DeblockingFilterControlPresent = pps->flags.deblocking_filter_control_present_flag;
+ avc_img.RedundantPicCountPresent = pps->flags.redundant_pic_cnt_present_flag;
+ avc_img.Log2MaxFrameNumber = sps->log2_max_frame_num_minus4;
+ avc_img.Log2MaxPicOrderCountLSB = sps->log2_max_pic_order_cnt_lsb_minus4;
+ avc_img.CurrentPictureFrameNumber = h264_pic_info->pStdPictureInfo->frame_num;
+ }
+
+ StdVideoH264ScalingLists scaling_lists;
+ vk_video_derive_h264_scaling_list(sps, pps, &scaling_lists);
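+ /* The scaling lists are stored in zig-zag scan order; the vl_zscan_* tables
+  * remap them to the raster order the hardware expects.
+  */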
+ anv_batch_emit(&cmd_buffer->batch, GENX(MFX_QM_STATE), qm) {
+ qm.DWordLength = 16;
+ qm.AVC = AVC_4x4_Intra_MATRIX;
+ for (unsigned m = 0; m < 3; m++)
+ for (unsigned q = 0; q < 16; q++)
+ qm.ForwardQuantizerMatrix[m * 16 + vl_zscan_normal_16[q]] = scaling_lists.ScalingList4x4[m][q];
+ }
+ anv_batch_emit(&cmd_buffer->batch, GENX(MFX_QM_STATE), qm) {
+ qm.DWordLength = 16;
+ qm.AVC = AVC_4x4_Inter_MATRIX;
+ for (unsigned m = 0; m < 3; m++)
+ for (unsigned q = 0; q < 16; q++)
+ qm.ForwardQuantizerMatrix[m * 16 + vl_zscan_normal_16[q]] = scaling_lists.ScalingList4x4[m + 3][q];
+ }
+ if (pps->flags.transform_8x8_mode_flag) {
+ anv_batch_emit(&cmd_buffer->batch, GENX(MFX_QM_STATE), qm) {
+ qm.DWordLength = 16;
+ qm.AVC = AVC_8x8_Intra_MATRIX;
+ for (unsigned q = 0; q < 64; q++)
+ qm.ForwardQuantizerMatrix[vl_zscan_normal[q]] = scaling_lists.ScalingList8x8[0][q];
+ }
+ anv_batch_emit(&cmd_buffer->batch, GENX(MFX_QM_STATE), qm) {
+ qm.DWordLength = 16;
+ qm.AVC = AVC_8x8_Inter_MATRIX;
+ for (unsigned q = 0; q < 64; q++)
+ qm.ForwardQuantizerMatrix[vl_zscan_normal[q]] = scaling_lists.ScalingList8x8[1][q];
+ }
+ }
+
+ anv_batch_emit(&cmd_buffer->batch, GENX(MFX_AVC_DIRECTMODE_STATE), avc_directmode) {
+ /* bind reference frame DMV */
+ struct anv_bo *dmv_bo = NULL;
+ for (unsigned i = 0; i < frame_info->referenceSlotCount; i++) {
+ int idx = frame_info->pReferenceSlots[i].slotIndex;
+ const struct VkVideoDecodeH264DpbSlotInfoKHR *dpb_slot =
+ vk_find_struct_const(frame_info->pReferenceSlots[i].pNext, VIDEO_DECODE_H264_DPB_SLOT_INFO_KHR);
+ const struct anv_image_view *ref_iv = anv_image_view_from_handle(frame_info->pReferenceSlots[i].pPictureResource->imageViewBinding);
+ const StdVideoDecodeH264ReferenceInfo *ref_info = dpb_slot->pStdReferenceInfo;
+ avc_directmode.DirectMVBufferAddress[idx] = anv_image_address(ref_iv->image,
+ &ref_iv->image->vid_dmv_top_surface);
+ if (i == 0) {
+ dmv_bo = ref_iv->image->bindings[0].address.bo;
+ }
+ avc_directmode.POCList[2 * idx] = ref_info->PicOrderCnt[0];
+ avc_directmode.POCList[2 * idx + 1] = ref_info->PicOrderCnt[1];
+ }
+ avc_directmode.DirectMVBufferAttributes = (struct GENX(MEMORYADDRESSATTRIBUTES)) {
+ .MOCS = anv_mocs(cmd_buffer->device, dmv_bo, 0),
+ };
+
+ avc_directmode.DirectMVBufferWriteAddress = anv_image_address(img,
+ &img->vid_dmv_top_surface);
+ avc_directmode.DirectMVBufferWriteAttributes = (struct GENX(MEMORYADDRESSATTRIBUTES)) {
+ .MOCS = anv_mocs(cmd_buffer->device, img->bindings[0].address.bo, 0),
+ };
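+ /* The last two POCList entries hold the current picture's top/bottom field
+  * picture order counts.
+  */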
+ avc_directmode.POCList[32] = h264_pic_info->pStdPictureInfo->PicOrderCnt[0];
+ avc_directmode.POCList[33] = h264_pic_info->pStdPictureInfo->PicOrderCnt[1];
+ }
+
+ uint32_t buffer_offset = frame_info->srcBufferOffset & 4095;
+#define HEADER_OFFSET 3
+ for (unsigned s = 0; s < h264_pic_info->sliceCount; s++) {
+ bool last_slice = s == (h264_pic_info->sliceCount - 1);
+ uint32_t current_offset = h264_pic_info->pSliceOffsets[s];
+ uint32_t this_end;
+ if (!last_slice) {
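+ /* Program the location of the following slice before the BSD object for
+  * the current one. For the second-to-last slice there is no entry after
+  * the next one, so use the end of the source range instead.
+  */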
+ uint32_t next_offset = h264_pic_info->pSliceOffsets[s + 1];
+ uint32_t next_end = (s == h264_pic_info->sliceCount - 2) ?
+ frame_info->srcBufferRange : h264_pic_info->pSliceOffsets[s + 2];
+ anv_batch_emit(&cmd_buffer->batch, GENX(MFD_AVC_SLICEADDR), sliceaddr) {
+ sliceaddr.IndirectBSDDataLength = next_end - next_offset - HEADER_OFFSET;
+ /* start decoding after the 3-byte header. */
+ sliceaddr.IndirectBSDDataStartAddress = buffer_offset + next_offset + HEADER_OFFSET;
+ };
+ this_end = next_offset;
+ } else
+ this_end = frame_info->srcBufferRange;
+ anv_batch_emit(&cmd_buffer->batch, GENX(MFD_AVC_BSD_OBJECT), avc_bsd) {
+ avc_bsd.IndirectBSDDataLength = this_end - current_offset - HEADER_OFFSET;
+ /* start decoding after the 3-byte header. */
+ avc_bsd.IndirectBSDDataStartAddress = buffer_offset + current_offset + HEADER_OFFSET;
+ avc_bsd.InlineData.LastSlice = last_slice;
+ avc_bsd.InlineData.FixPrevMBSkipped = 1;
+ avc_bsd.InlineData.IntraPredictionErrorControl = 1;
+ avc_bsd.InlineData.Intra8x84x4PredictionErrorConcealmentControl = 1;
+ avc_bsd.InlineData.ISliceConcealmentMode = 1;
+ };
+ }
+}
+
+void
+genX(CmdDecodeVideoKHR)(VkCommandBuffer commandBuffer,
+ const VkVideoDecodeInfoKHR *frame_info)
+{
+ ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+
+ switch (cmd_buffer->video.vid->vk.op) {
+ case VK_VIDEO_CODEC_OPERATION_DECODE_H264_BIT_KHR:
+ anv_h264_decode_video(cmd_buffer, frame_info);
+ break;
+ case VK_VIDEO_CODEC_OPERATION_DECODE_H265_BIT_KHR:
+ anv_h265_decode_video(cmd_buffer, frame_info);
+ break;
+ default:
+ assert(0);
+ }
+}
+
+#ifdef VK_ENABLE_BETA_EXTENSIONS
+void
+genX(CmdEncodeVideoKHR)(VkCommandBuffer commandBuffer,
+ const VkVideoEncodeInfoKHR *pEncodeInfo)
+{
+}
+#endif
diff --git a/src/intel/vulkan/genX_gfx_state.c b/src/intel/vulkan/genX_gfx_state.c
new file mode 100644
index 00000000000..5f0b1e1c538
--- /dev/null
+++ b/src/intel/vulkan/genX_gfx_state.c
@@ -0,0 +1,2385 @@
+/*
+ * Copyright © 2015 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <assert.h>
+#include <stdbool.h>
+#include <string.h>
+#include <unistd.h>
+#include <fcntl.h>
+
+#include "anv_private.h"
+
+#include "genxml/gen_macros.h"
+#include "genxml/genX_pack.h"
+#include "common/intel_genX_state_brw.h"
+#include "common/intel_guardband.h"
+#include "common/intel_tiled_render.h"
+#include "compiler/brw_prim.h"
+
+const uint32_t genX(vk_to_intel_blend)[] = {
+ [VK_BLEND_FACTOR_ZERO] = BLENDFACTOR_ZERO,
+ [VK_BLEND_FACTOR_ONE] = BLENDFACTOR_ONE,
+ [VK_BLEND_FACTOR_SRC_COLOR] = BLENDFACTOR_SRC_COLOR,
+ [VK_BLEND_FACTOR_ONE_MINUS_SRC_COLOR] = BLENDFACTOR_INV_SRC_COLOR,
+ [VK_BLEND_FACTOR_DST_COLOR] = BLENDFACTOR_DST_COLOR,
+ [VK_BLEND_FACTOR_ONE_MINUS_DST_COLOR] = BLENDFACTOR_INV_DST_COLOR,
+ [VK_BLEND_FACTOR_SRC_ALPHA] = BLENDFACTOR_SRC_ALPHA,
+ [VK_BLEND_FACTOR_ONE_MINUS_SRC_ALPHA] = BLENDFACTOR_INV_SRC_ALPHA,
+ [VK_BLEND_FACTOR_DST_ALPHA] = BLENDFACTOR_DST_ALPHA,
+ [VK_BLEND_FACTOR_ONE_MINUS_DST_ALPHA] = BLENDFACTOR_INV_DST_ALPHA,
+ [VK_BLEND_FACTOR_CONSTANT_COLOR] = BLENDFACTOR_CONST_COLOR,
+ [VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_COLOR]= BLENDFACTOR_INV_CONST_COLOR,
+ [VK_BLEND_FACTOR_CONSTANT_ALPHA] = BLENDFACTOR_CONST_ALPHA,
+ [VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_ALPHA]= BLENDFACTOR_INV_CONST_ALPHA,
+ [VK_BLEND_FACTOR_SRC_ALPHA_SATURATE] = BLENDFACTOR_SRC_ALPHA_SATURATE,
+ [VK_BLEND_FACTOR_SRC1_COLOR] = BLENDFACTOR_SRC1_COLOR,
+ [VK_BLEND_FACTOR_ONE_MINUS_SRC1_COLOR] = BLENDFACTOR_INV_SRC1_COLOR,
+ [VK_BLEND_FACTOR_SRC1_ALPHA] = BLENDFACTOR_SRC1_ALPHA,
+ [VK_BLEND_FACTOR_ONE_MINUS_SRC1_ALPHA] = BLENDFACTOR_INV_SRC1_ALPHA,
+};
+
+static const uint32_t genX(vk_to_intel_blend_op)[] = {
+ [VK_BLEND_OP_ADD] = BLENDFUNCTION_ADD,
+ [VK_BLEND_OP_SUBTRACT] = BLENDFUNCTION_SUBTRACT,
+ [VK_BLEND_OP_REVERSE_SUBTRACT] = BLENDFUNCTION_REVERSE_SUBTRACT,
+ [VK_BLEND_OP_MIN] = BLENDFUNCTION_MIN,
+ [VK_BLEND_OP_MAX] = BLENDFUNCTION_MAX,
+};
+
+static void
+genX(streamout_prologue)(struct anv_cmd_buffer *cmd_buffer)
+{
+#if INTEL_WA_16013994831_GFX_VER
+ /* Wa_16013994831 - Disable preemption during streamout and re-enable it
+ * when the current pipeline does not use XFB.
+ *
+ * Although this workaround applies to Gfx12+, we already disable
+ * object-level preemption for another reason in genX_state.c, so we can
+ * skip this for Gfx12.
+ */
+ if (!intel_needs_workaround(cmd_buffer->device->info, 16013994831))
+ return;
+
+ struct anv_graphics_pipeline *pipeline =
+ anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
+ if (pipeline->uses_xfb) {
+ genX(cmd_buffer_set_preemption)(cmd_buffer, false);
+ return;
+ }
+
+ if (!cmd_buffer->state.gfx.object_preemption)
+ genX(cmd_buffer_set_preemption)(cmd_buffer, true);
+#endif
+}
+
+#if GFX_VER >= 12
+static uint32_t
+get_cps_state_offset(struct anv_cmd_buffer *cmd_buffer, bool cps_enabled,
+ const struct vk_fragment_shading_rate_state *fsr)
+{
+ struct anv_device *device = cmd_buffer->device;
+
+ if (!cps_enabled) {
+ assert(cmd_buffer->state.current_db_mode !=
+ ANV_CMD_DESCRIPTOR_BUFFER_MODE_UNKNOWN);
+ return cmd_buffer->state.current_db_mode ==
+ ANV_CMD_DESCRIPTOR_BUFFER_MODE_BUFFER ?
+ device->cps_states_db.offset :
+ device->cps_states.offset;
+ }
+
+ uint32_t offset;
+ static const uint32_t size_index[] = {
+ [1] = 0,
+ [2] = 1,
+ [4] = 2,
+ };
+
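+ /* Index into the device's pre-packed CPS_STATE arrays: entry 0 is the
+  * disabled state, the rest are keyed on the fragment size (and, on newer
+  * platforms, the combiner ops), each entry holding one state per viewport.
+  */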
+#if GFX_VERx10 >= 125
+ offset =
+ 1 + /* skip disabled */
+ fsr->combiner_ops[0] * 5 * 3 * 3 +
+ fsr->combiner_ops[1] * 3 * 3 +
+ size_index[fsr->fragment_size.width] * 3 +
+ size_index[fsr->fragment_size.height];
+#else
+ offset =
+ 1 + /* skip disabled */
+ size_index[fsr->fragment_size.width] * 3 +
+ size_index[fsr->fragment_size.height];
+#endif
+
+ offset *= MAX_VIEWPORTS * GENX(CPS_STATE_length) * 4;
+
+ assert(cmd_buffer->state.current_db_mode !=
+ ANV_CMD_DESCRIPTOR_BUFFER_MODE_UNKNOWN);
+ return (cmd_buffer->state.current_db_mode ==
+ ANV_CMD_DESCRIPTOR_BUFFER_MODE_BUFFER ?
+ device->cps_states_db.offset :
+ device->cps_states.offset) + offset;
+}
+#endif /* GFX_VER >= 12 */
+
+static bool
+has_ds_feedback_loop(const struct vk_dynamic_graphics_state *dyn)
+{
+ return dyn->feedback_loops & (VK_IMAGE_ASPECT_DEPTH_BIT |
+ VK_IMAGE_ASPECT_STENCIL_BIT);
+}
+
+UNUSED static bool
+want_stencil_pma_fix(struct anv_cmd_buffer *cmd_buffer,
+ const struct vk_dynamic_graphics_state *dyn,
+ const struct vk_depth_stencil_state *ds)
+{
+ if (GFX_VER > 9)
+ return false;
+ assert(GFX_VER == 9);
+
+ /* From the Skylake PRM Vol. 2c CACHE_MODE_1::STC PMA Optimization Enable:
+ *
+ * Clearing this bit will force the STC cache to wait for pending
+ * retirement of pixels at the HZ-read stage and do the STC-test for
+ * Non-promoted, R-computed and Computed depth modes instead of
+ * postponing the STC-test to RCPFE.
+ *
+ * STC_TEST_EN = 3DSTATE_STENCIL_BUFFER::STENCIL_BUFFER_ENABLE &&
+ * 3DSTATE_WM_DEPTH_STENCIL::StencilTestEnable
+ *
+ * STC_WRITE_EN = 3DSTATE_STENCIL_BUFFER::STENCIL_BUFFER_ENABLE &&
+ * (3DSTATE_WM_DEPTH_STENCIL::Stencil Buffer Write Enable &&
+ * 3DSTATE_DEPTH_BUFFER::STENCIL_WRITE_ENABLE)
+ *
+ * COMP_STC_EN = STC_TEST_EN &&
+ * 3DSTATE_PS_EXTRA::PixelShaderComputesStencil
+ *
+ * SW parses the pipeline states to generate the following logical
+ * signal indicating if PMA FIX can be enabled.
+ *
+ * STC_PMA_OPT =
+ * 3DSTATE_WM::ForceThreadDispatch != 1 &&
+ * !(3DSTATE_RASTER::ForceSampleCount != NUMRASTSAMPLES_0) &&
+ * 3DSTATE_DEPTH_BUFFER::SURFACE_TYPE != NULL &&
+ * 3DSTATE_DEPTH_BUFFER::HIZ Enable &&
+ * !(3DSTATE_WM::EDSC_Mode == 2) &&
+ * 3DSTATE_PS_EXTRA::PixelShaderValid &&
+ * !(3DSTATE_WM_HZ_OP::DepthBufferClear ||
+ * 3DSTATE_WM_HZ_OP::DepthBufferResolve ||
+ * 3DSTATE_WM_HZ_OP::Hierarchical Depth Buffer Resolve Enable ||
+ * 3DSTATE_WM_HZ_OP::StencilBufferClear) &&
+ * (COMP_STC_EN || STC_WRITE_EN) &&
+ * ((3DSTATE_PS_EXTRA::PixelShaderKillsPixels ||
+ * 3DSTATE_WM::ForceKillPix == ON ||
+ * 3DSTATE_PS_EXTRA::oMask Present to RenderTarget ||
+ * 3DSTATE_PS_BLEND::AlphaToCoverageEnable ||
+ * 3DSTATE_PS_BLEND::AlphaTestEnable ||
+ * 3DSTATE_WM_CHROMAKEY::ChromaKeyKillEnable) ||
+ * (3DSTATE_PS_EXTRA::Pixel Shader Computed Depth mode != PSCDEPTH_OFF))
+ */
+
+ /* These are always true:
+ * 3DSTATE_WM::ForceThreadDispatch != 1 &&
+ * !(3DSTATE_RASTER::ForceSampleCount != NUMRASTSAMPLES_0)
+ */
+
+ /* We only enable the PMA fix if we know for certain that HiZ is enabled.
+ * If we don't know whether HiZ is enabled or not, we disable the PMA fix
+ * and there is no harm.
+ *
+ * (3DSTATE_DEPTH_BUFFER::SURFACE_TYPE != NULL) &&
+ * 3DSTATE_DEPTH_BUFFER::HIZ Enable
+ */
+ if (!cmd_buffer->state.hiz_enabled)
+ return false;
+
+ /* We can't possibly know if HiZ is enabled without the depth attachment */
+ ASSERTED const struct anv_image_view *d_iview =
+ cmd_buffer->state.gfx.depth_att.iview;
+ assert(d_iview && d_iview->image->planes[0].aux_usage == ISL_AUX_USAGE_HIZ);
+
+ /* 3DSTATE_PS_EXTRA::PixelShaderValid */
+ struct anv_graphics_pipeline *pipeline =
+ anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
+ if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_FRAGMENT))
+ return false;
+
+ /* !(3DSTATE_WM::EDSC_Mode == 2) */
+ const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline);
+ if (wm_prog_data->early_fragment_tests)
+ return false;
+
+ /* We never use anv_pipeline for HiZ ops so this is trivially true:
+ * !(3DSTATE_WM_HZ_OP::DepthBufferClear ||
+ * 3DSTATE_WM_HZ_OP::DepthBufferResolve ||
+ * 3DSTATE_WM_HZ_OP::Hierarchical Depth Buffer Resolve Enable ||
+ * 3DSTATE_WM_HZ_OP::StencilBufferClear)
+ */
+
+ /* 3DSTATE_STENCIL_BUFFER::STENCIL_BUFFER_ENABLE &&
+ * 3DSTATE_WM_DEPTH_STENCIL::StencilTestEnable
+ */
+ const bool stc_test_en = ds->stencil.test_enable;
+
+ /* 3DSTATE_STENCIL_BUFFER::STENCIL_BUFFER_ENABLE &&
+ * (3DSTATE_WM_DEPTH_STENCIL::Stencil Buffer Write Enable &&
+ * 3DSTATE_DEPTH_BUFFER::STENCIL_WRITE_ENABLE)
+ */
+ const bool stc_write_en = ds->stencil.write_enable;
+
+ /* STC_TEST_EN && 3DSTATE_PS_EXTRA::PixelShaderComputesStencil */
+ const bool comp_stc_en = stc_test_en && wm_prog_data->computed_stencil;
+
+ /* COMP_STC_EN || STC_WRITE_EN */
+ if (!(comp_stc_en || stc_write_en))
+ return false;
+
+ /* (3DSTATE_PS_EXTRA::PixelShaderKillsPixels ||
+ * 3DSTATE_WM::ForceKillPix == ON ||
+ * 3DSTATE_PS_EXTRA::oMask Present to RenderTarget ||
+ * 3DSTATE_PS_BLEND::AlphaToCoverageEnable ||
+ * 3DSTATE_PS_BLEND::AlphaTestEnable ||
+ * 3DSTATE_WM_CHROMAKEY::ChromaKeyKillEnable) ||
+ * (3DSTATE_PS_EXTRA::Pixel Shader Computed Depth mode != PSCDEPTH_OFF)
+ */
+ return pipeline->kill_pixel ||
+ pipeline->rp_has_ds_self_dep ||
+ has_ds_feedback_loop(dyn) ||
+ wm_prog_data->computed_depth_mode != PSCDEPTH_OFF;
+}
+
+static void
+genX(rasterization_mode)(VkPolygonMode raster_mode,
+ VkLineRasterizationModeKHR line_mode,
+ float line_width,
+ uint32_t *api_mode,
+ bool *msaa_rasterization_enable)
+{
+ if (raster_mode == VK_POLYGON_MODE_LINE) {
+ /* Unfortunately, configuring our line rasterization hardware on gfx8
+ * and later is rather painful. Instead of giving us bits to tell the
+ * hardware what line mode to use like we had on gfx7, we now have an
+ * arcane combination of API Mode and MSAA enable bits which do things
+ * in a table which are expected to magically put the hardware into the
+ * right mode for your API. Sadly, Vulkan isn't any of the APIs the
+ * hardware people thought of so nothing works the way you want it to.
+ *
+ * Look at the table titled "Multisample Rasterization Modes" in Vol 7
+ * of the Skylake PRM for more details.
+ */
+ switch (line_mode) {
+ case VK_LINE_RASTERIZATION_MODE_RECTANGULAR_EXT:
+ *api_mode = DX101;
+#if GFX_VER <= 9
+ /* Prior to ICL, the algorithm the HW uses to draw wide lines
+ * doesn't quite match what the CTS expects, at least for rectangular
+ * lines, so we set this to false here, making it draw parallelograms
+ * instead, which work well enough.
+ */
+ *msaa_rasterization_enable = line_width < 1.0078125;
+#else
+ *msaa_rasterization_enable = true;
+#endif
+ break;
+
+ case VK_LINE_RASTERIZATION_MODE_RECTANGULAR_SMOOTH_EXT:
+ case VK_LINE_RASTERIZATION_MODE_BRESENHAM_EXT:
+ *api_mode = DX9OGL;
+ *msaa_rasterization_enable = false;
+ break;
+
+ default:
+ unreachable("Unsupported line rasterization mode");
+ }
+ } else {
+ *api_mode = DX101;
+ *msaa_rasterization_enable = true;
+ }
+}
+
+static bool
+is_src1_blend_factor(enum GENX(3D_Color_Buffer_Blend_Factor) factor)
+{
+ return factor == BLENDFACTOR_SRC1_COLOR ||
+ factor == BLENDFACTOR_SRC1_ALPHA ||
+ factor == BLENDFACTOR_INV_SRC1_COLOR ||
+ factor == BLENDFACTOR_INV_SRC1_ALPHA;
+}
+
+#if GFX_VERx10 == 125
+/**
+ * Return the dimensions of the current rendering area, defined as the
+ * bounding box of all present color, depth and stencil attachments.
+ */
+UNUSED static bool
+calculate_render_area(struct anv_cmd_buffer *cmd_buffer,
+ unsigned *width, unsigned *height)
+{
+ struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
+
+ *width = gfx->render_area.offset.x + gfx->render_area.extent.width;
+ *height = gfx->render_area.offset.y + gfx->render_area.extent.height;
+
+ for (unsigned i = 0; i < gfx->color_att_count; i++) {
+ struct anv_attachment *att = &gfx->color_att[i];
+ if (att->iview) {
+ *width = MAX2(*width, att->iview->vk.extent.width);
+ *height = MAX2(*height, att->iview->vk.extent.height);
+ }
+ }
+
+ const struct anv_image_view *const z_view = gfx->depth_att.iview;
+ if (z_view) {
+ *width = MAX2(*width, z_view->vk.extent.width);
+ *height = MAX2(*height, z_view->vk.extent.height);
+ }
+
+ const struct anv_image_view *const s_view = gfx->stencil_att.iview;
+ if (s_view) {
+ *width = MAX2(*width, s_view->vk.extent.width);
+ *height = MAX2(*height, s_view->vk.extent.height);
+ }
+
+ return *width && *height;
+}
+
+/* Calculate TBIMR tiling parameters adequate for the current pipeline
+ * setup. Return true if TBIMR should be enabled.
+ */
+UNUSED static bool
+calculate_tile_dimensions(struct anv_cmd_buffer *cmd_buffer,
+ unsigned fb_width, unsigned fb_height,
+ unsigned *tile_width, unsigned *tile_height)
+{
+ const struct anv_device *device = cmd_buffer->device;
+ struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
+ const unsigned aux_scale = 256;
+ unsigned pixel_size = 0;
+
+ /* Perform a rough calculation of the tile cache footprint of the
+ * pixel pipeline, approximating it as the sum of the amount of
+ * memory used per pixel by every render target, depth, stencil and
+ * auxiliary surfaces bound to the pipeline.
+ */
+ for (uint32_t i = 0; i < gfx->color_att_count; i++) {
+ struct anv_attachment *att = &gfx->color_att[i];
+
+ if (att->iview) {
+ const struct anv_image *image = att->iview->image;
+ const unsigned p = anv_image_aspect_to_plane(image,
+ VK_IMAGE_ASPECT_COLOR_BIT);
+ const struct anv_image_plane *plane = &image->planes[p];
+
+ pixel_size += intel_calculate_surface_pixel_size(
+ &plane->primary_surface.isl);
+
+ if (isl_aux_usage_has_mcs(att->aux_usage))
+ pixel_size += intel_calculate_surface_pixel_size(
+ &plane->aux_surface.isl);
+
+ /* XXX - Use proper implicit CCS surface metadata tracking
+ * instead of inferring pixel size from primary
+ * surface.
+ */
+ if (isl_aux_usage_has_ccs(att->aux_usage))
+ pixel_size += DIV_ROUND_UP(intel_calculate_surface_pixel_size(
+ &plane->primary_surface.isl),
+ aux_scale);
+ }
+ }
+
+ const struct anv_image_view *const z_view = gfx->depth_att.iview;
+ if (z_view) {
+ const struct anv_image *image = z_view->image;
+ assert(image->vk.aspects & VK_IMAGE_ASPECT_DEPTH_BIT);
+ const unsigned p = anv_image_aspect_to_plane(image,
+ VK_IMAGE_ASPECT_DEPTH_BIT);
+ const struct anv_image_plane *plane = &image->planes[p];
+
+ pixel_size += intel_calculate_surface_pixel_size(
+ &plane->primary_surface.isl);
+
+ if (isl_aux_usage_has_hiz(image->planes[p].aux_usage))
+ pixel_size += intel_calculate_surface_pixel_size(
+ &plane->aux_surface.isl);
+
+ /* XXX - Use proper implicit CCS surface metadata tracking
+ * instead of inferring pixel size from primary
+ * surface.
+ */
+ if (isl_aux_usage_has_ccs(image->planes[p].aux_usage))
+ pixel_size += DIV_ROUND_UP(intel_calculate_surface_pixel_size(
+ &plane->primary_surface.isl),
+ aux_scale);
+ }
+
+ const struct anv_image_view *const s_view = gfx->stencil_att.iview;
+ if (s_view && s_view != z_view) {
+ const struct anv_image *image = s_view->image;
+ assert(image->vk.aspects & VK_IMAGE_ASPECT_STENCIL_BIT);
+ const unsigned p = anv_image_aspect_to_plane(image,
+ VK_IMAGE_ASPECT_STENCIL_BIT);
+ const struct anv_image_plane *plane = &image->planes[p];
+
+ pixel_size += intel_calculate_surface_pixel_size(
+ &plane->primary_surface.isl);
+ }
+
+ if (!pixel_size)
+ return false;
+
+ /* Compute a tile layout that allows reasonable utilization of the
+ * tile cache based on the per-pixel cache footprint estimated
+ * above.
+ */
+ intel_calculate_tile_dimensions(device->info, cmd_buffer->state.current_l3_config,
+ 32, 32, fb_width, fb_height,
+ pixel_size, tile_width, tile_height);
+
+ /* Perform TBIMR tile passes only if the framebuffer covers more
+ * than a single tile.
+ */
+ return *tile_width < fb_width || *tile_height < fb_height;
+}
+#endif
+
+/**
+ * This function takes the Vulkan runtime values & dirty states and updates
+ * the values in anv_gfx_dynamic_state, flagging HW instructions for
+ * re-emission when a value changes.
+ *
+ * Nothing is emitted in the batch buffer.
+ */
+void
+genX(cmd_buffer_flush_gfx_runtime_state)(struct anv_cmd_buffer *cmd_buffer)
+{
+ UNUSED struct anv_device *device = cmd_buffer->device;
+ struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
+ const struct anv_graphics_pipeline *pipeline =
+ anv_pipeline_to_graphics(gfx->base.pipeline);
+ const struct vk_dynamic_graphics_state *dyn =
+ &cmd_buffer->vk.dynamic_graphics_state;
+ struct anv_gfx_dynamic_state *hw_state = &gfx->dyn_state;
+ const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline);
+ struct anv_instance *instance = cmd_buffer->device->physical->instance;
+
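+/* Helper macros: update a field in the HW shadow state and mark the
+ * corresponding instruction dirty only when the value actually changes.
+ * SET_STAGE() skips the dirty flagging when the pipeline lacks the stage.
+ */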
+#define GET(field) hw_state->field
+#define SET(bit, field, value) \
+ do { \
+ __typeof(hw_state->field) __v = value; \
+ if (hw_state->field != __v) { \
+ hw_state->field = __v; \
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_##bit); \
+ } \
+ } while (0)
+#define SET_STAGE(bit, field, value, stage) \
+ do { \
+ __typeof(hw_state->field) __v = value; \
+ if (!anv_pipeline_has_stage(pipeline, \
+ MESA_SHADER_##stage)) { \
+ hw_state->field = __v; \
+ break; \
+ } \
+ if (hw_state->field != __v) { \
+ hw_state->field = __v; \
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_##bit); \
+ } \
+ } while (0)
+
+#define SETUP_PROVOKING_VERTEX(bit, cmd, mode) \
+ switch (mode) { \
+ case VK_PROVOKING_VERTEX_MODE_FIRST_VERTEX_EXT: \
+ SET(bit, cmd.TriangleStripListProvokingVertexSelect, 0); \
+ SET(bit, cmd.LineStripListProvokingVertexSelect, 0); \
+ SET(bit, cmd.TriangleFanProvokingVertexSelect, 1); \
+ break; \
+ case VK_PROVOKING_VERTEX_MODE_LAST_VERTEX_EXT: \
+ SET(bit, cmd.TriangleStripListProvokingVertexSelect, 2); \
+ SET(bit, cmd.LineStripListProvokingVertexSelect, 1); \
+ SET(bit, cmd.TriangleFanProvokingVertexSelect, 2); \
+ break; \
+ default: \
+ unreachable("Invalid provoking vertex mode"); \
+ } \
+
+ UNUSED bool fs_msaa_changed = false;
+ if ((gfx->dirty & ANV_CMD_DIRTY_PIPELINE) ||
+ BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_MS_ALPHA_TO_COVERAGE_ENABLE) ||
+ BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_MS_RASTERIZATION_SAMPLES) ||
+ BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_FSR)) {
+ enum intel_msaa_flags fs_msaa_flags = 0;
+
+ if (wm_prog_data) {
+ /* If we have any dynamic bits here, we might need to update the
+ * value in the push constant for the shader.
+ */
+ if (wm_prog_data->coarse_pixel_dispatch == BRW_SOMETIMES ||
+ wm_prog_data->persample_dispatch == BRW_SOMETIMES ||
+ wm_prog_data->alpha_to_coverage == BRW_SOMETIMES) {
+ fs_msaa_flags = INTEL_MSAA_FLAG_ENABLE_DYNAMIC;
+
+ if (dyn->ms.rasterization_samples > 1) {
+ fs_msaa_flags |= INTEL_MSAA_FLAG_MULTISAMPLE_FBO;
+
+ if (wm_prog_data->sample_shading) {
+ assert(wm_prog_data->persample_dispatch != BRW_NEVER);
+ fs_msaa_flags |= INTEL_MSAA_FLAG_PERSAMPLE_DISPATCH;
+ }
+ if ((pipeline->sample_shading_enable &&
+ (pipeline->min_sample_shading * dyn->ms.rasterization_samples) > 1) ||
+ wm_prog_data->sample_shading) {
+ fs_msaa_flags |= INTEL_MSAA_FLAG_PERSAMPLE_DISPATCH |
+ INTEL_MSAA_FLAG_PERSAMPLE_INTERP;
+ }
+ }
+
+ if (wm_prog_data->coarse_pixel_dispatch == BRW_SOMETIMES &&
+ !(fs_msaa_flags & INTEL_MSAA_FLAG_PERSAMPLE_DISPATCH)) {
+ fs_msaa_flags |= INTEL_MSAA_FLAG_COARSE_PI_MSG |
+ INTEL_MSAA_FLAG_COARSE_RT_WRITES;
+ }
+
+ if (wm_prog_data->alpha_to_coverage == BRW_SOMETIMES &&
+ dyn->ms.alpha_to_coverage_enable)
+ fs_msaa_flags |= INTEL_MSAA_FLAG_ALPHA_TO_COVERAGE;
+
+ /* Check the last push constant value and update */
+
+ if (gfx->base.push_constants.gfx.fs_msaa_flags != fs_msaa_flags) {
+ gfx->base.push_constants.gfx.fs_msaa_flags = fs_msaa_flags;
+ cmd_buffer->state.push_constants_dirty |= VK_SHADER_STAGE_FRAGMENT_BIT;
+ gfx->base.push_constants_data_dirty = true;
+ }
+ }
+ }
+
+ if (fs_msaa_flags != gfx->fs_msaa_flags) {
+ gfx->fs_msaa_flags = fs_msaa_flags;
+ gfx->dirty |= ANV_CMD_DIRTY_FS_MSAA_FLAGS;
+ }
+ }
+
+ if ((gfx->dirty & ANV_CMD_DIRTY_PIPELINE) ||
+ (gfx->dirty & ANV_CMD_DIRTY_FS_MSAA_FLAGS)) {
+ if (wm_prog_data) {
+ const struct anv_shader_bin *fs_bin =
+ pipeline->base.shaders[MESA_SHADER_FRAGMENT];
+
+ struct GENX(3DSTATE_PS) ps = {};
+ intel_set_ps_dispatch_state(&ps, device->info, wm_prog_data,
+ MAX2(dyn->ms.rasterization_samples, 1),
+ gfx->fs_msaa_flags);
+
+ SET(PS, ps.KernelStartPointer0,
+ fs_bin->kernel.offset +
+ brw_wm_prog_data_prog_offset(wm_prog_data, ps, 0));
+ SET(PS, ps.KernelStartPointer1,
+ fs_bin->kernel.offset +
+ brw_wm_prog_data_prog_offset(wm_prog_data, ps, 1));
+#if GFX_VER < 20
+ SET(PS, ps.KernelStartPointer2,
+ fs_bin->kernel.offset +
+ brw_wm_prog_data_prog_offset(wm_prog_data, ps, 2));
+#endif
+
+ SET(PS, ps.DispatchGRFStartRegisterForConstantSetupData0,
+ brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, ps, 0));
+ SET(PS, ps.DispatchGRFStartRegisterForConstantSetupData1,
+ brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, ps, 1));
+#if GFX_VER < 20
+ SET(PS, ps.DispatchGRFStartRegisterForConstantSetupData2,
+ brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, ps, 2));
+#endif
+
+#if GFX_VER < 20
+ SET(PS, ps._8PixelDispatchEnable, ps._8PixelDispatchEnable);
+ SET(PS, ps._16PixelDispatchEnable, ps._16PixelDispatchEnable);
+ SET(PS, ps._32PixelDispatchEnable, ps._32PixelDispatchEnable);
+#else
+ SET(PS, ps.Kernel0Enable, ps.Kernel0Enable);
+ SET(PS, ps.Kernel1Enable, ps.Kernel1Enable);
+ SET(PS, ps.Kernel0SIMDWidth, ps.Kernel0SIMDWidth);
+ SET(PS, ps.Kernel1SIMDWidth, ps.Kernel1SIMDWidth);
+ SET(PS, ps.Kernel0PolyPackingPolicy, ps.Kernel0PolyPackingPolicy);
+#endif
+
+ SET(PS, ps.PositionXYOffsetSelect,
+ !wm_prog_data->uses_pos_offset ? POSOFFSET_NONE :
+ brw_wm_prog_data_is_persample(wm_prog_data, gfx->fs_msaa_flags) ?
+ POSOFFSET_SAMPLE : POSOFFSET_CENTROID);
+
+ SET(PS_EXTRA, ps_extra.PixelShaderIsPerSample,
+ brw_wm_prog_data_is_persample(wm_prog_data, gfx->fs_msaa_flags));
+#if GFX_VER >= 11
+ SET(PS_EXTRA, ps_extra.PixelShaderIsPerCoarsePixel,
+ brw_wm_prog_data_is_coarse(wm_prog_data, gfx->fs_msaa_flags));
+#endif
+#if GFX_VERx10 >= 125
+ /* TODO: We should only require this when the last geometry shader
+ * uses a fragment shading rate that is not constant.
+ */
+ SET(PS_EXTRA, ps_extra.EnablePSDependencyOnCPsizeChange,
+ brw_wm_prog_data_is_coarse(wm_prog_data, gfx->fs_msaa_flags));
+#endif
+ SET(WM, wm.BarycentricInterpolationMode,
+ wm_prog_data_barycentric_modes(wm_prog_data, gfx->fs_msaa_flags));
+ } else {
+#if GFX_VER < 20
+ SET(PS, ps._8PixelDispatchEnable, false);
+ SET(PS, ps._16PixelDispatchEnable, false);
+ SET(PS, ps._32PixelDispatchEnable, false);
+#else
+ SET(PS, ps.Kernel0Enable, false);
+ SET(PS, ps.Kernel1Enable, false);
+#endif
+ }
+ }
+
+ if ((gfx->dirty & (ANV_CMD_DIRTY_PIPELINE |
+ ANV_CMD_DIRTY_XFB_ENABLE |
+ ANV_CMD_DIRTY_OCCLUSION_QUERY_ACTIVE)) ||
+ BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_RASTERIZER_DISCARD_ENABLE) ||
+ BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_RASTERIZATION_STREAM) ||
+ BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_PROVOKING_VERTEX)) {
+ SET(STREAMOUT, so.RenderingDisable, dyn->rs.rasterizer_discard_enable);
+ SET(STREAMOUT, so.RenderStreamSelect, dyn->rs.rasterization_stream);
+
+#if INTEL_NEEDS_WA_18022508906
+ /* Wa_18022508906 :
+ *
+ * SKL PRMs, Volume 7: 3D-Media-GPGPU, Stream Output Logic (SOL) Stage:
+ *
+ * SOL_INT::Render_Enable =
+ * (3DSTATE_STREAMOUT::Force_Rendering == Force_On) ||
+ * (
+ * (3DSTATE_STREAMOUT::Force_Rendering != Force_Off) &&
+ * !(3DSTATE_GS::Enable && 3DSTATE_GS::Output Vertex Size == 0) &&
+ * !3DSTATE_STREAMOUT::API_Render_Disable &&
+ * (
+ * 3DSTATE_DEPTH_STENCIL_STATE::Stencil_TestEnable ||
+ * 3DSTATE_DEPTH_STENCIL_STATE::Depth_TestEnable ||
+ * 3DSTATE_DEPTH_STENCIL_STATE::Depth_WriteEnable ||
+ * 3DSTATE_PS_EXTRA::PS_Valid ||
+ * 3DSTATE_WM::Legacy Depth_Buffer_Clear ||
+ * 3DSTATE_WM::Legacy Depth_Buffer_Resolve_Enable ||
+ * 3DSTATE_WM::Legacy Hierarchical_Depth_Buffer_Resolve_Enable
+ * )
+ * )
+ *
+ * If SOL_INT::Render_Enable is false, the SO stage will not forward any
+ * topologies down the pipeline, which is not what we want for occlusion
+ * queries.
+ *
+ * Here we force rendering to get SOL_INT::Render_Enable when occlusion
+ * queries are active.
+ */
+ SET(STREAMOUT, so.ForceRendering,
+ (!GET(so.RenderingDisable) && gfx->n_occlusion_queries > 0) ?
+ Force_on : 0);
+#endif
+
+ switch (dyn->rs.provoking_vertex) {
+ case VK_PROVOKING_VERTEX_MODE_FIRST_VERTEX_EXT:
+ SET(STREAMOUT, so.ReorderMode, LEADING);
+ SET_STAGE(GS, gs.ReorderMode, LEADING, GEOMETRY);
+ break;
+
+ case VK_PROVOKING_VERTEX_MODE_LAST_VERTEX_EXT:
+ SET(STREAMOUT, so.ReorderMode, TRAILING);
+ SET_STAGE(GS, gs.ReorderMode, TRAILING, GEOMETRY);
+ break;
+
+ default:
+ unreachable("Invalid provoking vertex mode");
+ }
+ }
+
+ if ((gfx->dirty & ANV_CMD_DIRTY_PIPELINE) ||
+ BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_IA_PRIMITIVE_TOPOLOGY)) {
+ uint32_t topology;
+ if (anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_EVAL))
+ topology = _3DPRIM_PATCHLIST(dyn->ts.patch_control_points);
+ else
+ topology = genX(vk_to_intel_primitive_type)[dyn->ia.primitive_topology];
+
+ gfx->primitive_topology = topology;
+
+ SET(VF_TOPOLOGY, vft.PrimitiveTopologyType, topology);
+ }
+
+ if ((gfx->dirty & ANV_CMD_DIRTY_PIPELINE) ||
+ BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VI) ||
+ BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VI_BINDINGS_VALID) ||
+ BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VI_BINDING_STRIDES))
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_VERTEX_INPUT);
+
+#if GFX_VER >= 11
+ if (cmd_buffer->device->vk.enabled_extensions.KHR_fragment_shading_rate &&
+ ((gfx->dirty & ANV_CMD_DIRTY_PIPELINE) ||
+ (gfx->dirty & ANV_CMD_DIRTY_FS_MSAA_FLAGS) ||
+ BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_FSR))) {
+ const bool cps_enable = wm_prog_data &&
+ brw_wm_prog_data_is_coarse(wm_prog_data, gfx->fs_msaa_flags);
+#if GFX_VER == 11
+ SET(CPS, cps.CoarsePixelShadingMode,
+ cps_enable ? CPS_MODE_CONSTANT : CPS_MODE_NONE);
+ SET(CPS, cps.MinCPSizeX, dyn->fsr.fragment_size.width);
+ SET(CPS, cps.MinCPSizeY, dyn->fsr.fragment_size.height);
+#elif GFX_VER >= 12
+ SET(CPS, cps.CoarsePixelShadingStateArrayPointer,
+ get_cps_state_offset(cmd_buffer, cps_enable, &dyn->fsr));
+#endif
+ }
+#endif /* GFX_VER >= 11 */
+
+ if ((gfx->dirty & ANV_CMD_DIRTY_PIPELINE) ||
+ BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_TS_DOMAIN_ORIGIN)) {
+ const struct brw_tes_prog_data *tes_prog_data = get_tes_prog_data(pipeline);
+
+ if (tes_prog_data && anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_EVAL)) {
+ if (dyn->ts.domain_origin == VK_TESSELLATION_DOMAIN_ORIGIN_LOWER_LEFT) {
+ SET(TE, te.OutputTopology, tes_prog_data->output_topology);
+ } else {
+ /* When the origin is upper-left, we have to flip the winding order */
+ if (tes_prog_data->output_topology == OUTPUT_TRI_CCW) {
+ SET(TE, te.OutputTopology, OUTPUT_TRI_CW);
+ } else if (tes_prog_data->output_topology == OUTPUT_TRI_CW) {
+ SET(TE, te.OutputTopology, OUTPUT_TRI_CCW);
+ } else {
+ SET(TE, te.OutputTopology, tes_prog_data->output_topology);
+ }
+ }
+ } else {
+ SET(TE, te.OutputTopology, OUTPUT_POINT);
+ }
+ }
+
+ if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_LINE_WIDTH))
+ SET(SF, sf.LineWidth, dyn->rs.line.width);
+
+ if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_PROVOKING_VERTEX)) {
+ SETUP_PROVOKING_VERTEX(SF, sf, dyn->rs.provoking_vertex);
+ SETUP_PROVOKING_VERTEX(CLIP, clip, dyn->rs.provoking_vertex);
+ }
+
+ if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_DEPTH_BIAS_FACTORS)) {
+ /**
+ * From the Vulkan Spec:
+ *
+ * "VK_DEPTH_BIAS_REPRESENTATION_FLOAT_EXT specifies that the depth
+ * bias representation is a factor of constant r equal to 1."
+ *
+ * From the SKL PRMs, Volume 7: 3D-Media-GPGPU, Depth Offset:
+ *
+ * "When UNORM Depth Buffer is at Output Merger (or no Depth Buffer):
+ *
+ * Bias = GlobalDepthOffsetConstant * r + GlobalDepthOffsetScale * MaxDepthSlope
+ *
+ * Where r is the minimum representable value > 0 in the depth
+ * buffer format, converted to float32 (note: If state bit Legacy
+ * Global Depth Bias Enable is set, the r term will be forced to
+ * 1.0)"
+ *
+ * When VK_DEPTH_BIAS_REPRESENTATION_FLOAT_EXT is set, enable
+ * LegacyGlobalDepthBiasEnable.
+ */
+ SET(SF, sf.LegacyGlobalDepthBiasEnable,
+ dyn->rs.depth_bias.representation ==
+ VK_DEPTH_BIAS_REPRESENTATION_FLOAT_EXT);
+ }
+
+ if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VP_DEPTH_CLIP_NEGATIVE_ONE_TO_ONE))
+ SET(CLIP, clip.APIMode, dyn->vp.depth_clip_negative_one_to_one ? APIMODE_OGL : APIMODE_D3D);
+
+ if ((gfx->dirty & ANV_CMD_DIRTY_PIPELINE) ||
+ (gfx->dirty & ANV_CMD_DIRTY_RENDER_TARGETS) ||
+ BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_IA_PRIMITIVE_TOPOLOGY) ||
+ BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_CULL_MODE) ||
+ BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_FRONT_FACE) ||
+ BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_DEPTH_BIAS_ENABLE) ||
+ BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_DEPTH_BIAS_FACTORS) ||
+ BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_POLYGON_MODE) ||
+ BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_LINE_MODE) ||
+ BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_LINE_WIDTH) ||
+ BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_DEPTH_CLIP_ENABLE) ||
+ BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_DEPTH_CLAMP_ENABLE) ||
+ BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_CONSERVATIVE_MODE)) {
+ /* Take dynamic primitive topology into account with
+ * 3DSTATE_RASTER::APIMode
+ * 3DSTATE_RASTER::DXMultisampleRasterizationEnable
+ * 3DSTATE_RASTER::AntialiasingEnable
+ */
+ uint32_t api_mode = 0;
+ bool msaa_raster_enable = false;
+
+ const VkLineRasterizationModeKHR line_mode =
+ anv_line_rasterization_mode(dyn->rs.line.mode,
+ dyn->ms.rasterization_samples);
+
+ const VkPolygonMode dynamic_raster_mode =
+ genX(raster_polygon_mode)(pipeline,
+ dyn->rs.polygon_mode,
+ dyn->ia.primitive_topology);
+
+ genX(rasterization_mode)(dynamic_raster_mode,
+ line_mode, dyn->rs.line.width,
+ &api_mode, &msaa_raster_enable);
+
+ /* From the Broadwell PRM, Volume 2, documentation for
+ * 3DSTATE_RASTER, "Antialiasing Enable":
+ *
+ * "This field must be disabled if any of the render targets
+ * have integer (UINT or SINT) surface format."
+ *
+ * Additionally internal documentation for Gfx12+ states:
+ *
+ * "This bit MUST not be set when NUM_MULTISAMPLES > 1 OR
+ * FORCED_SAMPLE_COUNT > 1."
+ */
+ const bool aa_enable =
+ anv_rasterization_aa_mode(dynamic_raster_mode, line_mode) &&
+ !gfx->has_uint_rt &&
+ !(GFX_VER >= 12 && gfx->samples > 1);
+
+ const bool depth_clip_enable =
+ vk_rasterization_state_depth_clip_enable(&dyn->rs);
+
+ const bool xy_clip_test_enable =
+ (dynamic_raster_mode == VK_POLYGON_MODE_FILL);
+
+ SET(CLIP, clip.ViewportXYClipTestEnable, xy_clip_test_enable);
+
+ SET(RASTER, raster.APIMode, api_mode);
+ SET(RASTER, raster.DXMultisampleRasterizationEnable, msaa_raster_enable);
+ SET(RASTER, raster.AntialiasingEnable, aa_enable);
+ SET(RASTER, raster.CullMode, genX(vk_to_intel_cullmode)[dyn->rs.cull_mode]);
+ SET(RASTER, raster.FrontWinding, genX(vk_to_intel_front_face)[dyn->rs.front_face]);
+ SET(RASTER, raster.GlobalDepthOffsetEnableSolid, dyn->rs.depth_bias.enable);
+ SET(RASTER, raster.GlobalDepthOffsetEnableWireframe, dyn->rs.depth_bias.enable);
+ SET(RASTER, raster.GlobalDepthOffsetEnablePoint, dyn->rs.depth_bias.enable);
+ SET(RASTER, raster.GlobalDepthOffsetConstant, dyn->rs.depth_bias.constant);
+ SET(RASTER, raster.GlobalDepthOffsetScale, dyn->rs.depth_bias.slope);
+ SET(RASTER, raster.GlobalDepthOffsetClamp, dyn->rs.depth_bias.clamp);
+ SET(RASTER, raster.FrontFaceFillMode, genX(vk_to_intel_fillmode)[dyn->rs.polygon_mode]);
+ SET(RASTER, raster.BackFaceFillMode, genX(vk_to_intel_fillmode)[dyn->rs.polygon_mode]);
+ SET(RASTER, raster.ViewportZFarClipTestEnable, depth_clip_enable);
+ SET(RASTER, raster.ViewportZNearClipTestEnable, depth_clip_enable);
+ SET(RASTER, raster.ConservativeRasterizationEnable,
+ dyn->rs.conservative_mode !=
+ VK_CONSERVATIVE_RASTERIZATION_MODE_DISABLED_EXT);
+ }
+
+ if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_MS_RASTERIZATION_SAMPLES)) {
+ SET(MULTISAMPLE, ms.NumberofMultisamples,
+ __builtin_ffs(MAX2(dyn->ms.rasterization_samples, 1)) - 1);
+ }
+
+ if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_MS_SAMPLE_MASK)) {
+ /* From the Vulkan 1.0 spec:
+ * If pSampleMask is NULL, it is treated as if the mask has all bits
+ * enabled, i.e. no coverage is removed from fragments.
+ *
+ * 3DSTATE_SAMPLE_MASK.SampleMask is 16 bits.
+ */
+ SET(SAMPLE_MASK, sm.SampleMask, dyn->ms.sample_mask & 0xffff);
+ }
+
+ if ((gfx->dirty & ANV_CMD_DIRTY_RENDER_TARGETS) ||
+#if GFX_VER == 9
+ /* For the PMA fix */
+ (gfx->dirty & ANV_CMD_DIRTY_PIPELINE) ||
+#endif
+ BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_DEPTH_TEST_ENABLE) ||
+ BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_DEPTH_WRITE_ENABLE) ||
+ BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_DEPTH_COMPARE_OP) ||
+ BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_TEST_ENABLE) ||
+ BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_OP) ||
+ BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_COMPARE_MASK) ||
+ BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_WRITE_MASK) ||
+ BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_REFERENCE)) {
+ VkImageAspectFlags ds_aspects = 0;
+ if (gfx->depth_att.vk_format != VK_FORMAT_UNDEFINED)
+ ds_aspects |= VK_IMAGE_ASPECT_DEPTH_BIT;
+ if (gfx->stencil_att.vk_format != VK_FORMAT_UNDEFINED)
+ ds_aspects |= VK_IMAGE_ASPECT_STENCIL_BIT;
+
+ struct vk_depth_stencil_state opt_ds = dyn->ds;
+ vk_optimize_depth_stencil_state(&opt_ds, ds_aspects, true);
+
+ SET(WM_DEPTH_STENCIL, ds.DoubleSidedStencilEnable, true);
+
+ SET(WM_DEPTH_STENCIL, ds.StencilTestMask,
+ opt_ds.stencil.front.compare_mask & 0xff);
+ SET(WM_DEPTH_STENCIL, ds.StencilWriteMask,
+ opt_ds.stencil.front.write_mask & 0xff);
+
+ SET(WM_DEPTH_STENCIL, ds.BackfaceStencilTestMask, opt_ds.stencil.back.compare_mask & 0xff);
+ SET(WM_DEPTH_STENCIL, ds.BackfaceStencilWriteMask, opt_ds.stencil.back.write_mask & 0xff);
+
+ SET(WM_DEPTH_STENCIL, ds.StencilReferenceValue,
+ opt_ds.stencil.front.reference & 0xff);
+ SET(WM_DEPTH_STENCIL, ds.BackfaceStencilReferenceValue,
+ opt_ds.stencil.back.reference & 0xff);
+
+ SET(WM_DEPTH_STENCIL, ds.DepthTestEnable, opt_ds.depth.test_enable);
+ SET(WM_DEPTH_STENCIL, ds.DepthBufferWriteEnable, opt_ds.depth.write_enable);
+ SET(WM_DEPTH_STENCIL, ds.DepthTestFunction,
+ genX(vk_to_intel_compare_op)[opt_ds.depth.compare_op]);
+ SET(WM_DEPTH_STENCIL, ds.StencilTestEnable, opt_ds.stencil.test_enable);
+ SET(WM_DEPTH_STENCIL, ds.StencilBufferWriteEnable, opt_ds.stencil.write_enable);
+ SET(WM_DEPTH_STENCIL, ds.StencilFailOp,
+ genX(vk_to_intel_stencil_op)[opt_ds.stencil.front.op.fail]);
+ SET(WM_DEPTH_STENCIL, ds.StencilPassDepthPassOp,
+ genX(vk_to_intel_stencil_op)[opt_ds.stencil.front.op.pass]);
+ SET(WM_DEPTH_STENCIL, ds.StencilPassDepthFailOp,
+ genX(vk_to_intel_stencil_op)[opt_ds.stencil.front.op.depth_fail]);
+ SET(WM_DEPTH_STENCIL, ds.StencilTestFunction,
+ genX(vk_to_intel_compare_op)[opt_ds.stencil.front.op.compare]);
+ SET(WM_DEPTH_STENCIL, ds.BackfaceStencilFailOp,
+ genX(vk_to_intel_stencil_op)[opt_ds.stencil.back.op.fail]);
+ SET(WM_DEPTH_STENCIL, ds.BackfaceStencilPassDepthPassOp,
+ genX(vk_to_intel_stencil_op)[opt_ds.stencil.back.op.pass]);
+ SET(WM_DEPTH_STENCIL, ds.BackfaceStencilPassDepthFailOp,
+ genX(vk_to_intel_stencil_op)[opt_ds.stencil.back.op.depth_fail]);
+ SET(WM_DEPTH_STENCIL, ds.BackfaceStencilTestFunction,
+ genX(vk_to_intel_compare_op)[opt_ds.stencil.back.op.compare]);
+
+#if GFX_VER == 9
+ const bool pma = want_stencil_pma_fix(cmd_buffer, dyn, &opt_ds);
+ SET(PMA_FIX, pma_fix, pma);
+#endif
+
+#if GFX_VERx10 >= 125
+ if (intel_needs_workaround(cmd_buffer->device->info, 18019816803)) {
+ bool ds_write_state = opt_ds.depth.write_enable || opt_ds.stencil.write_enable;
+ if (gfx->ds_write_state != ds_write_state) {
+ gfx->ds_write_state = ds_write_state;
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_WA_18019816803);
+ }
+ }
+#endif
+ }
+
+#if GFX_VER >= 12
+ if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_DEPTH_BOUNDS_TEST_ENABLE) ||
+ BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_DEPTH_BOUNDS_TEST_BOUNDS)) {
+ SET(DEPTH_BOUNDS, db.DepthBoundsTestEnable, dyn->ds.depth.bounds_test.enable);
+ /* Only look at updating the bounds if testing is enabled */
+ if (dyn->ds.depth.bounds_test.enable) {
+ SET(DEPTH_BOUNDS, db.DepthBoundsTestMinValue, dyn->ds.depth.bounds_test.min);
+ SET(DEPTH_BOUNDS, db.DepthBoundsTestMaxValue, dyn->ds.depth.bounds_test.max);
+ }
+ }
+#endif
+
+ if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_LINE_STIPPLE) ||
+ BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_LINE_STIPPLE_ENABLE)) {
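+ /* The HW wants both the repeat count and its reciprocal; clamp the
+ * factor to at least 1 so the reciprocal never divides by zero.
+ */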
+ SET(LINE_STIPPLE, ls.LineStipplePattern, dyn->rs.line.stipple.pattern);
+ SET(LINE_STIPPLE, ls.LineStippleInverseRepeatCount,
+ 1.0f / MAX2(1, dyn->rs.line.stipple.factor));
+ SET(LINE_STIPPLE, ls.LineStippleRepeatCount, dyn->rs.line.stipple.factor);
+
+ SET(WM, wm.LineStippleEnable, dyn->rs.line.stipple.enable);
+ }
+
+ if ((gfx->dirty & ANV_CMD_DIRTY_RESTART_INDEX) ||
+ BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_IA_PRIMITIVE_RESTART_ENABLE)) {
+ SET(VF, vf.IndexedDrawCutIndexEnable, dyn->ia.primitive_restart_enable);
+ SET(VF, vf.CutIndex, gfx->restart_index);
+ }
+
+ if (gfx->dirty & ANV_CMD_DIRTY_INDEX_BUFFER)
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_INDEX_BUFFER);
+
+#if GFX_VERx10 >= 125
+ if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_IA_PRIMITIVE_RESTART_ENABLE))
+ SET(VFG, vfg.ListCutIndexEnable, dyn->ia.primitive_restart_enable);
+#endif
+
+ if (cmd_buffer->device->vk.enabled_extensions.EXT_sample_locations &&
+ (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_MS_SAMPLE_LOCATIONS) ||
+ BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_MS_SAMPLE_LOCATIONS_ENABLE)))
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_SAMPLE_PATTERN);
+
+ if ((gfx->dirty & ANV_CMD_DIRTY_PIPELINE) ||
+ (gfx->dirty & ANV_CMD_DIRTY_RENDER_TARGETS) ||
+ BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_CB_COLOR_WRITE_ENABLES)) {
+ /* Reprogram 3DSTATE_WM in the hope we can avoid spawning fragment
+ * shader threads.
+ */
+ bool force_thread_dispatch =
+ anv_pipeline_has_stage(pipeline, MESA_SHADER_FRAGMENT) &&
+ (pipeline->force_fragment_thread_dispatch ||
+ anv_cmd_buffer_all_color_write_masked(cmd_buffer));
+ SET(WM, wm.ForceThreadDispatchEnable, force_thread_dispatch ? ForceON : 0);
+ }
+
+ if ((cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_PIPELINE) ||
+ BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_ATTACHMENT_FEEDBACK_LOOP_ENABLE)) {
+ SET_STAGE(PS_EXTRA, ps_extra.PixelShaderKillsPixel,
+ wm_prog_data && (pipeline->rp_has_ds_self_dep ||
+ has_ds_feedback_loop(dyn) ||
+ wm_prog_data->uses_kill),
+ FRAGMENT);
+ }
+
+ if ((gfx->dirty & ANV_CMD_DIRTY_PIPELINE) ||
+ (gfx->dirty & ANV_CMD_DIRTY_RENDER_TARGETS) ||
+ BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_CB_LOGIC_OP) ||
+ BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_CB_COLOR_WRITE_ENABLES) ||
+ BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_CB_LOGIC_OP_ENABLE) ||
+ BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_MS_ALPHA_TO_ONE_ENABLE) ||
+ BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_CB_WRITE_MASKS) ||
+ BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_CB_BLEND_ENABLES) ||
+ BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_CB_BLEND_EQUATIONS)) {
+ const uint8_t color_writes = dyn->cb.color_write_enables;
+ bool has_writeable_rt =
+ anv_pipeline_has_stage(pipeline, MESA_SHADER_FRAGMENT) &&
+ (color_writes & ((1u << gfx->color_att_count) - 1)) != 0;
+
+ SET(BLEND_STATE, blend.AlphaToCoverageEnable,
+ dyn->ms.alpha_to_coverage_enable);
+ SET(BLEND_STATE, blend.AlphaToOneEnable,
+ dyn->ms.alpha_to_one_enable);
+
+ bool independent_alpha_blend = false;
+ /* Wa_14018912822, check if we set these during RT setup. */
+ bool color_blend_zero = false;
+ bool alpha_blend_zero = false;
+ for (uint32_t i = 0; i < MAX_RTS; i++) {
+ /* Disable anything above the current number of color attachments. */
+ bool write_disabled = i >= gfx->color_att_count ||
+ (color_writes & BITFIELD_BIT(i)) == 0;
+
+ SET(BLEND_STATE, blend.rts[i].WriteDisableAlpha,
+ write_disabled ||
+ (dyn->cb.attachments[i].write_mask &
+ VK_COLOR_COMPONENT_A_BIT) == 0);
+ SET(BLEND_STATE, blend.rts[i].WriteDisableRed,
+ write_disabled ||
+ (dyn->cb.attachments[i].write_mask &
+ VK_COLOR_COMPONENT_R_BIT) == 0);
+ SET(BLEND_STATE, blend.rts[i].WriteDisableGreen,
+ write_disabled ||
+ (dyn->cb.attachments[i].write_mask &
+ VK_COLOR_COMPONENT_G_BIT) == 0);
+ SET(BLEND_STATE, blend.rts[i].WriteDisableBlue,
+ write_disabled ||
+ (dyn->cb.attachments[i].write_mask &
+ VK_COLOR_COMPONENT_B_BIT) == 0);
+ /* Vulkan specification 1.2.168, VkLogicOp:
+ *
+ * "Logical operations are controlled by the logicOpEnable and
+ * logicOp members of VkPipelineColorBlendStateCreateInfo. If
+ * logicOpEnable is VK_TRUE, then a logical operation selected by
+ * logicOp is applied between each color attachment and the
+ * fragment’s corresponding output value, and blending of all
+ * attachments is treated as if it were disabled."
+ *
+ * From the Broadwell PRM Volume 2d: Command Reference: Structures:
+ * BLEND_STATE_ENTRY:
+ *
+ * "Enabling LogicOp and Color Buffer Blending at the same time is
+ * UNDEFINED"
+ */
+ SET(BLEND_STATE, blend.rts[i].LogicOpFunction,
+ genX(vk_to_intel_logic_op)[dyn->cb.logic_op]);
+ SET(BLEND_STATE, blend.rts[i].LogicOpEnable, dyn->cb.logic_op_enable);
+
+ SET(BLEND_STATE, blend.rts[i].ColorClampRange, COLORCLAMP_RTFORMAT);
+ SET(BLEND_STATE, blend.rts[i].PreBlendColorClampEnable, true);
+ SET(BLEND_STATE, blend.rts[i].PostBlendColorClampEnable, true);
+
+ /* Setup blend equation. */
+ SET(BLEND_STATE, blend.rts[i].ColorBlendFunction,
+ genX(vk_to_intel_blend_op)[
+ dyn->cb.attachments[i].color_blend_op]);
+ SET(BLEND_STATE, blend.rts[i].AlphaBlendFunction,
+ genX(vk_to_intel_blend_op)[
+ dyn->cb.attachments[i].alpha_blend_op]);
+
+ if (dyn->cb.attachments[i].src_color_blend_factor !=
+ dyn->cb.attachments[i].src_alpha_blend_factor ||
+ dyn->cb.attachments[i].dst_color_blend_factor !=
+ dyn->cb.attachments[i].dst_alpha_blend_factor ||
+ dyn->cb.attachments[i].color_blend_op !=
+ dyn->cb.attachments[i].alpha_blend_op) {
+ independent_alpha_blend = true;
+ }
+
+ /* The Dual Source Blending documentation says:
+ *
+ * "If SRC1 is included in a src/dst blend factor and
+ * a DualSource RT Write message is not used, results
+ * are UNDEFINED. (This reflects the same restriction in DX APIs,
+ * where undefined results are produced if “o1” is not written
+ * by a PS – there are no default values defined)."
+ *
+ * There is no way to gracefully fix this undefined situation
+ * so we just disable the blending to prevent possible issues.
+ */
+ if (wm_prog_data && !wm_prog_data->dual_src_blend &&
+ anv_is_dual_src_blend_equation(&dyn->cb.attachments[i])) {
+ SET(BLEND_STATE, blend.rts[i].ColorBufferBlendEnable, false);
+ } else {
+ SET(BLEND_STATE, blend.rts[i].ColorBufferBlendEnable,
+ !dyn->cb.logic_op_enable &&
+ dyn->cb.attachments[i].blend_enable);
+ }
+
+ /* Our hardware applies the blend factor prior to the blend function
+ * regardless of what function is used. Technically, this means the
+ * hardware can do MORE than GL or Vulkan specify. However, it also
+ * means that, for MIN and MAX, we have to stomp the blend factor to
+ * ONE to make it a no-op.
+ */
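+ /* e.g. with VK_BLEND_OP_MAX the API result is max(src, dst); forcing
+ * both factors to ONE makes the HW compute max(1 * src, 1 * dst),
+ * which matches.
+ */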
+ uint32_t SourceBlendFactor;
+ uint32_t DestinationBlendFactor;
+ uint32_t SourceAlphaBlendFactor;
+ uint32_t DestinationAlphaBlendFactor;
+ if (dyn->cb.attachments[i].color_blend_op == VK_BLEND_OP_MIN ||
+ dyn->cb.attachments[i].color_blend_op == VK_BLEND_OP_MAX) {
+ SourceBlendFactor = BLENDFACTOR_ONE;
+ DestinationBlendFactor = BLENDFACTOR_ONE;
+ } else {
+ SourceBlendFactor = genX(vk_to_intel_blend)[
+ dyn->cb.attachments[i].src_color_blend_factor];
+ DestinationBlendFactor = genX(vk_to_intel_blend)[
+ dyn->cb.attachments[i].dst_color_blend_factor];
+ }
+
+ if (dyn->cb.attachments[i].alpha_blend_op == VK_BLEND_OP_MIN ||
+ dyn->cb.attachments[i].alpha_blend_op == VK_BLEND_OP_MAX) {
+ SourceAlphaBlendFactor = BLENDFACTOR_ONE;
+ DestinationAlphaBlendFactor = BLENDFACTOR_ONE;
+ } else {
+ SourceAlphaBlendFactor = genX(vk_to_intel_blend)[
+ dyn->cb.attachments[i].src_alpha_blend_factor];
+ DestinationAlphaBlendFactor = genX(vk_to_intel_blend)[
+ dyn->cb.attachments[i].dst_alpha_blend_factor];
+ }
+
+ /* Replace any Src1 blend factor with ONE if dual source blending is
+ * not enabled.
+ */
+ if (wm_prog_data && !wm_prog_data->dual_src_blend) {
+ if (is_src1_blend_factor(SourceBlendFactor))
+ SourceBlendFactor = BLENDFACTOR_ONE;
+ if (is_src1_blend_factor(DestinationBlendFactor))
+ DestinationBlendFactor = BLENDFACTOR_ONE;
+ }
+
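+ /* Wa_14018912822: a ZERO destination factor is replaced with
+ * CONST_COLOR/CONST_ALPHA and the blend constant is forced to 0.0 in
+ * CC_STATE below, which is mathematically equivalent.
+ */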
+ if (instance->intel_enable_wa_14018912822 &&
+ intel_needs_workaround(cmd_buffer->device->info, 14018912822) &&
+ dyn->ms.rasterization_samples > 1) {
+ if (DestinationBlendFactor == BLENDFACTOR_ZERO) {
+ DestinationBlendFactor = BLENDFACTOR_CONST_COLOR;
+ color_blend_zero = true;
+ }
+ if (DestinationAlphaBlendFactor == BLENDFACTOR_ZERO) {
+ DestinationAlphaBlendFactor = BLENDFACTOR_CONST_ALPHA;
+ alpha_blend_zero = true;
+ }
+ }
+
+ SET(BLEND_STATE, blend.rts[i].SourceBlendFactor, SourceBlendFactor);
+ SET(BLEND_STATE, blend.rts[i].DestinationBlendFactor, DestinationBlendFactor);
+ SET(BLEND_STATE, blend.rts[i].SourceAlphaBlendFactor, SourceAlphaBlendFactor);
+ SET(BLEND_STATE, blend.rts[i].DestinationAlphaBlendFactor, DestinationAlphaBlendFactor);
+ }
+ gfx->color_blend_zero = color_blend_zero;
+ gfx->alpha_blend_zero = alpha_blend_zero;
+
+ SET(BLEND_STATE, blend.IndependentAlphaBlendEnable, independent_alpha_blend);
+
+ /* 3DSTATE_PS_BLEND to be consistent with the rest of the
+ * BLEND_STATE_ENTRY.
+ */
+ SET(PS_BLEND, ps_blend.HasWriteableRT, has_writeable_rt);
+ SET(PS_BLEND, ps_blend.ColorBufferBlendEnable, GET(blend.rts[0].ColorBufferBlendEnable));
+ SET(PS_BLEND, ps_blend.SourceAlphaBlendFactor, GET(blend.rts[0].SourceAlphaBlendFactor));
+ SET(PS_BLEND, ps_blend.DestinationAlphaBlendFactor, gfx->alpha_blend_zero ?
+ BLENDFACTOR_CONST_ALPHA :
+ GET(blend.rts[0].DestinationAlphaBlendFactor));
+ SET(PS_BLEND, ps_blend.SourceBlendFactor, GET(blend.rts[0].SourceBlendFactor));
+ SET(PS_BLEND, ps_blend.DestinationBlendFactor, gfx->color_blend_zero ?
+ BLENDFACTOR_CONST_COLOR :
+ GET(blend.rts[0].DestinationBlendFactor));
+ SET(PS_BLEND, ps_blend.AlphaTestEnable, false);
+ SET(PS_BLEND, ps_blend.IndependentAlphaBlendEnable, GET(blend.IndependentAlphaBlendEnable));
+ SET(PS_BLEND, ps_blend.AlphaToCoverageEnable, dyn->ms.alpha_to_coverage_enable);
+ }
+
+ if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_CB_BLEND_CONSTANTS)) {
+ SET(CC_STATE, cc.BlendConstantColorRed,
+ gfx->color_blend_zero ? 0.0f : dyn->cb.blend_constants[0]);
+ SET(CC_STATE, cc.BlendConstantColorGreen,
+ gfx->color_blend_zero ? 0.0f : dyn->cb.blend_constants[1]);
+ SET(CC_STATE, cc.BlendConstantColorBlue,
+ gfx->color_blend_zero ? 0.0f : dyn->cb.blend_constants[2]);
+ SET(CC_STATE, cc.BlendConstantColorAlpha,
+ gfx->alpha_blend_zero ? 0.0f : dyn->cb.blend_constants[3]);
+ }
+
+ if ((gfx->dirty & ANV_CMD_DIRTY_RENDER_AREA) ||
+ BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VP_VIEWPORTS) ||
+ BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VP_SCISSORS) ||
+ BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_DEPTH_CLAMP_ENABLE) ||
+ BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VP_DEPTH_CLIP_NEGATIVE_ONE_TO_ONE)) {
+ struct anv_instance *instance = cmd_buffer->device->physical->instance;
+ const VkViewport *viewports = dyn->vp.viewports;
+
+ const float scale = dyn->vp.depth_clip_negative_one_to_one ? 0.5f : 1.0f;
+
+ for (uint32_t i = 0; i < dyn->vp.viewport_count; i++) {
+ const VkViewport *vp = &viewports[i];
+
+ /* The gfx7 state struct has just the matrix and guardband fields, the
+ * gfx8 struct adds the min/max viewport fields. */
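+ /* m00/m11/m22 are the viewport scale terms and m30/m31/m32 the
+ * translation, i.e. the usual NDC-to-window transform (depth scaled
+ * by 0.5 in [-1,1] depth mode).
+ */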
+ struct GENX(SF_CLIP_VIEWPORT) sfv = {
+ .ViewportMatrixElementm00 = vp->width / 2,
+ .ViewportMatrixElementm11 = vp->height / 2,
+ .ViewportMatrixElementm22 = (vp->maxDepth - vp->minDepth) * scale,
+ .ViewportMatrixElementm30 = vp->x + vp->width / 2,
+ .ViewportMatrixElementm31 = vp->y + vp->height / 2,
+ .ViewportMatrixElementm32 = dyn->vp.depth_clip_negative_one_to_one ?
+ (vp->minDepth + vp->maxDepth) * scale : vp->minDepth,
+ .XMinClipGuardband = -1.0f,
+ .XMaxClipGuardband = 1.0f,
+ .YMinClipGuardband = -1.0f,
+ .YMaxClipGuardband = 1.0f,
+ .XMinViewPort = vp->x,
+ .XMaxViewPort = vp->x + vp->width - 1,
+ .YMinViewPort = MIN2(vp->y, vp->y + vp->height),
+ .YMaxViewPort = MAX2(vp->y, vp->y + vp->height) - 1,
+ };
+
+ /* Fix depth test misrenderings by lowering translated depth range */
+ if (instance->lower_depth_range_rate != 1.0f)
+ sfv.ViewportMatrixElementm32 *= instance->lower_depth_range_rate;
+
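+ /* Start from the maximum 16K x 16K framebuffer extent and narrow it
+ * with the render area and scissor; the resulting rectangle feeds the
+ * guardband calculation below.
+ */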
+ const uint32_t fb_size_max = 1 << 14;
+ uint32_t x_min = 0, x_max = fb_size_max;
+ uint32_t y_min = 0, y_max = fb_size_max;
+
+ /* If we have a valid renderArea, include that */
+ if (gfx->render_area.extent.width > 0 &&
+ gfx->render_area.extent.height > 0) {
+ x_min = MAX2(x_min, gfx->render_area.offset.x);
+ x_max = MIN2(x_max, gfx->render_area.offset.x +
+ gfx->render_area.extent.width);
+ y_min = MAX2(y_min, gfx->render_area.offset.y);
+ y_max = MIN2(y_max, gfx->render_area.offset.y +
+ gfx->render_area.extent.height);
+ }
+
+ /* The client is required to have enough scissors for whatever it
+ * sets as ViewportIndex but it's possible that they've got more
+ * viewports set from a previous command. Also, from the Vulkan
+ * 1.3.207 spec:
+ *
+ * "The application must ensure (using scissor if necessary) that
+ * all rendering is contained within the render area."
+ *
+ * If the client doesn't set a scissor, that basically means it
+ * guarantees everything is in-bounds already. If we end up using a
+ * guardband of [-1, 1] in that case, there shouldn't be much loss.
+ * It's theoretically possible that they could do all their clipping
+ * with clip planes but that'd be a bit odd.
+ */
+ if (i < dyn->vp.scissor_count) {
+ const VkRect2D *scissor = &dyn->vp.scissors[i];
+ x_min = MAX2(x_min, scissor->offset.x);
+ x_max = MIN2(x_max, scissor->offset.x + scissor->extent.width);
+ y_min = MAX2(y_min, scissor->offset.y);
+ y_max = MIN2(y_max, scissor->offset.y + scissor->extent.height);
+ }
+
+ /* Only bother calculating the guardband if our known render area is
+ * less than the maximum size. Otherwise, it will calculate [-1, 1]
+ * anyway but possibly with precision loss.
+ */
+ if (x_min > 0 || x_max < fb_size_max ||
+ y_min > 0 || y_max < fb_size_max) {
+ intel_calculate_guardband_size(x_min, x_max, y_min, y_max,
+ sfv.ViewportMatrixElementm00,
+ sfv.ViewportMatrixElementm11,
+ sfv.ViewportMatrixElementm30,
+ sfv.ViewportMatrixElementm31,
+ &sfv.XMinClipGuardband,
+ &sfv.XMaxClipGuardband,
+ &sfv.YMinClipGuardband,
+ &sfv.YMaxClipGuardband);
+ }
+
+#define SET_VP(bit, state, field) \
+ do { \
+ if (hw_state->state.field != sfv.field) { \
+ hw_state->state.field = sfv.field; \
+ BITSET_SET(hw_state->dirty, \
+ ANV_GFX_STATE_##bit); \
+ } \
+ } while (0)
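+ /* Like SET(), but pulls the new value from the locally computed sfv
+ * by field name instead of taking it as a parameter.
+ */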
+ SET_VP(VIEWPORT_SF_CLIP, vp_sf_clip.elem[i], ViewportMatrixElementm00);
+ SET_VP(VIEWPORT_SF_CLIP, vp_sf_clip.elem[i], ViewportMatrixElementm11);
+ SET_VP(VIEWPORT_SF_CLIP, vp_sf_clip.elem[i], ViewportMatrixElementm22);
+ SET_VP(VIEWPORT_SF_CLIP, vp_sf_clip.elem[i], ViewportMatrixElementm30);
+ SET_VP(VIEWPORT_SF_CLIP, vp_sf_clip.elem[i], ViewportMatrixElementm31);
+ SET_VP(VIEWPORT_SF_CLIP, vp_sf_clip.elem[i], ViewportMatrixElementm32);
+ SET_VP(VIEWPORT_SF_CLIP, vp_sf_clip.elem[i], XMinClipGuardband);
+ SET_VP(VIEWPORT_SF_CLIP, vp_sf_clip.elem[i], XMaxClipGuardband);
+ SET_VP(VIEWPORT_SF_CLIP, vp_sf_clip.elem[i], YMinClipGuardband);
+ SET_VP(VIEWPORT_SF_CLIP, vp_sf_clip.elem[i], YMaxClipGuardband);
+ SET_VP(VIEWPORT_SF_CLIP, vp_sf_clip.elem[i], XMinViewPort);
+ SET_VP(VIEWPORT_SF_CLIP, vp_sf_clip.elem[i], XMaxViewPort);
+ SET_VP(VIEWPORT_SF_CLIP, vp_sf_clip.elem[i], YMinViewPort);
+ SET_VP(VIEWPORT_SF_CLIP, vp_sf_clip.elem[i], YMaxViewPort);
+#undef SET_VP
+
+ const bool depth_range_unrestricted =
+ cmd_buffer->device->vk.enabled_extensions.EXT_depth_range_unrestricted;
+
+ float min_depth_limit = depth_range_unrestricted ? -FLT_MAX : 0.0;
+ float max_depth_limit = depth_range_unrestricted ? FLT_MAX : 1.0;
+
+ float min_depth = dyn->rs.depth_clamp_enable ?
+ MIN2(vp->minDepth, vp->maxDepth) : min_depth_limit;
+ float max_depth = dyn->rs.depth_clamp_enable ?
+ MAX2(vp->minDepth, vp->maxDepth) : max_depth_limit;
+
+ SET(VIEWPORT_CC, vp_cc.elem[i].MinimumDepth, min_depth);
+ SET(VIEWPORT_CC, vp_cc.elem[i].MaximumDepth, max_depth);
+
+ SET(CLIP, clip.MaximumVPIndex, dyn->vp.viewport_count > 0 ?
+ dyn->vp.viewport_count - 1 : 0);
+ }
+
+ /* If the HW state is already considered dirty or the previously
+ * programmed viewport count is smaller than what we need, update the
+ * viewport count and ensure the HW state is dirty. Otherwise, if the
+ * number of viewports programmed previously was larger than what we
+ * need now, there is no need to reemit; we can keep the old programmed
+ * values.
+ */
+ if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_VIEWPORT_SF_CLIP) ||
+ hw_state->vp_sf_clip.count < dyn->vp.viewport_count) {
+ hw_state->vp_sf_clip.count = dyn->vp.viewport_count;
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_VIEWPORT_SF_CLIP);
+ }
+ if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_VIEWPORT_CC) ||
+ hw_state->vp_cc.count < dyn->vp.viewport_count) {
+ hw_state->vp_cc.count = dyn->vp.viewport_count;
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_VIEWPORT_CC);
+ }
+ }
+
+ if ((gfx->dirty & ANV_CMD_DIRTY_RENDER_AREA) ||
+ BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VP_SCISSORS) ||
+ BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VP_VIEWPORTS)) {
+ const VkRect2D *scissors = dyn->vp.scissors;
+ const VkViewport *viewports = dyn->vp.viewports;
+
+ for (uint32_t i = 0; i < dyn->vp.scissor_count; i++) {
+ const VkRect2D *s = &scissors[i];
+ const VkViewport *vp = &viewports[i];
+
+ const int max = 0xffff;
+
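+ /* Intersect the scissor rectangle with the viewport; the MIN2/MAX2 on
+ * Y handles negative-height (flipped) viewports.
+ */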
+ uint32_t y_min = MAX2(s->offset.y, MIN2(vp->y, vp->y + vp->height));
+ uint32_t x_min = MAX2(s->offset.x, vp->x);
+ int64_t y_max = MIN2(s->offset.y + s->extent.height - 1,
+ MAX2(vp->y, vp->y + vp->height) - 1);
+ int64_t x_max = MIN2(s->offset.x + s->extent.width - 1,
+ vp->x + vp->width - 1);
+
+ y_max = CLAMP(y_max, 0, INT16_MAX >> 1);
+ x_max = CLAMP(x_max, 0, INT16_MAX >> 1);
+
+ /* Do this math using int64_t so overflow gets clamped correctly. */
+ if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY) {
+ y_min = CLAMP((uint64_t) y_min, gfx->render_area.offset.y, max);
+ x_min = CLAMP((uint64_t) x_min, gfx->render_area.offset.x, max);
+ y_max = CLAMP((uint64_t) y_max, 0,
+ gfx->render_area.offset.y +
+ gfx->render_area.extent.height - 1);
+ x_max = CLAMP((uint64_t) x_max, 0,
+ gfx->render_area.offset.x +
+ gfx->render_area.extent.width - 1);
+ }
+
+ if (s->extent.width <= 0 || s->extent.height <= 0) {
+ /* Since xmax and ymax are inclusive, we have to have xmax < xmin
+ * or ymax < ymin for empty clips. In case clip x, y, width, height
+ * are all 0, the clamps above produce 0 for xmin, ymin, xmax,
+ * ymax, which isn't what we want. Just special-case empty clips
+ * and produce a canonical empty clip.
+ */
+ SET(SCISSOR, scissor.elem[i].ScissorRectangleYMin, 1);
+ SET(SCISSOR, scissor.elem[i].ScissorRectangleXMin, 1);
+ SET(SCISSOR, scissor.elem[i].ScissorRectangleYMax, 0);
+ SET(SCISSOR, scissor.elem[i].ScissorRectangleXMax, 0);
+ } else {
+ SET(SCISSOR, scissor.elem[i].ScissorRectangleYMin, y_min);
+ SET(SCISSOR, scissor.elem[i].ScissorRectangleXMin, x_min);
+ SET(SCISSOR, scissor.elem[i].ScissorRectangleYMax, y_max);
+ SET(SCISSOR, scissor.elem[i].ScissorRectangleXMax, x_max);
+ }
+ }
+
+ /* If the HW state is already considered dirty or the previously
+ * programmed scissor count is smaller than what we need, update the
+ * scissor count and ensure the HW state is dirty. Otherwise, if the
+ * number of scissors programmed previously was larger than what we
+ * need now, there is no need to reemit; we can keep the old programmed
+ * values.
+ */
+ if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_SCISSOR) ||
+ hw_state->scissor.count < dyn->vp.scissor_count) {
+ hw_state->scissor.count = dyn->vp.scissor_count;
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_SCISSOR);
+ }
+ }
+
+#if GFX_VERx10 == 125
+ if ((cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_RENDER_TARGETS)) {
+ unsigned fb_width, fb_height, tile_width, tile_height;
+
+ if (cmd_buffer->device->physical->instance->enable_tbimr &&
+ calculate_render_area(cmd_buffer, &fb_width, &fb_height) &&
+ calculate_tile_dimensions(cmd_buffer, fb_width, fb_height,
+ &tile_width, &tile_height)) {
+ /* Use a batch size of 128 polygons per slice as recommended
+ * by BSpec 68436 "TBIMR Programming".
+ */
+ const unsigned num_slices = cmd_buffer->device->info->num_slices;
+ const unsigned batch_size = DIV_ROUND_UP(num_slices, 2) * 256;
+
+ SET(TBIMR_TILE_PASS_INFO, tbimr.TileRectangleHeight, tile_height);
+ SET(TBIMR_TILE_PASS_INFO, tbimr.TileRectangleWidth, tile_width);
+ SET(TBIMR_TILE_PASS_INFO, tbimr.VerticalTileCount,
+ DIV_ROUND_UP(fb_height, tile_height));
+ SET(TBIMR_TILE_PASS_INFO, tbimr.HorizontalTileCount,
+ DIV_ROUND_UP(fb_width, tile_width));
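+ /* The batch size field is encoded as log2(size) - 5, i.e. 32 << n
+ * polygons.
+ */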
+ SET(TBIMR_TILE_PASS_INFO, tbimr.TBIMRBatchSize,
+ util_logbase2(batch_size) - 5);
+ SET(TBIMR_TILE_PASS_INFO, tbimr.TileBoxCheck, true);
+ SET(TBIMR_TILE_PASS_INFO, use_tbimr, true);
+ } else {
+ hw_state->use_tbimr = false;
+ }
+ }
+#endif
+
+ struct anv_push_constants *push = &cmd_buffer->state.gfx.base.push_constants;
+
+ /* If the pipeline uses a dynamic value for patch_control_points and
+ * either the pipeline or the dynamic value changed, check the value
+ * and reemit if needed.
+ */
+ if (pipeline->dynamic_patch_control_points &&
+ ((gfx->dirty & ANV_CMD_DIRTY_PIPELINE) ||
+ BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_TS_PATCH_CONTROL_POINTS)) &&
+ push->gfx.tcs_input_vertices != dyn->ts.patch_control_points) {
+ push->gfx.tcs_input_vertices = dyn->ts.patch_control_points;
+ cmd_buffer->state.push_constants_dirty |= VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT;
+ gfx->base.push_constants_data_dirty = true;
+ }
+
+#undef GET
+#undef SET
+#undef SET_STAGE
+
+ vk_dynamic_graphics_state_clear_dirty(&cmd_buffer->vk.dynamic_graphics_state);
+}
+
+static void
+emit_wa_18020335297_dummy_draw(struct anv_cmd_buffer *cmd_buffer)
+{
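+ /* Emit a throw-away draw per slice with clipping set to reject all
+ * geometry, so nothing reaches the rasterizer; used by the
+ * Wa_18020335297 handling when the CC viewport pointer is reprogrammed
+ * (see cmd_buffer_flush_gfx_hw_state).
+ */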
+#if GFX_VERx10 >= 125
+ anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_VFG), vfg) {
+ vfg.DistributionMode = RR_STRICT;
+ }
+ anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_VF), vf) {
+ vf.GeometryDistributionEnable = true;
+ }
+#endif
+
+#if GFX_VER >= 12
+ anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_PRIMITIVE_REPLICATION), pr) {
+ pr.ReplicaMask = 1;
+ }
+#endif
+
+ anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_RASTER), rr) {
+ rr.CullMode = CULLMODE_NONE;
+ rr.FrontFaceFillMode = FILL_MODE_SOLID;
+ rr.BackFaceFillMode = FILL_MODE_SOLID;
+ }
+
+ anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_VF_STATISTICS), zero);
+ anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_VF_SGVS), zero);
+
+#if GFX_VER >= 11
+ anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_VF_SGVS_2), zero);
+#endif
+
+ anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CLIP), clip) {
+ clip.ClipEnable = true;
+ clip.ClipMode = CLIPMODE_REJECT_ALL;
+ }
+
+ anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_VS), zero);
+ anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_GS), zero);
+ anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_HS), zero);
+ anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_TE), zero);
+ anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_DS), zero);
+ anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_STREAMOUT), zero);
+
+ uint32_t *vertex_elements = anv_batch_emitn(&cmd_buffer->batch, 1 + 2 * 2,
+ GENX(3DSTATE_VERTEX_ELEMENTS));
+ uint32_t *ve_pack_dest = &vertex_elements[1];
+
+ for (int i = 0; i < 2; i++) {
+ struct GENX(VERTEX_ELEMENT_STATE) element = {
+ .Valid = true,
+ .SourceElementFormat = ISL_FORMAT_R32G32B32A32_FLOAT,
+ .Component0Control = VFCOMP_STORE_0,
+ .Component1Control = VFCOMP_STORE_0,
+ .Component2Control = i == 0 ? VFCOMP_STORE_0 : VFCOMP_STORE_1_FP,
+ .Component3Control = i == 0 ? VFCOMP_STORE_0 : VFCOMP_STORE_1_FP,
+ };
+ GENX(VERTEX_ELEMENT_STATE_pack)(NULL, ve_pack_dest, &element);
+ ve_pack_dest += GENX(VERTEX_ELEMENT_STATE_length);
+ }
+
+ anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_VF_TOPOLOGY), topo) {
+ topo.PrimitiveTopologyType = _3DPRIM_TRILIST;
+ }
+
+ /* Emit dummy draw per slice. */
+ for (unsigned i = 0; i < cmd_buffer->device->info->num_slices; i++) {
+ anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
+ prim.VertexCountPerInstance = 3;
+ prim.PrimitiveTopologyType = _3DPRIM_TRILIST;
+ prim.InstanceCount = 1;
+ prim.VertexAccessType = SEQUENTIAL;
+ }
+ }
+}
+/**
+ * This function handles dirty state emission to the batch buffer.
+ */
+static void
+cmd_buffer_gfx_state_emission(struct anv_cmd_buffer *cmd_buffer)
+{
+ struct anv_device *device = cmd_buffer->device;
+ struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
+ struct anv_graphics_pipeline *pipeline =
+ anv_pipeline_to_graphics(gfx->base.pipeline);
+ const struct vk_dynamic_graphics_state *dyn =
+ &cmd_buffer->vk.dynamic_graphics_state;
+ struct anv_gfx_dynamic_state *hw_state = &gfx->dyn_state;
+
+ if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_URB)) {
+ genX(urb_workaround)(cmd_buffer, &pipeline->urb_cfg);
+
+ anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline, final.urb);
+
+ memcpy(&gfx->urb_cfg, &pipeline->urb_cfg,
+ sizeof(struct intel_urb_config));
+ }
+
+ if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_PRIMITIVE_REPLICATION))
+ anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline, final.primitive_replication);
+
+ if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_VF_SGVS_INSTANCING))
+ anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline, final.vf_sgvs_instancing);
+
+ if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_VF_SGVS))
+ anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline, final.vf_sgvs);
+
+#if GFX_VER >= 11
+ if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_VF_SGVS_2))
+ anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline, final.vf_sgvs_2);
+#endif
+
+ if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_VS))
+ anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline, final.vs);
+
+ if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_HS))
+ anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline, final.hs);
+
+ if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_DS))
+ anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline, final.ds);
+
+ if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_VF_STATISTICS))
+ anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline, final.vf_statistics);
+
+ if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_SBE))
+ anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline, final.sbe);
+
+ if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_SBE_SWIZ))
+ anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline, final.sbe_swiz);
+
+ if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_SO_DECL_LIST)) {
+ /* Wa_16011773973:
+ * If SOL is enabled and SO_DECL state has to be programmed,
+ * 1. Send 3D State SOL state with SOL disabled
+ * 2. Send SO_DECL NP state
+ * 3. Send 3D State SOL with SOL Enabled
+ */
+ if (intel_needs_workaround(device->info, 16011773973) &&
+ pipeline->uses_xfb)
+ anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_STREAMOUT), so);
+
+ anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline,
+ final.so_decl_list);
+
+#if GFX_VER >= 11 && GFX_VER < 20
+ /* ICL PRMs, Volume 2a - Command Reference: Instructions,
+ * 3DSTATE_SO_DECL_LIST:
+ *
+ * "Workaround: This command must be followed by a PIPE_CONTROL with
+ * CS Stall bit set."
+ *
+ * On DG2+ also known as Wa_1509820217.
+ */
+ genx_batch_emit_pipe_control(&cmd_buffer->batch, device->info,
+ cmd_buffer->state.current_pipeline,
+ ANV_PIPE_CS_STALL_BIT);
+#endif
+ }
+
+ if (device->vk.enabled_extensions.EXT_mesh_shader) {
+ if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_MESH_CONTROL))
+ anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline, final.mesh_control);
+
+ if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_MESH_SHADER))
+ anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline, final.mesh_shader);
+
+ if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_MESH_DISTRIB))
+ anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline, final.mesh_distrib);
+
+ if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_TASK_CONTROL))
+ anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline, final.task_control);
+
+ if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_TASK_SHADER))
+ anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline, final.task_shader);
+
+ if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_TASK_REDISTRIB))
+ anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline, final.task_redistrib);
+
+ if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_SBE_MESH))
+ anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline, final.sbe_mesh);
+
+ if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_CLIP_MESH))
+ anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline, final.clip_mesh);
+ } else {
+ assert(!BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_MESH_CONTROL) &&
+ !BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_MESH_SHADER) &&
+ !BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_MESH_DISTRIB) &&
+ !BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_TASK_CONTROL) &&
+ !BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_TASK_SHADER) &&
+ !BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_TASK_REDISTRIB) &&
+ !BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_CLIP_MESH) &&
+ !BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_SBE_MESH));
+ }
+
+#define INIT(category, name) \
+ .name = hw_state->category.name
+#define SET(s, category, name) \
+ s.name = hw_state->category.name
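+ /* INIT() is used inside designated initializers and SET() inside
+ * anv_batch_emit*() blocks; both copy the value tracked in hw_state
+ * into the HW packet field of the same name.
+ */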
+
+ /* Now the potentially dynamic instructions */
+
+ if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_PS)) {
+ anv_batch_emit_merge(&cmd_buffer->batch, GENX(3DSTATE_PS),
+ pipeline, partial.ps, ps) {
+ SET(ps, ps, KernelStartPointer0);
+ SET(ps, ps, KernelStartPointer1);
+ SET(ps, ps, DispatchGRFStartRegisterForConstantSetupData0);
+ SET(ps, ps, DispatchGRFStartRegisterForConstantSetupData1);
+
+#if GFX_VER < 20
+ SET(ps, ps, KernelStartPointer2);
+ SET(ps, ps, DispatchGRFStartRegisterForConstantSetupData2);
+
+ SET(ps, ps, _8PixelDispatchEnable);
+ SET(ps, ps, _16PixelDispatchEnable);
+ SET(ps, ps, _32PixelDispatchEnable);
+#else
+ SET(ps, ps, Kernel0Enable);
+ SET(ps, ps, Kernel1Enable);
+ SET(ps, ps, Kernel0SIMDWidth);
+ SET(ps, ps, Kernel1SIMDWidth);
+ SET(ps, ps, Kernel0PolyPackingPolicy);
+#endif
+ SET(ps, ps, PositionXYOffsetSelect);
+ }
+ }
+
+ if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_PS_EXTRA)) {
+ anv_batch_emit_merge(&cmd_buffer->batch, GENX(3DSTATE_PS_EXTRA),
+ pipeline, partial.ps_extra, pse) {
+ SET(pse, ps_extra, PixelShaderIsPerSample);
+#if GFX_VER >= 11
+ SET(pse, ps_extra, PixelShaderIsPerCoarsePixel);
+#endif
+#if GFX_VERx10 >= 125
+ SET(pse, ps_extra, EnablePSDependencyOnCPsizeChange);
+#endif
+ SET(pse, ps_extra, PixelShaderKillsPixel);
+ }
+ }
+
+ if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_CLIP)) {
+ anv_batch_emit_merge(&cmd_buffer->batch, GENX(3DSTATE_CLIP),
+ pipeline, partial.clip, clip) {
+ SET(clip, clip, APIMode);
+ SET(clip, clip, ViewportXYClipTestEnable);
+ SET(clip, clip, TriangleStripListProvokingVertexSelect);
+ SET(clip, clip, LineStripListProvokingVertexSelect);
+ SET(clip, clip, TriangleFanProvokingVertexSelect);
+ SET(clip, clip, MaximumVPIndex);
+ }
+ }
+
+ if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_STREAMOUT)) {
+ genX(streamout_prologue)(cmd_buffer);
+
+ anv_batch_emit_merge(&cmd_buffer->batch, GENX(3DSTATE_STREAMOUT),
+ pipeline, partial.so, so) {
+ SET(so, so, RenderingDisable);
+ SET(so, so, RenderStreamSelect);
+ SET(so, so, ReorderMode);
+ SET(so, so, ForceRendering);
+ }
+ }
+
+ if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_VIEWPORT_SF_CLIP)) {
+ struct anv_state sf_clip_state =
+ anv_cmd_buffer_alloc_dynamic_state(cmd_buffer,
+ hw_state->vp_sf_clip.count * 64, 64);
+
+ for (uint32_t i = 0; i < hw_state->vp_sf_clip.count; i++) {
+ struct GENX(SF_CLIP_VIEWPORT) sfv = {
+ INIT(vp_sf_clip.elem[i], ViewportMatrixElementm00),
+ INIT(vp_sf_clip.elem[i], ViewportMatrixElementm11),
+ INIT(vp_sf_clip.elem[i], ViewportMatrixElementm22),
+ INIT(vp_sf_clip.elem[i], ViewportMatrixElementm30),
+ INIT(vp_sf_clip.elem[i], ViewportMatrixElementm31),
+ INIT(vp_sf_clip.elem[i], ViewportMatrixElementm32),
+ INIT(vp_sf_clip.elem[i], XMinClipGuardband),
+ INIT(vp_sf_clip.elem[i], XMaxClipGuardband),
+ INIT(vp_sf_clip.elem[i], YMinClipGuardband),
+ INIT(vp_sf_clip.elem[i], YMaxClipGuardband),
+ INIT(vp_sf_clip.elem[i], XMinViewPort),
+ INIT(vp_sf_clip.elem[i], XMaxViewPort),
+ INIT(vp_sf_clip.elem[i], YMinViewPort),
+ INIT(vp_sf_clip.elem[i], YMaxViewPort),
+ };
+ GENX(SF_CLIP_VIEWPORT_pack)(NULL, sf_clip_state.map + i * 64, &sfv);
+ }
+
+ anv_batch_emit(&cmd_buffer->batch,
+ GENX(3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP), clip) {
+ clip.SFClipViewportPointer = sf_clip_state.offset;
+ }
+ }
+
+ if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_VIEWPORT_CC)) {
+ hw_state->vp_cc.state =
+ anv_cmd_buffer_alloc_dynamic_state(cmd_buffer,
+ hw_state->vp_cc.count * 8, 32);
+
+ for (uint32_t i = 0; i < hw_state->vp_cc.count; i++) {
+ struct GENX(CC_VIEWPORT) cc_viewport = {
+ INIT(vp_cc.elem[i], MinimumDepth),
+ INIT(vp_cc.elem[i], MaximumDepth),
+ };
+ GENX(CC_VIEWPORT_pack)(NULL, hw_state->vp_cc.state.map + i * 8,
+ &cc_viewport);
+ }
+
+ /* Dirty the pointers to reemit 3DSTATE_VIEWPORT_STATE_POINTERS_CC below
+ */
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_VIEWPORT_CC_PTR);
+ }
+
+ if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_VIEWPORT_CC_PTR)) {
+ anv_batch_emit(&cmd_buffer->batch,
+ GENX(3DSTATE_VIEWPORT_STATE_POINTERS_CC), cc) {
+ cc.CCViewportPointer = hw_state->vp_cc.state.offset;
+ }
+ cmd_buffer->state.gfx.viewport_set = true;
+ }
+
+ if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_SCISSOR)) {
+ /* Wa_1409725701:
+ *
+ * "The viewport-specific state used by the SF unit (SCISSOR_RECT) is
+ * stored as an array of up to 16 elements. The location of first
+ * element of the array, as specified by Pointer to SCISSOR_RECT,
+ * should be aligned to a 64-byte boundary."
+ */
+ struct anv_state scissor_state =
+ anv_cmd_buffer_alloc_dynamic_state(cmd_buffer,
+ hw_state->scissor.count * 8, 64);
+
+ for (uint32_t i = 0; i < hw_state->scissor.count; i++) {
+ struct GENX(SCISSOR_RECT) scissor = {
+ INIT(scissor.elem[i], ScissorRectangleYMin),
+ INIT(scissor.elem[i], ScissorRectangleXMin),
+ INIT(scissor.elem[i], ScissorRectangleYMax),
+ INIT(scissor.elem[i], ScissorRectangleXMax),
+ };
+ GENX(SCISSOR_RECT_pack)(NULL, scissor_state.map + i * 8, &scissor);
+ }
+
+ anv_batch_emit(&cmd_buffer->batch,
+ GENX(3DSTATE_SCISSOR_STATE_POINTERS), ssp) {
+ ssp.ScissorRectPointer = scissor_state.offset;
+ }
+ }
+
+ if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_VF_TOPOLOGY)) {
+ anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_VF_TOPOLOGY), vft) {
+ SET(vft, vft, PrimitiveTopologyType);
+ }
+ }
+
+ if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_VERTEX_INPUT)) {
+ const uint32_t ve_count =
+ pipeline->vs_input_elements + pipeline->svgs_count;
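+ /* One header DWord plus two DWords per VERTEX_ELEMENT_STATE; always
+ * allocate room for at least one element.
+ */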
+ const uint32_t num_dwords = 1 + 2 * MAX2(1, ve_count);
+ uint32_t *p = anv_batch_emitn(&cmd_buffer->batch, num_dwords,
+ GENX(3DSTATE_VERTEX_ELEMENTS));
+
+ if (p) {
+ if (ve_count == 0) {
+ memcpy(p + 1, cmd_buffer->device->empty_vs_input,
+ sizeof(cmd_buffer->device->empty_vs_input));
+ } else if (ve_count == pipeline->vertex_input_elems) {
+ /* MESA_VK_DYNAMIC_VI is not dynamic for this pipeline, so
+ * everything is in pipeline->vertex_input_data and we can just
+ * memcpy
+ */
+ memcpy(p + 1, pipeline->vertex_input_data, 4 * 2 * ve_count);
+ anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline,
+ final.vf_instancing);
+ } else {
+ assert(pipeline->final.vf_instancing.len == 0);
+ /* Use dyn->vi to emit the dynamic VERTEX_ELEMENT_STATE input. */
+ genX(emit_vertex_input)(&cmd_buffer->batch, p + 1,
+ pipeline, dyn->vi, false /* emit_in_pipeline */);
+ /* Then append the VERTEX_ELEMENT_STATE for the draw parameters */
+ memcpy(p + 1 + 2 * pipeline->vs_input_elements,
+ pipeline->vertex_input_data,
+ 4 * 2 * pipeline->vertex_input_elems);
+ }
+ }
+ }
+
+ if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_TE)) {
+ anv_batch_emit_merge(&cmd_buffer->batch, GENX(3DSTATE_TE),
+ pipeline, partial.te, te) {
+ SET(te, te, OutputTopology);
+ }
+ }
+
+ if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_GS)) {
+ anv_batch_emit_merge(&cmd_buffer->batch, GENX(3DSTATE_GS),
+ pipeline, partial.gs, gs) {
+ SET(gs, gs, ReorderMode);
+ }
+ }
+
+ if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_CPS)) {
+#if GFX_VER == 11
+ anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CPS), cps) {
+ SET(cps, cps, CoarsePixelShadingMode);
+ SET(cps, cps, MinCPSizeX);
+ SET(cps, cps, MinCPSizeY);
+ }
+#elif GFX_VER >= 12
+ /* TODO: we can optimize this flush in the following cases:
+ *
+ * In the case where the last geometry shader emits a value that is
+ * not constant, we can avoid this stall because we can synchronize
+ * the pixel shader internally with
+ * 3DSTATE_PS::EnablePSDependencyOnCPsizeChange.
+ *
+ * If we know that the previous pipeline and the current one are
+ * using the same fragment shading rate.
+ */
+ anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
+#if GFX_VERx10 >= 125
+ pc.PSSStallSyncEnable = true;
+#else
+ pc.PSDSyncEnable = true;
+#endif
+ }
+
+ anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CPS_POINTERS), cps) {
+ SET(cps, cps, CoarsePixelShadingStateArrayPointer);
+ }
+#endif
+ }
+
+ if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_SF)) {
+ anv_batch_emit_merge(&cmd_buffer->batch, GENX(3DSTATE_SF),
+ pipeline, partial.sf, sf) {
+ SET(sf, sf, LineWidth);
+ SET(sf, sf, TriangleStripListProvokingVertexSelect);
+ SET(sf, sf, LineStripListProvokingVertexSelect);
+ SET(sf, sf, TriangleFanProvokingVertexSelect);
+ SET(sf, sf, LegacyGlobalDepthBiasEnable);
+ }
+ }
+
+ if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_RASTER)) {
+ anv_batch_emit_merge(&cmd_buffer->batch, GENX(3DSTATE_RASTER),
+ pipeline, partial.raster, raster) {
+ SET(raster, raster, APIMode);
+ SET(raster, raster, DXMultisampleRasterizationEnable);
+ SET(raster, raster, AntialiasingEnable);
+ SET(raster, raster, CullMode);
+ SET(raster, raster, FrontWinding);
+ SET(raster, raster, GlobalDepthOffsetEnableSolid);
+ SET(raster, raster, GlobalDepthOffsetEnableWireframe);
+ SET(raster, raster, GlobalDepthOffsetEnablePoint);
+ SET(raster, raster, GlobalDepthOffsetConstant);
+ SET(raster, raster, GlobalDepthOffsetScale);
+ SET(raster, raster, GlobalDepthOffsetClamp);
+ SET(raster, raster, FrontFaceFillMode);
+ SET(raster, raster, BackFaceFillMode);
+ SET(raster, raster, ViewportZFarClipTestEnable);
+ SET(raster, raster, ViewportZNearClipTestEnable);
+ SET(raster, raster, ConservativeRasterizationEnable);
+ }
+ }
+
+ if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_MULTISAMPLE)) {
+ anv_batch_emit_merge(&cmd_buffer->batch, GENX(3DSTATE_MULTISAMPLE),
+ pipeline, partial.ms, ms) {
+ SET(ms, ms, NumberofMultisamples);
+ }
+ }
+
+ if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_CC_STATE)) {
+ hw_state->cc.state =
+ anv_cmd_buffer_alloc_dynamic_state(cmd_buffer,
+ GENX(COLOR_CALC_STATE_length) * 4,
+ 64);
+ struct GENX(COLOR_CALC_STATE) cc = {
+ INIT(cc, BlendConstantColorRed),
+ INIT(cc, BlendConstantColorGreen),
+ INIT(cc, BlendConstantColorBlue),
+ INIT(cc, BlendConstantColorAlpha),
+ };
+ GENX(COLOR_CALC_STATE_pack)(NULL, hw_state->cc.state.map, &cc);
+
+ /* Dirty the pointers to reemit 3DSTATE_CC_STATE_POINTERS below
+ */
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_CC_STATE_PTR);
+ }
+
+ if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_CC_STATE_PTR)) {
+ anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CC_STATE_POINTERS), ccp) {
+ ccp.ColorCalcStatePointer = hw_state->cc.state.offset;
+ ccp.ColorCalcStatePointerValid = true;
+ }
+ }
+
+ if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_SAMPLE_MASK)) {
+ anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_SAMPLE_MASK), sm) {
+ SET(sm, sm, SampleMask);
+ }
+ }
+
+ if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_WM_DEPTH_STENCIL)) {
+ anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_WM_DEPTH_STENCIL), ds) {
+ SET(ds, ds, DoubleSidedStencilEnable);
+ SET(ds, ds, StencilTestMask);
+ SET(ds, ds, StencilWriteMask);
+ SET(ds, ds, BackfaceStencilTestMask);
+ SET(ds, ds, BackfaceStencilWriteMask);
+ SET(ds, ds, StencilReferenceValue);
+ SET(ds, ds, BackfaceStencilReferenceValue);
+ SET(ds, ds, DepthTestEnable);
+ SET(ds, ds, DepthBufferWriteEnable);
+ SET(ds, ds, DepthTestFunction);
+ SET(ds, ds, StencilTestEnable);
+ SET(ds, ds, StencilBufferWriteEnable);
+ SET(ds, ds, StencilFailOp);
+ SET(ds, ds, StencilPassDepthPassOp);
+ SET(ds, ds, StencilPassDepthFailOp);
+ SET(ds, ds, StencilTestFunction);
+ SET(ds, ds, BackfaceStencilFailOp);
+ SET(ds, ds, BackfaceStencilPassDepthPassOp);
+ SET(ds, ds, BackfaceStencilPassDepthFailOp);
+ SET(ds, ds, BackfaceStencilTestFunction);
+ }
+ }
+
+#if GFX_VER >= 12
+ if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_DEPTH_BOUNDS)) {
+ anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_DEPTH_BOUNDS), db) {
+ SET(db, db, DepthBoundsTestEnable);
+ SET(db, db, DepthBoundsTestMinValue);
+ SET(db, db, DepthBoundsTestMaxValue);
+ }
+ }
+#endif
+
+ if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_LINE_STIPPLE)) {
+ anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_LINE_STIPPLE), ls) {
+ SET(ls, ls, LineStipplePattern);
+ SET(ls, ls, LineStippleInverseRepeatCount);
+ SET(ls, ls, LineStippleRepeatCount);
+ }
+#if GFX_VER >= 11
+ /* ICL PRMs, Volume 2a - Command Reference: Instructions,
+ * 3DSTATE_LINE_STIPPLE:
+ *
+ * "Workaround: This command must be followed by a PIPE_CONTROL with
+ * CS Stall bit set."
+ */
+ genx_batch_emit_pipe_control(&cmd_buffer->batch,
+ cmd_buffer->device->info,
+ cmd_buffer->state.current_pipeline,
+ ANV_PIPE_CS_STALL_BIT);
+#endif
+ }
+
+ if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_VF)) {
+ anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_VF), vf) {
+#if GFX_VERx10 >= 125
+ vf.GeometryDistributionEnable = true;
+#endif
+ SET(vf, vf, IndexedDrawCutIndexEnable);
+ SET(vf, vf, CutIndex);
+ }
+ }
+
+ if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_INDEX_BUFFER)) {
+ struct anv_buffer *buffer = gfx->index_buffer;
+ uint32_t offset = gfx->index_offset;
+ anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_INDEX_BUFFER), ib) {
+ ib.IndexFormat = gfx->index_type;
+ ib.MOCS = anv_mocs(cmd_buffer->device,
+ buffer ? buffer->address.bo : NULL,
+ ISL_SURF_USAGE_INDEX_BUFFER_BIT);
+#if GFX_VER >= 12
+ ib.L3BypassDisable = true;
+#endif
+ if (buffer) {
+ ib.BufferStartingAddress = anv_address_add(buffer->address, offset);
+ ib.BufferSize = gfx->index_size;
+ }
+ }
+ }
+
+#if GFX_VERx10 >= 125
+ if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_VFG)) {
+ anv_batch_emit_merge(&cmd_buffer->batch, GENX(3DSTATE_VFG),
+ pipeline, partial.vfg, vfg) {
+ SET(vfg, vfg, ListCutIndexEnable);
+ }
+ }
+#endif
+
+ if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_SAMPLE_PATTERN)) {
+ genX(emit_sample_pattern)(&cmd_buffer->batch,
+ dyn->ms.sample_locations_enable ?
+ dyn->ms.sample_locations : NULL);
+ }
+
+ if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_WM)) {
+ anv_batch_emit_merge(&cmd_buffer->batch, GENX(3DSTATE_WM),
+ pipeline, partial.wm, wm) {
+ SET(wm, wm, ForceThreadDispatchEnable);
+ SET(wm, wm, LineStippleEnable);
+ SET(wm, wm, BarycentricInterpolationMode);
+ }
+ }
+
+ if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_PS_BLEND)) {
+ anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_PS_BLEND), blend) {
+ SET(blend, ps_blend, HasWriteableRT);
+ SET(blend, ps_blend, ColorBufferBlendEnable);
+ SET(blend, ps_blend, SourceAlphaBlendFactor);
+ SET(blend, ps_blend, DestinationAlphaBlendFactor);
+ SET(blend, ps_blend, SourceBlendFactor);
+ SET(blend, ps_blend, DestinationBlendFactor);
+ SET(blend, ps_blend, AlphaTestEnable);
+ SET(blend, ps_blend, IndependentAlphaBlendEnable);
+ SET(blend, ps_blend, AlphaToCoverageEnable);
+ }
+ }
+
+ if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_BLEND_STATE)) {
+ const uint32_t num_dwords = GENX(BLEND_STATE_length) +
+ GENX(BLEND_STATE_ENTRY_length) * MAX_RTS;
+ hw_state->blend.state =
+ anv_cmd_buffer_alloc_dynamic_state(cmd_buffer,
+ num_dwords * 4,
+ 64);
+
+ uint32_t *dws = hw_state->blend.state.map;
+
+ struct GENX(BLEND_STATE) blend_state = {
+ INIT(blend, AlphaToCoverageEnable),
+ INIT(blend, AlphaToOneEnable),
+ INIT(blend, IndependentAlphaBlendEnable),
+ };
+ GENX(BLEND_STATE_pack)(NULL, dws, &blend_state);
+
+ /* Jump to blend entries. */
+ dws += GENX(BLEND_STATE_length);
+ for (uint32_t i = 0; i < MAX_RTS; i++) {
+ struct GENX(BLEND_STATE_ENTRY) entry = {
+ INIT(blend.rts[i], WriteDisableAlpha),
+ INIT(blend.rts[i], WriteDisableRed),
+ INIT(blend.rts[i], WriteDisableGreen),
+ INIT(blend.rts[i], WriteDisableBlue),
+ INIT(blend.rts[i], LogicOpFunction),
+ INIT(blend.rts[i], LogicOpEnable),
+ INIT(blend.rts[i], ColorBufferBlendEnable),
+ INIT(blend.rts[i], ColorClampRange),
+ INIT(blend.rts[i], PreBlendColorClampEnable),
+ INIT(blend.rts[i], PostBlendColorClampEnable),
+ INIT(blend.rts[i], SourceBlendFactor),
+ INIT(blend.rts[i], DestinationBlendFactor),
+ INIT(blend.rts[i], ColorBlendFunction),
+ INIT(blend.rts[i], SourceAlphaBlendFactor),
+ INIT(blend.rts[i], DestinationAlphaBlendFactor),
+ INIT(blend.rts[i], AlphaBlendFunction),
+ };
+
+ GENX(BLEND_STATE_ENTRY_pack)(NULL, dws, &entry);
+ dws += GENX(BLEND_STATE_ENTRY_length);
+ }
+
+ /* Dirty the pointers to reemit 3DSTATE_BLEND_STATE_POINTERS below */
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_BLEND_STATE_PTR);
+ }
+
+ if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_BLEND_STATE_PTR)) {
+ anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_BLEND_STATE_POINTERS), bsp) {
+ bsp.BlendStatePointer = hw_state->blend.state.offset;
+ bsp.BlendStatePointerValid = true;
+ }
+ }
+
+#if GFX_VERx10 >= 125
+ if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_WA_18019816803)) {
+ genx_batch_emit_pipe_control(&cmd_buffer->batch, cmd_buffer->device->info,
+ cmd_buffer->state.current_pipeline,
+ ANV_PIPE_PSS_STALL_SYNC_BIT);
+ }
+#endif
+
+#if GFX_VER == 9
+ if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_PMA_FIX))
+ genX(cmd_buffer_enable_pma_fix)(cmd_buffer, hw_state->pma_fix);
+#endif
+
+#if GFX_VERx10 >= 125
+ if (hw_state->use_tbimr &&
+ BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_TBIMR_TILE_PASS_INFO)) {
+ anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_TBIMR_TILE_PASS_INFO),
+ tbimr) {
+ SET(tbimr, tbimr, TileRectangleHeight);
+ SET(tbimr, tbimr, TileRectangleWidth);
+ SET(tbimr, tbimr, VerticalTileCount);
+ SET(tbimr, tbimr, HorizontalTileCount);
+ SET(tbimr, tbimr, TBIMRBatchSize);
+ SET(tbimr, tbimr, TileBoxCheck);
+ }
+ }
+#endif
+
+#undef INIT
+#undef SET
+
+ BITSET_ZERO(hw_state->dirty);
+}
+
+/**
+ * This function handles possible state workarounds and emits the dirty
+ * instructions to the batch buffer.
+ */
+void
+genX(cmd_buffer_flush_gfx_hw_state)(struct anv_cmd_buffer *cmd_buffer)
+{
+ struct anv_device *device = cmd_buffer->device;
+ struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
+ struct anv_graphics_pipeline *pipeline =
+ anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
+ struct anv_gfx_dynamic_state *hw_state = &gfx->dyn_state;
+
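+ /* With INTEL_DEBUG=reemit, OR in the device's gfx_dirty_state mask so
+ * the HW state is reprogrammed on every flush (debug aid).
+ */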
+ if (INTEL_DEBUG(DEBUG_REEMIT)) {
+ BITSET_OR(gfx->dyn_state.dirty, gfx->dyn_state.dirty,
+ device->gfx_dirty_state);
+ }
+
+ /**
+ * Put potential workarounds here if you need to reemit an instruction
+ * because another one changed.
+ */
+
+ /* Since Wa_16011773973 will disable 3DSTATE_STREAMOUT, we need to reemit
+ * it after.
+ */
+ if (intel_needs_workaround(device->info, 16011773973) &&
+ pipeline->uses_xfb &&
+ BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_SO_DECL_LIST)) {
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_STREAMOUT);
+ }
+
+ /* Gfx11 undocumented issue:
+ * https://gitlab.freedesktop.org/mesa/mesa/-/issues/9781
+ */
+#if GFX_VER == 11
+ if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_WM))
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_MULTISAMPLE);
+#endif
+
+ /* Wa_18020335297 - Apply the WA when viewport ptr is reprogrammed. */
+ if (intel_needs_workaround(device->info, 18020335297) &&
+ (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_VIEWPORT_CC) ||
+ BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_VIEWPORT_CC_PTR)) &&
+ cmd_buffer->state.gfx.viewport_set) {
+ /* For mesh, we implement the WA using CS stall. This is for
+ * simplicity and takes care of possible interaction with Wa_16014390852.
+ */
+ if (anv_pipeline_is_mesh(pipeline)) {
+ genx_batch_emit_pipe_control(&cmd_buffer->batch, device->info,
+ _3D, ANV_PIPE_CS_STALL_BIT);
+ } else {
+ /* Mask off all instructions that we program. */
+ BITSET_CLEAR(hw_state->dirty, ANV_GFX_STATE_VFG);
+ BITSET_CLEAR(hw_state->dirty, ANV_GFX_STATE_VF);
+ BITSET_CLEAR(hw_state->dirty, ANV_GFX_STATE_PRIMITIVE_REPLICATION);
+ BITSET_CLEAR(hw_state->dirty, ANV_GFX_STATE_RASTER);
+ BITSET_CLEAR(hw_state->dirty, ANV_GFX_STATE_VF_STATISTICS);
+ BITSET_CLEAR(hw_state->dirty, ANV_GFX_STATE_VF_SGVS);
+ BITSET_CLEAR(hw_state->dirty, ANV_GFX_STATE_VF_SGVS_2);
+ BITSET_CLEAR(hw_state->dirty, ANV_GFX_STATE_CLIP);
+ BITSET_CLEAR(hw_state->dirty, ANV_GFX_STATE_STREAMOUT);
+ BITSET_CLEAR(hw_state->dirty, ANV_GFX_STATE_VERTEX_INPUT);
+ BITSET_CLEAR(hw_state->dirty, ANV_GFX_STATE_VF_TOPOLOGY);
+
+ BITSET_CLEAR(hw_state->dirty, ANV_GFX_STATE_VS);
+ BITSET_CLEAR(hw_state->dirty, ANV_GFX_STATE_GS);
+ BITSET_CLEAR(hw_state->dirty, ANV_GFX_STATE_HS);
+ BITSET_CLEAR(hw_state->dirty, ANV_GFX_STATE_TE);
+ BITSET_CLEAR(hw_state->dirty, ANV_GFX_STATE_DS);
+
+ cmd_buffer_gfx_state_emission(cmd_buffer);
+
+ emit_wa_18020335297_dummy_draw(cmd_buffer);
+
+ /* Dirty all emitted WA state to make sure that current real
+ * state is restored.
+ */
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_VFG);
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_VF);
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_PRIMITIVE_REPLICATION);
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_RASTER);
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_VF_STATISTICS);
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_VF_SGVS);
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_VF_SGVS_2);
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_CLIP);
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_STREAMOUT);
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_VERTEX_INPUT);
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_VF_TOPOLOGY);
+
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_VS);
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_GS);
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_HS);
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_TE);
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_DS);
+ }
+ }
+
+ cmd_buffer_gfx_state_emission(cmd_buffer);
+}
+
+void
+genX(cmd_buffer_enable_pma_fix)(struct anv_cmd_buffer *cmd_buffer, bool enable)
+{
+ if (!anv_cmd_buffer_is_render_queue(cmd_buffer))
+ return;
+
+ if (cmd_buffer->state.pma_fix_enabled == enable)
+ return;
+
+ cmd_buffer->state.pma_fix_enabled = enable;
+
+ /* According to the Broadwell PIPE_CONTROL documentation, software should
+ * emit a PIPE_CONTROL with the CS Stall and Depth Cache Flush bits set
+ * prior to the LRI. If stencil buffer writes are enabled, then a Render
+ * Cache Flush is also necessary.
+ *
+ * The Skylake docs say to use a depth stall rather than a command
+ * streamer stall. However, the hardware seems to violently disagree.
+ * A full command streamer stall seems to be needed in both cases.
+ */
+ genx_batch_emit_pipe_control
+ (&cmd_buffer->batch, cmd_buffer->device->info,
+ cmd_buffer->state.current_pipeline,
+ ANV_PIPE_DEPTH_CACHE_FLUSH_BIT |
+ ANV_PIPE_CS_STALL_BIT |
+#if GFX_VER >= 12
+ ANV_PIPE_TILE_CACHE_FLUSH_BIT |
+#endif
+ ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT);
+
+#if GFX_VER == 9
+ uint32_t cache_mode;
+ anv_pack_struct(&cache_mode, GENX(CACHE_MODE_0),
+ .STCPMAOptimizationEnable = enable,
+ .STCPMAOptimizationEnableMask = true);
+ anv_batch_emit(&cmd_buffer->batch, GENX(MI_LOAD_REGISTER_IMM), lri) {
+ lri.RegisterOffset = GENX(CACHE_MODE_0_num);
+ lri.DataDWord = cache_mode;
+ }
+
+#endif /* GFX_VER == 9 */
+
+ /* After the LRI, a PIPE_CONTROL with both the Depth Stall and Depth Cache
+ * Flush bits is often necessary. We do it regardless because it's easier.
+ * The render cache flush is also necessary if stencil writes are enabled.
+ *
+ * Again, the Skylake docs give a different set of flushes but the BDW
+ * flushes seem to work just as well.
+ */
+ genx_batch_emit_pipe_control
+ (&cmd_buffer->batch, cmd_buffer->device->info,
+ cmd_buffer->state.current_pipeline,
+ ANV_PIPE_DEPTH_STALL_BIT |
+ ANV_PIPE_DEPTH_CACHE_FLUSH_BIT |
+#if GFX_VER >= 12
+ ANV_PIPE_TILE_CACHE_FLUSH_BIT |
+#endif
+ ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT);
+}
diff --git a/src/intel/vulkan/genX_gpu_memcpy.c b/src/intel/vulkan/genX_gpu_memcpy.c
index 8f83212b2d7..60ca6f0a248 100644
--- a/src/intel/vulkan/genX_gpu_memcpy.c
+++ b/src/intel/vulkan/genX_gpu_memcpy.c
@@ -51,14 +51,86 @@ gcd_pow2_u64(uint64_t a, uint64_t b)
return 1 << MIN2(a_log2, b_log2);
}
-void
-genX(cmd_buffer_so_memcpy)(struct anv_cmd_buffer *cmd_buffer,
- struct anv_address dst, struct anv_address src,
- uint32_t size)
+static void
+emit_common_so_memcpy(struct anv_batch *batch, struct anv_device *device,
+ const struct intel_urb_config *urb_cfg_in,
+ struct intel_urb_config *urb_cfg_out,
+ const struct intel_l3_config *l3_config)
{
- if (size == 0)
- return;
+ anv_batch_emit(batch, GENX(3DSTATE_VF_INSTANCING), vfi) {
+ vfi.InstancingEnable = false;
+ vfi.VertexElementIndex = 0;
+ }
+ anv_batch_emit(batch, GENX(3DSTATE_VF_SGVS), sgvs);
+#if GFX_VER >= 11
+ anv_batch_emit(batch, GENX(3DSTATE_VF_SGVS_2), sgvs);
+#endif
+
+ /* Disable all shader stages */
+ anv_batch_emit(batch, GENX(3DSTATE_VS), vs);
+ anv_batch_emit(batch, GENX(3DSTATE_HS), hs);
+ anv_batch_emit(batch, GENX(3DSTATE_TE), te);
+ anv_batch_emit(batch, GENX(3DSTATE_DS), ds);
+ anv_batch_emit(batch, GENX(3DSTATE_GS), gs);
+ anv_batch_emit(batch, GENX(3DSTATE_PS), ps);
+
+#if GFX_VERx10 >= 125
+ /* Disable Mesh; we can't have it and streamout enabled at the same
+ * time.
+ */
+ if (device->vk.enabled_extensions.EXT_mesh_shader) {
+ anv_batch_emit(batch, GENX(3DSTATE_MESH_CONTROL), mesh);
+ anv_batch_emit(batch, GENX(3DSTATE_TASK_CONTROL), task);
+ }
+#endif
+
+#if INTEL_WA_16013994831_GFX_VER
+ /* Wa_16013994831 - Disable preemption during streamout. */
+ if (intel_needs_workaround(device->info, 16013994831))
+ genX(batch_set_preemption)(batch, device->info, _3D, false);
+#endif
+
+ anv_batch_emit(batch, GENX(3DSTATE_SBE), sbe) {
+ sbe.VertexURBEntryReadOffset = 1;
+ sbe.NumberofSFOutputAttributes = 1;
+ sbe.VertexURBEntryReadLength = 1;
+ sbe.ForceVertexURBEntryReadLength = true;
+ sbe.ForceVertexURBEntryReadOffset = true;
+
+ for (unsigned i = 0; i < 32; i++)
+ sbe.AttributeActiveComponentFormat[i] = ACF_XYZW;
+ }
+
+ /* Emit URB setup. We tell it that the VS is active because we want it to
+ * allocate space for the VS. Even though one isn't run, we need VUEs to
+ * store the data that VF is going to pass to SOL.
+ */
+ const unsigned entry_size[4] = { DIV_ROUND_UP(32, 64), 1, 1, 1 };
+ memcpy(urb_cfg_out->size, &entry_size, sizeof(entry_size));
+
+ genX(emit_urb_setup)(device, batch, l3_config,
+ VK_SHADER_STAGE_VERTEX_BIT, urb_cfg_in, urb_cfg_out,
+ NULL);
+
+#if GFX_VER >= 12
+ /* Disable Primitive Replication. */
+ anv_batch_emit(batch, GENX(3DSTATE_PRIMITIVE_REPLICATION), pr);
+#endif
+ anv_batch_emit(batch, GENX(3DSTATE_VF_TOPOLOGY), topo) {
+ topo.PrimitiveTopologyType = _3DPRIM_POINTLIST;
+ }
+
+ anv_batch_emit(batch, GENX(3DSTATE_VF_STATISTICS), vf) {
+ vf.StatisticsEnable = false;
+ }
+}
+
+static void
+emit_so_memcpy(struct anv_batch *batch, struct anv_device *device,
+ struct anv_address dst, struct anv_address src,
+ uint32_t size)
+{
/* The maximum copy block size is 4 32-bit components at a time. */
assert(size % 4 == 0);
unsigned bs = gcd_pow2_u64(16, size);
@@ -72,38 +144,23 @@ genX(cmd_buffer_so_memcpy)(struct anv_cmd_buffer *cmd_buffer,
unreachable("Invalid size");
}
- if (!cmd_buffer->state.current_l3_config) {
- const struct intel_l3_config *cfg =
- intel_get_default_l3_config(&cmd_buffer->device->info);
- genX(cmd_buffer_config_l3)(cmd_buffer, cfg);
- }
-
- genX(cmd_buffer_set_binding_for_gfx8_vb_flush)(cmd_buffer, 32, src, size);
- genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
-
- genX(flush_pipeline_select_3d)(cmd_buffer);
-
uint32_t *dw;
- dw = anv_batch_emitn(&cmd_buffer->batch, 5, GENX(3DSTATE_VERTEX_BUFFERS));
- GENX(VERTEX_BUFFER_STATE_pack)(&cmd_buffer->batch, dw + 1,
+ dw = anv_batch_emitn(batch, 5, GENX(3DSTATE_VERTEX_BUFFERS));
+ GENX(VERTEX_BUFFER_STATE_pack)(batch, dw + 1,
&(struct GENX(VERTEX_BUFFER_STATE)) {
.VertexBufferIndex = 32, /* Reserved for this */
.AddressModifyEnable = true,
.BufferStartingAddress = src,
.BufferPitch = bs,
- .MOCS = anv_mocs(cmd_buffer->device, src.bo, 0),
+ .MOCS = anv_mocs(device, src.bo, 0),
#if GFX_VER >= 12
.L3BypassDisable = true,
#endif
-#if (GFX_VER >= 8)
.BufferSize = size,
-#else
- .EndAddress = anv_address_add(src, size - 1),
-#endif
});
- dw = anv_batch_emitn(&cmd_buffer->batch, 3, GENX(3DSTATE_VERTEX_ELEMENTS));
- GENX(VERTEX_ELEMENT_STATE_pack)(&cmd_buffer->batch, dw + 1,
+ dw = anv_batch_emitn(batch, 3, GENX(3DSTATE_VERTEX_ELEMENTS));
+ GENX(VERTEX_ELEMENT_STATE_pack)(batch, dw + 1,
&(struct GENX(VERTEX_ELEMENT_STATE)) {
.VertexBufferIndex = 32,
.Valid = true,
@@ -115,69 +172,29 @@ genX(cmd_buffer_so_memcpy)(struct anv_cmd_buffer *cmd_buffer,
.Component3Control = (bs >= 16) ? VFCOMP_STORE_SRC : VFCOMP_STORE_0,
});
-#if GFX_VER >= 8
- anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_VF_INSTANCING), vfi) {
- vfi.InstancingEnable = false;
- vfi.VertexElementIndex = 0;
- }
-#endif
-#if GFX_VER >= 8
- anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_VF_SGVS), sgvs);
-#endif
-
- /* Disable all shader stages */
- anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_VS), vs);
- anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_HS), hs);
- anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_TE), te);
- anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_DS), DS);
- anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_GS), gs);
- anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_PS), gs);
-
- anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_SBE), sbe) {
- sbe.VertexURBEntryReadOffset = 1;
- sbe.NumberofSFOutputAttributes = 1;
- sbe.VertexURBEntryReadLength = 1;
-#if GFX_VER >= 8
- sbe.ForceVertexURBEntryReadLength = true;
- sbe.ForceVertexURBEntryReadOffset = true;
-#endif
-
-#if GFX_VER >= 9
- for (unsigned i = 0; i < 32; i++)
- sbe.AttributeActiveComponentFormat[i] = ACF_XYZW;
-#endif
- }
-
- /* Emit URB setup. We tell it that the VS is active because we want it to
- * allocate space for the VS. Even though one isn't run, we need VUEs to
- * store the data that VF is going to pass to SOL.
+ /* Wa_16011411144:
+ *
+ * SW must insert a PIPE_CONTROL cmd before and after the
+ * 3dstate_so_buffer_index_0/1/2/3 states to ensure so_buffer_index_*
+ * state is not combined with other state changes.
*/
- const unsigned entry_size[4] = { DIV_ROUND_UP(32, 64), 1, 1, 1 };
-
- genX(emit_urb_setup)(cmd_buffer->device, &cmd_buffer->batch,
- cmd_buffer->state.current_l3_config,
- VK_SHADER_STAGE_VERTEX_BIT, entry_size, NULL);
+ if (intel_needs_workaround(device->info, 16011411144))
+ genx_batch_emit_pipe_control(batch, device->info, _3D, ANV_PIPE_CS_STALL_BIT);
- anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_SO_BUFFER), sob) {
+ anv_batch_emit(batch, GENX(3DSTATE_SO_BUFFER), sob) {
#if GFX_VER < 12
sob.SOBufferIndex = 0;
#else
sob._3DCommandOpcode = 0;
sob._3DCommandSubOpcode = SO_BUFFER_INDEX_0_CMD;
#endif
- sob.MOCS = anv_mocs(cmd_buffer->device, dst.bo, 0),
+ sob.MOCS = anv_mocs(device, dst.bo, ISL_SURF_USAGE_STREAM_OUT_BIT),
sob.SurfaceBaseAddress = dst;
-#if GFX_VER >= 8
sob.SOBufferEnable = true;
sob.SurfaceSize = size / 4 - 1;
-#else
- sob.SurfacePitch = bs;
- sob.SurfaceEndAddress = anv_address_add(dst, size);
-#endif
-#if GFX_VER >= 8
/* As SOL writes out data, it updates the SO_WRITE_OFFSET registers with
* the end position of the stream. We need to reset this value to 0 at
* the beginning of the run or else SOL will start at the offset from
@@ -185,21 +202,16 @@ genX(cmd_buffer_so_memcpy)(struct anv_cmd_buffer *cmd_buffer,
*/
sob.StreamOffsetWriteEnable = true;
sob.StreamOffset = 0;
-#endif
}
-#if GFX_VER <= 7
- /* The hardware can do this for us on BDW+ (see above) */
- anv_batch_emit(&cmd_buffer->batch, GENX(MI_LOAD_REGISTER_IMM), load) {
- load.RegisterOffset = GENX(SO_WRITE_OFFSET0_num);
- load.DataDWord = 0;
- }
-#endif
+   /* Wa_16011411144: also emit a CS_STALL after the SO_BUFFER change. */
+ if (intel_needs_workaround(device->info, 16011411144))
+ genx_batch_emit_pipe_control(batch, device->info, _3D, ANV_PIPE_CS_STALL_BIT);
- dw = anv_batch_emitn(&cmd_buffer->batch, 5, GENX(3DSTATE_SO_DECL_LIST),
+ dw = anv_batch_emitn(batch, 5, GENX(3DSTATE_SO_DECL_LIST),
.StreamtoBufferSelects0 = (1 << 0),
.NumEntries0 = 1);
- GENX(SO_DECL_ENTRY_pack)(&cmd_buffer->batch, dw + 3,
+ GENX(SO_DECL_ENTRY_pack)(batch, dw + 3,
&(struct GENX(SO_DECL_ENTRY)) {
.Stream0Decl = {
.OutputBufferSlot = 0,
@@ -208,36 +220,22 @@ genX(cmd_buffer_so_memcpy)(struct anv_cmd_buffer *cmd_buffer,
},
});
- anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_STREAMOUT), so) {
+#if GFX_VERx10 == 125
+ /* Wa_14015946265: Send PC with CS stall after SO_DECL. */
+ genx_batch_emit_pipe_control(batch, device->info, _3D, ANV_PIPE_CS_STALL_BIT);
+#endif
+
+ anv_batch_emit(batch, GENX(3DSTATE_STREAMOUT), so) {
so.SOFunctionEnable = true;
so.RenderingDisable = true;
so.Stream0VertexReadOffset = 0;
so.Stream0VertexReadLength = DIV_ROUND_UP(32, 64);
-#if GFX_VER >= 8
so.Buffer0SurfacePitch = bs;
-#else
- so.SOBufferEnable0 = true;
-#endif
- }
-
-#if GFX_VER >= 8
- anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_VF_TOPOLOGY), topo) {
- topo.PrimitiveTopologyType = _3DPRIM_POINTLIST;
}
-#endif
-
- anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_VF_STATISTICS), vf) {
- vf.StatisticsEnable = false;
- }
-
-#if GFX_VER >= 12
- /* Disable Primitive Replication. */
- anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_PRIMITIVE_REPLICATION), pr);
-#endif
- anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
+ genX(emit_breakpoint)(batch, device, true);
+ anv_batch_emit(batch, GENX(3DPRIMITIVE), prim) {
prim.VertexAccessType = SEQUENTIAL;
- prim.PrimitiveTopologyType = _3DPRIM_POINTLIST;
prim.VertexCountPerInstance = size / bs;
prim.StartVertexLocation = 0;
prim.InstanceCount = 1;
@@ -245,8 +243,147 @@ genX(cmd_buffer_so_memcpy)(struct anv_cmd_buffer *cmd_buffer,
prim.BaseVertexLocation = 0;
}
+ genX(batch_emit_post_3dprimitive_was)(batch,
+ device,
+ _3DPRIM_POINTLIST, size / bs);
+
+ genX(emit_breakpoint)(batch, device, false);
+}
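
To make the sizing logic above concrete, here is a small standalone sketch (not driver code, helper name hypothetical) of how the copy is expressed as a point-list draw: each vertex moves one block of bs bytes, where bs is the largest power of two dividing both 16 and size:

   #include <assert.h>
   #include <stdint.h>
   #include <stdio.h>

   /* Largest power of two dividing both a and b (both must be non-zero). */
   static uint64_t
   gcd_pow2(uint64_t a, uint64_t b)
   {
      unsigned a_log2 = __builtin_ctzll(a);
      unsigned b_log2 = __builtin_ctzll(b);
      return 1ull << (a_log2 < b_log2 ? a_log2 : b_log2);
   }

   int
   main(void)
   {
      uint32_t size = 52;                /* bytes to copy, multiple of 4 */
      assert(size % 4 == 0);
      uint32_t bs = gcd_pow2(16, size);  /* copy block size: 4, 8 or 16 bytes */
      uint32_t vertex_count = size / bs; /* one point vertex per block */
      printf("bs=%u vertices=%u\n", bs, vertex_count); /* bs=4 vertices=13 */
      return 0;
   }
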
+
+void
+genX(emit_so_memcpy_init)(struct anv_memcpy_state *state,
+ struct anv_device *device,
+ struct anv_batch *batch)
+{
+ memset(state, 0, sizeof(*state));
+
+ state->batch = batch;
+ state->device = device;
+
+ const struct intel_l3_config *cfg = intel_get_default_l3_config(device->info);
+ genX(emit_l3_config)(batch, device, cfg);
+ genX(emit_pipeline_select)(batch, _3D, device);
+
+ struct intel_urb_config urb_cfg_in = { 0 };
+ struct intel_urb_config urb_cfg = { 0 };
+
+ emit_common_so_memcpy(batch, device, &urb_cfg_in, &urb_cfg, cfg);
+}
+
+void
+genX(emit_so_memcpy_fini)(struct anv_memcpy_state *state)
+{
+ genX(emit_apply_pipe_flushes)(state->batch, state->device, _3D,
+ ANV_PIPE_END_OF_PIPE_SYNC_BIT,
+ NULL);
+}
+
+void
+genX(emit_so_memcpy_end)(struct anv_memcpy_state *state)
+{
+ if (intel_needs_workaround(state->device->info, 16013994831))
+ genX(batch_set_preemption)(state->batch, state->device->info, _3D, true);
+
+ anv_batch_emit(state->batch, GENX(MI_BATCH_BUFFER_END), end);
+
+ if ((state->batch->next - state->batch->start) & 4)
+ anv_batch_emit(state->batch, GENX(MI_NOOP), noop);
+}
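
A minimal sketch of the alignment check above, assuming every command is a multiple of 4 bytes: if the batch length is not QWORD (8-byte) aligned, a single 4-byte MI_NOOP pads it out. The helper name below is hypothetical:

   #include <stdint.h>
   #include <stdio.h>

   /* Number of 4-byte no-ops needed to pad a dword-aligned batch of
    * 'length_bytes' out to an 8-byte (QWORD) boundary.
    */
   static unsigned
   qword_pad_noops(uint32_t length_bytes)
   {
      return (length_bytes & 4) ? 1 : 0;
   }

   int
   main(void)
   {
      printf("len=20 -> %u noop(s)\n", qword_pad_noops(20)); /* 1 */
      printf("len=24 -> %u noop(s)\n", qword_pad_noops(24)); /* 0 */
      return 0;
   }
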
+
+void
+genX(emit_so_memcpy)(struct anv_memcpy_state *state,
+ struct anv_address dst, struct anv_address src,
+ uint32_t size)
+{
+ if (GFX_VER == 9 &&
+ anv_gfx8_9_vb_cache_range_needs_workaround(&state->vb_bound,
+ &state->vb_dirty,
+ src, size)) {
+ genX(emit_apply_pipe_flushes)(state->batch, state->device, _3D,
+ ANV_PIPE_CS_STALL_BIT |
+ ANV_PIPE_VF_CACHE_INVALIDATE_BIT,
+ NULL);
+ memset(&state->vb_dirty, 0, sizeof(state->vb_dirty));
+ }
+
+ emit_so_memcpy(state->batch, state->device, dst, src, size);
+}
+
+void
+genX(cmd_buffer_so_memcpy)(struct anv_cmd_buffer *cmd_buffer,
+ struct anv_address dst, struct anv_address src,
+ uint32_t size)
+{
+ if (size == 0)
+ return;
+
+ if (!cmd_buffer->state.current_l3_config) {
+ const struct intel_l3_config *cfg =
+ intel_get_default_l3_config(cmd_buffer->device->info);
+ genX(cmd_buffer_config_l3)(cmd_buffer, cfg);
+ }
+
+#if GFX_VER == 9
+ genX(cmd_buffer_set_binding_for_gfx8_vb_flush)(cmd_buffer, 32, src, size);
+#endif
+
+ /* Wa_14015814527 */
+ genX(apply_task_urb_workaround)(cmd_buffer);
+
+ genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
+
+ genX(flush_pipeline_select_3d)(cmd_buffer);
+
+ struct intel_urb_config urb_cfg;
+
+ emit_common_so_memcpy(&cmd_buffer->batch, cmd_buffer->device,
+ &cmd_buffer->state.gfx.urb_cfg,
+ &urb_cfg,
+ cmd_buffer->state.current_l3_config);
+ emit_so_memcpy(&cmd_buffer->batch, cmd_buffer->device, dst, src, size);
+
+#if GFX_VER == 9
genX(cmd_buffer_update_dirty_vbs_for_gfx8_vb_flush)(cmd_buffer, SEQUENTIAL,
1ull << 32);
+#endif
+
+ /* Update urb config after memcpy. */
+ memcpy(&cmd_buffer->state.gfx.urb_cfg, &urb_cfg,
+ sizeof(struct intel_urb_config));
+
+ /* Flag all the instructions emitted by the memcpy. */
+ struct anv_gfx_dynamic_state *hw_state =
+ &cmd_buffer->state.gfx.dyn_state;
+
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_URB);
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_VF_STATISTICS);
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_VF);
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_VF_TOPOLOGY);
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_VERTEX_INPUT);
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_VF_SGVS);
+#if GFX_VER >= 11
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_VF_SGVS_2);
+#endif
+#if GFX_VER >= 12
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_PRIMITIVE_REPLICATION);
+#endif
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_SO_DECL_LIST);
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_STREAMOUT);
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_SAMPLE_MASK);
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_MULTISAMPLE);
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_SF);
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_SBE);
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_VS);
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_HS);
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_DS);
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_TE);
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_GS);
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_PS);
+ if (cmd_buffer->device->vk.enabled_extensions.EXT_mesh_shader) {
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_MESH_CONTROL);
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_TASK_CONTROL);
+ }
- cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_PIPELINE;
+ cmd_buffer->state.gfx.dirty |= ~(ANV_CMD_DIRTY_PIPELINE |
+ ANV_CMD_DIRTY_INDEX_BUFFER);
}
diff --git a/src/intel/vulkan/genX_init_state.c b/src/intel/vulkan/genX_init_state.c
new file mode 100644
index 00000000000..e86a6e42232
--- /dev/null
+++ b/src/intel/vulkan/genX_init_state.c
@@ -0,0 +1,1446 @@
+/*
+ * Copyright © 2015 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "anv_private.h"
+
+#include "common/intel_aux_map.h"
+#include "common/intel_sample_positions.h"
+#include "common/intel_pixel_hash.h"
+#include "genxml/gen_macros.h"
+#include "genxml/genX_pack.h"
+
+#include "vk_standard_sample_locations.h"
+
+#if GFX_VERx10 == 125 && ANV_SUPPORT_RT
+#include "grl/genX_grl.h"
+#endif
+
+#include "vk_util.h"
+#include "vk_format.h"
+
+static void
+genX(emit_slice_hashing_state)(struct anv_device *device,
+ struct anv_batch *batch)
+{
+#if GFX_VER == 11
+ /* Gfx11 hardware has two pixel pipes at most. */
+ for (unsigned i = 2; i < ARRAY_SIZE(device->info->ppipe_subslices); i++)
+ assert(device->info->ppipe_subslices[i] == 0);
+
+ if (device->info->ppipe_subslices[0] == device->info->ppipe_subslices[1])
+ return;
+
+ if (!device->slice_hash.alloc_size) {
+ unsigned size = GENX(SLICE_HASH_TABLE_length) * 4;
+ device->slice_hash =
+ anv_state_pool_alloc(&device->dynamic_state_pool, size, 64);
+
+ const bool flip = device->info->ppipe_subslices[0] <
+ device->info->ppipe_subslices[1];
+ struct GENX(SLICE_HASH_TABLE) table;
+ intel_compute_pixel_hash_table_3way(16, 16, 3, 3, flip, table.Entry[0]);
+
+ GENX(SLICE_HASH_TABLE_pack)(NULL, device->slice_hash.map, &table);
+ }
+
+ anv_batch_emit(batch, GENX(3DSTATE_SLICE_TABLE_STATE_POINTERS), ptr) {
+ ptr.SliceHashStatePointerValid = true;
+ ptr.SliceHashTableStatePointer = device->slice_hash.offset;
+ }
+
+ anv_batch_emit(batch, GENX(3DSTATE_3D_MODE), mode) {
+ mode.SliceHashingTableEnable = true;
+ }
+#elif GFX_VERx10 == 120
+ /* For each n calculate ppipes_of[n], equal to the number of pixel pipes
+ * present with n active dual subslices.
+ */
+ unsigned ppipes_of[3] = {};
+
+ for (unsigned n = 0; n < ARRAY_SIZE(ppipes_of); n++) {
+ for (unsigned p = 0; p < 3; p++)
+ ppipes_of[n] += (device->info->ppipe_subslices[p] == n);
+ }
+
+ /* Gfx12 has three pixel pipes. */
+ for (unsigned p = 3; p < ARRAY_SIZE(device->info->ppipe_subslices); p++)
+ assert(device->info->ppipe_subslices[p] == 0);
+
+ if (ppipes_of[2] == 3 || ppipes_of[0] == 2) {
+ /* All three pixel pipes have the maximum number of active dual
+ * subslices, or there is only one active pixel pipe: Nothing to do.
+ */
+ return;
+ }
+
+ anv_batch_emit(batch, GENX(3DSTATE_SUBSLICE_HASH_TABLE), p) {
+ p.SliceHashControl[0] = TABLE_0;
+
+ if (ppipes_of[2] == 2 && ppipes_of[0] == 1)
+ intel_compute_pixel_hash_table_3way(8, 16, 2, 2, 0, p.TwoWayTableEntry[0]);
+ else if (ppipes_of[2] == 1 && ppipes_of[1] == 1 && ppipes_of[0] == 1)
+ intel_compute_pixel_hash_table_3way(8, 16, 3, 3, 0, p.TwoWayTableEntry[0]);
+
+ if (ppipes_of[2] == 2 && ppipes_of[1] == 1)
+ intel_compute_pixel_hash_table_3way(8, 16, 5, 4, 0, p.ThreeWayTableEntry[0]);
+ else if (ppipes_of[2] == 2 && ppipes_of[0] == 1)
+ intel_compute_pixel_hash_table_3way(8, 16, 2, 2, 0, p.ThreeWayTableEntry[0]);
+ else if (ppipes_of[2] == 1 && ppipes_of[1] == 1 && ppipes_of[0] == 1)
+ intel_compute_pixel_hash_table_3way(8, 16, 3, 3, 0, p.ThreeWayTableEntry[0]);
+ else
+ unreachable("Illegal fusing.");
+ }
+
+ anv_batch_emit(batch, GENX(3DSTATE_3D_MODE), p) {
+ p.SubsliceHashingTableEnable = true;
+ p.SubsliceHashingTableEnableMask = true;
+ }
+#elif GFX_VERx10 == 125
+ /* Calculate the set of present pixel pipes, and another set of
+    * present pixel pipes with 2 dual subslices enabled; the latter
+    * will appear in the hashing table with twice the frequency of
+ * pixel pipes with a single dual subslice present.
+ */
+ uint32_t ppipe_mask1 = 0, ppipe_mask2 = 0;
+ for (unsigned p = 0; p < ARRAY_SIZE(device->info->ppipe_subslices); p++) {
+ if (device->info->ppipe_subslices[p] > 0)
+ ppipe_mask1 |= (1u << p);
+ if (device->info->ppipe_subslices[p] > 1)
+ ppipe_mask2 |= (1u << p);
+ }
+ assert(ppipe_mask1);
+
+ if (!device->slice_hash.alloc_size) {
+ unsigned size = GENX(SLICE_HASH_TABLE_length) * 4;
+ device->slice_hash =
+ anv_state_pool_alloc(&device->dynamic_state_pool, size, 64);
+
+ struct GENX(SLICE_HASH_TABLE) table;
+
+      /* Note that the hardware expects an array with 7 tables, each table
+       * intended to specify the pixel pipe hashing behavior for every
+       * possible slice count between 2 and 8. However, that doesn't
+       * actually work, among other reasons due to hardware bugs that will
+       * cause the GPU to erroneously access the table at the wrong index
+       * in some cases, so in practice all 7 tables need to be initialized
+       * to the same value.
+ */
+ for (unsigned i = 0; i < 7; i++)
+ intel_compute_pixel_hash_table_nway(16, 16, ppipe_mask1, ppipe_mask2,
+ table.Entry[i][0]);
+
+ GENX(SLICE_HASH_TABLE_pack)(NULL, device->slice_hash.map, &table);
+ }
+
+ anv_batch_emit(batch, GENX(3DSTATE_SLICE_TABLE_STATE_POINTERS), ptr) {
+ ptr.SliceHashStatePointerValid = true;
+ ptr.SliceHashTableStatePointer = device->slice_hash.offset;
+ }
+
+   /* TODO: Figure out FCV support for other platforms.
+    * Testing indicates that FCV is broken on gfx125.
+    * Let's disable FCV for now until we figure out what's wrong.
+ *
+ * Alternatively, it can be toggled off via drirc option 'anv_disable_fcv'.
+ *
+ * Ref: https://gitlab.freedesktop.org/mesa/mesa/-/issues/9987
+ * Ref: https://gitlab.freedesktop.org/mesa/mesa/-/issues/10318
+ * Ref: https://gitlab.freedesktop.org/mesa/mesa/-/issues/10795
+ * Ref: Internal issue 1480 about Unreal Engine 5.1
+ */
+ anv_batch_emit(batch, GENX(3DSTATE_3D_MODE), mode) {
+ mode.SliceHashingTableEnable = true;
+ mode.SliceHashingTableEnableMask = true;
+ mode.CrossSliceHashingMode = (util_bitcount(ppipe_mask1) > 1 ?
+ hashing32x32 : NormalMode);
+ mode.CrossSliceHashingModeMask = -1;
+ mode.FastClearOptimizationEnable = !device->physical->disable_fcv;
+ mode.FastClearOptimizationEnableMask = !device->physical->disable_fcv;
+ }
+#endif
+}
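
As a standalone illustration of the Gfx12.5 mask computation above (the subslice counts below are made up): pipes with at least one dual subslice set a bit in the first mask, and pipes with two or more also set a bit in the second, which makes them appear twice as often in the hashing table:

   #include <stdint.h>
   #include <stdio.h>

   int
   main(void)
   {
      /* Hypothetical per-pixel-pipe dual subslice counts. */
      const unsigned ppipe_subslices[] = { 2, 1, 0, 2 };
      const unsigned n = sizeof(ppipe_subslices) / sizeof(ppipe_subslices[0]);

      uint32_t ppipe_mask1 = 0, ppipe_mask2 = 0;
      for (unsigned p = 0; p < n; p++) {
         if (ppipe_subslices[p] > 0)
            ppipe_mask1 |= 1u << p;   /* pipe is present */
         if (ppipe_subslices[p] > 1)
            ppipe_mask2 |= 1u << p;   /* pipe has 2 dual subslices */
      }

      printf("mask1=0x%x mask2=0x%x\n", ppipe_mask1, ppipe_mask2); /* 0xb 0x9 */
      return 0;
   }
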
+
+static void
+init_common_queue_state(struct anv_queue *queue, struct anv_batch *batch)
+{
+ UNUSED struct anv_device *device = queue->device;
+
+#if GFX_VER >= 11
+ /* Starting with GFX version 11, SLM is no longer part of the L3$ config
+ * so it never changes throughout the lifetime of the VkDevice.
+ */
+ const struct intel_l3_config *cfg = intel_get_default_l3_config(device->info);
+ genX(emit_l3_config)(batch, device, cfg);
+ device->l3_config = cfg;
+#endif
+
+#if GFX_VERx10 == 125
+ /* Even though L3 partial write merging is supposed to be enabled
+ * by default on Gfx12.5 according to the hardware spec, i915
+ * appears to accidentally clear the enables during context
+ * initialization, so make sure to enable them here since partial
+ * write merging has a large impact on rendering performance.
+ */
+ anv_batch_write_reg(batch, GENX(L3SQCREG5), reg) {
+ reg.L3CachePartialWriteMergeTimerInitialValue = 0x7f;
+ reg.CompressiblePartialWriteMergeEnable = true;
+ reg.CoherentPartialWriteMergeEnable = true;
+ reg.CrossTilePartialWriteMergeEnable = true;
+ }
+#endif
+
+ /* Emit STATE_BASE_ADDRESS on Gfx12+ because we set a default CPS_STATE and
+ * those are relative to STATE_BASE_ADDRESS::DynamicStateBaseAddress.
+ */
+#if GFX_VER >= 12
+
+#if GFX_VERx10 >= 125
+ /* Wa_14016407139:
+ *
+ * "On Surface state base address modification, for 3D workloads, SW must
+ * always program PIPE_CONTROL either with CS Stall or PS sync stall. In
+ * both the cases set Render Target Cache Flush Enable".
+ */
+ genx_batch_emit_pipe_control(batch, device->info,
+ 0,
+ ANV_PIPE_CS_STALL_BIT |
+ ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT);
+#endif
+
+ /* GEN:BUG:1607854226:
+ *
+ * Non-pipelined state has issues with not applying in MEDIA/GPGPU mode.
+ * Fortunately, we always start the context off in 3D mode.
+ */
+ uint32_t mocs = device->isl_dev.mocs.internal;
+ anv_batch_emit(batch, GENX(STATE_BASE_ADDRESS), sba) {
+ sba.GeneralStateBaseAddress = (struct anv_address) { NULL, 0 };
+ sba.GeneralStateBufferSize = 0xfffff;
+ sba.GeneralStateMOCS = mocs;
+ sba.GeneralStateBaseAddressModifyEnable = true;
+ sba.GeneralStateBufferSizeModifyEnable = true;
+
+ sba.StatelessDataPortAccessMOCS = mocs;
+
+ sba.SurfaceStateBaseAddress =
+ (struct anv_address) { .offset =
+ device->physical->va.internal_surface_state_pool.addr,
+ };
+ sba.SurfaceStateMOCS = mocs;
+ sba.SurfaceStateBaseAddressModifyEnable = true;
+
+ sba.DynamicStateBaseAddress =
+ (struct anv_address) { .offset =
+ device->physical->va.dynamic_state_pool.addr,
+ };
+ sba.DynamicStateBufferSize = (device->physical->va.dynamic_state_pool.size +
+ device->physical->va.sampler_state_pool.size) / 4096;
+ sba.DynamicStateMOCS = mocs;
+ sba.DynamicStateBaseAddressModifyEnable = true;
+ sba.DynamicStateBufferSizeModifyEnable = true;
+
+ sba.IndirectObjectBaseAddress = (struct anv_address) { NULL, 0 };
+ sba.IndirectObjectBufferSize = 0xfffff;
+ sba.IndirectObjectMOCS = mocs;
+ sba.IndirectObjectBaseAddressModifyEnable = true;
+ sba.IndirectObjectBufferSizeModifyEnable = true;
+
+ sba.InstructionBaseAddress =
+ (struct anv_address) { .offset =
+ device->physical->va.instruction_state_pool.addr,
+ };
+ sba.InstructionBufferSize = device->physical->va.instruction_state_pool.size / 4096;
+ sba.InstructionMOCS = mocs;
+ sba.InstructionBaseAddressModifyEnable = true;
+ sba.InstructionBuffersizeModifyEnable = true;
+
+#if GFX_VER >= 11
+ sba.BindlessSamplerStateBaseAddress = ANV_NULL_ADDRESS;
+ sba.BindlessSamplerStateBufferSize = 0;
+ sba.BindlessSamplerStateMOCS = mocs;
+ sba.BindlessSamplerStateBaseAddressModifyEnable = true;
+#endif
+
+ if (device->physical->indirect_descriptors) {
+ sba.BindlessSurfaceStateBaseAddress =
+ (struct anv_address) { .offset =
+ device->physical->va.bindless_surface_state_pool.addr,
+ };
+ sba.BindlessSurfaceStateSize =
+ anv_physical_device_bindless_heap_size(device->physical, false) /
+ ANV_SURFACE_STATE_SIZE - 1;
+ sba.BindlessSurfaceStateMOCS = mocs;
+ sba.BindlessSurfaceStateBaseAddressModifyEnable = true;
+ } else {
+ /* Bindless Surface State & Bindless Sampler State are aligned to the
+ * same heap
+ */
+ sba.BindlessSurfaceStateBaseAddress = (struct anv_address) {
+ .offset = device->physical->va.internal_surface_state_pool.addr,
+ };
+ sba.BindlessSurfaceStateSize =
+ (device->physical->va.internal_surface_state_pool.size +
+ device->physical->va.bindless_surface_state_pool.size) - 1;
+ sba.BindlessSurfaceStateMOCS = mocs;
+ sba.BindlessSurfaceStateBaseAddressModifyEnable = true;
+ }
+
+#if GFX_VERx10 >= 125
+ sba.L1CacheControl = L1CC_WB;
+#endif
+ }
+#endif
+
+#if GFX_VERx10 >= 125
+ if (ANV_SUPPORT_RT && device->info->has_ray_tracing) {
+ anv_batch_emit(batch, GENX(3DSTATE_BTD), btd) {
+ /* TODO: This is the timeout after which the bucketed thread
+ * dispatcher will kick off a wave of threads. We go with the
+ * lowest value for now. It could be tweaked on a per
+ * application basis (drirc).
+ */
+ btd.DispatchTimeoutCounter = _64clocks;
+ /* BSpec 43851: "This field must be programmed to 6h i.e. memory
+ * backed buffer must be 128KB."
+ */
+ btd.PerDSSMemoryBackedBufferSize = 6;
+ btd.MemoryBackedBufferBasePointer = (struct anv_address) {
+ /* This batch doesn't have a reloc list so we can't use the BO
+ * here. We just use the address directly.
+ */
+ .offset = device->btd_fifo_bo->offset,
+ };
+ }
+ }
+#endif
+}
+
+static VkResult
+init_render_queue_state(struct anv_queue *queue, bool is_companion_rcs_batch)
+{
+ struct anv_device *device = queue->device;
+ UNUSED const struct intel_device_info *devinfo = queue->device->info;
+ uint32_t cmds[256];
+ struct anv_batch batch = {
+ .start = cmds,
+ .next = cmds,
+ .end = (void *) cmds + sizeof(cmds),
+ };
+
+ struct GENX(VERTEX_ELEMENT_STATE) empty_ve = {
+ .Valid = true,
+ .Component0Control = VFCOMP_STORE_0,
+ .Component1Control = VFCOMP_STORE_0,
+ .Component2Control = VFCOMP_STORE_0,
+ .Component3Control = VFCOMP_STORE_0,
+ };
+ GENX(VERTEX_ELEMENT_STATE_pack)(NULL, device->empty_vs_input, &empty_ve);
+
+ genX(emit_pipeline_select)(&batch, _3D, device);
+
+#if GFX_VER == 9
+ anv_batch_write_reg(&batch, GENX(CACHE_MODE_1), cm1) {
+ cm1.FloatBlendOptimizationEnable = true;
+ cm1.FloatBlendOptimizationEnableMask = true;
+ cm1.MSCRAWHazardAvoidanceBit = true;
+ cm1.MSCRAWHazardAvoidanceBitMask = true;
+ cm1.PartialResolveDisableInVC = true;
+ cm1.PartialResolveDisableInVCMask = true;
+ }
+#endif
+
+ anv_batch_emit(&batch, GENX(3DSTATE_AA_LINE_PARAMETERS), aa);
+
+ anv_batch_emit(&batch, GENX(3DSTATE_DRAWING_RECTANGLE), rect) {
+ rect.ClippedDrawingRectangleYMin = 0;
+ rect.ClippedDrawingRectangleXMin = 0;
+ rect.ClippedDrawingRectangleYMax = UINT16_MAX;
+ rect.ClippedDrawingRectangleXMax = UINT16_MAX;
+ rect.DrawingRectangleOriginY = 0;
+ rect.DrawingRectangleOriginX = 0;
+ }
+
+ anv_batch_emit(&batch, GENX(3DSTATE_WM_CHROMAKEY), ck);
+
+ /* SKL PRMs, Volume 2a: Command Reference: Instructions: 3DSTATE_WM_HZ_OP:
+ *
+ * "3DSTATE_RASTER if used must be programmed prior to using this
+ * packet."
+ *
+ * Emit this before 3DSTATE_WM_HZ_OP below.
+ */
+ anv_batch_emit(&batch, GENX(3DSTATE_RASTER), rast) {
+ rast.APIMode = DX101;
+ }
+
+ /* SKL PRMs, Volume 2a: Command Reference: Instructions: 3DSTATE_WM_HZ_OP:
+ *
+ * "3DSTATE_MULTISAMPLE packet must be used prior to this packet to
+ * change the Number of Multisamples. This packet must not be used to
+ * change Number of Multisamples in a rendering sequence."
+ *
+ * Emit this before 3DSTATE_WM_HZ_OP below.
+ */
+ anv_batch_emit(&batch, GENX(3DSTATE_MULTISAMPLE), ms);
+
+ /* The BDW+ docs describe how to use the 3DSTATE_WM_HZ_OP instruction in the
+ * section titled, "Optimized Depth Buffer Clear and/or Stencil Buffer
+ * Clear." It mentions that the packet overrides GPU state for the clear
+ * operation and needs to be reset to 0s to clear the overrides. Depending
+ * on the kernel, we may not get a context with the state for this packet
+ * zeroed. Do it ourselves just in case. We've observed this to prevent a
+ * number of GPU hangs on ICL.
+ */
+ anv_batch_emit(&batch, GENX(3DSTATE_WM_HZ_OP), hzp);
+
+ genX(emit_sample_pattern)(&batch, NULL);
+
+#if GFX_VER == 11
+   /* Bit 5 "Headerless Message for Pre-emptable Contexts" in the
+    * SAMPLER_MODE register defaults to 0, which means headerless sampler
+    * messages are not allowed for pre-emptable contexts. Set bit 5 to 1
+    * to allow them.
+ */
+ anv_batch_write_reg(&batch, GENX(SAMPLER_MODE), sm) {
+ sm.HeaderlessMessageforPreemptableContexts = true;
+ sm.HeaderlessMessageforPreemptableContextsMask = true;
+ }
+
+ /* Bit 1 "Enabled Texel Offset Precision Fix" must be set in
+ * HALF_SLICE_CHICKEN7 register.
+ */
+ anv_batch_write_reg(&batch, GENX(HALF_SLICE_CHICKEN7), hsc7) {
+ hsc7.EnabledTexelOffsetPrecisionFix = true;
+ hsc7.EnabledTexelOffsetPrecisionFixMask = true;
+ }
+
+ anv_batch_write_reg(&batch, GENX(TCCNTLREG), tcc) {
+ tcc.L3DataPartialWriteMergingEnable = true;
+ tcc.ColorZPartialWriteMergingEnable = true;
+ tcc.URBPartialWriteMergingEnable = true;
+ tcc.TCDisable = true;
+ }
+#endif
+ genX(emit_slice_hashing_state)(device, &batch);
+
+#if GFX_VER >= 11
+   /* The hardware specification recommends disabling repacking for
+    * compatibility with the decompression mechanism in the display
+    * controller.
+ */
+ if (device->info->disable_ccs_repack) {
+ anv_batch_write_reg(&batch, GENX(CACHE_MODE_0), cm0) {
+ cm0.DisableRepackingforCompression = true;
+ cm0.DisableRepackingforCompressionMask = true;
+ }
+ }
+
+   /* An unknown issue is causing VS push constants to become
+    * corrupted during object-level preemption. For now, restrict
+    * preemption to the command buffer level to avoid rendering
+ * corruption.
+ */
+ anv_batch_write_reg(&batch, GENX(CS_CHICKEN1), cc1) {
+ cc1.ReplayMode = MidcmdbufferPreemption;
+ cc1.ReplayModeMask = true;
+
+#if GFX_VERx10 == 120
+ cc1.DisablePreemptionandHighPriorityPausingdueto3DPRIMITIVECommand = true;
+ cc1.DisablePreemptionandHighPriorityPausingdueto3DPRIMITIVECommandMask = true;
+#endif
+ }
+
+#if INTEL_NEEDS_WA_1806527549
+ /* Wa_1806527549 says to disable the following HiZ optimization when the
+ * depth buffer is D16_UNORM. We've found the WA to help with more depth
+ * buffer configurations however, so we always disable it just to be safe.
+ */
+ anv_batch_write_reg(&batch, GENX(HIZ_CHICKEN), reg) {
+ reg.HZDepthTestLEGEOptimizationDisable = true;
+ reg.HZDepthTestLEGEOptimizationDisableMask = true;
+ }
+#endif
+
+#if GFX_VER == 12
+ anv_batch_write_reg(&batch, GENX(FF_MODE2), reg) {
+ /* On Alchemist, the FF_MODE2 docs for the GS timer say:
+ *
+ * "The timer value must be set to 224."
+ *
+ * and Wa_16011163337 indicates this is the case for all Gfx12 parts,
+ * and that this is necessary to avoid hanging the HS/DS units. It
+ * also clarifies that 224 is literally 0xE0 in the bits, not 7*32=224.
+ *
+ * The HS timer docs also have the same quote for Alchemist. I am
+ * unaware of a reason it needs to be set to 224 on Tigerlake, but
+ * we do so for consistency if nothing else.
+ *
+ * For the TDS timer value, the docs say:
+ *
+ * "For best performance, a value of 4 should be programmed."
+ *
+ * i915 also sets it this way on Tigerlake due to workarounds.
+ *
+ * The default VS timer appears to be 0, so we leave it at that.
+ */
+ reg.GSTimerValue = 224;
+ reg.HSTimerValue = 224;
+ reg.TDSTimerValue = 4;
+ reg.VSTimerValue = 0;
+ }
+#endif
+
+#if INTEL_NEEDS_WA_1508744258
+ /* Disable RHWO by setting 0x7010[14] by default except during resolve
+ * pass.
+ *
+ * We implement global disabling of the optimization here and we toggle it
+ * in anv_image_ccs_op().
+ */
+ anv_batch_write_reg(&batch, GENX(COMMON_SLICE_CHICKEN1), c1) {
+ c1.RCCRHWOOptimizationDisable = true;
+ c1.RCCRHWOOptimizationDisableMask = true;
+ }
+#endif
+
+#if GFX_VERx10 < 125
+#define AA_LINE_QUALITY_REG GENX(3D_CHICKEN3)
+#else
+#define AA_LINE_QUALITY_REG GENX(CHICKEN_RASTER_1)
+#endif
+
+ /* Enable the new line drawing algorithm that produces higher quality
+ * lines.
+ */
+ anv_batch_write_reg(&batch, AA_LINE_QUALITY_REG, c3) {
+ c3.AALineQualityFix = true;
+ c3.AALineQualityFixMask = true;
+ }
+#endif
+
+#if GFX_VER == 12
+ if (device->info->has_aux_map) {
+ uint64_t aux_base_addr = intel_aux_map_get_base(device->aux_map_ctx);
+ assert(aux_base_addr % (32 * 1024) == 0);
+ anv_batch_emit(&batch, GENX(MI_LOAD_REGISTER_IMM), lri) {
+ lri.RegisterOffset = GENX(GFX_AUX_TABLE_BASE_ADDR_num);
+ lri.DataDWord = aux_base_addr & 0xffffffff;
+ }
+ anv_batch_emit(&batch, GENX(MI_LOAD_REGISTER_IMM), lri) {
+ lri.RegisterOffset = GENX(GFX_AUX_TABLE_BASE_ADDR_num) + 4;
+ lri.DataDWord = aux_base_addr >> 32;
+ }
+ }
+#endif
+
+#if GFX_VERx10 == 125
+ anv_batch_write_reg(&batch, GENX(CHICKEN_RASTER_2), reg) {
+ reg.TBIMRBatchSizeOverride = true;
+ reg.TBIMROpenBatchEnable = true;
+ reg.TBIMRFastClip = true;
+ reg.TBIMRBatchSizeOverrideMask = true;
+ reg.TBIMROpenBatchEnableMask = true;
+ reg.TBIMRFastClipMask = true;
+ }
+#endif
+
+ /* Set the "CONSTANT_BUFFER Address Offset Disable" bit, so
+ * 3DSTATE_CONSTANT_XS buffer 0 is an absolute address.
+ *
+ * This is only safe on kernels with context isolation support.
+ */
+ assert(device->physical->info.has_context_isolation);
+ anv_batch_write_reg(&batch, GENX(CS_DEBUG_MODE2), csdm2) {
+ csdm2.CONSTANT_BUFFERAddressOffsetDisable = true;
+ csdm2.CONSTANT_BUFFERAddressOffsetDisableMask = true;
+ }
+
+ init_common_queue_state(queue, &batch);
+
+ /* Because 3DSTATE_CPS::CoarsePixelShadingStateArrayPointer is relative to
+ * the dynamic state base address we need to emit this instruction after
+ * STATE_BASE_ADDRESS in init_common_queue_state().
+ */
+#if GFX_VER == 11
+ anv_batch_emit(&batch, GENX(3DSTATE_CPS), cps);
+#elif GFX_VER >= 12
+ anv_batch_emit(&batch, GENX(3DSTATE_CPS_POINTERS), cps) {
+ assert(device->cps_states.alloc_size != 0);
+ /* Offset 0 is the disabled state */
+ cps.CoarsePixelShadingStateArrayPointer =
+ device->cps_states.offset;
+ }
+#endif
+
+#if GFX_VERx10 >= 125
+ anv_batch_emit(&batch, GENX(STATE_COMPUTE_MODE), cm) {
+ cm.Mask1 = 0xffff;
+ }
+ anv_batch_emit(&batch, GENX(3DSTATE_MESH_CONTROL), zero);
+ anv_batch_emit(&batch, GENX(3DSTATE_TASK_CONTROL), zero);
+
+   /* We are no longer required to explicitly flush or invalidate caches
+    * since PIPELINE_SELECT is deprecated on Xe2+.
+ */
+#if GFX_VER < 20
+ genx_batch_emit_pipe_control_write(&batch, device->info, _3D, NoWrite,
+ ANV_NULL_ADDRESS,
+ 0,
+ ANV_PIPE_FLUSH_BITS | ANV_PIPE_INVALIDATE_BITS);
+#endif
+
+ genX(emit_pipeline_select)(&batch, GPGPU, device);
+ anv_batch_emit(&batch, GENX(CFE_STATE), cfe) {
+ cfe.MaximumNumberofThreads =
+ devinfo->max_cs_threads * devinfo->subslice_total;
+ }
+
+   /* We are no longer required to explicitly flush or invalidate caches
+    * since PIPELINE_SELECT is deprecated on Xe2+.
+ */
+#if GFX_VER < 20
+ genx_batch_emit_pipe_control_write(&batch, device->info, _3D, NoWrite,
+ ANV_NULL_ADDRESS,
+ 0,
+ ANV_PIPE_FLUSH_BITS | ANV_PIPE_INVALIDATE_BITS);
+#endif
+
+ genX(emit_pipeline_select)(&batch, _3D, device);
+#endif
+
+ anv_batch_emit(&batch, GENX(MI_BATCH_BUFFER_END), bbe);
+
+ assert(batch.next <= batch.end);
+
+ if (!device->trtt.queue)
+ device->trtt.queue = queue;
+
+ return anv_queue_submit_simple_batch(queue, &batch, is_companion_rcs_batch);
+}
+
+static VkResult
+init_compute_queue_state(struct anv_queue *queue)
+{
+ UNUSED const struct intel_device_info *devinfo = queue->device->info;
+ uint32_t cmds[64];
+ struct anv_batch batch = {
+ .start = cmds,
+ .next = cmds,
+ .end = (void *) cmds + sizeof(cmds),
+ };
+
+ genX(emit_pipeline_select)(&batch, GPGPU, queue->device);
+
+#if GFX_VER == 12
+ if (queue->device->info->has_aux_map) {
+ uint64_t aux_base_addr =
+ intel_aux_map_get_base(queue->device->aux_map_ctx);
+ assert(aux_base_addr % (32 * 1024) == 0);
+ anv_batch_emit(&batch, GENX(MI_LOAD_REGISTER_IMM), lri) {
+ lri.RegisterOffset = GENX(COMPCS0_AUX_TABLE_BASE_ADDR_num);
+ lri.DataDWord = aux_base_addr & 0xffffffff;
+ }
+ anv_batch_emit(&batch, GENX(MI_LOAD_REGISTER_IMM), lri) {
+ lri.RegisterOffset = GENX(COMPCS0_AUX_TABLE_BASE_ADDR_num) + 4;
+ lri.DataDWord = aux_base_addr >> 32;
+ }
+ }
+#else
+ assert(!queue->device->info->has_aux_map);
+#endif
+
+ /* Wa_14015782607 - Issue pipe control with HDC_flush and
+ * untyped cache flush set to 1 when CCS has NP state update with
+ * STATE_COMPUTE_MODE.
+ */
+ if (intel_needs_workaround(devinfo, 14015782607) &&
+ queue->family->engine_class == INTEL_ENGINE_CLASS_COMPUTE) {
+ genx_batch_emit_pipe_control(&batch, devinfo, GPGPU,
+ ANV_PIPE_CS_STALL_BIT |
+ ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT |
+ ANV_PIPE_HDC_PIPELINE_FLUSH_BIT);
+ }
+
+#if GFX_VERx10 >= 125
+ /* Wa_14014427904/22013045878 - We need additional invalidate/flush when
+ * emitting NP state commands with ATS-M in compute mode.
+ */
+ if (intel_device_info_is_atsm(devinfo) &&
+ queue->family->engine_class == INTEL_ENGINE_CLASS_COMPUTE) {
+ genx_batch_emit_pipe_control
+ (&batch, devinfo, GPGPU,
+ ANV_PIPE_CS_STALL_BIT |
+ ANV_PIPE_STATE_CACHE_INVALIDATE_BIT |
+ ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT |
+ ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT |
+ ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT |
+ ANV_PIPE_INSTRUCTION_CACHE_INVALIDATE_BIT |
+ ANV_PIPE_HDC_PIPELINE_FLUSH_BIT);
+ }
+
+ anv_batch_emit(&batch, GENX(STATE_COMPUTE_MODE), cm) {
+ cm.PixelAsyncComputeThreadLimit = 4;
+ cm.PixelAsyncComputeThreadLimitMask = 0x7;
+ }
+#endif
+
+ init_common_queue_state(queue, &batch);
+
+#if GFX_VERx10 >= 125
+ anv_batch_emit(&batch, GENX(CFE_STATE), cfe) {
+ cfe.MaximumNumberofThreads =
+ devinfo->max_cs_threads * devinfo->subslice_total;
+ }
+#endif
+
+ anv_batch_emit(&batch, GENX(MI_BATCH_BUFFER_END), bbe);
+
+ assert(batch.next <= batch.end);
+
+ return anv_queue_submit_simple_batch(queue, &batch,
+ false /* is_companion_rcs_batch */);
+}
+
+static VkResult
+init_copy_video_queue_state(struct anv_queue *queue)
+{
+#if GFX_VER >= 12
+ UNUSED const struct intel_device_info *devinfo = queue->device->info;
+ uint32_t cmds[64];
+ UNUSED struct anv_batch batch = {
+ .start = cmds,
+ .next = cmds,
+ .end = (void *) cmds + sizeof(cmds),
+ };
+
+ if (queue->device->info->has_aux_map) {
+ uint64_t reg = GENX(VD0_AUX_TABLE_BASE_ADDR_num);
+
+ if (queue->family->engine_class == INTEL_ENGINE_CLASS_COPY) {
+#if GFX_VERx10 >= 125
+ reg = GENX(BCS_AUX_TABLE_BASE_ADDR_num);
+#endif
+ }
+
+ uint64_t aux_base_addr =
+ intel_aux_map_get_base(queue->device->aux_map_ctx);
+ assert(aux_base_addr % (32 * 1024) == 0);
+ anv_batch_emit(&batch, GENX(MI_LOAD_REGISTER_IMM), lri) {
+ lri.RegisterOffset = reg;
+ lri.DataDWord = aux_base_addr & 0xffffffff;
+ }
+ anv_batch_emit(&batch, GENX(MI_LOAD_REGISTER_IMM), lri) {
+ lri.RegisterOffset = reg + 4;
+ lri.DataDWord = aux_base_addr >> 32;
+ }
+
+ anv_batch_emit(&batch, GENX(MI_BATCH_BUFFER_END), bbe);
+ assert(batch.next <= batch.end);
+
+ return anv_queue_submit_simple_batch(queue, &batch,
+ false /* is_companion_rcs_batch */);
+ }
+#else
+ assert(!queue->device->info->has_aux_map);
+#endif
+
+ return VK_SUCCESS;
+}
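
For illustration, here is a minimal sketch of how a 64-bit aux-table base address is split into the two 32-bit LRI payloads used above; the address value is hypothetical, and the 32KB alignment assertion mirrors the one in the driver:

   #include <assert.h>
   #include <inttypes.h>
   #include <stdint.h>
   #include <stdio.h>

   int
   main(void)
   {
      uint64_t aux_base_addr = 0x0000000180008000ull; /* hypothetical, 32KB aligned */
      assert(aux_base_addr % (32 * 1024) == 0);

      uint32_t lo = aux_base_addr & 0xffffffff; /* written to the base register     */
      uint32_t hi = aux_base_addr >> 32;        /* written to the base register + 4 */

      printf("lo=0x%08" PRIx32 " hi=0x%08" PRIx32 "\n", lo, hi);
      return 0;
   }
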
+
+void
+genX(init_physical_device_state)(ASSERTED struct anv_physical_device *pdevice)
+{
+ assert(pdevice->info.verx10 == GFX_VERx10);
+#if GFX_VERx10 == 125 && ANV_SUPPORT_RT
+ genX(grl_load_rt_uuid)(pdevice->rt_uuid);
+ pdevice->max_grl_scratch_size = genX(grl_max_scratch_size)();
+#endif
+
+ pdevice->cmd_emit_timestamp = genX(cmd_emit_timestamp);
+
+ pdevice->gpgpu_pipeline_value = GPGPU;
+}
+
+VkResult
+genX(init_device_state)(struct anv_device *device)
+{
+ VkResult res;
+
+ device->slice_hash = (struct anv_state) { 0 };
+ for (uint32_t i = 0; i < device->queue_count; i++) {
+ struct anv_queue *queue = &device->queues[i];
+ switch (queue->family->engine_class) {
+ case INTEL_ENGINE_CLASS_RENDER:
+ res = init_render_queue_state(queue, false /* is_companion_rcs_batch */);
+ break;
+ case INTEL_ENGINE_CLASS_COMPUTE: {
+ res = init_compute_queue_state(queue);
+ if (res != VK_SUCCESS)
+ return res;
+
+ /**
+       * Execute the RCS init batch by default on the companion RCS command
+       * buffer in order to support MSAA copy/clear operations on the
+       * compute queue.
+ */
+ res = init_render_queue_state(queue, true /* is_companion_rcs_batch */);
+ break;
+ }
+ case INTEL_ENGINE_CLASS_VIDEO:
+ res = init_copy_video_queue_state(queue);
+ break;
+ case INTEL_ENGINE_CLASS_COPY:
+ res = init_copy_video_queue_state(queue);
+ if (res != VK_SUCCESS)
+ return res;
+
+ /**
+       * Execute the RCS init batch by default on the companion RCS command
+       * buffer in order to support MSAA copy/clear operations on the copy
+       * queue.
+ */
+ res = init_render_queue_state(queue, true /* is_companion_rcs_batch */);
+ break;
+ default:
+ res = vk_error(device, VK_ERROR_INITIALIZATION_FAILED);
+ break;
+ }
+ if (res != VK_SUCCESS)
+ return res;
+ }
+
+ if (device->vk.enabled_extensions.EXT_descriptor_buffer &&
+ device->slice_hash.alloc_size) {
+ device->slice_hash_db =
+ anv_state_pool_alloc(&device->dynamic_state_db_pool,
+ device->slice_hash.alloc_size, 64);
+
+ memcpy(device->slice_hash_db.map,
+ device->slice_hash.map,
+ device->slice_hash.alloc_size);
+ }
+
+ return res;
+}
+
+#if GFX_VERx10 >= 125
+#define maybe_for_each_shading_rate_op(name) \
+ for (VkFragmentShadingRateCombinerOpKHR name = VK_FRAGMENT_SHADING_RATE_COMBINER_OP_KEEP_KHR; \
+ name <= VK_FRAGMENT_SHADING_RATE_COMBINER_OP_MUL_KHR; \
+ name++)
+#elif GFX_VER >= 12
+#define maybe_for_each_shading_rate_op(name)
+#endif
+
+/* Rather than reemitting the CPS_STATE structure every time it changes, and
+ * for as many viewports as needed, we can just prepare all possible cases up
+ * front and pick the right offset into the prepacked states when needed.
+ */
+void
+genX(init_cps_device_state)(struct anv_device *device)
+{
+#if GFX_VER >= 12
+ void *cps_state_ptr = device->cps_states.map;
+
+ /* Disabled CPS mode */
+ for (uint32_t __v = 0; __v < MAX_VIEWPORTS; __v++) {
+ /* ICL PRMs, Volume 2d: Command Reference: Structures: 3DSTATE_CPS_BODY:
+ *
+ * "It is an INVALID configuration to set the CPS mode other than
+ * CPS_MODE_NONE and request per-sample dispatch in 3DSTATE_PS_EXTRA.
+ * Such configuration should be disallowed at the API level, and
+ * rendering results are undefined."
+ *
+       * Since we select this state when per-coarse-pixel dispatch is disabled, and that
+ * includes when per-sample dispatch is enabled, we need to ensure this
+ * is set to NONE.
+ */
+ struct GENX(CPS_STATE) cps_state = {
+ .CoarsePixelShadingMode = CPS_MODE_NONE,
+ };
+
+ GENX(CPS_STATE_pack)(NULL, cps_state_ptr, &cps_state);
+ cps_state_ptr += GENX(CPS_STATE_length) * 4;
+ }
+
+ maybe_for_each_shading_rate_op(op0) {
+ maybe_for_each_shading_rate_op(op1) {
+ for (uint32_t x = 1; x <= 4; x *= 2) {
+ for (uint32_t y = 1; y <= 4; y *= 2) {
+ struct GENX(CPS_STATE) cps_state = {
+ .CoarsePixelShadingMode = CPS_MODE_CONSTANT,
+ .MinCPSizeX = x,
+ .MinCPSizeY = y,
+ };
+
+#if GFX_VERx10 >= 125
+ static const uint32_t combiner_ops[] = {
+ [VK_FRAGMENT_SHADING_RATE_COMBINER_OP_KEEP_KHR] = PASSTHROUGH,
+ [VK_FRAGMENT_SHADING_RATE_COMBINER_OP_REPLACE_KHR] = OVERRIDE,
+ [VK_FRAGMENT_SHADING_RATE_COMBINER_OP_MIN_KHR] = HIGH_QUALITY,
+ [VK_FRAGMENT_SHADING_RATE_COMBINER_OP_MAX_KHR] = LOW_QUALITY,
+ [VK_FRAGMENT_SHADING_RATE_COMBINER_OP_MUL_KHR] = RELATIVE,
+ };
+
+ cps_state.Combiner0OpcodeforCPsize = combiner_ops[op0];
+ cps_state.Combiner1OpcodeforCPsize = combiner_ops[op1];
+#endif /* GFX_VERx10 >= 125 */
+
+ for (uint32_t __v = 0; __v < MAX_VIEWPORTS; __v++) {
+ GENX(CPS_STATE_pack)(NULL, cps_state_ptr, &cps_state);
+ cps_state_ptr += GENX(CPS_STATE_length) * 4;
+ }
+ }
+ }
+ }
+ }
+#endif /* GFX_VER >= 12 */
+}
+
+void
+genX(emit_l3_config)(struct anv_batch *batch,
+ const struct anv_device *device,
+ const struct intel_l3_config *cfg)
+{
+#if GFX_VER < 20
+ UNUSED const struct intel_device_info *devinfo = device->info;
+
+#if GFX_VER >= 12
+#define L3_ALLOCATION_REG GENX(L3ALLOC)
+#define L3_ALLOCATION_REG_num GENX(L3ALLOC_num)
+#else
+#define L3_ALLOCATION_REG GENX(L3CNTLREG)
+#define L3_ALLOCATION_REG_num GENX(L3CNTLREG_num)
+#endif
+
+ anv_batch_write_reg(batch, L3_ALLOCATION_REG, l3cr) {
+ if (cfg == NULL || (GFX_VER >= 12 && cfg->n[INTEL_L3P_ALL] > 126)) {
+ assert(!cfg || !(cfg->n[INTEL_L3P_SLM] || cfg->n[INTEL_L3P_URB] ||
+ cfg->n[INTEL_L3P_DC] || cfg->n[INTEL_L3P_RO] ||
+ cfg->n[INTEL_L3P_IS] || cfg->n[INTEL_L3P_C] ||
+ cfg->n[INTEL_L3P_T] || cfg->n[INTEL_L3P_TC]));
+#if GFX_VER >= 12
+ l3cr.L3FullWayAllocationEnable = true;
+#else
+ unreachable("Invalid L3$ config");
+#endif
+ } else {
+#if GFX_VER < 11
+ l3cr.SLMEnable = cfg->n[INTEL_L3P_SLM];
+#endif
+#if INTEL_NEEDS_WA_1406697149
+ /* Wa_1406697149: Bit 9 "Error Detection Behavior Control" must be
+ * set in L3CNTLREG register. The default setting of the bit is not
+ * the desirable behavior.
+ */
+ l3cr.ErrorDetectionBehaviorControl = true;
+ l3cr.UseFullWays = true;
+#endif /* INTEL_NEEDS_WA_1406697149 */
+ assert(cfg->n[INTEL_L3P_IS] == 0);
+ assert(cfg->n[INTEL_L3P_C] == 0);
+ assert(cfg->n[INTEL_L3P_T] == 0);
+ l3cr.URBAllocation = cfg->n[INTEL_L3P_URB];
+ l3cr.ROAllocation = cfg->n[INTEL_L3P_RO];
+ l3cr.DCAllocation = cfg->n[INTEL_L3P_DC];
+ l3cr.AllAllocation = cfg->n[INTEL_L3P_ALL];
+ }
+ }
+#endif /* GFX_VER < 20 */
+}
+
+void
+genX(emit_sample_pattern)(struct anv_batch *batch,
+ const struct vk_sample_locations_state *sl)
+{
+ assert(sl == NULL || sl->grid_size.width == 1);
+ assert(sl == NULL || sl->grid_size.height == 1);
+
+ /* See the Vulkan 1.0 spec Table 24.1 "Standard sample locations" and
+ * VkPhysicalDeviceFeatures::standardSampleLocations.
+ */
+ anv_batch_emit(batch, GENX(3DSTATE_SAMPLE_PATTERN), sp) {
+ /* The Skylake PRM Vol. 2a "3DSTATE_SAMPLE_PATTERN" says:
+ *
+ * "When programming the sample offsets (for NUMSAMPLES_4 or _8
+ * and MSRASTMODE_xxx_PATTERN), the order of the samples 0 to 3
+ * (or 7 for 8X, or 15 for 16X) must have monotonically increasing
+ * distance from the pixel center. This is required to get the
+ * correct centroid computation in the device."
+ *
+       * However, the Vulkan spec seems to require that the samples occur
+ * in the order provided through the API. The standard sample patterns
+ * have the above property that they have monotonically increasing
+ * distances from the center but client-provided ones do not. As long as
+ * this only affects centroid calculations as the docs say, we should be
+ * ok because OpenGL and Vulkan only require that the centroid be some
+ * lit sample and that it's the same for all samples in a pixel; they
+ * have no requirement that it be the one closest to center.
+ */
+ for (uint32_t i = 1; i <= 16; i *= 2) {
+ switch (i) {
+ case VK_SAMPLE_COUNT_1_BIT:
+ if (sl && sl->per_pixel == i) {
+ INTEL_SAMPLE_POS_1X_ARRAY(sp._1xSample, sl->locations);
+ } else {
+ INTEL_SAMPLE_POS_1X(sp._1xSample);
+ }
+ break;
+ case VK_SAMPLE_COUNT_2_BIT:
+ if (sl && sl->per_pixel == i) {
+ INTEL_SAMPLE_POS_2X_ARRAY(sp._2xSample, sl->locations);
+ } else {
+ INTEL_SAMPLE_POS_2X(sp._2xSample);
+ }
+ break;
+ case VK_SAMPLE_COUNT_4_BIT:
+ if (sl && sl->per_pixel == i) {
+ INTEL_SAMPLE_POS_4X_ARRAY(sp._4xSample, sl->locations);
+ } else {
+ INTEL_SAMPLE_POS_4X(sp._4xSample);
+ }
+ break;
+ case VK_SAMPLE_COUNT_8_BIT:
+ if (sl && sl->per_pixel == i) {
+ INTEL_SAMPLE_POS_8X_ARRAY(sp._8xSample, sl->locations);
+ } else {
+ INTEL_SAMPLE_POS_8X(sp._8xSample);
+ }
+ break;
+ case VK_SAMPLE_COUNT_16_BIT:
+ if (sl && sl->per_pixel == i) {
+ INTEL_SAMPLE_POS_16X_ARRAY(sp._16xSample, sl->locations);
+ } else {
+ INTEL_SAMPLE_POS_16X(sp._16xSample);
+ }
+ break;
+ default:
+ unreachable("Invalid sample count");
+ }
+ }
+ }
+}
+
+static uint32_t
+vk_to_intel_tex_filter(VkFilter filter, bool anisotropyEnable)
+{
+ switch (filter) {
+ default:
+ unreachable("Invalid filter");
+ case VK_FILTER_NEAREST:
+ return anisotropyEnable ? MAPFILTER_ANISOTROPIC : MAPFILTER_NEAREST;
+ case VK_FILTER_LINEAR:
+ return anisotropyEnable ? MAPFILTER_ANISOTROPIC : MAPFILTER_LINEAR;
+ }
+}
+
+static uint32_t
+vk_to_intel_max_anisotropy(float ratio)
+{
+ return (CLAMP(ratio, 2, 16) - 2) / 2;
+}
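
As a quick worked example of the mapping above: the hardware field encodes the maximum anisotropy ratio in steps of 2 starting at 2:1, so API values are clamped to [2, 16] and rescaled. A standalone sketch:

   #include <stdio.h>

   #define CLAMP(x, lo, hi) ((x) < (lo) ? (lo) : (x) > (hi) ? (hi) : (x))

   static unsigned
   max_anisotropy_field(float ratio)
   {
      return (unsigned)((CLAMP(ratio, 2.0f, 16.0f) - 2.0f) / 2.0f);
   }

   int
   main(void)
   {
      /* 1.0 and 2.0 both map to 0 (ratio 2:1), 16.0 maps to 7 (ratio 16:1). */
      printf("%u %u %u %u\n",
             max_anisotropy_field(1.0f), max_anisotropy_field(2.0f),
             max_anisotropy_field(4.0f), max_anisotropy_field(16.0f));
      return 0;
   }
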
+
+static const uint32_t vk_to_intel_mipmap_mode[] = {
+ [VK_SAMPLER_MIPMAP_MODE_NEAREST] = MIPFILTER_NEAREST,
+ [VK_SAMPLER_MIPMAP_MODE_LINEAR] = MIPFILTER_LINEAR
+};
+
+static const uint32_t vk_to_intel_tex_address[] = {
+ [VK_SAMPLER_ADDRESS_MODE_REPEAT] = TCM_WRAP,
+ [VK_SAMPLER_ADDRESS_MODE_MIRRORED_REPEAT] = TCM_MIRROR,
+ [VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_EDGE] = TCM_CLAMP,
+ [VK_SAMPLER_ADDRESS_MODE_MIRROR_CLAMP_TO_EDGE] = TCM_MIRROR_ONCE,
+ [VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_BORDER] = TCM_CLAMP_BORDER,
+};
+
+/* Vulkan specifies the result of shadow comparisons as:
+ * 1 if ref <op> texel,
+ * 0 otherwise.
+ *
+ * The hardware does:
+ * 0 if texel <op> ref,
+ * 1 otherwise.
+ *
+ * So, these look a bit strange because there's both a negation
+ * and swapping of the arguments involved.
+ */
+static const uint32_t vk_to_intel_shadow_compare_op[] = {
+ [VK_COMPARE_OP_NEVER] = PREFILTEROP_ALWAYS,
+ [VK_COMPARE_OP_LESS] = PREFILTEROP_LEQUAL,
+ [VK_COMPARE_OP_EQUAL] = PREFILTEROP_NOTEQUAL,
+ [VK_COMPARE_OP_LESS_OR_EQUAL] = PREFILTEROP_LESS,
+ [VK_COMPARE_OP_GREATER] = PREFILTEROP_GEQUAL,
+ [VK_COMPARE_OP_NOT_EQUAL] = PREFILTEROP_EQUAL,
+ [VK_COMPARE_OP_GREATER_OR_EQUAL] = PREFILTEROP_GREATER,
+ [VK_COMPARE_OP_ALWAYS] = PREFILTEROP_NEVER,
+};
+
+static const uint32_t vk_to_intel_sampler_reduction_mode[] = {
+ [VK_SAMPLER_REDUCTION_MODE_WEIGHTED_AVERAGE] = STD_FILTER,
+ [VK_SAMPLER_REDUCTION_MODE_MIN] = MINIMUM,
+ [VK_SAMPLER_REDUCTION_MODE_MAX] = MAXIMUM,
+};
+
+VkResult genX(CreateSampler)(
+ VkDevice _device,
+ const VkSamplerCreateInfo* pCreateInfo,
+ const VkAllocationCallbacks* pAllocator,
+ VkSampler* pSampler)
+{
+ ANV_FROM_HANDLE(anv_device, device, _device);
+ struct anv_sampler *sampler;
+
+ sampler = vk_sampler_create(&device->vk, pCreateInfo,
+ pAllocator, sizeof(*sampler));
+ if (!sampler)
+ return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
+
+ const struct vk_format_ycbcr_info *ycbcr_info =
+ sampler->vk.format != VK_FORMAT_UNDEFINED ?
+ vk_format_get_ycbcr_info(sampler->vk.format) : NULL;
+ assert((ycbcr_info == NULL) == (sampler->vk.ycbcr_conversion == NULL));
+
+ sampler->n_planes = ycbcr_info ? ycbcr_info->n_planes : 1;
+
+ uint32_t border_color_stride = 64;
+ uint32_t border_color_offset, border_color_db_offset = 0;
+ void *border_color_ptr;
+ if (sampler->vk.border_color <= VK_BORDER_COLOR_INT_OPAQUE_WHITE) {
+ border_color_offset = device->border_colors.offset +
+ pCreateInfo->borderColor *
+ border_color_stride;
+ border_color_db_offset = device->border_colors_db.offset +
+ pCreateInfo->borderColor *
+ border_color_stride;
+ border_color_ptr = device->border_colors.map +
+ pCreateInfo->borderColor * border_color_stride;
+ } else {
+ assert(vk_border_color_is_custom(sampler->vk.border_color));
+ sampler->custom_border_color =
+ anv_state_reserved_pool_alloc(&device->custom_border_colors);
+ border_color_offset = sampler->custom_border_color.offset;
+ border_color_ptr = sampler->custom_border_color.map;
+
+ union isl_color_value color = { .u32 = {
+ sampler->vk.border_color_value.uint32[0],
+ sampler->vk.border_color_value.uint32[1],
+ sampler->vk.border_color_value.uint32[2],
+ sampler->vk.border_color_value.uint32[3],
+ } };
+
+ const struct anv_format *format_desc =
+ sampler->vk.format != VK_FORMAT_UNDEFINED ?
+ anv_get_format(sampler->vk.format) : NULL;
+
+ if (format_desc && format_desc->n_planes == 1 &&
+ !isl_swizzle_is_identity(format_desc->planes[0].swizzle)) {
+ const struct anv_format_plane *fmt_plane = &format_desc->planes[0];
+
+ assert(!isl_format_has_int_channel(fmt_plane->isl_format));
+ color = isl_color_value_swizzle(color, fmt_plane->swizzle, true);
+ }
+
+ memcpy(border_color_ptr, color.u32, sizeof(color));
+
+ if (device->vk.enabled_extensions.EXT_descriptor_buffer) {
+ if (pCreateInfo->flags & VK_SAMPLER_CREATE_DESCRIPTOR_BUFFER_CAPTURE_REPLAY_BIT_EXT) {
+ const VkOpaqueCaptureDescriptorDataCreateInfoEXT *opaque_info =
+ vk_find_struct_const(pCreateInfo->pNext,
+ OPAQUE_CAPTURE_DESCRIPTOR_DATA_CREATE_INFO_EXT);
+ if (opaque_info) {
+ uint32_t alloc_idx = *((const uint32_t *)opaque_info->opaqueCaptureDescriptorData);
+ sampler->custom_border_color_db =
+ anv_state_reserved_array_pool_alloc_index(&device->custom_border_colors_db, alloc_idx);
+ } else {
+ sampler->custom_border_color_db =
+ anv_state_reserved_array_pool_alloc(&device->custom_border_colors_db, true);
+ }
+ } else {
+ sampler->custom_border_color_db =
+ anv_state_reserved_array_pool_alloc(&device->custom_border_colors_db, false);
+ }
+ if (sampler->custom_border_color_db.alloc_size == 0)
+ return vk_error(device, VK_ERROR_OUT_OF_DEVICE_MEMORY);
+ border_color_db_offset = sampler->custom_border_color_db.offset;
+ memcpy(sampler->custom_border_color_db.map, color.u32, sizeof(color));
+ }
+ }
+
+ const bool seamless_cube =
+ !(pCreateInfo->flags & VK_SAMPLER_CREATE_NON_SEAMLESS_CUBE_MAP_BIT_EXT);
+
+ struct mesa_sha1 ctx;
+ _mesa_sha1_init(&ctx);
+
+ for (unsigned p = 0; p < sampler->n_planes; p++) {
+ const bool plane_has_chroma =
+ ycbcr_info && ycbcr_info->planes[p].has_chroma;
+ const VkFilter min_filter =
+ plane_has_chroma ? sampler->vk.ycbcr_conversion->state.chroma_filter :
+ pCreateInfo->minFilter;
+ const VkFilter mag_filter =
+ plane_has_chroma ? sampler->vk.ycbcr_conversion->state.chroma_filter :
+ pCreateInfo->magFilter;
+ const bool force_addr_rounding =
+ device->physical->instance->force_filter_addr_rounding;
+ const bool enable_min_filter_addr_rounding =
+ force_addr_rounding || min_filter != VK_FILTER_NEAREST;
+ const bool enable_mag_filter_addr_rounding =
+ force_addr_rounding || mag_filter != VK_FILTER_NEAREST;
+ /* From Broadwell PRM, SAMPLER_STATE:
+ * "Mip Mode Filter must be set to MIPFILTER_NONE for Planar YUV surfaces."
+ */
+ enum isl_format plane0_isl_format = sampler->vk.ycbcr_conversion ?
+ anv_get_format(sampler->vk.format)->planes[0].isl_format :
+ ISL_FORMAT_UNSUPPORTED;
+ const bool isl_format_is_planar_yuv =
+ plane0_isl_format != ISL_FORMAT_UNSUPPORTED &&
+ isl_format_is_yuv(plane0_isl_format) &&
+ isl_format_is_planar(plane0_isl_format);
+
+ const uint32_t mip_filter_mode =
+ isl_format_is_planar_yuv ?
+ MIPFILTER_NONE : vk_to_intel_mipmap_mode[pCreateInfo->mipmapMode];
+
+ struct GENX(SAMPLER_STATE) sampler_state = {
+ .SamplerDisable = false,
+ .TextureBorderColorMode = DX10OGL,
+
+#if GFX_VER >= 11
+ .CPSLODCompensationEnable = true,
+#endif
+
+ .LODPreClampMode = CLAMP_MODE_OGL,
+
+ .MipModeFilter = mip_filter_mode,
+ .MagModeFilter = vk_to_intel_tex_filter(mag_filter, pCreateInfo->anisotropyEnable),
+ .MinModeFilter = vk_to_intel_tex_filter(min_filter, pCreateInfo->anisotropyEnable),
+ .TextureLODBias = CLAMP(pCreateInfo->mipLodBias, -16, 15.996),
+ .AnisotropicAlgorithm =
+ pCreateInfo->anisotropyEnable ? EWAApproximation : LEGACY,
+ .MinLOD = CLAMP(pCreateInfo->minLod, 0, 14),
+ .MaxLOD = CLAMP(pCreateInfo->maxLod, 0, 14),
+ .ChromaKeyEnable = 0,
+ .ChromaKeyIndex = 0,
+ .ChromaKeyMode = 0,
+ .ShadowFunction =
+ vk_to_intel_shadow_compare_op[pCreateInfo->compareEnable ?
+ pCreateInfo->compareOp : VK_COMPARE_OP_NEVER],
+ .CubeSurfaceControlMode = seamless_cube ? OVERRIDE : PROGRAMMED,
+
+ .LODClampMagnificationMode = MIPNONE,
+
+ .MaximumAnisotropy = vk_to_intel_max_anisotropy(pCreateInfo->maxAnisotropy),
+ .RAddressMinFilterRoundingEnable = enable_min_filter_addr_rounding,
+ .RAddressMagFilterRoundingEnable = enable_mag_filter_addr_rounding,
+ .VAddressMinFilterRoundingEnable = enable_min_filter_addr_rounding,
+ .VAddressMagFilterRoundingEnable = enable_mag_filter_addr_rounding,
+ .UAddressMinFilterRoundingEnable = enable_min_filter_addr_rounding,
+ .UAddressMagFilterRoundingEnable = enable_mag_filter_addr_rounding,
+ .TrilinearFilterQuality = 0,
+ .NonnormalizedCoordinateEnable = pCreateInfo->unnormalizedCoordinates,
+ .TCXAddressControlMode = vk_to_intel_tex_address[pCreateInfo->addressModeU],
+ .TCYAddressControlMode = vk_to_intel_tex_address[pCreateInfo->addressModeV],
+ .TCZAddressControlMode = vk_to_intel_tex_address[pCreateInfo->addressModeW],
+
+ .ReductionType =
+ vk_to_intel_sampler_reduction_mode[sampler->vk.reduction_mode],
+ .ReductionTypeEnable =
+ sampler->vk.reduction_mode != VK_SAMPLER_REDUCTION_MODE_WEIGHTED_AVERAGE,
+ };
+
+ /* Pack a version of the SAMPLER_STATE without the border color. We'll
+ * use it to store into the shader cache and also for hashing.
+ */
+ GENX(SAMPLER_STATE_pack)(NULL, sampler->state_no_bc[p], &sampler_state);
+ _mesa_sha1_update(&ctx, sampler->state_no_bc[p], sizeof(sampler->state_no_bc[p]));
+
+      /* Put the border color in after the hashing; we don't want the
+       * allocation order of border colors to influence the hash. We just
+       * need the parameters to be hashed.
+ */
+ sampler_state.BorderColorPointer = border_color_offset;
+ GENX(SAMPLER_STATE_pack)(NULL, sampler->state[p], &sampler_state);
+
+ if (device->vk.enabled_extensions.EXT_descriptor_buffer) {
+ sampler_state.BorderColorPointer = border_color_db_offset;
+ GENX(SAMPLER_STATE_pack)(NULL, sampler->db_state[p], &sampler_state);
+ }
+ }
+
+ /* If we have bindless, allocate enough samplers. We allocate 32 bytes
+ * for each sampler instead of 16 bytes because we want all bindless
+ * samplers to be 32-byte aligned so we don't have to use indirect
+ * sampler messages on them.
+ */
+ sampler->bindless_state =
+ anv_state_pool_alloc(&device->dynamic_state_pool,
+ sampler->n_planes * 32, 32);
+ if (sampler->bindless_state.map) {
+ memcpy(sampler->bindless_state.map, sampler->state,
+ sampler->n_planes * GENX(SAMPLER_STATE_length) * 4);
+ }
+
+ /* Hash the border color */
+ _mesa_sha1_update(&ctx, border_color_ptr,
+ sizeof(union isl_color_value));
+
+ _mesa_sha1_final(&ctx, sampler->sha1);
+
+ *pSampler = anv_sampler_to_handle(sampler);
+
+ return VK_SUCCESS;
+}
+
+void
+genX(emit_embedded_sampler)(struct anv_device *device,
+ struct anv_embedded_sampler *sampler,
+ struct anv_pipeline_embedded_sampler_binding *binding)
+{
+ sampler->ref_cnt = 1;
+ memcpy(&sampler->key, &binding->key, sizeof(binding->key));
+
+ sampler->border_color_state =
+ anv_state_pool_alloc(&device->dynamic_state_db_pool,
+ sizeof(struct gfx8_border_color), 64);
+ memcpy(sampler->border_color_state.map,
+ binding->key.color,
+ sizeof(binding->key.color));
+
+ sampler->sampler_state =
+ anv_state_pool_alloc(&device->dynamic_state_db_pool,
+ ANV_SAMPLER_STATE_SIZE, 32);
+
+ struct GENX(SAMPLER_STATE) sampler_state = {
+ .BorderColorPointer = sampler->border_color_state.offset,
+ };
+ uint32_t dwords[GENX(SAMPLER_STATE_length)];
+ GENX(SAMPLER_STATE_pack)(NULL, dwords, &sampler_state);
+
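+ /* The embedded sampler key carries pre-packed SAMPLER_STATE dwords, so OR
+ * in the dwords packed above to merge the allocation-dependent border
+ * color pointer into the final state.
+ */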
+ for (uint32_t i = 0; i < GENX(SAMPLER_STATE_length); i++) {
+ ((uint32_t *)sampler->sampler_state.map)[i] =
+ dwords[i] | binding->key.sampler[i];
+ }
+}
+
+/* Wa_14015814527
+ *
+ * If a task shader was used within this cmd_buffer, commit empty URB
+ * states and a null primitive.
+ */
+void
+genX(apply_task_urb_workaround)(struct anv_cmd_buffer *cmd_buffer)
+{
+ if (!anv_cmd_buffer_is_render_queue(cmd_buffer))
+ return;
+
+#if GFX_VERx10 >= 125
+ const struct intel_device_info *devinfo = &cmd_buffer->device->physical->info;
+
+ if (!intel_needs_workaround(devinfo, 16014390852))
+ return;
+
+ if (cmd_buffer->state.current_pipeline != _3D ||
+ !cmd_buffer->state.gfx.used_task_shader)
+ return;
+
+ cmd_buffer->state.gfx.used_task_shader = false;
+
+ /* Wa_14015821291 mentions that the WA below is not required if a
+ * pipeline flush is already pending. It will get flushed during
+ * cmd_buffer_flush_state before the draw.
+ */
+ if ((cmd_buffer->state.pending_pipe_bits & ANV_PIPE_CS_STALL_BIT))
+ return;
+
+ for (int i = 0; i <= MESA_SHADER_GEOMETRY; i++) {
+ anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_URB_VS), urb) {
+ urb._3DCommandSubOpcode += i;
+ }
+ }
+
+ anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_URB_ALLOC_MESH), zero);
+ anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_URB_ALLOC_TASK), zero);
+
+ /* Issue 'nullprim' to commit the state. */
+ genx_batch_emit_pipe_control_write
+ (&cmd_buffer->batch, cmd_buffer->device->info,
+ cmd_buffer->state.current_pipeline,
+ WriteImmediateData, cmd_buffer->device->workaround_address, 0, 0);
+#endif
+}
+
+VkResult
+genX(init_trtt_context_state)(struct anv_queue *queue)
+{
+#if GFX_VER >= 12
+ struct anv_device *device = queue->device;
+ struct anv_trtt *trtt = &device->trtt;
+
+ uint32_t cmds[128];
+ struct anv_batch batch = {
+ .start = cmds,
+ .next = cmds,
+ .end = (void *)cmds + sizeof(cmds),
+ };
+
+ anv_batch_write_reg(&batch, GENX(GFX_TRTT_INVAL), trtt_inval) {
+ trtt_inval.InvalidTileDetectionValue = ANV_TRTT_L1_INVALID_TILE_VAL;
+ }
+ anv_batch_write_reg(&batch, GENX(GFX_TRTT_NULL), trtt_null) {
+ trtt_null.NullTileDetectionValue = ANV_TRTT_L1_NULL_TILE_VAL;
+ }
+#if GFX_VER >= 20
+ anv_batch_write_reg(&batch, GENX(GFX_TRTT_VA_RANGE), trtt_va_range) {
+ trtt_va_range.TRVABase = device->physical->va.trtt.addr >> 44;
+ }
+#else
+ anv_batch_write_reg(&batch, GENX(GFX_TRTT_VA_RANGE), trtt_va_range) {
+ trtt_va_range.TRVAMaskValue = 0xF;
+ trtt_va_range.TRVADataValue = 0xF;
+ }
+#endif
+
+ uint64_t l3_addr = trtt->l3_addr;
+ assert((l3_addr & 0xFFF) == 0);
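+ /* Split the 4KB-aligned L3 table address across the two registers:
+ * bits 31:12 go in the LOW register, bits 47:32 in the HIGH register.
+ */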
+ anv_batch_write_reg(&batch, GENX(GFX_TRTT_L3_BASE_LOW), trtt_base_low) {
+ trtt_base_low.TRVAL3PointerLowerAddress =
+ (l3_addr & 0xFFFFF000) >> 12;
+ }
+ anv_batch_write_reg(&batch, GENX(GFX_TRTT_L3_BASE_HIGH),
+ trtt_base_high) {
+ trtt_base_high.TRVAL3PointerUpperAddress =
+ (l3_addr >> 32) & 0xFFFF;
+ }
+ /* Enabling TR-TT needs to be done after setting up the other registers.
+ */
+ anv_batch_write_reg(&batch, GENX(GFX_TRTT_CR), trtt_cr) {
+ trtt_cr.TRTTEnable = true;
+ }
+
+ anv_batch_emit(&batch, GENX(MI_BATCH_BUFFER_END), bbe);
+ assert(batch.next <= batch.end);
+
+ VkResult res = anv_queue_submit_simple_batch(queue, &batch, false);
+ if (res != VK_SUCCESS)
+ return res;
+
+#endif
+ return VK_SUCCESS;
+}
diff --git a/src/intel/vulkan/genX_internal_kernels.c b/src/intel/vulkan/genX_internal_kernels.c
new file mode 100644
index 00000000000..a476e2bcd04
--- /dev/null
+++ b/src/intel/vulkan/genX_internal_kernels.c
@@ -0,0 +1,111 @@
+/* Copyright © 2023 Intel Corporation
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "anv_private.h"
+#include "anv_internal_kernels.h"
+
+#include "compiler/nir/nir_builder.h"
+#include "compiler/nir/nir_serialize.h"
+
+#if GFX_VERx10 == 90
+# include "intel_gfx9_shaders_code.h"
+#elif GFX_VERx10 == 110
+# include "intel_gfx11_shaders_code.h"
+#elif GFX_VERx10 == 120
+# include "intel_gfx12_shaders_code.h"
+#elif GFX_VERx10 == 125
+# include "intel_gfx125_shaders_code.h"
+#elif GFX_VERx10 == 200
+# include "intel_gfx20_shaders_code.h"
+#else
+# error "Unsupported generation"
+#endif
+
+#include "genxml/gen_macros.h"
+
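+/* Load an internal-kernel parameter from the inline uniform data, using the
+ * field's byte offset within the parameter struct as the uniform offset.
+ */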
+#define load_param(b, bit_size, struct_name, field_name) \
+ nir_load_uniform(b, 1, bit_size, nir_imm_int(b, 0), \
+ .base = offsetof(struct_name, field_name), \
+ .range = bit_size / 8)
+
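+/* Flatten the fragment coordinate into a linear item index, assuming a
+ * maximum surface width of 8192 pixels per row.
+ */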
+static nir_def *
+load_fragment_index(nir_builder *b)
+{
+ nir_def *pos_in = nir_f2i32(b, nir_trim_vector(b, nir_load_frag_coord(b), 2));
+ return nir_iadd(b,
+ nir_imul_imm(b, nir_channel(b, pos_in, 1), 8192),
+ nir_channel(b, pos_in, 0));
+}
+
+static nir_def *
+load_compute_index(nir_builder *b)
+{
+ return nir_channel(b, nir_load_global_invocation_id(b, 32), 0);
+}
+
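+/* Deserialize the precompiled internal shader library ("libanv") NIR blob
+ * selected above for this graphics generation.
+ */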
+nir_shader *
+genX(load_libanv_shader)(struct anv_device *device, void *mem_ctx)
+{
+ const nir_shader_compiler_options *nir_options =
+ device->physical->compiler->nir_options[MESA_SHADER_KERNEL];
+
+ struct blob_reader blob;
+ blob_reader_init(&blob, (void *)genX(intel_shaders_nir),
+ sizeof(genX(intel_shaders_nir)));
+ return nir_deserialize(mem_ctx, nir_options, &blob);
+}
+
+uint32_t
+genX(call_internal_shader)(nir_builder *b, enum anv_internal_kernel_name shader_name)
+{
+ switch (shader_name) {
+ case ANV_INTERNAL_KERNEL_GENERATED_DRAWS:
+ genX(libanv_write_draw)(
+ b,
+ load_param(b, 64, struct anv_gen_indirect_params, generated_cmds_addr),
+ load_param(b, 64, struct anv_gen_indirect_params, indirect_data_addr),
+ load_param(b, 64, struct anv_gen_indirect_params, draw_id_addr),
+ load_param(b, 32, struct anv_gen_indirect_params, indirect_data_stride),
+ load_param(b, 64, struct anv_gen_indirect_params, draw_count_addr),
+ load_param(b, 32, struct anv_gen_indirect_params, draw_base),
+ load_param(b, 32, struct anv_gen_indirect_params, instance_multiplier),
+ load_param(b, 32, struct anv_gen_indirect_params, max_draw_count),
+ load_param(b, 32, struct anv_gen_indirect_params, flags),
+ load_param(b, 32, struct anv_gen_indirect_params, ring_count),
+ load_param(b, 64, struct anv_gen_indirect_params, gen_addr),
+ load_param(b, 64, struct anv_gen_indirect_params, end_addr),
+ load_fragment_index(b));
+ return sizeof(struct anv_gen_indirect_params);
+
+ case ANV_INTERNAL_KERNEL_COPY_QUERY_RESULTS_COMPUTE:
+ case ANV_INTERNAL_KERNEL_COPY_QUERY_RESULTS_FRAGMENT:
+ genX(libanv_query_copy)(
+ b,
+ load_param(b, 64, struct anv_query_copy_params, destination_addr),
+ load_param(b, 32, struct anv_query_copy_params, destination_stride),
+ load_param(b, 64, struct anv_query_copy_params, query_data_addr),
+ load_param(b, 32, struct anv_query_copy_params, query_base),
+ load_param(b, 32, struct anv_query_copy_params, num_queries),
+ load_param(b, 32, struct anv_query_copy_params, query_data_offset),
+ load_param(b, 32, struct anv_query_copy_params, query_stride),
+ load_param(b, 32, struct anv_query_copy_params, num_items),
+ load_param(b, 32, struct anv_query_copy_params, flags),
+ shader_name == ANV_INTERNAL_KERNEL_COPY_QUERY_RESULTS_COMPUTE ?
+ load_compute_index(b) : load_fragment_index(b));
+ return sizeof(struct anv_query_copy_params);
+
+ case ANV_INTERNAL_KERNEL_MEMCPY_COMPUTE:
+ genX(libanv_memcpy)(
+ b,
+ load_param(b, 64, struct anv_memcpy_params, dst_addr),
+ load_param(b, 64, struct anv_memcpy_params, src_addr),
+ load_param(b, 32, struct anv_memcpy_params, num_dwords),
+ nir_imul_imm(b, load_compute_index(b), 4));
+ return sizeof(struct anv_memcpy_params);
+
+ default:
+ unreachable("Invalid shader name");
+ break;
+ }
+}
diff --git a/src/intel/vulkan/genX_pipeline.c b/src/intel/vulkan/genX_pipeline.c
index cb5605e8883..f667c8bacbd 100644
--- a/src/intel/vulkan/genX_pipeline.c
+++ b/src/intel/vulkan/genX_pipeline.c
@@ -25,13 +25,62 @@
#include "genxml/gen_macros.h"
#include "genxml/genX_pack.h"
-#include "genxml/gen_rt_pack.h"
+#include "genxml/genX_rt_pack.h"
+#include "common/intel_genX_state_brw.h"
#include "common/intel_l3_config.h"
#include "common/intel_sample_positions.h"
#include "nir/nir_xfb_info.h"
#include "vk_util.h"
#include "vk_format.h"
+#include "vk_log.h"
+#include "vk_render_pass.h"
+
+static inline struct anv_batch *
+anv_gfx_pipeline_add(struct anv_graphics_pipeline *pipeline,
+ struct anv_gfx_state_ptr *ptr,
+ uint32_t n_dwords)
+{
+ struct anv_batch *batch = &pipeline->base.base.batch;
+
+ assert(ptr->len == 0 ||
+ (batch->next - batch->start) / 4 == (ptr->offset + ptr->len));
+ if (ptr->len == 0)
+ ptr->offset = (batch->next - batch->start) / 4;
+ ptr->len += n_dwords;
+
+ return batch;
+}
+
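+/* Variants of anv_batch_emit()/anv_batch_emitn() that emit into the pipeline
+ * batch while recording the packet's dword offset and length in the given
+ * anv_gfx_state_ptr, so the packed state can be located again later.
+ */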
+#define anv_pipeline_emit(pipeline, state, cmd, name) \
+ for (struct cmd name = { __anv_cmd_header(cmd) }, \
+ *_dst = anv_batch_emit_dwords( \
+ anv_gfx_pipeline_add(pipeline, \
+ &(pipeline)->state, \
+ __anv_cmd_length(cmd)), \
+ __anv_cmd_length(cmd)); \
+ __builtin_expect(_dst != NULL, 1); \
+ ({ __anv_cmd_pack(cmd)(&(pipeline)->base.base.batch, \
+ _dst, &name); \
+ VG(VALGRIND_CHECK_MEM_IS_DEFINED(_dst, __anv_cmd_length(cmd) * 4)); \
+ _dst = NULL; \
+ }))
+
+#define anv_pipeline_emitn(pipeline, state, n, cmd, ...) ({ \
+ void *__dst = anv_batch_emit_dwords( \
+ anv_gfx_pipeline_add(pipeline, &(pipeline)->state, n), n); \
+ if (__dst) { \
+ struct cmd __template = { \
+ __anv_cmd_header(cmd), \
+ .DWordLength = n - __anv_cmd_length_bias(cmd), \
+ __VA_ARGS__ \
+ }; \
+ __anv_cmd_pack(cmd)(&pipeline->base.base.batch, \
+ __dst, &__template); \
+ } \
+ __dst; \
+ })
+
static uint32_t
vertex_element_comp_control(enum isl_format format, unsigned comp)
@@ -85,39 +134,23 @@ vertex_element_comp_control(enum isl_format format, unsigned comp)
}
}
-static void
-emit_vertex_input(struct anv_graphics_pipeline *pipeline,
- const VkPipelineVertexInputStateCreateInfo *info)
+void
+genX(emit_vertex_input)(struct anv_batch *batch,
+ uint32_t *vertex_element_dws,
+ struct anv_graphics_pipeline *pipeline,
+ const struct vk_vertex_input_state *vi,
+ bool emit_in_pipeline)
{
+ const struct anv_device *device = pipeline->base.base.device;
const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
-
- /* Pull inputs_read out of the VS prog data */
const uint64_t inputs_read = vs_prog_data->inputs_read;
const uint64_t double_inputs_read =
vs_prog_data->double_inputs_read & inputs_read;
assert((inputs_read & ((1 << VERT_ATTRIB_GENERIC0) - 1)) == 0);
const uint32_t elements = inputs_read >> VERT_ATTRIB_GENERIC0;
const uint32_t elements_double = double_inputs_read >> VERT_ATTRIB_GENERIC0;
- const bool needs_svgs_elem = vs_prog_data->uses_vertexid ||
- vs_prog_data->uses_instanceid ||
- vs_prog_data->uses_firstvertex ||
- vs_prog_data->uses_baseinstance;
-
- uint32_t elem_count = __builtin_popcount(elements) -
- __builtin_popcount(elements_double) / 2;
- const uint32_t total_elems =
- MAX2(1, elem_count + needs_svgs_elem + vs_prog_data->uses_drawid);
-
- uint32_t *p;
-
- const uint32_t num_dwords = 1 + total_elems * 2;
- p = anv_batch_emitn(&pipeline->base.batch, num_dwords,
- GENX(3DSTATE_VERTEX_ELEMENTS));
- if (!p)
- return;
-
- for (uint32_t i = 0; i < total_elems; i++) {
+ for (uint32_t i = 0; i < pipeline->vs_input_elements; i++) {
/* The SKL docs for VERTEX_ELEMENT_STATE say:
*
* "All elements must be valid from Element[0] to the last valid
@@ -142,94 +175,168 @@ emit_vertex_input(struct anv_graphics_pipeline *pipeline,
.Component2Control = VFCOMP_STORE_0,
.Component3Control = VFCOMP_STORE_0,
};
- GENX(VERTEX_ELEMENT_STATE_pack)(NULL, &p[1 + i * 2], &element);
+ GENX(VERTEX_ELEMENT_STATE_pack)(NULL,
+ &vertex_element_dws[i * 2],
+ &element);
}
- for (uint32_t i = 0; i < info->vertexAttributeDescriptionCount; i++) {
- const VkVertexInputAttributeDescription *desc =
- &info->pVertexAttributeDescriptions[i];
- enum isl_format format = anv_get_isl_format(&pipeline->base.device->info,
- desc->format,
+ u_foreach_bit(a, vi->attributes_valid) {
+ enum isl_format format = anv_get_isl_format(device->info,
+ vi->attributes[a].format,
VK_IMAGE_ASPECT_COLOR_BIT,
VK_IMAGE_TILING_LINEAR);
+ assume(format < ISL_NUM_FORMATS);
- assert(desc->binding < MAX_VBS);
+ uint32_t binding = vi->attributes[a].binding;
+ assert(binding < MAX_VBS);
- if ((elements & (1 << desc->location)) == 0)
+ if ((elements & (1 << a)) == 0)
continue; /* Binding unused */
uint32_t slot =
- __builtin_popcount(elements & ((1 << desc->location) - 1)) -
+ __builtin_popcount(elements & ((1 << a) - 1)) -
DIV_ROUND_UP(__builtin_popcount(elements_double &
- ((1 << desc->location) -1)), 2);
+ ((1 << a) -1)), 2);
struct GENX(VERTEX_ELEMENT_STATE) element = {
- .VertexBufferIndex = desc->binding,
+ .VertexBufferIndex = vi->attributes[a].binding,
.Valid = true,
.SourceElementFormat = format,
.EdgeFlagEnable = false,
- .SourceElementOffset = desc->offset,
+ .SourceElementOffset = vi->attributes[a].offset,
.Component0Control = vertex_element_comp_control(format, 0),
.Component1Control = vertex_element_comp_control(format, 1),
.Component2Control = vertex_element_comp_control(format, 2),
.Component3Control = vertex_element_comp_control(format, 3),
};
- GENX(VERTEX_ELEMENT_STATE_pack)(NULL, &p[1 + slot * 2], &element);
+ GENX(VERTEX_ELEMENT_STATE_pack)(NULL,
+ &vertex_element_dws[slot * 2],
+ &element);
-#if GFX_VER >= 8
/* On Broadwell and later, we have a separate VF_INSTANCING packet
* that controls instancing. On Haswell and prior, that's part of
* VERTEX_BUFFER_STATE which we emit later.
*/
- anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_VF_INSTANCING), vfi) {
- vfi.InstancingEnable = pipeline->vb[desc->binding].instanced;
- vfi.VertexElementIndex = slot;
- vfi.InstanceDataStepRate =
- pipeline->vb[desc->binding].instance_divisor;
+ if (emit_in_pipeline) {
+ anv_pipeline_emit(pipeline, final.vf_instancing, GENX(3DSTATE_VF_INSTANCING), vfi) {
+ bool per_instance = vi->bindings[binding].input_rate ==
+ VK_VERTEX_INPUT_RATE_INSTANCE;
+ uint32_t divisor = vi->bindings[binding].divisor *
+ pipeline->instance_multiplier;
+
+ vfi.InstancingEnable = per_instance;
+ vfi.VertexElementIndex = slot;
+ vfi.InstanceDataStepRate = per_instance ? divisor : 1;
+ }
+ } else {
+ anv_batch_emit(batch, GENX(3DSTATE_VF_INSTANCING), vfi) {
+ bool per_instance = vi->bindings[binding].input_rate ==
+ VK_VERTEX_INPUT_RATE_INSTANCE;
+ uint32_t divisor = vi->bindings[binding].divisor *
+ pipeline->instance_multiplier;
+
+ vfi.InstancingEnable = per_instance;
+ vfi.VertexElementIndex = slot;
+ vfi.InstanceDataStepRate = per_instance ? divisor : 1;
+ }
}
-#endif
}
+}
- const uint32_t id_slot = elem_count;
- if (needs_svgs_elem) {
- /* From the Broadwell PRM for the 3D_Vertex_Component_Control enum:
- * "Within a VERTEX_ELEMENT_STATE structure, if a Component
- * Control field is set to something other than VFCOMP_STORE_SRC,
- * no higher-numbered Component Control fields may be set to
- * VFCOMP_STORE_SRC"
- *
- * This means, that if we have BaseInstance, we need BaseVertex as
- * well. Just do all or nothing.
- */
- uint32_t base_ctrl = (vs_prog_data->uses_firstvertex ||
- vs_prog_data->uses_baseinstance) ?
- VFCOMP_STORE_SRC : VFCOMP_STORE_0;
+static void
+emit_vertex_input(struct anv_graphics_pipeline *pipeline,
+ const struct vk_graphics_pipeline_state *state,
+ const struct vk_vertex_input_state *vi)
+{
+ /* Only pack the VERTEX_ELEMENT_STATE if not dynamic so we can just memcpy
+ * everything in gfx8_cmd_buffer.c
+ */
+ if (!BITSET_TEST(state->dynamic, MESA_VK_DYNAMIC_VI)) {
+ genX(emit_vertex_input)(NULL,
+ pipeline->vertex_input_data,
+ pipeline, vi, true /* emit_in_pipeline */);
+ }
- struct GENX(VERTEX_ELEMENT_STATE) element = {
- .VertexBufferIndex = ANV_SVGS_VB_INDEX,
- .Valid = true,
- .SourceElementFormat = ISL_FORMAT_R32G32_UINT,
- .Component0Control = base_ctrl,
- .Component1Control = base_ctrl,
-#if GFX_VER >= 8
- .Component2Control = VFCOMP_STORE_0,
- .Component3Control = VFCOMP_STORE_0,
-#else
- .Component2Control = VFCOMP_STORE_VID,
- .Component3Control = VFCOMP_STORE_IID,
+ const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
+ const bool needs_svgs_elem = pipeline->svgs_count > 1 ||
+ !vs_prog_data->uses_drawid;
+ const uint32_t id_slot = pipeline->vs_input_elements;
+ const uint32_t drawid_slot = id_slot + needs_svgs_elem;
+ if (pipeline->svgs_count > 0) {
+ assert(pipeline->vertex_input_elems >= pipeline->svgs_count);
+ uint32_t slot_offset =
+ pipeline->vertex_input_elems - pipeline->svgs_count;
+
+ if (needs_svgs_elem) {
+#if GFX_VER < 11
+ /* From the Broadwell PRM for the 3D_Vertex_Component_Control enum:
+ * "Within a VERTEX_ELEMENT_STATE structure, if a Component
+ * Control field is set to something other than VFCOMP_STORE_SRC,
+ * no higher-numbered Component Control fields may be set to
+ * VFCOMP_STORE_SRC"
+ *
+ * This means, that if we have BaseInstance, we need BaseVertex as
+ * well. Just do all or nothing.
+ */
+ uint32_t base_ctrl = (vs_prog_data->uses_firstvertex ||
+ vs_prog_data->uses_baseinstance) ?
+ VFCOMP_STORE_SRC : VFCOMP_STORE_0;
#endif
- };
- GENX(VERTEX_ELEMENT_STATE_pack)(NULL, &p[1 + id_slot * 2], &element);
-#if GFX_VER >= 8
- anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_VF_INSTANCING), vfi) {
- vfi.VertexElementIndex = id_slot;
+ struct GENX(VERTEX_ELEMENT_STATE) element = {
+ .VertexBufferIndex = ANV_SVGS_VB_INDEX,
+ .Valid = true,
+ .SourceElementFormat = ISL_FORMAT_R32G32_UINT,
+#if GFX_VER >= 11
+ /* On gen11, these are taken care of by extra parameter slots */
+ .Component0Control = VFCOMP_STORE_0,
+ .Component1Control = VFCOMP_STORE_0,
+#else
+ .Component0Control = base_ctrl,
+ .Component1Control = base_ctrl,
+#endif
+ .Component2Control = VFCOMP_STORE_0,
+ .Component3Control = VFCOMP_STORE_0,
+ };
+ GENX(VERTEX_ELEMENT_STATE_pack)(NULL,
+ &pipeline->vertex_input_data[slot_offset * 2],
+ &element);
+ slot_offset++;
+
+ anv_pipeline_emit(pipeline, final.vf_sgvs_instancing,
+ GENX(3DSTATE_VF_INSTANCING), vfi) {
+ vfi.VertexElementIndex = id_slot;
+ }
+ }
+
+ if (vs_prog_data->uses_drawid) {
+ struct GENX(VERTEX_ELEMENT_STATE) element = {
+ .VertexBufferIndex = ANV_DRAWID_VB_INDEX,
+ .Valid = true,
+ .SourceElementFormat = ISL_FORMAT_R32_UINT,
+#if GFX_VER >= 11
+ /* On gen11, this is taken care of by extra parameter slots */
+ .Component0Control = VFCOMP_STORE_0,
+#else
+ .Component0Control = VFCOMP_STORE_SRC,
+#endif
+ .Component1Control = VFCOMP_STORE_0,
+ .Component2Control = VFCOMP_STORE_0,
+ .Component3Control = VFCOMP_STORE_0,
+ };
+ GENX(VERTEX_ELEMENT_STATE_pack)(NULL,
+ &pipeline->vertex_input_data[slot_offset * 2],
+ &element);
+ slot_offset++;
+
+ anv_pipeline_emit(pipeline, final.vf_sgvs_instancing,
+ GENX(3DSTATE_VF_INSTANCING), vfi) {
+ vfi.VertexElementIndex = drawid_slot;
+ }
}
-#endif
}
-#if GFX_VER >= 8
- anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_VF_SGVS), sgvs) {
+ anv_pipeline_emit(pipeline, final.vf_sgvs, GENX(3DSTATE_VF_SGVS), sgvs) {
sgvs.VertexIDEnable = vs_prog_data->uses_vertexid;
sgvs.VertexIDComponentNumber = 2;
sgvs.VertexIDElementOffset = id_slot;
@@ -237,93 +344,187 @@ emit_vertex_input(struct anv_graphics_pipeline *pipeline,
sgvs.InstanceIDComponentNumber = 3;
sgvs.InstanceIDElementOffset = id_slot;
}
-#endif
- const uint32_t drawid_slot = elem_count + needs_svgs_elem;
- if (vs_prog_data->uses_drawid) {
- struct GENX(VERTEX_ELEMENT_STATE) element = {
- .VertexBufferIndex = ANV_DRAWID_VB_INDEX,
- .Valid = true,
- .SourceElementFormat = ISL_FORMAT_R32_UINT,
- .Component0Control = VFCOMP_STORE_SRC,
- .Component1Control = VFCOMP_STORE_0,
- .Component2Control = VFCOMP_STORE_0,
- .Component3Control = VFCOMP_STORE_0,
- };
- GENX(VERTEX_ELEMENT_STATE_pack)(NULL,
- &p[1 + drawid_slot * 2],
- &element);
+#if GFX_VER >= 11
+ anv_pipeline_emit(pipeline, final.vf_sgvs_2, GENX(3DSTATE_VF_SGVS_2), sgvs) {
+ /* gl_BaseVertex */
+ sgvs.XP0Enable = vs_prog_data->uses_firstvertex;
+ sgvs.XP0SourceSelect = XP0_PARAMETER;
+ sgvs.XP0ComponentNumber = 0;
+ sgvs.XP0ElementOffset = id_slot;
-#if GFX_VER >= 8
- anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_VF_INSTANCING), vfi) {
- vfi.VertexElementIndex = drawid_slot;
- }
-#endif
+ /* gl_BaseInstance */
+ sgvs.XP1Enable = vs_prog_data->uses_baseinstance;
+ sgvs.XP1SourceSelect = StartingInstanceLocation;
+ sgvs.XP1ComponentNumber = 1;
+ sgvs.XP1ElementOffset = id_slot;
+
+ /* gl_DrawID */
+ sgvs.XP2Enable = vs_prog_data->uses_drawid;
+ sgvs.XP2ComponentNumber = 0;
+ sgvs.XP2ElementOffset = drawid_slot;
}
+#endif
}
void
genX(emit_urb_setup)(struct anv_device *device, struct anv_batch *batch,
const struct intel_l3_config *l3_config,
VkShaderStageFlags active_stages,
- const unsigned entry_size[4],
+ const struct intel_urb_config *urb_cfg_in,
+ struct intel_urb_config *urb_cfg_out,
enum intel_urb_deref_block_size *deref_block_size)
{
- const struct intel_device_info *devinfo = &device->info;
+ const struct intel_device_info *devinfo = device->info;
- unsigned entries[4];
- unsigned start[4];
bool constrained;
intel_get_urb_config(devinfo, l3_config,
active_stages &
VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT,
active_stages & VK_SHADER_STAGE_GEOMETRY_BIT,
- entry_size, entries, start, deref_block_size,
+ urb_cfg_out, deref_block_size,
&constrained);
-#if GFX_VERx10 == 70
- /* From the IVB PRM Vol. 2, Part 1, Section 3.2.1:
- *
- * "A PIPE_CONTROL with Post-Sync Operation set to 1h and a depth stall
- * needs to be sent just prior to any 3DSTATE_VS, 3DSTATE_URB_VS,
- * 3DSTATE_CONSTANT_VS, 3DSTATE_BINDING_TABLE_POINTER_VS,
- * 3DSTATE_SAMPLER_STATE_POINTER_VS command. Only one PIPE_CONTROL
- * needs to be sent before any combination of VS associated 3DSTATE."
- */
- anv_batch_emit(batch, GFX7_PIPE_CONTROL, pc) {
- pc.DepthStallEnable = true;
- pc.PostSyncOperation = WriteImmediateData;
- pc.Address = device->workaround_address;
- }
+#if INTEL_NEEDS_WA_16014912113
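+ /* If the tess-eval URB allocation changed and a previous configuration
+ * exists, re-emit the previous URB configuration (256 VS entries, 0 for
+ * the other stages) and flush HDC before programming the new allocation.
+ */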
+ if (intel_urb_setup_changed(urb_cfg_in, urb_cfg_out,
+ MESA_SHADER_TESS_EVAL) && urb_cfg_in->size[0] != 0) {
+ for (int i = 0; i <= MESA_SHADER_GEOMETRY; i++) {
+ anv_batch_emit(batch, GENX(3DSTATE_URB_VS), urb) {
+ urb._3DCommandSubOpcode += i;
+ urb.VSURBStartingAddress = urb_cfg_in->start[i];
+ urb.VSURBEntryAllocationSize = urb_cfg_in->size[i] - 1;
+ urb.VSNumberofURBEntries = i == 0 ? 256 : 0;
+ }
+ }
+ genx_batch_emit_pipe_control(batch, device->info, _3D,
+ ANV_PIPE_HDC_PIPELINE_FLUSH_BIT);
+ }
#endif
for (int i = 0; i <= MESA_SHADER_GEOMETRY; i++) {
anv_batch_emit(batch, GENX(3DSTATE_URB_VS), urb) {
urb._3DCommandSubOpcode += i;
- urb.VSURBStartingAddress = start[i];
- urb.VSURBEntryAllocationSize = entry_size[i] - 1;
- urb.VSNumberofURBEntries = entries[i];
+ urb.VSURBStartingAddress = urb_cfg_out->start[i];
+ urb.VSURBEntryAllocationSize = urb_cfg_out->size[i] - 1;
+ urb.VSNumberofURBEntries = urb_cfg_out->entries[i];
+ }
+ }
+#if GFX_VERx10 >= 125
+ if (device->vk.enabled_extensions.EXT_mesh_shader) {
+ anv_batch_emit(batch, GENX(3DSTATE_URB_ALLOC_MESH), zero);
+ anv_batch_emit(batch, GENX(3DSTATE_URB_ALLOC_TASK), zero);
+ }
+#endif
+}
+
+#if GFX_VERx10 >= 125
+static void
+emit_urb_setup_mesh(struct anv_graphics_pipeline *pipeline,
+ enum intel_urb_deref_block_size *deref_block_size)
+{
+ const struct intel_device_info *devinfo = pipeline->base.base.device->info;
+
+ const struct brw_task_prog_data *task_prog_data =
+ anv_pipeline_has_stage(pipeline, MESA_SHADER_TASK) ?
+ get_task_prog_data(pipeline) : NULL;
+ const struct brw_mesh_prog_data *mesh_prog_data = get_mesh_prog_data(pipeline);
+
+ const struct intel_mesh_urb_allocation alloc =
+ intel_get_mesh_urb_config(devinfo, pipeline->base.base.l3_config,
+ task_prog_data ? task_prog_data->map.size_dw : 0,
+ mesh_prog_data->map.size_dw);
+
+ /* Zero out the primitive pipeline URB allocations. */
+ for (int i = 0; i <= MESA_SHADER_GEOMETRY; i++) {
+ anv_pipeline_emit(pipeline, final.urb, GENX(3DSTATE_URB_VS), urb) {
+ urb._3DCommandSubOpcode += i;
}
}
+
+ anv_pipeline_emit(pipeline, final.urb, GENX(3DSTATE_URB_ALLOC_TASK), urb) {
+ if (task_prog_data) {
+ urb.TASKURBEntryAllocationSize = alloc.task_entry_size_64b - 1;
+ urb.TASKNumberofURBEntriesSlice0 = alloc.task_entries;
+ urb.TASKNumberofURBEntriesSliceN = alloc.task_entries;
+ urb.TASKURBStartingAddressSlice0 = alloc.task_starting_address_8kb;
+ urb.TASKURBStartingAddressSliceN = alloc.task_starting_address_8kb;
+ }
+ }
+
+ anv_pipeline_emit(pipeline, final.urb, GENX(3DSTATE_URB_ALLOC_MESH), urb) {
+ urb.MESHURBEntryAllocationSize = alloc.mesh_entry_size_64b - 1;
+ urb.MESHNumberofURBEntriesSlice0 = alloc.mesh_entries;
+ urb.MESHNumberofURBEntriesSliceN = alloc.mesh_entries;
+ urb.MESHURBStartingAddressSlice0 = alloc.mesh_starting_address_8kb;
+ urb.MESHURBStartingAddressSliceN = alloc.mesh_starting_address_8kb;
+ }
+
+ *deref_block_size = alloc.deref_block_size;
}
+#endif
static void
emit_urb_setup(struct anv_graphics_pipeline *pipeline,
enum intel_urb_deref_block_size *deref_block_size)
{
- unsigned entry_size[4];
+#if GFX_VERx10 >= 125
+ if (anv_pipeline_is_mesh(pipeline)) {
+ emit_urb_setup_mesh(pipeline, deref_block_size);
+ return;
+ }
+#endif
for (int i = MESA_SHADER_VERTEX; i <= MESA_SHADER_GEOMETRY; i++) {
const struct brw_vue_prog_data *prog_data =
!anv_pipeline_has_stage(pipeline, i) ? NULL :
- (const struct brw_vue_prog_data *) pipeline->shaders[i]->prog_data;
+ (const struct brw_vue_prog_data *) pipeline->base.shaders[i]->prog_data;
- entry_size[i] = prog_data ? prog_data->urb_entry_size : 1;
+ pipeline->urb_cfg.size[i] = prog_data ? prog_data->urb_entry_size : 1;
}
- genX(emit_urb_setup)(pipeline->base.device, &pipeline->base.batch,
- pipeline->base.l3_config,
- pipeline->active_stages, entry_size,
- deref_block_size);
+ struct anv_device *device = pipeline->base.base.device;
+ const struct intel_device_info *devinfo = device->info;
+
+ bool constrained;
+ intel_get_urb_config(devinfo,
+ pipeline->base.base.l3_config,
+ pipeline->base.base.active_stages &
+ VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT,
+ pipeline->base.base.active_stages &
+ VK_SHADER_STAGE_GEOMETRY_BIT,
+ &pipeline->urb_cfg, deref_block_size,
+ &constrained);
+
+ for (int i = 0; i <= MESA_SHADER_GEOMETRY; i++) {
+ anv_pipeline_emit(pipeline, final.urb, GENX(3DSTATE_URB_VS), urb) {
+ urb._3DCommandSubOpcode += i;
+ urb.VSURBStartingAddress = pipeline->urb_cfg.start[i];
+ urb.VSURBEntryAllocationSize = pipeline->urb_cfg.size[i] - 1;
+ urb.VSNumberofURBEntries = pipeline->urb_cfg.entries[i];
+ }
+ }
+
+#if GFX_VERx10 >= 125
+ if (device->vk.enabled_extensions.EXT_mesh_shader) {
+ anv_pipeline_emit(pipeline, final.urb, GENX(3DSTATE_URB_ALLOC_TASK), zero);
+ anv_pipeline_emit(pipeline, final.urb, GENX(3DSTATE_URB_ALLOC_MESH), zero);
+ }
+#endif
+}
+
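+/* Returns true when the fragment shader reads gl_PrimitiveID but the last
+ * pre-rasterization stage does not write it, in which case the SBE has to
+ * supply it.
+ */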
+static bool
+sbe_primitive_id_override(struct anv_graphics_pipeline *pipeline)
+{
+ const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline);
+ if (!wm_prog_data)
+ return false;
+
+ const struct intel_vue_map *fs_input_map =
+ &anv_pipeline_get_last_vue_prog_data(pipeline)->vue_map;
+
+ return (wm_prog_data->inputs & VARYING_BIT_PRIMITIVE_ID) &&
+ fs_input_map->varying_to_slot[VARYING_SLOT_PRIMITIVE_ID] == -1;
}
static void
@@ -332,117 +533,167 @@ emit_3dstate_sbe(struct anv_graphics_pipeline *pipeline)
const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline);
if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_FRAGMENT)) {
- anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_SBE), sbe);
-#if GFX_VER >= 8
- anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_SBE_SWIZ), sbe);
+ anv_pipeline_emit(pipeline, final.sbe, GENX(3DSTATE_SBE), sbe);
+ anv_pipeline_emit(pipeline, final.sbe_swiz, GENX(3DSTATE_SBE_SWIZ), sbe);
+#if GFX_VERx10 >= 125
+ if (anv_pipeline_is_mesh(pipeline))
+ anv_pipeline_emit(pipeline, final.sbe_mesh, GENX(3DSTATE_SBE_MESH), sbe);
#endif
return;
}
- const struct brw_vue_map *fs_input_map =
- &anv_pipeline_get_last_vue_prog_data(pipeline)->vue_map;
-
- struct GENX(3DSTATE_SBE) sbe = {
- GENX(3DSTATE_SBE_header),
- .AttributeSwizzleEnable = true,
- .PointSpriteTextureCoordinateOrigin = UPPERLEFT,
- .NumberofSFOutputAttributes = wm_prog_data->num_varying_inputs,
- .ConstantInterpolationEnable = wm_prog_data->flat_inputs,
- };
-
-#if GFX_VER >= 9
- for (unsigned i = 0; i < 32; i++)
- sbe.AttributeActiveComponentFormat[i] = ACF_XYZW;
-#endif
-
-#if GFX_VER >= 8
- /* On Broadwell, they broke 3DSTATE_SBE into two packets */
- struct GENX(3DSTATE_SBE_SWIZ) swiz = {
- GENX(3DSTATE_SBE_SWIZ_header),
- };
-#else
-# define swiz sbe
-#endif
-
- int first_slot = brw_compute_first_urb_slot_required(wm_prog_data->inputs,
- fs_input_map);
- assert(first_slot % 2 == 0);
- unsigned urb_entry_read_offset = first_slot / 2;
- int max_source_attr = 0;
- for (uint8_t idx = 0; idx < wm_prog_data->urb_setup_attribs_count; idx++) {
- uint8_t attr = wm_prog_data->urb_setup_attribs[idx];
- int input_index = wm_prog_data->urb_setup[attr];
+ anv_pipeline_emit(pipeline, final.sbe, GENX(3DSTATE_SBE), sbe) {
+ anv_pipeline_emit(pipeline, final.sbe_swiz, GENX(3DSTATE_SBE_SWIZ), swiz) {
- assert(0 <= input_index);
-
- /* gl_Viewport, gl_Layer and FragmentShadingRateKHR are stored in the
- * VUE header
+ /* TODO(mesh): Figure out cases where we need attribute swizzling. See also
+ * calculate_urb_setup() and related functions.
*/
- if (attr == VARYING_SLOT_VIEWPORT ||
- attr == VARYING_SLOT_LAYER ||
- attr == VARYING_SLOT_PRIMITIVE_SHADING_RATE) {
- continue;
- }
-
- if (attr == VARYING_SLOT_PNTC) {
- sbe.PointSpriteTextureCoordinateEnable = 1 << input_index;
- continue;
- }
+ sbe.AttributeSwizzleEnable = anv_pipeline_is_primitive(pipeline);
+ sbe.PointSpriteTextureCoordinateOrigin = UPPERLEFT;
+ sbe.NumberofSFOutputAttributes = wm_prog_data->num_varying_inputs;
+ sbe.ConstantInterpolationEnable = wm_prog_data->flat_inputs;
+
+ for (unsigned i = 0; i < 32; i++)
+ sbe.AttributeActiveComponentFormat[i] = ACF_XYZW;
+
+ if (anv_pipeline_is_primitive(pipeline)) {
+ const struct intel_vue_map *fs_input_map =
+ &anv_pipeline_get_last_vue_prog_data(pipeline)->vue_map;
+
+ int first_slot =
+ brw_compute_first_urb_slot_required(wm_prog_data->inputs,
+ fs_input_map);
+ assert(first_slot % 2 == 0);
+ unsigned urb_entry_read_offset = first_slot / 2;
+ int max_source_attr = 0;
+ for (uint8_t idx = 0; idx < wm_prog_data->urb_setup_attribs_count; idx++) {
+ uint8_t attr = wm_prog_data->urb_setup_attribs[idx];
+ int input_index = wm_prog_data->urb_setup[attr];
+
+ assert(0 <= input_index);
+
+ /* gl_Viewport, gl_Layer and FragmentShadingRateKHR are stored in the
+ * VUE header
+ */
+ if (attr == VARYING_SLOT_VIEWPORT ||
+ attr == VARYING_SLOT_LAYER ||
+ attr == VARYING_SLOT_PRIMITIVE_SHADING_RATE) {
+ continue;
+ }
+
+ if (attr == VARYING_SLOT_PNTC) {
+ sbe.PointSpriteTextureCoordinateEnable = 1 << input_index;
+ continue;
+ }
+
+ const int slot = fs_input_map->varying_to_slot[attr];
+
+ if (slot == -1) {
+ /* This attribute does not exist in the VUE--that means that
+ * the vertex shader did not write to it. It could be that it's
+ * a regular varying read by the fragment shader but not
+ * written by the vertex shader or it's gl_PrimitiveID. In the
+ * first case the value is undefined, in the second it needs to
+ * be gl_PrimitiveID.
+ */
+ swiz.Attribute[input_index].ConstantSource = PRIM_ID;
+ swiz.Attribute[input_index].ComponentOverrideX = true;
+ swiz.Attribute[input_index].ComponentOverrideY = true;
+ swiz.Attribute[input_index].ComponentOverrideZ = true;
+ swiz.Attribute[input_index].ComponentOverrideW = true;
+ continue;
+ }
+
+ /* We have to subtract two slots to account for the URB entry
+ * output read offset in the VS and GS stages.
+ */
+ const int source_attr = slot - 2 * urb_entry_read_offset;
+ assert(source_attr >= 0 && source_attr < 32);
+ max_source_attr = MAX2(max_source_attr, source_attr);
+ /* The hardware can only apply overrides to the first 16 attributes;
+ * the remaining (up to 16) have to be lined up so that the input
+ * index equals the output index. We'll need to do some tweaking to
+ * make sure that's the case.
+ */
+ if (input_index < 16)
+ swiz.Attribute[input_index].SourceAttribute = source_attr;
+ else
+ assert(source_attr == input_index);
+ }
- const int slot = fs_input_map->varying_to_slot[attr];
+ sbe.VertexURBEntryReadOffset = urb_entry_read_offset;
+ sbe.VertexURBEntryReadLength = DIV_ROUND_UP(max_source_attr + 1, 2);
+ sbe.ForceVertexURBEntryReadOffset = true;
+ sbe.ForceVertexURBEntryReadLength = true;
- if (slot == -1) {
- /* This attribute does not exist in the VUE--that means that the
- * vertex shader did not write to it. It could be that it's a
- * regular varying read by the fragment shader but not written by
- * the vertex shader or it's gl_PrimitiveID. In the first case the
- * value is undefined, in the second it needs to be
- * gl_PrimitiveID.
+ /* Ask the hardware to supply PrimitiveID if the fragment shader
+ * reads it but a previous stage didn't write one.
*/
- swiz.Attribute[input_index].ConstantSource = PRIM_ID;
- swiz.Attribute[input_index].ComponentOverrideX = true;
- swiz.Attribute[input_index].ComponentOverrideY = true;
- swiz.Attribute[input_index].ComponentOverrideZ = true;
- swiz.Attribute[input_index].ComponentOverrideW = true;
- continue;
+ if (sbe_primitive_id_override(pipeline)) {
+ sbe.PrimitiveIDOverrideAttributeSelect =
+ wm_prog_data->urb_setup[VARYING_SLOT_PRIMITIVE_ID];
+ sbe.PrimitiveIDOverrideComponentX = true;
+ sbe.PrimitiveIDOverrideComponentY = true;
+ sbe.PrimitiveIDOverrideComponentZ = true;
+ sbe.PrimitiveIDOverrideComponentW = true;
+ }
+ } else {
+ assert(anv_pipeline_is_mesh(pipeline));
+#if GFX_VERx10 >= 125
+ const struct brw_mesh_prog_data *mesh_prog_data = get_mesh_prog_data(pipeline);
+ anv_pipeline_emit(pipeline, final.sbe_mesh,
+ GENX(3DSTATE_SBE_MESH), sbe_mesh) {
+ const struct brw_mue_map *mue = &mesh_prog_data->map;
+
+ assert(mue->per_vertex_header_size_dw % 8 == 0);
+ sbe_mesh.PerVertexURBEntryOutputReadOffset = mue->per_vertex_header_size_dw / 8;
+ sbe_mesh.PerVertexURBEntryOutputReadLength = DIV_ROUND_UP(mue->per_vertex_data_size_dw, 8);
+
+ /* Clip distance array is passed in the per-vertex header so that
+ * it can be consumed by the HW. If the user wants to read it in the
+ * FS, adjust the offset and length to cover it. Conveniently it
+ * is at the end of the per-vertex header, right before per-vertex
+ * attributes.
+ *
+ * Note that FS attribute reading must be aware that the clip
+ * distances have fixed position.
+ */
+ if (mue->per_vertex_header_size_dw > 8 &&
+ (wm_prog_data->urb_setup[VARYING_SLOT_CLIP_DIST0] >= 0 ||
+ wm_prog_data->urb_setup[VARYING_SLOT_CLIP_DIST1] >= 0)) {
+ sbe_mesh.PerVertexURBEntryOutputReadOffset -= 1;
+ sbe_mesh.PerVertexURBEntryOutputReadLength += 1;
+ }
+
+ if (mue->user_data_in_vertex_header) {
+ sbe_mesh.PerVertexURBEntryOutputReadOffset -= 1;
+ sbe_mesh.PerVertexURBEntryOutputReadLength += 1;
+ }
+
+ assert(mue->per_primitive_header_size_dw % 8 == 0);
+ sbe_mesh.PerPrimitiveURBEntryOutputReadOffset =
+ mue->per_primitive_header_size_dw / 8;
+ sbe_mesh.PerPrimitiveURBEntryOutputReadLength =
+ DIV_ROUND_UP(mue->per_primitive_data_size_dw, 8);
+
+ /* Just like with clip distances, if Primitive Shading Rate,
+ * Viewport Index or Layer is read back in the FS, adjust the
+ * offset and length to cover the Primitive Header, where PSR,
+ * Viewport Index & Layer are stored.
+ */
+ if (wm_prog_data->urb_setup[VARYING_SLOT_VIEWPORT] >= 0 ||
+ wm_prog_data->urb_setup[VARYING_SLOT_PRIMITIVE_SHADING_RATE] >= 0 ||
+ wm_prog_data->urb_setup[VARYING_SLOT_LAYER] >= 0 ||
+ mue->user_data_in_primitive_header) {
+ assert(sbe_mesh.PerPrimitiveURBEntryOutputReadOffset > 0);
+ sbe_mesh.PerPrimitiveURBEntryOutputReadOffset -= 1;
+ sbe_mesh.PerPrimitiveURBEntryOutputReadLength += 1;
+ }
+ }
+#endif
}
-
- /* We have to subtract two slots to accout for the URB entry output
- * read offset in the VS and GS stages.
- */
- const int source_attr = slot - 2 * urb_entry_read_offset;
- assert(source_attr >= 0 && source_attr < 32);
- max_source_attr = MAX2(max_source_attr, source_attr);
- /* The hardware can only do overrides on 16 overrides at a time, and the
- * other up to 16 have to be lined up so that the input index = the
- * output index. We'll need to do some tweaking to make sure that's the
- * case.
- */
- if (input_index < 16)
- swiz.Attribute[input_index].SourceAttribute = source_attr;
- else
- assert(source_attr == input_index);
}
-
- sbe.VertexURBEntryReadOffset = urb_entry_read_offset;
- sbe.VertexURBEntryReadLength = DIV_ROUND_UP(max_source_attr + 1, 2);
-#if GFX_VER >= 8
- sbe.ForceVertexURBEntryReadOffset = true;
- sbe.ForceVertexURBEntryReadLength = true;
-#endif
-
- uint32_t *dw = anv_batch_emit_dwords(&pipeline->base.batch,
- GENX(3DSTATE_SBE_length));
- if (!dw)
- return;
- GENX(3DSTATE_SBE_pack)(&pipeline->base.batch, dw, &sbe);
-
-#if GFX_VER >= 8
- dw = anv_batch_emit_dwords(&pipeline->base.batch, GENX(3DSTATE_SBE_SWIZ_length));
- if (!dw)
- return;
- GENX(3DSTATE_SBE_SWIZ_pack)(&pipeline->base.batch, dw, &swiz);
-#endif
+ }
}
/** Returns the final polygon mode for rasterization
@@ -451,10 +702,22 @@ emit_3dstate_sbe(struct anv_graphics_pipeline *pipeline)
* different shader stages which might generate their own type of primitives.
*/
VkPolygonMode
-genX(raster_polygon_mode)(struct anv_graphics_pipeline *pipeline,
+genX(raster_polygon_mode)(const struct anv_graphics_pipeline *pipeline,
+ VkPolygonMode polygon_mode,
VkPrimitiveTopology primitive_topology)
{
- if (anv_pipeline_has_stage(pipeline, MESA_SHADER_GEOMETRY)) {
+ if (anv_pipeline_is_mesh(pipeline)) {
+ switch (get_mesh_prog_data(pipeline)->primitive_type) {
+ case MESA_PRIM_POINTS:
+ return VK_POLYGON_MODE_POINT;
+ case MESA_PRIM_LINES:
+ return VK_POLYGON_MODE_LINE;
+ case MESA_PRIM_TRIANGLES:
+ return polygon_mode;
+ default:
+ unreachable("invalid primitive type for mesh");
+ }
+ } else if (anv_pipeline_has_stage(pipeline, MESA_SHADER_GEOMETRY)) {
switch (get_gs_prog_data(pipeline)->output_topology) {
case _3DPRIM_POINTLIST:
return VK_POLYGON_MODE_POINT;
@@ -471,20 +734,20 @@ genX(raster_polygon_mode)(struct anv_graphics_pipeline *pipeline,
case _3DPRIM_QUADLIST:
case _3DPRIM_QUADSTRIP:
case _3DPRIM_POLYGON:
- return pipeline->polygon_mode;
+ return polygon_mode;
}
unreachable("Unsupported GS output topology");
} else if (anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_EVAL)) {
switch (get_tes_prog_data(pipeline)->output_topology) {
- case BRW_TESS_OUTPUT_TOPOLOGY_POINT:
+ case INTEL_TESS_OUTPUT_TOPOLOGY_POINT:
return VK_POLYGON_MODE_POINT;
- case BRW_TESS_OUTPUT_TOPOLOGY_LINE:
+ case INTEL_TESS_OUTPUT_TOPOLOGY_LINE:
return VK_POLYGON_MODE_LINE;
- case BRW_TESS_OUTPUT_TOPOLOGY_TRI_CW:
- case BRW_TESS_OUTPUT_TOPOLOGY_TRI_CCW:
- return pipeline->polygon_mode;
+ case INTEL_TESS_OUTPUT_TOPOLOGY_TRI_CW:
+ case INTEL_TESS_OUTPUT_TOPOLOGY_TRI_CCW:
+ return polygon_mode;
}
unreachable("Unsupported TCS output topology");
} else {
@@ -503,7 +766,7 @@ genX(raster_polygon_mode)(struct anv_graphics_pipeline *pipeline,
case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_FAN:
case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST_WITH_ADJACENCY:
case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP_WITH_ADJACENCY:
- return pipeline->polygon_mode;
+ return polygon_mode;
default:
unreachable("Unsupported primitive topology");
@@ -511,42 +774,6 @@ genX(raster_polygon_mode)(struct anv_graphics_pipeline *pipeline,
}
}
-uint32_t
-genX(ms_rasterization_mode)(struct anv_graphics_pipeline *pipeline,
- VkPolygonMode raster_mode)
-{
-#if GFX_VER <= 7
- if (raster_mode == VK_POLYGON_MODE_LINE) {
- switch (pipeline->line_mode) {
- case VK_LINE_RASTERIZATION_MODE_RECTANGULAR_EXT:
- return MSRASTMODE_ON_PATTERN;
-
- case VK_LINE_RASTERIZATION_MODE_BRESENHAM_EXT:
- case VK_LINE_RASTERIZATION_MODE_RECTANGULAR_SMOOTH_EXT:
- return MSRASTMODE_OFF_PIXEL;
-
- default:
- unreachable("Unsupported line rasterization mode");
- }
- } else {
- return pipeline->rasterization_samples > 1 ?
- MSRASTMODE_ON_PATTERN : MSRASTMODE_OFF_PIXEL;
- }
-#else
- unreachable("Only on gen7");
-#endif
-}
-
-static VkProvokingVertexModeEXT
-vk_provoking_vertex_mode(const VkPipelineRasterizationStateCreateInfo *rs_info)
-{
- const VkPipelineRasterizationProvokingVertexStateCreateInfoEXT *rs_pv_info =
- vk_find_struct_const(rs_info, PIPELINE_RASTERIZATION_PROVOKING_VERTEX_STATE_CREATE_INFO_EXT);
-
- return rs_pv_info == NULL ? VK_PROVOKING_VERTEX_MODE_FIRST_VERTEX_EXT :
- rs_pv_info->provokingVertexMode;
-}
-
const uint32_t genX(vk_to_intel_cullmode)[] = {
[VK_CULL_MODE_NONE] = CULLMODE_NONE,
[VK_CULL_MODE_FRONT_BIT] = CULLMODE_FRONT,
@@ -565,302 +792,72 @@ const uint32_t genX(vk_to_intel_front_face)[] = {
[VK_FRONT_FACE_CLOCKWISE] = 0
};
-#if GFX_VER >= 9
-static VkConservativeRasterizationModeEXT
-vk_conservative_rasterization_mode(const VkPipelineRasterizationStateCreateInfo *rs_info)
-{
- const VkPipelineRasterizationConservativeStateCreateInfoEXT *cr =
- vk_find_struct_const(rs_info, PIPELINE_RASTERIZATION_CONSERVATIVE_STATE_CREATE_INFO_EXT);
-
- return cr ? cr->conservativeRasterizationMode :
- VK_CONSERVATIVE_RASTERIZATION_MODE_DISABLED_EXT;
-}
-#endif
-
-void
-genX(rasterization_mode)(VkPolygonMode raster_mode,
- VkLineRasterizationModeEXT line_mode,
- float line_width,
- uint32_t *api_mode,
- bool *msaa_rasterization_enable)
-{
-#if GFX_VER >= 8
- if (raster_mode == VK_POLYGON_MODE_LINE) {
- /* Unfortunately, configuring our line rasterization hardware on gfx8
- * and later is rather painful. Instead of giving us bits to tell the
- * hardware what line mode to use like we had on gfx7, we now have an
- * arcane combination of API Mode and MSAA enable bits which do things
- * in a table which are expected to magically put the hardware into the
- * right mode for your API. Sadly, Vulkan isn't any of the APIs the
- * hardware people thought of so nothing works the way you want it to.
- *
- * Look at the table titled "Multisample Rasterization Modes" in Vol 7
- * of the Skylake PRM for more details.
- */
- switch (line_mode) {
- case VK_LINE_RASTERIZATION_MODE_RECTANGULAR_EXT:
- *api_mode = DX100;
-#if GFX_VER <= 9
- /* Prior to ICL, the algorithm the HW uses to draw wide lines
- * doesn't quite match what the CTS expects, at least for rectangular
- * lines, so we set this to false here, making it draw parallelograms
- * instead, which work well enough.
- */
- *msaa_rasterization_enable = line_width < 1.0078125;
-#else
- *msaa_rasterization_enable = true;
-#endif
- break;
-
- case VK_LINE_RASTERIZATION_MODE_RECTANGULAR_SMOOTH_EXT:
- case VK_LINE_RASTERIZATION_MODE_BRESENHAM_EXT:
- *api_mode = DX9OGL;
- *msaa_rasterization_enable = false;
- break;
-
- default:
- unreachable("Unsupported line rasterization mode");
- }
- } else {
- *api_mode = DX100;
- *msaa_rasterization_enable = true;
- }
-#else
- unreachable("Invalid call");
-#endif
-}
-
static void
emit_rs_state(struct anv_graphics_pipeline *pipeline,
- const VkPipelineInputAssemblyStateCreateInfo *ia_info,
- const VkPipelineRasterizationStateCreateInfo *rs_info,
- const VkPipelineMultisampleStateCreateInfo *ms_info,
- const VkPipelineRasterizationLineStateCreateInfoEXT *line_info,
- const uint32_t dynamic_states,
- const struct anv_render_pass *pass,
- const struct anv_subpass *subpass,
+ const struct vk_input_assembly_state *ia,
+ const struct vk_rasterization_state *rs,
+ const struct vk_multisample_state *ms,
+ const struct vk_render_pass_state *rp,
enum intel_urb_deref_block_size urb_deref_block_size)
{
- struct GENX(3DSTATE_SF) sf = {
- GENX(3DSTATE_SF_header),
- };
-
- sf.ViewportTransformEnable = true;
- sf.StatisticsEnable = true;
- sf.VertexSubPixelPrecisionSelect = _8Bit;
- sf.AALineDistanceMode = true;
-
- switch (vk_provoking_vertex_mode(rs_info)) {
- case VK_PROVOKING_VERTEX_MODE_FIRST_VERTEX_EXT:
- sf.TriangleStripListProvokingVertexSelect = 0;
- sf.LineStripListProvokingVertexSelect = 0;
- sf.TriangleFanProvokingVertexSelect = 1;
- break;
-
- case VK_PROVOKING_VERTEX_MODE_LAST_VERTEX_EXT:
- sf.TriangleStripListProvokingVertexSelect = 2;
- sf.LineStripListProvokingVertexSelect = 1;
- sf.TriangleFanProvokingVertexSelect = 2;
- break;
-
- default:
- unreachable("Invalid provoking vertex mode");
- }
-
-#if GFX_VERx10 == 75
- sf.LineStippleEnable = line_info && line_info->stippledLineEnable;
-#endif
+ anv_pipeline_emit(pipeline, partial.sf, GENX(3DSTATE_SF), sf) {
+ sf.ViewportTransformEnable = true;
+ sf.StatisticsEnable = true;
+ sf.VertexSubPixelPrecisionSelect = _8Bit;
+ sf.AALineDistanceMode = true;
#if GFX_VER >= 12
- sf.DerefBlockSize = urb_deref_block_size;
+ sf.DerefBlockSize = urb_deref_block_size;
#endif
- const struct brw_vue_prog_data *last_vue_prog_data =
- anv_pipeline_get_last_vue_prog_data(pipeline);
+ bool point_from_shader;
+ if (anv_pipeline_is_primitive(pipeline)) {
+ const struct brw_vue_prog_data *last_vue_prog_data =
+ anv_pipeline_get_last_vue_prog_data(pipeline);
+ point_from_shader = last_vue_prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ;
+ } else {
+ assert(anv_pipeline_is_mesh(pipeline));
+ const struct brw_mesh_prog_data *mesh_prog_data = get_mesh_prog_data(pipeline);
+ point_from_shader = mesh_prog_data->map.start_dw[VARYING_SLOT_PSIZ] >= 0;
+ }
- if (last_vue_prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
- sf.PointWidthSource = Vertex;
- } else {
- sf.PointWidthSource = State;
- sf.PointWidth = 1.0;
+ if (point_from_shader) {
+ sf.PointWidthSource = Vertex;
+ } else {
+ sf.PointWidthSource = State;
+ sf.PointWidth = 1.0;
+ }
}
-#if GFX_VER >= 8
- struct GENX(3DSTATE_RASTER) raster = {
- GENX(3DSTATE_RASTER_header),
- };
-#else
-# define raster sf
-#endif
-
- VkPolygonMode raster_mode =
- genX(raster_polygon_mode)(pipeline, ia_info->topology);
- bool dynamic_primitive_topology =
- dynamic_states & ANV_CMD_DIRTY_DYNAMIC_PRIMITIVE_TOPOLOGY;
-
- /* For details on 3DSTATE_RASTER multisample state, see the BSpec table
- * "Multisample Modes State".
- */
-#if GFX_VER >= 8
- if (!dynamic_primitive_topology)
- genX(rasterization_mode)(raster_mode, pipeline->line_mode,
- rs_info->lineWidth,
- &raster.APIMode,
- &raster.DXMultisampleRasterizationEnable);
-
- /* NOTE: 3DSTATE_RASTER::ForcedSampleCount affects the BDW and SKL PMA fix
- * computations. If we ever set this bit to a different value, they will
- * need to be updated accordingly.
- */
- raster.ForcedSampleCount = FSC_NUMRASTSAMPLES_0;
- raster.ForceMultisampling = false;
-#else
- uint32_t ms_rast_mode = 0;
-
- if (!dynamic_primitive_topology)
- ms_rast_mode = genX(ms_rasterization_mode)(pipeline, raster_mode);
-
- raster.MultisampleRasterizationMode = ms_rast_mode;
-#endif
-
- raster.AntialiasingEnable =
- dynamic_primitive_topology ? 0 :
- anv_rasterization_aa_mode(raster_mode, pipeline->line_mode);
-
- raster.FrontWinding =
- dynamic_states & ANV_CMD_DIRTY_DYNAMIC_FRONT_FACE ?
- 0 : genX(vk_to_intel_front_face)[rs_info->frontFace];
- raster.CullMode =
- dynamic_states & ANV_CMD_DIRTY_DYNAMIC_CULL_MODE ?
- 0 : genX(vk_to_intel_cullmode)[rs_info->cullMode];
-
- raster.FrontFaceFillMode = genX(vk_to_intel_fillmode)[rs_info->polygonMode];
- raster.BackFaceFillMode = genX(vk_to_intel_fillmode)[rs_info->polygonMode];
- raster.ScissorRectangleEnable = true;
-
-#if GFX_VER >= 9
- /* GFX9+ splits ViewportZClipTestEnable into near and far enable bits */
- raster.ViewportZFarClipTestEnable = pipeline->depth_clip_enable;
- raster.ViewportZNearClipTestEnable = pipeline->depth_clip_enable;
-#elif GFX_VER >= 8
- raster.ViewportZClipTestEnable = pipeline->depth_clip_enable;
-#endif
-
-#if GFX_VER >= 9
- raster.ConservativeRasterizationEnable =
- vk_conservative_rasterization_mode(rs_info) !=
- VK_CONSERVATIVE_RASTERIZATION_MODE_DISABLED_EXT;
-#endif
-
- bool depth_bias_enable =
- dynamic_states & ANV_CMD_DIRTY_DYNAMIC_DEPTH_BIAS_ENABLE ?
- 0 : rs_info->depthBiasEnable;
-
- raster.GlobalDepthOffsetEnableSolid = depth_bias_enable;
- raster.GlobalDepthOffsetEnableWireframe = depth_bias_enable;
- raster.GlobalDepthOffsetEnablePoint = depth_bias_enable;
+ anv_pipeline_emit(pipeline, partial.raster, GENX(3DSTATE_RASTER), raster) {
+ /* For details on 3DSTATE_RASTER multisample state, see the BSpec table
+ * "Multisample Modes State".
+ */
+ /* NOTE: 3DSTATE_RASTER::ForcedSampleCount affects the BDW and SKL PMA fix
+ * computations. If we ever set this bit to a different value, they will
+ * need to be updated accordingly.
+ */
+ raster.ForcedSampleCount = FSC_NUMRASTSAMPLES_0;
+ raster.ForceMultisampling = false;
-#if GFX_VER == 7
- /* Gfx7 requires that we provide the depth format in 3DSTATE_SF so that it
- * can get the depth offsets correct.
- */
- if (subpass->depth_stencil_attachment) {
- VkFormat vk_format =
- pass->attachments[subpass->depth_stencil_attachment->attachment].format;
- assert(vk_format_is_depth_or_stencil(vk_format));
- if (vk_format_aspects(vk_format) & VK_IMAGE_ASPECT_DEPTH_BIT) {
- enum isl_format isl_format =
- anv_get_isl_format(&pipeline->base.device->info, vk_format,
- VK_IMAGE_ASPECT_DEPTH_BIT,
- VK_IMAGE_TILING_OPTIMAL);
- sf.DepthBufferSurfaceFormat =
- isl_format_get_depth_format(isl_format, false);
- }
+ raster.ScissorRectangleEnable = true;
}
-#endif
-
-#if GFX_VER >= 8
- GENX(3DSTATE_SF_pack)(NULL, pipeline->gfx8.sf, &sf);
- GENX(3DSTATE_RASTER_pack)(NULL, pipeline->gfx8.raster, &raster);
-#else
-# undef raster
- GENX(3DSTATE_SF_pack)(NULL, &pipeline->gfx7.sf, &sf);
-#endif
}
static void
emit_ms_state(struct anv_graphics_pipeline *pipeline,
- const VkPipelineMultisampleStateCreateInfo *info,
- uint32_t dynamic_states)
+ const struct vk_multisample_state *ms)
{
- /* Only lookup locations if the extensions is active, otherwise the default
- * ones will be used either at device initialization time or through
- * 3DSTATE_MULTISAMPLE on Gfx7/7.5 by passing NULL locations.
- */
- if (pipeline->base.device->vk.enabled_extensions.EXT_sample_locations) {
- /* If the sample locations are dynamic, 3DSTATE_MULTISAMPLE on Gfx7/7.5
- * will be emitted dynamically, so skip it here. On Gfx8+
- * 3DSTATE_SAMPLE_PATTERN will be emitted dynamically, so skip it here.
- */
- if (!(dynamic_states & ANV_CMD_DIRTY_DYNAMIC_SAMPLE_LOCATIONS)) {
-#if GFX_VER >= 8
- genX(emit_sample_pattern)(&pipeline->base.batch,
- pipeline->dynamic_state.sample_locations.samples,
- pipeline->dynamic_state.sample_locations.locations);
-#endif
- }
+ anv_pipeline_emit(pipeline, partial.ms, GENX(3DSTATE_MULTISAMPLE), ms) {
+ ms.PixelLocation = CENTER;
- genX(emit_multisample)(&pipeline->base.batch,
- pipeline->dynamic_state.sample_locations.samples,
- pipeline->dynamic_state.sample_locations.locations);
- } else {
- /* On Gfx8+ 3DSTATE_MULTISAMPLE does not hold anything we need to modify
- * for sample locations, so we don't have to emit it dynamically.
+ /* The PRM says that this bit is valid only for DX9:
+ *
+ * SW can choose to set this bit only for DX9 API. DX10/OGL API's
+ * should not have any effect by setting or not setting this bit.
*/
-#if GFX_VER >= 8
- genX(emit_multisample)(&pipeline->base.batch,
- info ? info->rasterizationSamples : 1,
- NULL);
-#endif
- }
-
- /* From the Vulkan 1.0 spec:
- * If pSampleMask is NULL, it is treated as if the mask has all bits
- * enabled, i.e. no coverage is removed from fragments.
- *
- * 3DSTATE_SAMPLE_MASK.SampleMask is 16 bits.
- */
-#if GFX_VER >= 8
- uint32_t sample_mask = 0xffff;
-#else
- uint32_t sample_mask = 0xff;
-#endif
-
- if (info && info->pSampleMask)
- sample_mask &= info->pSampleMask[0];
-
- anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_SAMPLE_MASK), sm) {
- sm.SampleMask = sample_mask;
- }
-
- pipeline->cps_state = ANV_STATE_NULL;
-#if GFX_VER >= 11
- if (!(dynamic_states & ANV_CMD_DIRTY_DYNAMIC_SHADING_RATE) &&
- pipeline->base.device->vk.enabled_extensions.KHR_fragment_shading_rate) {
-#if GFX_VER >= 12
- struct anv_device *device = pipeline->base.device;
- const uint32_t num_dwords =
- GENX(CPS_STATE_length) * 4 * pipeline->dynamic_state.viewport.count;
- pipeline->cps_state =
- anv_state_pool_alloc(&device->dynamic_state_pool, num_dwords, 32);
-#endif
-
- genX(emit_shading_rate)(&pipeline->base.batch,
- pipeline,
- pipeline->cps_state,
- &pipeline->dynamic_state);
+ ms.PixelPositionOffsetEnable = false;
}
-#endif
}
const uint32_t genX(vk_to_intel_logic_op)[] = {
@@ -882,36 +879,6 @@ const uint32_t genX(vk_to_intel_logic_op)[] = {
[VK_LOGIC_OP_SET] = LOGICOP_SET,
};
-static const uint32_t vk_to_intel_blend[] = {
- [VK_BLEND_FACTOR_ZERO] = BLENDFACTOR_ZERO,
- [VK_BLEND_FACTOR_ONE] = BLENDFACTOR_ONE,
- [VK_BLEND_FACTOR_SRC_COLOR] = BLENDFACTOR_SRC_COLOR,
- [VK_BLEND_FACTOR_ONE_MINUS_SRC_COLOR] = BLENDFACTOR_INV_SRC_COLOR,
- [VK_BLEND_FACTOR_DST_COLOR] = BLENDFACTOR_DST_COLOR,
- [VK_BLEND_FACTOR_ONE_MINUS_DST_COLOR] = BLENDFACTOR_INV_DST_COLOR,
- [VK_BLEND_FACTOR_SRC_ALPHA] = BLENDFACTOR_SRC_ALPHA,
- [VK_BLEND_FACTOR_ONE_MINUS_SRC_ALPHA] = BLENDFACTOR_INV_SRC_ALPHA,
- [VK_BLEND_FACTOR_DST_ALPHA] = BLENDFACTOR_DST_ALPHA,
- [VK_BLEND_FACTOR_ONE_MINUS_DST_ALPHA] = BLENDFACTOR_INV_DST_ALPHA,
- [VK_BLEND_FACTOR_CONSTANT_COLOR] = BLENDFACTOR_CONST_COLOR,
- [VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_COLOR]= BLENDFACTOR_INV_CONST_COLOR,
- [VK_BLEND_FACTOR_CONSTANT_ALPHA] = BLENDFACTOR_CONST_ALPHA,
- [VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_ALPHA]= BLENDFACTOR_INV_CONST_ALPHA,
- [VK_BLEND_FACTOR_SRC_ALPHA_SATURATE] = BLENDFACTOR_SRC_ALPHA_SATURATE,
- [VK_BLEND_FACTOR_SRC1_COLOR] = BLENDFACTOR_SRC1_COLOR,
- [VK_BLEND_FACTOR_ONE_MINUS_SRC1_COLOR] = BLENDFACTOR_INV_SRC1_COLOR,
- [VK_BLEND_FACTOR_SRC1_ALPHA] = BLENDFACTOR_SRC1_ALPHA,
- [VK_BLEND_FACTOR_ONE_MINUS_SRC1_ALPHA] = BLENDFACTOR_INV_SRC1_ALPHA,
-};
-
-static const uint32_t vk_to_intel_blend_op[] = {
- [VK_BLEND_OP_ADD] = BLENDFUNCTION_ADD,
- [VK_BLEND_OP_SUBTRACT] = BLENDFUNCTION_SUBTRACT,
- [VK_BLEND_OP_REVERSE_SUBTRACT] = BLENDFUNCTION_REVERSE_SUBTRACT,
- [VK_BLEND_OP_MIN] = BLENDFUNCTION_MIN,
- [VK_BLEND_OP_MAX] = BLENDFUNCTION_MAX,
-};
-
const uint32_t genX(vk_to_intel_compare_op)[] = {
[VK_COMPARE_OP_NEVER] = PREFILTEROP_NEVER,
[VK_COMPARE_OP_LESS] = PREFILTEROP_LESS,
@@ -947,656 +914,99 @@ const uint32_t genX(vk_to_intel_primitive_type)[] = {
[VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP_WITH_ADJACENCY] = _3DPRIM_TRISTRIP_ADJ,
};
-/* This function sanitizes the VkStencilOpState by looking at the compare ops
- * and trying to determine whether or not a given stencil op can ever actually
- * occur. Stencil ops which can never occur are set to VK_STENCIL_OP_KEEP.
- * This function returns true if, after sanitation, any of the stencil ops are
- * set to something other than VK_STENCIL_OP_KEEP.
- */
-static bool
-sanitize_stencil_face(VkStencilOpState *face,
- VkCompareOp depthCompareOp)
-{
- /* If compareOp is ALWAYS then the stencil test will never fail and failOp
- * will never happen. Set failOp to KEEP in this case.
- */
- if (face->compareOp == VK_COMPARE_OP_ALWAYS)
- face->failOp = VK_STENCIL_OP_KEEP;
-
- /* If compareOp is NEVER or depthCompareOp is NEVER then one of the depth
- * or stencil tests will fail and passOp will never happen.
- */
- if (face->compareOp == VK_COMPARE_OP_NEVER ||
- depthCompareOp == VK_COMPARE_OP_NEVER)
- face->passOp = VK_STENCIL_OP_KEEP;
-
- /* If compareOp is NEVER or depthCompareOp is ALWAYS then either the
- * stencil test will fail or the depth test will pass. In either case,
- * depthFailOp will never happen.
- */
- if (face->compareOp == VK_COMPARE_OP_NEVER ||
- depthCompareOp == VK_COMPARE_OP_ALWAYS)
- face->depthFailOp = VK_STENCIL_OP_KEEP;
-
- return face->failOp != VK_STENCIL_OP_KEEP ||
- face->depthFailOp != VK_STENCIL_OP_KEEP ||
- face->passOp != VK_STENCIL_OP_KEEP;
-}
-
-/* Intel hardware is fairly sensitive to whether or not depth/stencil writes
- * are enabled. In the presence of discards, it's fairly easy to get into the
- * non-promoted case which means a fairly big performance hit. From the Iron
- * Lake PRM, Vol 2, pt. 1, section 8.4.3.2, "Early Depth Test Cases":
- *
- * "Non-promoted depth (N) is active whenever the depth test can be done
- * early but it cannot determine whether or not to write source depth to
- * the depth buffer, therefore the depth write must be performed post pixel
- * shader. This includes cases where the pixel shader can kill pixels,
- * including via sampler chroma key, as well as cases where the alpha test
- * function is enabled, which kills pixels based on a programmable alpha
- * test. In this case, even if the depth test fails, the pixel cannot be
- * killed if a stencil write is indicated. Whether or not the stencil write
- * happens depends on whether or not the pixel is killed later. In these
- * cases if stencil test fails and stencil writes are off, the pixels can
- * also be killed early. If stencil writes are enabled, the pixels must be
- * treated as Computed depth (described above)."
- *
- * The same thing as mentioned in the stencil case can happen in the depth
- * case as well if it thinks it writes depth but, thanks to the depth test
- * being GL_EQUAL, the write doesn't actually matter. A little extra work
- * up-front to try and disable depth and stencil writes can make a big
- * difference.
- *
- * Unfortunately, the way depth and stencil testing is specified, there are
- * many cases where, regardless of depth/stencil writes being enabled, nothing
- * actually gets written due to some other bit of state being set. This
- * function attempts to "sanitize" the depth stencil state and disable writes
- * and sometimes even testing whenever possible.
- */
-static void
-sanitize_ds_state(VkPipelineDepthStencilStateCreateInfo *state,
- bool *stencilWriteEnable,
- VkImageAspectFlags ds_aspects)
-{
- *stencilWriteEnable = state->stencilTestEnable;
-
- /* If the depth test is disabled, we won't be writing anything. Make sure we
- * treat the test as always passing later on as well.
- *
- * Also, the Vulkan spec requires that if either depth or stencil is not
- * present, the pipeline is to act as if the test silently passes. In that
- * case we won't write either.
- */
- if (!state->depthTestEnable || !(ds_aspects & VK_IMAGE_ASPECT_DEPTH_BIT)) {
- state->depthWriteEnable = false;
- state->depthCompareOp = VK_COMPARE_OP_ALWAYS;
- }
-
- if (!(ds_aspects & VK_IMAGE_ASPECT_STENCIL_BIT)) {
- *stencilWriteEnable = false;
- state->front.compareOp = VK_COMPARE_OP_ALWAYS;
- state->back.compareOp = VK_COMPARE_OP_ALWAYS;
- }
-
- /* If the stencil test is enabled and always fails, then we will never get
- * to the depth test so we can just disable the depth test entirely.
- */
- if (state->stencilTestEnable &&
- state->front.compareOp == VK_COMPARE_OP_NEVER &&
- state->back.compareOp == VK_COMPARE_OP_NEVER) {
- state->depthTestEnable = false;
- state->depthWriteEnable = false;
- }
-
- /* If depthCompareOp is EQUAL then the value we would be writing to the
- * depth buffer is the same as the value that's already there so there's no
- * point in writing it.
- */
- if (state->depthCompareOp == VK_COMPARE_OP_EQUAL)
- state->depthWriteEnable = false;
-
- /* If the stencil ops are such that we don't actually ever modify the
- * stencil buffer, we should disable writes.
- */
- if (!sanitize_stencil_face(&state->front, state->depthCompareOp) &&
- !sanitize_stencil_face(&state->back, state->depthCompareOp))
- *stencilWriteEnable = false;
-
- /* If the depth test always passes and we never write out depth, that's the
- * same as if the depth test is disabled entirely.
- */
- if (state->depthCompareOp == VK_COMPARE_OP_ALWAYS &&
- !state->depthWriteEnable)
- state->depthTestEnable = false;
-
- /* If the stencil test always passes and we never write out stencil, that's
- * the same as if the stencil test is disabled entirely.
- */
- if (state->front.compareOp == VK_COMPARE_OP_ALWAYS &&
- state->back.compareOp == VK_COMPARE_OP_ALWAYS &&
- !*stencilWriteEnable)
- state->stencilTestEnable = false;
-}
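/* A minimal sketch of what sanitize_ds_state() does to a typical state, using
 * hypothetical input values (not taken from this change):
 *
 *    VkPipelineDepthStencilStateCreateInfo info = {
 *       .depthTestEnable   = VK_TRUE,
 *       .depthWriteEnable  = VK_TRUE,
 *       .depthCompareOp    = VK_COMPARE_OP_EQUAL,
 *       .stencilTestEnable = VK_TRUE,
 *       .front = { .compareOp = VK_COMPARE_OP_ALWAYS },   (all ops left at KEEP)
 *       .back  = { .compareOp = VK_COMPARE_OP_ALWAYS },   (all ops left at KEEP)
 *    };
 *    bool stencil_writes;
 *    sanitize_ds_state(&info, &stencil_writes,
 *                      VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT);
 *
 * Afterwards info.depthWriteEnable is false (EQUAL would write back the value
 * already in the buffer), stencil_writes is false (no op ever modifies the
 * stencil buffer) and info.stencilTestEnable is false (a test that always
 * passes and never writes is the same as no test), which keeps the hardware
 * on the promoted early-depth paths described above.
 */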
-
-static void
-emit_ds_state(struct anv_graphics_pipeline *pipeline,
- const VkPipelineDepthStencilStateCreateInfo *pCreateInfo,
- const uint32_t dynamic_states,
- const struct anv_render_pass *pass,
- const struct anv_subpass *subpass)
-{
-#if GFX_VER == 7
-# define depth_stencil_dw pipeline->gfx7.depth_stencil_state
-#elif GFX_VER == 8
-# define depth_stencil_dw pipeline->gfx8.wm_depth_stencil
-#else
-# define depth_stencil_dw pipeline->gfx9.wm_depth_stencil
-#endif
-
- if (pCreateInfo == NULL) {
- /* We're going to OR this together with the dynamic state. We need
- * to make sure it's initialized to something useful.
- */
- pipeline->writes_stencil = false;
- pipeline->stencil_test_enable = false;
- pipeline->writes_depth = false;
- pipeline->depth_test_enable = false;
- pipeline->depth_bounds_test_enable = false;
- memset(depth_stencil_dw, 0, sizeof(depth_stencil_dw));
- return;
- }
-
- VkImageAspectFlags ds_aspects = 0;
- if (subpass->depth_stencil_attachment) {
- VkFormat depth_stencil_format =
- pass->attachments[subpass->depth_stencil_attachment->attachment].format;
- ds_aspects = vk_format_aspects(depth_stencil_format);
- }
-
- VkPipelineDepthStencilStateCreateInfo info = *pCreateInfo;
- sanitize_ds_state(&info, &pipeline->writes_stencil, ds_aspects);
- pipeline->stencil_test_enable = info.stencilTestEnable;
- pipeline->writes_depth = info.depthWriteEnable;
- pipeline->depth_test_enable = info.depthTestEnable;
- pipeline->depth_bounds_test_enable = info.depthBoundsTestEnable;
-
- bool dynamic_stencil_op =
- dynamic_states & ANV_CMD_DIRTY_DYNAMIC_STENCIL_OP;
-
-#if GFX_VER <= 7
- struct GENX(DEPTH_STENCIL_STATE) depth_stencil = {
-#else
- struct GENX(3DSTATE_WM_DEPTH_STENCIL) depth_stencil = {
-#endif
- .DepthTestEnable =
- dynamic_states & ANV_CMD_DIRTY_DYNAMIC_DEPTH_TEST_ENABLE ?
- 0 : info.depthTestEnable,
-
- .DepthBufferWriteEnable =
- dynamic_states & ANV_CMD_DIRTY_DYNAMIC_DEPTH_WRITE_ENABLE ?
- 0 : info.depthWriteEnable,
-
- .DepthTestFunction =
- dynamic_states & ANV_CMD_DIRTY_DYNAMIC_DEPTH_COMPARE_OP ?
- 0 : genX(vk_to_intel_compare_op)[info.depthCompareOp],
-
- .DoubleSidedStencilEnable = true,
-
- .StencilTestEnable =
- dynamic_states & ANV_CMD_DIRTY_DYNAMIC_STENCIL_TEST_ENABLE ?
- 0 : info.stencilTestEnable,
-
- .StencilFailOp = genX(vk_to_intel_stencil_op)[info.front.failOp],
- .StencilPassDepthPassOp = genX(vk_to_intel_stencil_op)[info.front.passOp],
- .StencilPassDepthFailOp = genX(vk_to_intel_stencil_op)[info.front.depthFailOp],
- .StencilTestFunction = genX(vk_to_intel_compare_op)[info.front.compareOp],
- .BackfaceStencilFailOp = genX(vk_to_intel_stencil_op)[info.back.failOp],
- .BackfaceStencilPassDepthPassOp = genX(vk_to_intel_stencil_op)[info.back.passOp],
- .BackfaceStencilPassDepthFailOp = genX(vk_to_intel_stencil_op)[info.back.depthFailOp],
- .BackfaceStencilTestFunction = genX(vk_to_intel_compare_op)[info.back.compareOp],
- };
-
- if (dynamic_stencil_op) {
- depth_stencil.StencilFailOp = 0;
- depth_stencil.StencilPassDepthPassOp = 0;
- depth_stencil.StencilPassDepthFailOp = 0;
- depth_stencil.StencilTestFunction = 0;
- depth_stencil.BackfaceStencilFailOp = 0;
- depth_stencil.BackfaceStencilPassDepthPassOp = 0;
- depth_stencil.BackfaceStencilPassDepthFailOp = 0;
- depth_stencil.BackfaceStencilTestFunction = 0;
- }
-
-#if GFX_VER <= 7
- GENX(DEPTH_STENCIL_STATE_pack)(NULL, depth_stencil_dw, &depth_stencil);
-#else
- GENX(3DSTATE_WM_DEPTH_STENCIL_pack)(NULL, depth_stencil_dw, &depth_stencil);
-#endif
-}
-
-static bool
-is_dual_src_blend_factor(VkBlendFactor factor)
-{
- return factor == VK_BLEND_FACTOR_SRC1_COLOR ||
- factor == VK_BLEND_FACTOR_ONE_MINUS_SRC1_COLOR ||
- factor == VK_BLEND_FACTOR_SRC1_ALPHA ||
- factor == VK_BLEND_FACTOR_ONE_MINUS_SRC1_ALPHA;
-}
-
-static inline uint32_t *
-write_disabled_blend(uint32_t *state)
-{
- struct GENX(BLEND_STATE_ENTRY) entry = {
- .WriteDisableAlpha = true,
- .WriteDisableRed = true,
- .WriteDisableGreen = true,
- .WriteDisableBlue = true,
- };
- GENX(BLEND_STATE_ENTRY_pack)(NULL, state, &entry);
- return state + GENX(BLEND_STATE_ENTRY_length);
-}
-
static void
-emit_cb_state(struct anv_graphics_pipeline *pipeline,
- const VkPipelineColorBlendStateCreateInfo *info,
- const VkPipelineMultisampleStateCreateInfo *ms_info,
- uint32_t dynamic_states)
+emit_3dstate_clip(struct anv_graphics_pipeline *pipeline,
+ const struct vk_input_assembly_state *ia,
+ const struct vk_viewport_state *vp,
+ const struct vk_rasterization_state *rs)
{
- struct anv_device *device = pipeline->base.device;
const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline);
+ (void) wm_prog_data;
- struct GENX(BLEND_STATE) blend_state = {
-#if GFX_VER >= 8
- .AlphaToCoverageEnable = ms_info && ms_info->alphaToCoverageEnable,
- .AlphaToOneEnable = ms_info && ms_info->alphaToOneEnable,
-#endif
- };
-
- uint32_t surface_count = 0;
- struct anv_pipeline_bind_map *map;
- if (anv_pipeline_has_stage(pipeline, MESA_SHADER_FRAGMENT)) {
- map = &pipeline->shaders[MESA_SHADER_FRAGMENT]->bind_map;
- surface_count = map->surface_count;
- }
-
- const uint32_t num_dwords = GENX(BLEND_STATE_length) +
- GENX(BLEND_STATE_ENTRY_length) * surface_count;
- uint32_t *blend_state_start, *state_pos;
-
- if (dynamic_states & (ANV_CMD_DIRTY_DYNAMIC_COLOR_BLEND_STATE |
- ANV_CMD_DIRTY_DYNAMIC_LOGIC_OP)) {
- const struct intel_device_info *devinfo = &pipeline->base.device->info;
- blend_state_start = devinfo->ver >= 8 ?
- pipeline->gfx8.blend_state : pipeline->gfx7.blend_state;
- pipeline->blend_state = ANV_STATE_NULL;
- } else {
- pipeline->blend_state =
- anv_state_pool_alloc(&device->dynamic_state_pool, num_dwords * 4, 64);
- blend_state_start = pipeline->blend_state.map;
- }
- state_pos = blend_state_start;
-
- bool has_writeable_rt = false;
- state_pos += GENX(BLEND_STATE_length);
-#if GFX_VER >= 8
- struct GENX(BLEND_STATE_ENTRY) bs0 = { 0 };
-#endif
- for (unsigned i = 0; i < surface_count; i++) {
- struct anv_pipeline_binding *binding = &map->surface_to_descriptor[i];
-
- /* All color attachments are at the beginning of the binding table */
- if (binding->set != ANV_DESCRIPTOR_SET_COLOR_ATTACHMENTS)
- break;
-
- /* We can have at most 8 attachments */
- assert(i < MAX_RTS);
-
- if (info == NULL || binding->index >= info->attachmentCount) {
- state_pos = write_disabled_blend(state_pos);
- continue;
- }
+ anv_pipeline_emit(pipeline, partial.clip, GENX(3DSTATE_CLIP), clip) {
+ clip.ClipEnable = true;
+ clip.StatisticsEnable = true;
+ clip.EarlyCullEnable = true;
+ clip.GuardbandClipTestEnable = true;
- if ((pipeline->dynamic_state.color_writes & (1u << binding->index)) == 0) {
- state_pos = write_disabled_blend(state_pos);
- continue;
- }
+ clip.VertexSubPixelPrecisionSelect = _8Bit;
+ clip.ClipMode = CLIPMODE_NORMAL;
- const VkPipelineColorBlendAttachmentState *a =
- &info->pAttachments[binding->index];
+ clip.MinimumPointWidth = 0.125;
+ clip.MaximumPointWidth = 255.875;
- struct GENX(BLEND_STATE_ENTRY) entry = {
-#if GFX_VER < 8
- .AlphaToCoverageEnable = ms_info && ms_info->alphaToCoverageEnable,
- .AlphaToOneEnable = ms_info && ms_info->alphaToOneEnable,
-#endif
- .LogicOpEnable = info->logicOpEnable,
- .LogicOpFunction = dynamic_states & ANV_CMD_DIRTY_DYNAMIC_LOGIC_OP ?
- 0: genX(vk_to_intel_logic_op)[info->logicOp],
+ /* TODO(mesh): Multiview. */
+ if (anv_pipeline_is_primitive(pipeline)) {
+ const struct brw_vue_prog_data *last =
+ anv_pipeline_get_last_vue_prog_data(pipeline);
- /* Vulkan specification 1.2.168, VkLogicOp:
+ /* From the Vulkan 1.0.45 spec:
*
- * "Logical operations are controlled by the logicOpEnable and
- * logicOp members of VkPipelineColorBlendStateCreateInfo. If
- * logicOpEnable is VK_TRUE, then a logical operation selected by
- * logicOp is applied between each color attachment and the
- * fragment’s corresponding output value, and blending of all
- * attachments is treated as if it were disabled."
- *
- * From the Broadwell PRM Volume 2d: Command Reference: Structures:
- * BLEND_STATE_ENTRY:
- *
- * "Enabling LogicOp and Color Buffer Blending at the same time is
- * UNDEFINED"
+ * "If the last active vertex processing stage shader entry
+ * point's interface does not include a variable decorated with
+ * ViewportIndex, then the first viewport is used."
*/
- .ColorBufferBlendEnable = !info->logicOpEnable && a->blendEnable,
- .ColorClampRange = COLORCLAMP_RTFORMAT,
- .PreBlendColorClampEnable = true,
- .PostBlendColorClampEnable = true,
- .SourceBlendFactor = vk_to_intel_blend[a->srcColorBlendFactor],
- .DestinationBlendFactor = vk_to_intel_blend[a->dstColorBlendFactor],
- .ColorBlendFunction = vk_to_intel_blend_op[a->colorBlendOp],
- .SourceAlphaBlendFactor = vk_to_intel_blend[a->srcAlphaBlendFactor],
- .DestinationAlphaBlendFactor = vk_to_intel_blend[a->dstAlphaBlendFactor],
- .AlphaBlendFunction = vk_to_intel_blend_op[a->alphaBlendOp],
- .WriteDisableAlpha = !(a->colorWriteMask & VK_COLOR_COMPONENT_A_BIT),
- .WriteDisableRed = !(a->colorWriteMask & VK_COLOR_COMPONENT_R_BIT),
- .WriteDisableGreen = !(a->colorWriteMask & VK_COLOR_COMPONENT_G_BIT),
- .WriteDisableBlue = !(a->colorWriteMask & VK_COLOR_COMPONENT_B_BIT),
- };
-
- if (a->srcColorBlendFactor != a->srcAlphaBlendFactor ||
- a->dstColorBlendFactor != a->dstAlphaBlendFactor ||
- a->colorBlendOp != a->alphaBlendOp) {
-#if GFX_VER >= 8
- blend_state.IndependentAlphaBlendEnable = true;
-#else
- entry.IndependentAlphaBlendEnable = true;
-#endif
- }
-
- /* The Dual Source Blending documentation says:
- *
- * "If SRC1 is included in a src/dst blend factor and
- * a DualSource RT Write message is not used, results
- * are UNDEFINED. (This reflects the same restriction in DX APIs,
- * where undefined results are produced if “o1” is not written
- * by a PS – there are no default values defined)."
- *
- * There is no way to gracefully fix this undefined situation
- * so we just disable the blending to prevent possible issues.
- */
- if (!wm_prog_data->dual_src_blend &&
- (is_dual_src_blend_factor(a->srcColorBlendFactor) ||
- is_dual_src_blend_factor(a->dstColorBlendFactor) ||
- is_dual_src_blend_factor(a->srcAlphaBlendFactor) ||
- is_dual_src_blend_factor(a->dstAlphaBlendFactor))) {
- vk_debug_report(&device->physical->instance->vk,
- VK_DEBUG_REPORT_WARNING_BIT_EXT,
- &device->vk.base, 0, 0, "anv",
- "Enabled dual-src blend factors without writing both targets "
- "in the shader. Disabling blending to avoid GPU hangs.");
- entry.ColorBufferBlendEnable = false;
- }
+ if (vp && (last->vue_map.slots_valid & VARYING_BIT_VIEWPORT)) {
+ clip.MaximumVPIndex = vp->viewport_count > 0 ?
+ vp->viewport_count - 1 : 0;
+ } else {
+ clip.MaximumVPIndex = 0;
+ }
- if (a->colorWriteMask != 0)
- has_writeable_rt = true;
+ /* From the Vulkan 1.0.45 spec:
+ *
+ * "If the last active vertex processing stage shader entry point's
+ * interface does not include a variable decorated with Layer, then
+ * the first layer is used."
+ */
+ clip.ForceZeroRTAIndexEnable =
+ !(last->vue_map.slots_valid & VARYING_BIT_LAYER);
+
+ } else if (anv_pipeline_is_mesh(pipeline)) {
+ const struct brw_mesh_prog_data *mesh_prog_data = get_mesh_prog_data(pipeline);
+ if (vp && vp->viewport_count > 0 &&
+ mesh_prog_data->map.start_dw[VARYING_SLOT_VIEWPORT] >= 0) {
+ clip.MaximumVPIndex = vp->viewport_count - 1;
+ } else {
+ clip.MaximumVPIndex = 0;
+ }
- /* Our hardware applies the blend factor prior to the blend function
- * regardless of what function is used. Technically, this means the
- * hardware can do MORE than GL or Vulkan specify. However, it also
- * means that, for MIN and MAX, we have to stomp the blend factor to
- * ONE to make it a no-op.
- */
- if (a->colorBlendOp == VK_BLEND_OP_MIN ||
- a->colorBlendOp == VK_BLEND_OP_MAX) {
- entry.SourceBlendFactor = BLENDFACTOR_ONE;
- entry.DestinationBlendFactor = BLENDFACTOR_ONE;
- }
- if (a->alphaBlendOp == VK_BLEND_OP_MIN ||
- a->alphaBlendOp == VK_BLEND_OP_MAX) {
- entry.SourceAlphaBlendFactor = BLENDFACTOR_ONE;
- entry.DestinationAlphaBlendFactor = BLENDFACTOR_ONE;
+ clip.ForceZeroRTAIndexEnable =
+ mesh_prog_data->map.start_dw[VARYING_SLOT_LAYER] < 0;
}
- GENX(BLEND_STATE_ENTRY_pack)(NULL, state_pos, &entry);
- state_pos += GENX(BLEND_STATE_ENTRY_length);
-#if GFX_VER >= 8
- if (i == 0)
- bs0 = entry;
-#endif
- }
-#if GFX_VER >= 8
- struct GENX(3DSTATE_PS_BLEND) blend = {
- GENX(3DSTATE_PS_BLEND_header),
- };
- blend.AlphaToCoverageEnable = blend_state.AlphaToCoverageEnable;
- blend.HasWriteableRT = has_writeable_rt;
- blend.ColorBufferBlendEnable = bs0.ColorBufferBlendEnable;
- blend.SourceAlphaBlendFactor = bs0.SourceAlphaBlendFactor;
- blend.DestinationAlphaBlendFactor = bs0.DestinationAlphaBlendFactor;
- blend.SourceBlendFactor = bs0.SourceBlendFactor;
- blend.DestinationBlendFactor = bs0.DestinationBlendFactor;
- blend.AlphaTestEnable = false;
- blend.IndependentAlphaBlendEnable = blend_state.IndependentAlphaBlendEnable;
-
- if (dynamic_states & (ANV_CMD_DIRTY_DYNAMIC_COLOR_BLEND_STATE |
- ANV_CMD_DIRTY_DYNAMIC_LOGIC_OP)) {
- GENX(3DSTATE_PS_BLEND_pack)(NULL, pipeline->gfx8.ps_blend, &blend);
- } else {
- anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_PS_BLEND), _blend)
- _blend = blend;
+ clip.NonPerspectiveBarycentricEnable = wm_prog_data ?
+ wm_prog_data->uses_nonperspective_interp_modes : 0;
}
-#else
- (void)has_writeable_rt;
-#endif
- GENX(BLEND_STATE_pack)(NULL, blend_state_start, &blend_state);
-
- if (!(dynamic_states & (ANV_CMD_DIRTY_DYNAMIC_COLOR_BLEND_STATE |
- ANV_CMD_DIRTY_DYNAMIC_LOGIC_OP))) {
- anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_BLEND_STATE_POINTERS), bsp) {
- bsp.BlendStatePointer = pipeline->blend_state.offset;
-#if GFX_VER >= 8
- bsp.BlendStatePointerValid = true;
-#endif
+#if GFX_VERx10 >= 125
+ if (anv_pipeline_is_mesh(pipeline)) {
+ const struct brw_mesh_prog_data *mesh_prog_data = get_mesh_prog_data(pipeline);
+ anv_pipeline_emit(pipeline, final.clip_mesh,
+ GENX(3DSTATE_CLIP_MESH), clip_mesh) {
+ clip_mesh.PrimitiveHeaderEnable = mesh_prog_data->map.per_primitive_header_size_dw > 0;
+ clip_mesh.UserClipDistanceClipTestEnableBitmask = mesh_prog_data->clip_distance_mask;
+ clip_mesh.UserClipDistanceCullTestEnableBitmask = mesh_prog_data->cull_distance_mask;
}
}
-}
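/* For reference, a hypothetical attachment state (values are illustrative
 * only, not from this change) that trips the dual-source fallback above when
 * the fragment shader does not also write the second color output:
 *
 *    const VkPipelineColorBlendAttachmentState att = {
 *       .blendEnable         = VK_TRUE,
 *       .srcColorBlendFactor = VK_BLEND_FACTOR_SRC1_COLOR,
 *       .dstColorBlendFactor = VK_BLEND_FACTOR_ONE_MINUS_SRC1_COLOR,
 *       .colorBlendOp        = VK_BLEND_OP_ADD,
 *       .srcAlphaBlendFactor = VK_BLEND_FACTOR_SRC1_ALPHA,
 *       .dstAlphaBlendFactor = VK_BLEND_FACTOR_ONE_MINUS_SRC1_ALPHA,
 *       .alphaBlendOp        = VK_BLEND_OP_ADD,
 *       .colorWriteMask      = VK_COLOR_COMPONENT_R_BIT |
 *                              VK_COLOR_COMPONENT_G_BIT |
 *                              VK_COLOR_COMPONENT_B_BIT |
 *                              VK_COLOR_COMPONENT_A_BIT,
 *    };
 *
 * is_dual_src_blend_factor() is true for all four factors here, so unless
 * wm_prog_data->dual_src_blend is set the entry's ColorBufferBlendEnable is
 * forced off and a warning is reported instead of risking a GPU hang.
 */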
-
-static void
-emit_3dstate_clip(struct anv_graphics_pipeline *pipeline,
- const VkPipelineInputAssemblyStateCreateInfo *ia_info,
- const VkPipelineViewportStateCreateInfo *vp_info,
- const VkPipelineRasterizationStateCreateInfo *rs_info,
- const uint32_t dynamic_states)
-{
- const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline);
- (void) wm_prog_data;
-
- struct GENX(3DSTATE_CLIP) clip = {
- GENX(3DSTATE_CLIP_header),
- };
-
- clip.ClipEnable = true;
- clip.StatisticsEnable = true;
- clip.EarlyCullEnable = true;
- clip.APIMode = APIMODE_D3D;
- clip.GuardbandClipTestEnable = true;
-
- /* Only enable the XY clip test when the final polygon rasterization
- * mode is VK_POLYGON_MODE_FILL. We want to leave it disabled for
- * points and lines so we get "pop-free" clipping.
- */
- VkPolygonMode raster_mode =
- genX(raster_polygon_mode)(pipeline, ia_info->topology);
- clip.ViewportXYClipTestEnable =
- dynamic_states & ANV_CMD_DIRTY_DYNAMIC_PRIMITIVE_TOPOLOGY ?
- 0 : (raster_mode == VK_POLYGON_MODE_FILL);
-
-#if GFX_VER >= 8
- clip.VertexSubPixelPrecisionSelect = _8Bit;
-#endif
- clip.ClipMode = CLIPMODE_NORMAL;
-
- switch (vk_provoking_vertex_mode(rs_info)) {
- case VK_PROVOKING_VERTEX_MODE_FIRST_VERTEX_EXT:
- clip.TriangleStripListProvokingVertexSelect = 0;
- clip.LineStripListProvokingVertexSelect = 0;
- clip.TriangleFanProvokingVertexSelect = 1;
- break;
-
- case VK_PROVOKING_VERTEX_MODE_LAST_VERTEX_EXT:
- clip.TriangleStripListProvokingVertexSelect = 2;
- clip.LineStripListProvokingVertexSelect = 1;
- clip.TriangleFanProvokingVertexSelect = 2;
- break;
-
- default:
- unreachable("Invalid provoking vertex mode");
- }
-
- clip.MinimumPointWidth = 0.125;
- clip.MaximumPointWidth = 255.875;
-
- const struct brw_vue_prog_data *last =
- anv_pipeline_get_last_vue_prog_data(pipeline);
-
- /* From the Vulkan 1.0.45 spec:
- *
- * "If the last active vertex processing stage shader entry point's
- * interface does not include a variable decorated with
- * ViewportIndex, then the first viewport is used."
- */
- if (vp_info && (last->vue_map.slots_valid & VARYING_BIT_VIEWPORT)) {
- clip.MaximumVPIndex = vp_info->viewportCount > 0 ?
- vp_info->viewportCount - 1 : 0;
- } else {
- clip.MaximumVPIndex = 0;
- }
-
- /* From the Vulkan 1.0.45 spec:
- *
- * "If the last active vertex processing stage shader entry point's
- * interface does not include a variable decorated with Layer, then
- * the first layer is used."
- */
- clip.ForceZeroRTAIndexEnable =
- !(last->vue_map.slots_valid & VARYING_BIT_LAYER);
-
-#if GFX_VER == 7
- clip.FrontWinding = genX(vk_to_intel_front_face)[rs_info->frontFace];
- clip.CullMode = genX(vk_to_intel_cullmode)[rs_info->cullMode];
- clip.ViewportZClipTestEnable = pipeline->depth_clip_enable;
- clip.UserClipDistanceClipTestEnableBitmask = last->clip_distance_mask;
- clip.UserClipDistanceCullTestEnableBitmask = last->cull_distance_mask;
-#else
- clip.NonPerspectiveBarycentricEnable = wm_prog_data ?
- (wm_prog_data->barycentric_interp_modes &
- BRW_BARYCENTRIC_NONPERSPECTIVE_BITS) != 0 : 0;
#endif
-
- GENX(3DSTATE_CLIP_pack)(NULL, pipeline->gfx7.clip, &clip);
}
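/* A small worked example for the viewport/layer programming above, with
 * hypothetical state (not from this change): if the last pre-rasterization
 * stage writes ViewportIndex and vp->viewport_count == 4, MaximumVPIndex is
 * programmed to 3; if that stage does not write Layer,
 * ForceZeroRTAIndexEnable is set so all primitives land in render target
 * array index 0, matching the "first layer is used" language quoted above.
 */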
static void
emit_3dstate_streamout(struct anv_graphics_pipeline *pipeline,
- const VkPipelineRasterizationStateCreateInfo *rs_info,
- const uint32_t dynamic_states)
+ const struct vk_rasterization_state *rs)
{
const struct brw_vue_prog_data *prog_data =
anv_pipeline_get_last_vue_prog_data(pipeline);
- const struct brw_vue_map *vue_map = &prog_data->vue_map;
+ const struct intel_vue_map *vue_map = &prog_data->vue_map;
nir_xfb_info *xfb_info;
if (anv_pipeline_has_stage(pipeline, MESA_SHADER_GEOMETRY))
- xfb_info = pipeline->shaders[MESA_SHADER_GEOMETRY]->xfb_info;
+ xfb_info = pipeline->base.shaders[MESA_SHADER_GEOMETRY]->xfb_info;
else if (anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_EVAL))
- xfb_info = pipeline->shaders[MESA_SHADER_TESS_EVAL]->xfb_info;
+ xfb_info = pipeline->base.shaders[MESA_SHADER_TESS_EVAL]->xfb_info;
else
- xfb_info = pipeline->shaders[MESA_SHADER_VERTEX]->xfb_info;
-
-#if GFX_VER == 7
-# define streamout_state_dw pipeline->gfx7.streamout_state
-#else
-# define streamout_state_dw pipeline->gfx8.streamout_state
-#endif
-
- struct GENX(3DSTATE_STREAMOUT) so = {
- GENX(3DSTATE_STREAMOUT_header),
- .RenderingDisable =
- (dynamic_states & ANV_CMD_DIRTY_DYNAMIC_RASTERIZER_DISCARD_ENABLE) ?
- 0 : rs_info->rasterizerDiscardEnable,
- };
-
- if (xfb_info) {
- so.SOFunctionEnable = true;
- so.SOStatisticsEnable = true;
-
- switch (vk_provoking_vertex_mode(rs_info)) {
- case VK_PROVOKING_VERTEX_MODE_FIRST_VERTEX_EXT:
- so.ReorderMode = LEADING;
- break;
-
- case VK_PROVOKING_VERTEX_MODE_LAST_VERTEX_EXT:
- so.ReorderMode = TRAILING;
- break;
-
- default:
- unreachable("Invalid provoking vertex mode");
- }
-
- const VkPipelineRasterizationStateStreamCreateInfoEXT *stream_info =
- vk_find_struct_const(rs_info, PIPELINE_RASTERIZATION_STATE_STREAM_CREATE_INFO_EXT);
- so.RenderStreamSelect = stream_info ?
- stream_info->rasterizationStream : 0;
-
-#if GFX_VER >= 8
- so.Buffer0SurfacePitch = xfb_info->buffers[0].stride;
- so.Buffer1SurfacePitch = xfb_info->buffers[1].stride;
- so.Buffer2SurfacePitch = xfb_info->buffers[2].stride;
- so.Buffer3SurfacePitch = xfb_info->buffers[3].stride;
-#else
- pipeline->gfx7.xfb_bo_pitch[0] = xfb_info->buffers[0].stride;
- pipeline->gfx7.xfb_bo_pitch[1] = xfb_info->buffers[1].stride;
- pipeline->gfx7.xfb_bo_pitch[2] = xfb_info->buffers[2].stride;
- pipeline->gfx7.xfb_bo_pitch[3] = xfb_info->buffers[3].stride;
-
- /* On Gfx7, the SO buffer enables live in 3DSTATE_STREAMOUT which
- * is a bit inconvenient because we don't know what buffers will
- * actually be enabled until draw time. We do our best here by
- * setting them based on buffers_written and we disable them
- * as-needed at draw time by setting EndAddress = BaseAddress.
- */
- so.SOBufferEnable0 = xfb_info->buffers_written & (1 << 0);
- so.SOBufferEnable1 = xfb_info->buffers_written & (1 << 1);
- so.SOBufferEnable2 = xfb_info->buffers_written & (1 << 2);
- so.SOBufferEnable3 = xfb_info->buffers_written & (1 << 3);
-#endif
-
- int urb_entry_read_offset = 0;
- int urb_entry_read_length =
- (prog_data->vue_map.num_slots + 1) / 2 - urb_entry_read_offset;
-
- /* We always read the whole vertex. This could be reduced at some
- * point by reading less and offsetting the register index in the
- * SO_DECLs.
- */
- so.Stream0VertexReadOffset = urb_entry_read_offset;
- so.Stream0VertexReadLength = urb_entry_read_length - 1;
- so.Stream1VertexReadOffset = urb_entry_read_offset;
- so.Stream1VertexReadLength = urb_entry_read_length - 1;
- so.Stream2VertexReadOffset = urb_entry_read_offset;
- so.Stream2VertexReadLength = urb_entry_read_length - 1;
- so.Stream3VertexReadOffset = urb_entry_read_offset;
- so.Stream3VertexReadLength = urb_entry_read_length - 1;
- }
-
- if (dynamic_states & ANV_CMD_DIRTY_DYNAMIC_RASTERIZER_DISCARD_ENABLE) {
- GENX(3DSTATE_STREAMOUT_pack)(NULL, streamout_state_dw, &so);
- } else {
- anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_STREAMOUT), _so)
- _so = so;
- }
+ xfb_info = pipeline->base.shaders[MESA_SHADER_VERTEX]->xfb_info;
if (xfb_info) {
struct GENX(SO_DECL) so_decl[MAX_XFB_STREAMS][128];
@@ -1679,16 +1089,17 @@ emit_3dstate_streamout(struct anv_graphics_pipeline *pipeline,
sbs[xfb_info->buffer_to_stream[b]] |= 1 << b;
}
- uint32_t *dw = anv_batch_emitn(&pipeline->base.batch, 3 + 2 * max_decls,
- GENX(3DSTATE_SO_DECL_LIST),
- .StreamtoBufferSelects0 = sbs[0],
- .StreamtoBufferSelects1 = sbs[1],
- .StreamtoBufferSelects2 = sbs[2],
- .StreamtoBufferSelects3 = sbs[3],
- .NumEntries0 = decls[0],
- .NumEntries1 = decls[1],
- .NumEntries2 = decls[2],
- .NumEntries3 = decls[3]);
+ uint32_t *dw = anv_pipeline_emitn(pipeline, final.so_decl_list,
+ 3 + 2 * max_decls,
+ GENX(3DSTATE_SO_DECL_LIST),
+ .StreamtoBufferSelects0 = sbs[0],
+ .StreamtoBufferSelects1 = sbs[1],
+ .StreamtoBufferSelects2 = sbs[2],
+ .StreamtoBufferSelects3 = sbs[3],
+ .NumEntries0 = decls[0],
+ .NumEntries1 = decls[1],
+ .NumEntries2 = decls[2],
+ .NumEntries3 = decls[3]);
for (int i = 0; i < max_decls; i++) {
GENX(SO_DECL_ENTRY_pack)(NULL, dw + 3 + i * 2,
@@ -1700,6 +1111,37 @@ emit_3dstate_streamout(struct anv_graphics_pipeline *pipeline,
});
}
}
+
+ anv_pipeline_emit(pipeline, partial.so, GENX(3DSTATE_STREAMOUT), so) {
+ if (xfb_info) {
+ pipeline->uses_xfb = true;
+
+ so.SOFunctionEnable = true;
+ so.SOStatisticsEnable = true;
+
+ so.Buffer0SurfacePitch = xfb_info->buffers[0].stride;
+ so.Buffer1SurfacePitch = xfb_info->buffers[1].stride;
+ so.Buffer2SurfacePitch = xfb_info->buffers[2].stride;
+ so.Buffer3SurfacePitch = xfb_info->buffers[3].stride;
+
+ int urb_entry_read_offset = 0;
+ int urb_entry_read_length =
+ (prog_data->vue_map.num_slots + 1) / 2 - urb_entry_read_offset;
+
+ /* We always read the whole vertex. This could be reduced at some
+ * point by reading less and offsetting the register index in the
+ * SO_DECLs.
+ */
+ so.Stream0VertexReadOffset = urb_entry_read_offset;
+ so.Stream0VertexReadLength = urb_entry_read_length - 1;
+ so.Stream1VertexReadOffset = urb_entry_read_offset;
+ so.Stream1VertexReadLength = urb_entry_read_length - 1;
+ so.Stream2VertexReadOffset = urb_entry_read_offset;
+ so.Stream2VertexReadLength = urb_entry_read_length - 1;
+ so.Stream3VertexReadOffset = urb_entry_read_offset;
+ so.Stream3VertexReadLength = urb_entry_read_length - 1;
+ }
+ }
}
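/* Worked example for the vertex read programming above (hypothetical VUE
 * layout, not from this change): a last-stage VUE map with num_slots == 9
 * gives urb_entry_read_length == (9 + 1) / 2 == 5, i.e. the streamout unit
 * reads five 256-bit URB rows (two VUE slots each) per vertex, and every
 * StreamNVertexReadLength field is programmed with 5 - 1 == 4.
 */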
static uint32_t
@@ -1735,8 +1177,17 @@ get_scratch_space(const struct anv_shader_bin *bin)
static UNUSED uint32_t
get_scratch_surf(struct anv_pipeline *pipeline,
+ gl_shader_stage stage,
const struct anv_shader_bin *bin)
{
+ if (bin->prog_data->total_scratch == 0)
+ return 0;
+
+ struct anv_bo *bo =
+ anv_scratch_pool_alloc(pipeline->device,
+ &pipeline->device->scratch_pool,
+ stage, bin->prog_data->total_scratch);
+ anv_reloc_list_add_bo(pipeline->batch.relocs, bo);
return anv_scratch_pool_get_surf(pipeline->device,
&pipeline->device->scratch_pool,
bin->prog_data->total_scratch) >> 4;
@@ -1745,18 +1196,18 @@ get_scratch_surf(struct anv_pipeline *pipeline,
static void
emit_3dstate_vs(struct anv_graphics_pipeline *pipeline)
{
- const struct intel_device_info *devinfo = &pipeline->base.device->info;
+ const struct intel_device_info *devinfo = pipeline->base.base.device->info;
const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
const struct anv_shader_bin *vs_bin =
- pipeline->shaders[MESA_SHADER_VERTEX];
+ pipeline->base.shaders[MESA_SHADER_VERTEX];
assert(anv_pipeline_has_stage(pipeline, MESA_SHADER_VERTEX));
- anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_VS), vs) {
+ anv_pipeline_emit(pipeline, final.vs, GENX(3DSTATE_VS), vs) {
vs.Enable = true;
vs.StatisticsEnable = true;
vs.KernelStartPointer = vs_bin->kernel.offset;
-#if GFX_VER >= 8
+#if GFX_VER < 20
vs.SIMD8DispatchEnable =
vs_prog_data->base.dispatch_mode == DISPATCH_MODE_SIMD8;
#endif
@@ -1785,7 +1236,7 @@ emit_3dstate_vs(struct anv_graphics_pipeline *pipeline)
* but the Haswell docs for the "VS Reference Count Full Force Miss
* Enable" field of the "Thread Mode" register refer to a HSW bug in
* which the VUE handle reference count would overflow resulting in
- * internal reference counting bugs. My (Jason's) best guess is that
+ * internal reference counting bugs. My (Faith's) best guess is that
* this bug cropped back up on SKL GT4 when we suddenly had more
* threads in play than any previous gfx9 hardware.
*
@@ -1805,44 +1256,42 @@ emit_3dstate_vs(struct anv_graphics_pipeline *pipeline)
vs.DispatchGRFStartRegisterForURBData =
vs_prog_data->base.base.dispatch_grf_start_reg;
-#if GFX_VER >= 8
vs.UserClipDistanceClipTestEnableBitmask =
vs_prog_data->base.clip_distance_mask;
vs.UserClipDistanceCullTestEnableBitmask =
vs_prog_data->base.cull_distance_mask;
-#endif
#if GFX_VERx10 >= 125
- vs.ScratchSpaceBuffer = get_scratch_surf(&pipeline->base, vs_bin);
+ vs.ScratchSpaceBuffer =
+ get_scratch_surf(&pipeline->base.base, MESA_SHADER_VERTEX, vs_bin);
#else
vs.PerThreadScratchSpace = get_scratch_space(vs_bin);
vs.ScratchSpaceBasePointer =
- get_scratch_address(&pipeline->base, MESA_SHADER_VERTEX, vs_bin);
+ get_scratch_address(&pipeline->base.base, MESA_SHADER_VERTEX, vs_bin);
#endif
}
}
static void
-emit_3dstate_hs_te_ds(struct anv_graphics_pipeline *pipeline,
- const VkPipelineTessellationStateCreateInfo *tess_info)
+emit_3dstate_hs_ds(struct anv_graphics_pipeline *pipeline,
+ const struct vk_tessellation_state *ts)
{
if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_EVAL)) {
- anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_HS), hs);
- anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_TE), te);
- anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_DS), ds);
+ anv_pipeline_emit(pipeline, final.hs, GENX(3DSTATE_HS), hs);
+ anv_pipeline_emit(pipeline, final.ds, GENX(3DSTATE_DS), ds);
return;
}
- const struct intel_device_info *devinfo = &pipeline->base.device->info;
+ const struct intel_device_info *devinfo = pipeline->base.base.device->info;
const struct anv_shader_bin *tcs_bin =
- pipeline->shaders[MESA_SHADER_TESS_CTRL];
+ pipeline->base.shaders[MESA_SHADER_TESS_CTRL];
const struct anv_shader_bin *tes_bin =
- pipeline->shaders[MESA_SHADER_TESS_EVAL];
+ pipeline->base.shaders[MESA_SHADER_TESS_EVAL];
const struct brw_tcs_prog_data *tcs_prog_data = get_tcs_prog_data(pipeline);
const struct brw_tes_prog_data *tes_prog_data = get_tes_prog_data(pipeline);
- anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_HS), hs) {
+ anv_pipeline_emit(pipeline, final.hs, GENX(3DSTATE_HS), hs) {
hs.Enable = true;
hs.StatisticsEnable = true;
hs.KernelStartPointer = tcs_bin->kernel.offset;
@@ -1874,11 +1323,12 @@ emit_3dstate_hs_te_ds(struct anv_graphics_pipeline *pipeline,
#endif
#if GFX_VERx10 >= 125
- hs.ScratchSpaceBuffer = get_scratch_surf(&pipeline->base, tcs_bin);
+ hs.ScratchSpaceBuffer =
+ get_scratch_surf(&pipeline->base.base, MESA_SHADER_TESS_CTRL, tcs_bin);
#else
hs.PerThreadScratchSpace = get_scratch_space(tcs_bin);
hs.ScratchSpaceBasePointer =
- get_scratch_address(&pipeline->base, MESA_SHADER_TESS_CTRL, tcs_bin);
+ get_scratch_address(&pipeline->base.base, MESA_SHADER_TESS_CTRL, tcs_bin);
#endif
#if GFX_VER == 12
@@ -1888,42 +1338,13 @@ emit_3dstate_hs_te_ds(struct anv_graphics_pipeline *pipeline,
hs.PatchCountThreshold = tcs_prog_data->patch_count_threshold;
#endif
-#if GFX_VER >= 9
+#if GFX_VER < 20
hs.DispatchMode = tcs_prog_data->base.dispatch_mode;
- hs.IncludePrimitiveID = tcs_prog_data->include_primitive_id;
#endif
- }
-
- const VkPipelineTessellationDomainOriginStateCreateInfo *domain_origin_state =
- tess_info ? vk_find_struct_const(tess_info, PIPELINE_TESSELLATION_DOMAIN_ORIGIN_STATE_CREATE_INFO) : NULL;
-
- VkTessellationDomainOrigin uv_origin =
- domain_origin_state ? domain_origin_state->domainOrigin :
- VK_TESSELLATION_DOMAIN_ORIGIN_UPPER_LEFT;
-
- anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_TE), te) {
- te.Partitioning = tes_prog_data->partitioning;
-
- if (uv_origin == VK_TESSELLATION_DOMAIN_ORIGIN_LOWER_LEFT) {
- te.OutputTopology = tes_prog_data->output_topology;
- } else {
- /* When the origin is upper-left, we have to flip the winding order */
- if (tes_prog_data->output_topology == OUTPUT_TRI_CCW) {
- te.OutputTopology = OUTPUT_TRI_CW;
- } else if (tes_prog_data->output_topology == OUTPUT_TRI_CW) {
- te.OutputTopology = OUTPUT_TRI_CCW;
- } else {
- te.OutputTopology = tes_prog_data->output_topology;
- }
- }
-
- te.TEDomain = tes_prog_data->domain;
- te.TEEnable = true;
- te.MaximumTessellationFactorOdd = 63.0;
- te.MaximumTessellationFactorNotOdd = 64.0;
- }
+ hs.IncludePrimitiveID = tcs_prog_data->include_primitive_id;
+   }
- anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_DS), ds) {
+ anv_pipeline_emit(pipeline, final.ds, GENX(3DSTATE_DS), ds) {
ds.Enable = true;
ds.StatisticsEnable = true;
ds.KernelStartPointer = tes_bin->kernel.offset;
@@ -1933,21 +1354,20 @@ emit_3dstate_hs_te_ds(struct anv_graphics_pipeline *pipeline,
ds.MaximumNumberofThreads = devinfo->max_tes_threads - 1;
ds.ComputeWCoordinateEnable =
- tes_prog_data->domain == BRW_TESS_DOMAIN_TRI;
+ tes_prog_data->domain == INTEL_TESS_DOMAIN_TRI;
ds.PatchURBEntryReadLength = tes_prog_data->base.urb_read_length;
ds.PatchURBEntryReadOffset = 0;
ds.DispatchGRFStartRegisterForURBData =
tes_prog_data->base.base.dispatch_grf_start_reg;
-#if GFX_VER >= 8
#if GFX_VER < 11
ds.DispatchMode =
tes_prog_data->base.dispatch_mode == DISPATCH_MODE_SIMD8 ?
- DISPATCH_MODE_SIMD8_SINGLE_PATCH :
- DISPATCH_MODE_SIMD4X2;
+ DISPATCH_MODE_SIMD8_SINGLE_PATCH :
+ DISPATCH_MODE_SIMD4X2;
#else
- assert(tes_prog_data->base.dispatch_mode == DISPATCH_MODE_SIMD8);
+ assert(tes_prog_data->base.dispatch_mode == INTEL_DISPATCH_MODE_SIMD8);
ds.DispatchMode = DISPATCH_MODE_SIMD8_SINGLE_PATCH;
#endif
@@ -1955,37 +1375,105 @@ emit_3dstate_hs_te_ds(struct anv_graphics_pipeline *pipeline,
tes_prog_data->base.clip_distance_mask;
ds.UserClipDistanceCullTestEnableBitmask =
tes_prog_data->base.cull_distance_mask;
-#endif
+#if GFX_VER >= 12
+ ds.PrimitiveIDNotRequired = !tes_prog_data->include_primitive_id;
+#endif
#if GFX_VERx10 >= 125
- ds.ScratchSpaceBuffer = get_scratch_surf(&pipeline->base, tes_bin);
+ ds.ScratchSpaceBuffer =
+ get_scratch_surf(&pipeline->base.base, MESA_SHADER_TESS_EVAL, tes_bin);
#else
ds.PerThreadScratchSpace = get_scratch_space(tes_bin);
ds.ScratchSpaceBasePointer =
- get_scratch_address(&pipeline->base, MESA_SHADER_TESS_EVAL, tes_bin);
+ get_scratch_address(&pipeline->base.base, MESA_SHADER_TESS_EVAL, tes_bin);
#endif
}
}
+static UNUSED bool
+geom_or_tess_prim_id_used(struct anv_graphics_pipeline *pipeline)
+{
+ const struct brw_tcs_prog_data *tcs_prog_data =
+ anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_CTRL) ?
+ get_tcs_prog_data(pipeline) : NULL;
+ const struct brw_tes_prog_data *tes_prog_data =
+ anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_EVAL) ?
+ get_tes_prog_data(pipeline) : NULL;
+ const struct brw_gs_prog_data *gs_prog_data =
+ anv_pipeline_has_stage(pipeline, MESA_SHADER_GEOMETRY) ?
+ get_gs_prog_data(pipeline) : NULL;
+
+ return (tcs_prog_data && tcs_prog_data->include_primitive_id) ||
+ (tes_prog_data && tes_prog_data->include_primitive_id) ||
+ (gs_prog_data && gs_prog_data->include_primitive_id);
+}
+
static void
-emit_3dstate_gs(struct anv_graphics_pipeline *pipeline)
+emit_3dstate_te(struct anv_graphics_pipeline *pipeline)
{
- const struct intel_device_info *devinfo = &pipeline->base.device->info;
- const struct anv_shader_bin *gs_bin =
- pipeline->shaders[MESA_SHADER_GEOMETRY];
+ anv_pipeline_emit(pipeline, partial.te, GENX(3DSTATE_TE), te) {
+ if (anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_EVAL)) {
+ const struct brw_tes_prog_data *tes_prog_data =
+ get_tes_prog_data(pipeline);
+
+ te.Partitioning = tes_prog_data->partitioning;
+ te.TEDomain = tes_prog_data->domain;
+ te.TEEnable = true;
+ te.MaximumTessellationFactorOdd = 63.0;
+ te.MaximumTessellationFactorNotOdd = 64.0;
+#if GFX_VERx10 >= 125
+ const struct anv_device *device = pipeline->base.base.device;
+ if (intel_needs_workaround(device->info, 22012699309))
+ te.TessellationDistributionMode = TEDMODE_RR_STRICT;
+ else
+ te.TessellationDistributionMode = TEDMODE_RR_FREE;
+
+ if (intel_needs_workaround(device->info, 14015055625)) {
+ /* Wa_14015055625:
+ *
+ * Disable Tessellation Distribution when primitive Id is enabled.
+ */
+ if (sbe_primitive_id_override(pipeline) ||
+ geom_or_tess_prim_id_used(pipeline))
+ te.TessellationDistributionMode = TEDMODE_OFF;
+ }
+
+#if GFX_VER >= 20
+ te.TessellationDistributionLevel = TEDLEVEL_REGION;
+#else
+ te.TessellationDistributionLevel = TEDLEVEL_PATCH;
+#endif
+ /* 64_TRIANGLES */
+ te.SmallPatchThreshold = 3;
+ /* 1K_TRIANGLES */
+ te.TargetBlockSize = 8;
+ /* 1K_TRIANGLES */
+ te.LocalBOPAccumulatorThreshold = 1;
+#endif
+ }
+ }
+}
+static void
+emit_3dstate_gs(struct anv_graphics_pipeline *pipeline)
+{
if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_GEOMETRY)) {
- anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_GS), gs);
+ anv_pipeline_emit(pipeline, partial.gs, GENX(3DSTATE_GS), gs);
return;
}
+ const struct intel_device_info *devinfo = pipeline->base.base.device->info;
+ const struct anv_shader_bin *gs_bin =
+ pipeline->base.shaders[MESA_SHADER_GEOMETRY];
const struct brw_gs_prog_data *gs_prog_data = get_gs_prog_data(pipeline);
- anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_GS), gs) {
+ anv_pipeline_emit(pipeline, partial.gs, GENX(3DSTATE_GS), gs) {
gs.Enable = true;
gs.StatisticsEnable = true;
gs.KernelStartPointer = gs_bin->kernel.offset;
+#if GFX_VER < 20
gs.DispatchMode = gs_prog_data->base.dispatch_mode;
+#endif
gs.SingleProgramFlow = false;
gs.VectorMaskEnable = false;
@@ -1995,400 +1483,201 @@ emit_3dstate_gs(struct anv_graphics_pipeline *pipeline)
gs.IncludeVertexHandles = gs_prog_data->base.include_vue_handles;
gs.IncludePrimitiveID = gs_prog_data->include_primitive_id;
- if (GFX_VER == 8) {
- /* Broadwell is weird. It needs us to divide by 2. */
- gs.MaximumNumberofThreads = devinfo->max_gs_threads / 2 - 1;
- } else {
- gs.MaximumNumberofThreads = devinfo->max_gs_threads - 1;
- }
+ gs.MaximumNumberofThreads = devinfo->max_gs_threads - 1;
gs.OutputVertexSize = gs_prog_data->output_vertex_size_hwords * 2 - 1;
gs.OutputTopology = gs_prog_data->output_topology;
- gs.VertexURBEntryReadLength = gs_prog_data->base.urb_read_length;
gs.ControlDataFormat = gs_prog_data->control_data_format;
gs.ControlDataHeaderSize = gs_prog_data->control_data_header_size_hwords;
gs.InstanceControl = MAX2(gs_prog_data->invocations, 1) - 1;
- gs.ReorderMode = TRAILING;
-#if GFX_VER >= 8
gs.ExpectedVertexCount = gs_prog_data->vertices_in;
gs.StaticOutput = gs_prog_data->static_vertex_count >= 0;
gs.StaticOutputVertexCount = gs_prog_data->static_vertex_count >= 0 ?
- gs_prog_data->static_vertex_count : 0;
-#endif
+ gs_prog_data->static_vertex_count : 0;
gs.VertexURBEntryReadOffset = 0;
gs.VertexURBEntryReadLength = gs_prog_data->base.urb_read_length;
gs.DispatchGRFStartRegisterForURBData =
gs_prog_data->base.base.dispatch_grf_start_reg;
-#if GFX_VER >= 8
gs.UserClipDistanceClipTestEnableBitmask =
gs_prog_data->base.clip_distance_mask;
gs.UserClipDistanceCullTestEnableBitmask =
gs_prog_data->base.cull_distance_mask;
-#endif
#if GFX_VERx10 >= 125
- gs.ScratchSpaceBuffer = get_scratch_surf(&pipeline->base, gs_bin);
+ gs.ScratchSpaceBuffer =
+ get_scratch_surf(&pipeline->base.base, MESA_SHADER_GEOMETRY, gs_bin);
#else
gs.PerThreadScratchSpace = get_scratch_space(gs_bin);
gs.ScratchSpaceBasePointer =
- get_scratch_address(&pipeline->base, MESA_SHADER_GEOMETRY, gs_bin);
+ get_scratch_address(&pipeline->base.base, MESA_SHADER_GEOMETRY, gs_bin);
#endif
}
}
-static bool
-has_color_buffer_write_enabled(const struct anv_graphics_pipeline *pipeline,
- const VkPipelineColorBlendStateCreateInfo *blend)
-{
- const struct anv_shader_bin *shader_bin =
- pipeline->shaders[MESA_SHADER_FRAGMENT];
- if (!shader_bin)
- return false;
-
- if (!pipeline->dynamic_state.color_writes)
- return false;
-
- const struct anv_pipeline_bind_map *bind_map = &shader_bin->bind_map;
- for (int i = 0; i < bind_map->surface_count; i++) {
- struct anv_pipeline_binding *binding = &bind_map->surface_to_descriptor[i];
-
- if (binding->set != ANV_DESCRIPTOR_SET_COLOR_ATTACHMENTS)
- continue;
-
- if (binding->index == UINT32_MAX)
- continue;
-
- if (blend && blend->pAttachments[binding->index].colorWriteMask != 0)
- return true;
- }
-
- return false;
-}
-
static void
-emit_3dstate_wm(struct anv_graphics_pipeline *pipeline, struct anv_subpass *subpass,
- const VkPipelineInputAssemblyStateCreateInfo *ia,
- const VkPipelineRasterizationStateCreateInfo *raster,
- const VkPipelineColorBlendStateCreateInfo *blend,
- const VkPipelineMultisampleStateCreateInfo *multisample,
- const VkPipelineRasterizationLineStateCreateInfoEXT *line,
- const uint32_t dynamic_states)
+emit_3dstate_wm(struct anv_graphics_pipeline *pipeline,
+ const struct vk_input_assembly_state *ia,
+ const struct vk_rasterization_state *rs,
+ const struct vk_multisample_state *ms,
+ const struct vk_color_blend_state *cb,
+ const struct vk_render_pass_state *rp)
{
const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline);
- struct GENX(3DSTATE_WM) wm = {
- GENX(3DSTATE_WM_header),
- };
- wm.StatisticsEnable = true;
- wm.LineEndCapAntialiasingRegionWidth = _05pixels;
- wm.LineAntialiasingRegionWidth = _10pixels;
- wm.PointRasterizationRule = RASTRULE_UPPER_RIGHT;
+ anv_pipeline_emit(pipeline, partial.wm, GENX(3DSTATE_WM), wm) {
+ wm.StatisticsEnable = true;
+ wm.LineEndCapAntialiasingRegionWidth = _05pixels;
+ wm.LineAntialiasingRegionWidth = _10pixels;
+ wm.PointRasterizationRule = RASTRULE_UPPER_LEFT;
- if (anv_pipeline_has_stage(pipeline, MESA_SHADER_FRAGMENT)) {
- if (wm_prog_data->early_fragment_tests) {
+ if (anv_pipeline_has_stage(pipeline, MESA_SHADER_FRAGMENT)) {
+ if (wm_prog_data->early_fragment_tests) {
wm.EarlyDepthStencilControl = EDSC_PREPS;
- } else if (wm_prog_data->has_side_effects) {
- wm.EarlyDepthStencilControl = EDSC_PSEXEC;
- } else {
- wm.EarlyDepthStencilControl = EDSC_NORMAL;
- }
-
-#if GFX_VER >= 8
- /* Gen8 hardware tries to compute ThreadDispatchEnable for us but
- * doesn't take into account KillPixels when no depth or stencil
- * writes are enabled. In order for occlusion queries to work
- * correctly with no attachments, we need to force-enable PS thread
- * dispatch.
- *
-    * The BDW docs are pretty clear that this bit isn't validated
- * and probably shouldn't be used in production:
- *
- * "This must always be set to Normal. This field should not be
- * tested for functional validation."
- *
- * Unfortunately, however, the other mechanism we have for doing this
- * is 3DSTATE_PS_EXTRA::PixelShaderHasUAV which causes hangs on BDW.
- * Given two bad options, we choose the one which works.
- */
- pipeline->force_fragment_thread_dispatch =
- wm_prog_data->has_side_effects ||
- wm_prog_data->uses_kill;
-
- if (pipeline->force_fragment_thread_dispatch ||
- !has_color_buffer_write_enabled(pipeline, blend)) {
- /* Only set this value in non dynamic mode. */
- wm.ForceThreadDispatchEnable =
- !(dynamic_states & ANV_CMD_DIRTY_DYNAMIC_COLOR_BLEND_STATE) ? ForceON : 0;
- }
-#endif
-
- wm.BarycentricInterpolationMode =
- wm_prog_data->barycentric_interp_modes;
-
-#if GFX_VER < 8
- wm.PixelShaderComputedDepthMode = wm_prog_data->computed_depth_mode;
- wm.PixelShaderUsesSourceDepth = wm_prog_data->uses_src_depth;
- wm.PixelShaderUsesSourceW = wm_prog_data->uses_src_w;
- wm.PixelShaderUsesInputCoverageMask = wm_prog_data->uses_sample_mask;
-
- /* If the subpass has a depth or stencil self-dependency, then we
- * need to force the hardware to do the depth/stencil write *after*
- * fragment shader execution. Otherwise, the writes may hit memory
- * before we get around to fetching from the input attachment and we
- * may get the depth or stencil value from the current draw rather
- * than the previous one.
- */
- wm.PixelShaderKillsPixel = subpass->has_ds_self_dep ||
- wm_prog_data->uses_kill;
-
- pipeline->force_fragment_thread_dispatch =
- wm.PixelShaderComputedDepthMode != PSCDEPTH_OFF ||
- wm_prog_data->has_side_effects ||
- wm.PixelShaderKillsPixel;
-
- if (pipeline->force_fragment_thread_dispatch ||
- has_color_buffer_write_enabled(pipeline, blend)) {
- /* Only set this value in non dynamic mode. */
- wm.ThreadDispatchEnable = !(dynamic_states & ANV_CMD_DIRTY_DYNAMIC_COLOR_BLEND_STATE);
- }
-
- if (multisample && multisample->rasterizationSamples > 1) {
- if (wm_prog_data->persample_dispatch) {
- wm.MultisampleDispatchMode = MSDISPMODE_PERSAMPLE;
+ } else if (wm_prog_data->has_side_effects) {
+ wm.EarlyDepthStencilControl = EDSC_PSEXEC;
} else {
- wm.MultisampleDispatchMode = MSDISPMODE_PERPIXEL;
+ wm.EarlyDepthStencilControl = EDSC_NORMAL;
}
- } else {
- wm.MultisampleDispatchMode = MSDISPMODE_PERSAMPLE;
- }
-
- VkPolygonMode raster_mode =
- genX(raster_polygon_mode)(pipeline, ia->topology);
-
- wm.MultisampleRasterizationMode =
- dynamic_states & ANV_CMD_DIRTY_DYNAMIC_PRIMITIVE_TOPOLOGY ? 0 :
- genX(ms_rasterization_mode)(pipeline, raster_mode);
-#endif
-
- wm.LineStippleEnable = line && line->stippledLineEnable;
- }
-
- uint32_t dynamic_wm_states = ANV_CMD_DIRTY_DYNAMIC_COLOR_BLEND_STATE;
-#if GFX_VER < 8
- dynamic_wm_states |= ANV_CMD_DIRTY_DYNAMIC_PRIMITIVE_TOPOLOGY;
-#endif
-
- if (dynamic_states & dynamic_wm_states) {
- const struct intel_device_info *devinfo = &pipeline->base.device->info;
- uint32_t *dws = devinfo->ver >= 8 ? pipeline->gfx8.wm : pipeline->gfx7.wm;
- GENX(3DSTATE_WM_pack)(NULL, dws, &wm);
- } else {
- anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_WM), _wm)
- _wm = wm;
+ /* Gen8 hardware tries to compute ThreadDispatchEnable for us but
+ * doesn't take into account KillPixels when no depth or stencil
+ * writes are enabled. In order for occlusion queries to work
+ * correctly with no attachments, we need to force-enable PS thread
+ * dispatch.
+ *
+       * The BDW docs are pretty clear that this bit isn't validated
+ * and probably shouldn't be used in production:
+ *
+ * "This must always be set to Normal. This field should not be
+ * tested for functional validation."
+ *
+ * Unfortunately, however, the other mechanism we have for doing this
+ * is 3DSTATE_PS_EXTRA::PixelShaderHasUAV which causes hangs on BDW.
+ * Given two bad options, we choose the one which works.
+ */
+ pipeline->force_fragment_thread_dispatch =
+ wm_prog_data->has_side_effects ||
+ wm_prog_data->uses_kill;
+ }
}
}
static void
emit_3dstate_ps(struct anv_graphics_pipeline *pipeline,
- const VkPipelineColorBlendStateCreateInfo *blend,
- const VkPipelineMultisampleStateCreateInfo *multisample)
+ const struct vk_multisample_state *ms,
+ const struct vk_color_blend_state *cb)
{
UNUSED const struct intel_device_info *devinfo =
- &pipeline->base.device->info;
+ pipeline->base.base.device->info;
const struct anv_shader_bin *fs_bin =
- pipeline->shaders[MESA_SHADER_FRAGMENT];
+ pipeline->base.shaders[MESA_SHADER_FRAGMENT];
if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_FRAGMENT)) {
- anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_PS), ps) {
-#if GFX_VER == 7
- /* Even if no fragments are ever dispatched, gfx7 hardware hangs if
- * we don't at least set the maximum number of threads.
- */
- ps.MaximumNumberofThreads = devinfo->max_wm_threads - 1;
-#endif
- }
+ anv_pipeline_emit(pipeline, partial.ps, GENX(3DSTATE_PS), ps);
return;
}
const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline);
-#if GFX_VER < 8
- /* The hardware wedges if you have this bit set but don't turn on any dual
- * source blend factors.
- */
- bool dual_src_blend = false;
- if (wm_prog_data->dual_src_blend && blend) {
- for (uint32_t i = 0; i < blend->attachmentCount; i++) {
- const VkPipelineColorBlendAttachmentState *bstate =
- &blend->pAttachments[i];
-
- if (bstate->blendEnable &&
- (is_dual_src_blend_factor(bstate->srcColorBlendFactor) ||
- is_dual_src_blend_factor(bstate->dstColorBlendFactor) ||
- is_dual_src_blend_factor(bstate->srcAlphaBlendFactor) ||
- is_dual_src_blend_factor(bstate->dstAlphaBlendFactor))) {
- dual_src_blend = true;
- break;
- }
- }
- }
-#endif
-
- anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_PS), ps) {
- ps._8PixelDispatchEnable = wm_prog_data->dispatch_8;
- ps._16PixelDispatchEnable = wm_prog_data->dispatch_16;
- ps._32PixelDispatchEnable = wm_prog_data->dispatch_32;
-
- /* From the Sky Lake PRM 3DSTATE_PS::32 Pixel Dispatch Enable:
- *
- * "When NUM_MULTISAMPLES = 16 or FORCE_SAMPLE_COUNT = 16, SIMD32
- * Dispatch must not be enabled for PER_PIXEL dispatch mode."
- *
- * Since 16x MSAA is first introduced on SKL, we don't need to apply
- * the workaround on any older hardware.
+ anv_pipeline_emit(pipeline, partial.ps, GENX(3DSTATE_PS), ps) {
+#if GFX_VER == 12
+ assert(wm_prog_data->dispatch_multi == 0 ||
+ (wm_prog_data->dispatch_multi == 16 && wm_prog_data->max_polygons == 2));
+ ps.DualSIMD8DispatchEnable = wm_prog_data->dispatch_multi;
+ /* XXX - No major improvement observed from enabling
+ * overlapping subspans, but it could be helpful
+ * in theory when the requirements listed on the
+ * BSpec page for 3DSTATE_PS_BODY are met.
*/
- if (GFX_VER >= 9 && !wm_prog_data->persample_dispatch &&
- multisample && multisample->rasterizationSamples == 16) {
- assert(ps._8PixelDispatchEnable || ps._16PixelDispatchEnable);
- ps._32PixelDispatchEnable = false;
- }
-
- ps.KernelStartPointer0 = fs_bin->kernel.offset +
- brw_wm_prog_data_prog_offset(wm_prog_data, ps, 0);
- ps.KernelStartPointer1 = fs_bin->kernel.offset +
- brw_wm_prog_data_prog_offset(wm_prog_data, ps, 1);
- ps.KernelStartPointer2 = fs_bin->kernel.offset +
- brw_wm_prog_data_prog_offset(wm_prog_data, ps, 2);
+ ps.OverlappingSubspansEnable = false;
+#endif
ps.SingleProgramFlow = false;
- ps.VectorMaskEnable = GFX_VER >= 8;
+ ps.VectorMaskEnable = wm_prog_data->uses_vmask;
/* Wa_1606682166 */
ps.SamplerCount = GFX_VER == 11 ? 0 : get_sampler_count(fs_bin);
ps.BindingTableEntryCount = fs_bin->bind_map.surface_count;
+#if GFX_VER < 20
ps.PushConstantEnable = wm_prog_data->base.nr_params > 0 ||
wm_prog_data->base.ubo_ranges[0].length;
- ps.PositionXYOffsetSelect = wm_prog_data->uses_pos_offset ?
- POSOFFSET_SAMPLE: POSOFFSET_NONE;
-#if GFX_VER < 8
- ps.AttributeEnable = wm_prog_data->num_varying_inputs > 0;
- ps.oMaskPresenttoRenderTarget = wm_prog_data->uses_omask;
- ps.DualSourceBlendEnable = dual_src_blend;
-#endif
-
-#if GFX_VERx10 == 75
- /* Haswell requires the sample mask to be set in this packet as well
- * as in 3DSTATE_SAMPLE_MASK; the values should match.
- */
- ps.SampleMask = 0xff;
#endif
-#if GFX_VER >= 9
- ps.MaximumNumberofThreadsPerPSD = 64 - 1;
-#elif GFX_VER >= 8
- ps.MaximumNumberofThreadsPerPSD = 64 - 2;
-#else
- ps.MaximumNumberofThreads = devinfo->max_wm_threads - 1;
-#endif
-
- ps.DispatchGRFStartRegisterForConstantSetupData0 =
- brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, ps, 0);
- ps.DispatchGRFStartRegisterForConstantSetupData1 =
- brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, ps, 1);
- ps.DispatchGRFStartRegisterForConstantSetupData2 =
- brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, ps, 2);
+ ps.MaximumNumberofThreadsPerPSD = devinfo->max_threads_per_psd - 1;
#if GFX_VERx10 >= 125
- ps.ScratchSpaceBuffer = get_scratch_surf(&pipeline->base, fs_bin);
+ ps.ScratchSpaceBuffer =
+ get_scratch_surf(&pipeline->base.base, MESA_SHADER_FRAGMENT, fs_bin);
#else
ps.PerThreadScratchSpace = get_scratch_space(fs_bin);
ps.ScratchSpaceBasePointer =
- get_scratch_address(&pipeline->base, MESA_SHADER_FRAGMENT, fs_bin);
+ get_scratch_address(&pipeline->base.base, MESA_SHADER_FRAGMENT, fs_bin);
#endif
}
}
-#if GFX_VER >= 8
static void
emit_3dstate_ps_extra(struct anv_graphics_pipeline *pipeline,
- struct anv_subpass *subpass,
- const VkPipelineRasterizationStateCreateInfo *rs_info)
+ const struct vk_rasterization_state *rs,
+ const struct vk_graphics_pipeline_state *state)
{
const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline);
if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_FRAGMENT)) {
- anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_PS_EXTRA), ps);
+ anv_pipeline_emit(pipeline, partial.ps_extra, GENX(3DSTATE_PS_EXTRA), ps);
return;
}
- anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_PS_EXTRA), ps) {
+ anv_pipeline_emit(pipeline, partial.ps_extra, GENX(3DSTATE_PS_EXTRA), ps) {
ps.PixelShaderValid = true;
+#if GFX_VER < 20
ps.AttributeEnable = wm_prog_data->num_varying_inputs > 0;
+#endif
ps.oMaskPresenttoRenderTarget = wm_prog_data->uses_omask;
- ps.PixelShaderIsPerSample = wm_prog_data->persample_dispatch;
ps.PixelShaderComputedDepthMode = wm_prog_data->computed_depth_mode;
ps.PixelShaderUsesSourceDepth = wm_prog_data->uses_src_depth;
ps.PixelShaderUsesSourceW = wm_prog_data->uses_src_w;
- /* If the subpass has a depth or stencil self-dependency, then we need
- * to force the hardware to do the depth/stencil write *after* fragment
- * shader execution. Otherwise, the writes may hit memory before we get
- * around to fetching from the input attachment and we may get the depth
- * or stencil value from the current draw rather than the previous one.
- */
- ps.PixelShaderKillsPixel = subpass->has_ds_self_dep ||
- wm_prog_data->uses_kill;
-
-#if GFX_VER >= 9
ps.PixelShaderComputesStencil = wm_prog_data->computed_stencil;
+#if GFX_VER >= 20
+ assert(!wm_prog_data->pulls_bary);
+#else
ps.PixelShaderPullsBary = wm_prog_data->pulls_bary;
+#endif
ps.InputCoverageMaskState = ICMS_NONE;
assert(!wm_prog_data->inner_coverage); /* Not available in SPIR-V */
if (!wm_prog_data->uses_sample_mask)
ps.InputCoverageMaskState = ICMS_NONE;
- else if (wm_prog_data->per_coarse_pixel_dispatch)
+ else if (brw_wm_prog_data_is_coarse(wm_prog_data, 0))
ps.InputCoverageMaskState = ICMS_NORMAL;
else if (wm_prog_data->post_depth_coverage)
ps.InputCoverageMaskState = ICMS_DEPTH_COVERAGE;
else
ps.InputCoverageMaskState = ICMS_NORMAL;
-#else
- ps.PixelShaderUsesInputCoverageMask = wm_prog_data->uses_sample_mask;
-#endif
#if GFX_VER >= 11
ps.PixelShaderRequiresSourceDepthandorWPlaneCoefficients =
wm_prog_data->uses_depth_w_coefficients;
- ps.PixelShaderIsPerCoarsePixel = wm_prog_data->per_coarse_pixel_dispatch;
#endif
}
}
static void
-emit_3dstate_vf_topology(struct anv_graphics_pipeline *pipeline)
-{
- anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_VF_TOPOLOGY), vft) {
- vft.PrimitiveTopologyType = pipeline->topology;
- }
-}
-#endif
-
-static void
emit_3dstate_vf_statistics(struct anv_graphics_pipeline *pipeline)
{
- anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_VF_STATISTICS), vfs) {
+ anv_pipeline_emit(pipeline, final.vf_statistics,
+ GENX(3DSTATE_VF_STATISTICS), vfs) {
vfs.StatisticsEnable = true;
}
}
static void
compute_kill_pixel(struct anv_graphics_pipeline *pipeline,
- const VkPipelineMultisampleStateCreateInfo *ms_info,
- const struct anv_subpass *subpass)
+ const struct vk_multisample_state *ms,
+ const struct vk_graphics_pipeline_state *state)
{
if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_FRAGMENT)) {
pipeline->kill_pixel = false;
@@ -2411,31 +1700,47 @@ compute_kill_pixel(struct anv_graphics_pipeline *pipeline,
* 3DSTATE_PS_BLEND::AlphaTestEnable since Vulkan doesn't have a concept
* of an alpha test.
*/
+ pipeline->rp_has_ds_self_dep =
+ (state->pipeline_flags &
+ VK_PIPELINE_CREATE_2_DEPTH_STENCIL_ATTACHMENT_FEEDBACK_LOOP_BIT_EXT) != 0;
pipeline->kill_pixel =
- subpass->has_ds_self_dep || wm_prog_data->uses_kill ||
+ pipeline->rp_has_ds_self_dep ||
+ wm_prog_data->uses_kill ||
wm_prog_data->uses_omask ||
- (ms_info && ms_info->alphaToCoverageEnable);
+ (ms && ms->alpha_to_coverage_enable);
}
-#if GFX_VER == 12
+#if GFX_VER >= 12
static void
-emit_3dstate_primitive_replication(struct anv_graphics_pipeline *pipeline)
+emit_3dstate_primitive_replication(struct anv_graphics_pipeline *pipeline,
+ const struct vk_render_pass_state *rp)
{
- if (!pipeline->use_primitive_replication) {
- anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_PRIMITIVE_REPLICATION), pr);
+ if (anv_pipeline_is_mesh(pipeline)) {
+ anv_pipeline_emit(pipeline, final.primitive_replication,
+ GENX(3DSTATE_PRIMITIVE_REPLICATION), pr);
return;
}
- uint32_t view_mask = pipeline->subpass->view_mask;
- int view_count = util_bitcount(view_mask);
- assert(view_count > 1 && view_count <= MAX_VIEWS_FOR_PRIMITIVE_REPLICATION);
+ const int replication_count =
+ anv_pipeline_get_last_vue_prog_data(pipeline)->vue_map.num_pos_slots;
- anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_PRIMITIVE_REPLICATION), pr) {
- pr.ReplicaMask = (1 << view_count) - 1;
- pr.ReplicationCount = view_count - 1;
+ assert(replication_count >= 1);
+ if (replication_count == 1) {
+ anv_pipeline_emit(pipeline, final.primitive_replication,
+ GENX(3DSTATE_PRIMITIVE_REPLICATION), pr);
+ return;
+ }
+
+ assert(replication_count == util_bitcount(rp->view_mask));
+ assert(replication_count <= MAX_VIEWS_FOR_PRIMITIVE_REPLICATION);
+
+ anv_pipeline_emit(pipeline, final.primitive_replication,
+ GENX(3DSTATE_PRIMITIVE_REPLICATION), pr) {
+ pr.ReplicaMask = (1 << replication_count) - 1;
+ pr.ReplicationCount = replication_count - 1;
int i = 0;
- u_foreach_bit(view_index, view_mask) {
+ u_foreach_bit(view_index, rp->view_mask) {
pr.RTAIOffset[i] = view_index;
i++;
}
@@ -2443,174 +1748,293 @@ emit_3dstate_primitive_replication(struct anv_graphics_pipeline *pipeline)
}
#endif
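Editor's note (not part of the patch): the 3DSTATE_PRIMITIVE_REPLICATION fields above follow mechanically from the render-pass view mask; for example view_mask 0xb yields ReplicaMask 0x7, ReplicationCount 2 and RTAIOffset { 0, 1, 3 }. A minimal standalone sketch of that derivation, using plain C in place of anv's util_bitcount()/u_foreach_bit() helpers:

#include <stdint.h>

struct prim_replication_fields {
   uint32_t replica_mask;        /* (1 << bitcount) - 1                */
   uint32_t replication_count;   /* bitcount - 1                       */
   uint32_t rtai_offset[16];     /* render-target-array index per view */
};

static struct prim_replication_fields
derive_prim_replication(uint32_t view_mask)
{
   struct prim_replication_fields f = {0};
   unsigned count = __builtin_popcount(view_mask);   /* stands in for util_bitcount() */
   f.replica_mask = (1u << count) - 1;
   f.replication_count = count - 1;
   unsigned i = 0;
   for (unsigned v = 0; v < 32; v++) {               /* stands in for u_foreach_bit() */
      if (view_mask & (1u << v))
         f.rtai_offset[i++] = v;
   }
   return f;
}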
-static VkResult
-genX(graphics_pipeline_create)(
- VkDevice _device,
- struct anv_pipeline_cache * cache,
- const VkGraphicsPipelineCreateInfo* pCreateInfo,
- const VkAllocationCallbacks* pAllocator,
- VkPipeline* pPipeline)
+#if GFX_VERx10 >= 125
+static void
+emit_task_state(struct anv_graphics_pipeline *pipeline)
{
- ANV_FROM_HANDLE(anv_device, device, _device);
- ANV_FROM_HANDLE(anv_render_pass, pass, pCreateInfo->renderPass);
- struct anv_subpass *subpass = &pass->subpasses[pCreateInfo->subpass];
- struct anv_graphics_pipeline *pipeline;
- VkResult result;
-
- assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO);
-
- /* Use the default pipeline cache if none is specified */
- if (cache == NULL && device->physical->instance->pipeline_cache_enabled)
- cache = &device->default_pipeline_cache;
-
- pipeline = vk_zalloc2(&device->vk.alloc, pAllocator, sizeof(*pipeline), 8,
- VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
- if (pipeline == NULL)
- return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
-
- result = anv_graphics_pipeline_init(pipeline, device, cache,
- pCreateInfo, pAllocator);
- if (result != VK_SUCCESS) {
- vk_free2(&device->vk.alloc, pAllocator, pipeline);
- if (result == VK_PIPELINE_COMPILE_REQUIRED_EXT)
- *pPipeline = VK_NULL_HANDLE;
- return result;
+ assert(anv_pipeline_is_mesh(pipeline));
+
+ if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_TASK)) {
+ anv_pipeline_emit(pipeline, final.task_control,
+ GENX(3DSTATE_TASK_CONTROL), zero);
+ anv_pipeline_emit(pipeline, final.task_shader,
+ GENX(3DSTATE_TASK_SHADER), zero);
+ anv_pipeline_emit(pipeline, final.task_redistrib,
+ GENX(3DSTATE_TASK_REDISTRIB), zero);
+ return;
+ }
+
+ const struct anv_shader_bin *task_bin =
+ pipeline->base.shaders[MESA_SHADER_TASK];
+
+ anv_pipeline_emit(pipeline, final.task_control,
+ GENX(3DSTATE_TASK_CONTROL), tc) {
+ tc.TaskShaderEnable = true;
+ tc.ScratchSpaceBuffer =
+ get_scratch_surf(&pipeline->base.base, MESA_SHADER_TASK, task_bin);
+ tc.MaximumNumberofThreadGroups = 511;
+ }
+
+ const struct intel_device_info *devinfo = pipeline->base.base.device->info;
+ const struct brw_task_prog_data *task_prog_data = get_task_prog_data(pipeline);
+ const struct intel_cs_dispatch_info task_dispatch =
+ brw_cs_get_dispatch_info(devinfo, &task_prog_data->base, NULL);
+
+ anv_pipeline_emit(pipeline, final.task_shader,
+ GENX(3DSTATE_TASK_SHADER), task) {
+ task.KernelStartPointer = task_bin->kernel.offset;
+ task.SIMDSize = task_dispatch.simd_size / 16;
+ task.MessageSIMD = task.SIMDSize;
+ task.NumberofThreadsinGPGPUThreadGroup = task_dispatch.threads;
+ task.ExecutionMask = task_dispatch.right_mask;
+ task.LocalXMaximum = task_dispatch.group_size - 1;
+ task.EmitLocalIDX = true;
+
+ task.NumberofBarriers = task_prog_data->base.uses_barrier;
+ task.SharedLocalMemorySize =
+ encode_slm_size(GFX_VER, task_prog_data->base.base.total_shared);
+ task.PreferredSLMAllocationSize =
+ preferred_slm_allocation_size(devinfo);
+
+ /*
+ * 3DSTATE_TASK_SHADER_DATA.InlineData[0:1] will be used for the address
+ * of a buffer holding the push constants and the descriptor set table,
+ * and InlineData[2:7] will be used for the first few push constants.
+ */
+ task.EmitInlineParameter = true;
+
+ task.XP0Required = task_prog_data->uses_drawid;
}
- /* Information on which states are considered dynamic. */
- const VkPipelineDynamicStateCreateInfo *dyn_info =
- pCreateInfo->pDynamicState;
- uint32_t dynamic_states = 0;
- if (dyn_info) {
- for (unsigned i = 0; i < dyn_info->dynamicStateCount; i++)
- dynamic_states |=
- anv_cmd_dirty_bit_for_vk_dynamic_state(dyn_info->pDynamicStates[i]);
+ /* Recommended values from "Task and Mesh Distribution Programming". */
+ anv_pipeline_emit(pipeline, final.task_redistrib,
+ GENX(3DSTATE_TASK_REDISTRIB), redistrib) {
+ redistrib.LocalBOTAccumulatorThreshold = MULTIPLIER_1;
+ redistrib.SmallTaskThreshold = 1; /* 2^N */
+ redistrib.TargetMeshBatchSize = devinfo->num_slices > 2 ? 3 : 5; /* 2^N */
+ redistrib.TaskRedistributionLevel = TASKREDISTRIB_BOM;
+ redistrib.TaskRedistributionMode = TASKREDISTRIB_RR_STRICT;
}
+}
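Editor's note (not part of the patch): one way to picture the 8-dword inline parameter block described in the comment inside emit_task_state() above; the names below are illustrative only, not anv's.

#include <stdint.h>

union task_inline_data_example {
   uint32_t dw[8];
   struct {
      uint64_t push_and_descriptor_addr;  /* InlineData[0:1]: buffer address       */
      uint32_t leading_push_consts[6];    /* InlineData[2:7]: first push constants */
   } fields;
};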
+static void
+emit_mesh_state(struct anv_graphics_pipeline *pipeline)
+{
+ assert(anv_pipeline_is_mesh(pipeline));
- /* If rasterization is not enabled, various CreateInfo structs must be
- * ignored.
- */
- const bool raster_enabled =
- !pCreateInfo->pRasterizationState->rasterizerDiscardEnable ||
- (dynamic_states & ANV_CMD_DIRTY_DYNAMIC_RASTERIZER_DISCARD_ENABLE);
+ const struct anv_shader_bin *mesh_bin = pipeline->base.shaders[MESA_SHADER_MESH];
- const VkPipelineViewportStateCreateInfo *vp_info =
- raster_enabled ? pCreateInfo->pViewportState : NULL;
+ anv_pipeline_emit(pipeline, final.mesh_control,
+ GENX(3DSTATE_MESH_CONTROL), mc) {
+ mc.MeshShaderEnable = true;
+ mc.ScratchSpaceBuffer =
+ get_scratch_surf(&pipeline->base.base, MESA_SHADER_MESH, mesh_bin);
+ mc.MaximumNumberofThreadGroups = 511;
+ }
- const VkPipelineMultisampleStateCreateInfo *ms_info =
- raster_enabled ? pCreateInfo->pMultisampleState : NULL;
+ const struct intel_device_info *devinfo = pipeline->base.base.device->info;
+ const struct brw_mesh_prog_data *mesh_prog_data = get_mesh_prog_data(pipeline);
+ const struct intel_cs_dispatch_info mesh_dispatch =
+ brw_cs_get_dispatch_info(devinfo, &mesh_prog_data->base, NULL);
- const VkPipelineDepthStencilStateCreateInfo *ds_info =
- raster_enabled ? pCreateInfo->pDepthStencilState : NULL;
+ const unsigned output_topology =
+ mesh_prog_data->primitive_type == MESA_PRIM_POINTS ? OUTPUT_POINT :
+ mesh_prog_data->primitive_type == MESA_PRIM_LINES ? OUTPUT_LINE :
+ OUTPUT_TRI;
- const VkPipelineColorBlendStateCreateInfo *cb_info =
- raster_enabled ? pCreateInfo->pColorBlendState : NULL;
+ uint32_t index_format;
+ switch (mesh_prog_data->index_format) {
+ case BRW_INDEX_FORMAT_U32:
+ index_format = INDEX_U32;
+ break;
+ case BRW_INDEX_FORMAT_U888X:
+ index_format = INDEX_U888X;
+ break;
+ default:
+ unreachable("invalid index format");
+ }
+
+ anv_pipeline_emit(pipeline, final.mesh_shader,
+ GENX(3DSTATE_MESH_SHADER), mesh) {
+ mesh.KernelStartPointer = mesh_bin->kernel.offset;
+ mesh.SIMDSize = mesh_dispatch.simd_size / 16;
+ mesh.MessageSIMD = mesh.SIMDSize;
+ mesh.NumberofThreadsinGPGPUThreadGroup = mesh_dispatch.threads;
+ mesh.ExecutionMask = mesh_dispatch.right_mask;
+ mesh.LocalXMaximum = mesh_dispatch.group_size - 1;
+ mesh.EmitLocalIDX = true;
+
+ mesh.MaximumPrimitiveCount = MAX2(mesh_prog_data->map.max_primitives, 1) - 1;
+ mesh.OutputTopology = output_topology;
+ mesh.PerVertexDataPitch = mesh_prog_data->map.per_vertex_pitch_dw / 8;
+ mesh.PerPrimitiveDataPresent = mesh_prog_data->map.per_primitive_pitch_dw > 0;
+ mesh.PerPrimitiveDataPitch = mesh_prog_data->map.per_primitive_pitch_dw / 8;
+ mesh.IndexFormat = index_format;
+
+ mesh.NumberofBarriers = mesh_prog_data->base.uses_barrier;
+ mesh.SharedLocalMemorySize =
+ encode_slm_size(GFX_VER, mesh_prog_data->base.base.total_shared);
+ mesh.PreferredSLMAllocationSize =
+ preferred_slm_allocation_size(devinfo);
+
+ /*
+ * 3DSTATE_MESH_SHADER_DATA.InlineData[0:1] will be used for the address
+ * of a buffer holding the push constants and the descriptor set table,
+ * and InlineData[2:7] will be used for the first few push constants.
+ */
+ mesh.EmitInlineParameter = true;
- const VkPipelineRasterizationLineStateCreateInfoEXT *line_info =
- vk_find_struct_const(pCreateInfo->pRasterizationState->pNext,
- PIPELINE_RASTERIZATION_LINE_STATE_CREATE_INFO_EXT);
+ mesh.XP0Required = mesh_prog_data->uses_drawid;
+ }
+ /* Recommended values from "Task and Mesh Distribution Programming". */
+ anv_pipeline_emit(pipeline, final.mesh_distrib,
+ GENX(3DSTATE_MESH_DISTRIB), distrib) {
+ distrib.DistributionMode = MESH_RR_FREE;
+ distrib.TaskDistributionBatchSize = devinfo->num_slices > 2 ? 4 : 9; /* 2^N thread groups */
+ distrib.MeshDistributionBatchSize = devinfo->num_slices > 2 ? 3 : 3; /* 2^N thread groups */
+ }
+}
+#endif
+
+void
+genX(graphics_pipeline_emit)(struct anv_graphics_pipeline *pipeline,
+ const struct vk_graphics_pipeline_state *state)
+{
enum intel_urb_deref_block_size urb_deref_block_size;
emit_urb_setup(pipeline, &urb_deref_block_size);
- assert(pCreateInfo->pVertexInputState);
- emit_vertex_input(pipeline, pCreateInfo->pVertexInputState);
- assert(pCreateInfo->pRasterizationState);
- emit_rs_state(pipeline, pCreateInfo->pInputAssemblyState,
- pCreateInfo->pRasterizationState,
- ms_info, line_info, dynamic_states, pass, subpass,
- urb_deref_block_size);
- emit_ms_state(pipeline, ms_info, dynamic_states);
- emit_ds_state(pipeline, ds_info, dynamic_states, pass, subpass);
- emit_cb_state(pipeline, cb_info, ms_info, dynamic_states);
- compute_kill_pixel(pipeline, ms_info, subpass);
-
- emit_3dstate_clip(pipeline,
- pCreateInfo->pInputAssemblyState,
- vp_info,
- pCreateInfo->pRasterizationState,
- dynamic_states);
- emit_3dstate_streamout(pipeline, pCreateInfo->pRasterizationState,
- dynamic_states);
+ emit_rs_state(pipeline, state->ia, state->rs, state->ms, state->rp,
+ urb_deref_block_size);
+ emit_ms_state(pipeline, state->ms);
+ compute_kill_pixel(pipeline, state->ms, state);
-#if GFX_VER == 12
- emit_3dstate_primitive_replication(pipeline);
+ emit_3dstate_clip(pipeline, state->ia, state->vp, state->rs);
+
+#if GFX_VER >= 12
+ emit_3dstate_primitive_replication(pipeline, state->rp);
#endif
-#if 0
- /* From gfx7_vs_state.c */
+#if GFX_VERx10 >= 125
+ bool needs_instance_granularity =
+ intel_needs_workaround(pipeline->base.base.device->info, 14019166699) &&
+ (sbe_primitive_id_override(pipeline) ||
+ geom_or_tess_prim_id_used(pipeline));
+
+ anv_pipeline_emit(pipeline, partial.vfg, GENX(3DSTATE_VFG), vfg) {
+ /* If 3DSTATE_TE: TE Enable == 1 then RR_STRICT else RR_FREE */
+ vfg.DistributionMode =
+ anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_EVAL) ? RR_STRICT :
+ RR_FREE;
+ vfg.DistributionGranularity = needs_instance_granularity ?
+ InstanceLevelGranularity : BatchLevelGranularity;
+#if INTEL_WA_14014851047_GFX_VER
+ vfg.GranularityThresholdDisable =
+ intel_needs_workaround(pipeline->base.base.device->info, 14014851047);
+#endif
+ /* 192 vertices for TRILIST_ADJ */
+ vfg.ListNBatchSizeScale = 0;
+ /* Batch size of 384 vertices */
+ vfg.List3BatchSizeScale = 2;
+ /* Batch size of 128 vertices */
+ vfg.List2BatchSizeScale = 1;
+ /* Batch size of 128 vertices */
+ vfg.List1BatchSizeScale = 2;
+ /* Batch size of 256 vertices for STRIP topologies */
+ vfg.StripBatchSizeScale = 3;
+ /* 192 control points for PATCHLIST_3 */
+ vfg.PatchBatchSizeScale = 1;
+ /* 192 control points for PATCHLIST_3 */
+ vfg.PatchBatchSizeMultiplier = 31;
+ }
+#endif
- /**
- * From Graphics BSpec: 3D-Media-GPGPU Engine > 3D Pipeline Stages >
- * Geometry > Geometry Shader > State:
- *
- * "Note: Because of corruption in IVB:GT2, software needs to flush the
- * whole fixed function pipeline when the GS enable changes value in
- * the 3DSTATE_GS."
- *
- * The hardware architects have clarified that in this context "flush the
- * whole fixed function pipeline" means to emit a PIPE_CONTROL with the "CS
- * Stall" bit set.
- */
- if (!device->info.is_haswell && !device->info.is_baytrail)
- gfx7_emit_vs_workaround_flush(brw);
+ emit_3dstate_vf_statistics(pipeline);
+
+ if (anv_pipeline_is_primitive(pipeline)) {
+ emit_vertex_input(pipeline, state, state->vi);
+
+ emit_3dstate_vs(pipeline);
+ emit_3dstate_hs_ds(pipeline, state->ts);
+ emit_3dstate_te(pipeline);
+ emit_3dstate_gs(pipeline);
+
+ emit_3dstate_streamout(pipeline, state->rs);
+
+#if GFX_VERx10 >= 125
+ const struct anv_device *device = pipeline->base.base.device;
+ /* Disable Mesh. */
+ if (device->vk.enabled_extensions.EXT_mesh_shader) {
+ anv_pipeline_emit(pipeline, final.mesh_control,
+ GENX(3DSTATE_MESH_CONTROL), zero);
+ anv_pipeline_emit(pipeline, final.mesh_shader,
+ GENX(3DSTATE_MESH_SHADER), zero);
+ anv_pipeline_emit(pipeline, final.mesh_distrib,
+ GENX(3DSTATE_MESH_DISTRIB), zero);
+ anv_pipeline_emit(pipeline, final.clip_mesh,
+ GENX(3DSTATE_CLIP_MESH), zero);
+ anv_pipeline_emit(pipeline, final.sbe_mesh,
+ GENX(3DSTATE_SBE_MESH), zero);
+ anv_pipeline_emit(pipeline, final.task_control,
+ GENX(3DSTATE_TASK_CONTROL), zero);
+ anv_pipeline_emit(pipeline, final.task_shader,
+ GENX(3DSTATE_TASK_SHADER), zero);
+ anv_pipeline_emit(pipeline, final.task_redistrib,
+ GENX(3DSTATE_TASK_REDISTRIB), zero);
+ }
#endif
+ } else {
+ assert(anv_pipeline_is_mesh(pipeline));
- emit_3dstate_vs(pipeline);
- emit_3dstate_hs_te_ds(pipeline, pCreateInfo->pTessellationState);
- emit_3dstate_gs(pipeline);
- emit_3dstate_sbe(pipeline);
- emit_3dstate_wm(pipeline, subpass,
- pCreateInfo->pInputAssemblyState,
- pCreateInfo->pRasterizationState,
- cb_info, ms_info, line_info, dynamic_states);
- emit_3dstate_ps(pipeline, cb_info, ms_info);
-#if GFX_VER >= 8
- emit_3dstate_ps_extra(pipeline, subpass,
- pCreateInfo->pRasterizationState);
-
- if (!(dynamic_states & ANV_CMD_DIRTY_DYNAMIC_PRIMITIVE_TOPOLOGY))
- emit_3dstate_vf_topology(pipeline);
+ anv_pipeline_emit(pipeline, final.vf_sgvs, GENX(3DSTATE_VF_SGVS), sgvs);
+#if GFX_VER >= 11
+ anv_pipeline_emit(pipeline, final.vf_sgvs_2, GENX(3DSTATE_VF_SGVS_2), sgvs);
#endif
- emit_3dstate_vf_statistics(pipeline);
+ anv_pipeline_emit(pipeline, final.vs, GENX(3DSTATE_VS), vs);
+ anv_pipeline_emit(pipeline, final.hs, GENX(3DSTATE_HS), hs);
+ anv_pipeline_emit(pipeline, final.ds, GENX(3DSTATE_DS), ds);
+ anv_pipeline_emit(pipeline, partial.te, GENX(3DSTATE_TE), te);
+ anv_pipeline_emit(pipeline, partial.gs, GENX(3DSTATE_GS), gs);
- *pPipeline = anv_pipeline_to_handle(&pipeline->base);
+ /* BSpec 46303 forbids both 3DSTATE_MESH_CONTROL.MeshShaderEnable
+ * and 3DSTATE_STREAMOUT.SOFunctionEnable to be 1.
+ */
+ anv_pipeline_emit(pipeline, partial.so, GENX(3DSTATE_STREAMOUT), so);
- return pipeline->base.batch.status;
+#if GFX_VERx10 >= 125
+ emit_task_state(pipeline);
+ emit_mesh_state(pipeline);
+#endif
+ }
+
+ emit_3dstate_sbe(pipeline);
+ emit_3dstate_wm(pipeline, state->ia, state->rs,
+ state->ms, state->cb, state->rp);
+ emit_3dstate_ps(pipeline, state->ms, state->cb);
+ emit_3dstate_ps_extra(pipeline, state->rs, state);
}
#if GFX_VERx10 >= 125
-static void
-emit_compute_state(struct anv_compute_pipeline *pipeline,
- const struct anv_device *device)
+void
+genX(compute_pipeline_emit)(struct anv_compute_pipeline *pipeline)
{
const struct brw_cs_prog_data *cs_prog_data = get_cs_prog_data(pipeline);
anv_pipeline_setup_l3_config(&pipeline->base, cs_prog_data->base.total_shared > 0);
-
- const UNUSED struct anv_shader_bin *cs_bin = pipeline->cs;
- const struct intel_device_info *devinfo = &device->info;
-
- anv_batch_emit(&pipeline->base.batch, GENX(CFE_STATE), cfe) {
- cfe.MaximumNumberofThreads =
- devinfo->max_cs_threads * devinfo->subslice_total - 1;
- cfe.ScratchSpaceBuffer = get_scratch_surf(&pipeline->base, cs_bin);
- }
}
#else /* #if GFX_VERx10 >= 125 */
-static void
-emit_compute_state(struct anv_compute_pipeline *pipeline,
- const struct anv_device *device)
+void
+genX(compute_pipeline_emit)(struct anv_compute_pipeline *pipeline)
{
- const struct intel_device_info *devinfo = &device->info;
+ struct anv_device *device = pipeline->base.device;
+ const struct intel_device_info *devinfo = device->info;
const struct brw_cs_prog_data *cs_prog_data = get_cs_prog_data(pipeline);
anv_pipeline_setup_l3_config(&pipeline->base, cs_prog_data->base.total_shared > 0);
- const struct brw_cs_dispatch_info dispatch =
+ const struct intel_cs_dispatch_info dispatch =
brw_cs_get_dispatch_info(devinfo, cs_prog_data, NULL);
const uint32_t vfe_curbe_allocation =
ALIGN(cs_prog_data->push.per_thread.regs * dispatch.threads +
@@ -2619,43 +2043,22 @@ emit_compute_state(struct anv_compute_pipeline *pipeline,
const struct anv_shader_bin *cs_bin = pipeline->cs;
anv_batch_emit(&pipeline->base.batch, GENX(MEDIA_VFE_STATE), vfe) {
-#if GFX_VER > 7
vfe.StackSize = 0;
-#else
- vfe.GPGPUMode = true;
-#endif
vfe.MaximumNumberofThreads =
devinfo->max_cs_threads * devinfo->subslice_total - 1;
- vfe.NumberofURBEntries = GFX_VER <= 7 ? 0 : 2;
+ vfe.NumberofURBEntries = 2;
#if GFX_VER < 11
vfe.ResetGatewayTimer = true;
#endif
-#if GFX_VER <= 8
- vfe.BypassGatewayControl = true;
-#endif
- vfe.URBEntryAllocationSize = GFX_VER <= 7 ? 0 : 2;
+ vfe.URBEntryAllocationSize = 2;
vfe.CURBEAllocationSize = vfe_curbe_allocation;
if (cs_bin->prog_data->total_scratch) {
- if (GFX_VER >= 8) {
- /* Broadwell's Per Thread Scratch Space is in the range [0, 11]
- * where 0 = 1k, 1 = 2k, 2 = 4k, ..., 11 = 2M.
- */
- vfe.PerThreadScratchSpace =
- ffs(cs_bin->prog_data->total_scratch) - 11;
- } else if (GFX_VERx10 == 75) {
- /* Haswell's Per Thread Scratch Space is in the range [0, 10]
- * where 0 = 2k, 1 = 4k, 2 = 8k, ..., 10 = 2M.
- */
- vfe.PerThreadScratchSpace =
- ffs(cs_bin->prog_data->total_scratch) - 12;
- } else {
- /* IVB and BYT use the range [0, 11] to mean [1kB, 12kB]
- * where 0 = 1kB, 1 = 2kB, 2 = 3kB, ..., 11 = 12kB.
- */
- vfe.PerThreadScratchSpace =
- cs_bin->prog_data->total_scratch / 1024 - 1;
- }
+ /* Broadwell's Per Thread Scratch Space is in the range [0, 11]
+ * where 0 = 1k, 1 = 2k, 2 = 4k, ..., 11 = 2M.
+ */
+ vfe.PerThreadScratchSpace =
+ ffs(cs_bin->prog_data->total_scratch) - 11;
vfe.ScratchSpaceBasePointer =
get_scratch_address(&pipeline->base, MESA_SHADER_COMPUTE, cs_bin);
}
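Editor's note (not part of the patch): a quick check of the scratch-space encoding used above, assuming total_scratch has already been rounded up to a power of two of at least 1 KB; e.g. 64 KB gives ffs(0x10000) - 11 = 6, and 1 KB << 6 == 64 KB. The helper below is illustrative, not anv's.

#include <assert.h>
#include <strings.h>   /* ffs() */

static int
encode_per_thread_scratch_example(unsigned total_scratch)
{
   /* Field range [0, 11] maps to 1 KB, 2 KB, ..., 2 MB. */
   assert(total_scratch >= 1024 &&
          (total_scratch & (total_scratch - 1)) == 0);
   int encoded = ffs(total_scratch) - 11;
   assert(encoded >= 0 && encoded <= 11);
   return encoded;
}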
@@ -2670,20 +2073,19 @@ emit_compute_state(struct anv_compute_pipeline *pipeline,
.SamplerCount = GFX_VER == 11 ? 0 : get_sampler_count(cs_bin),
/* We add 1 because the CS indirect parameters buffer isn't accounted
* for in bind_map.surface_count.
+ *
+ * Typically set to 0 to avoid prefetching on every thread dispatch.
*/
- .BindingTableEntryCount = 1 + MIN2(cs_bin->bind_map.surface_count, 30),
+ .BindingTableEntryCount = devinfo->verx10 == 125 ?
+ 0 : 1 + MIN2(pipeline->cs->bind_map.surface_count, 30),
.BarrierEnable = cs_prog_data->uses_barrier,
.SharedLocalMemorySize =
encode_slm_size(GFX_VER, cs_prog_data->base.total_shared),
-#if GFX_VERx10 != 75
.ConstantURBEntryReadOffset = 0,
-#endif
.ConstantURBEntryReadLength = cs_prog_data->push.per_thread.regs,
-#if GFX_VERx10 >= 75
.CrossThreadConstantDataReadLength =
cs_prog_data->push.cross_thread.regs,
-#endif
#if GFX_VER >= 12
/* TODO: Check if we are missing workarounds and enable mid-thread
* preemption.
@@ -2706,268 +2108,38 @@ emit_compute_state(struct anv_compute_pipeline *pipeline,
#endif /* #if GFX_VERx10 >= 125 */
-static VkResult
-compute_pipeline_create(
- VkDevice _device,
- struct anv_pipeline_cache * cache,
- const VkComputePipelineCreateInfo* pCreateInfo,
- const VkAllocationCallbacks* pAllocator,
- VkPipeline* pPipeline)
-{
- ANV_FROM_HANDLE(anv_device, device, _device);
- struct anv_compute_pipeline *pipeline;
- VkResult result;
-
- assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO);
-
- /* Use the default pipeline cache if none is specified */
- if (cache == NULL && device->physical->instance->pipeline_cache_enabled)
- cache = &device->default_pipeline_cache;
-
- pipeline = vk_zalloc2(&device->vk.alloc, pAllocator, sizeof(*pipeline), 8,
- VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
- if (pipeline == NULL)
- return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
-
- result = anv_pipeline_init(&pipeline->base, device,
- ANV_PIPELINE_COMPUTE, pCreateInfo->flags,
- pAllocator);
- if (result != VK_SUCCESS) {
- vk_free2(&device->vk.alloc, pAllocator, pipeline);
- return result;
- }
-
- anv_batch_set_storage(&pipeline->base.batch, ANV_NULL_ADDRESS,
- pipeline->batch_data, sizeof(pipeline->batch_data));
-
- assert(pCreateInfo->stage.stage == VK_SHADER_STAGE_COMPUTE_BIT);
- VK_FROM_HANDLE(vk_shader_module, module, pCreateInfo->stage.module);
- result = anv_pipeline_compile_cs(pipeline, cache, pCreateInfo, module,
- pCreateInfo->stage.pName,
- pCreateInfo->stage.pSpecializationInfo);
- if (result != VK_SUCCESS) {
- anv_pipeline_finish(&pipeline->base, device, pAllocator);
- vk_free2(&device->vk.alloc, pAllocator, pipeline);
- if (result == VK_PIPELINE_COMPILE_REQUIRED_EXT)
- *pPipeline = VK_NULL_HANDLE;
- return result;
- }
-
- emit_compute_state(pipeline, device);
-
- *pPipeline = anv_pipeline_to_handle(&pipeline->base);
-
- return pipeline->base.batch.status;
-}
-
-VkResult genX(CreateGraphicsPipelines)(
- VkDevice _device,
- VkPipelineCache pipelineCache,
- uint32_t count,
- const VkGraphicsPipelineCreateInfo* pCreateInfos,
- const VkAllocationCallbacks* pAllocator,
- VkPipeline* pPipelines)
-{
- ANV_FROM_HANDLE(anv_pipeline_cache, pipeline_cache, pipelineCache);
-
- VkResult result = VK_SUCCESS;
-
- unsigned i;
- for (i = 0; i < count; i++) {
- VkResult res = genX(graphics_pipeline_create)(_device,
- pipeline_cache,
- &pCreateInfos[i],
- pAllocator, &pPipelines[i]);
-
- if (res == VK_SUCCESS)
- continue;
-
- /* Bail out on the first error != VK_PIPELINE_COMPILE_REQUIRED_EX as it
- * is not obvious what error should be report upon 2 different failures.
- * */
- result = res;
- if (res != VK_PIPELINE_COMPILE_REQUIRED_EXT)
- break;
-
- if (pCreateInfos[i].flags & VK_PIPELINE_CREATE_EARLY_RETURN_ON_FAILURE_BIT_EXT)
- break;
- }
-
- for (; i < count; i++)
- pPipelines[i] = VK_NULL_HANDLE;
-
- return result;
-}
-
-VkResult genX(CreateComputePipelines)(
- VkDevice _device,
- VkPipelineCache pipelineCache,
- uint32_t count,
- const VkComputePipelineCreateInfo* pCreateInfos,
- const VkAllocationCallbacks* pAllocator,
- VkPipeline* pPipelines)
-{
- ANV_FROM_HANDLE(anv_pipeline_cache, pipeline_cache, pipelineCache);
-
- VkResult result = VK_SUCCESS;
-
- unsigned i;
- for (i = 0; i < count; i++) {
- VkResult res = compute_pipeline_create(_device, pipeline_cache,
- &pCreateInfos[i],
- pAllocator, &pPipelines[i]);
-
- if (res == VK_SUCCESS)
- continue;
-
- /* Bail out on the first error != VK_PIPELINE_COMPILE_REQUIRED_EX as it
- * is not obvious what error should be report upon 2 different failures.
- * */
- result = res;
- if (res != VK_PIPELINE_COMPILE_REQUIRED_EXT)
- break;
-
- if (pCreateInfos[i].flags & VK_PIPELINE_CREATE_EARLY_RETURN_ON_FAILURE_BIT_EXT)
- break;
- }
-
- for (; i < count; i++)
- pPipelines[i] = VK_NULL_HANDLE;
-
- return result;
-}
-
#if GFX_VERx10 >= 125
-static void
-assert_rt_stage_index_valid(const VkRayTracingPipelineCreateInfoKHR* pCreateInfo,
- uint32_t stage_idx,
- VkShaderStageFlags valid_stages)
-{
- if (stage_idx == VK_SHADER_UNUSED_KHR)
- return;
-
- assert(stage_idx <= pCreateInfo->stageCount);
- assert(util_bitcount(pCreateInfo->pStages[stage_idx].stage) == 1);
- assert(pCreateInfo->pStages[stage_idx].stage & valid_stages);
-}
-
-static VkResult
-ray_tracing_pipeline_create(
- VkDevice _device,
- struct anv_pipeline_cache * cache,
- const VkRayTracingPipelineCreateInfoKHR* pCreateInfo,
- const VkAllocationCallbacks* pAllocator,
- VkPipeline* pPipeline)
+void
+genX(ray_tracing_pipeline_emit)(struct anv_ray_tracing_pipeline *pipeline)
{
- ANV_FROM_HANDLE(anv_device, device, _device);
- VkResult result;
-
- assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_RAY_TRACING_PIPELINE_CREATE_INFO_KHR);
-
- /* Use the default pipeline cache if none is specified */
- if (cache == NULL && device->physical->instance->pipeline_cache_enabled)
- cache = &device->default_pipeline_cache;
-
- VK_MULTIALLOC(ma);
- VK_MULTIALLOC_DECL(&ma, struct anv_ray_tracing_pipeline, pipeline, 1);
- VK_MULTIALLOC_DECL(&ma, struct anv_rt_shader_group, groups, pCreateInfo->groupCount);
- if (!vk_multialloc_zalloc2(&ma, &device->vk.alloc, pAllocator,
- VK_SYSTEM_ALLOCATION_SCOPE_DEVICE))
- return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
-
- result = anv_pipeline_init(&pipeline->base, device,
- ANV_PIPELINE_RAY_TRACING, pCreateInfo->flags,
- pAllocator);
- if (result != VK_SUCCESS) {
- vk_free2(&device->vk.alloc, pAllocator, pipeline);
- return result;
- }
-
- pipeline->group_count = pCreateInfo->groupCount;
- pipeline->groups = groups;
-
- ASSERTED const VkShaderStageFlags ray_tracing_stages =
- VK_SHADER_STAGE_RAYGEN_BIT_KHR |
- VK_SHADER_STAGE_ANY_HIT_BIT_KHR |
- VK_SHADER_STAGE_CLOSEST_HIT_BIT_KHR |
- VK_SHADER_STAGE_MISS_BIT_KHR |
- VK_SHADER_STAGE_INTERSECTION_BIT_KHR |
- VK_SHADER_STAGE_CALLABLE_BIT_KHR;
-
- for (uint32_t i = 0; i < pCreateInfo->stageCount; i++)
- assert((pCreateInfo->pStages[i].stage & ~ray_tracing_stages) == 0);
-
- for (uint32_t i = 0; i < pCreateInfo->groupCount; i++) {
- const VkRayTracingShaderGroupCreateInfoKHR *ginfo =
- &pCreateInfo->pGroups[i];
- assert_rt_stage_index_valid(pCreateInfo, ginfo->generalShader,
- VK_SHADER_STAGE_RAYGEN_BIT_KHR |
- VK_SHADER_STAGE_MISS_BIT_KHR |
- VK_SHADER_STAGE_CALLABLE_BIT_KHR);
- assert_rt_stage_index_valid(pCreateInfo, ginfo->closestHitShader,
- VK_SHADER_STAGE_CLOSEST_HIT_BIT_KHR);
- assert_rt_stage_index_valid(pCreateInfo, ginfo->anyHitShader,
- VK_SHADER_STAGE_ANY_HIT_BIT_KHR);
- assert_rt_stage_index_valid(pCreateInfo, ginfo->intersectionShader,
- VK_SHADER_STAGE_INTERSECTION_BIT_KHR);
- switch (ginfo->type) {
- case VK_RAY_TRACING_SHADER_GROUP_TYPE_GENERAL_KHR:
- assert(ginfo->generalShader < pCreateInfo->stageCount);
- assert(ginfo->anyHitShader == VK_SHADER_UNUSED_KHR);
- assert(ginfo->closestHitShader == VK_SHADER_UNUSED_KHR);
- assert(ginfo->intersectionShader == VK_SHADER_UNUSED_KHR);
- break;
-
- case VK_RAY_TRACING_SHADER_GROUP_TYPE_TRIANGLES_HIT_GROUP_KHR:
- assert(ginfo->generalShader == VK_SHADER_UNUSED_KHR);
- assert(ginfo->intersectionShader == VK_SHADER_UNUSED_KHR);
- break;
-
- case VK_RAY_TRACING_SHADER_GROUP_TYPE_PROCEDURAL_HIT_GROUP_KHR:
- assert(ginfo->generalShader == VK_SHADER_UNUSED_KHR);
- break;
-
- default:
- unreachable("Invalid ray-tracing shader group type");
- }
- }
-
- result = anv_ray_tracing_pipeline_init(pipeline, device, cache,
- pCreateInfo, pAllocator);
- if (result != VK_SUCCESS) {
- anv_pipeline_finish(&pipeline->base, device, pAllocator);
- vk_free2(&device->vk.alloc, pAllocator, pipeline);
- return result;
- }
-
for (uint32_t i = 0; i < pipeline->group_count; i++) {
struct anv_rt_shader_group *group = &pipeline->groups[i];
switch (group->type) {
case VK_RAY_TRACING_SHADER_GROUP_TYPE_GENERAL_KHR: {
- struct GFX_RT_GENERAL_SBT_HANDLE sh = {};
+ struct GENX(RT_GENERAL_SBT_HANDLE) sh = {};
sh.General = anv_shader_bin_get_bsr(group->general, 32);
- GFX_RT_GENERAL_SBT_HANDLE_pack(NULL, group->handle, &sh);
+ GENX(RT_GENERAL_SBT_HANDLE_pack)(NULL, group->handle, &sh);
break;
}
case VK_RAY_TRACING_SHADER_GROUP_TYPE_TRIANGLES_HIT_GROUP_KHR: {
- struct GFX_RT_TRIANGLES_SBT_HANDLE sh = {};
+ struct GENX(RT_TRIANGLES_SBT_HANDLE) sh = {};
if (group->closest_hit)
sh.ClosestHit = anv_shader_bin_get_bsr(group->closest_hit, 32);
if (group->any_hit)
sh.AnyHit = anv_shader_bin_get_bsr(group->any_hit, 24);
- GFX_RT_TRIANGLES_SBT_HANDLE_pack(NULL, group->handle, &sh);
+ GENX(RT_TRIANGLES_SBT_HANDLE_pack)(NULL, group->handle, &sh);
break;
}
case VK_RAY_TRACING_SHADER_GROUP_TYPE_PROCEDURAL_HIT_GROUP_KHR: {
- struct GFX_RT_PROCEDURAL_SBT_HANDLE sh = {};
+ struct GENX(RT_PROCEDURAL_SBT_HANDLE) sh = {};
if (group->closest_hit)
sh.ClosestHit = anv_shader_bin_get_bsr(group->closest_hit, 32);
sh.Intersection = anv_shader_bin_get_bsr(group->intersection, 24);
- GFX_RT_PROCEDURAL_SBT_HANDLE_pack(NULL, group->handle, &sh);
+ GENX(RT_PROCEDURAL_SBT_HANDLE_pack)(NULL, group->handle, &sh);
break;
}
@@ -2975,48 +2147,14 @@ ray_tracing_pipeline_create(
unreachable("Invalid shader group type");
}
}
-
- *pPipeline = anv_pipeline_to_handle(&pipeline->base);
-
- return pipeline->base.batch.status;
}
-VkResult
-genX(CreateRayTracingPipelinesKHR)(
- VkDevice _device,
- VkDeferredOperationKHR deferredOperation,
- VkPipelineCache pipelineCache,
- uint32_t createInfoCount,
- const VkRayTracingPipelineCreateInfoKHR* pCreateInfos,
- const VkAllocationCallbacks* pAllocator,
- VkPipeline* pPipelines)
-{
- ANV_FROM_HANDLE(anv_pipeline_cache, pipeline_cache, pipelineCache);
-
- VkResult result = VK_SUCCESS;
-
- unsigned i;
- for (i = 0; i < createInfoCount; i++) {
- VkResult res = ray_tracing_pipeline_create(_device, pipeline_cache,
- &pCreateInfos[i],
- pAllocator, &pPipelines[i]);
-
- if (res == VK_SUCCESS)
- continue;
-
- /* Bail out on the first error as it is not obvious what error should be
- * report upon 2 different failures. */
- result = res;
- if (result != VK_PIPELINE_COMPILE_REQUIRED_EXT)
- break;
-
- if (pCreateInfos[i].flags & VK_PIPELINE_CREATE_EARLY_RETURN_ON_FAILURE_BIT_EXT)
- break;
- }
-
- for (; i < createInfoCount; i++)
- pPipelines[i] = VK_NULL_HANDLE;
+#else
- return result;
+void
+genX(ray_tracing_pipeline_emit)(struct anv_ray_tracing_pipeline *pipeline)
+{
+ unreachable("Ray tracing not supported");
}
+
#endif /* GFX_VERx10 >= 125 */
diff --git a/src/intel/vulkan/genX_query.c b/src/intel/vulkan/genX_query.c
index 8978f5843a9..2cb492afcf9 100644
--- a/src/intel/vulkan/genX_query.c
+++ b/src/intel/vulkan/genX_query.c
@@ -29,15 +29,21 @@
#include "anv_private.h"
+#include "util/os_time.h"
+
#include "genxml/gen_macros.h"
#include "genxml/genX_pack.h"
+#include "ds/intel_tracepoints.h"
+
+#include "anv_internal_kernels.h"
+
/* We reserve:
* - GPR 14 for perf queries
* - GPR 15 for conditional rendering
*/
#define MI_BUILDER_NUM_ALLOC_GPRS 14
-#define MI_BUILDER_CAN_WRITE_BATCH GFX_VER >= 8
+#define MI_BUILDER_CAN_WRITE_BATCH true
#define __gen_get_batch_dwords anv_batch_emit_dwords
#define __gen_address_offset anv_address_add
#define __gen_get_batch_address(b, a) anv_batch_address(b, a)
@@ -57,6 +63,18 @@ anv_query_address(struct anv_query_pool *pool, uint32_t query)
};
}
+static void
+emit_query_mi_flush_availability(struct anv_cmd_buffer *cmd_buffer,
+ struct anv_address addr,
+ bool available)
+{
+ anv_batch_emit(&cmd_buffer->batch, GENX(MI_FLUSH_DW), flush) {
+ flush.PostSyncOperation = WriteImmediateData;
+ flush.Address = addr;
+ flush.ImmediateData = available;
+ }
+}
+
VkResult genX(CreateQueryPool)(
VkDevice _device,
const VkQueryPoolCreateInfo* pCreateInfo,
@@ -65,12 +83,10 @@ VkResult genX(CreateQueryPool)(
{
ANV_FROM_HANDLE(anv_device, device, _device);
const struct anv_physical_device *pdevice = device->physical;
-#if GFX_VER >= 8
const VkQueryPoolPerformanceCreateInfoKHR *perf_query_info = NULL;
struct intel_perf_counter_pass *counter_pass;
struct intel_perf_query_info **pass_query;
uint32_t n_passes = 0;
-#endif
uint32_t data_offset = 0;
VK_MULTIALLOC(ma);
VkResult result;
@@ -123,14 +139,13 @@ VkResult genX(CreateQueryPool)(
uint64s_per_slot = 2; /* availability + marker */
/* Align to the requirement of the layout */
- uint64s_per_slot = align_u32(uint64s_per_slot,
- DIV_ROUND_UP(layout->alignment, sizeof(uint64_t)));
+ uint64s_per_slot = align(uint64s_per_slot,
+ DIV_ROUND_UP(layout->alignment, sizeof(uint64_t)));
data_offset = uint64s_per_slot * sizeof(uint64_t);
/* Add the query data for begin & end commands */
uint64s_per_slot += 2 * DIV_ROUND_UP(layout->size, sizeof(uint64_t));
break;
}
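Editor's note (not part of the patch): a worked example of the slot sizing above, with made-up layout numbers (alignment = 64 B, snapshot size = 184 B) purely to make the arithmetic concrete.

#include <stdint.h>

#define DIV_ROUND_UP_EX(a, b)  (((a) + (b) - 1) / (b))
#define ALIGN_POT_EX(v, a)     (((v) + (a) - 1) & ~((uint64_t)(a) - 1))

static void
intel_query_slot_sizing_example(void)
{
   uint64_t uint64s_per_slot = 2;                       /* availability + marker */
   uint64s_per_slot = ALIGN_POT_EX(uint64s_per_slot,
                                   DIV_ROUND_UP_EX(64, sizeof(uint64_t)));  /* -> 8   */
   uint64_t data_offset = uint64s_per_slot * sizeof(uint64_t);              /* -> 64  */
   uint64s_per_slot += 2 * DIV_ROUND_UP_EX(184, sizeof(uint64_t));          /* -> 54  */
   uint64_t stride = uint64s_per_slot * sizeof(uint64_t);                   /* -> 432 */
   (void)data_offset;
   (void)stride;
}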
-#if GFX_VER >= 8
case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: {
const struct intel_perf_query_field_layout *layout =
&pdevice->perf->query_layout;
@@ -145,10 +160,10 @@ VkResult genX(CreateQueryPool)(
perf_query_info->counterIndexCount);
vk_multialloc_add(&ma, &pass_query, struct intel_perf_query_info *,
n_passes);
- uint64s_per_slot = 4 /* availability + small batch */;
+ uint64s_per_slot = 1 /* availability */;
/* Align to the requirement of the layout */
- uint64s_per_slot = align_u32(uint64s_per_slot,
- DIV_ROUND_UP(layout->alignment, sizeof(uint64_t)));
+ uint64s_per_slot = align(uint64s_per_slot,
+ DIV_ROUND_UP(layout->alignment, sizeof(uint64_t)));
data_offset = uint64s_per_slot * sizeof(uint64_t);
/* Add the query data for begin & end commands */
uint64s_per_slot += 2 * DIV_ROUND_UP(layout->size, sizeof(uint64_t));
@@ -156,26 +171,41 @@ VkResult genX(CreateQueryPool)(
uint64s_per_slot *= n_passes;
break;
}
+ case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT:
+ /* Query has two values: begin and end. */
+ uint64s_per_slot = 1 + 2;
+ break;
+#if GFX_VERx10 >= 125
+ case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_COMPACTED_SIZE_KHR:
+ case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SIZE_KHR:
+ uint64s_per_slot = 1 + 1 /* availability + size (PostbuildInfoCurrentSize, PostbuildInfoCompactedSize) */;
+ break;
+
+ case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_SIZE_KHR:
+ case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_BOTTOM_LEVEL_POINTERS_KHR:
+ uint64s_per_slot = 1 + 2 /* availability + size (PostbuildInfoSerializationDesc) */;
+ break;
+
#endif
+ case VK_QUERY_TYPE_RESULT_STATUS_ONLY_KHR:
+ uint64s_per_slot = 1;
+ break;
default:
assert(!"Invalid query type");
}
- if (!vk_object_multialloc(&device->vk, &ma, pAllocator,
- VK_OBJECT_TYPE_QUERY_POOL))
- return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+ if (!vk_multialloc_zalloc2(&ma, &device->vk.alloc, pAllocator,
+ VK_SYSTEM_ALLOCATION_SCOPE_OBJECT))
+ return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
- pool->type = pCreateInfo->queryType;
- pool->pipeline_statistics = pipeline_statistics;
+ vk_query_pool_init(&device->vk, &pool->vk, pCreateInfo);
pool->stride = uint64s_per_slot * sizeof(uint64_t);
- pool->slots = pCreateInfo->queryCount;
- if (pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL) {
+ if (pool->vk.query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL) {
pool->data_offset = data_offset;
pool->snapshot_size = (pool->stride - data_offset) / 2;
}
-#if GFX_VER >= 8
- else if (pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
+ else if (pool->vk.query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
pool->pass_size = pool->stride / n_passes;
pool->data_offset = data_offset;
pool->snapshot_size = (pool->pass_size - data_offset) / 2;
@@ -192,19 +222,27 @@ VkResult genX(CreateQueryPool)(
perf_query_info->counterIndexCount,
pool->pass_query);
}
-#endif
- uint64_t size = pool->slots * (uint64_t)pool->stride;
+ uint64_t size = pool->vk.query_count * (uint64_t)pool->stride;
+
+ /* For KHR_performance_query we need some space in the buffer for a small
+ * batch updating ANV_PERF_QUERY_OFFSET_REG.
+ */
+ if (pool->vk.query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
+ pool->khr_perf_preamble_stride = 32;
+ pool->khr_perf_preambles_offset = size;
+ size += (uint64_t)pool->n_passes * pool->khr_perf_preamble_stride;
+ }
+ }
+
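Editor's note (not part of the patch): the sizing above implies that the per-pass 32-byte MI preambles live right after the query slots; a sketch of the resulting offset computation (helper name is illustrative, not anv's).

#include <stdint.h>

static uint64_t
khr_perf_preamble_offset_example(uint64_t query_count, uint64_t slot_stride,
                                 uint32_t preamble_stride /* 32 */,
                                 uint32_t pass)
{
   uint64_t preambles_offset = query_count * slot_stride;   /* query slots come first */
   return preambles_offset + (uint64_t)pass * preamble_stride;
}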
result = anv_device_alloc_bo(device, "query-pool", size,
ANV_BO_ALLOC_MAPPED |
- ANV_BO_ALLOC_SNOOPED,
+ ANV_BO_ALLOC_HOST_CACHED_COHERENT,
0 /* explicit_address */,
&pool->bo);
if (result != VK_SUCCESS)
goto fail;
-#if GFX_VER >= 8
- if (pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
+ if (pool->vk.query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
for (uint32_t p = 0; p < pool->n_passes; p++) {
struct mi_builder b;
struct anv_batch batch = {
@@ -213,13 +251,14 @@ VkResult genX(CreateQueryPool)(
};
batch.next = batch.start;
- mi_builder_init(&b, &device->info, &batch);
+ mi_builder_init(&b, device->info, &batch);
mi_store(&b, mi_reg64(ANV_PERF_QUERY_OFFSET_REG),
mi_imm(p * (uint64_t)pool->pass_size));
anv_batch_emit(&batch, GENX(MI_BATCH_BUFFER_END), bbe);
}
}
-#endif
+
+ ANV_RMV(query_pool_create, device, pool, false);
*pQueryPool = anv_query_pool_to_handle(pool);
@@ -242,47 +281,36 @@ void genX(DestroyQueryPool)(
if (!pool)
return;
+ ANV_RMV(resource_destroy, device, pool);
+
anv_device_release_bo(device, pool->bo);
vk_object_free(&device->vk, pAllocator, pool);
}
-#if GFX_VER >= 8
/**
* VK_KHR_performance_query layout :
*
* --------------------------------------------
* | availability (8b) | | |
* |-------------------------------| | |
- * | Small batch loading | | |
- * | ANV_PERF_QUERY_OFFSET_REG | | |
- * | (24b) | | Pass 0 |
- * |-------------------------------| | |
* | some padding (see | | |
- * | query_field_layout:alignment) | | |
+ * | query_field_layout:alignment) | | Pass 0 |
* |-------------------------------| | |
* | query data | | |
* | (2 * query_field_layout:size) | | |
* |-------------------------------|-- | Query 0
* | availability (8b) | | |
* |-------------------------------| | |
- * | Small batch loading | | |
- * | ANV_PERF_QUERY_OFFSET_REG | | |
- * | (24b) | | Pass 1 |
- * |-------------------------------| | |
* | some padding (see | | |
- * | query_field_layout:alignment) | | |
+ * | query_field_layout:alignment) | | Pass 1 |
* |-------------------------------| | |
* | query data | | |
* | (2 * query_field_layout:size) | | |
* |-------------------------------|-----------
* | availability (8b) | | |
* |-------------------------------| | |
- * | Small batch loading | | |
- * | ANV_PERF_QUERY_OFFSET_REG | | |
- * | (24b) | | Pass 0 |
- * |-------------------------------| | |
* | some padding (see | | |
- * | query_field_layout:alignment) | | |
+ * | query_field_layout:alignment) | | Pass 0 |
* |-------------------------------| | |
* | query data | | |
* | (2 * query_field_layout:size) | | |
@@ -333,7 +361,7 @@ khr_perf_query_ensure_relocs(struct anv_cmd_buffer *cmd_buffer)
const struct anv_physical_device *pdevice = device->physical;
cmd_buffer->self_mod_locations =
- vk_alloc(&cmd_buffer->pool->alloc,
+ vk_alloc(&cmd_buffer->vk.pool->alloc,
pdevice->n_perf_query_commands * sizeof(*cmd_buffer->self_mod_locations), 8,
VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
@@ -344,7 +372,6 @@ khr_perf_query_ensure_relocs(struct anv_cmd_buffer *cmd_buffer)
return true;
}
-#endif
/**
* VK_INTEL_performance_query layout :
@@ -396,8 +423,7 @@ query_slot(struct anv_query_pool *pool, uint32_t query)
static bool
query_is_available(struct anv_query_pool *pool, uint32_t query)
{
-#if GFX_VER >= 8
- if (pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
+ if (pool->vk.query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
for (uint32_t p = 0; p < pool->n_passes; p++) {
volatile uint64_t *slot =
pool->bo->map + khr_perf_query_availability_offset(pool, query, p);
@@ -406,7 +432,6 @@ query_is_available(struct anv_query_pool *pool, uint32_t query)
}
return true;
}
-#endif
return *(volatile uint64_t *)query_slot(pool, query);
}
@@ -415,17 +440,29 @@ static VkResult
wait_for_available(struct anv_device *device,
struct anv_query_pool *pool, uint32_t query)
{
- uint64_t abs_timeout = anv_get_absolute_timeout(2 * NSEC_PER_SEC);
+ /* By default we leave a 2s timeout before declaring the device lost. */
+ uint64_t rel_timeout = 2 * NSEC_PER_SEC;
+ if (pool->vk.query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
+ /* With performance queries, there is an additional 500us reconfiguration
+ * time in i915.
+ */
+ rel_timeout += 500 * 1000;
+ /* Additionally a command buffer can be replayed N times to gather data
+ * for each of the metric sets to capture all the counters requested.
+ */
+ rel_timeout *= pool->n_passes;
+ }
+ uint64_t abs_timeout_ns = os_time_get_absolute_timeout(rel_timeout);
- while (anv_gettime_ns() < abs_timeout) {
+ while (os_time_get_nano() < abs_timeout_ns) {
if (query_is_available(pool, query))
return VK_SUCCESS;
- VkResult status = anv_device_query_status(device);
+ VkResult status = vk_device_check_status(&device->vk);
if (status != VK_SUCCESS)
return status;
}
- return anv_device_set_lost(device, "query timeout");
+ return vk_device_set_lost(&device->vk, "query timeout");
}
VkResult genX(GetQueryPoolResults)(
@@ -441,14 +478,23 @@ VkResult genX(GetQueryPoolResults)(
ANV_FROM_HANDLE(anv_device, device, _device);
ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
- assert(pool->type == VK_QUERY_TYPE_OCCLUSION ||
- pool->type == VK_QUERY_TYPE_PIPELINE_STATISTICS ||
- pool->type == VK_QUERY_TYPE_TIMESTAMP ||
- pool->type == VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT ||
- pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR ||
- pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL);
-
- if (anv_device_is_lost(device))
+ assert(
+#if GFX_VERx10 >= 125
+ pool->vk.query_type == VK_QUERY_TYPE_ACCELERATION_STRUCTURE_COMPACTED_SIZE_KHR ||
+ pool->vk.query_type == VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_SIZE_KHR ||
+ pool->vk.query_type == VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SIZE_KHR ||
+ pool->vk.query_type == VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_BOTTOM_LEVEL_POINTERS_KHR ||
+#endif
+ pool->vk.query_type == VK_QUERY_TYPE_OCCLUSION ||
+ pool->vk.query_type == VK_QUERY_TYPE_PIPELINE_STATISTICS ||
+ pool->vk.query_type == VK_QUERY_TYPE_TIMESTAMP ||
+ pool->vk.query_type == VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT ||
+ pool->vk.query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR ||
+ pool->vk.query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL ||
+ pool->vk.query_type == VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT ||
+ pool->vk.query_type == VK_QUERY_TYPE_RESULT_STATUS_ONLY_KHR);
+
+ if (vk_device_is_lost(&device->vk))
return VK_ERROR_DEVICE_LOST;
if (pData == NULL)
@@ -487,8 +533,9 @@ VkResult genX(GetQueryPoolResults)(
bool write_results = available || (flags & VK_QUERY_RESULT_PARTIAL_BIT);
uint32_t idx = 0;
- switch (pool->type) {
- case VK_QUERY_TYPE_OCCLUSION: {
+ switch (pool->vk.query_type) {
+ case VK_QUERY_TYPE_OCCLUSION:
+ case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT: {
uint64_t *slot = query_slot(pool, firstQuery + i);
if (write_results) {
/* From the Vulkan 1.2.132 spec:
@@ -507,22 +554,16 @@ VkResult genX(GetQueryPoolResults)(
case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
uint64_t *slot = query_slot(pool, firstQuery + i);
- uint32_t statistics = pool->pipeline_statistics;
+ uint32_t statistics = pool->vk.pipeline_statistics;
while (statistics) {
- uint32_t stat = u_bit_scan(&statistics);
+ UNUSED uint32_t stat = u_bit_scan(&statistics);
if (write_results) {
uint64_t result = slot[idx * 2 + 2] - slot[idx * 2 + 1];
-
- /* WaDividePSInvocationCountBy4:HSW,BDW */
- if ((device->info.ver == 8 || device->info.is_haswell) &&
- (1 << stat) == VK_QUERY_PIPELINE_STATISTIC_FRAGMENT_SHADER_INVOCATIONS_BIT)
- result >>= 2;
-
cpu_write_query_result(pData, flags, idx, result);
}
idx++;
}
- assert(idx == util_bitcount(pool->pipeline_statistics));
+ assert(idx == util_bitcount(pool->vk.pipeline_statistics));
break;
}
@@ -537,6 +578,26 @@ VkResult genX(GetQueryPoolResults)(
break;
}
+#if GFX_VERx10 >= 125
+ case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SIZE_KHR:
+ case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_COMPACTED_SIZE_KHR:
+ case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_SIZE_KHR: {
+ uint64_t *slot = query_slot(pool, firstQuery + i);
+ if (write_results)
+ cpu_write_query_result(pData, flags, idx, slot[1]);
+ idx++;
+ break;
+ }
+
+ case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_BOTTOM_LEVEL_POINTERS_KHR: {
+ uint64_t *slot = query_slot(pool, firstQuery + i);
+ if (write_results)
+ cpu_write_query_result(pData, flags, idx, slot[2]);
+ idx++;
+ break;
+ }
+#endif
+
case VK_QUERY_TYPE_TIMESTAMP: {
uint64_t *slot = query_slot(pool, firstQuery + i);
if (write_results)
@@ -545,7 +606,6 @@ VkResult genX(GetQueryPoolResults)(
break;
}
-#if GFX_VER >= 8
case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: {
const struct anv_physical_device *pdevice = device->physical;
assert((flags & (VK_QUERY_RESULT_WITH_AVAILABILITY_BIT |
@@ -554,7 +614,7 @@ VkResult genX(GetQueryPoolResults)(
const struct intel_perf_query_info *query = pool->pass_query[p];
struct intel_perf_query_result result;
intel_perf_query_result_clear(&result);
- intel_perf_query_result_accumulate_fields(&result, query, &device->info,
+ intel_perf_query_result_accumulate_fields(&result, query,
pool->bo->map + khr_perf_query_data_offset(pool, firstQuery + i, p, false),
pool->bo->map + khr_perf_query_data_offset(pool, firstQuery + i, p, true),
false /* no_oa_accumulate */);
@@ -562,7 +622,6 @@ VkResult genX(GetQueryPoolResults)(
}
break;
}
-#endif
case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: {
if (!write_results)
@@ -571,18 +630,26 @@ VkResult genX(GetQueryPoolResults)(
const struct intel_perf_query_info *query = &device->physical->perf->queries[0];
struct intel_perf_query_result result;
intel_perf_query_result_clear(&result);
- intel_perf_query_result_accumulate_fields(&result, query, &device->info,
+ intel_perf_query_result_accumulate_fields(&result, query,
query_data + intel_perf_query_data_offset(pool, false),
query_data + intel_perf_query_data_offset(pool, true),
false /* no_oa_accumulate */);
intel_perf_query_result_write_mdapi(pData, stride,
- &device->info,
+ device->info,
query, &result);
const uint64_t *marker = query_data + intel_perf_marker_offset();
- intel_perf_query_mdapi_write_marker(pData, stride, &device->info, *marker);
+ intel_perf_query_mdapi_write_marker(pData, stride, device->info, *marker);
break;
}
+ case VK_QUERY_TYPE_RESULT_STATUS_ONLY_KHR:
+ if (!write_results)
+ break;
+ const uint32_t *query_data = query_slot(pool, firstQuery + i);
+ uint32_t result = available ? *query_data : 0;
+ cpu_write_query_result(pData, flags, idx, result);
+ break;
+
default:
unreachable("invalid pool type");
}
@@ -608,15 +675,11 @@ emit_ps_depth_count(struct anv_cmd_buffer *cmd_buffer,
cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_POST_SYNC_BIT;
genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
- anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
- pc.DestinationAddressType = DAT_PPGTT;
- pc.PostSyncOperation = WritePSDepthCount;
- pc.DepthStallEnable = true;
- pc.Address = addr;
-
- if (GFX_VER == 9 && cmd_buffer->device->info.gt == 4)
- pc.CommandStreamerStallEnable = true;
- }
+ bool cs_stall_needed = (GFX_VER == 9 && cmd_buffer->device->info->gt == 4);
+ genx_batch_emit_pipe_control_write
+ (&cmd_buffer->batch, cmd_buffer->device->info,
+ cmd_buffer->state.current_pipeline, WritePSDepthCount, addr, 0,
+ ANV_PIPE_DEPTH_STALL_BIT | (cs_stall_needed ? ANV_PIPE_CS_STALL_BIT : 0));
}
static void
@@ -635,12 +698,10 @@ emit_query_pc_availability(struct anv_cmd_buffer *cmd_buffer,
cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_POST_SYNC_BIT;
genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
- anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
- pc.DestinationAddressType = DAT_PPGTT;
- pc.PostSyncOperation = WriteImmediateData;
- pc.Address = addr;
- pc.ImmediateData = available;
- }
+ genx_batch_emit_pipe_control_write
+ (&cmd_buffer->batch, cmd_buffer->device->info,
+ cmd_buffer->state.current_pipeline, WriteImmediateData, addr,
+ available, 0);
}
/**
@@ -652,7 +713,7 @@ emit_zero_queries(struct anv_cmd_buffer *cmd_buffer,
struct mi_builder *b, struct anv_query_pool *pool,
uint32_t first_index, uint32_t num_queries)
{
- switch (pool->type) {
+ switch (pool->vk.query_type) {
case VK_QUERY_TYPE_OCCLUSION:
case VK_QUERY_TYPE_TIMESTAMP:
/* These queries are written with a PIPE_CONTROL so clear them using the
@@ -673,6 +734,7 @@ emit_zero_queries(struct anv_cmd_buffer *cmd_buffer,
}
break;
+ case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT:
case VK_QUERY_TYPE_PIPELINE_STATISTICS:
case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
for (uint32_t i = 0; i < num_queries; i++) {
@@ -683,7 +745,6 @@ emit_zero_queries(struct anv_cmd_buffer *cmd_buffer,
}
break;
-#if GFX_VER >= 8
case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: {
for (uint32_t i = 0; i < num_queries; i++) {
for (uint32_t p = 0; p < pool->n_passes; p++) {
@@ -696,7 +757,6 @@ emit_zero_queries(struct anv_cmd_buffer *cmd_buffer,
}
break;
}
-#endif
case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL:
for (uint32_t i = 0; i < num_queries; i++) {
@@ -720,10 +780,44 @@ void genX(CmdResetQueryPool)(
{
ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
+ struct anv_physical_device *pdevice = cmd_buffer->device->physical;
+
+ /* Shader clearing is only possible on render/compute */
+ if (anv_cmd_buffer_is_render_or_compute_queue(cmd_buffer) &&
+ queryCount >= pdevice->instance->query_clear_with_blorp_threshold) {
+ trace_intel_begin_query_clear_blorp(&cmd_buffer->trace);
+
+ anv_cmd_buffer_fill_area(cmd_buffer,
+ anv_query_address(pool, firstQuery),
+ queryCount * pool->stride,
+ 0);
+
+ /* The pending clear writes count as compute writes if we're in GPGPU
+ * mode on the render engine or running on the compute engine.
+ */
+ if (anv_cmd_buffer_is_compute_queue(cmd_buffer) ||
+ cmd_buffer->state.current_pipeline == pdevice->gpgpu_pipeline_value) {
+ cmd_buffer->state.queries.clear_bits =
+ ANV_QUERY_COMPUTE_WRITES_PENDING_BITS;
+ } else {
+ cmd_buffer->state.queries.clear_bits =
+ ANV_QUERY_RENDER_TARGET_WRITES_PENDING_BITS(&pdevice->info);
+ }
+
+ trace_intel_end_query_clear_blorp(&cmd_buffer->trace, queryCount);
+ return;
+ }
- switch (pool->type) {
+ trace_intel_begin_query_clear_cs(&cmd_buffer->trace);
+
+ switch (pool->vk.query_type) {
case VK_QUERY_TYPE_OCCLUSION:
- case VK_QUERY_TYPE_TIMESTAMP:
+#if GFX_VERx10 >= 125
+ case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_COMPACTED_SIZE_KHR:
+ case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_SIZE_KHR:
+ case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SIZE_KHR:
+ case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_BOTTOM_LEVEL_POINTERS_KHR:
+#endif
for (uint32_t i = 0; i < queryCount; i++) {
emit_query_pc_availability(cmd_buffer,
anv_query_address(pool, firstQuery + i),
@@ -731,20 +825,37 @@ void genX(CmdResetQueryPool)(
}
break;
+ case VK_QUERY_TYPE_TIMESTAMP: {
+ for (uint32_t i = 0; i < queryCount; i++) {
+ emit_query_pc_availability(cmd_buffer,
+ anv_query_address(pool, firstQuery + i),
+ false);
+ }
+
+ /* Add a CS stall here to make sure the PIPE_CONTROL above has
+ * completed. Otherwise some timestamps written later with MI_STORE_*
+ * commands might race with the PIPE_CONTROL in the loop above.
+ */
+ anv_add_pending_pipe_bits(cmd_buffer, ANV_PIPE_CS_STALL_BIT,
+ "vkCmdResetQueryPool of timestamps");
+ genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
+ break;
+ }
+
case VK_QUERY_TYPE_PIPELINE_STATISTICS:
- case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT: {
+ case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
+ case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT: {
struct mi_builder b;
- mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);
+ mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
for (uint32_t i = 0; i < queryCount; i++)
emit_query_mi_availability(&b, anv_query_address(pool, firstQuery + i), false);
break;
}
-#if GFX_VER >= 8
case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: {
struct mi_builder b;
- mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);
+ mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
for (uint32_t i = 0; i < queryCount; i++) {
for (uint32_t p = 0; p < pool->n_passes; p++) {
@@ -756,20 +867,24 @@ void genX(CmdResetQueryPool)(
}
break;
}
-#endif
case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: {
struct mi_builder b;
- mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);
+ mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
for (uint32_t i = 0; i < queryCount; i++)
emit_query_mi_availability(&b, anv_query_address(pool, firstQuery + i), false);
break;
}
-
+ case VK_QUERY_TYPE_RESULT_STATUS_ONLY_KHR:
+ for (uint32_t i = 0; i < queryCount; i++)
+ emit_query_mi_flush_availability(cmd_buffer, anv_query_address(pool, firstQuery + i), false);
+ break;
default:
unreachable("Unsupported query type");
}
+
+ trace_intel_end_query_clear_cs(&cmd_buffer->trace, queryCount);
}
void genX(ResetQueryPool)(
@@ -781,14 +896,12 @@ void genX(ResetQueryPool)(
ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
for (uint32_t i = 0; i < queryCount; i++) {
- if (pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
-#if GFX_VER >= 8
+ if (pool->vk.query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
for (uint32_t p = 0; p < pool->n_passes; p++) {
uint64_t *pass_slot = pool->bo->map +
khr_perf_query_availability_offset(pool, firstQuery + i, p);
*pass_slot = 0;
}
-#endif
} else {
uint64_t *slot = query_slot(pool, firstQuery + i);
*slot = 0;
@@ -858,6 +971,7 @@ emit_perf_intel_query(struct anv_cmd_buffer *cmd_buffer,
case INTEL_PERF_QUERY_FIELD_TYPE_SRM_PERFCNT:
case INTEL_PERF_QUERY_FIELD_TYPE_SRM_RPSTAT:
+ case INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_A:
case INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_B:
case INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_C: {
struct anv_address addr = anv_address_add(data_addr, field->location);
@@ -877,15 +991,22 @@ emit_perf_intel_query(struct anv_cmd_buffer *cmd_buffer,
}
}
-void genX(CmdBeginQuery)(
- VkCommandBuffer commandBuffer,
- VkQueryPool queryPool,
- uint32_t query,
- VkQueryControlFlags flags)
+static void
+emit_query_clear_flush(struct anv_cmd_buffer *cmd_buffer,
+ struct anv_query_pool *pool,
+ const char *reason)
{
- genX(CmdBeginQueryIndexedEXT)(commandBuffer, queryPool, query, flags, 0);
+ if (cmd_buffer->state.queries.clear_bits == 0)
+ return;
+
+ anv_add_pending_pipe_bits(cmd_buffer,
+ ANV_PIPE_QUERY_BITS(
+ cmd_buffer->state.queries.clear_bits),
+ reason);
+ genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
}
+
void genX(CmdBeginQueryIndexedEXT)(
VkCommandBuffer commandBuffer,
VkQueryPool queryPool,
@@ -897,22 +1018,39 @@ void genX(CmdBeginQueryIndexedEXT)(
ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
struct anv_address query_addr = anv_query_address(pool, query);
+ emit_query_clear_flush(cmd_buffer, pool, "CmdBeginQuery* flush query clears");
+
struct mi_builder b;
- mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);
+ mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
+ const uint32_t mocs = anv_mocs_for_address(cmd_buffer->device, &query_addr);
+ mi_builder_set_mocs(&b, mocs);
- switch (pool->type) {
+ switch (pool->vk.query_type) {
case VK_QUERY_TYPE_OCCLUSION:
+ cmd_buffer->state.gfx.n_occlusion_queries++;
+ cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_OCCLUSION_QUERY_ACTIVE;
emit_ps_depth_count(cmd_buffer, anv_address_add(query_addr, 8));
break;
+ case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT:
+ genx_batch_emit_pipe_control(&cmd_buffer->batch,
+ cmd_buffer->device->info,
+ cmd_buffer->state.current_pipeline,
+ ANV_PIPE_CS_STALL_BIT |
+ ANV_PIPE_STALL_AT_SCOREBOARD_BIT);
+ mi_store(&b, mi_mem64(anv_address_add(query_addr, 8)),
+ mi_reg64(GENX(CL_INVOCATION_COUNT_num)));
+ break;
+
case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
/* TODO: This might only be necessary for certain stats */
- anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
- pc.CommandStreamerStallEnable = true;
- pc.StallAtPixelScoreboard = true;
- }
+ genx_batch_emit_pipe_control(&cmd_buffer->batch,
+ cmd_buffer->device->info,
+ cmd_buffer->state.current_pipeline,
+ ANV_PIPE_CS_STALL_BIT |
+ ANV_PIPE_STALL_AT_SCOREBOARD_BIT);
- uint32_t statistics = pool->pipeline_statistics;
+ uint32_t statistics = pool->vk.pipeline_statistics;
uint32_t offset = 8;
while (statistics) {
uint32_t stat = u_bit_scan(&statistics);
@@ -923,14 +1061,14 @@ void genX(CmdBeginQueryIndexedEXT)(
}
case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
- anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
- pc.CommandStreamerStallEnable = true;
- pc.StallAtPixelScoreboard = true;
- }
+ genx_batch_emit_pipe_control(&cmd_buffer->batch,
+ cmd_buffer->device->info,
+ cmd_buffer->state.current_pipeline,
+ ANV_PIPE_CS_STALL_BIT |
+ ANV_PIPE_STALL_AT_SCOREBOARD_BIT);
emit_xfb_query(&b, index, anv_address_add(query_addr, 8));
break;
-#if GFX_VER >= 8
case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: {
if (!khr_perf_query_ensure_relocs(cmd_buffer))
return;
@@ -979,12 +1117,15 @@ void genX(CmdBeginQueryIndexedEXT)(
assert(reloc_idx == pdevice->n_perf_query_commands);
- mi_self_mod_barrier(&b);
+ const struct intel_device_info *devinfo = cmd_buffer->device->info;
+ const enum intel_engine_class engine_class = cmd_buffer->queue_family->engine_class;
+ mi_self_mod_barrier(&b, devinfo->engine_class_prefetch[engine_class]);
- anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
- pc.CommandStreamerStallEnable = true;
- pc.StallAtPixelScoreboard = true;
- }
+ genx_batch_emit_pipe_control(&cmd_buffer->batch,
+ cmd_buffer->device->info,
+ cmd_buffer->state.current_pipeline,
+ ANV_PIPE_CS_STALL_BIT |
+ ANV_PIPE_STALL_AT_SCOREBOARD_BIT);
cmd_buffer->perf_query_pool = pool;
cmd_buffer->perf_reloc_idx = 0;
@@ -1007,6 +1148,7 @@ void genX(CmdBeginQueryIndexedEXT)(
case INTEL_PERF_QUERY_FIELD_TYPE_SRM_PERFCNT:
case INTEL_PERF_QUERY_FIELD_TYPE_SRM_RPSTAT:
+ case INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_A:
case INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_B:
case INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_C:
dws =
@@ -1040,30 +1182,24 @@ void genX(CmdBeginQueryIndexedEXT)(
}
break;
}
-#endif
case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: {
- anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
- pc.CommandStreamerStallEnable = true;
- pc.StallAtPixelScoreboard = true;
- }
+ genx_batch_emit_pipe_control(&cmd_buffer->batch,
+ cmd_buffer->device->info,
+ cmd_buffer->state.current_pipeline,
+ ANV_PIPE_CS_STALL_BIT |
+ ANV_PIPE_STALL_AT_SCOREBOARD_BIT);
emit_perf_intel_query(cmd_buffer, pool, &b, query_addr, false);
break;
}
-
+ case VK_QUERY_TYPE_RESULT_STATUS_ONLY_KHR:
+ emit_query_mi_flush_availability(cmd_buffer, query_addr, false);
+ break;
default:
unreachable("");
}
}
-void genX(CmdEndQuery)(
- VkCommandBuffer commandBuffer,
- VkQueryPool queryPool,
- uint32_t query)
-{
- genX(CmdEndQueryIndexedEXT)(commandBuffer, queryPool, query, 0);
-}
-
void genX(CmdEndQueryIndexedEXT)(
VkCommandBuffer commandBuffer,
VkQueryPool queryPool,
@@ -1075,22 +1211,40 @@ void genX(CmdEndQueryIndexedEXT)(
struct anv_address query_addr = anv_query_address(pool, query);
struct mi_builder b;
- mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);
+ mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
- switch (pool->type) {
+ switch (pool->vk.query_type) {
case VK_QUERY_TYPE_OCCLUSION:
emit_ps_depth_count(cmd_buffer, anv_address_add(query_addr, 16));
emit_query_pc_availability(cmd_buffer, query_addr, true);
+ cmd_buffer->state.gfx.n_occlusion_queries--;
+ cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_OCCLUSION_QUERY_ACTIVE;
+ break;
+
+ case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT:
+ /* Ensure previous commands have completed before capturing the register
+ * value.
+ */
+ genx_batch_emit_pipe_control(&cmd_buffer->batch,
+ cmd_buffer->device->info,
+ cmd_buffer->state.current_pipeline,
+ ANV_PIPE_CS_STALL_BIT |
+ ANV_PIPE_STALL_AT_SCOREBOARD_BIT);
+
+ mi_store(&b, mi_mem64(anv_address_add(query_addr, 16)),
+ mi_reg64(GENX(CL_INVOCATION_COUNT_num)));
+ emit_query_mi_availability(&b, query_addr, true);
break;
case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
/* TODO: This might only be necessary for certain stats */
- anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
- pc.CommandStreamerStallEnable = true;
- pc.StallAtPixelScoreboard = true;
- }
+ genx_batch_emit_pipe_control(&cmd_buffer->batch,
+ cmd_buffer->device->info,
+ cmd_buffer->state.current_pipeline,
+ ANV_PIPE_CS_STALL_BIT |
+ ANV_PIPE_STALL_AT_SCOREBOARD_BIT);
- uint32_t statistics = pool->pipeline_statistics;
+ uint32_t statistics = pool->vk.pipeline_statistics;
uint32_t offset = 16;
while (statistics) {
uint32_t stat = u_bit_scan(&statistics);
@@ -1103,21 +1257,21 @@ void genX(CmdEndQueryIndexedEXT)(
}
case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
- anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
- pc.CommandStreamerStallEnable = true;
- pc.StallAtPixelScoreboard = true;
- }
-
+ genx_batch_emit_pipe_control(&cmd_buffer->batch,
+ cmd_buffer->device->info,
+ cmd_buffer->state.current_pipeline,
+ ANV_PIPE_CS_STALL_BIT |
+ ANV_PIPE_STALL_AT_SCOREBOARD_BIT);
emit_xfb_query(&b, index, anv_address_add(query_addr, 16));
emit_query_mi_availability(&b, query_addr, true);
break;
-#if GFX_VER >= 8
case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: {
- anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
- pc.CommandStreamerStallEnable = true;
- pc.StallAtPixelScoreboard = true;
- }
+ genx_batch_emit_pipe_control(&cmd_buffer->batch,
+ cmd_buffer->device->info,
+ cmd_buffer->state.current_pipeline,
+ ANV_PIPE_CS_STALL_BIT |
+ ANV_PIPE_STALL_AT_SCOREBOARD_BIT);
cmd_buffer->perf_query_pool = pool;
if (!khr_perf_query_ensure_relocs(cmd_buffer))
@@ -1144,6 +1298,7 @@ void genX(CmdEndQueryIndexedEXT)(
case INTEL_PERF_QUERY_FIELD_TYPE_SRM_PERFCNT:
case INTEL_PERF_QUERY_FIELD_TYPE_SRM_RPSTAT:
+ case INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_A:
case INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_B:
case INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_C:
dws =
@@ -1189,13 +1344,13 @@ void genX(CmdEndQueryIndexedEXT)(
assert(cmd_buffer->perf_reloc_idx == pdevice->n_perf_query_commands);
break;
}
-#endif
case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: {
- anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
- pc.CommandStreamerStallEnable = true;
- pc.StallAtPixelScoreboard = true;
- }
+ genx_batch_emit_pipe_control(&cmd_buffer->batch,
+ cmd_buffer->device->info,
+ cmd_buffer->state.current_pipeline,
+ ANV_PIPE_CS_STALL_BIT |
+ ANV_PIPE_STALL_AT_SCOREBOARD_BIT);
uint32_t marker_offset = intel_perf_marker_offset();
mi_store(&b, mi_mem64(anv_address_add(query_addr, marker_offset)),
mi_imm(cmd_buffer->intel_perf_marker));
@@ -1203,6 +1358,9 @@ void genX(CmdEndQueryIndexedEXT)(
emit_query_mi_availability(&b, query_addr, true);
break;
}
+ case VK_QUERY_TYPE_RESULT_STATUS_ONLY_KHR:
+ emit_query_mi_flush_availability(cmd_buffer, query_addr, true);
+ break;
default:
unreachable("");
@@ -1216,9 +1374,9 @@ void genX(CmdEndQueryIndexedEXT)(
* first index, mark the other query indices as being already available
* with result 0.
*/
- if (cmd_buffer->state.subpass && cmd_buffer->state.subpass->view_mask) {
+ if (cmd_buffer->state.gfx.view_mask) {
const uint32_t num_queries =
- util_bitcount(cmd_buffer->state.subpass->view_mask);
+ util_bitcount(cmd_buffer->state.gfx.view_mask);
if (num_queries > 1)
emit_zero_queries(cmd_buffer, &b, pool, query + 1, num_queries - 1);
}
@@ -1226,9 +1384,9 @@ void genX(CmdEndQueryIndexedEXT)(
#define TIMESTAMP 0x2358
-void genX(CmdWriteTimestamp)(
+void genX(CmdWriteTimestamp2)(
VkCommandBuffer commandBuffer,
- VkPipelineStageFlagBits pipelineStage,
+ VkPipelineStageFlags2 stage,
VkQueryPool queryPool,
uint32_t query)
{
@@ -1236,34 +1394,49 @@ void genX(CmdWriteTimestamp)(
ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
struct anv_address query_addr = anv_query_address(pool, query);
- assert(pool->type == VK_QUERY_TYPE_TIMESTAMP);
+ assert(pool->vk.query_type == VK_QUERY_TYPE_TIMESTAMP);
+
+ emit_query_clear_flush(cmd_buffer, pool,
+ "CmdWriteTimestamp flush query clears");
struct mi_builder b;
- mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);
+ mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
- switch (pipelineStage) {
- case VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT:
+ if (stage == VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT) {
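+      /* Top-of-pipe timestamps can be captured immediately by storing the
+       * TIMESTAMP register with MI commands.
+       */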
mi_store(&b, mi_mem64(anv_address_add(query_addr, 8)),
mi_reg64(TIMESTAMP));
- break;
-
- default:
+ emit_query_mi_availability(&b, query_addr, true);
+ } else {
/* Everything else is bottom-of-pipe */
cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_POST_SYNC_BIT;
genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
- anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
- pc.DestinationAddressType = DAT_PPGTT;
- pc.PostSyncOperation = WriteTimestamp;
- pc.Address = anv_address_add(query_addr, 8);
+ bool cs_stall_needed =
+ (GFX_VER == 9 && cmd_buffer->device->info->gt == 4);
- if (GFX_VER == 9 && cmd_buffer->device->info.gt == 4)
- pc.CommandStreamerStallEnable = true;
+ if (anv_cmd_buffer_is_blitter_queue(cmd_buffer) ||
+ anv_cmd_buffer_is_video_queue(cmd_buffer)) {
+ /* Wa_16018063123 - emit fast color dummy blit before MI_FLUSH_DW. */
+ if (intel_needs_workaround(cmd_buffer->device->info, 16018063123)) {
+ genX(batch_emit_fast_color_dummy_blit)(&cmd_buffer->batch,
+ cmd_buffer->device);
+ }
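+         /* PIPE_CONTROL is not available on the blitter/video engines, so
+          * write the timestamp with MI_FLUSH_DW instead.
+          */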
+ anv_batch_emit(&cmd_buffer->batch, GENX(MI_FLUSH_DW), dw) {
+ dw.Address = anv_address_add(query_addr, 8);
+ dw.PostSyncOperation = WriteTimestamp;
+ }
+ emit_query_mi_flush_availability(cmd_buffer, query_addr, true);
+ } else {
+ genx_batch_emit_pipe_control_write
+ (&cmd_buffer->batch, cmd_buffer->device->info,
+ cmd_buffer->state.current_pipeline, WriteTimestamp,
+ anv_address_add(query_addr, 8), 0,
+ cs_stall_needed ? ANV_PIPE_CS_STALL_BIT : 0);
+ emit_query_pc_availability(cmd_buffer, query_addr, true);
}
- break;
+
}
- emit_query_pc_availability(cmd_buffer, query_addr, true);
/* When multiview is active the spec requires that N consecutive query
* indices are used, where N is the number of active views in the subpass.
@@ -1273,16 +1446,14 @@ void genX(CmdWriteTimestamp)(
* first index, mark the other query indices as being already available
* with result 0.
*/
- if (cmd_buffer->state.subpass && cmd_buffer->state.subpass->view_mask) {
+ if (cmd_buffer->state.gfx.view_mask) {
const uint32_t num_queries =
- util_bitcount(cmd_buffer->state.subpass->view_mask);
+ util_bitcount(cmd_buffer->state.gfx.view_mask);
if (num_queries > 1)
emit_zero_queries(cmd_buffer, &b, pool, query + 1, num_queries - 1);
}
}
-#if GFX_VERx10 >= 75
-
#define MI_PREDICATE_SRC0 0x2400
#define MI_PREDICATE_SRC1 0x2408
#define MI_PREDICATE_RESULT 0x2418
@@ -1341,61 +1512,92 @@ compute_query_result(struct mi_builder *b, struct anv_address addr)
mi_mem64(anv_address_add(addr, 0)));
}
-void genX(CmdCopyQueryPoolResults)(
- VkCommandBuffer commandBuffer,
- VkQueryPool queryPool,
- uint32_t firstQuery,
- uint32_t queryCount,
- VkBuffer destBuffer,
- VkDeviceSize destOffset,
- VkDeviceSize destStride,
- VkQueryResultFlags flags)
+static void
+copy_query_results_with_cs(struct anv_cmd_buffer *cmd_buffer,
+ struct anv_query_pool *pool,
+ struct anv_address dest_addr,
+ uint64_t dest_stride,
+ uint32_t first_query,
+ uint32_t query_count,
+ VkQueryResultFlags flags)
{
- ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
- ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
- ANV_FROM_HANDLE(anv_buffer, buffer, destBuffer);
+ enum anv_pipe_bits needed_flushes = 0;
- struct mi_builder b;
- mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);
- struct mi_value result;
+ trace_intel_begin_query_copy_cs(&cmd_buffer->trace);
/* If render target writes are ongoing, request a render target cache flush
* to ensure proper ordering of the commands from the 3d pipe and the
* command streamer.
*/
- if (cmd_buffer->state.pending_pipe_bits & ANV_PIPE_RENDER_TARGET_BUFFER_WRITES) {
- anv_add_pending_pipe_bits(cmd_buffer,
- ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT,
- "CopyQueryPoolResults");
+ if ((cmd_buffer->state.queries.buffer_write_bits |
+ cmd_buffer->state.queries.clear_bits) &
+ ANV_QUERY_WRITES_RT_FLUSH)
+ needed_flushes |= ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT;
+
+ if ((cmd_buffer->state.queries.buffer_write_bits |
+ cmd_buffer->state.queries.clear_bits) &
+ ANV_QUERY_WRITES_TILE_FLUSH)
+ needed_flushes |= ANV_PIPE_TILE_CACHE_FLUSH_BIT;
+
+ if ((cmd_buffer->state.queries.buffer_write_bits |
+ cmd_buffer->state.queries.clear_bits) &
+ ANV_QUERY_WRITES_DATA_FLUSH) {
+ needed_flushes |= (ANV_PIPE_DATA_CACHE_FLUSH_BIT |
+ ANV_PIPE_HDC_PIPELINE_FLUSH_BIT |
+ ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT);
}
- if ((flags & VK_QUERY_RESULT_WAIT_BIT) ||
- (cmd_buffer->state.pending_pipe_bits & ANV_PIPE_FLUSH_BITS) ||
- /* Occlusion & timestamp queries are written using a PIPE_CONTROL and
- * because we're about to copy values from MI commands, we need to
- * stall the command streamer to make sure the PIPE_CONTROL values have
- * landed, otherwise we could see inconsistent values & availability.
- *
- * From the vulkan spec:
- *
- * "vkCmdCopyQueryPoolResults is guaranteed to see the effect of
- * previous uses of vkCmdResetQueryPool in the same queue, without
- * any additional synchronization."
- */
- pool->type == VK_QUERY_TYPE_OCCLUSION ||
- pool->type == VK_QUERY_TYPE_TIMESTAMP) {
+ if ((cmd_buffer->state.queries.buffer_write_bits |
+ cmd_buffer->state.queries.clear_bits) &
+ ANV_QUERY_WRITES_CS_STALL)
+ needed_flushes |= ANV_PIPE_CS_STALL_BIT;
+
+ /* Occlusion & timestamp queries are written using a PIPE_CONTROL and
+ * because we're about to copy values from MI commands, we need to stall
+ * the command streamer to make sure the PIPE_CONTROL values have
+ * landed, otherwise we could see inconsistent values & availability.
+ *
+ * From the vulkan spec:
+ *
+ * "vkCmdCopyQueryPoolResults is guaranteed to see the effect of
+ * previous uses of vkCmdResetQueryPool in the same queue, without any
+ * additional synchronization."
+ */
+ if (pool->vk.query_type == VK_QUERY_TYPE_OCCLUSION ||
+ pool->vk.query_type == VK_QUERY_TYPE_TIMESTAMP)
+ needed_flushes |= ANV_PIPE_CS_STALL_BIT;
+
+ if (needed_flushes) {
anv_add_pending_pipe_bits(cmd_buffer,
- ANV_PIPE_CS_STALL_BIT,
+ needed_flushes,
"CopyQueryPoolResults");
genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
}
- struct anv_address dest_addr = anv_address_add(buffer->address, destOffset);
- for (uint32_t i = 0; i < queryCount; i++) {
- struct anv_address query_addr = anv_query_address(pool, firstQuery + i);
+ struct mi_builder b;
+ mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
+ struct mi_value result;
+
+ for (uint32_t i = 0; i < query_count; i++) {
+ struct anv_address query_addr = anv_query_address(pool, first_query + i);
+ const uint32_t mocs = anv_mocs_for_address(cmd_buffer->device, &query_addr);
+
+ mi_builder_set_mocs(&b, mocs);
+
+ /* Wait for the availability write to land before we go read the data */
+ if (flags & VK_QUERY_RESULT_WAIT_BIT) {
+ anv_batch_emit(&cmd_buffer->batch, GENX(MI_SEMAPHORE_WAIT), sem) {
+ sem.WaitMode = PollingMode;
+ sem.CompareOperation = COMPARE_SAD_EQUAL_SDD;
+ sem.SemaphoreDataDword = true;
+ sem.SemaphoreAddress = query_addr;
+ }
+ }
+
uint32_t idx = 0;
- switch (pool->type) {
+ switch (pool->vk.query_type) {
case VK_QUERY_TYPE_OCCLUSION:
+ case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT:
result = compute_query_result(&b, anv_address_add(query_addr, 8));
/* Like in the case of vkGetQueryPoolResults, if the query is
* unavailable and the VK_QUERY_RESULT_PARTIAL_BIT flag is set,
@@ -1403,32 +1605,23 @@ void genX(CmdCopyQueryPoolResults)(
* VK_QUERY_RESULT_PARTIAL_BIT isn't set, don't write any value.
*/
gpu_write_query_result_cond(cmd_buffer, &b, query_addr, dest_addr,
- 1 /* available */, flags, idx, result);
+ 1 /* available */, flags, idx, result);
if (flags & VK_QUERY_RESULT_PARTIAL_BIT) {
gpu_write_query_result_cond(cmd_buffer, &b, query_addr, dest_addr,
- 0 /* unavailable */, flags, idx, mi_imm(0));
+ 0 /* unavailable */, flags, idx, mi_imm(0));
}
idx++;
break;
case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
- uint32_t statistics = pool->pipeline_statistics;
+ uint32_t statistics = pool->vk.pipeline_statistics;
while (statistics) {
- uint32_t stat = u_bit_scan(&statistics);
-
+ UNUSED uint32_t stat = u_bit_scan(&statistics);
result = compute_query_result(&b, anv_address_add(query_addr,
idx * 16 + 8));
-
- /* WaDividePSInvocationCountBy4:HSW,BDW */
- if ((cmd_buffer->device->info.ver == 8 ||
- cmd_buffer->device->info.is_haswell) &&
- (1 << stat) == VK_QUERY_PIPELINE_STATISTIC_FRAGMENT_SHADER_INVOCATIONS_BIT) {
- result = mi_ushr32_imm(&b, result, 2);
- }
-
gpu_write_query_result(&b, dest_addr, flags, idx++, result);
}
- assert(idx == util_bitcount(pool->pipeline_statistics));
+ assert(idx == util_bitcount(pool->vk.pipeline_statistics));
break;
}
@@ -1444,11 +1637,23 @@ void genX(CmdCopyQueryPoolResults)(
gpu_write_query_result(&b, dest_addr, flags, idx++, result);
break;
-#if GFX_VER >= 8
+#if GFX_VERx10 >= 125
+ case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_COMPACTED_SIZE_KHR:
+ case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_SIZE_KHR:
+ case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SIZE_KHR:
+ result = mi_mem64(anv_address_add(query_addr, 8));
+ gpu_write_query_result(&b, dest_addr, flags, idx++, result);
+ break;
+
+ case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_BOTTOM_LEVEL_POINTERS_KHR:
+ result = mi_mem64(anv_address_add(query_addr, 16));
+ gpu_write_query_result(&b, dest_addr, flags, idx++, result);
+ break;
+#endif
+
case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR:
unreachable("Copy KHR performance query results not implemented");
break;
-#endif
default:
unreachable("unhandled query type");
@@ -1459,11 +1664,182 @@ void genX(CmdCopyQueryPoolResults)(
mi_mem64(query_addr));
}
- dest_addr = anv_address_add(dest_addr, destStride);
+ dest_addr = anv_address_add(dest_addr, dest_stride);
}
+
+ trace_intel_end_query_copy_cs(&cmd_buffer->trace, query_count);
+}
+
+static void
+copy_query_results_with_shader(struct anv_cmd_buffer *cmd_buffer,
+ struct anv_query_pool *pool,
+ struct anv_address dest_addr,
+ uint64_t dest_stride,
+ uint32_t first_query,
+ uint32_t query_count,
+ VkQueryResultFlags flags)
+{
+ struct anv_device *device = cmd_buffer->device;
+ enum anv_pipe_bits needed_flushes = 0;
+
+ trace_intel_begin_query_copy_shader(&cmd_buffer->trace);
+
+   /* If this is the first command in the batch buffer, make sure we have a
+    * consistent pipeline mode.
+ */
+ if (cmd_buffer->state.current_pipeline == UINT32_MAX)
+ genX(flush_pipeline_select_3d)(cmd_buffer);
+
+ if ((cmd_buffer->state.queries.buffer_write_bits |
+ cmd_buffer->state.queries.clear_bits) & ANV_QUERY_WRITES_RT_FLUSH)
+ needed_flushes |= ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT;
+
+ if ((cmd_buffer->state.queries.buffer_write_bits |
+ cmd_buffer->state.queries.clear_bits) & ANV_QUERY_WRITES_DATA_FLUSH) {
+ needed_flushes |= (ANV_PIPE_HDC_PIPELINE_FLUSH_BIT |
+ ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT);
+ }
+
+ /* Flushes for the queries to complete */
+ if (flags & VK_QUERY_RESULT_WAIT_BIT) {
+      /* Some queries are written by shaders, so we need them to flush their
+       * higher level cache writes. The L3 cache should be shared across the GPU.
+ */
+ if (pool->vk.query_type == VK_QUERY_TYPE_ACCELERATION_STRUCTURE_COMPACTED_SIZE_KHR ||
+ pool->vk.query_type == VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SIZE_KHR ||
+ pool->vk.query_type == VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_SIZE_KHR ||
+ pool->vk.query_type == VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_BOTTOM_LEVEL_POINTERS_KHR) {
+ needed_flushes |= ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT;
+ }
+ /* And we need to stall for previous CS writes to land or the flushes to
+ * complete.
+ */
+ needed_flushes |= ANV_PIPE_CS_STALL_BIT;
+ }
+
+ /* Occlusion & timestamp queries are written using a PIPE_CONTROL and
+ * because we're about to copy values from MI commands, we need to stall
+ * the command streamer to make sure the PIPE_CONTROL values have
+ * landed, otherwise we could see inconsistent values & availability.
+ *
+ * From the vulkan spec:
+ *
+ * "vkCmdCopyQueryPoolResults is guaranteed to see the effect of
+ * previous uses of vkCmdResetQueryPool in the same queue, without any
+ * additional synchronization."
+ */
+ if (pool->vk.query_type == VK_QUERY_TYPE_OCCLUSION ||
+ pool->vk.query_type == VK_QUERY_TYPE_TIMESTAMP)
+ needed_flushes |= ANV_PIPE_CS_STALL_BIT;
+
+ if (needed_flushes) {
+ anv_add_pending_pipe_bits(cmd_buffer,
+ needed_flushes | ANV_PIPE_END_OF_PIPE_SYNC_BIT,
+ "CopyQueryPoolResults");
+ genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
+ }
+
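+   /* Copy with an internal kernel dispatched through the simple shader
+    * infrastructure, using a compute or fragment variant depending on the
+    * currently selected pipeline.
+    */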
+ struct anv_shader_bin *copy_kernel;
+ VkResult ret =
+ anv_device_get_internal_shader(
+ cmd_buffer->device,
+ cmd_buffer->state.current_pipeline == GPGPU ?
+ ANV_INTERNAL_KERNEL_COPY_QUERY_RESULTS_COMPUTE :
+ ANV_INTERNAL_KERNEL_COPY_QUERY_RESULTS_FRAGMENT,
+ &copy_kernel);
+ if (ret != VK_SUCCESS) {
+ anv_batch_set_error(&cmd_buffer->batch, ret);
+ return;
+ }
+
+ struct anv_simple_shader state = {
+ .device = cmd_buffer->device,
+ .cmd_buffer = cmd_buffer,
+ .dynamic_state_stream = &cmd_buffer->dynamic_state_stream,
+ .general_state_stream = &cmd_buffer->general_state_stream,
+ .batch = &cmd_buffer->batch,
+ .kernel = copy_kernel,
+ .l3_config = device->internal_kernels_l3_config,
+ .urb_cfg = &cmd_buffer->state.gfx.urb_cfg,
+ };
+ genX(emit_simple_shader_init)(&state);
+
+ struct anv_state push_data_state =
+ genX(simple_shader_alloc_push)(&state,
+ sizeof(struct anv_query_copy_params));
+ if (push_data_state.map == NULL)
+ return;
+
+ struct anv_query_copy_params *params = push_data_state.map;
+
+ uint32_t copy_flags =
+ ((flags & VK_QUERY_RESULT_64_BIT) ? ANV_COPY_QUERY_FLAG_RESULT64 : 0) |
+ ((flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) ? ANV_COPY_QUERY_FLAG_AVAILABLE : 0);
+
+ uint32_t num_items = 1;
+ uint32_t data_offset = 8 /* behind availability */;
+ switch (pool->vk.query_type) {
+ case VK_QUERY_TYPE_OCCLUSION:
+ case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT:
+ copy_flags |= ANV_COPY_QUERY_FLAG_DELTA;
+      /* These 2 queries are the only ones where we could have partial data
+       * because they are captured with a PIPE_CONTROL post-sync operation. The
+       * other ones are captured with MI_STORE_REGISTER_DATA, so the results are
+       * always available by the time we reach the copy command.
+ */
+ copy_flags |= (flags & VK_QUERY_RESULT_PARTIAL_BIT) ? ANV_COPY_QUERY_FLAG_PARTIAL : 0;
+ break;
+
+ case VK_QUERY_TYPE_PIPELINE_STATISTICS:
+ num_items = util_bitcount(pool->vk.pipeline_statistics);
+ copy_flags |= ANV_COPY_QUERY_FLAG_DELTA;
+ break;
+
+ case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
+ num_items = 2;
+ copy_flags |= ANV_COPY_QUERY_FLAG_DELTA;
+ break;
+
+ case VK_QUERY_TYPE_TIMESTAMP:
+ case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_COMPACTED_SIZE_KHR:
+ case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_SIZE_KHR:
+ case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SIZE_KHR:
+ break;
+
+ case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_BOTTOM_LEVEL_POINTERS_KHR:
+ data_offset += 8;
+ break;
+
+ default:
+ unreachable("unhandled query type");
+ }
+
+ *params = (struct anv_query_copy_params) {
+ .flags = copy_flags,
+ .num_queries = query_count,
+ .num_items = num_items,
+ .query_base = first_query,
+ .query_stride = pool->stride,
+ .query_data_offset = data_offset,
+ .destination_stride = dest_stride,
+ .query_data_addr = anv_address_physical(
+ (struct anv_address) {
+ .bo = pool->bo,
+ }),
+ .destination_addr = anv_address_physical(dest_addr),
+ };
+
+ genX(emit_simple_shader_dispatch)(&state, query_count, push_data_state);
+
+   /* The query copy shader writes through the dataport, so flush the
+    * HDC/data cache depending on the generation. Also stall at the pixel
+    * scoreboard in case the copy was done with a fragment shader.
+ */
+ cmd_buffer->state.queries.buffer_write_bits |= ANV_QUERY_WRITES_DATA_FLUSH;
+
+ trace_intel_end_query_copy_shader(&cmd_buffer->trace, query_count);
}
-#else
void genX(CmdCopyQueryPoolResults)(
VkCommandBuffer commandBuffer,
VkQueryPool queryPool,
@@ -1474,6 +1850,99 @@ void genX(CmdCopyQueryPoolResults)(
VkDeviceSize destStride,
VkQueryResultFlags flags)
{
- anv_finishme("Queries not yet supported on Ivy Bridge");
+ ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+ ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
+ ANV_FROM_HANDLE(anv_buffer, buffer, destBuffer);
+ struct anv_device *device = cmd_buffer->device;
+ struct anv_physical_device *pdevice = device->physical;
+
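+   /* Past the instance's query_copy_with_shader_threshold, copying with a
+    * shader dispatch is preferred over MI commands on the command streamer.
+    */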
+ if (queryCount > pdevice->instance->query_copy_with_shader_threshold) {
+ copy_query_results_with_shader(cmd_buffer, pool,
+ anv_address_add(buffer->address,
+ destOffset),
+ destStride,
+ firstQuery,
+ queryCount,
+ flags);
+ } else {
+ copy_query_results_with_cs(cmd_buffer, pool,
+ anv_address_add(buffer->address,
+ destOffset),
+ destStride,
+ firstQuery,
+ queryCount,
+ flags);
+ }
+}
+
+#if GFX_VERx10 == 125 && ANV_SUPPORT_RT
+
+#include "grl/include/GRLRTASCommon.h"
+#include "grl/grl_metakernel_postbuild_info.h"
+
+void
+genX(CmdWriteAccelerationStructuresPropertiesKHR)(
+ VkCommandBuffer commandBuffer,
+ uint32_t accelerationStructureCount,
+ const VkAccelerationStructureKHR* pAccelerationStructures,
+ VkQueryType queryType,
+ VkQueryPool queryPool,
+ uint32_t firstQuery)
+{
+ ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+ ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
+
+ assert(queryType == VK_QUERY_TYPE_ACCELERATION_STRUCTURE_COMPACTED_SIZE_KHR ||
+ queryType == VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_SIZE_KHR ||
+ queryType == VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SIZE_KHR ||
+ queryType == VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_BOTTOM_LEVEL_POINTERS_KHR);
+
+ emit_query_clear_flush(cmd_buffer, pool,
+ "CmdWriteAccelerationStructuresPropertiesKHR flush query clears");
+
+ struct mi_builder b;
+ mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
+
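+   /* Use the GRL postbuild-info metakernels to write the requested property
+    * of each acceleration structure into its query slot.
+    */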
+ for (uint32_t i = 0; i < accelerationStructureCount; i++) {
+ ANV_FROM_HANDLE(vk_acceleration_structure, accel, pAccelerationStructures[i]);
+ struct anv_address query_addr =
+ anv_address_add(anv_query_address(pool, firstQuery + i), 8);
+
+ switch (queryType) {
+ case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_COMPACTED_SIZE_KHR:
+ genX(grl_postbuild_info_compacted_size)(cmd_buffer,
+ vk_acceleration_structure_get_va(accel),
+ anv_address_physical(query_addr));
+ break;
+
+ case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SIZE_KHR:
+ genX(grl_postbuild_info_current_size)(cmd_buffer,
+ vk_acceleration_structure_get_va(accel),
+ anv_address_physical(query_addr));
+ break;
+
+ case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_SIZE_KHR:
+ case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_BOTTOM_LEVEL_POINTERS_KHR:
+ genX(grl_postbuild_info_serialized_size)(cmd_buffer,
+ vk_acceleration_structure_get_va(accel),
+ anv_address_physical(query_addr));
+ break;
+
+ default:
+ unreachable("unhandled query type");
+ }
+ }
+
+ /* TODO: Figure out why MTL needs ANV_PIPE_DATA_CACHE_FLUSH_BIT in order
+ * to not lose the availability bit.
+ */
+ anv_add_pending_pipe_bits(cmd_buffer,
+ ANV_PIPE_END_OF_PIPE_SYNC_BIT |
+ ANV_PIPE_DATA_CACHE_FLUSH_BIT,
+ "after write acceleration struct props");
+ genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
+
+ for (uint32_t i = 0; i < accelerationStructureCount; i++)
+ emit_query_mi_availability(&b, anv_query_address(pool, firstQuery + i), true);
}
#endif
diff --git a/src/intel/vulkan/genX_simple_shader.c b/src/intel/vulkan/genX_simple_shader.c
new file mode 100644
index 00000000000..bfe1ba2b5bf
--- /dev/null
+++ b/src/intel/vulkan/genX_simple_shader.c
@@ -0,0 +1,704 @@
+/*
+ * Copyright © 2023 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <assert.h>
+#include <stdbool.h>
+
+#include "util/macros.h"
+
+#include "anv_private.h"
+
+#include "genxml/gen_macros.h"
+#include "genxml/genX_pack.h"
+#include "common/intel_genX_state_brw.h"
+
+static void
+genX(emit_simpler_shader_init_fragment)(struct anv_simple_shader *state)
+{
+ assert(state->cmd_buffer == NULL ||
+ state->cmd_buffer->state.current_pipeline == _3D);
+
+ struct anv_batch *batch = state->batch;
+ struct anv_device *device = state->device;
+ const struct brw_wm_prog_data *prog_data =
+ brw_wm_prog_data_const(state->kernel->prog_data);
+
+ uint32_t *dw = anv_batch_emitn(batch,
+ 1 + 2 * GENX(VERTEX_ELEMENT_STATE_length),
+ GENX(3DSTATE_VERTEX_ELEMENTS));
+   /* You might think there is some shady stuff going on here and you would
+    * be right. We're setting up 2 VERTEX_ELEMENT_STATE yet we only provide
+    * 1 (positions) VERTEX_BUFFER_STATE later.
+    *
+    * See blorp_emit_vertex_elements() in blorp_genX_exec_brw.h for more
+    * details on how to set up a 3D pipeline with a fragment shader but
+    * without a vertex shader.
+ */
+ GENX(VERTEX_ELEMENT_STATE_pack)(
+ batch, dw + 1, &(struct GENX(VERTEX_ELEMENT_STATE)) {
+ .VertexBufferIndex = 1,
+ .Valid = true,
+ .SourceElementFormat = ISL_FORMAT_R32G32B32A32_FLOAT,
+ .SourceElementOffset = 0,
+ .Component0Control = VFCOMP_STORE_SRC,
+ .Component1Control = VFCOMP_STORE_0,
+ .Component2Control = VFCOMP_STORE_0,
+ .Component3Control = VFCOMP_STORE_0,
+ });
+ GENX(VERTEX_ELEMENT_STATE_pack)(
+ batch, dw + 3, &(struct GENX(VERTEX_ELEMENT_STATE)) {
+ .VertexBufferIndex = 0,
+ .Valid = true,
+ .SourceElementFormat = ISL_FORMAT_R32G32B32_FLOAT,
+ .SourceElementOffset = 0,
+ .Component0Control = VFCOMP_STORE_SRC,
+ .Component1Control = VFCOMP_STORE_SRC,
+ .Component2Control = VFCOMP_STORE_SRC,
+ .Component3Control = VFCOMP_STORE_1_FP,
+ });
+
+ anv_batch_emit(batch, GENX(3DSTATE_VF_STATISTICS), vf);
+ anv_batch_emit(batch, GENX(3DSTATE_VF_SGVS), sgvs) {
+ sgvs.InstanceIDEnable = true;
+ sgvs.InstanceIDComponentNumber = COMP_1;
+ sgvs.InstanceIDElementOffset = 0;
+ }
+#if GFX_VER >= 11
+ anv_batch_emit(batch, GENX(3DSTATE_VF_SGVS_2), sgvs);
+#endif
+ anv_batch_emit(batch, GENX(3DSTATE_VF_INSTANCING), vfi) {
+ vfi.InstancingEnable = false;
+ vfi.VertexElementIndex = 0;
+ }
+ anv_batch_emit(batch, GENX(3DSTATE_VF_INSTANCING), vfi) {
+ vfi.InstancingEnable = false;
+ vfi.VertexElementIndex = 1;
+ }
+
+ anv_batch_emit(batch, GENX(3DSTATE_VF_TOPOLOGY), topo) {
+ topo.PrimitiveTopologyType = _3DPRIM_RECTLIST;
+ }
+
+ /* Emit URB setup. We tell it that the VS is active because we want it to
+ * allocate space for the VS. Even though one isn't run, we need VUEs to
+ * store the data that VF is going to pass to SOL.
+ */
+ struct intel_urb_config urb_cfg_out = {
+ .size = { DIV_ROUND_UP(32, 64), 1, 1, 1 },
+ };
+
+ genX(emit_l3_config)(batch, device, state->l3_config);
+
+ state->cmd_buffer->state.current_l3_config = state->l3_config;
+
+ enum intel_urb_deref_block_size deref_block_size;
+ genX(emit_urb_setup)(device, batch, state->l3_config,
+ VK_SHADER_STAGE_VERTEX_BIT | VK_SHADER_STAGE_FRAGMENT_BIT,
+ state->urb_cfg, &urb_cfg_out, &deref_block_size);
+
+ anv_batch_emit(batch, GENX(3DSTATE_PS_BLEND), ps_blend) {
+ ps_blend.HasWriteableRT = true;
+ }
+
+ anv_batch_emit(batch, GENX(3DSTATE_WM_DEPTH_STENCIL), wm);
+
+#if GFX_VER >= 12
+ anv_batch_emit(batch, GENX(3DSTATE_DEPTH_BOUNDS), db) {
+ db.DepthBoundsTestEnable = false;
+ db.DepthBoundsTestMinValue = 0.0;
+ db.DepthBoundsTestMaxValue = 1.0;
+ }
+#endif
+
+ anv_batch_emit(batch, GENX(3DSTATE_MULTISAMPLE), ms);
+ anv_batch_emit(batch, GENX(3DSTATE_SAMPLE_MASK), sm) {
+ sm.SampleMask = 0x1;
+ }
+
+ anv_batch_emit(batch, GENX(3DSTATE_VS), vs);
+ anv_batch_emit(batch, GENX(3DSTATE_HS), hs);
+ anv_batch_emit(batch, GENX(3DSTATE_TE), te);
+ anv_batch_emit(batch, GENX(3DSTATE_DS), DS);
+
+#if GFX_VERx10 >= 125
+ if (device->vk.enabled_extensions.EXT_mesh_shader) {
+ anv_batch_emit(batch, GENX(3DSTATE_MESH_CONTROL), mesh);
+ anv_batch_emit(batch, GENX(3DSTATE_TASK_CONTROL), task);
+ }
+#endif
+
+ anv_batch_emit(batch, GENX(3DSTATE_STREAMOUT), so);
+
+ anv_batch_emit(batch, GENX(3DSTATE_GS), gs);
+
+ anv_batch_emit(batch, GENX(3DSTATE_CLIP), clip) {
+ clip.PerspectiveDivideDisable = true;
+ }
+
+ anv_batch_emit(batch, GENX(3DSTATE_SF), sf) {
+#if GFX_VER >= 12
+ sf.DerefBlockSize = deref_block_size;
+#endif
+ }
+
+ anv_batch_emit(batch, GENX(3DSTATE_RASTER), raster) {
+ raster.CullMode = CULLMODE_NONE;
+ }
+
+ anv_batch_emit(batch, GENX(3DSTATE_SBE), sbe) {
+ sbe.VertexURBEntryReadOffset = 1;
+ sbe.NumberofSFOutputAttributes = prog_data->num_varying_inputs;
+ sbe.VertexURBEntryReadLength = MAX2((prog_data->num_varying_inputs + 1) / 2, 1);
+ sbe.ConstantInterpolationEnable = prog_data->flat_inputs;
+ sbe.ForceVertexURBEntryReadLength = true;
+ sbe.ForceVertexURBEntryReadOffset = true;
+ for (unsigned i = 0; i < 32; i++)
+ sbe.AttributeActiveComponentFormat[i] = ACF_XYZW;
+ }
+
+ anv_batch_emit(batch, GENX(3DSTATE_WM), wm);
+
+ anv_batch_emit(batch, GENX(3DSTATE_PS), ps) {
+ intel_set_ps_dispatch_state(&ps, device->info, prog_data,
+ 1 /* rasterization_samples */,
+ 0 /* msaa_flags */);
+
+ ps.VectorMaskEnable = prog_data->uses_vmask;
+
+ ps.BindingTableEntryCount = GFX_VER == 9 ? 1 : 0;
+#if GFX_VER < 20
+ ps.PushConstantEnable = prog_data->base.nr_params > 0 ||
+ prog_data->base.ubo_ranges[0].length;
+#endif
+
+ ps.DispatchGRFStartRegisterForConstantSetupData0 =
+ brw_wm_prog_data_dispatch_grf_start_reg(prog_data, ps, 0);
+ ps.DispatchGRFStartRegisterForConstantSetupData1 =
+ brw_wm_prog_data_dispatch_grf_start_reg(prog_data, ps, 1);
+#if GFX_VER < 20
+ ps.DispatchGRFStartRegisterForConstantSetupData2 =
+ brw_wm_prog_data_dispatch_grf_start_reg(prog_data, ps, 2);
+#endif
+
+ ps.KernelStartPointer0 = state->kernel->kernel.offset +
+ brw_wm_prog_data_prog_offset(prog_data, ps, 0);
+ ps.KernelStartPointer1 = state->kernel->kernel.offset +
+ brw_wm_prog_data_prog_offset(prog_data, ps, 1);
+#if GFX_VER < 20
+ ps.KernelStartPointer2 = state->kernel->kernel.offset +
+ brw_wm_prog_data_prog_offset(prog_data, ps, 2);
+#endif
+
+ ps.MaximumNumberofThreadsPerPSD = device->info->max_threads_per_psd - 1;
+ }
+
+ anv_batch_emit(batch, GENX(3DSTATE_PS_EXTRA), psx) {
+ psx.PixelShaderValid = true;
+#if GFX_VER < 20
+ psx.AttributeEnable = prog_data->num_varying_inputs > 0;
+#endif
+ psx.PixelShaderIsPerSample = prog_data->persample_dispatch;
+ psx.PixelShaderComputedDepthMode = prog_data->computed_depth_mode;
+ psx.PixelShaderComputesStencil = prog_data->computed_stencil;
+ }
+
+ anv_batch_emit(batch, GENX(3DSTATE_VIEWPORT_STATE_POINTERS_CC), cc) {
+ struct anv_state cc_state =
+ anv_state_stream_alloc(state->dynamic_state_stream,
+ 4 * GENX(CC_VIEWPORT_length), 32);
+ if (cc_state.map == NULL)
+ return;
+
+ struct GENX(CC_VIEWPORT) cc_viewport = {
+ .MinimumDepth = 0.0f,
+ .MaximumDepth = 1.0f,
+ };
+ GENX(CC_VIEWPORT_pack)(NULL, cc_state.map, &cc_viewport);
+ cc.CCViewportPointer = cc_state.offset;
+ }
+
+#if GFX_VER >= 12
+ /* Disable Primitive Replication. */
+ anv_batch_emit(batch, GENX(3DSTATE_PRIMITIVE_REPLICATION), pr);
+#endif
+
+ anv_batch_emit(batch, GENX(3DSTATE_PUSH_CONSTANT_ALLOC_VS), alloc);
+ anv_batch_emit(batch, GENX(3DSTATE_PUSH_CONSTANT_ALLOC_HS), alloc);
+ anv_batch_emit(batch, GENX(3DSTATE_PUSH_CONSTANT_ALLOC_DS), alloc);
+ anv_batch_emit(batch, GENX(3DSTATE_PUSH_CONSTANT_ALLOC_GS), alloc);
+ anv_batch_emit(batch, GENX(3DSTATE_PUSH_CONSTANT_ALLOC_PS), alloc) {
+ alloc.ConstantBufferOffset = 0;
+ alloc.ConstantBufferSize = device->info->max_constant_urb_size_kb;
+ }
+
+#if GFX_VERx10 == 125
+ /* DG2: Wa_22011440098
+ * MTL: Wa_18022330953
+ *
+    * In 3D mode, after programming the push constant alloc command,
+    * immediately program a push constant command (ZERO length) without any
+    * commit between them.
+ *
+ * Note that Wa_16011448509 isn't needed here as all address bits are zero.
+ */
+ anv_batch_emit(batch, GENX(3DSTATE_CONSTANT_ALL), c) {
+ /* Update empty push constants for all stages (bitmask = 11111b) */
+ c.ShaderUpdateEnable = 0x1f;
+ c.MOCS = anv_mocs(device, NULL, 0);
+ }
+#endif
+
+#if GFX_VER == 9
+   /* Allocate a binding table for Gfx9 for 2 reasons:
+    *
+    *   1. we need to emit a 3DSTATE_BINDING_TABLE_POINTERS_PS to make the
+    *      HW apply the preceding 3DSTATE_CONSTANT_PS
+ *
+ * 2. Emitting an empty 3DSTATE_BINDING_TABLE_POINTERS_PS would cause RT
+ * writes (even though they're empty) to disturb later writes
+ * (probably due to RT cache)
+ *
+ * Our binding table only has one entry to the null surface.
+ */
+ uint32_t bt_offset;
+ state->bt_state =
+ anv_cmd_buffer_alloc_binding_table(state->cmd_buffer, 1, &bt_offset);
+ if (state->bt_state.map == NULL) {
+ VkResult result = anv_cmd_buffer_new_binding_table_block(state->cmd_buffer);
+ if (result != VK_SUCCESS)
+ return;
+
+ /* Re-emit state base addresses so we get the new surface state base
+ * address before we start emitting binding tables etc.
+ */
+ genX(cmd_buffer_emit_bt_pool_base_address)(state->cmd_buffer);
+
+ state->bt_state =
+ anv_cmd_buffer_alloc_binding_table(state->cmd_buffer, 1, &bt_offset);
+ assert(state->bt_state.map != NULL);
+ }
+
+ uint32_t *bt_map = state->bt_state.map;
+ bt_map[0] = anv_bindless_state_for_binding_table(
+ device,
+ device->null_surface_state).offset + bt_offset;
+
+ state->cmd_buffer->state.descriptors_dirty |= VK_SHADER_STAGE_FRAGMENT_BIT;
+#endif
+
+   /* Flag all the packets emitted by the simple shader as dirty so the
+    * regular emission paths reprogram them on the next draw.
+    */
+ struct anv_gfx_dynamic_state *hw_state =
+ &state->cmd_buffer->state.gfx.dyn_state;
+
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_URB);
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_VF_STATISTICS);
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_VF);
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_VF_TOPOLOGY);
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_VERTEX_INPUT);
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_VF_SGVS);
+#if GFX_VER >= 11
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_VF_SGVS_2);
+#endif
+#if GFX_VER >= 12
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_PRIMITIVE_REPLICATION);
+#endif
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_STREAMOUT);
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_VIEWPORT_CC);
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_CLIP);
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_RASTER);
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_SAMPLE_MASK);
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_MULTISAMPLE);
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_DEPTH_BOUNDS);
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_WM);
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_WM_DEPTH_STENCIL);
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_SF);
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_SBE);
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_VS);
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_HS);
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_DS);
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_TE);
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_GS);
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_PS);
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_PS_EXTRA);
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_PS_BLEND);
+ if (device->vk.enabled_extensions.EXT_mesh_shader) {
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_MESH_CONTROL);
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_TASK_CONTROL);
+ }
+
+ /* Update urb config after simple shader. */
+ memcpy(&state->cmd_buffer->state.gfx.urb_cfg, &urb_cfg_out,
+ sizeof(struct intel_urb_config));
+
+ state->cmd_buffer->state.gfx.vb_dirty = BITFIELD_BIT(0);
+ state->cmd_buffer->state.gfx.dirty |= ~(ANV_CMD_DIRTY_INDEX_BUFFER |
+ ANV_CMD_DIRTY_XFB_ENABLE);
+ state->cmd_buffer->state.push_constants_dirty |= VK_SHADER_STAGE_FRAGMENT_BIT;
+ state->cmd_buffer->state.gfx.push_constant_stages = VK_SHADER_STAGE_FRAGMENT_BIT;
+}
+
+static void
+genX(emit_simpler_shader_init_compute)(struct anv_simple_shader *state)
+{
+ assert(state->cmd_buffer == NULL ||
+ state->cmd_buffer->state.current_pipeline == GPGPU);
+
+#if GFX_VERx10 >= 125
+ struct anv_shader_bin *cs_bin = state->kernel;
+ const struct brw_cs_prog_data *prog_data =
+ (const struct brw_cs_prog_data *) cs_bin->prog_data;
+ /* Currently our simple shaders are simple enough that they never spill. */
+ assert(prog_data->base.total_scratch == 0);
+ if (state->cmd_buffer != NULL) {
+ genX(cmd_buffer_ensure_cfe_state)(state->cmd_buffer, 0);
+ } else {
+ anv_batch_emit(state->batch, GENX(CFE_STATE), cfe) {
+ cfe.MaximumNumberofThreads =
+ state->device->info->max_cs_threads *
+ state->device->info->subslice_total;
+ }
+ }
+#endif
+}
+
+/** Initialize a simple shader emission */
+void
+genX(emit_simple_shader_init)(struct anv_simple_shader *state)
+{
+ assert(state->kernel->stage == MESA_SHADER_FRAGMENT ||
+ state->kernel->stage == MESA_SHADER_COMPUTE);
+
+ if (state->kernel->stage == MESA_SHADER_FRAGMENT)
+ genX(emit_simpler_shader_init_fragment)(state);
+ else
+ genX(emit_simpler_shader_init_compute)(state);
+}
+
+/** Allocate push constant data for a simple shader */
+struct anv_state
+genX(simple_shader_alloc_push)(struct anv_simple_shader *state, uint32_t size)
+{
+ struct anv_state s;
+
+ if (state->kernel->stage == MESA_SHADER_FRAGMENT) {
+ s = anv_state_stream_alloc(state->dynamic_state_stream,
+ size, ANV_UBO_ALIGNMENT);
+ } else {
+#if GFX_VERx10 >= 125
+ s = anv_state_stream_alloc(state->general_state_stream, align(size, 64), 64);
+#else
+ s = anv_state_stream_alloc(state->dynamic_state_stream, size, 64);
+#endif
+ }
+
+ if (s.map == NULL)
+ anv_batch_set_error(state->batch, VK_ERROR_OUT_OF_DEVICE_MEMORY);
+
+ return s;
+}
+
+/** Get the address of allocated push constant data by
+ * genX(simple_shader_alloc_push)
+ */
+struct anv_address
+genX(simple_shader_push_state_address)(struct anv_simple_shader *state,
+ struct anv_state push_state)
+{
+ if (state->kernel->stage == MESA_SHADER_FRAGMENT) {
+ return anv_state_pool_state_address(
+ &state->device->dynamic_state_pool, push_state);
+ } else {
+#if GFX_VERx10 >= 125
+ return anv_state_pool_state_address(
+ &state->device->general_state_pool, push_state);
+#else
+ return anv_state_pool_state_address(
+ &state->device->dynamic_state_pool, push_state);
+#endif
+ }
+}
+
+/** Emit a simple shader dispatch */
+void
+genX(emit_simple_shader_dispatch)(struct anv_simple_shader *state,
+ uint32_t num_threads,
+ struct anv_state push_state)
+{
+ struct anv_device *device = state->device;
+ struct anv_batch *batch = state->batch;
+ struct anv_address push_addr =
+ anv_state_pool_state_address(&device->dynamic_state_pool, push_state);
+
+ if (state->kernel->stage == MESA_SHADER_FRAGMENT) {
+ /* At the moment we require a command buffer associated with this
+ * emission as we need to allocate binding tables on Gfx9.
+ */
+ assert(state->cmd_buffer != NULL);
+
+ struct anv_state vs_data_state =
+ anv_state_stream_alloc(state->dynamic_state_stream,
+ 9 * sizeof(uint32_t), 32);
+ if (vs_data_state.map == NULL)
+ return;
+
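+      /* Emit a RECTLIST covering at least one pixel per work item, limited
+       * to 8192 pixels per row.
+       */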
+ float x0 = 0.0f, x1 = MIN2(num_threads, 8192);
+ float y0 = 0.0f, y1 = DIV_ROUND_UP(num_threads, 8192);
+ float z = 0.0f;
+
+ float *vertices = vs_data_state.map;
+ vertices[0] = x1; vertices[1] = y1; vertices[2] = z; /* v0 */
+ vertices[3] = x0; vertices[4] = y1; vertices[5] = z; /* v1 */
+ vertices[6] = x0; vertices[7] = y0; vertices[8] = z; /* v2 */
+
+ uint32_t *dw = anv_batch_emitn(batch,
+ 1 + GENX(VERTEX_BUFFER_STATE_length),
+ GENX(3DSTATE_VERTEX_BUFFERS));
+ GENX(VERTEX_BUFFER_STATE_pack)(batch, dw + 1,
+ &(struct GENX(VERTEX_BUFFER_STATE)) {
+ .VertexBufferIndex = 0,
+ .AddressModifyEnable = true,
+ .BufferStartingAddress = (struct anv_address) {
+ .bo = device->dynamic_state_pool.block_pool.bo,
+ .offset = vs_data_state.offset,
+ },
+ .BufferPitch = 3 * sizeof(float),
+ .BufferSize = 9 * sizeof(float),
+ .MOCS = anv_mocs(device, NULL, 0),
+#if GFX_VER >= 12
+ .L3BypassDisable = true,
+#endif
+ });
+
+#if GFX_VERx10 > 120
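+      /* On Gfx12.5+ the push constant buffer is bound with a single
+       * 3DSTATE_CONSTANT_ALL packet; older platforms use 3DSTATE_CONSTANT_PS
+       * below.
+       */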
+ dw =
+ anv_batch_emitn(batch,
+ GENX(3DSTATE_CONSTANT_ALL_length) +
+ GENX(3DSTATE_CONSTANT_ALL_DATA_length),
+ GENX(3DSTATE_CONSTANT_ALL),
+ .ShaderUpdateEnable = BITFIELD_BIT(MESA_SHADER_FRAGMENT),
+ .PointerBufferMask = 0x1,
+ .MOCS = anv_mocs(device, NULL, 0));
+
+ GENX(3DSTATE_CONSTANT_ALL_DATA_pack)(
+ batch, dw + GENX(3DSTATE_CONSTANT_ALL_length),
+ &(struct GENX(3DSTATE_CONSTANT_ALL_DATA)) {
+ .PointerToConstantBuffer = push_addr,
+ .ConstantBufferReadLength = DIV_ROUND_UP(push_state.alloc_size, 32),
+ });
+#else
+ /* The Skylake PRM contains the following restriction:
+ *
+ * "The driver must ensure The following case does not occur
+ * without a flush to the 3D engine: 3DSTATE_CONSTANT_* with
+ * buffer 3 read length equal to zero committed followed by a
+ * 3DSTATE_CONSTANT_* with buffer 0 read length not equal to
+ * zero committed."
+ *
+ * To avoid this, we program the highest slot.
+ */
+ anv_batch_emit(batch, GENX(3DSTATE_CONSTANT_PS), c) {
+ c.MOCS = anv_mocs(device, NULL, 0);
+ c.ConstantBody.ReadLength[3] = DIV_ROUND_UP(push_state.alloc_size, 32);
+ c.ConstantBody.Buffer[3] = push_addr;
+ }
+#endif
+
+#if GFX_VER == 9
+ /* Why are the push constants not flushed without a binding table
+ * update??
+ */
+ anv_batch_emit(batch, GENX(3DSTATE_BINDING_TABLE_POINTERS_PS), btp) {
+ btp.PointertoPSBindingTable = state->bt_state.offset;
+ }
+#endif
+
+ genX(emit_breakpoint)(batch, device, true);
+ anv_batch_emit(batch, GENX(3DPRIMITIVE), prim) {
+ prim.VertexAccessType = SEQUENTIAL;
+ prim.PrimitiveTopologyType = _3DPRIM_RECTLIST;
+ prim.VertexCountPerInstance = 3;
+ prim.InstanceCount = 1;
+ }
+ genX(batch_emit_post_3dprimitive_was)(batch, device, _3DPRIM_RECTLIST, 3);
+ genX(emit_breakpoint)(batch, device, false);
+ } else {
+ const struct intel_device_info *devinfo = device->info;
+ const struct brw_cs_prog_data *prog_data =
+ (const struct brw_cs_prog_data *) state->kernel->prog_data;
+ const struct intel_cs_dispatch_info dispatch =
+ brw_cs_get_dispatch_info(devinfo, prog_data, NULL);
+
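+      /* Dispatch enough thread groups to cover num_threads invocations:
+       * COMPUTE_WALKER on Gfx12.5+, MEDIA_VFE_STATE + GPGPU_WALKER otherwise.
+       */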
+#if GFX_VERx10 >= 125
+ anv_batch_emit(batch, GENX(COMPUTE_WALKER), cw) {
+ cw.SIMDSize = dispatch.simd_size / 16;
+      cw.MessageSIMD = dispatch.simd_size / 16;
+ cw.IndirectDataStartAddress = push_state.offset;
+ cw.IndirectDataLength = push_state.alloc_size;
+ cw.LocalXMaximum = prog_data->local_size[0] - 1;
+ cw.LocalYMaximum = prog_data->local_size[1] - 1;
+ cw.LocalZMaximum = prog_data->local_size[2] - 1;
+ cw.ThreadGroupIDXDimension = DIV_ROUND_UP(num_threads,
+ dispatch.simd_size);
+ cw.ThreadGroupIDYDimension = 1;
+ cw.ThreadGroupIDZDimension = 1;
+ cw.ExecutionMask = dispatch.right_mask;
+ cw.PostSync.MOCS = anv_mocs(device, NULL, 0);
+
+#if GFX_VERx10 >= 125
+ cw.GenerateLocalID = prog_data->generate_local_id != 0;
+ cw.EmitLocal = prog_data->generate_local_id;
+ cw.WalkOrder = prog_data->walk_order;
+ cw.TileLayout = prog_data->walk_order == INTEL_WALK_ORDER_YXZ ?
+ TileY32bpe : Linear;
+#endif
+
+ cw.InterfaceDescriptor = (struct GENX(INTERFACE_DESCRIPTOR_DATA)) {
+ .KernelStartPointer = state->kernel->kernel.offset +
+ brw_cs_prog_data_prog_offset(prog_data,
+ dispatch.simd_size),
+ .SamplerStatePointer = 0,
+ .BindingTablePointer = 0,
+ .BindingTableEntryCount = 0,
+ .NumberofThreadsinGPGPUThreadGroup = dispatch.threads,
+ .SharedLocalMemorySize = encode_slm_size(GFX_VER,
+ prog_data->base.total_shared),
+ .NumberOfBarriers = prog_data->uses_barrier,
+ };
+ }
+#else
+ const uint32_t vfe_curbe_allocation =
+ ALIGN(prog_data->push.per_thread.regs * dispatch.threads +
+ prog_data->push.cross_thread.regs, 2);
+
+ /* From the Sky Lake PRM Vol 2a, MEDIA_VFE_STATE:
+ *
+ * "A stalling PIPE_CONTROL is required before MEDIA_VFE_STATE unless
+ * the only bits that are changed are scoreboard related: Scoreboard
+       *     Enable, Scoreboard Type, Scoreboard Mask, Scoreboard Delta. For
+ * these scoreboard related states, a MEDIA_STATE_FLUSH is
+ * sufficient."
+ */
+ enum anv_pipe_bits emitted_bits = 0;
+ genX(emit_apply_pipe_flushes)(batch, device, GPGPU, ANV_PIPE_CS_STALL_BIT,
+ &emitted_bits);
+
+ /* If we have a command buffer allocated with the emission, update the
+ * pending bits.
+ */
+ if (state->cmd_buffer)
+ anv_cmd_buffer_update_pending_query_bits(state->cmd_buffer, emitted_bits);
+
+ anv_batch_emit(batch, GENX(MEDIA_VFE_STATE), vfe) {
+ vfe.StackSize = 0;
+ vfe.MaximumNumberofThreads =
+ devinfo->max_cs_threads * devinfo->subslice_total - 1;
+ vfe.NumberofURBEntries = 2;
+#if GFX_VER < 11
+ vfe.ResetGatewayTimer = true;
+#endif
+ vfe.URBEntryAllocationSize = 2;
+ vfe.CURBEAllocationSize = vfe_curbe_allocation;
+
+ if (prog_data->base.total_scratch) {
+ /* Broadwell's Per Thread Scratch Space is in the range [0, 11]
+ * where 0 = 1k, 1 = 2k, 2 = 4k, ..., 11 = 2M.
+ */
+ vfe.PerThreadScratchSpace =
+ ffs(prog_data->base.total_scratch) - 11;
+ vfe.ScratchSpaceBasePointer =
+ (struct anv_address) {
+ .bo = anv_scratch_pool_alloc(device,
+ &device->scratch_pool,
+ MESA_SHADER_COMPUTE,
+ prog_data->base.total_scratch),
+ .offset = 0,
+ };
+ }
+ }
+ struct anv_state iface_desc_state =
+ anv_state_stream_alloc(state->dynamic_state_stream,
+ GENX(INTERFACE_DESCRIPTOR_DATA_length) * 4, 64);
+ if (iface_desc_state.map == NULL)
+ return;
+
+ struct GENX(INTERFACE_DESCRIPTOR_DATA) iface_desc = {
+ .KernelStartPointer = state->kernel->kernel.offset +
+ brw_cs_prog_data_prog_offset(prog_data,
+ dispatch.simd_size),
+
+ .SamplerCount = 0,
+ .BindingTableEntryCount = 0,
+ .BarrierEnable = prog_data->uses_barrier,
+ .SharedLocalMemorySize = encode_slm_size(GFX_VER,
+ prog_data->base.total_shared),
+
+ .ConstantURBEntryReadOffset = 0,
+ .ConstantURBEntryReadLength = prog_data->push.per_thread.regs,
+ .CrossThreadConstantDataReadLength = prog_data->push.cross_thread.regs,
+#if GFX_VER >= 12
+ /* TODO: Check if we are missing workarounds and enable mid-thread
+ * preemption.
+ *
+ * We still have issues with mid-thread preemption (it was already
+ * disabled by the kernel on gfx11, due to missing workarounds). It's
+ * possible that we are just missing some workarounds, and could
+    * enable it later, but for now let's disable it to fix a GPU hang in
+    * compute in Car Chase (and possibly more).
+ */
+ .ThreadPreemptionDisable = true,
+#endif
+ .NumberofThreadsinGPGPUThreadGroup = dispatch.threads,
+ };
+ GENX(INTERFACE_DESCRIPTOR_DATA_pack)(batch, iface_desc_state.map, &iface_desc);
+ anv_batch_emit(batch, GENX(MEDIA_INTERFACE_DESCRIPTOR_LOAD), mid) {
+ mid.InterfaceDescriptorTotalLength = iface_desc_state.alloc_size;
+ mid.InterfaceDescriptorDataStartAddress = iface_desc_state.offset;
+ }
+ anv_batch_emit(batch, GENX(MEDIA_CURBE_LOAD), curbe) {
+ curbe.CURBEDataStartAddress = push_state.offset;
+ curbe.CURBETotalDataLength = push_state.alloc_size;
+ }
+ anv_batch_emit(batch, GENX(GPGPU_WALKER), ggw) {
+ ggw.SIMDSize = dispatch.simd_size / 16;
+ ggw.ThreadDepthCounterMaximum = 0;
+ ggw.ThreadHeightCounterMaximum = 0;
+ ggw.ThreadWidthCounterMaximum = dispatch.threads - 1;
+ ggw.ThreadGroupIDXDimension = DIV_ROUND_UP(num_threads,
+ dispatch.simd_size);
+ ggw.ThreadGroupIDYDimension = 1;
+ ggw.ThreadGroupIDZDimension = 1;
+ ggw.RightExecutionMask = dispatch.right_mask;
+ ggw.BottomExecutionMask = 0xffffffff;
+ }
+#endif
+ }
+}
+
+void
+genX(emit_simple_shader_end)(struct anv_simple_shader *state)
+{
+ anv_batch_emit(state->batch, GENX(MI_BATCH_BUFFER_END), end);
+
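+   /* Pad the batch to a QWord (8 byte) boundary with a MI_NOOP if needed. */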
+ if ((state->batch->next - state->batch->start) & 4)
+ anv_batch_emit(state->batch, GENX(MI_NOOP), noop);
+}
diff --git a/src/intel/vulkan/genX_state.c b/src/intel/vulkan/genX_state.c
deleted file mode 100644
index dd8ada9087a..00000000000
--- a/src/intel/vulkan/genX_state.c
+++ /dev/null
@@ -1,894 +0,0 @@
-/*
- * Copyright © 2015 Intel Corporation
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- */
-
-#include <assert.h>
-#include <stdbool.h>
-#include <string.h>
-#include <unistd.h>
-#include <fcntl.h>
-
-#include "anv_private.h"
-
-#include "common/intel_aux_map.h"
-#include "common/intel_sample_positions.h"
-#include "genxml/gen_macros.h"
-#include "genxml/genX_pack.h"
-
-#include "vk_util.h"
-
-/**
- * Compute an \p n x \p m pixel hashing table usable as slice, subslice or
- * pixel pipe hashing table. The resulting table is the cyclic repetition of
- * a fixed pattern with periodicity equal to \p period.
- *
- * If \p index is specified to be equal to \p period, a 2-way hashing table
- * will be generated such that indices 0 and 1 are returned for the following
- * fractions of entries respectively:
- *
- * p_0 = ceil(period / 2) / period
- * p_1 = floor(period / 2) / period
- *
- * If \p index is even and less than \p period, a 3-way hashing table will be
- * generated such that indices 0, 1 and 2 are returned for the following
- * fractions of entries:
- *
- * p_0 = (ceil(period / 2) - 1) / period
- * p_1 = floor(period / 2) / period
- * p_2 = 1 / period
- *
- * The equations above apply if \p flip is equal to 0, if it is equal to 1 p_0
- * and p_1 will be swapped for the result. Note that in the context of pixel
- * pipe hashing this can be always 0 on Gfx12 platforms, since the hardware
- * transparently remaps logical indices found on the table to physical pixel
- * pipe indices from the highest to lowest EU count.
- */
-UNUSED static void
-calculate_pixel_hashing_table(unsigned n, unsigned m,
- unsigned period, unsigned index, bool flip,
- uint32_t *p)
-{
- for (unsigned i = 0; i < n; i++) {
- for (unsigned j = 0; j < m; j++) {
- const unsigned k = (i + j) % period;
- p[j + m * i] = (k == index ? 2 : (k & 1) ^ flip);
- }
- }
-}
-
-static void
-genX(emit_slice_hashing_state)(struct anv_device *device,
- struct anv_batch *batch)
-{
- device->slice_hash = (struct anv_state) { 0 };
-
-#if GFX_VER == 11
- assert(device->info.ppipe_subslices[2] == 0);
-
- if (device->info.ppipe_subslices[0] == device->info.ppipe_subslices[1])
- return;
-
- unsigned size = GENX(SLICE_HASH_TABLE_length) * 4;
- device->slice_hash =
- anv_state_pool_alloc(&device->dynamic_state_pool, size, 64);
-
- const bool flip = device->info.ppipe_subslices[0] <
- device->info.ppipe_subslices[1];
- struct GENX(SLICE_HASH_TABLE) table;
- calculate_pixel_hashing_table(16, 16, 3, 3, flip, table.Entry[0]);
-
- GENX(SLICE_HASH_TABLE_pack)(NULL, device->slice_hash.map, &table);
-
- anv_batch_emit(batch, GENX(3DSTATE_SLICE_TABLE_STATE_POINTERS), ptr) {
- ptr.SliceHashStatePointerValid = true;
- ptr.SliceHashTableStatePointer = device->slice_hash.offset;
- }
-
- anv_batch_emit(batch, GENX(3DSTATE_3D_MODE), mode) {
- mode.SliceHashingTableEnable = true;
- }
-#elif GFX_VERx10 == 120
- /* For each n calculate ppipes_of[n], equal to the number of pixel pipes
- * present with n active dual subslices.
- */
- unsigned ppipes_of[3] = {};
-
- for (unsigned n = 0; n < ARRAY_SIZE(ppipes_of); n++) {
- for (unsigned p = 0; p < ARRAY_SIZE(device->info.ppipe_subslices); p++)
- ppipes_of[n] += (device->info.ppipe_subslices[p] == n);
- }
-
- /* Gfx12 has three pixel pipes. */
- assert(ppipes_of[0] + ppipes_of[1] + ppipes_of[2] == 3);
-
- if (ppipes_of[2] == 3 || ppipes_of[0] == 2) {
- /* All three pixel pipes have the maximum number of active dual
- * subslices, or there is only one active pixel pipe: Nothing to do.
- */
- return;
- }
-
- anv_batch_emit(batch, GENX(3DSTATE_SUBSLICE_HASH_TABLE), p) {
- p.SliceHashControl[0] = TABLE_0;
-
- if (ppipes_of[2] == 2 && ppipes_of[0] == 1)
- calculate_pixel_hashing_table(8, 16, 2, 2, 0, p.TwoWayTableEntry[0]);
- else if (ppipes_of[2] == 1 && ppipes_of[1] == 1 && ppipes_of[0] == 1)
- calculate_pixel_hashing_table(8, 16, 3, 3, 0, p.TwoWayTableEntry[0]);
-
- if (ppipes_of[2] == 2 && ppipes_of[1] == 1)
- calculate_pixel_hashing_table(8, 16, 5, 4, 0, p.ThreeWayTableEntry[0]);
- else if (ppipes_of[2] == 2 && ppipes_of[0] == 1)
- calculate_pixel_hashing_table(8, 16, 2, 2, 0, p.ThreeWayTableEntry[0]);
- else if (ppipes_of[2] == 1 && ppipes_of[1] == 1 && ppipes_of[0] == 1)
- calculate_pixel_hashing_table(8, 16, 3, 3, 0, p.ThreeWayTableEntry[0]);
- else
- unreachable("Illegal fusing.");
- }
-
- anv_batch_emit(batch, GENX(3DSTATE_3D_MODE), p) {
- p.SubsliceHashingTableEnable = true;
- p.SubsliceHashingTableEnableMask = true;
- }
-#endif
-}
-
-static VkResult
-init_render_queue_state(struct anv_queue *queue)
-{
- struct anv_device *device = queue->device;
- struct anv_batch batch;
-
- uint32_t cmds[64];
- batch.start = batch.next = cmds;
- batch.end = (void *) cmds + sizeof(cmds);
-
- anv_batch_emit(&batch, GENX(PIPELINE_SELECT), ps) {
-#if GFX_VER >= 9
- ps.MaskBits = GFX_VER >= 12 ? 0x13 : 3;
- ps.MediaSamplerDOPClockGateEnable = GFX_VER >= 12;
-#endif
- ps.PipelineSelection = _3D;
- }
-
-#if GFX_VER == 9
- anv_batch_write_reg(&batch, GENX(CACHE_MODE_1), cm1) {
- cm1.FloatBlendOptimizationEnable = true;
- cm1.FloatBlendOptimizationEnableMask = true;
- cm1.MSCRAWHazardAvoidanceBit = true;
- cm1.MSCRAWHazardAvoidanceBitMask = true;
- cm1.PartialResolveDisableInVC = true;
- cm1.PartialResolveDisableInVCMask = true;
- }
-#endif
-
- anv_batch_emit(&batch, GENX(3DSTATE_AA_LINE_PARAMETERS), aa);
-
- anv_batch_emit(&batch, GENX(3DSTATE_DRAWING_RECTANGLE), rect) {
- rect.ClippedDrawingRectangleYMin = 0;
- rect.ClippedDrawingRectangleXMin = 0;
- rect.ClippedDrawingRectangleYMax = UINT16_MAX;
- rect.ClippedDrawingRectangleXMax = UINT16_MAX;
- rect.DrawingRectangleOriginY = 0;
- rect.DrawingRectangleOriginX = 0;
- }
-
-#if GFX_VER >= 8
- anv_batch_emit(&batch, GENX(3DSTATE_WM_CHROMAKEY), ck);
-
- genX(emit_sample_pattern)(&batch, 0, NULL);
-
- /* The BDW+ docs describe how to use the 3DSTATE_WM_HZ_OP instruction in the
- * section titled, "Optimized Depth Buffer Clear and/or Stencil Buffer
- * Clear." It mentions that the packet overrides GPU state for the clear
- * operation and needs to be reset to 0s to clear the overrides. Depending
- * on the kernel, we may not get a context with the state for this packet
- * zeroed. Do it ourselves just in case. We've observed this to prevent a
- * number of GPU hangs on ICL.
- */
- anv_batch_emit(&batch, GENX(3DSTATE_WM_HZ_OP), hzp);
-#endif
-
-#if GFX_VER == 11
- /* The default behavior of bit 5 "Headerless Message for Pre-emptable
- * Contexts" in SAMPLER MODE register is set to 0, which means
- * headerless sampler messages are not allowed for pre-emptable
- * contexts. Set the bit 5 to 1 to allow them.
- */
- anv_batch_write_reg(&batch, GENX(SAMPLER_MODE), sm) {
- sm.HeaderlessMessageforPreemptableContexts = true;
- sm.HeaderlessMessageforPreemptableContextsMask = true;
- }
-
- /* Bit 1 "Enabled Texel Offset Precision Fix" must be set in
- * HALF_SLICE_CHICKEN7 register.
- */
- anv_batch_write_reg(&batch, GENX(HALF_SLICE_CHICKEN7), hsc7) {
- hsc7.EnabledTexelOffsetPrecisionFix = true;
- hsc7.EnabledTexelOffsetPrecisionFixMask = true;
- }
-
- anv_batch_write_reg(&batch, GENX(TCCNTLREG), tcc) {
- tcc.L3DataPartialWriteMergingEnable = true;
- tcc.ColorZPartialWriteMergingEnable = true;
- tcc.URBPartialWriteMergingEnable = true;
- tcc.TCDisable = true;
- }
-#endif
- genX(emit_slice_hashing_state)(device, &batch);
-
-#if GFX_VER >= 11
- /* The hardware specification recommends disabling repacking for
- * compatibility with the decompression mechanism in the display
- * controller.
- */
- if (device->info.disable_ccs_repack) {
- anv_batch_write_reg(&batch, GENX(CACHE_MODE_0), cm0) {
- cm0.DisableRepackingforCompression = true;
- cm0.DisableRepackingforCompressionMask = true;
- }
- }
-
- /* An unknown issue is causing VS push constants to become
- * corrupted during object-level preemption. For now, restrict
- * to command-buffer-level preemption to avoid rendering
- * corruption.
- */
- anv_batch_write_reg(&batch, GENX(CS_CHICKEN1), cc1) {
- cc1.ReplayMode = MidcmdbufferPreemption;
- cc1.ReplayModeMask = true;
- }
-
-#if GFX_VERx10 < 125
-#define AA_LINE_QUALITY_REG GENX(3D_CHICKEN3)
-#else
-#define AA_LINE_QUALITY_REG GENX(CHICKEN_RASTER_1)
-#endif
-
- /* Enable the new line drawing algorithm that produces higher quality
- * lines.
- */
- anv_batch_write_reg(&batch, AA_LINE_QUALITY_REG, c3) {
- c3.AALineQualityFix = true;
- c3.AALineQualityFixMask = true;
- }
-#endif
-
-#if GFX_VER == 12
- if (device->info.has_aux_map) {
- uint64_t aux_base_addr = intel_aux_map_get_base(device->aux_map_ctx);
- assert(aux_base_addr % (32 * 1024) == 0);
- anv_batch_emit(&batch, GENX(MI_LOAD_REGISTER_IMM), lri) {
- lri.RegisterOffset = GENX(GFX_AUX_TABLE_BASE_ADDR_num);
- lri.DataDWord = aux_base_addr & 0xffffffff;
- }
- anv_batch_emit(&batch, GENX(MI_LOAD_REGISTER_IMM), lri) {
- lri.RegisterOffset = GENX(GFX_AUX_TABLE_BASE_ADDR_num) + 4;
- lri.DataDWord = aux_base_addr >> 32;
- }
- }
-#endif
-
- /* Set the "CONSTANT_BUFFER Address Offset Disable" bit, so
- * 3DSTATE_CONSTANT_XS buffer 0 is an absolute address.
- *
- * This is only safe on kernels with context isolation support.
- */
- if (GFX_VER >= 8 && device->physical->has_context_isolation) {
-#if GFX_VER >= 9
- anv_batch_write_reg(&batch, GENX(CS_DEBUG_MODE2), csdm2) {
- csdm2.CONSTANT_BUFFERAddressOffsetDisable = true;
- csdm2.CONSTANT_BUFFERAddressOffsetDisableMask = true;
- }
-#elif GFX_VER == 8
- anv_batch_write_reg(&batch, GENX(INSTPM), instpm) {
- instpm.CONSTANT_BUFFERAddressOffsetDisable = true;
- instpm.CONSTANT_BUFFERAddressOffsetDisableMask = true;
- }
-#endif
- }
-
-#if GFX_VER >= 11
- /* Starting with GFX version 11, SLM is no longer part of the L3$ config
- * so it never changes throughout the lifetime of the VkDevice.
- */
- const struct intel_l3_config *cfg = intel_get_default_l3_config(&device->info);
- genX(emit_l3_config)(&batch, device, cfg);
- device->l3_config = cfg;
-#endif
-
- anv_batch_emit(&batch, GENX(MI_BATCH_BUFFER_END), bbe);
-
- assert(batch.next <= batch.end);
-
- return anv_queue_submit_simple_batch(queue, &batch);
-}
-
-void
-genX(init_physical_device_state)(ASSERTED struct anv_physical_device *device)
-{
- assert(device->info.verx10 == GFX_VERx10);
-}
-
-VkResult
-genX(init_device_state)(struct anv_device *device)
-{
- VkResult res = VK_SUCCESS;
-
- for (uint32_t i = 0; i < device->queue_count; i++) {
- struct anv_queue *queue = &device->queues[i];
- switch (queue->family->engine_class) {
- case I915_ENGINE_CLASS_RENDER:
- res = init_render_queue_state(queue);
- break;
- default:
- res = vk_error(VK_ERROR_INITIALIZATION_FAILED);
- break;
- }
- if (res != VK_SUCCESS)
- return res;
- }
-
- return res;
-}
-
-void
-genX(emit_l3_config)(struct anv_batch *batch,
- const struct anv_device *device,
- const struct intel_l3_config *cfg)
-{
- UNUSED const struct intel_device_info *devinfo = &device->info;
-
-#if GFX_VER >= 8
-
-#if GFX_VER >= 12
-#define L3_ALLOCATION_REG GENX(L3ALLOC)
-#define L3_ALLOCATION_REG_num GENX(L3ALLOC_num)
-#else
-#define L3_ALLOCATION_REG GENX(L3CNTLREG)
-#define L3_ALLOCATION_REG_num GENX(L3CNTLREG_num)
-#endif
-
- anv_batch_write_reg(batch, L3_ALLOCATION_REG, l3cr) {
- if (cfg == NULL) {
-#if GFX_VER >= 12
- l3cr.L3FullWayAllocationEnable = true;
-#else
- unreachable("Invalid L3$ config");
-#endif
- } else {
-#if GFX_VER < 11
- l3cr.SLMEnable = cfg->n[INTEL_L3P_SLM];
-#endif
-#if GFX_VER == 11
- /* Wa_1406697149: Bit 9 "Error Detection Behavior Control" must be
- * set in L3CNTLREG register. The default setting of the bit is not
- * the desirable behavior.
- */
- l3cr.ErrorDetectionBehaviorControl = true;
- l3cr.UseFullWays = true;
-#endif /* GFX_VER == 11 */
- assert(cfg->n[INTEL_L3P_IS] == 0);
- assert(cfg->n[INTEL_L3P_C] == 0);
- assert(cfg->n[INTEL_L3P_T] == 0);
- l3cr.URBAllocation = cfg->n[INTEL_L3P_URB];
- l3cr.ROAllocation = cfg->n[INTEL_L3P_RO];
- l3cr.DCAllocation = cfg->n[INTEL_L3P_DC];
- l3cr.AllAllocation = cfg->n[INTEL_L3P_ALL];
- }
- }
-
-#else /* GFX_VER < 8 */
-
- const bool has_dc = cfg->n[INTEL_L3P_DC] || cfg->n[INTEL_L3P_ALL];
- const bool has_is = cfg->n[INTEL_L3P_IS] || cfg->n[INTEL_L3P_RO] ||
- cfg->n[INTEL_L3P_ALL];
- const bool has_c = cfg->n[INTEL_L3P_C] || cfg->n[INTEL_L3P_RO] ||
- cfg->n[INTEL_L3P_ALL];
- const bool has_t = cfg->n[INTEL_L3P_T] || cfg->n[INTEL_L3P_RO] ||
- cfg->n[INTEL_L3P_ALL];
-
- assert(!cfg->n[INTEL_L3P_ALL]);
-
- /* When enabled, SLM only uses a portion of the L3 on half of the banks,
- * the matching space on the remaining banks has to be allocated to a
- * client (URB for all validated configurations) set to the
- * lower-bandwidth 2-bank address hashing mode.
- */
- const bool urb_low_bw = cfg->n[INTEL_L3P_SLM] && !devinfo->is_baytrail;
- assert(!urb_low_bw || cfg->n[INTEL_L3P_URB] == cfg->n[INTEL_L3P_SLM]);
-
- /* Minimum number of ways that can be allocated to the URB. */
- const unsigned n0_urb = devinfo->is_baytrail ? 32 : 0;
- assert(cfg->n[INTEL_L3P_URB] >= n0_urb);
-
- anv_batch_write_reg(batch, GENX(L3SQCREG1), l3sqc) {
- l3sqc.ConvertDC_UC = !has_dc;
- l3sqc.ConvertIS_UC = !has_is;
- l3sqc.ConvertC_UC = !has_c;
- l3sqc.ConvertT_UC = !has_t;
-#if GFX_VERx10 == 75
- l3sqc.L3SQGeneralPriorityCreditInitialization = SQGPCI_DEFAULT;
-#else
- l3sqc.L3SQGeneralPriorityCreditInitialization =
- devinfo->is_baytrail ? BYT_SQGPCI_DEFAULT : SQGPCI_DEFAULT;
-#endif
- l3sqc.L3SQHighPriorityCreditInitialization = SQHPCI_DEFAULT;
- }
-
- anv_batch_write_reg(batch, GENX(L3CNTLREG2), l3cr2) {
- l3cr2.SLMEnable = cfg->n[INTEL_L3P_SLM];
- l3cr2.URBLowBandwidth = urb_low_bw;
- l3cr2.URBAllocation = cfg->n[INTEL_L3P_URB] - n0_urb;
-#if GFX_VERx10 != 75
- l3cr2.ALLAllocation = cfg->n[INTEL_L3P_ALL];
-#endif
- l3cr2.ROAllocation = cfg->n[INTEL_L3P_RO];
- l3cr2.DCAllocation = cfg->n[INTEL_L3P_DC];
- }
-
- anv_batch_write_reg(batch, GENX(L3CNTLREG3), l3cr3) {
- l3cr3.ISAllocation = cfg->n[INTEL_L3P_IS];
- l3cr3.ISLowBandwidth = 0;
- l3cr3.CAllocation = cfg->n[INTEL_L3P_C];
- l3cr3.CLowBandwidth = 0;
- l3cr3.TAllocation = cfg->n[INTEL_L3P_T];
- l3cr3.TLowBandwidth = 0;
- }
-
-#if GFX_VERx10 == 75
- if (device->physical->cmd_parser_version >= 4) {
- /* Enable L3 atomics on HSW if we have a DC partition, otherwise keep
- * them disabled to avoid crashing the system hard.
- */
- anv_batch_write_reg(batch, GENX(SCRATCH1), s1) {
- s1.L3AtomicDisable = !has_dc;
- }
- anv_batch_write_reg(batch, GENX(CHICKEN3), c3) {
- c3.L3AtomicDisableMask = true;
- c3.L3AtomicDisable = !has_dc;
- }
- }
-#endif /* GFX_VERx10 == 75 */
-
-#endif /* GFX_VER < 8 */
-}
-
-void
-genX(emit_multisample)(struct anv_batch *batch, uint32_t samples,
- const VkSampleLocationEXT *locations)
-{
- anv_batch_emit(batch, GENX(3DSTATE_MULTISAMPLE), ms) {
- ms.NumberofMultisamples = __builtin_ffs(samples) - 1;
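- /* For the power-of-two sample counts accepted here, __builtin_ffs(x) - 1
- * equals log2(x), e.g. samples == 8 is encoded as 3.
- */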
-
- ms.PixelLocation = CENTER;
-#if GFX_VER >= 8
- /* The PRM says that this bit is valid only for DX9:
- *
- * SW can choose to set this bit only for DX9 API. DX10/OGL API's
- * should not have any effect by setting or not setting this bit.
- */
- ms.PixelPositionOffsetEnable = false;
-#else
-
- if (locations) {
- switch (samples) {
- case 1:
- INTEL_SAMPLE_POS_1X_ARRAY(ms.Sample, locations);
- break;
- case 2:
- INTEL_SAMPLE_POS_2X_ARRAY(ms.Sample, locations);
- break;
- case 4:
- INTEL_SAMPLE_POS_4X_ARRAY(ms.Sample, locations);
- break;
- case 8:
- INTEL_SAMPLE_POS_8X_ARRAY(ms.Sample, locations);
- break;
- default:
- break;
- }
- } else {
- switch (samples) {
- case 1:
- INTEL_SAMPLE_POS_1X(ms.Sample);
- break;
- case 2:
- INTEL_SAMPLE_POS_2X(ms.Sample);
- break;
- case 4:
- INTEL_SAMPLE_POS_4X(ms.Sample);
- break;
- case 8:
- INTEL_SAMPLE_POS_8X(ms.Sample);
- break;
- default:
- break;
- }
- }
-#endif
- }
-}
-
-#if GFX_VER >= 8
-void
-genX(emit_sample_pattern)(struct anv_batch *batch, uint32_t samples,
- const VkSampleLocationEXT *locations)
-{
- /* See the Vulkan 1.0 spec Table 24.1 "Standard sample locations" and
- * VkPhysicalDeviceFeatures::standardSampleLocations.
- */
- anv_batch_emit(batch, GENX(3DSTATE_SAMPLE_PATTERN), sp) {
- if (locations) {
- /* The Skylake PRM Vol. 2a "3DSTATE_SAMPLE_PATTERN" says:
- *
- * "When programming the sample offsets (for NUMSAMPLES_4 or _8
- * and MSRASTMODE_xxx_PATTERN), the order of the samples 0 to 3
- * (or 7 for 8X, or 15 for 16X) must have monotonically increasing
- * distance from the pixel center. This is required to get the
- * correct centroid computation in the device."
- *
- * However, the Vulkan spec seems to require that the samples
- * occur in the order provided through the API. The standard sample
- * patterns have the above property that they have monotonically
- * increasing distances from the center but client-provided ones do
- * not. As long as this only affects centroid calculations as the
- * docs say, we should be ok because OpenGL and Vulkan only require
- * that the centroid be some lit sample and that it's the same for
- * all samples in a pixel; they have no requirement that it be the
- * one closest to center.
- */
- switch (samples) {
- case 1:
- INTEL_SAMPLE_POS_1X_ARRAY(sp._1xSample, locations);
- break;
- case 2:
- INTEL_SAMPLE_POS_2X_ARRAY(sp._2xSample, locations);
- break;
- case 4:
- INTEL_SAMPLE_POS_4X_ARRAY(sp._4xSample, locations);
- break;
- case 8:
- INTEL_SAMPLE_POS_8X_ARRAY(sp._8xSample, locations);
- break;
-#if GFX_VER >= 9
- case 16:
- INTEL_SAMPLE_POS_16X_ARRAY(sp._16xSample, locations);
- break;
-#endif
- default:
- break;
- }
- } else {
- INTEL_SAMPLE_POS_1X(sp._1xSample);
- INTEL_SAMPLE_POS_2X(sp._2xSample);
- INTEL_SAMPLE_POS_4X(sp._4xSample);
- INTEL_SAMPLE_POS_8X(sp._8xSample);
-#if GFX_VER >= 9
- INTEL_SAMPLE_POS_16X(sp._16xSample);
-#endif
- }
- }
-}
-#endif
-
-#if GFX_VER >= 11
-void
-genX(emit_shading_rate)(struct anv_batch *batch,
- const struct anv_graphics_pipeline *pipeline,
- struct anv_state cps_states,
- struct anv_dynamic_state *dynamic_state)
-{
- const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline);
- const bool cps_enable = wm_prog_data && wm_prog_data->per_coarse_pixel_dispatch;
-
-#if GFX_VER == 11
- anv_batch_emit(batch, GENX(3DSTATE_CPS), cps) {
- cps.CoarsePixelShadingMode = cps_enable ? CPS_MODE_CONSTANT : CPS_MODE_NONE;
- if (cps_enable) {
- cps.MinCPSizeX = dynamic_state->fragment_shading_rate.width;
- cps.MinCPSizeY = dynamic_state->fragment_shading_rate.height;
- }
- }
-#elif GFX_VER == 12
- for (uint32_t i = 0; i < dynamic_state->viewport.count; i++) {
- uint32_t *cps_state_dwords =
- cps_states.map + GENX(CPS_STATE_length) * 4 * i;
- struct GENX(CPS_STATE) cps_state = {
- .CoarsePixelShadingMode = cps_enable ? CPS_MODE_CONSTANT : CPS_MODE_NONE,
- };
-
- if (cps_enable) {
- cps_state.MinCPSizeX = dynamic_state->fragment_shading_rate.width;
- cps_state.MinCPSizeY = dynamic_state->fragment_shading_rate.height;
- }
-
- GENX(CPS_STATE_pack)(NULL, cps_state_dwords, &cps_state);
- }
-
- anv_batch_emit(batch, GENX(3DSTATE_CPS_POINTERS), cps) {
- cps.CoarsePixelShadingStateArrayPointer = cps_states.offset;
- }
-#endif
-}
-#endif /* GFX_VER >= 11 */
-
-static uint32_t
-vk_to_intel_tex_filter(VkFilter filter, bool anisotropyEnable)
-{
- switch (filter) {
- default:
- assert(!"Invalid filter");
- case VK_FILTER_NEAREST:
- return anisotropyEnable ? MAPFILTER_ANISOTROPIC : MAPFILTER_NEAREST;
- case VK_FILTER_LINEAR:
- return anisotropyEnable ? MAPFILTER_ANISOTROPIC : MAPFILTER_LINEAR;
- }
-}
-
-static uint32_t
-vk_to_intel_max_anisotropy(float ratio)
-{
- return (anv_clamp_f(ratio, 2, 16) - 2) / 2;
-}
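-
-/* For example, vk_to_intel_max_anisotropy(16.0) is (16 - 2) / 2 = 7 and any
- * ratio at or below 2.0 maps to 0; intermediate values truncate, so 5.0
- * maps to 1.
- */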
-
-static const uint32_t vk_to_intel_mipmap_mode[] = {
- [VK_SAMPLER_MIPMAP_MODE_NEAREST] = MIPFILTER_NEAREST,
- [VK_SAMPLER_MIPMAP_MODE_LINEAR] = MIPFILTER_LINEAR
-};
-
-static const uint32_t vk_to_intel_tex_address[] = {
- [VK_SAMPLER_ADDRESS_MODE_REPEAT] = TCM_WRAP,
- [VK_SAMPLER_ADDRESS_MODE_MIRRORED_REPEAT] = TCM_MIRROR,
- [VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_EDGE] = TCM_CLAMP,
- [VK_SAMPLER_ADDRESS_MODE_MIRROR_CLAMP_TO_EDGE] = TCM_MIRROR_ONCE,
- [VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_BORDER] = TCM_CLAMP_BORDER,
-};
-
-/* Vulkan specifies the result of shadow comparisons as:
- * 1 if ref <op> texel,
- * 0 otherwise.
- *
- * The hardware does:
- * 0 if texel <op> ref,
- * 1 otherwise.
- *
- * So, these look a bit strange because there's both a negation
- * and swapping of the arguments involved.
- */
-static const uint32_t vk_to_intel_shadow_compare_op[] = {
- [VK_COMPARE_OP_NEVER] = PREFILTEROP_ALWAYS,
- [VK_COMPARE_OP_LESS] = PREFILTEROP_LEQUAL,
- [VK_COMPARE_OP_EQUAL] = PREFILTEROP_NOTEQUAL,
- [VK_COMPARE_OP_LESS_OR_EQUAL] = PREFILTEROP_LESS,
- [VK_COMPARE_OP_GREATER] = PREFILTEROP_GEQUAL,
- [VK_COMPARE_OP_NOT_EQUAL] = PREFILTEROP_EQUAL,
- [VK_COMPARE_OP_GREATER_OR_EQUAL] = PREFILTEROP_GREATER,
- [VK_COMPARE_OP_ALWAYS] = PREFILTEROP_NEVER,
-};
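-
-/* Worked example: for VK_COMPARE_OP_LESS, Vulkan wants 1 when ref < texel.
- * PREFILTEROP_LEQUAL makes the hardware return 0 when texel <= ref and 1
- * otherwise, i.e. 1 exactly when texel > ref, which is the same condition
- * with the operands swapped.
- */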
-
-#if GFX_VER >= 9
-static const uint32_t vk_to_intel_sampler_reduction_mode[] = {
- [VK_SAMPLER_REDUCTION_MODE_WEIGHTED_AVERAGE_EXT] = STD_FILTER,
- [VK_SAMPLER_REDUCTION_MODE_MIN_EXT] = MINIMUM,
- [VK_SAMPLER_REDUCTION_MODE_MAX_EXT] = MAXIMUM,
-};
-#endif
-
-VkResult genX(CreateSampler)(
- VkDevice _device,
- const VkSamplerCreateInfo* pCreateInfo,
- const VkAllocationCallbacks* pAllocator,
- VkSampler* pSampler)
-{
- ANV_FROM_HANDLE(anv_device, device, _device);
- struct anv_sampler *sampler;
-
- assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_SAMPLER_CREATE_INFO);
-
- sampler = vk_object_zalloc(&device->vk, pAllocator, sizeof(*sampler),
- VK_OBJECT_TYPE_SAMPLER);
- if (!sampler)
- return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
-
- sampler->n_planes = 1;
-
- uint32_t border_color_stride = GFX_VERx10 == 75 ? 512 : 64;
- uint32_t border_color_offset;
- ASSERTED bool has_custom_color = false;
- if (pCreateInfo->borderColor <= VK_BORDER_COLOR_INT_OPAQUE_WHITE) {
- border_color_offset = device->border_colors.offset +
- pCreateInfo->borderColor *
- border_color_stride;
- } else {
- assert(GFX_VER >= 8);
- sampler->custom_border_color =
- anv_state_reserved_pool_alloc(&device->custom_border_colors);
- border_color_offset = sampler->custom_border_color.offset;
- }
-
-#if GFX_VER >= 9
- unsigned sampler_reduction_mode = STD_FILTER;
- bool enable_sampler_reduction = false;
-#endif
-
- vk_foreach_struct(ext, pCreateInfo->pNext) {
- switch (ext->sType) {
- case VK_STRUCTURE_TYPE_SAMPLER_YCBCR_CONVERSION_INFO: {
- VkSamplerYcbcrConversionInfo *pSamplerConversion =
- (VkSamplerYcbcrConversionInfo *) ext;
- ANV_FROM_HANDLE(anv_ycbcr_conversion, conversion,
- pSamplerConversion->conversion);
-
- /* Ignore conversion for non-YUV formats. This fulfills a requirement
- * for clients that want to utilize the same code path for images with
- * external formats (VK_FORMAT_UNDEFINED) and "regular" RGBA images
- * where the format is known.
- */
- if (conversion == NULL || !conversion->format->can_ycbcr)
- break;
-
- sampler->n_planes = conversion->format->n_planes;
- sampler->conversion = conversion;
- break;
- }
-#if GFX_VER >= 9
- case VK_STRUCTURE_TYPE_SAMPLER_REDUCTION_MODE_CREATE_INFO: {
- VkSamplerReductionModeCreateInfo *sampler_reduction =
- (VkSamplerReductionModeCreateInfo *) ext;
- sampler_reduction_mode =
- vk_to_intel_sampler_reduction_mode[sampler_reduction->reductionMode];
- enable_sampler_reduction = true;
- break;
- }
-#endif
- case VK_STRUCTURE_TYPE_SAMPLER_CUSTOM_BORDER_COLOR_CREATE_INFO_EXT: {
- VkSamplerCustomBorderColorCreateInfoEXT *custom_border_color =
- (VkSamplerCustomBorderColorCreateInfoEXT *) ext;
- if (sampler->custom_border_color.map == NULL)
- break;
- struct gfx8_border_color *cbc = sampler->custom_border_color.map;
- if (custom_border_color->format == VK_FORMAT_B4G4R4A4_UNORM_PACK16) {
- /* B4G4R4A4_UNORM_PACK16 is treated as R4G4B4A4_UNORM_PACK16 with
- * a swizzle, but this does not carry over to the sampler for
- * border colors, so we need to do the swizzle ourselves here.
- */
- cbc->uint32[0] = custom_border_color->customBorderColor.uint32[2];
- cbc->uint32[1] = custom_border_color->customBorderColor.uint32[1];
- cbc->uint32[2] = custom_border_color->customBorderColor.uint32[0];
- cbc->uint32[3] = custom_border_color->customBorderColor.uint32[3];
- } else {
- /* Both structs share the same layout, so just copy them over. */
- memcpy(cbc, &custom_border_color->customBorderColor,
- sizeof(VkClearColorValue));
- }
- has_custom_color = true;
- break;
- }
- default:
- anv_debug_ignored_stype(ext->sType);
- break;
- }
- }
-
- assert((sampler->custom_border_color.map == NULL) || has_custom_color);
-
- if (device->physical->has_bindless_samplers) {
- /* If we have bindless, allocate enough samplers. We allocate 32 bytes
- * for each sampler instead of 16 bytes because we want all bindless
- * samplers to be 32-byte aligned so we don't have to use indirect
- * sampler messages on them.
- */
- sampler->bindless_state =
- anv_state_pool_alloc(&device->dynamic_state_pool,
- sampler->n_planes * 32, 32);
- }
-
- for (unsigned p = 0; p < sampler->n_planes; p++) {
- const bool plane_has_chroma =
- sampler->conversion && sampler->conversion->format->planes[p].has_chroma;
- const VkFilter min_filter =
- plane_has_chroma ? sampler->conversion->chroma_filter : pCreateInfo->minFilter;
- const VkFilter mag_filter =
- plane_has_chroma ? sampler->conversion->chroma_filter : pCreateInfo->magFilter;
- const bool enable_min_filter_addr_rounding = min_filter != VK_FILTER_NEAREST;
- const bool enable_mag_filter_addr_rounding = mag_filter != VK_FILTER_NEAREST;
- /* From Broadwell PRM, SAMPLER_STATE:
- * "Mip Mode Filter must be set to MIPFILTER_NONE for Planar YUV surfaces."
- */
- const bool isl_format_is_planar_yuv = sampler->conversion &&
- isl_format_is_yuv(sampler->conversion->format->planes[0].isl_format) &&
- isl_format_is_planar(sampler->conversion->format->planes[0].isl_format);
-
- const uint32_t mip_filter_mode =
- isl_format_is_planar_yuv ?
- MIPFILTER_NONE : vk_to_intel_mipmap_mode[pCreateInfo->mipmapMode];
-
- struct GENX(SAMPLER_STATE) sampler_state = {
- .SamplerDisable = false,
- .TextureBorderColorMode = DX10OGL,
-
-#if GFX_VER >= 11
- .CPSLODCompensationEnable = true,
-#endif
-
-#if GFX_VER >= 8
- .LODPreClampMode = CLAMP_MODE_OGL,
-#else
- .LODPreClampEnable = CLAMP_ENABLE_OGL,
-#endif
-
-#if GFX_VER == 8
- .BaseMipLevel = 0.0,
-#endif
- .MipModeFilter = mip_filter_mode,
- .MagModeFilter = vk_to_intel_tex_filter(mag_filter, pCreateInfo->anisotropyEnable),
- .MinModeFilter = vk_to_intel_tex_filter(min_filter, pCreateInfo->anisotropyEnable),
- .TextureLODBias = anv_clamp_f(pCreateInfo->mipLodBias, -16, 15.996),
- .AnisotropicAlgorithm =
- pCreateInfo->anisotropyEnable ? EWAApproximation : LEGACY,
- .MinLOD = anv_clamp_f(pCreateInfo->minLod, 0, 14),
- .MaxLOD = anv_clamp_f(pCreateInfo->maxLod, 0, 14),
- .ChromaKeyEnable = 0,
- .ChromaKeyIndex = 0,
- .ChromaKeyMode = 0,
- .ShadowFunction =
- vk_to_intel_shadow_compare_op[pCreateInfo->compareEnable ?
- pCreateInfo->compareOp : VK_COMPARE_OP_NEVER],
- .CubeSurfaceControlMode = OVERRIDE,
-
- .BorderColorPointer = border_color_offset,
-
-#if GFX_VER >= 8
- .LODClampMagnificationMode = MIPNONE,
-#endif
-
- .MaximumAnisotropy = vk_to_intel_max_anisotropy(pCreateInfo->maxAnisotropy),
- .RAddressMinFilterRoundingEnable = enable_min_filter_addr_rounding,
- .RAddressMagFilterRoundingEnable = enable_mag_filter_addr_rounding,
- .VAddressMinFilterRoundingEnable = enable_min_filter_addr_rounding,
- .VAddressMagFilterRoundingEnable = enable_mag_filter_addr_rounding,
- .UAddressMinFilterRoundingEnable = enable_min_filter_addr_rounding,
- .UAddressMagFilterRoundingEnable = enable_mag_filter_addr_rounding,
- .TrilinearFilterQuality = 0,
- .NonnormalizedCoordinateEnable = pCreateInfo->unnormalizedCoordinates,
- .TCXAddressControlMode = vk_to_intel_tex_address[pCreateInfo->addressModeU],
- .TCYAddressControlMode = vk_to_intel_tex_address[pCreateInfo->addressModeV],
- .TCZAddressControlMode = vk_to_intel_tex_address[pCreateInfo->addressModeW],
-
-#if GFX_VER >= 9
- .ReductionType = sampler_reduction_mode,
- .ReductionTypeEnable = enable_sampler_reduction,
-#endif
- };
-
- GENX(SAMPLER_STATE_pack)(NULL, sampler->state[p], &sampler_state);
-
- if (sampler->bindless_state.map) {
- memcpy(sampler->bindless_state.map + p * 32,
- sampler->state[p], GENX(SAMPLER_STATE_length) * 4);
- }
- }
-
- *pSampler = anv_sampler_to_handle(sampler);
-
- return VK_SUCCESS;
-}
diff --git a/src/intel/vulkan/gfx7_cmd_buffer.c b/src/intel/vulkan/gfx7_cmd_buffer.c
deleted file mode 100644
index b092bd8c377..00000000000
--- a/src/intel/vulkan/gfx7_cmd_buffer.c
+++ /dev/null
@@ -1,476 +0,0 @@
-/*
- * Copyright © 2015 Intel Corporation
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- */
-
-#include <assert.h>
-#include <stdbool.h>
-#include <string.h>
-#include <unistd.h>
-#include <fcntl.h>
-
-#include "anv_private.h"
-#include "vk_format.h"
-
-#include "genxml/gen_macros.h"
-#include "genxml/genX_pack.h"
-
-#if GFX_VERx10 == 70
-static int64_t
-clamp_int64(int64_t x, int64_t min, int64_t max)
-{
- if (x < min)
- return min;
- else if (x < max)
- return x;
- else
- return max;
-}
-
-void
-gfx7_cmd_buffer_emit_scissor(struct anv_cmd_buffer *cmd_buffer)
-{
- struct anv_framebuffer *fb = cmd_buffer->state.framebuffer;
- uint32_t count = cmd_buffer->state.gfx.dynamic.scissor.count;
- const VkRect2D *scissors = cmd_buffer->state.gfx.dynamic.scissor.scissors;
-
- /* Wa_1409725701:
- * "The viewport-specific state used by the SF unit (SCISSOR_RECT) is
- * stored as an array of up to 16 elements. The location of first
- * element of the array, as specified by Pointer to SCISSOR_RECT, should
- * be aligned to a 64-byte boundary."
- */
- uint32_t alignment = 64;
- struct anv_state scissor_state =
- anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, count * 8, alignment);
-
- for (uint32_t i = 0; i < count; i++) {
- const VkRect2D *s = &scissors[i];
-
- /* Since xmax and ymax are inclusive, we have to have xmax < xmin or
- * ymax < ymin for empty clips. In case clip x, y, width and height are all
- * 0, the clamps below produce 0 for xmin, ymin, xmax, ymax, which isn't
- * what we want. Just special case empty clips and produce a canonical
- * empty clip. */
- static const struct GFX7_SCISSOR_RECT empty_scissor = {
- .ScissorRectangleYMin = 1,
- .ScissorRectangleXMin = 1,
- .ScissorRectangleYMax = 0,
- .ScissorRectangleXMax = 0
- };
-
- const int max = 0xffff;
-
- uint32_t y_min = s->offset.y;
- uint32_t x_min = s->offset.x;
- uint32_t y_max = s->offset.y + s->extent.height - 1;
- uint32_t x_max = s->offset.x + s->extent.width - 1;
-
- /* Do this math using int64_t so overflow gets clamped correctly. */
- if (cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY) {
- y_min = clamp_int64((uint64_t) y_min,
- cmd_buffer->state.render_area.offset.y, max);
- x_min = clamp_int64((uint64_t) x_min,
- cmd_buffer->state.render_area.offset.x, max);
- y_max = clamp_int64((uint64_t) y_max, 0,
- cmd_buffer->state.render_area.offset.y +
- cmd_buffer->state.render_area.extent.height - 1);
- x_max = clamp_int64((uint64_t) x_max, 0,
- cmd_buffer->state.render_area.offset.x +
- cmd_buffer->state.render_area.extent.width - 1);
- } else if (fb) {
- y_min = clamp_int64((uint64_t) y_min, 0, max);
- x_min = clamp_int64((uint64_t) x_min, 0, max);
- y_max = clamp_int64((uint64_t) y_max, 0, fb->height - 1);
- x_max = clamp_int64((uint64_t) x_max, 0, fb->width - 1);
- }
-
- struct GFX7_SCISSOR_RECT scissor = {
- .ScissorRectangleYMin = y_min,
- .ScissorRectangleXMin = x_min,
- .ScissorRectangleYMax = y_max,
- .ScissorRectangleXMax = x_max
- };
-
- if (s->extent.width <= 0 || s->extent.height <= 0) {
- GFX7_SCISSOR_RECT_pack(NULL, scissor_state.map + i * 8,
- &empty_scissor);
- } else {
- GFX7_SCISSOR_RECT_pack(NULL, scissor_state.map + i * 8, &scissor);
- }
- }
-
- anv_batch_emit(&cmd_buffer->batch,
- GFX7_3DSTATE_SCISSOR_STATE_POINTERS, ssp) {
- ssp.ScissorRectPointer = scissor_state.offset;
- }
-}
-#endif
-
-static uint32_t vk_to_intel_index_type(VkIndexType type)
-{
- switch (type) {
- case VK_INDEX_TYPE_UINT8_EXT:
- return INDEX_BYTE;
- case VK_INDEX_TYPE_UINT16:
- return INDEX_WORD;
- case VK_INDEX_TYPE_UINT32:
- return INDEX_DWORD;
- default:
- unreachable("invalid index type");
- }
-}
-
-static uint32_t restart_index_for_type(VkIndexType type)
-{
- switch (type) {
- case VK_INDEX_TYPE_UINT8_EXT:
- return UINT8_MAX;
- case VK_INDEX_TYPE_UINT16:
- return UINT16_MAX;
- case VK_INDEX_TYPE_UINT32:
- return UINT32_MAX;
- default:
- unreachable("invalid index type");
- }
-}
-
-void genX(CmdBindIndexBuffer)(
- VkCommandBuffer commandBuffer,
- VkBuffer _buffer,
- VkDeviceSize offset,
- VkIndexType indexType)
-{
- ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
- ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
-
- cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_INDEX_BUFFER;
- if (GFX_VERx10 == 75)
- cmd_buffer->state.restart_index = restart_index_for_type(indexType);
- cmd_buffer->state.gfx.gfx7.index_buffer = buffer;
- cmd_buffer->state.gfx.gfx7.index_type = vk_to_intel_index_type(indexType);
- cmd_buffer->state.gfx.gfx7.index_offset = offset;
-}
-
-static uint32_t
-get_depth_format(struct anv_cmd_buffer *cmd_buffer)
-{
- const struct anv_render_pass *pass = cmd_buffer->state.pass;
- const struct anv_subpass *subpass = cmd_buffer->state.subpass;
-
- if (!subpass->depth_stencil_attachment)
- return D16_UNORM;
-
- struct anv_render_pass_attachment *att =
- &pass->attachments[subpass->depth_stencil_attachment->attachment];
-
- switch (att->format) {
- case VK_FORMAT_D16_UNORM:
- case VK_FORMAT_D16_UNORM_S8_UINT:
- return D16_UNORM;
-
- case VK_FORMAT_X8_D24_UNORM_PACK32:
- case VK_FORMAT_D24_UNORM_S8_UINT:
- return D24_UNORM_X8_UINT;
-
- case VK_FORMAT_D32_SFLOAT:
- case VK_FORMAT_D32_SFLOAT_S8_UINT:
- return D32_FLOAT;
-
- default:
- return D16_UNORM;
- }
-}
-
-void
-genX(cmd_buffer_flush_dynamic_state)(struct anv_cmd_buffer *cmd_buffer)
-{
- struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
- struct anv_dynamic_state *d = &cmd_buffer->state.gfx.dynamic;
-
- if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_DYNAMIC_PRIMITIVE_TOPOLOGY) {
- uint32_t topology;
- if (anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_EVAL))
- topology = pipeline->topology;
- else
- topology = genX(vk_to_intel_primitive_type)[d->primitive_topology];
-
- cmd_buffer->state.gfx.primitive_topology = topology;
- }
-
- if (cmd_buffer->state.gfx.dirty & (ANV_CMD_DIRTY_PIPELINE |
- ANV_CMD_DIRTY_RENDER_TARGETS |
- ANV_CMD_DIRTY_DYNAMIC_LINE_WIDTH |
- ANV_CMD_DIRTY_DYNAMIC_DEPTH_BIAS |
- ANV_CMD_DIRTY_DYNAMIC_CULL_MODE |
- ANV_CMD_DIRTY_DYNAMIC_FRONT_FACE |
- ANV_CMD_DIRTY_DYNAMIC_DEPTH_BIAS_ENABLE |
- ANV_CMD_DIRTY_DYNAMIC_PRIMITIVE_TOPOLOGY)) {
- /* Take dynamic primitive topology into account with
- * 3DSTATE_SF::MultisampleRasterizationMode
- */
- uint32_t ms_rast_mode = 0;
-
- if (cmd_buffer->state.gfx.pipeline->dynamic_states &
- ANV_CMD_DIRTY_DYNAMIC_PRIMITIVE_TOPOLOGY) {
- VkPrimitiveTopology primitive_topology =
- cmd_buffer->state.gfx.dynamic.primitive_topology;
-
- VkPolygonMode dynamic_raster_mode =
- genX(raster_polygon_mode)(cmd_buffer->state.gfx.pipeline,
- primitive_topology);
-
- ms_rast_mode =
- genX(ms_rasterization_mode)(pipeline, dynamic_raster_mode);
- }
-
- uint32_t sf_dw[GENX(3DSTATE_SF_length)];
- struct GENX(3DSTATE_SF) sf = {
- GENX(3DSTATE_SF_header),
- .DepthBufferSurfaceFormat = get_depth_format(cmd_buffer),
- .LineWidth = d->line_width,
- .GlobalDepthOffsetConstant = d->depth_bias.bias,
- .GlobalDepthOffsetScale = d->depth_bias.slope,
- .GlobalDepthOffsetClamp = d->depth_bias.clamp,
- .FrontWinding = genX(vk_to_intel_front_face)[d->front_face],
- .CullMode = genX(vk_to_intel_cullmode)[d->cull_mode],
- .GlobalDepthOffsetEnableSolid = d->depth_bias_enable,
- .GlobalDepthOffsetEnableWireframe = d->depth_bias_enable,
- .GlobalDepthOffsetEnablePoint = d->depth_bias_enable,
- .MultisampleRasterizationMode = ms_rast_mode,
- };
- GENX(3DSTATE_SF_pack)(NULL, sf_dw, &sf);
-
- anv_batch_emit_merge(&cmd_buffer->batch, sf_dw, pipeline->gfx7.sf);
- }
-
- if (cmd_buffer->state.gfx.dirty & (ANV_CMD_DIRTY_DYNAMIC_BLEND_CONSTANTS |
- ANV_CMD_DIRTY_DYNAMIC_STENCIL_REFERENCE)) {
- struct anv_state cc_state =
- anv_cmd_buffer_alloc_dynamic_state(cmd_buffer,
- GENX(COLOR_CALC_STATE_length) * 4,
- 64);
- struct GENX(COLOR_CALC_STATE) cc = {
- .BlendConstantColorRed = d->blend_constants[0],
- .BlendConstantColorGreen = d->blend_constants[1],
- .BlendConstantColorBlue = d->blend_constants[2],
- .BlendConstantColorAlpha = d->blend_constants[3],
- .StencilReferenceValue = d->stencil_reference.front & 0xff,
- .BackfaceStencilReferenceValue = d->stencil_reference.back & 0xff,
- };
- GENX(COLOR_CALC_STATE_pack)(NULL, cc_state.map, &cc);
-
- anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CC_STATE_POINTERS), ccp) {
- ccp.ColorCalcStatePointer = cc_state.offset;
- }
- }
-
- if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_DYNAMIC_LINE_STIPPLE) {
- anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_LINE_STIPPLE), ls) {
- ls.LineStipplePattern = d->line_stipple.pattern;
- ls.LineStippleInverseRepeatCount =
- 1.0f / MAX2(1, d->line_stipple.factor);
- ls.LineStippleRepeatCount = d->line_stipple.factor;
- }
- }
-
- if (cmd_buffer->state.gfx.dirty & (ANV_CMD_DIRTY_PIPELINE |
- ANV_CMD_DIRTY_RENDER_TARGETS |
- ANV_CMD_DIRTY_DYNAMIC_STENCIL_COMPARE_MASK |
- ANV_CMD_DIRTY_DYNAMIC_STENCIL_WRITE_MASK |
- ANV_CMD_DIRTY_DYNAMIC_DEPTH_TEST_ENABLE |
- ANV_CMD_DIRTY_DYNAMIC_DEPTH_WRITE_ENABLE |
- ANV_CMD_DIRTY_DYNAMIC_DEPTH_COMPARE_OP |
- ANV_CMD_DIRTY_DYNAMIC_STENCIL_TEST_ENABLE |
- ANV_CMD_DIRTY_DYNAMIC_STENCIL_OP)) {
- uint32_t depth_stencil_dw[GENX(DEPTH_STENCIL_STATE_length)];
-
- struct GENX(DEPTH_STENCIL_STATE) depth_stencil = {
- .StencilTestMask = d->stencil_compare_mask.front & 0xff,
- .StencilWriteMask = d->stencil_write_mask.front & 0xff,
-
- .BackfaceStencilTestMask = d->stencil_compare_mask.back & 0xff,
- .BackfaceStencilWriteMask = d->stencil_write_mask.back & 0xff,
-
- .StencilBufferWriteEnable =
- (d->stencil_write_mask.front || d->stencil_write_mask.back) &&
- d->stencil_test_enable,
-
- .DepthTestEnable = d->depth_test_enable,
- .DepthBufferWriteEnable = d->depth_test_enable && d->depth_write_enable,
- .DepthTestFunction = genX(vk_to_intel_compare_op)[d->depth_compare_op],
- .StencilTestEnable = d->stencil_test_enable,
- .StencilFailOp = genX(vk_to_intel_stencil_op)[d->stencil_op.front.fail_op],
- .StencilPassDepthPassOp = genX(vk_to_intel_stencil_op)[d->stencil_op.front.pass_op],
- .StencilPassDepthFailOp = genX(vk_to_intel_stencil_op)[d->stencil_op.front.depth_fail_op],
- .StencilTestFunction = genX(vk_to_intel_compare_op)[d->stencil_op.front.compare_op],
- .BackfaceStencilFailOp = genX(vk_to_intel_stencil_op)[d->stencil_op.back.fail_op],
- .BackfaceStencilPassDepthPassOp = genX(vk_to_intel_stencil_op)[d->stencil_op.back.pass_op],
- .BackfaceStencilPassDepthFailOp = genX(vk_to_intel_stencil_op)[d->stencil_op.back.depth_fail_op],
- .BackfaceStencilTestFunction = genX(vk_to_intel_compare_op)[d->stencil_op.back.compare_op],
- };
- GENX(DEPTH_STENCIL_STATE_pack)(NULL, depth_stencil_dw, &depth_stencil);
-
- struct anv_state ds_state =
- anv_cmd_buffer_merge_dynamic(cmd_buffer, depth_stencil_dw,
- pipeline->gfx7.depth_stencil_state,
- GENX(DEPTH_STENCIL_STATE_length), 64);
-
- anv_batch_emit(&cmd_buffer->batch,
- GENX(3DSTATE_DEPTH_STENCIL_STATE_POINTERS), dsp) {
- dsp.PointertoDEPTH_STENCIL_STATE = ds_state.offset;
- }
- }
-
- if (cmd_buffer->state.gfx.gfx7.index_buffer &&
- cmd_buffer->state.gfx.dirty & (ANV_CMD_DIRTY_PIPELINE |
- ANV_CMD_DIRTY_INDEX_BUFFER |
- ANV_CMD_DIRTY_DYNAMIC_PRIMITIVE_RESTART_ENABLE)) {
- struct anv_buffer *buffer = cmd_buffer->state.gfx.gfx7.index_buffer;
- uint32_t offset = cmd_buffer->state.gfx.gfx7.index_offset;
-
-#if GFX_VERx10 == 75
- anv_batch_emit(&cmd_buffer->batch, GFX75_3DSTATE_VF, vf) {
- vf.IndexedDrawCutIndexEnable = d->primitive_restart_enable;
- vf.CutIndex = cmd_buffer->state.restart_index;
- }
-#endif
-
- anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_INDEX_BUFFER), ib) {
-#if GFX_VERx10 != 75
- ib.CutIndexEnable = d->primitive_restart_enable;
-#endif
- ib.IndexFormat = cmd_buffer->state.gfx.gfx7.index_type;
- ib.MOCS = anv_mocs(cmd_buffer->device,
- buffer->address.bo,
- ISL_SURF_USAGE_INDEX_BUFFER_BIT);
-
- ib.BufferStartingAddress = anv_address_add(buffer->address, offset);
- ib.BufferEndingAddress = anv_address_add(buffer->address,
- buffer->size);
- }
- }
-
- /* Re-emit 3DSTATE_WM in the hope we can avoid spawning fragment shader
- * threads, or when we have dirty dynamic primitive topology state and
- * need to toggle 3DSTATE_WM::MultisampleRasterizationMode dynamically.
- */
- if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_DYNAMIC_COLOR_BLEND_STATE ||
- cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_DYNAMIC_PRIMITIVE_TOPOLOGY) {
- const uint8_t color_writes = cmd_buffer->state.gfx.dynamic.color_writes;
-
- bool dirty_color_blend =
- cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_DYNAMIC_COLOR_BLEND_STATE;
-
- bool dirty_primitive_topology =
- cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_DYNAMIC_PRIMITIVE_TOPOLOGY;
-
- VkPolygonMode dynamic_raster_mode;
- VkPrimitiveTopology primitive_topology =
- cmd_buffer->state.gfx.dynamic.primitive_topology;
- dynamic_raster_mode =
- genX(raster_polygon_mode)(cmd_buffer->state.gfx.pipeline,
- primitive_topology);
-
- if (dirty_color_blend || dirty_primitive_topology) {
- uint32_t dwords[GENX(3DSTATE_WM_length)];
- struct GENX(3DSTATE_WM) wm = {
- GENX(3DSTATE_WM_header),
-
- .ThreadDispatchEnable = pipeline->force_fragment_thread_dispatch ||
- color_writes,
- .MultisampleRasterizationMode =
- genX(ms_rasterization_mode)(pipeline, dynamic_raster_mode),
- };
- GENX(3DSTATE_WM_pack)(NULL, dwords, &wm);
-
- anv_batch_emit_merge(&cmd_buffer->batch, dwords, pipeline->gfx7.wm);
- }
-
- }
-
- if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_DYNAMIC_SAMPLE_LOCATIONS) {
- genX(emit_multisample)(&cmd_buffer->batch,
- cmd_buffer->state.gfx.dynamic.sample_locations.samples,
- cmd_buffer->state.gfx.dynamic.sample_locations.locations);
- }
-
- if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_DYNAMIC_COLOR_BLEND_STATE ||
- cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_DYNAMIC_LOGIC_OP) {
- const uint8_t color_writes = cmd_buffer->state.gfx.dynamic.color_writes;
- bool dirty_color_blend =
- cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_DYNAMIC_COLOR_BLEND_STATE;
-
- /* Blend states of each RT */
- uint32_t surface_count = 0;
- struct anv_pipeline_bind_map *map;
- if (anv_pipeline_has_stage(pipeline, MESA_SHADER_FRAGMENT)) {
- map = &pipeline->shaders[MESA_SHADER_FRAGMENT]->bind_map;
- surface_count = map->surface_count;
- }
-
- uint32_t blend_dws[GENX(BLEND_STATE_length) +
- MAX_RTS * GENX(BLEND_STATE_ENTRY_length)];
- uint32_t *dws = blend_dws;
- memset(blend_dws, 0, sizeof(blend_dws));
-
- /* Skip the leading BLEND_STATE dwords; only the per-RT entries are filled in here. */
- dws += GENX(BLEND_STATE_length);
-
- bool dirty_logic_op =
- cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_DYNAMIC_LOGIC_OP;
-
- for (uint32_t i = 0; i < surface_count; i++) {
- struct anv_pipeline_binding *binding = &map->surface_to_descriptor[i];
- bool write_disabled =
- dirty_color_blend && (color_writes & (1u << binding->index)) == 0;
- struct GENX(BLEND_STATE_ENTRY) entry = {
- .WriteDisableAlpha = write_disabled,
- .WriteDisableRed = write_disabled,
- .WriteDisableGreen = write_disabled,
- .WriteDisableBlue = write_disabled,
- .LogicOpFunction =
- dirty_logic_op ? genX(vk_to_intel_logic_op)[d->logic_op] : 0,
- };
- GENX(BLEND_STATE_ENTRY_pack)(NULL, dws, &entry);
- dws += GENX(BLEND_STATE_ENTRY_length);
- }
-
- uint32_t num_dwords = GENX(BLEND_STATE_length) +
- GENX(BLEND_STATE_ENTRY_length) * surface_count;
-
- struct anv_state blend_states =
- anv_cmd_buffer_merge_dynamic(cmd_buffer, blend_dws,
- pipeline->gfx7.blend_state, num_dwords, 64);
- anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_BLEND_STATE_POINTERS), bsp) {
- bsp.BlendStatePointer = blend_states.offset;
- }
- }
-
- cmd_buffer->state.gfx.dirty = 0;
-}
-
-void
-genX(cmd_buffer_enable_pma_fix)(struct anv_cmd_buffer *cmd_buffer,
- bool enable)
-{
- /* The NP PMA fix doesn't exist on gfx7 */
-}
diff --git a/src/intel/vulkan/gfx8_cmd_buffer.c b/src/intel/vulkan/gfx8_cmd_buffer.c
deleted file mode 100644
index 95250fa01d0..00000000000
--- a/src/intel/vulkan/gfx8_cmd_buffer.c
+++ /dev/null
@@ -1,844 +0,0 @@
-/*
- * Copyright © 2015 Intel Corporation
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- */
-
-#include <assert.h>
-#include <stdbool.h>
-#include <string.h>
-#include <unistd.h>
-#include <fcntl.h>
-
-#include "anv_private.h"
-
-#include "genxml/gen_macros.h"
-#include "genxml/genX_pack.h"
-#include "common/intel_guardband.h"
-
-#if GFX_VER == 8
-void
-gfx8_cmd_buffer_emit_viewport(struct anv_cmd_buffer *cmd_buffer)
-{
- struct anv_framebuffer *fb = cmd_buffer->state.framebuffer;
- uint32_t count = cmd_buffer->state.gfx.dynamic.viewport.count;
- const VkViewport *viewports =
- cmd_buffer->state.gfx.dynamic.viewport.viewports;
- struct anv_state sf_clip_state =
- anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, count * 64, 64);
-
- for (uint32_t i = 0; i < count; i++) {
- const VkViewport *vp = &viewports[i];
-
- /* The gfx7 state struct has just the matrix and guardband fields; the
- * gfx8 struct adds the min/max viewport fields.
- */
- struct GENX(SF_CLIP_VIEWPORT) sfv = {
- .ViewportMatrixElementm00 = vp->width / 2,
- .ViewportMatrixElementm11 = vp->height / 2,
- .ViewportMatrixElementm22 = vp->maxDepth - vp->minDepth,
- .ViewportMatrixElementm30 = vp->x + vp->width / 2,
- .ViewportMatrixElementm31 = vp->y + vp->height / 2,
- .ViewportMatrixElementm32 = vp->minDepth,
- .XMinClipGuardband = -1.0f,
- .XMaxClipGuardband = 1.0f,
- .YMinClipGuardband = -1.0f,
- .YMaxClipGuardband = 1.0f,
- .XMinViewPort = vp->x,
- .XMaxViewPort = vp->x + vp->width - 1,
- .YMinViewPort = MIN2(vp->y, vp->y + vp->height),
- .YMaxViewPort = MAX2(vp->y, vp->y + vp->height) - 1,
- };
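-
- /* These coefficients map NDC x in [-1, 1] onto [vp->x, vp->x + vp->width]
- * (y and depth are handled analogously): e.g. a 1920-wide viewport at
- * x = 0 has m00 = 960 and m30 = 960, so -1 maps to 0 and +1 maps to 1920.
- */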
-
- if (fb) {
- /* We can only calculate a "real" guardband clip if we know the
- * framebuffer at the time we emit the packet. Otherwise, we have to
- * fall back to a worst-case guardband of [-1, 1].
- */
- intel_calculate_guardband_size(fb->width, fb->height,
- sfv.ViewportMatrixElementm00,
- sfv.ViewportMatrixElementm11,
- sfv.ViewportMatrixElementm30,
- sfv.ViewportMatrixElementm31,
- &sfv.XMinClipGuardband,
- &sfv.XMaxClipGuardband,
- &sfv.YMinClipGuardband,
- &sfv.YMaxClipGuardband);
- }
-
- GENX(SF_CLIP_VIEWPORT_pack)(NULL, sf_clip_state.map + i * 64, &sfv);
- }
-
- anv_batch_emit(&cmd_buffer->batch,
- GENX(3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP), clip) {
- clip.SFClipViewportPointer = sf_clip_state.offset;
- }
-}
-
-void
-gfx8_cmd_buffer_emit_depth_viewport(struct anv_cmd_buffer *cmd_buffer,
- bool depth_clamp_enable)
-{
- uint32_t count = cmd_buffer->state.gfx.dynamic.viewport.count;
- const VkViewport *viewports =
- cmd_buffer->state.gfx.dynamic.viewport.viewports;
- struct anv_state cc_state =
- anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, count * 8, 32);
-
- for (uint32_t i = 0; i < count; i++) {
- const VkViewport *vp = &viewports[i];
-
- /* From the Vulkan spec:
- *
- * "It is valid for minDepth to be greater than or equal to
- * maxDepth."
- */
- float min_depth = MIN2(vp->minDepth, vp->maxDepth);
- float max_depth = MAX2(vp->minDepth, vp->maxDepth);
-
- struct GENX(CC_VIEWPORT) cc_viewport = {
- .MinimumDepth = depth_clamp_enable ? min_depth : 0.0f,
- .MaximumDepth = depth_clamp_enable ? max_depth : 1.0f,
- };
-
- GENX(CC_VIEWPORT_pack)(NULL, cc_state.map + i * 8, &cc_viewport);
- }
-
- anv_batch_emit(&cmd_buffer->batch,
- GENX(3DSTATE_VIEWPORT_STATE_POINTERS_CC), cc) {
- cc.CCViewportPointer = cc_state.offset;
- }
-}
-#endif
-
-void
-genX(cmd_buffer_enable_pma_fix)(struct anv_cmd_buffer *cmd_buffer, bool enable)
-{
- if (cmd_buffer->state.pma_fix_enabled == enable)
- return;
-
- cmd_buffer->state.pma_fix_enabled = enable;
-
- /* According to the Broadwell PIPE_CONTROL documentation, software should
- * emit a PIPE_CONTROL with the CS Stall and Depth Cache Flush bits set
- * prior to the LRI. If stencil buffer writes are enabled, then a Render
- * Cache Flush is also necessary.
- *
- * The Skylake docs say to use a depth stall rather than a command
- * streamer stall. However, the hardware seems to violently disagree.
- * A full command streamer stall seems to be needed in both cases.
- */
- anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
- pc.DepthCacheFlushEnable = true;
- pc.CommandStreamerStallEnable = true;
- pc.RenderTargetCacheFlushEnable = true;
-#if GFX_VER >= 12
- pc.TileCacheFlushEnable = true;
-
- /* Wa_1409600907: "PIPE_CONTROL with Depth Stall Enable bit must
- * be set with any PIPE_CONTROL with Depth Flush Enable bit set."
- */
- pc.DepthStallEnable = true;
-#endif
- }
-
-#if GFX_VER == 9
-
- uint32_t cache_mode;
- anv_pack_struct(&cache_mode, GENX(CACHE_MODE_0),
- .STCPMAOptimizationEnable = enable,
- .STCPMAOptimizationEnableMask = true);
- anv_batch_emit(&cmd_buffer->batch, GENX(MI_LOAD_REGISTER_IMM), lri) {
- lri.RegisterOffset = GENX(CACHE_MODE_0_num);
- lri.DataDWord = cache_mode;
- }
-
-#elif GFX_VER == 8
-
- uint32_t cache_mode;
- anv_pack_struct(&cache_mode, GENX(CACHE_MODE_1),
- .NPPMAFixEnable = enable,
- .NPEarlyZFailsDisable = enable,
- .NPPMAFixEnableMask = true,
- .NPEarlyZFailsDisableMask = true);
- anv_batch_emit(&cmd_buffer->batch, GENX(MI_LOAD_REGISTER_IMM), lri) {
- lri.RegisterOffset = GENX(CACHE_MODE_1_num);
- lri.DataDWord = cache_mode;
- }
-
-#endif /* GFX_VER == 8 */
-
- /* After the LRI, a PIPE_CONTROL with both the Depth Stall and Depth Cache
- * Flush bits is often necessary. We do it regardless because it's easier.
- * The render cache flush is also necessary if stencil writes are enabled.
- *
- * Again, the Skylake docs give a different set of flushes but the BDW
- * flushes seem to work just as well.
- */
- anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
- pc.DepthStallEnable = true;
- pc.DepthCacheFlushEnable = true;
- pc.RenderTargetCacheFlushEnable = true;
-#if GFX_VER >= 12
- pc.TileCacheFlushEnable = true;
-#endif
- }
-}
-
-UNUSED static bool
-want_depth_pma_fix(struct anv_cmd_buffer *cmd_buffer)
-{
- assert(GFX_VER == 8);
-
- /* From the Broadwell PRM Vol. 2c CACHE_MODE_1::NP_PMA_FIX_ENABLE:
- *
- * SW must set this bit in order to enable this fix when following
- * expression is TRUE.
- *
- * 3DSTATE_WM::ForceThreadDispatch != 1 &&
- * !(3DSTATE_RASTER::ForceSampleCount != NUMRASTSAMPLES_0) &&
- * (3DSTATE_DEPTH_BUFFER::SURFACE_TYPE != NULL) &&
- * (3DSTATE_DEPTH_BUFFER::HIZ Enable) &&
- * !(3DSTATE_WM::EDSC_Mode == EDSC_PREPS) &&
- * (3DSTATE_PS_EXTRA::PixelShaderValid) &&
- * !(3DSTATE_WM_HZ_OP::DepthBufferClear ||
- * 3DSTATE_WM_HZ_OP::DepthBufferResolve ||
- * 3DSTATE_WM_HZ_OP::Hierarchical Depth Buffer Resolve Enable ||
- * 3DSTATE_WM_HZ_OP::StencilBufferClear) &&
- * (3DSTATE_WM_DEPTH_STENCIL::DepthTestEnable) &&
- * (((3DSTATE_PS_EXTRA::PixelShaderKillsPixels ||
- * 3DSTATE_PS_EXTRA::oMask Present to RenderTarget ||
- * 3DSTATE_PS_BLEND::AlphaToCoverageEnable ||
- * 3DSTATE_PS_BLEND::AlphaTestEnable ||
- * 3DSTATE_WM_CHROMAKEY::ChromaKeyKillEnable) &&
- * 3DSTATE_WM::ForceKillPix != ForceOff &&
- * ((3DSTATE_WM_DEPTH_STENCIL::DepthWriteEnable &&
- * 3DSTATE_DEPTH_BUFFER::DEPTH_WRITE_ENABLE) ||
- * (3DSTATE_WM_DEPTH_STENCIL::Stencil Buffer Write Enable &&
- * 3DSTATE_DEPTH_BUFFER::STENCIL_WRITE_ENABLE &&
- * 3DSTATE_STENCIL_BUFFER::STENCIL_BUFFER_ENABLE))) ||
- * (3DSTATE_PS_EXTRA:: Pixel Shader Computed Depth mode != PSCDEPTH_OFF))
- */
-
- /* These are always true:
- * 3DSTATE_WM::ForceThreadDispatch != 1 &&
- * !(3DSTATE_RASTER::ForceSampleCount != NUMRASTSAMPLES_0)
- */
-
- /* We only enable the PMA fix if we know for certain that HiZ is enabled.
- * If we don't know whether HiZ is enabled or not, we disable the PMA fix
- * and there is no harm.
- *
- * (3DSTATE_DEPTH_BUFFER::SURFACE_TYPE != NULL) &&
- * 3DSTATE_DEPTH_BUFFER::HIZ Enable
- */
- if (!cmd_buffer->state.hiz_enabled)
- return false;
-
- /* 3DSTATE_PS_EXTRA::PixelShaderValid */
- struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
- if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_FRAGMENT))
- return false;
-
- /* !(3DSTATE_WM::EDSC_Mode == EDSC_PREPS) */
- const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline);
- if (wm_prog_data->early_fragment_tests)
- return false;
-
- /* We never use anv_pipeline for HiZ ops so this is trivially true:
- * !(3DSTATE_WM_HZ_OP::DepthBufferClear ||
- * 3DSTATE_WM_HZ_OP::DepthBufferResolve ||
- * 3DSTATE_WM_HZ_OP::Hierarchical Depth Buffer Resolve Enable ||
- * 3DSTATE_WM_HZ_OP::StencilBufferClear)
- */
-
- /* 3DSTATE_WM_DEPTH_STENCIL::DepthTestEnable */
- if (!pipeline->depth_test_enable)
- return false;
-
- /* (((3DSTATE_PS_EXTRA::PixelShaderKillsPixels ||
- * 3DSTATE_PS_EXTRA::oMask Present to RenderTarget ||
- * 3DSTATE_PS_BLEND::AlphaToCoverageEnable ||
- * 3DSTATE_PS_BLEND::AlphaTestEnable ||
- * 3DSTATE_WM_CHROMAKEY::ChromaKeyKillEnable) &&
- * 3DSTATE_WM::ForceKillPix != ForceOff &&
- * ((3DSTATE_WM_DEPTH_STENCIL::DepthWriteEnable &&
- * 3DSTATE_DEPTH_BUFFER::DEPTH_WRITE_ENABLE) ||
- * (3DSTATE_WM_DEPTH_STENCIL::Stencil Buffer Write Enable &&
- * 3DSTATE_DEPTH_BUFFER::STENCIL_WRITE_ENABLE &&
- * 3DSTATE_STENCIL_BUFFER::STENCIL_BUFFER_ENABLE))) ||
- * (3DSTATE_PS_EXTRA:: Pixel Shader Computed Depth mode != PSCDEPTH_OFF))
- */
- return (pipeline->kill_pixel && (pipeline->writes_depth ||
- pipeline->writes_stencil)) ||
- wm_prog_data->computed_depth_mode != PSCDEPTH_OFF;
-}
-
-UNUSED static bool
-want_stencil_pma_fix(struct anv_cmd_buffer *cmd_buffer)
-{
- if (GFX_VER > 9)
- return false;
- assert(GFX_VER == 9);
-
- /* From the Skylake PRM Vol. 2c CACHE_MODE_1::STC PMA Optimization Enable:
- *
- * Clearing this bit will force the STC cache to wait for pending
- * retirement of pixels at the HZ-read stage and do the STC-test for
- * Non-promoted, R-computed and Computed depth modes instead of
- * postponing the STC-test to RCPFE.
- *
- * STC_TEST_EN = 3DSTATE_STENCIL_BUFFER::STENCIL_BUFFER_ENABLE &&
- * 3DSTATE_WM_DEPTH_STENCIL::StencilTestEnable
- *
- * STC_WRITE_EN = 3DSTATE_STENCIL_BUFFER::STENCIL_BUFFER_ENABLE &&
- * (3DSTATE_WM_DEPTH_STENCIL::Stencil Buffer Write Enable &&
- * 3DSTATE_DEPTH_BUFFER::STENCIL_WRITE_ENABLE)
- *
- * COMP_STC_EN = STC_TEST_EN &&
- * 3DSTATE_PS_EXTRA::PixelShaderComputesStencil
- *
- * SW parses the pipeline states to generate the following logical
- * signal indicating if PMA FIX can be enabled.
- *
- * STC_PMA_OPT =
- * 3DSTATE_WM::ForceThreadDispatch != 1 &&
- * !(3DSTATE_RASTER::ForceSampleCount != NUMRASTSAMPLES_0) &&
- * 3DSTATE_DEPTH_BUFFER::SURFACE_TYPE != NULL &&
- * 3DSTATE_DEPTH_BUFFER::HIZ Enable &&
- * !(3DSTATE_WM::EDSC_Mode == 2) &&
- * 3DSTATE_PS_EXTRA::PixelShaderValid &&
- * !(3DSTATE_WM_HZ_OP::DepthBufferClear ||
- * 3DSTATE_WM_HZ_OP::DepthBufferResolve ||
- * 3DSTATE_WM_HZ_OP::Hierarchical Depth Buffer Resolve Enable ||
- * 3DSTATE_WM_HZ_OP::StencilBufferClear) &&
- * (COMP_STC_EN || STC_WRITE_EN) &&
- * ((3DSTATE_PS_EXTRA::PixelShaderKillsPixels ||
- * 3DSTATE_WM::ForceKillPix == ON ||
- * 3DSTATE_PS_EXTRA::oMask Present to RenderTarget ||
- * 3DSTATE_PS_BLEND::AlphaToCoverageEnable ||
- * 3DSTATE_PS_BLEND::AlphaTestEnable ||
- * 3DSTATE_WM_CHROMAKEY::ChromaKeyKillEnable) ||
- * (3DSTATE_PS_EXTRA::Pixel Shader Computed Depth mode != PSCDEPTH_OFF))
- */
-
- /* These are always true:
- * 3DSTATE_WM::ForceThreadDispatch != 1 &&
- * !(3DSTATE_RASTER::ForceSampleCount != NUMRASTSAMPLES_0)
- */
-
- /* We only enable the PMA fix if we know for certain that HiZ is enabled.
- * If we don't know whether HiZ is enabled or not, we disable the PMA fix
- * and there is no harm.
- *
- * (3DSTATE_DEPTH_BUFFER::SURFACE_TYPE != NULL) &&
- * 3DSTATE_DEPTH_BUFFER::HIZ Enable
- */
- if (!cmd_buffer->state.hiz_enabled)
- return false;
-
- /* We can't possibly know if HiZ is enabled without the framebuffer */
- assert(cmd_buffer->state.framebuffer);
-
- /* HiZ is enabled so we had better have a depth buffer with HiZ */
- const struct anv_image_view *ds_iview =
- anv_cmd_buffer_get_depth_stencil_view(cmd_buffer);
- assert(ds_iview && ds_iview->image->planes[0].aux_usage == ISL_AUX_USAGE_HIZ);
-
- /* 3DSTATE_PS_EXTRA::PixelShaderValid */
- struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
- if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_FRAGMENT))
- return false;
-
- /* !(3DSTATE_WM::EDSC_Mode == 2) */
- const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline);
- if (wm_prog_data->early_fragment_tests)
- return false;
-
- /* We never use anv_pipeline for HiZ ops so this is trivially true:
- * !(3DSTATE_WM_HZ_OP::DepthBufferClear ||
- * 3DSTATE_WM_HZ_OP::DepthBufferResolve ||
- * 3DSTATE_WM_HZ_OP::Hierarchical Depth Buffer Resolve Enable ||
- * 3DSTATE_WM_HZ_OP::StencilBufferClear)
- */
-
- /* 3DSTATE_STENCIL_BUFFER::STENCIL_BUFFER_ENABLE &&
- * 3DSTATE_WM_DEPTH_STENCIL::StencilTestEnable
- */
- const bool stc_test_en =
- (ds_iview->image->vk.aspects & VK_IMAGE_ASPECT_STENCIL_BIT) &&
- pipeline->stencil_test_enable;
-
- /* 3DSTATE_STENCIL_BUFFER::STENCIL_BUFFER_ENABLE &&
- * (3DSTATE_WM_DEPTH_STENCIL::Stencil Buffer Write Enable &&
- * 3DSTATE_DEPTH_BUFFER::STENCIL_WRITE_ENABLE)
- */
- const bool stc_write_en =
- (ds_iview->image->vk.aspects & VK_IMAGE_ASPECT_STENCIL_BIT) &&
- (cmd_buffer->state.gfx.dynamic.stencil_write_mask.front ||
- cmd_buffer->state.gfx.dynamic.stencil_write_mask.back) &&
- pipeline->writes_stencil;
-
- /* STC_TEST_EN && 3DSTATE_PS_EXTRA::PixelShaderComputesStencil */
- const bool comp_stc_en = stc_test_en && wm_prog_data->computed_stencil;
-
- /* COMP_STC_EN || STC_WRITE_EN */
- if (!(comp_stc_en || stc_write_en))
- return false;
-
- /* (3DSTATE_PS_EXTRA::PixelShaderKillsPixels ||
- * 3DSTATE_WM::ForceKillPix == ON ||
- * 3DSTATE_PS_EXTRA::oMask Present to RenderTarget ||
- * 3DSTATE_PS_BLEND::AlphaToCoverageEnable ||
- * 3DSTATE_PS_BLEND::AlphaTestEnable ||
- * 3DSTATE_WM_CHROMAKEY::ChromaKeyKillEnable) ||
- * (3DSTATE_PS_EXTRA::Pixel Shader Computed Depth mode != PSCDEPTH_OFF)
- */
- return pipeline->kill_pixel ||
- wm_prog_data->computed_depth_mode != PSCDEPTH_OFF;
-}
-
-void
-genX(cmd_buffer_flush_dynamic_state)(struct anv_cmd_buffer *cmd_buffer)
-{
- struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
- struct anv_dynamic_state *d = &cmd_buffer->state.gfx.dynamic;
-
- if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_DYNAMIC_PRIMITIVE_TOPOLOGY) {
- uint32_t topology;
- if (anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_EVAL))
- topology = pipeline->topology;
- else
- topology = genX(vk_to_intel_primitive_type)[d->primitive_topology];
-
- cmd_buffer->state.gfx.primitive_topology = topology;
-
- anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_VF_TOPOLOGY), vft) {
- vft.PrimitiveTopologyType = topology;
- }
- }
-
- if (cmd_buffer->state.gfx.dirty & (ANV_CMD_DIRTY_PIPELINE |
- ANV_CMD_DIRTY_DYNAMIC_LINE_WIDTH)) {
- uint32_t sf_dw[GENX(3DSTATE_SF_length)];
- struct GENX(3DSTATE_SF) sf = {
- GENX(3DSTATE_SF_header),
- };
-#if GFX_VER == 8
- if (cmd_buffer->device->info.is_cherryview) {
- sf.CHVLineWidth = d->line_width;
- } else {
- sf.LineWidth = d->line_width;
- }
-#else
- sf.LineWidth = d->line_width;
-#endif
- GENX(3DSTATE_SF_pack)(NULL, sf_dw, &sf);
- anv_batch_emit_merge(&cmd_buffer->batch, sf_dw, pipeline->gfx8.sf);
- }
-
- if (cmd_buffer->state.gfx.dirty & (ANV_CMD_DIRTY_PIPELINE |
- ANV_CMD_DIRTY_DYNAMIC_DEPTH_BIAS |
- ANV_CMD_DIRTY_DYNAMIC_CULL_MODE |
- ANV_CMD_DIRTY_DYNAMIC_FRONT_FACE |
- ANV_CMD_DIRTY_DYNAMIC_DEPTH_BIAS_ENABLE |
- ANV_CMD_DIRTY_DYNAMIC_PRIMITIVE_TOPOLOGY)) {
- /* Take dynamic primitive topology into account with
- * 3DSTATE_RASTER::APIMode
- * 3DSTATE_RASTER::DXMultisampleRasterizationEnable
- * 3DSTATE_RASTER::AntialiasingEnable
- */
- uint32_t api_mode = 0;
- bool msaa_raster_enable = false;
- bool aa_enable = false;
-
- if (cmd_buffer->state.gfx.pipeline->dynamic_states &
- ANV_CMD_DIRTY_DYNAMIC_PRIMITIVE_TOPOLOGY) {
- VkPrimitiveTopology primitive_topology =
- cmd_buffer->state.gfx.dynamic.primitive_topology;
-
- VkPolygonMode dynamic_raster_mode =
- genX(raster_polygon_mode)(cmd_buffer->state.gfx.pipeline,
- primitive_topology);
-
- genX(rasterization_mode)(
- dynamic_raster_mode, pipeline->line_mode, d->line_width,
- &api_mode, &msaa_raster_enable);
-
- aa_enable =
- anv_rasterization_aa_mode(dynamic_raster_mode,
- pipeline->line_mode);
- }
-
- uint32_t raster_dw[GENX(3DSTATE_RASTER_length)];
- struct GENX(3DSTATE_RASTER) raster = {
- GENX(3DSTATE_RASTER_header),
- .APIMode = api_mode,
- .DXMultisampleRasterizationEnable = msaa_raster_enable,
- .AntialiasingEnable = aa_enable,
- .GlobalDepthOffsetConstant = d->depth_bias.bias,
- .GlobalDepthOffsetScale = d->depth_bias.slope,
- .GlobalDepthOffsetClamp = d->depth_bias.clamp,
- .CullMode = genX(vk_to_intel_cullmode)[d->cull_mode],
- .FrontWinding = genX(vk_to_intel_front_face)[d->front_face],
- .GlobalDepthOffsetEnableSolid = d->depth_bias_enable,
- .GlobalDepthOffsetEnableWireframe = d->depth_bias_enable,
- .GlobalDepthOffsetEnablePoint = d->depth_bias_enable,
- };
- GENX(3DSTATE_RASTER_pack)(NULL, raster_dw, &raster);
- anv_batch_emit_merge(&cmd_buffer->batch, raster_dw,
- pipeline->gfx8.raster);
- }
-
- /* Stencil reference values moved from COLOR_CALC_STATE in gfx8 to
- * 3DSTATE_WM_DEPTH_STENCIL in gfx9. That means the dirty bits get split
- * across different state packets for gfx8 and gfx9. We handle that by
- * using a big old #if switch here.
- */
-#if GFX_VER == 8
- if (cmd_buffer->state.gfx.dirty & (ANV_CMD_DIRTY_DYNAMIC_BLEND_CONSTANTS |
- ANV_CMD_DIRTY_DYNAMIC_STENCIL_REFERENCE)) {
- struct anv_state cc_state =
- anv_cmd_buffer_alloc_dynamic_state(cmd_buffer,
- GENX(COLOR_CALC_STATE_length) * 4,
- 64);
- struct GENX(COLOR_CALC_STATE) cc = {
- .BlendConstantColorRed = d->blend_constants[0],
- .BlendConstantColorGreen = d->blend_constants[1],
- .BlendConstantColorBlue = d->blend_constants[2],
- .BlendConstantColorAlpha = d->blend_constants[3],
- .StencilReferenceValue = d->stencil_reference.front & 0xff,
- .BackfaceStencilReferenceValue = d->stencil_reference.back & 0xff,
- };
- GENX(COLOR_CALC_STATE_pack)(NULL, cc_state.map, &cc);
-
- anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CC_STATE_POINTERS), ccp) {
- ccp.ColorCalcStatePointer = cc_state.offset;
- ccp.ColorCalcStatePointerValid = true;
- }
- }
-
- if (cmd_buffer->state.gfx.dirty & (ANV_CMD_DIRTY_PIPELINE |
- ANV_CMD_DIRTY_RENDER_TARGETS |
- ANV_CMD_DIRTY_DYNAMIC_STENCIL_COMPARE_MASK |
- ANV_CMD_DIRTY_DYNAMIC_STENCIL_WRITE_MASK |
- ANV_CMD_DIRTY_DYNAMIC_DEPTH_TEST_ENABLE |
- ANV_CMD_DIRTY_DYNAMIC_DEPTH_WRITE_ENABLE |
- ANV_CMD_DIRTY_DYNAMIC_DEPTH_COMPARE_OP |
- ANV_CMD_DIRTY_DYNAMIC_STENCIL_TEST_ENABLE |
- ANV_CMD_DIRTY_DYNAMIC_STENCIL_OP)) {
- uint32_t wm_depth_stencil_dw[GENX(3DSTATE_WM_DEPTH_STENCIL_length)];
-
- struct GENX(3DSTATE_WM_DEPTH_STENCIL) wm_depth_stencil = {
- GENX(3DSTATE_WM_DEPTH_STENCIL_header),
-
- .StencilTestMask = d->stencil_compare_mask.front & 0xff,
- .StencilWriteMask = d->stencil_write_mask.front & 0xff,
-
- .BackfaceStencilTestMask = d->stencil_compare_mask.back & 0xff,
- .BackfaceStencilWriteMask = d->stencil_write_mask.back & 0xff,
-
- .StencilBufferWriteEnable =
- (d->stencil_write_mask.front || d->stencil_write_mask.back) &&
- d->stencil_test_enable,
-
- .DepthTestEnable = d->depth_test_enable,
- .DepthBufferWriteEnable = d->depth_test_enable && d->depth_write_enable,
- .DepthTestFunction = genX(vk_to_intel_compare_op)[d->depth_compare_op],
- .StencilTestEnable = d->stencil_test_enable,
- .StencilFailOp = genX(vk_to_intel_stencil_op)[d->stencil_op.front.fail_op],
- .StencilPassDepthPassOp = genX(vk_to_intel_stencil_op)[d->stencil_op.front.pass_op],
- .StencilPassDepthFailOp = genX(vk_to_intel_stencil_op)[d->stencil_op.front.depth_fail_op],
- .StencilTestFunction = genX(vk_to_intel_compare_op)[d->stencil_op.front.compare_op],
- .BackfaceStencilFailOp = genX(vk_to_intel_stencil_op)[d->stencil_op.back.fail_op],
- .BackfaceStencilPassDepthPassOp = genX(vk_to_intel_stencil_op)[d->stencil_op.back.pass_op],
- .BackfaceStencilPassDepthFailOp = genX(vk_to_intel_stencil_op)[d->stencil_op.back.depth_fail_op],
- .BackfaceStencilTestFunction = genX(vk_to_intel_compare_op)[d->stencil_op.back.compare_op],
- };
- GENX(3DSTATE_WM_DEPTH_STENCIL_pack)(NULL, wm_depth_stencil_dw,
- &wm_depth_stencil);
-
- anv_batch_emit_merge(&cmd_buffer->batch, wm_depth_stencil_dw,
- pipeline->gfx8.wm_depth_stencil);
-
- genX(cmd_buffer_enable_pma_fix)(cmd_buffer,
- want_depth_pma_fix(cmd_buffer));
- }
-#else
- if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_DYNAMIC_BLEND_CONSTANTS) {
- struct anv_state cc_state =
- anv_cmd_buffer_alloc_dynamic_state(cmd_buffer,
- GENX(COLOR_CALC_STATE_length) * 4,
- 64);
- struct GENX(COLOR_CALC_STATE) cc = {
- .BlendConstantColorRed = d->blend_constants[0],
- .BlendConstantColorGreen = d->blend_constants[1],
- .BlendConstantColorBlue = d->blend_constants[2],
- .BlendConstantColorAlpha = d->blend_constants[3],
- };
- GENX(COLOR_CALC_STATE_pack)(NULL, cc_state.map, &cc);
-
- anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CC_STATE_POINTERS), ccp) {
- ccp.ColorCalcStatePointer = cc_state.offset;
- ccp.ColorCalcStatePointerValid = true;
- }
- }
-
- if (cmd_buffer->state.gfx.dirty & (ANV_CMD_DIRTY_PIPELINE |
- ANV_CMD_DIRTY_RENDER_TARGETS |
- ANV_CMD_DIRTY_DYNAMIC_STENCIL_COMPARE_MASK |
- ANV_CMD_DIRTY_DYNAMIC_STENCIL_WRITE_MASK |
- ANV_CMD_DIRTY_DYNAMIC_STENCIL_REFERENCE |
- ANV_CMD_DIRTY_DYNAMIC_DEPTH_TEST_ENABLE |
- ANV_CMD_DIRTY_DYNAMIC_DEPTH_WRITE_ENABLE |
- ANV_CMD_DIRTY_DYNAMIC_DEPTH_COMPARE_OP |
- ANV_CMD_DIRTY_DYNAMIC_STENCIL_TEST_ENABLE |
- ANV_CMD_DIRTY_DYNAMIC_STENCIL_OP)) {
- uint32_t dwords[GENX(3DSTATE_WM_DEPTH_STENCIL_length)];
- struct GENX(3DSTATE_WM_DEPTH_STENCIL) wm_depth_stencil = {
- GENX(3DSTATE_WM_DEPTH_STENCIL_header),
-
- .StencilTestMask = d->stencil_compare_mask.front & 0xff,
- .StencilWriteMask = d->stencil_write_mask.front & 0xff,
-
- .BackfaceStencilTestMask = d->stencil_compare_mask.back & 0xff,
- .BackfaceStencilWriteMask = d->stencil_write_mask.back & 0xff,
-
- .StencilReferenceValue = d->stencil_reference.front & 0xff,
- .BackfaceStencilReferenceValue = d->stencil_reference.back & 0xff,
-
- .StencilBufferWriteEnable =
- (d->stencil_write_mask.front || d->stencil_write_mask.back) &&
- d->stencil_test_enable,
-
- .DepthTestEnable = d->depth_test_enable,
- .DepthBufferWriteEnable = d->depth_test_enable && d->depth_write_enable,
- .DepthTestFunction = genX(vk_to_intel_compare_op)[d->depth_compare_op],
- .StencilTestEnable = d->stencil_test_enable,
- .StencilFailOp = genX(vk_to_intel_stencil_op)[d->stencil_op.front.fail_op],
- .StencilPassDepthPassOp = genX(vk_to_intel_stencil_op)[d->stencil_op.front.pass_op],
- .StencilPassDepthFailOp = genX(vk_to_intel_stencil_op)[d->stencil_op.front.depth_fail_op],
- .StencilTestFunction = genX(vk_to_intel_compare_op)[d->stencil_op.front.compare_op],
- .BackfaceStencilFailOp = genX(vk_to_intel_stencil_op)[d->stencil_op.back.fail_op],
- .BackfaceStencilPassDepthPassOp = genX(vk_to_intel_stencil_op)[d->stencil_op.back.pass_op],
- .BackfaceStencilPassDepthFailOp = genX(vk_to_intel_stencil_op)[d->stencil_op.back.depth_fail_op],
- .BackfaceStencilTestFunction = genX(vk_to_intel_compare_op)[d->stencil_op.back.compare_op],
- };
- GENX(3DSTATE_WM_DEPTH_STENCIL_pack)(NULL, dwords, &wm_depth_stencil);
-
- anv_batch_emit_merge(&cmd_buffer->batch, dwords,
- pipeline->gfx9.wm_depth_stencil);
-
- genX(cmd_buffer_enable_pma_fix)(cmd_buffer,
- want_stencil_pma_fix(cmd_buffer));
- }
-#endif
-
-#if GFX_VER >= 12
- if (cmd_buffer->state.gfx.dirty & (ANV_CMD_DIRTY_PIPELINE |
- ANV_CMD_DIRTY_DYNAMIC_DEPTH_BOUNDS |
- ANV_CMD_DIRTY_DYNAMIC_DEPTH_BOUNDS_TEST_ENABLE)) {
- anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_DEPTH_BOUNDS), db) {
- db.DepthBoundsTestValueModifyDisable = false;
- db.DepthBoundsTestEnableModifyDisable = false;
- db.DepthBoundsTestEnable = d->depth_bounds_test_enable;
- db.DepthBoundsTestMinValue = d->depth_bounds.min;
- db.DepthBoundsTestMaxValue = d->depth_bounds.max;
- }
- }
-#endif
-
- if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_DYNAMIC_LINE_STIPPLE) {
- anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_LINE_STIPPLE), ls) {
- ls.LineStipplePattern = d->line_stipple.pattern;
- ls.LineStippleInverseRepeatCount =
- 1.0f / MAX2(1, d->line_stipple.factor);
- ls.LineStippleRepeatCount = d->line_stipple.factor;
- }
- }
-
- if (cmd_buffer->state.gfx.dirty & (ANV_CMD_DIRTY_PIPELINE |
- ANV_CMD_DIRTY_INDEX_BUFFER |
- ANV_CMD_DIRTY_DYNAMIC_PRIMITIVE_RESTART_ENABLE)) {
- anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_VF), vf) {
- vf.IndexedDrawCutIndexEnable = d->primitive_restart_enable;
- vf.CutIndex = cmd_buffer->state.restart_index;
- }
- }
-
- if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_DYNAMIC_SAMPLE_LOCATIONS) {
- genX(emit_sample_pattern)(&cmd_buffer->batch,
- cmd_buffer->state.gfx.dynamic.sample_locations.samples,
- cmd_buffer->state.gfx.dynamic.sample_locations.locations);
- }
-
- if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_DYNAMIC_COLOR_BLEND_STATE ||
- cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_DYNAMIC_LOGIC_OP) {
- const uint8_t color_writes = cmd_buffer->state.gfx.dynamic.color_writes;
- /* Emit 3DSTATE_WM in the hope that we can avoid spawning fragment
- * shader threads.
- */
- bool dirty_color_blend =
- cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_DYNAMIC_COLOR_BLEND_STATE;
-
- if (dirty_color_blend) {
- uint32_t dwords[MAX2(GENX(3DSTATE_WM_length),
- GENX(3DSTATE_PS_BLEND_length))];
- struct GENX(3DSTATE_WM) wm = {
- GENX(3DSTATE_WM_header),
-
- .ForceThreadDispatchEnable = (pipeline->force_fragment_thread_dispatch ||
- !color_writes) ? ForceON : 0,
- };
- GENX(3DSTATE_WM_pack)(NULL, dwords, &wm);
-
- anv_batch_emit_merge(&cmd_buffer->batch, dwords, pipeline->gfx8.wm);
-
- /* Emit 3DSTATE_PS_BLEND to keep it consistent with the rest of the
- * BLEND_STATE_ENTRY state.
- */
- struct GENX(3DSTATE_PS_BLEND) ps_blend = {
- GENX(3DSTATE_PS_BLEND_header),
- .HasWriteableRT = color_writes != 0,
- };
- GENX(3DSTATE_PS_BLEND_pack)(NULL, dwords, &ps_blend);
- anv_batch_emit_merge(&cmd_buffer->batch, dwords, pipeline->gfx8.ps_blend);
- }
-
- /* Blend states of each RT */
- uint32_t surface_count = 0;
- struct anv_pipeline_bind_map *map;
- if (anv_pipeline_has_stage(pipeline, MESA_SHADER_FRAGMENT)) {
- map = &pipeline->shaders[MESA_SHADER_FRAGMENT]->bind_map;
- surface_count = map->surface_count;
- }
-
- uint32_t blend_dws[GENX(BLEND_STATE_length) +
- MAX_RTS * GENX(BLEND_STATE_ENTRY_length)];
- uint32_t *dws = blend_dws;
- memset(blend_dws, 0, sizeof(blend_dws));
-
- /* Skip the BLEND_STATE header dwords; they come from the pipeline
- * when the two are merged below.
- */
- dws += GENX(BLEND_STATE_length);
-
- bool dirty_logic_op =
- cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_DYNAMIC_LOGIC_OP;
-
- for (uint32_t i = 0; i < surface_count; i++) {
- struct anv_pipeline_binding *binding = &map->surface_to_descriptor[i];
- bool write_disabled =
- dirty_color_blend && (color_writes & (1u << binding->index)) == 0;
- struct GENX(BLEND_STATE_ENTRY) entry = {
- .WriteDisableAlpha = write_disabled,
- .WriteDisableRed = write_disabled,
- .WriteDisableGreen = write_disabled,
- .WriteDisableBlue = write_disabled,
- .LogicOpFunction =
- dirty_logic_op ? genX(vk_to_intel_logic_op)[d->logic_op] : 0,
- };
- GENX(BLEND_STATE_ENTRY_pack)(NULL, dws, &entry);
- dws += GENX(BLEND_STATE_ENTRY_length);
- }
-
- uint32_t num_dwords = GENX(BLEND_STATE_length) +
- GENX(BLEND_STATE_ENTRY_length) * surface_count;
-
- struct anv_state blend_states =
- anv_cmd_buffer_merge_dynamic(cmd_buffer, blend_dws,
- pipeline->gfx8.blend_state, num_dwords, 64);
- anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_BLEND_STATE_POINTERS), bsp) {
- bsp.BlendStatePointer = blend_states.offset;
- bsp.BlendStatePointerValid = true;
- }
- }
-
-#if GFX_VER >= 11
- if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_DYNAMIC_SHADING_RATE) {
- struct anv_state cps_states = ANV_STATE_NULL;
-
-#if GFX_VER >= 12
- uint32_t count = cmd_buffer->state.gfx.dynamic.viewport.count;
- cps_states =
- anv_cmd_buffer_alloc_dynamic_state(cmd_buffer,
- GENX(CPS_STATE_length) * 4 * count,
- 32);
-#endif /* GFX_VER >= 12 */
-
- genX(emit_shading_rate)(&cmd_buffer->batch, pipeline, cps_states,
- &cmd_buffer->state.gfx.dynamic);
- }
-#endif /* GFX_VER >= 11 */
-
- cmd_buffer->state.gfx.dirty = 0;
-}
-
-static uint32_t vk_to_intel_index_type(VkIndexType type)
-{
- switch (type) {
- case VK_INDEX_TYPE_UINT8_EXT:
- return INDEX_BYTE;
- case VK_INDEX_TYPE_UINT16:
- return INDEX_WORD;
- case VK_INDEX_TYPE_UINT32:
- return INDEX_DWORD;
- default:
- unreachable("invalid index type");
- }
-}
-
-static uint32_t restart_index_for_type(VkIndexType type)
-{
- switch (type) {
- case VK_INDEX_TYPE_UINT8_EXT:
- return UINT8_MAX;
- case VK_INDEX_TYPE_UINT16:
- return UINT16_MAX;
- case VK_INDEX_TYPE_UINT32:
- return UINT32_MAX;
- default:
- unreachable("invalid index type");
- }
-}
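-
-/* For example, binding an index buffer with VK_INDEX_TYPE_UINT16 below
- * programs 3DSTATE_INDEX_BUFFER::IndexFormat = INDEX_WORD and a primitive
- * restart index of 0xffff, while VK_INDEX_TYPE_UINT32 uses INDEX_DWORD and
- * 0xffffffff.
- */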
-
-void genX(CmdBindIndexBuffer)(
- VkCommandBuffer commandBuffer,
- VkBuffer _buffer,
- VkDeviceSize offset,
- VkIndexType indexType)
-{
- ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
- ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
-
- cmd_buffer->state.restart_index = restart_index_for_type(indexType);
-
- anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_INDEX_BUFFER), ib) {
- ib.IndexFormat = vk_to_intel_index_type(indexType);
- ib.MOCS = anv_mocs(cmd_buffer->device,
- buffer->address.bo,
- ISL_SURF_USAGE_INDEX_BUFFER_BIT);
-#if GFX_VER >= 12
- ib.L3BypassDisable = true;
-#endif
- ib.BufferStartingAddress = anv_address_add(buffer->address, offset);
- ib.BufferSize = buffer->size - offset;
- }
-
- cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_INDEX_BUFFER;
-}
diff --git a/src/intel/vulkan/grl/.gitignore b/src/intel/vulkan/grl/.gitignore
new file mode 100644
index 00000000000..e2850ca03b1
--- /dev/null
+++ b/src/intel/vulkan/grl/.gitignore
@@ -0,0 +1 @@
+parsetab.py
diff --git a/src/intel/vulkan/grl/genX_grl.h b/src/intel/vulkan/grl/genX_grl.h
new file mode 100644
index 00000000000..57aefa72de0
--- /dev/null
+++ b/src/intel/vulkan/grl/genX_grl.h
@@ -0,0 +1,54 @@
+/*
+ * Copyright © 2021 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef ANV_GRL_H
+#define ANV_GRL_H
+
+#include "grl/grl_cl_kernel.h"
+#include "genxml/gen_macros.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct anv_cmd_buffer;
+struct anv_kernel_arg;
+
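+/* Looks up (or compiles and caches) the binary for the given GRL kernel and
+ * dispatches it on the command buffer with the given global work size and
+ * kernel arguments.
+ */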
+void
+genX(grl_dispatch)(struct anv_cmd_buffer *cmd_buffer,
+ enum grl_cl_kernel kernel,
+ const uint32_t *global_size,
+ uint32_t arg_count,
+ const struct anv_kernel_arg *args);
+
+void
+genX(grl_load_rt_uuid)(uint8_t *out_uuid);
+
+uint32_t
+genX(grl_max_scratch_size)(void);
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+#endif /* ANV_GRL_H */
diff --git a/src/intel/vulkan/grl/genX_grl_dispatch.c b/src/intel/vulkan/grl/genX_grl_dispatch.c
new file mode 100644
index 00000000000..aeb76b79bd0
--- /dev/null
+++ b/src/intel/vulkan/grl/genX_grl_dispatch.c
@@ -0,0 +1,113 @@
+/*
+ * Copyright © 2021 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "anv_private.h"
+#include "genX_grl.h"
+
+static struct anv_shader_bin *
+get_shader_bin(struct anv_device *device,
+ enum grl_cl_kernel kernel)
+{
+ const char *key = genX(grl_get_cl_kernel_sha1)(kernel);
+ int key_len = strlen(key);
+
+ bool cache_hit = false;
+ struct anv_shader_bin *bin =
+ anv_device_search_for_kernel(device, device->internal_cache,
+ key, key_len, &cache_hit);
+ if (bin != NULL)
+ return bin;
+
+ uint32_t dummy_param[32];
+ struct brw_kernel kernel_data;
+ genX(grl_get_cl_kernel)(&kernel_data, kernel);
+
+ assert(kernel_data.prog_data.base.nr_params <= ARRAY_SIZE(dummy_param));
+ kernel_data.prog_data.base.param = dummy_param;
+
+ struct anv_push_descriptor_info empty_push_desc_info = {};
+ struct anv_pipeline_bind_map bind_map = {
+ .kernel_args_size = kernel_data.args_size,
+ .kernel_arg_count = kernel_data.arg_count,
+ .kernel_args = (struct brw_kernel_arg_desc *)kernel_data.args,
+ };
+
+ struct anv_shader_upload_params upload_params = {
+ .stage = MESA_SHADER_KERNEL,
+ .key_data = key,
+ .key_size = key_len,
+ .kernel_data = kernel_data.code,
+ .kernel_size = kernel_data.prog_data.base.program_size,
+ .prog_data = &kernel_data.prog_data.base,
+ .prog_data_size = sizeof(kernel_data.prog_data),
+ .bind_map = &bind_map,
+ .push_desc_info = &empty_push_desc_info,
+ };
+
+ bin = anv_device_upload_kernel(device, device->internal_cache,
+ &upload_params);
+
+ /* The cache already has a reference and it's not going anywhere so there
+ * is no need to hold a second reference.
+ */
+ anv_shader_bin_unref(device, bin);
+
+ return bin;
+}
+
+void
+genX(grl_dispatch)(struct anv_cmd_buffer *cmd_buffer,
+ enum grl_cl_kernel kernel,
+ const uint32_t *global_size,
+ uint32_t arg_count,
+ const struct anv_kernel_arg *args)
+{
+ struct anv_device *device = cmd_buffer->device;
+
+ const struct intel_l3_weights w =
+ intel_get_default_l3_weights(device->info, true, true);
+
+ struct anv_kernel ak = {
+ .bin = get_shader_bin(device, kernel),
+ .l3_config = intel_get_l3_config(device->info, w),
+ };
+
+ genX(cmd_buffer_dispatch_kernel)(cmd_buffer, &ak, global_size,
+ arg_count, args);
+}
+
+uint32_t
+genX(grl_max_scratch_size)(void)
+{
+ uint32_t scratch_size = 0;
+
+ for (uint32_t i = 0; i < GRL_CL_KERNEL_MAX; i++) {
+ struct brw_kernel kernel_data;
+ genX(grl_get_cl_kernel)(&kernel_data, i);
+
+ scratch_size = MAX2(kernel_data.prog_data.base.total_scratch,
+ scratch_size);
+ }
+
+ return scratch_size;
+}
diff --git a/src/intel/vulkan/grl/genX_grl_uuid.cpp b/src/intel/vulkan/grl/genX_grl_uuid.cpp
new file mode 100644
index 00000000000..cf6b425fe2b
--- /dev/null
+++ b/src/intel/vulkan/grl/genX_grl_uuid.cpp
@@ -0,0 +1,40 @@
+/*
+ * Copyright © 2021 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <assert.h>
+#include <string.h>
+
+#include "genX_grl.h"
+#include "include/GRLGen12.h"
+
+#include "vulkan/vulkan_core.h"
+
+extern "C" void
+genX(grl_load_rt_uuid)(uint8_t *out_uuid);
+
+extern "C" void
+genX(grl_load_rt_uuid)(uint8_t *out_uuid)
+{
+ assert(sizeof(GRL::RTAS::GEN12::BVH_MAGIC) == VK_UUID_SIZE);
+ memcpy(out_uuid, GRL::RTAS::GEN12::BVH_MAGIC, VK_UUID_SIZE);
+}
diff --git a/src/intel/vulkan/grl/gpu/AABB.h b/src/intel/vulkan/grl/gpu/AABB.h
new file mode 100644
index 00000000000..11d848e3c09
--- /dev/null
+++ b/src/intel/vulkan/grl/gpu/AABB.h
@@ -0,0 +1,450 @@
+//
+// Copyright (C) 2009-2021 Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+//
+//
+
+#pragma once
+
+#include "shared.h"
+#include "intrinsics.h"
+#ifndef __OPENCL_VERSION__
+#include "stdio.h"
+#endif
+
+GRL_NAMESPACE_BEGIN(GRL)
+GRL_NAMESPACE_BEGIN(RTAS)
+/* ====== QUAD ENCODING config ====== */
+
+#define QUAD_GEOMID_BITS 27 // dxr limit is 2^24 geos... we have headroom
+#define QUAD_PRIMID_DIFF_BITS (32 - QUAD_GEOMID_BITS)
+#define QUAD_GEOMID_MASK ((1<<QUAD_GEOMID_BITS)-1)
+
+#define QUAD_PRIMID_BITS 29 // dxr limit is 2^29 prims total within one blas
+#define QUAD_PRIMID_MASK ((1<<QUAD_PRIMID_BITS)-1)
+
+#define INSTANCE_ID_BITS 24
+#define INSTANCE_ID_MASK ((1<<INSTANCE_ID_BITS)-1)
+
+// JDB TODO: Make this a separate, dedicated structure. Aliasing a float4 AABB as a primref is needlessly obfuscated.
+
+typedef struct AABB PrimRef;
+
+GRL_INLINE void AABB_init(struct AABB *aabb)
+{
+ aabb->lower = (float4)(INFINITY, INFINITY, INFINITY, 0);
+ aabb->upper = -(float4)(INFINITY, INFINITY, INFINITY, 0);
+}
+
+GRL_INLINE uint PRIMREF_geomID( PrimRef* aabb)
+{
+ const uint v = as_uint(aabb->lower.w);
+ return v & QUAD_GEOMID_MASK;
+}
+
+GRL_INLINE uint PRIMREF_primID0( PrimRef* aabb)
+{
+ return as_uint( aabb->upper.w ) & QUAD_PRIMID_MASK;
+}
+
+GRL_INLINE uint PRIMREF_primID1( PrimRef* aabb)
+{
+ const uint v = as_uint(aabb->lower.w);
+ const uint primID0 = as_uint(aabb->upper.w) & QUAD_PRIMID_MASK;
+ const uint deltaID = v >> QUAD_GEOMID_BITS;
+ const uint primID1 = primID0 + deltaID;
+ return primID1;
+}
+
+GRL_INLINE uint PRIMREF_geomFlags( PrimRef* aabb )
+{
+ const uint v = as_uint( aabb->upper.w );
+ return (v >> QUAD_PRIMID_BITS) ;
+}
+
+GRL_INLINE uint PRIMREF_instanceIndex( PrimRef* aabb )
+{
+ return as_uint(aabb->lower.w) & INSTANCE_ID_MASK;
+}
+
+GRL_INLINE uchar PRIMREF_instanceMask( PrimRef* aabb )
+{
+ return as_uint(aabb->lower.w) >> INSTANCE_ID_BITS;
+}
+
+GRL_INLINE void PRIMREF_setProceduralMetaData( PrimRef* primref, uint geomID, uint primID, uint geomFlags )
+{
+ /* encode geomID, primID */
+ uint flags = (geomFlags << QUAD_PRIMID_BITS);
+ primref->lower.w = as_float( geomID );
+ primref->upper.w = as_float( primID | flags );
+}
+
+GRL_INLINE void PRIMREF_setQuadMetaData( PrimRef* primref, uint primID0, uint primID1, uint geomID, uint geomFlags )
+{
+ const uint primID_diff = primID1 - primID0;
+ uint flags = geomFlags << QUAD_PRIMID_BITS;
+
+ primref->lower.w = as_float( geomID | (primID_diff << QUAD_GEOMID_BITS) );
+ primref->upper.w = as_float( (primID0 | flags) );
+}
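+
+/* Worked example of the packing above (arbitrary illustrative values): for
+ * geomID = 5, primID0 = 10, primID1 = 11 and geomFlags = 1, primID_diff = 1,
+ * so lower.w = 5 | (1 << QUAD_GEOMID_BITS) = 0x08000005 and
+ * upper.w = 10 | (1 << QUAD_PRIMID_BITS) = 0x2000000a. PRIMREF_geomID(),
+ * PRIMREF_primID0(), PRIMREF_primID1() and PRIMREF_geomFlags() then recover
+ * 5, 10, 11 and 1 respectively.
+ */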
+
+GRL_INLINE void PRIMREF_setAABB( PrimRef* primref, float3 lower, float3 upper )
+{
+ primref->lower.xyz = lower.xyz;
+ primref->upper.xyz = upper.xyz;
+}
+
+GRL_INLINE PrimRef PRIMREF_set_instance( float3 lower, float3 upper, uint instanceIndex, uint instanceMask, uint rootOffset, bool is_procedural )
+{
+ PrimRef new_ref;
+ new_ref.lower.xyz = lower;
+ new_ref.lower.w = as_float(instanceIndex | (instanceMask << 24));
+ new_ref.upper.xyz = upper;
+ new_ref.upper.w = as_float(rootOffset + (is_procedural? 0x80000000 : 0));
+ return new_ref;
+}
+
+GRL_INLINE bool PRIMREF_isProceduralInstance( PrimRef* primref )
+{
+ return (as_uint(primref->upper.w) & 0x80000000) != 0;
+}
+
+GRL_INLINE uint PRIMREF_instanceRootNodeOffset(PrimRef* primref)
+{
+ return (as_uint(primref->upper.w) & 0x7fffffff);
+}
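+
+/* Worked example of the instance encoding (arbitrary illustrative values):
+ * PRIMREF_set_instance(lower, upper, 7, 0xff, 0x40, true) stores
+ * lower.w = 7 | (0xff << 24) = 0xff000007 and
+ * upper.w = 0x40 + 0x80000000 = 0x80000040, so PRIMREF_instanceIndex() = 7,
+ * PRIMREF_instanceMask() = 0xff, PRIMREF_instanceRootNodeOffset() = 0x40 and
+ * PRIMREF_isProceduralInstance() returns true.
+ */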
+
+GRL_INLINE float3 PRIMREF_lower( PrimRef* primref )
+{
+ return primref->lower.xyz;
+}
+GRL_INLINE float3 PRIMREF_upper( PrimRef* primref )
+{
+ return primref->upper.xyz;
+}
+
+GRL_INLINE void AABB_extend(struct AABB *aabb, struct AABB *v)
+{
+ aabb->lower = min(aabb->lower, v->lower);
+ aabb->upper = max(aabb->upper, v->upper);
+}
+
+GRL_INLINE void AABB_extend_point(struct AABB *aabb, const float4 p)
+{
+ aabb->lower = min(aabb->lower, p);
+ aabb->upper = max(aabb->upper, p);
+}
+
+GRL_INLINE void AABB_extendlu(struct AABB *aabb, const float4 lower, const float4 upper)
+{
+ aabb->lower = min(aabb->lower, lower);
+ aabb->upper = max(aabb->upper, upper);
+}
+
+GRL_INLINE struct AABB AABB_enlarge(struct AABB *aabb, const float v)
+{
+ struct AABB box;
+ box.lower = aabb->lower - (float4)v;
+ box.upper = aabb->upper + (float4)v;
+ return box;
+}
+
+GRL_INLINE void AABB_intersect(struct AABB *aabb, struct AABB *v)
+{
+ aabb->lower = max(aabb->lower, v->lower);
+ aabb->upper = min(aabb->upper, v->upper);
+}
+
+GRL_INLINE float4 AABB_size(struct AABB *aabb)
+{
+ return aabb->upper - aabb->lower;
+}
+
+GRL_INLINE float4 AABB_centroid2(struct AABB *aabb)
+{
+ return aabb->lower + aabb->upper;
+}
+
+GRL_INLINE float AABB_halfArea(struct AABB *aabb)
+{
+ const float4 d = AABB_size(aabb);
+ return halfarea(d.xyz);
+}
+
+GRL_INLINE float AABB_intersecion_size(struct AABB* aabb, struct AABB* v)
+{
+ struct AABB temp = *aabb;
+ AABB_intersect(&temp, v);
+ float4 len = AABB_size(&temp);
+ float ret = 0.0f;
+ if (len.x >= 0.0f && len.y >= 0.0f && len.z >= 0.0f) {
+ float3 len3 = { len.x, len.y, len.z };
+ ret = halfarea(len3);
+ }
+ return ret;
+}
+
+GRL_INLINE bool AABB_subset(struct AABB* small, struct AABB* big)
+{
+ const int4 b0 = small->lower >= big->lower;
+ const int4 b1 = small->upper <= big->upper;
+ const int4 b = b0 & b1;
+ return b.x & b.y & b.z;
+}
+
+GRL_INLINE struct AABB AABBfromAABB3f(const struct AABB3f box)
+{
+ struct AABB box4d = {
+ {box.lower[0], box.lower[1], box.lower[2], 0.0f},
+ {box.upper[0], box.upper[1], box.upper[2], 0.0f}
+ };
+ return box4d;
+}
+
+GRL_INLINE struct AABB3f AABB3fFromAABB(const struct AABB box)
+{
+ struct AABB3f box3d = {
+ {box.lower[0], box.lower[1], box.lower[2]},
+ {box.upper[0], box.upper[1], box.upper[2]}
+ };
+ return box3d;
+}
+
+GRL_INLINE bool AABB_verify(struct AABB* aabb)
+{
+ bool error = false;
+ if (aabb->lower.x > aabb->upper.x)
+ error = true;
+ if (aabb->lower.y > aabb->upper.y)
+ error = true;
+ if (aabb->lower.z > aabb->upper.z)
+ error = true;
+ if (!isfinite(aabb->lower.x))
+ error = true;
+ if (!isfinite(aabb->lower.y))
+ error = true;
+ if (!isfinite(aabb->lower.z))
+ error = true;
+ if (!isfinite(aabb->upper.x))
+ error = true;
+ if (!isfinite(aabb->upper.y))
+ error = true;
+ if (!isfinite(aabb->upper.z))
+ error = true;
+ return error;
+}
+
+GRL_INLINE void AABB_print(struct AABB* aabb)
+{
+ printf("AABB {\n area = %f\n lower = %f\n upper = %f\n geomID = %i primID0 = %i primID1 = %i\n aabb->lower.w = %x aabb->upper.w = %x }\n",
+ AABB_halfArea(aabb),
+ aabb->lower.xyz,
+ aabb->upper.xyz,
+ PRIMREF_geomID(aabb),
+ PRIMREF_primID0(aabb),
+ PRIMREF_primID1(aabb),
+ as_uint(aabb->lower.w),
+ as_uint(aabb->upper.w));
+}
+
+#ifdef __OPENCL_VERSION__
+
+GRL_INLINE PrimRef PrimRef_sub_group_shuffle(PrimRef* primRef, const uint slotID)
+{
+ PrimRef shuffledPrimref;
+ shuffledPrimref.lower.x = intel_sub_group_shuffle(primRef->lower.x, slotID);
+ shuffledPrimref.lower.y = intel_sub_group_shuffle(primRef->lower.y, slotID);
+ shuffledPrimref.lower.z = intel_sub_group_shuffle(primRef->lower.z, slotID);
+ shuffledPrimref.lower.w = intel_sub_group_shuffle(primRef->lower.w, slotID);
+ shuffledPrimref.upper.x = intel_sub_group_shuffle(primRef->upper.x, slotID);
+ shuffledPrimref.upper.y = intel_sub_group_shuffle(primRef->upper.y, slotID);
+ shuffledPrimref.upper.z = intel_sub_group_shuffle(primRef->upper.z, slotID);
+ shuffledPrimref.upper.w = intel_sub_group_shuffle(primRef->upper.w, slotID);
+ return shuffledPrimref;
+}
+
+GRL_INLINE struct AABB AABB_sub_group_broadcast(struct AABB *aabb, const uint slotID)
+{
+ struct AABB bounds;
+ bounds.lower.x = sub_group_broadcast(aabb->lower.x, slotID);
+ bounds.lower.y = sub_group_broadcast(aabb->lower.y, slotID);
+ bounds.lower.z = sub_group_broadcast(aabb->lower.z, slotID);
+ bounds.lower.w = 0;
+ bounds.upper.x = sub_group_broadcast(aabb->upper.x, slotID);
+ bounds.upper.y = sub_group_broadcast(aabb->upper.y, slotID);
+ bounds.upper.z = sub_group_broadcast(aabb->upper.z, slotID);
+ bounds.upper.w = 0;
+ return bounds;
+}
+GRL_INLINE struct AABB AABB_sub_group_shuffle(struct AABB* aabb, const uint slotID)
+{
+ struct AABB bounds;
+ bounds.lower.x = intel_sub_group_shuffle(aabb->lower.x, slotID);
+ bounds.lower.y = intel_sub_group_shuffle(aabb->lower.y, slotID);
+ bounds.lower.z = intel_sub_group_shuffle(aabb->lower.z, slotID);
+ bounds.lower.w = 0;
+ bounds.upper.x = intel_sub_group_shuffle(aabb->upper.x, slotID);
+ bounds.upper.y = intel_sub_group_shuffle(aabb->upper.y, slotID);
+ bounds.upper.z = intel_sub_group_shuffle(aabb->upper.z, slotID);
+ bounds.upper.w = 0;
+ return bounds;
+}
+
+GRL_INLINE uint AABB_sub_group_shuffle_coordPerLane(struct AABB* aabb, const uint slotID)
+{
+ float coordData[8] = {
+ sub_group_broadcast(aabb->lower.x, slotID),
+ sub_group_broadcast(aabb->lower.y, slotID),
+ sub_group_broadcast(aabb->lower.z, slotID),
+ sub_group_broadcast(aabb->lower.w, slotID),
+ sub_group_broadcast(aabb->upper.x, slotID),
+ sub_group_broadcast(aabb->upper.y, slotID),
+ sub_group_broadcast(aabb->upper.z, slotID),
+ sub_group_broadcast(aabb->upper.w, slotID) };
+
+ uint coordDataFiltered;
+ const uint lane = get_sub_group_local_id();
+ if (lane < 8) coordDataFiltered = as_uint(coordData[lane]);
+ return coordDataFiltered;
+}
+
+GRL_INLINE struct AABB AABB_sub_group_reduce(struct AABB *aabb)
+{
+ struct AABB bounds;
+ bounds.lower.x = sub_group_reduce_min(aabb->lower.x);
+ bounds.lower.y = sub_group_reduce_min(aabb->lower.y);
+ bounds.lower.z = sub_group_reduce_min(aabb->lower.z);
+ bounds.lower.w = 0;
+ bounds.upper.x = sub_group_reduce_max(aabb->upper.x);
+ bounds.upper.y = sub_group_reduce_max(aabb->upper.y);
+ bounds.upper.z = sub_group_reduce_max(aabb->upper.z);
+ bounds.upper.w = 0;
+ return bounds;
+}
+
+
+GRL_INLINE struct AABB AABB_sub_group_reduce_N6( struct AABB* aabb )
+{
+ float3 l = aabb->lower.xyz;
+ float3 u = aabb->upper.xyz;
+ l = min( l, intel_sub_group_shuffle_down( l, l, 4 ) );
+ l = min( l, intel_sub_group_shuffle_down( l, l, 2 ) );
+ l = min( l, intel_sub_group_shuffle_down( l, l, 1 ) );
+ u = max( u, intel_sub_group_shuffle_down( u, u, 4 ) );
+ u = max( u, intel_sub_group_shuffle_down( u, u, 2 ) );
+ u = max( u, intel_sub_group_shuffle_down( u, u, 1 ) );
+
+ struct AABB bounds;
+ bounds.lower.x = l.x;
+ bounds.lower.y = l.y;
+ bounds.lower.z = l.z;
+ bounds.lower.w = 0;
+ bounds.upper.x = u.x;
+ bounds.upper.y = u.y;
+ bounds.upper.z = u.z;
+ bounds.upper.w = 0;
+ return bounds;
+}
+
+
+GRL_INLINE struct AABB AABB_work_group_reduce(struct AABB *aabb)
+{
+ struct AABB bounds;
+ bounds.lower.x = work_group_reduce_min(aabb->lower.x);
+ bounds.lower.y = work_group_reduce_min(aabb->lower.y);
+ bounds.lower.z = work_group_reduce_min(aabb->lower.z);
+ bounds.upper.x = work_group_reduce_max(aabb->upper.x);
+ bounds.upper.y = work_group_reduce_max(aabb->upper.y);
+ bounds.upper.z = work_group_reduce_max(aabb->upper.z);
+ return bounds;
+}
+
+GRL_INLINE struct AABB AABB_sub_group_scan_exclusive_min_max(struct AABB *aabb)
+{
+ struct AABB bounds;
+ bounds.lower.x = sub_group_scan_exclusive_min(aabb->lower.x);
+ bounds.lower.y = sub_group_scan_exclusive_min(aabb->lower.y);
+ bounds.lower.z = sub_group_scan_exclusive_min(aabb->lower.z);
+ bounds.lower.w = 0;
+ bounds.upper.x = sub_group_scan_exclusive_max(aabb->upper.x);
+ bounds.upper.y = sub_group_scan_exclusive_max(aabb->upper.y);
+ bounds.upper.z = sub_group_scan_exclusive_max(aabb->upper.z);
+ bounds.upper.w = 0;
+ return bounds;
+}
+
+GRL_INLINE struct AABB AABB_sub_group_scan_inclusive_min_max(struct AABB *aabb)
+{
+ struct AABB bounds;
+ bounds.lower.x = sub_group_scan_inclusive_min(aabb->lower.x);
+ bounds.lower.y = sub_group_scan_inclusive_min(aabb->lower.y);
+ bounds.lower.z = sub_group_scan_inclusive_min(aabb->lower.z);
+ bounds.lower.w = 0;
+ bounds.upper.x = sub_group_scan_inclusive_max(aabb->upper.x);
+ bounds.upper.y = sub_group_scan_inclusive_max(aabb->upper.y);
+ bounds.upper.z = sub_group_scan_inclusive_max(aabb->upper.z);
+ bounds.upper.w = 0;
+ return bounds;
+}
+
+GRL_INLINE void AABB_global_atomic_merge(global struct AABB *global_aabb, struct AABB *aabb)
+{
+ atomic_min((volatile __global float *)&global_aabb->lower + 0, aabb->lower.x);
+ atomic_min((volatile __global float *)&global_aabb->lower + 1, aabb->lower.y);
+ atomic_min((volatile __global float *)&global_aabb->lower + 2, aabb->lower.z);
+ atomic_max((volatile __global float *)&global_aabb->upper + 0, aabb->upper.x);
+ atomic_max((volatile __global float *)&global_aabb->upper + 1, aabb->upper.y);
+ atomic_max((volatile __global float *)&global_aabb->upper + 2, aabb->upper.z);
+}
+
+GRL_INLINE void AABB_global_atomic_merge_lu(global struct AABB* global_aabb, float3 lower, float3 upper )
+{
+ atomic_min((volatile __global float*) & global_aabb->lower + 0, lower.x);
+ atomic_min((volatile __global float*) & global_aabb->lower + 1, lower.y);
+ atomic_min((volatile __global float*) & global_aabb->lower + 2, lower.z);
+ atomic_max((volatile __global float*) & global_aabb->upper + 0, upper.x);
+ atomic_max((volatile __global float*) & global_aabb->upper + 1, upper.y);
+ atomic_max((volatile __global float*) & global_aabb->upper + 2, upper.z);
+}
+
+GRL_INLINE void AABB_global_atomic_merge_sub_group_lu(uniform global struct AABB* aabb, float3 lower, float3 upper)
+{
+ uint lane = get_sub_group_local_id();
+ float l[3];
+ l[0] = sub_group_reduce_min(lower.x);
+ l[1] = sub_group_reduce_min(lower.y);
+ l[2] = sub_group_reduce_min(lower.z);
+ float u[3];
+ u[0] = sub_group_reduce_max(upper.x);
+ u[1] = sub_group_reduce_max(upper.y);
+ u[2] = sub_group_reduce_max(upper.z);
+
+ if (lane < 3)
+ {
+ atomic_min((global float*)&aabb->lower + lane, l[lane]);
+ atomic_max((global float*)&aabb->upper + lane, u[lane]);
+ }
+}
+
+
+GRL_INLINE void AABB_local_atomic_merge(local struct AABB *aabb, const float4 lower, const float4 upper)
+{
+ if (lower.x < aabb->lower.x)
+ atomic_min((local float *)&aabb->lower + 0, lower.x);
+ if (lower.y < aabb->lower.y)
+ atomic_min((local float *)&aabb->lower + 1, lower.y);
+ if (lower.z < aabb->lower.z)
+ atomic_min((local float *)&aabb->lower + 2, lower.z);
+ if (upper.x > aabb->upper.x)
+ atomic_max((local float *)&aabb->upper + 0, upper.x);
+ if (upper.y > aabb->upper.y)
+ atomic_max((local float *)&aabb->upper + 1, upper.y);
+ if (upper.z > aabb->upper.z)
+ atomic_max((local float *)&aabb->upper + 2, upper.z);
+}
+#endif
+
+GRL_NAMESPACE_END(RTAS)
+GRL_NAMESPACE_END(GRL)
\ No newline at end of file
diff --git a/src/intel/vulkan/grl/gpu/api_interface.h b/src/intel/vulkan/grl/gpu/api_interface.h
new file mode 100644
index 00000000000..71a1fff6327
--- /dev/null
+++ b/src/intel/vulkan/grl/gpu/api_interface.h
@@ -0,0 +1,840 @@
+//
+// Copyright (C) 2009-2021 Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+//
+//
+
+#pragma once
+#include "GRLStructs.h"
+#include "shared.h"
+#include "libs/lsc_intrinsics.h"
+
+typedef struct Geo GRL_RAYTRACING_GEOMETRY_DESC;
+
+typedef struct GRL_RAYTRACING_AABB
+{
+ float MinX;
+ float MinY;
+ float MinZ;
+ float MaxX;
+ float MaxY;
+ float MaxZ;
+} GRL_RAYTRACING_AABB;
+
+GRL_INLINE void GLR_set_raytracing_aabb(GRL_RAYTRACING_AABB* dest, struct AABB* source)
+{
+ dest->MinX = source->lower.x;
+ dest->MinY = source->lower.y;
+ dest->MinZ = source->lower.z;
+ dest->MaxX = source->upper.x;
+ dest->MaxY = source->upper.y;
+ dest->MaxZ = source->upper.z;
+}
+
+GRL_INLINE uint3 GRL_load_triangle(global GRL_RAYTRACING_GEOMETRY_DESC* geomDesc, const uint triID)
+{
+ global char* indices = (global char*)geomDesc->Desc.Triangles.pIndexBuffer;
+ uint index_format = geomDesc->Desc.Triangles.IndexFormat;
+
+ if (index_format == INDEX_FORMAT_R32_UINT)
+ {
+ const uint* data = (const uint*)(indices + triID * 3 * 4);
+ return (uint3)(data[0], data[1], data[2]);
+ }
+ else if (index_format == INDEX_FORMAT_NONE)
+ {
+ return (uint3)(triID * 3, triID * 3 + 1, triID * 3 + 2);
+ }
+ else
+ {
+ const ushort* data = (const ushort*)(indices + triID * 3 * 2);
+ return (uint3)(data[0], data[1], data[2]);
+ }
+}
+
+GRL_INLINE uint3 GRL_load_indices_from_buffer(global char* indices, const uint index_format, const uint triID)
+{
+ if (index_format == INDEX_FORMAT_R32_UINT)
+ {
+ return load_uint3_L1C_L3C((global uint3*)(indices + triID * 3 * 4), 0);
+ }
+ else if (index_format == INDEX_FORMAT_NONE)
+ {
+ return (uint3)(triID * 3, triID * 3 + 1, triID * 3 + 2);
+ }
+ else
+ {
+ const ushort* data = (const ushort*)(indices + triID * 3 * 2);
+ return (uint3)(data[0], data[1], data[2]);
+ }
+}
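+
+/* Example offsets (arbitrary triID): for triID = 4, R32 indices are read
+ * from byte offset 4 * 3 * 4 = 48, R16 indices from byte offset
+ * 4 * 3 * 2 = 24, and INDEX_FORMAT_NONE simply yields the vertex IDs
+ * (12, 13, 14).
+ */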
+
+// Load all 3 indices from one triangle, and a single index from another
+GRL_INLINE uint4 GRL_load_quad_indices(global GRL_RAYTRACING_GEOMETRY_DESC* geomDesc, uint triID, uint triID_1, ushort fourth_vert)
+{
+ global char* indices = (global char*)geomDesc->Desc.Triangles.pIndexBuffer;
+ uint index_format = geomDesc->Desc.Triangles.IndexFormat;
+
+ if (index_format == INDEX_FORMAT_R32_UINT)
+ {
+ const uint* data0 = (const uint*)(indices + triID * 3 * 4);
+ const uint* data1 = (const uint*)(indices + triID_1 * 3 * 4);
+ return (uint4)(data0[0], data0[1], data0[2], data1[fourth_vert]);
+ }
+ else if (index_format == INDEX_FORMAT_NONE)
+ {
+ return (uint4)(triID * 3, triID * 3 + 1, triID * 3 + 2, triID_1 * 3 + fourth_vert);
+ }
+ else
+ {
+ const ushort* data0 = (const ushort*)(indices + triID * 3 * 2);
+ const ushort* data1 = (const ushort*)(indices + triID_1 * 3 * 2);
+ return (uint4)(data0[0], data0[1], data0[2], data1[fourth_vert]);
+ }
+}
+
+GRL_INLINE void GRL_set_Type(GRL_RAYTRACING_GEOMETRY_DESC* geomDesc, GeometryType type)
+{
+ geomDesc->Type = type;
+}
+
+GRL_INLINE GeometryType GRL_get_Type(const GRL_RAYTRACING_GEOMETRY_DESC* geomDesc)
+{
+ return geomDesc->Type;
+}
+
+GRL_INLINE void GRL_set_Flags(GRL_RAYTRACING_GEOMETRY_DESC* geomDesc, uint8_t flags)
+{
+ geomDesc->Flags = flags;
+}
+
+GRL_INLINE uint8_t GRL_get_Flags(const GRL_RAYTRACING_GEOMETRY_DESC* geomDesc)
+{
+ return geomDesc->Flags;
+}
+
+GRL_INLINE void GRL_set_triangles_Transform(GRL_RAYTRACING_GEOMETRY_DESC* geomDesc, gpuva_t transform)
+{
+ geomDesc->Desc.Triangles.pTransformBuffer = transform;
+}
+
+GRL_INLINE gpuva_t GRL_get_triangles_Transform(const GRL_RAYTRACING_GEOMETRY_DESC* geomDesc)
+{
+ return geomDesc->Desc.Triangles.pTransformBuffer;
+}
+
+GRL_INLINE void GRL_set_triangles_IndexFormat(GRL_RAYTRACING_GEOMETRY_DESC* geomDesc, IndexFormat format)
+{
+ geomDesc->Desc.Triangles.IndexFormat = format;
+}
+
+GRL_INLINE IndexFormat GRL_get_triangles_IndexFormat(const GRL_RAYTRACING_GEOMETRY_DESC* geomDesc)
+{
+ return geomDesc->Desc.Triangles.IndexFormat;
+}
+
+GRL_INLINE void GRL_set_triangles_VertexFormat(GRL_RAYTRACING_GEOMETRY_DESC* geomDesc, VertexFormat format)
+{
+ geomDesc->Desc.Triangles.VertexFormat = format;
+}
+
+GRL_INLINE VertexFormat GRL_get_triangles_VertexFormat(const GRL_RAYTRACING_GEOMETRY_DESC* geomDesc)
+{
+ return geomDesc->Desc.Triangles.VertexFormat;
+}
+
+GRL_INLINE void GRL_set_triangles_IndexCount(GRL_RAYTRACING_GEOMETRY_DESC* geomDesc, dword count)
+{
+ geomDesc->Desc.Triangles.IndexCount = count;
+}
+
+GRL_INLINE dword GRL_get_triangles_IndexCount(const GRL_RAYTRACING_GEOMETRY_DESC* geomDesc)
+{
+ return geomDesc->Desc.Triangles.IndexCount;
+}
+
+GRL_INLINE void GRL_set_triangles_VertexCount(GRL_RAYTRACING_GEOMETRY_DESC* geomDesc, dword count)
+{
+ geomDesc->Desc.Triangles.VertexCount = count;
+}
+
+GRL_INLINE dword GRL_get_triangles_VertexCount(const GRL_RAYTRACING_GEOMETRY_DESC* geomDesc)
+{
+ return geomDesc->Desc.Triangles.VertexCount;
+}
+
+GRL_INLINE void GRL_set_triangles_IndexBuffer(GRL_RAYTRACING_GEOMETRY_DESC* geomDesc, gpuva_t buffer)
+{
+ geomDesc->Desc.Triangles.pIndexBuffer = buffer;
+}
+
+GRL_INLINE gpuva_t GRL_get_triangles_IndexBuffer(const GRL_RAYTRACING_GEOMETRY_DESC* geomDesc)
+{
+ return geomDesc->Desc.Triangles.pIndexBuffer;
+}
+
+GRL_INLINE void GRL_set_triangles_VertexBuffer_StartAddress(GRL_RAYTRACING_GEOMETRY_DESC* geomDesc, gpuva_t address)
+{
+ geomDesc->Desc.Triangles.pVertexBuffer = address;
+}
+
+GRL_INLINE gpuva_t GRL_get_triangles_VertexBuffer_StartAddress(const GRL_RAYTRACING_GEOMETRY_DESC* geomDesc)
+{
+ return geomDesc->Desc.Triangles.pVertexBuffer;
+}
+
+GRL_INLINE void GRL_set_triangles_VertexBuffer_StrideInBytes(GRL_RAYTRACING_GEOMETRY_DESC* geomDesc, unsigned long stride)
+{
+ geomDesc->Desc.Triangles.VertexBufferByteStride = stride;
+}
+
+GRL_INLINE unsigned long GRL_get_triangles_VertexBuffer_StrideInBytes(const GRL_RAYTRACING_GEOMETRY_DESC* geomDesc)
+{
+ return geomDesc->Desc.Triangles.VertexBufferByteStride;
+}
+
+GRL_INLINE unsigned long GRL_get_triangles_IndexFormatSizeInBytes(GRL_RAYTRACING_GEOMETRY_DESC* geomDesc)
+{
+ return (unsigned long)(geomDesc->Desc.Triangles.IndexFormat);
+}
+
+GRL_INLINE void GRL_set_procedurals_AABBCount(GRL_RAYTRACING_GEOMETRY_DESC* geomDesc, dword count)
+{
+ geomDesc->Desc.Procedural.AABBCount = count;
+}
+
+GRL_INLINE dword GRL_get_procedurals_AABBCount(const GRL_RAYTRACING_GEOMETRY_DESC* geomDesc)
+{
+ return geomDesc->Desc.Procedural.AABBCount;
+}
+
+GRL_INLINE void GRL_set_procedurals_AABBs_StartAddress(GRL_RAYTRACING_GEOMETRY_DESC* geomDesc, gpuva_t address)
+{
+ geomDesc->Desc.Procedural.pAABBs_GPUVA = address;
+}
+
+GRL_INLINE gpuva_t GRL_get_procedurals_AABBs_StartAddress(const GRL_RAYTRACING_GEOMETRY_DESC* geomDesc)
+{
+ return geomDesc->Desc.Procedural.pAABBs_GPUVA;
+}
+
+GRL_INLINE void GRL_set_procedurals_AABBs_StrideInBytes(GRL_RAYTRACING_GEOMETRY_DESC* geomDesc, qword stride)
+{
+ geomDesc->Desc.Procedural.AABBByteStride = stride;
+}
+
+GRL_INLINE qword GRL_get_procedurals_AABBs_StrideInBytes(const GRL_RAYTRACING_GEOMETRY_DESC* geomDesc)
+{
+ return geomDesc->Desc.Procedural.AABBByteStride;
+}
+
+GRL_INLINE uint GRL_is_procedural(GRL_RAYTRACING_GEOMETRY_DESC* desc)
+{
+ return desc->Type == (unsigned char)GEOMETRY_TYPE_PROCEDURAL;
+}
+
+GRL_INLINE uint GRL_is_triangle(GRL_RAYTRACING_GEOMETRY_DESC* desc)
+{
+ return desc->Type != (unsigned char)GEOMETRY_TYPE_PROCEDURAL;
+}
+
+GRL_INLINE unsigned int GRL_get_ShaderIndex_Mask(GRL_RAYTRACING_GEOMETRY_DESC* desc)
+{
+ return 0x00FFFFFF;
+}
+
+GRL_INLINE dword GRL_atomic_add_triangles_VertexCount(GRL_RAYTRACING_GEOMETRY_DESC* desc, dword value)
+{
+ return atomic_add((global uint*) & desc->Desc.Triangles.VertexCount, value);
+}
+
+GRL_INLINE unsigned int GRL_get_primitive_count(GRL_RAYTRACING_GEOMETRY_DESC* desc)
+{
+ if (GRL_is_triangle(desc))
+ {
+ if (desc->Desc.Triangles.IndexFormat == INDEX_FORMAT_NONE)
+ {
+ return desc->Desc.Triangles.VertexCount / 3;
+ }
+ else
+ {
+ return desc->Desc.Triangles.IndexCount / 3;
+ }
+ }
+ else
+ {
+ return desc->Desc.Procedural.AABBCount;
+ }
+}
+
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable // to load half values
+
+GRL_INLINE float snorm_to_float(short v)
+{
+ return min(1.0f, max(-1.0f, ((float)v) * (1.0f / 32767.0f))); // FIXME: do we have intrinsic for this?
+}
+
+GRL_INLINE float snorm8_to_float(signed char v)
+{
+ return min(1.0f, max(-1.0f, ((float)v) * (1.0f / 127.0f))); // FIXME: do we have intrinsic for this?
+}
+
+GRL_INLINE float unorm_to_float(unsigned short v)
+{
+ return min(1.0f, max(0.0f, ((float)v) * (1.0f / 65535.0f))); // FIXME: do we have intrinsic for this?
+}
+
+// only the lower 10 bits of v are used
+GRL_INLINE float unorm10_to_float(unsigned v)
+{
+ const unsigned short mask = (unsigned short)((1u << 10u) - 1u);
+ const unsigned short v10 = (unsigned short)v & mask;
+ return min(1.0f, max(0.0f, ((float)v10) * (1.0f / 1023.0f))); // FIXME: do we have intrinsic for this?
+}
+
+GRL_INLINE float unorm8_to_float(unsigned char v)
+{
+ return min(1.0f, max(0.0f, ((float)v) * (1.0f / 255.0f))); // FIXME: do we have intrinsic for this?
+}
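+
+/* Illustrative values for the conversions above: snorm_to_float(32767) = 1.0,
+ * snorm_to_float(-32768) clamps to -1.0, unorm_to_float(65535) = 1.0,
+ * unorm8_to_float(128) = 128/255 ~= 0.502, unorm10_to_float(1023) = 1.0.
+ */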
+
+GRL_INLINE float4 GRL_load_vertex(global GRL_RAYTRACING_GEOMETRY_DESC* geomDesc, const uint vtxID)
+{
+ float4 v = (float4)(0, 0, 0, 0);
+ global char* vertices = (global char*)geomDesc->Desc.Triangles.pVertexBuffer;
+ uint vertex_stride = geomDesc->Desc.Triangles.VertexBufferByteStride;
+ uint vertex_format = geomDesc->Desc.Triangles.VertexFormat;
+
+ if (vertex_format == VERTEX_FORMAT_R32G32B32_FLOAT)
+ {
+ const float* data = (const float*)(vertices + vtxID * vertex_stride);
+ v = (float4)(data[0], data[1], data[2], 0.0f);
+ }
+ else if (vertex_format == VERTEX_FORMAT_R32G32_FLOAT)
+ {
+ const float* data = (const float*)(vertices + vtxID * vertex_stride);
+ v = (float4)(data[0], data[1], 0.0f, 0.0f);
+ }
+ else if (vertex_format == VERTEX_FORMAT_R16G16B16A16_FLOAT)
+ {
+ const half* data = (const half*)(vertices + vtxID * vertex_stride);
+ v = (float4)(data[0], data[1], data[2], 0.0f);
+ }
+ else if (vertex_format == VERTEX_FORMAT_R16G16_FLOAT)
+ {
+ const half* data = (const half*)(vertices + vtxID * vertex_stride);
+ v = (float4)(data[0], data[1], 0.0f, 0.0f);
+ }
+ else if (vertex_format == VERTEX_FORMAT_R16G16B16A16_SNORM)
+ {
+ const short* data = (const short*)(vertices + vtxID * vertex_stride);
+ v = (float4)(snorm_to_float(data[0]),
+ snorm_to_float(data[1]),
+ snorm_to_float(data[2]),
+ 0.0f);
+ }
+ else if (vertex_format == VERTEX_FORMAT_R16G16_SNORM)
+ {
+ const short* data = (const short*)(vertices + vtxID * vertex_stride);
+ v = (float4)(snorm_to_float(data[0]),
+ snorm_to_float(data[1]),
+ 0.0f,
+ 0.0f);
+ }
+ else if (vertex_format == VERTEX_FORMAT_R16G16B16A16_UNORM)
+ {
+ const unsigned short* data = (const unsigned short*)(vertices + vtxID * vertex_stride);
+ v = (float4)(unorm_to_float(data[0]),
+ unorm_to_float(data[1]),
+ unorm_to_float(data[2]),
+ 0.0f);
+ }
+ else if (vertex_format == VERTEX_FORMAT_R16G16_UNORM)
+ {
+ const unsigned short* data = (const unsigned short*)(vertices + vtxID * vertex_stride);
+ v = (float4)(unorm_to_float(data[0]),
+ unorm_to_float(data[1]),
+ 0.0f,
+ 0.0f);
+ }
+ else if (vertex_format == VERTEX_FORMAT_R10G10B10A2_UNORM)
+ {
+ const unsigned data = *(const unsigned*)(vertices + vtxID * vertex_stride);
+ v = (float4)(unorm10_to_float(data),
+ unorm10_to_float((data >> 10)),
+ unorm10_to_float((data >> 20)),
+ 0.0f);
+ }
+ else if (vertex_format == VERTEX_FORMAT_R8G8B8A8_UNORM)
+ {
+ const unsigned char* data = (const unsigned char*)(vertices + vtxID * vertex_stride);
+ v = (float4)(unorm8_to_float(data[0]),
+ unorm8_to_float(data[1]),
+ unorm8_to_float(data[2]),
+ 0.0f);
+ }
+ else if (vertex_format == VERTEX_FORMAT_R8G8_UNORM)
+ {
+ const unsigned char* data = (const unsigned char*)(vertices + vtxID * vertex_stride);
+ v = (float4)(unorm8_to_float(data[0]),
+ unorm8_to_float(data[1]),
+ 0.0f,
+ 0.0f);
+ }
+ else if (vertex_format == VERTEX_FORMAT_R8G8B8A8_SNORM)
+ {
+ const unsigned char* data = (const unsigned char*)(vertices + vtxID * vertex_stride);
+ v = (float4)(snorm8_to_float(data[0]),
+ snorm8_to_float(data[1]),
+ snorm8_to_float(data[2]),
+ 0.0f);
+ }
+ else if (vertex_format == VERTEX_FORMAT_R8G8_SNORM)
+ {
+ const unsigned char* data = (const unsigned char*)(vertices + vtxID * vertex_stride);
+ v = (float4)(snorm8_to_float(data[0]),
+ snorm8_to_float(data[1]),
+ 0.0f,
+ 0.0f);
+ }
+
+ /* perform vertex transformation */
+ if (geomDesc->Desc.Triangles.pTransformBuffer)
+ {
+ global float* xfm = (global float*)geomDesc->Desc.Triangles.pTransformBuffer;
+ const float x = xfm[0] * v.x + xfm[1] * v.y + xfm[2] * v.z + xfm[3];
+ const float y = xfm[4] * v.x + xfm[5] * v.y + xfm[6] * v.z + xfm[7];
+ const float z = xfm[8] * v.x + xfm[9] * v.y + xfm[10] * v.z + xfm[11];
+ v = (float4)(x, y, z, 0.0f);
+ }
+
+ return v;
+}
+
+GRL_INLINE void GRL_load_triangle_vertices(global char* vertices, const uint vertex_format, const uint vertex_stride, global float* transform_buffer, const uint vtx0ID, const uint vtx1ID, const uint vtx2ID, float4* out)
+{
+ if (vertex_format == VERTEX_FORMAT_R32G32B32_FLOAT)
+ {
+ const float3 data0 = as_float3(load_uint3_L1C_L3C((global uint3*)(vertices + vtx0ID * vertex_stride), 0));
+ const float3 data1 = as_float3(load_uint3_L1C_L3C((global uint3*)(vertices + vtx1ID * vertex_stride), 0));
+ const float3 data2 = as_float3(load_uint3_L1C_L3C((global uint3*)(vertices + vtx2ID * vertex_stride), 0));
+ out[0] = (float4)(data0[0], data0[1], data0[2], 0.0f);
+ out[1] = (float4)(data1[0], data1[1], data1[2], 0.0f);
+ out[2] = (float4)(data2[0], data2[1], data2[2], 0.0f);
+ }
+ else if (vertex_format == VERTEX_FORMAT_R32G32_FLOAT)
+ {
+ const float* data0 = (const float*)(vertices + vtx0ID * vertex_stride);
+ const float* data1 = (const float*)(vertices + vtx1ID * vertex_stride);
+ const float* data2 = (const float*)(vertices + vtx2ID * vertex_stride);
+ out[0] = (float4)(data0[0], data0[1], 0.0f, 0.0f);
+ out[1] = (float4)(data1[0], data1[1], 0.0f, 0.0f);
+ out[2] = (float4)(data2[0], data2[1], 0.0f, 0.0f);
+ }
+ else if (vertex_format == VERTEX_FORMAT_R16G16B16A16_FLOAT)
+ {
+ const half* data0 = (const half*)(vertices + vtx0ID * vertex_stride);
+ const half* data1 = (const half*)(vertices + vtx1ID * vertex_stride);
+ const half* data2 = (const half*)(vertices + vtx2ID * vertex_stride);
+ out[0] = (float4)(data0[0], data0[1], data0[2], 0.0f);
+ out[1] = (float4)(data1[0], data1[1], data1[2], 0.0f);
+ out[2] = (float4)(data2[0], data2[1], data2[2], 0.0f);
+ }
+ else if (vertex_format == VERTEX_FORMAT_R16G16_FLOAT)
+ {
+ const half* data0 = (const half*)(vertices + vtx0ID * vertex_stride);
+ const half* data1 = (const half*)(vertices + vtx1ID * vertex_stride);
+ const half* data2 = (const half*)(vertices + vtx2ID * vertex_stride);
+ out[0] = (float4)(data0[0], data0[1], 0.0f, 0.0f);
+ out[1] = (float4)(data1[0], data1[1], 0.0f, 0.0f);
+ out[2] = (float4)(data2[0], data2[1], 0.0f, 0.0f);
+ }
+ else if (vertex_format == VERTEX_FORMAT_R16G16B16A16_SNORM)
+ {
+ const short* data0 = (const short*)(vertices + vtx0ID * vertex_stride);
+ const short* data1 = (const short*)(vertices + vtx1ID * vertex_stride);
+ const short* data2 = (const short*)(vertices + vtx2ID * vertex_stride);
+ out[0] = (float4)(snorm_to_float(data0[0]), snorm_to_float(data0[1]), snorm_to_float(data0[2]), 0.0f);
+ out[1] = (float4)(snorm_to_float(data1[0]), snorm_to_float(data1[1]), snorm_to_float(data1[2]), 0.0f);
+ out[2] = (float4)(snorm_to_float(data2[0]), snorm_to_float(data2[1]), snorm_to_float(data2[2]), 0.0f);
+ }
+ else if (vertex_format == VERTEX_FORMAT_R16G16_SNORM)
+ {
+ const short* data0 = (const short*)(vertices + vtx0ID * vertex_stride);
+ const short* data1 = (const short*)(vertices + vtx1ID * vertex_stride);
+ const short* data2 = (const short*)(vertices + vtx2ID * vertex_stride);
+ out[0] = (float4)(snorm_to_float(data0[0]), snorm_to_float(data0[1]), 0.0f, 0.0f);
+ out[1] = (float4)(snorm_to_float(data1[0]), snorm_to_float(data1[1]), 0.0f, 0.0f);
+ out[2] = (float4)(snorm_to_float(data2[0]), snorm_to_float(data2[1]), 0.0f, 0.0f);
+ }
+ else if (vertex_format == VERTEX_FORMAT_R16G16B16A16_UNORM)
+ {
+ const unsigned short* data0 = (const unsigned short*)(vertices + vtx0ID * vertex_stride);
+ const unsigned short* data1 = (const unsigned short*)(vertices + vtx1ID * vertex_stride);
+ const unsigned short* data2 = (const unsigned short*)(vertices + vtx2ID * vertex_stride);
+ out[0] = (float4)(unorm_to_float(data0[0]), unorm_to_float(data0[1]), unorm_to_float(data0[2]), 0.0f);
+ out[1] = (float4)(unorm_to_float(data1[0]), unorm_to_float(data1[1]), unorm_to_float(data1[2]), 0.0f);
+ out[2] = (float4)(unorm_to_float(data2[0]), unorm_to_float(data2[1]), unorm_to_float(data2[2]), 0.0f);
+ }
+ else if (vertex_format == VERTEX_FORMAT_R16G16_UNORM)
+ {
+ const unsigned short* data0 = (const unsigned short*)(vertices + vtx0ID * vertex_stride);
+ const unsigned short* data1 = (const unsigned short*)(vertices + vtx1ID * vertex_stride);
+ const unsigned short* data2 = (const unsigned short*)(vertices + vtx2ID * vertex_stride);
+ out[0] = (float4)(unorm_to_float(data0[0]), unorm_to_float(data0[1]), 0.0f, 0.0f);
+ out[1] = (float4)(unorm_to_float(data1[0]), unorm_to_float(data1[1]), 0.0f, 0.0f);
+ out[2] = (float4)(unorm_to_float(data2[0]), unorm_to_float(data2[1]), 0.0f, 0.0f);
+ }
+ else if (vertex_format == VERTEX_FORMAT_R10G10B10A2_UNORM)
+ {
+ const unsigned data0 = *(const unsigned*)(vertices + vtx0ID * vertex_stride);
+ const unsigned data1 = *(const unsigned*)(vertices + vtx1ID * vertex_stride);
+ const unsigned data2 = *(const unsigned*)(vertices + vtx2ID * vertex_stride);
+ out[0] = (float4)(unorm10_to_float(data0), unorm10_to_float(data0 >> 10), unorm10_to_float(data0 >> 20), 0.0f);
+ out[1] = (float4)(unorm10_to_float(data1), unorm10_to_float(data1 >> 10), unorm10_to_float(data1 >> 20), 0.0f);
+ out[2] = (float4)(unorm10_to_float(data2), unorm10_to_float(data2 >> 10), unorm10_to_float(data2 >> 20), 0.0f);
+ }
+ else if (vertex_format == VERTEX_FORMAT_R8G8B8A8_UNORM)
+ {
+ const unsigned char* data0 = (const unsigned char*)(vertices + vtx0ID * vertex_stride);
+ const unsigned char* data1 = (const unsigned char*)(vertices + vtx1ID * vertex_stride);
+ const unsigned char* data2 = (const unsigned char*)(vertices + vtx2ID * vertex_stride);
+ out[0] = (float4)(unorm8_to_float(data0[0]), unorm8_to_float(data0[1]), unorm8_to_float(data0[2]), 0.0f);
+ out[1] = (float4)(unorm8_to_float(data1[0]), unorm8_to_float(data1[1]), unorm8_to_float(data1[2]), 0.0f);
+ out[2] = (float4)(unorm8_to_float(data2[0]), unorm8_to_float(data2[1]), unorm8_to_float(data2[2]), 0.0f);
+ }
+ else if (vertex_format == VERTEX_FORMAT_R8G8_UNORM)
+ {
+ const unsigned char* data0 = (const unsigned char*)(vertices + vtx0ID * vertex_stride);
+ const unsigned char* data1 = (const unsigned char*)(vertices + vtx1ID * vertex_stride);
+ const unsigned char* data2 = (const unsigned char*)(vertices + vtx2ID * vertex_stride);
+ out[0] = (float4)(unorm8_to_float(data0[0]), unorm8_to_float(data0[1]), 0.0f, 0.0f);
+ out[1] = (float4)(unorm8_to_float(data1[0]), unorm8_to_float(data1[1]), 0.0f, 0.0f);
+ out[2] = (float4)(unorm8_to_float(data2[0]), unorm8_to_float(data2[1]), 0.0f, 0.0f);
+ }
+ else if (vertex_format == VERTEX_FORMAT_R8G8B8A8_SNORM)
+ {
+ const unsigned char* data0 = (const unsigned char*)(vertices + vtx0ID * vertex_stride);
+ const unsigned char* data1 = (const unsigned char*)(vertices + vtx1ID * vertex_stride);
+ const unsigned char* data2 = (const unsigned char*)(vertices + vtx2ID * vertex_stride);
+ out[0] = (float4)(snorm8_to_float(data0[0]), snorm8_to_float(data0[1]), snorm8_to_float(data0[2]), 0.0f);
+ out[1] = (float4)(snorm8_to_float(data1[0]), snorm8_to_float(data1[1]), snorm8_to_float(data1[2]), 0.0f);
+ out[2] = (float4)(snorm8_to_float(data2[0]), snorm8_to_float(data2[1]), snorm8_to_float(data2[2]), 0.0f);
+ }
+ else if (vertex_format == VERTEX_FORMAT_R8G8_SNORM)
+ {
+ const unsigned char* data0 = (const unsigned char*)(vertices + vtx0ID * vertex_stride);
+ const unsigned char* data1 = (const unsigned char*)(vertices + vtx1ID * vertex_stride);
+ const unsigned char* data2 = (const unsigned char*)(vertices + vtx2ID * vertex_stride);
+ out[0] = (float4)(snorm8_to_float(data0[0]), snorm8_to_float(data0[1]), 0.0f, 0.0f);
+ out[1] = (float4)(snorm8_to_float(data1[0]), snorm8_to_float(data1[1]), 0.0f, 0.0f);
+ out[2] = (float4)(snorm8_to_float(data2[0]), snorm8_to_float(data2[1]), 0.0f, 0.0f);
+ }
+
+ /* perform vertex transformation */
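+    /* xfm is a row-major 3x4 affine transform: row i uses xfm[4*i .. 4*i+2] for the
+       linear part and xfm[4*i+3] for the translation */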
+ if (transform_buffer)
+ {
+ global float* xfm = (global float*)transform_buffer;
+ for (uint i = 0; i < 3; ++i)
+ {
+ const float x = xfm[0] * out[i].x + xfm[1] * out[i].y + xfm[2] * out[i].z + xfm[3];
+ const float y = xfm[4] * out[i].x + xfm[5] * out[i].y + xfm[6] * out[i].z + xfm[7];
+ const float z = xfm[8] * out[i].x + xfm[9] * out[i].y + xfm[10] * out[i].z + xfm[11];
+ out[i] = (float4)(x, y, z, 0.0f);
+ }
+ }
+}
+
+GRL_INLINE void GRL_load_quad_vertices_no_stride(global GRL_RAYTRACING_GEOMETRY_DESC* geomDesc,
+ float3* out0, float3* out1, float3* out2, float3* out3,
+ const uint4 vtxID, const uint vertex_format, global char* vertices)
+{
+ float3 v0, v1, v2, v3;
+
+ if (vertex_format == VERTEX_FORMAT_R32G32B32_FLOAT)
+ {
+ const float* data0 = (const float*)(vertices + vtxID.x);
+ const float* data1 = (const float*)(vertices + vtxID.y);
+ const float* data2 = (const float*)(vertices + vtxID.z);
+ const float* data3 = (const float*)(vertices + vtxID.w);
+ v0 = (float3)(data0[0], data0[1], data0[2]);
+ v1 = (float3)(data1[0], data1[1], data1[2]);
+ v2 = (float3)(data2[0], data2[1], data2[2]);
+ v3 = (float3)(data3[0], data3[1], data3[2]);
+ }
+ else if (vertex_format == VERTEX_FORMAT_R32G32_FLOAT)
+ {
+ const float* data0 = (const float*)(vertices + vtxID.x);
+ const float* data1 = (const float*)(vertices + vtxID.y);
+ const float* data2 = (const float*)(vertices + vtxID.z);
+ const float* data3 = (const float*)(vertices + vtxID.w);
+ v0 = (float3)(data0[0], data0[1], 0.0f);
+ v1 = (float3)(data1[0], data1[1], 0.0f);
+ v2 = (float3)(data2[0], data2[1], 0.0f);
+ v3 = (float3)(data3[0], data3[1], 0.0f);
+ }
+ else if (vertex_format == VERTEX_FORMAT_R16G16B16A16_FLOAT)
+ {
+ const half* data0 = (const half*)(vertices + vtxID.x);
+ const half* data1 = (const half*)(vertices + vtxID.y);
+ const half* data2 = (const half*)(vertices + vtxID.z);
+ const half* data3 = (const half*)(vertices + vtxID.w);
+ v0 = (float3)(data0[0], data0[1], data0[2]);
+ v1 = (float3)(data1[0], data1[1], data1[2]);
+ v2 = (float3)(data2[0], data2[1], data2[2]);
+ v3 = (float3)(data3[0], data3[1], data3[2]);
+ }
+ else if (vertex_format == VERTEX_FORMAT_R16G16_FLOAT)
+ {
+ const half* data0 = (const half*)(vertices + vtxID.x);
+ const half* data1 = (const half*)(vertices + vtxID.y);
+ const half* data2 = (const half*)(vertices + vtxID.z);
+ const half* data3 = (const half*)(vertices + vtxID.w);
+ v0 = (float3)(data0[0], data0[1], 0.0f);
+ v1 = (float3)(data1[0], data1[1], 0.0f);
+ v2 = (float3)(data2[0], data2[1], 0.0f);
+ v3 = (float3)(data3[0], data3[1], 0.0f);
+ }
+ else if (vertex_format == VERTEX_FORMAT_R16G16B16A16_SNORM)
+ {
+ const short* data0 = (const short*)(vertices + vtxID.x);
+ const short* data1 = (const short*)(vertices + vtxID.y);
+ const short* data2 = (const short*)(vertices + vtxID.z);
+ const short* data3 = (const short*)(vertices + vtxID.w);
+ v0 = (float3)(snorm_to_float(data0[0]), snorm_to_float(data0[1]), snorm_to_float(data0[2]));
+ v1 = (float3)(snorm_to_float(data1[0]), snorm_to_float(data1[1]), snorm_to_float(data1[2]));
+ v2 = (float3)(snorm_to_float(data2[0]), snorm_to_float(data2[1]), snorm_to_float(data2[2]));
+ v3 = (float3)(snorm_to_float(data3[0]), snorm_to_float(data3[1]), snorm_to_float(data3[2]));
+ }
+ else if (vertex_format == VERTEX_FORMAT_R16G16_SNORM)
+ {
+ const short* data0 = (const short*)(vertices + vtxID.x);
+ const short* data1 = (const short*)(vertices + vtxID.y);
+ const short* data2 = (const short*)(vertices + vtxID.z);
+ const short* data3 = (const short*)(vertices + vtxID.w);
+ v0 = (float3)(snorm_to_float(data0[0]), snorm_to_float(data0[1]), 0.0f);
+ v1 = (float3)(snorm_to_float(data1[0]), snorm_to_float(data1[1]), 0.0f);
+ v2 = (float3)(snorm_to_float(data2[0]), snorm_to_float(data2[1]), 0.0f);
+ v3 = (float3)(snorm_to_float(data3[0]), snorm_to_float(data3[1]), 0.0f);
+ }
+ else if (vertex_format == VERTEX_FORMAT_R16G16B16A16_UNORM)
+ {
+ const unsigned short* data0 = (const unsigned short*)(vertices + vtxID.x);
+ const unsigned short* data1 = (const unsigned short*)(vertices + vtxID.y);
+ const unsigned short* data2 = (const unsigned short*)(vertices + vtxID.z);
+ const unsigned short* data3 = (const unsigned short*)(vertices + vtxID.w);
+ v0 = (float3)(unorm_to_float(data0[0]), unorm_to_float(data0[1]), unorm_to_float(data0[2]));
+ v1 = (float3)(unorm_to_float(data1[0]), unorm_to_float(data1[1]), unorm_to_float(data1[2]));
+ v2 = (float3)(unorm_to_float(data2[0]), unorm_to_float(data2[1]), unorm_to_float(data2[2]));
+ v3 = (float3)(unorm_to_float(data3[0]), unorm_to_float(data3[1]), unorm_to_float(data3[2]));
+ }
+ else if (vertex_format == VERTEX_FORMAT_R16G16_UNORM)
+ {
+ const unsigned short* data0 = (const unsigned short*)(vertices + vtxID.x);
+ const unsigned short* data1 = (const unsigned short*)(vertices + vtxID.y);
+ const unsigned short* data2 = (const unsigned short*)(vertices + vtxID.z);
+ const unsigned short* data3 = (const unsigned short*)(vertices + vtxID.w);
+ v0 = (float3)(unorm_to_float(data0[0]), unorm_to_float(data0[1]), 0.0f);
+ v1 = (float3)(unorm_to_float(data1[0]), unorm_to_float(data1[1]), 0.0f);
+ v2 = (float3)(unorm_to_float(data2[0]), unorm_to_float(data2[1]), 0.0f);
+ v3 = (float3)(unorm_to_float(data3[0]), unorm_to_float(data3[1]), 0.0f);
+ }
+ else if (vertex_format == VERTEX_FORMAT_R10G10B10A2_UNORM)
+ {
+ const unsigned data0 = *(const unsigned*)(vertices + vtxID.x);
+ const unsigned data1 = *(const unsigned*)(vertices + vtxID.y);
+ const unsigned data2 = *(const unsigned*)(vertices + vtxID.z);
+ const unsigned data3 = *(const unsigned*)(vertices + vtxID.w);
+ v0 = (float3)(unorm10_to_float(data0), unorm10_to_float((data0 >> 10)), unorm10_to_float((data0 >> 20)));
+ v1 = (float3)(unorm10_to_float(data1), unorm10_to_float((data1 >> 10)), unorm10_to_float((data1 >> 20)));
+ v2 = (float3)(unorm10_to_float(data2), unorm10_to_float((data2 >> 10)), unorm10_to_float((data2 >> 20)));
+ v3 = (float3)(unorm10_to_float(data3), unorm10_to_float((data3 >> 10)), unorm10_to_float((data3 >> 20)));
+ }
+ else if (vertex_format == VERTEX_FORMAT_R8G8B8A8_UNORM)
+ {
+ const unsigned char* data0 = (const unsigned char*)(vertices + vtxID.x);
+ const unsigned char* data1 = (const unsigned char*)(vertices + vtxID.y);
+ const unsigned char* data2 = (const unsigned char*)(vertices + vtxID.z);
+ const unsigned char* data3 = (const unsigned char*)(vertices + vtxID.w);
+ v0 = (float3)(unorm8_to_float(data0[0]), unorm8_to_float(data0[1]), unorm8_to_float(data0[2]));
+ v1 = (float3)(unorm8_to_float(data1[0]), unorm8_to_float(data1[1]), unorm8_to_float(data1[2]));
+ v2 = (float3)(unorm8_to_float(data2[0]), unorm8_to_float(data2[1]), unorm8_to_float(data2[2]));
+ v3 = (float3)(unorm8_to_float(data3[0]), unorm8_to_float(data3[1]), unorm8_to_float(data3[2]));
+ }
+ else if (vertex_format == VERTEX_FORMAT_R8G8_UNORM)
+ {
+ const unsigned char* data0 = (const unsigned char*)(vertices + vtxID.x);
+ const unsigned char* data1 = (const unsigned char*)(vertices + vtxID.y);
+ const unsigned char* data2 = (const unsigned char*)(vertices + vtxID.z);
+ const unsigned char* data3 = (const unsigned char*)(vertices + vtxID.w);
+ v0 = (float3)(unorm8_to_float(data0[0]), unorm8_to_float(data0[1]), 0.0f);
+ v1 = (float3)(unorm8_to_float(data1[0]), unorm8_to_float(data1[1]), 0.0f);
+ v2 = (float3)(unorm8_to_float(data2[0]), unorm8_to_float(data2[1]), 0.0f);
+ v3 = (float3)(unorm8_to_float(data3[0]), unorm8_to_float(data3[1]), 0.0f);
+ }
+ else if (vertex_format == VERTEX_FORMAT_R8G8B8A8_SNORM)
+ {
+ const signed char* data0 = (const signed char*)(vertices + vtxID.x);
+ const signed char* data1 = (const signed char*)(vertices + vtxID.y);
+ const signed char* data2 = (const signed char*)(vertices + vtxID.z);
+ const signed char* data3 = (const signed char*)(vertices + vtxID.w);
+ v0 = (float3)(snorm8_to_float(data0[0]), snorm8_to_float(data0[1]), snorm8_to_float(data0[2]));
+ v1 = (float3)(snorm8_to_float(data1[0]), snorm8_to_float(data1[1]), snorm8_to_float(data1[2]));
+ v2 = (float3)(snorm8_to_float(data2[0]), snorm8_to_float(data2[1]), snorm8_to_float(data2[2]));
+ v3 = (float3)(snorm8_to_float(data3[0]), snorm8_to_float(data3[1]), snorm8_to_float(data3[2]));
+ }
+ else if (vertex_format == VERTEX_FORMAT_R8G8_SNORM)
+ {
+ const signed char* data0 = (const signed char*)(vertices + vtxID.x);
+ const signed char* data1 = (const signed char*)(vertices + vtxID.y);
+ const signed char* data2 = (const signed char*)(vertices + vtxID.z);
+ const signed char* data3 = (const signed char*)(vertices + vtxID.w);
+ v0 = (float3)(snorm8_to_float(data0[0]), snorm8_to_float(data0[1]), 0.0f);
+ v1 = (float3)(snorm8_to_float(data1[0]), snorm8_to_float(data1[1]), 0.0f);
+ v2 = (float3)(snorm8_to_float(data2[0]), snorm8_to_float(data2[1]), 0.0f);
+ v3 = (float3)(snorm8_to_float(data3[0]), snorm8_to_float(data3[1]), 0.0f);
+ }
+ else
+ {
+ v0 = (float3)(0.0f, 0.0f, 0.0f);
+ v1 = (float3)(0.0f, 0.0f, 0.0f);
+ v2 = (float3)(0.0f, 0.0f, 0.0f);
+ v3 = (float3)(0.0f, 0.0f, 0.0f);
+ }
+
+
+ /* perform vertex transformation */
+ if (geomDesc->Desc.Triangles.pTransformBuffer)
+ {
+ global float* xfm = (global float*)geomDesc->Desc.Triangles.pTransformBuffer;
+
+ v0.xyz = (float3)(
+ xfm[0] * v0.x + xfm[1] * v0.y + xfm[2] * v0.z + xfm[3],
+ xfm[4] * v0.x + xfm[5] * v0.y + xfm[6] * v0.z + xfm[7],
+ xfm[8] * v0.x + xfm[9] * v0.y + xfm[10] * v0.z + xfm[11]
+ );
+
+ v1.xyz = (float3)(
+ xfm[0] * v1.x + xfm[1] * v1.y + xfm[2] * v1.z + xfm[3],
+ xfm[4] * v1.x + xfm[5] * v1.y + xfm[6] * v1.z + xfm[7],
+ xfm[8] * v1.x + xfm[9] * v1.y + xfm[10] * v1.z + xfm[11]
+ );
+
+ v2.xyz = (float3)(
+ xfm[0] * v2.x + xfm[1] * v2.y + xfm[2] * v2.z + xfm[3],
+ xfm[4] * v2.x + xfm[5] * v2.y + xfm[6] * v2.z + xfm[7],
+ xfm[8] * v2.x + xfm[9] * v2.y + xfm[10] * v2.z + xfm[11]
+ );
+
+ v3.xyz = (float3)(
+ xfm[0] * v3.x + xfm[1] * v3.y + xfm[2] * v3.z + xfm[3],
+ xfm[4] * v3.x + xfm[5] * v3.y + xfm[6] * v3.z + xfm[7],
+ xfm[8] * v3.x + xfm[9] * v3.y + xfm[10] * v3.z + xfm[11]
+ );
+ }
+
+ *out0 = v0;
+ *out1 = v1;
+ *out2 = v2;
+ *out3 = v3;
+}
+
+
+GRL_INLINE void GRL_load_quad_vertices(global GRL_RAYTRACING_GEOMETRY_DESC* geomDesc,
+ float3* out0, float3* out1, float3* out2, float3* out3,
+ uint4 vtxID)
+{
+ global char* vertices = (global char*)geomDesc->Desc.Triangles.pVertexBuffer;
+ uint vertex_format = geomDesc->Desc.Triangles.VertexFormat;
+ uint vertex_stride = geomDesc->Desc.Triangles.VertexBufferByteStride;
+
+ vtxID *= vertex_stride;
+
+ GRL_load_quad_vertices_no_stride(geomDesc, out0, out1, out2, out3,
+ vtxID, vertex_format, vertices);
+}
+
+
+GRL_INLINE GRL_RAYTRACING_AABB GRL_load_aabb(global GRL_RAYTRACING_GEOMETRY_DESC* geomDesc, const uint primID)
+{
+ global char* aabb0 = (global char*)geomDesc->Desc.Procedural.pAABBs_GPUVA;
+ global char* aabb = aabb0 + (primID * geomDesc->Desc.Procedural.AABBByteStride);
+ return *(global GRL_RAYTRACING_AABB*)aabb;
+}
+
+// same as for d3d12
+typedef struct GRL_RAYTRACING_INSTANCE_DESC
+{
+ float Transform[12];
+ // unsigned int InstanceID : 24;
+ // unsigned int InstanceMask : 8;
+ uint32_t DW0;
+ // unsigned int InstanceContributionToHitGroupIndex : 24;
+ // unsigned int Flags : 8;
+ uint32_t DW1;
+ global char* AccelerationStructure;
+} GRL_RAYTRACING_INSTANCE_DESC;
+
+GRL_INLINE float GRL_get_transform(const GRL_RAYTRACING_INSTANCE_DESC* d, const uint32_t row, const uint32_t column)
+{
+ return d->Transform[row * 4 + column];
+}
+
+GRL_INLINE uint32_t GRL_get_instanceID(const GRL_RAYTRACING_INSTANCE_DESC* d)
+{
+ return d->DW0 & ((1 << 24) - 1);
+}
+
+GRL_INLINE uint32_t GRL_get_InstanceMask(const GRL_RAYTRACING_INSTANCE_DESC* d)
+{
+ return d->DW0 >> 24;
+}
+
+GRL_INLINE uint32_t GRL_get_InstanceContributionToHitGroupIndex(const GRL_RAYTRACING_INSTANCE_DESC* d)
+{
+ return d->DW1 & ((1 << 24) - 1);
+}
+
+GRL_INLINE uint32_t GRL_get_InstanceFlags(const GRL_RAYTRACING_INSTANCE_DESC* d)
+{
+ return d->DW1 >> 24;
+}
+
+GRL_INLINE gpuva_t GRL_get_AccelerationStructure(const GRL_RAYTRACING_INSTANCE_DESC* d)
+{
+ return (gpuva_t)d->AccelerationStructure;
+}
+
+GRL_INLINE void GRL_set_transform(GRL_RAYTRACING_INSTANCE_DESC* d, const uint32_t row, const uint32_t column, float value)
+{
+ d->Transform[row * 4 + column] = value;
+}
+
+GRL_INLINE void GRL_set_instanceID(GRL_RAYTRACING_INSTANCE_DESC* d, const uint32_t id)
+{
+ d->DW0 &= 255 << 24;
+ d->DW0 |= id & ((1 << 24) - 1);
+}
+
+GRL_INLINE void GRL_set_InstanceMask(GRL_RAYTRACING_INSTANCE_DESC* d, const uint32_t mask)
+{
+ d->DW0 &= ((1 << 24) - 1);
+ d->DW0 |= mask << 24;
+}
+
+GRL_INLINE void GRL_set_InstanceContributionToHitGroupIndex(GRL_RAYTRACING_INSTANCE_DESC* d, const uint32_t contribution)
+{
+ d->DW1 &= 255 << 24;
+ d->DW1 |= contribution & ((1 << 24) - 1);
+}
+
+GRL_INLINE void GRL_set_InstanceFlags(GRL_RAYTRACING_INSTANCE_DESC* d, const uint32_t flags)
+{
+ d->DW1 &= ((1 << 24) - 1);
+ d->DW1 |= flags << 24;
+}
+
+GRL_INLINE void GRL_set_AccelerationStructure(GRL_RAYTRACING_INSTANCE_DESC* d, gpuva_t address)
+{
+ d->AccelerationStructure = (global char*)address;
+}
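+
+// Example usage (illustrative), for a GRL_RAYTRACING_INSTANCE_DESC d:
+//   GRL_set_instanceID(&d, 0x123456);   // DW0 low 24 bits  = 0x123456
+//   GRL_set_InstanceMask(&d, 0xAB);     // DW0 becomes        0xAB123456
+//   GRL_get_instanceID(&d)   == 0x123456
+//   GRL_get_InstanceMask(&d) == 0xAB
+// DW1 packs InstanceContributionToHitGroupIndex / Flags the same way.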
diff --git a/src/intel/vulkan/grl/gpu/atomic_update.cl b/src/intel/vulkan/grl/gpu/atomic_update.cl
new file mode 100644
index 00000000000..5171a122dc1
--- /dev/null
+++ b/src/intel/vulkan/grl/gpu/atomic_update.cl
@@ -0,0 +1,1112 @@
+//
+// Copyright (C) 2009-2021 Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+//
+//
+
+#include "GRLGen12.h"
+
+#include "bvh_build_refit.h"
+#include "bvh_build_treelet_refit.h"
+
+
+struct RefitScratch
+{
+ float lower[3];
+ uint mask;
+ float upper[3];
+ uint _pad;
+
+};
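+// Note: 'upper' is stored negated throughout the refit pass so that both bounds can be
+// merged with atomic_min alone; readers (e.g. write_inner_nodes) negate it back.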
+
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(64, 1, 1))) void kernel
+init_refit_scratch(
+ global struct BVHBase* bvh,
+ global struct RefitScratch* scratch )
+{
+ uint tid = get_local_id(0) + get_group_id(0)*get_local_size(0);
+
+ if ( tid < BVHBase_GetNumInternalNodes(bvh) )
+ {
+ float4 v = (float4) (FLT_MAX,FLT_MAX,FLT_MAX,0);
+ store_uint4_L1WB_L3WB( (global uint4*) &scratch[tid], 0, as_uint4(v) );
+ store_uint4_L1WB_L3WB( (global uint4*) &scratch[tid], 1, as_uint4(v) );
+ }
+}
+
+bool is_fat_leaf( InternalNode* curNode )
+{
+ return curNode->nodeType != BVH_INTERNAL_NODE; // TODO: Not enough for traversal shaders!! if ts enabled need to check child types
+}
+
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(64, 1, 1))) void kernel
+build_fatleaf_table(
+ global struct BVHBase* bvh )
+{
+ uint tid = get_local_id(0) + get_group_id(0)*get_local_size(0);
+
+ if ( tid < BVHBase_GetNumInternalNodes(bvh) )
+ {
+ InternalNode* curNode = BVHBase_GetInternalNodes(bvh)+tid;
+
+ if ( is_fat_leaf(curNode) )
+ {
+ uint offs = atomic_inc_global( &bvh->fatLeafCount );
+
+ BackPointers* backPointers = BVHBase_GetBackPointers(bvh);
+ uint bp = *InnerNode_GetBackPointer(backPointers, tid);
+
+ LeafTableEntry* leaf = BVHBase_GetFatLeafTable(bvh)+offs;
+ leaf->backpointer = bp;
+ leaf->inner_node_index = tid;
+ leaf->leaf_index = (BVH_ROOT_NODE_OFFSET/64) + tid + curNode->childOffset - bvh->quadLeafStart;
+ }
+ }
+}
+
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(64, 1, 1))) void kernel
+build_fatleaf_table_new_update(
+ global struct Globals *globals,
+ global struct BVHBase* bvh )
+{
+ uint tid = get_local_id(0) + get_group_id(0)*get_local_size(0);
+
+ if ( tid < BVHBase_GetNumInternalNodes(bvh) )
+ {
+ InternalNode* curNode = BVHBase_GetInternalNodes(bvh)+tid;
+
+ if ( is_fat_leaf(curNode) )
+ {
+            // This implementation uses the fatleaf table structure, but it is actually a quad table.
+            // An implementation that processes 2 fatleafs per SIMD lane while iterating over the children
+            // was also tested, but performance was worse.
+ BackPointers* backPointers = BVHBase_GetBackPointers(bvh);
+ uint bp = *InnerNode_GetBackPointer(backPointers, tid);
+ uint fatLeafTableStart = bvh->fatLeafTableStart;
+
+ uint leaf_index = (BVH_ROOT_NODE_OFFSET/64) + tid + curNode->childOffset - bvh->quadLeafStart;
+ uint numChildren = (bp >> 3) & 0x7;
+
+ uint quad_leaf_table_index = leaf_index;
+
+            // Check whether the children fall outside of the 256-wide work group.
+            // If so, move these cases to the offset after numQuads and push them to the leftovers part,
+            // where fatleaves are stored at every 8th position with additional padding.
+            // This way the leftovers table never has a single fatleaf with children in 2 separate work groups.
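+            // e.g. (illustrative) leaf_index = 253 with numChildren = 6: prev_group = 253,
+            // next_group = (253 + 5) & 255 = 2, so the fatleaf is treated as a leftover and
+            // relocated past numQuads_aligned_256 into the padded region.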
+
+ uint prev_group = leaf_index & 255;
+ uint next_group = (leaf_index + (numChildren - 1)) & 255;
+ uint slm_pos = prev_group;
+ bool is_leftover = prev_group > next_group;
+
+ if(is_leftover)
+ {
+ LeafTableEntry* leafBase = (LeafTableEntry*)(((char*)bvh) + (64u * fatLeafTableStart + 12 * quad_leaf_table_index));
+ uint numQuads_aligned_256 = (globals->numPrimitives + 255) & ~255;
+
+ uint leftovers_offset = atomic_add_global( &bvh->quadLeftoversCountNewAtomicUpdate, 8 );
+
+ for(uint i = 0; i < BVH_NODE_N6; i++)
+ {
+ uint pos = (i < numChildren) ? i : 0;
+ LeafTableEntry* leaf_null = &leafBase[pos];
+ leaf_null->leaf_index = -1 << 3;
+ }
+
+ quad_leaf_table_index = numQuads_aligned_256 + leftovers_offset;
+ slm_pos = leftovers_offset & 255;
+ }
+
+ LeafTableEntry* leaf = (LeafTableEntry*)(((char*)bvh) + (64u * fatLeafTableStart + 12 * quad_leaf_table_index));
+
+ for(uint i = 0; i < BVH_NODE_N6; i++)
+ {
+ uint pos = (i < numChildren) ? i : 0;
+ LeafTableEntry* leafCur = &leaf[pos];
+ leafCur->backpointer = bp;
+ leafCur->inner_node_index = (tid << 8) | slm_pos;
+ leafCur->leaf_index = (leaf_index << 3) | pos;
+ }
+
+ // Need to clean the unused area where we pad to 8 for leftovers
+ if(is_leftover)
+ {
+ for(uint i = 1; i < 8; i++)
+ {
+ uint pos = (i >= numChildren) ? i : 7;
+ LeafTableEntry* leafCur = &leaf[pos];
+ leafCur->leaf_index = -1 << 3;
+ }
+ }
+ }
+ }
+}
+
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(64, 1, 1))) void kernel
+build_innernode_table(
+ global struct BVHBase* bvh )
+{
+ uint tid = get_local_id(0) + get_group_id(0)*get_local_size(0);
+
+ if ( tid < BVHBase_GetNumInternalNodes(bvh) )
+ {
+ InternalNode* curNode = BVHBase_GetInternalNodes(bvh)+tid;
+
+ if ( !is_fat_leaf( curNode ) )
+ {
+ uint offs = atomic_inc_global( &bvh->innerCount );
+
+ BackPointers* backPointers = BVHBase_GetBackPointers(bvh);
+ uint bp = *InnerNode_GetBackPointer(backPointers, tid);
+
+ InnerNodeTableEntry* inner = BVHBase_GetInnerNodeTable(bvh)+offs;
+ inner->node_index_and_numchildren = (tid<<3) | ((bp>>3) &7);
+ inner->first_child = tid + curNode->childOffset;
+ }
+ }
+}
+
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(256, 1, 1))) void kernel
+fixup_quad_table(
+ global struct BVHBase* bvh )
+{
+    // This kernel runs 2 work groups that set the magic number for unused entries in the
+    // fatleaf table: one work group for the last group of the first part, where quads are packed,
+    // and a second one for the last group of the part where quads are stored padded.
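+    // e.g. (illustrative) group 0 with numQuads = 1000: quadOffsetEnd = 1024, quadOffsetStart = 768,
+    // leftovers = 24, so local ids 232..255 tag fatleaf table entries 1000..1023 with the magic leaf_index.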
+
+ uint numQuads = BVHBase_GetNumQuads(bvh);
+ uint numQuadLeftovers = bvh->quadLeftoversCountNewAtomicUpdate;
+ uint numQuadLeftovers_aligned_256 = (numQuadLeftovers + 255) & ~255;
+
+ uint numQuads_aligned_256 = (numQuads + 255) & ~255;
+ uint quadOffsetEnd = numQuads_aligned_256 + get_group_id(0) * numQuadLeftovers_aligned_256;
+ uint quadOffsetStart = quadOffsetEnd - 256;
+
+ uint quads_number_last_group = (get_group_id(0) == 0) ? numQuads : numQuads_aligned_256 + numQuadLeftovers;
+
+ uint leftovers = quadOffsetEnd - quads_number_last_group;
+
+ uint tid = get_local_id(0) > (255 - leftovers) ? get_local_id(0) : 256 - leftovers;
+
+ if(leftovers != 0)
+ {
+ LeafTableEntry* leafBvh = BVHBase_GetFatLeafTable(bvh);
+
+ LeafTableEntry* leaf = &leafBvh[quadOffsetStart + tid];
+ leaf->leaf_index = -1 << 3;
+ }
+
+ if(get_group_id(0) == 1 && get_local_id(0) == 0)
+ bvh->quadTableSizeNewAtomicUpdate = quadOffsetEnd;
+}
+
+
+// updates one quad leaf and computes the BBOX containing it
+GRL_INLINE void refit_bottom_child_quad_WB(
+ global struct QuadLeaf* quad,
+ global GRL_RAYTRACING_GEOMETRY_DESC* geomDesc,
+ struct AABB* childAABB)
+{
+ /* get the geomID and primID0/1 for both quad triangles */
+ const uint geomID = PrimLeaf_GetGeoIndex(&quad->leafDesc);
+ const uint primID0 = quad->primIndex0;
+ const uint primID1 = primID0 + QuadLeaf_GetPrimIndexDelta(quad);
+ ushort fourth_vert = 0;
+
+ if (primID1 != primID0)
+ {
+ ushort packed_indices = QuadLeaf_GetSecondTriangleIndices(quad);
+ fourth_vert = ((packed_indices & 0x0C) == 0x0C) ? 1 : fourth_vert;
+ fourth_vert = ((packed_indices & 0x30) == 0x30) ? 2 : fourth_vert;
+ }
+
+ global GRL_RAYTRACING_GEOMETRY_DESC* desc = geomDesc + geomID;
+
+ uint4 indices = GRL_load_quad_indices(desc, primID0, primID1, fourth_vert);
+
+ // read the indices of the 4 verts we want
+ float3 vtx0, vtx1, vtx2, vtx3;
+ GRL_load_quad_vertices(desc, &vtx0, &vtx1, &vtx2, &vtx3, indices);
+
+ childAABB->lower.xyz = min( min( vtx0, vtx1 ), min(vtx2,vtx3) );
+ childAABB->upper.xyz = max( max( vtx0, vtx1 ), max(vtx2,vtx3) );
+
+ float4 pack0 = (float4) ( vtx0.x, vtx0.y, vtx0.z, vtx1.x );
+ float4 pack1 = (float4) ( vtx1.y, vtx1.z, vtx2.x, vtx2.y );
+ float4 pack2 = (float4) ( vtx2.z, vtx3.x, vtx3.y, vtx3.z );
+
+ global uint4* dst_verts = (global uint4*) &(quad->v[0][0]);
+ store_uint4_L1WB_L3WB( dst_verts, 0, as_uint4(pack0) );
+ store_uint4_L1WB_L3WB( dst_verts, 1, as_uint4(pack1) );
+ store_uint4_L1WB_L3WB( dst_verts, 2, as_uint4(pack2) );
+}
+
+inline uchar4 uchar4_shuffle_down( uchar4 v, uint offs )
+{
+ uint vi = as_uint(v);
+ return as_uchar4(intel_sub_group_shuffle_down(vi,vi,offs));
+}
+inline uchar4 uchar4_broadcast( uchar4 v, uint offs )
+{
+ uint vi = as_uint(v);
+ return as_uchar4(sub_group_broadcast(vi,offs));
+}
+
+GRL_INLINE void sg_InternalNode_setFields(
+ struct InternalNode* node,
+ struct AABB reduced_aabb,
+ const int offset, const uint nodeType, struct AABB* input_aabb,
+ const uint numChildren, const uchar nodeMask )
+{
+ const float up = 1.0f + ulp;
+ const float down = 1.0f - ulp;
+
+ struct AABB conservative_aabb = conservativeAABB(&reduced_aabb);
+ const float3 org = conservative_aabb.lower.xyz;
+
+ const float3 len = AABB_size(&conservative_aabb).xyz * up;
+ int3 exp;
+ const float3 mant = frexp_vec3(len, &exp);
+ exp += (mant > (float3)QUANT_MAX_MANT ? (int3)1 : (int3)0);
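+    // The children are quantized to 8-bit grid coordinates relative to 'org': each axis uses a
+    // power-of-two scale derived from the parent extent (the bitShiftLdexp3 by 8 - exp below),
+    // with lower floored and upper ceiled so the quantized box stays conservative.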
+
+ uchar4 lower_uchar = 0x80;
+ uchar4 upper_uchar = 0;
+
+ ushort lane = get_sub_group_local_id();
+ ushort simd8_id = lane/8;
+ ushort logical_lane = lane%8;
+
+ if( logical_lane < numChildren )
+ {
+ struct AABB child_aabb = conservativeAABB( input_aabb ); // conservative ???
+
+ float3 lower = floor( bitShiftLdexp3( (child_aabb.lower.xyz - org) * down, -exp + 8 ) );
+ lower = clamp( lower, (float)(QUANT_MIN), (float)(QUANT_MAX) );
+ float3 upper = ceil( bitShiftLdexp3( (child_aabb.upper.xyz - org) * up, -exp + 8 ) );
+ upper = clamp( upper, (float)(QUANT_MIN), (float)(QUANT_MAX) );
+ lower_uchar.xyz = convert_uchar3_rtn( lower );
+ upper_uchar.xyz = convert_uchar3_rtp( upper );
+ }
+
+ uchar4 lo0 = lower_uchar;
+ uchar4 lo1 = uchar4_shuffle_down( lower_uchar, 1 );
+ uchar4 lo2 = uchar4_shuffle_down( lower_uchar, 2 );
+ uchar4 lo3 = uchar4_shuffle_down( lower_uchar, 3 );
+ uchar4 lo4 = uchar4_shuffle_down( lower_uchar, 4 );
+ uchar4 lo5 = uchar4_shuffle_down( lower_uchar, 5 );
+
+ uchar4 hi0 = upper_uchar;
+ uchar4 hi1 = uchar4_shuffle_down( upper_uchar,1 );
+ uchar4 hi2 = uchar4_shuffle_down( upper_uchar,2 );
+ uchar4 hi3 = uchar4_shuffle_down( upper_uchar,3 );
+ uchar4 hi4 = uchar4_shuffle_down( upper_uchar,4 );
+ uchar4 hi5 = uchar4_shuffle_down( upper_uchar,5 );
+
+ if( logical_lane == 0 )
+ {
+ uchar childBlockStride = 0x01 + (uint)(nodeType == NODE_TYPE_INSTANCE);
+
+ uint4 block0 = (uint4)(as_uint(org.x), as_uint(org.y), as_uint(org.z), offset);
+
+ char3 exp_char = (char3)(exp.x,exp.y,exp.z);
+
+ uint4 block1 = (uint4)(
+ as_uint((uchar4)(nodeType, 0 /* padding */, exp_char.x, exp_char.y)),
+ as_uint((uchar4)(exp_char.z, nodeMask, childBlockStride, childBlockStride)) ,
+ as_uint((uchar4)(childBlockStride, childBlockStride, childBlockStride, childBlockStride)) ,
+ as_uint((uchar4)(lo0.x,lo1.x,lo2.x,lo3.x))
+ );
+
+ uint4 block2 = (uint4)(
+ as_uint((uchar4)(lo4.x,lo5.x,hi0.x,hi1.x)) ,
+ as_uint((uchar4)(hi2.x,hi3.x,hi4.x,hi5.x)) ,
+ as_uint((uchar4)(lo0.y,lo1.y,lo2.y,lo3.y)) ,
+ as_uint((uchar4)(lo4.y,lo5.y,hi0.y,hi1.y))
+ );
+
+ uint4 block3 = (uint4)(
+ as_uint((uchar4)(hi2.y,hi3.y,hi4.y,hi5.y)),
+ as_uint((uchar4)(lo0.z,lo1.z,lo2.z,lo3.z)),
+ as_uint((uchar4)(lo4.z,lo5.z,hi0.z,hi1.z)),
+ as_uint((uchar4)(hi2.z,hi3.z,hi4.z,hi5.z))
+ );
+
+ global uint4* pNode = (global uint4*)node;
+
+#if 0
+ printf(
+ "block0 = %08x,%08x,%08x,%08x %08x,%08x,%08x,%08x \n"
+ "block1 = %08x,%08x,%08x,%08x %08x,%08x,%08x,%08x \n"
+ "block2 = %08x,%08x,%08x,%08x %08x,%08x,%08x,%08x \n"
+ "block3 = %08x,%08x,%08x,%08x %08x,%08x,%08x,%08x \n" ,
+ block0.x,block0.y,block0.z,block0.w,
+ pNode[0].x, pNode[0].y, pNode[0].z, pNode[0].w,
+ block1.x,block1.y,block1.z,block1.w,
+ pNode[1].x, pNode[1].y, pNode[1].z, pNode[1].w,
+ block2.x,block2.y,block2.z,block2.w,
+ pNode[2].x, pNode[2].y, pNode[2].z, pNode[2].w ,
+ block3.x,block3.y,block3.z,block3.w,
+ pNode[3].x, pNode[3].y, pNode[3].z, pNode[3].w );
+#endif
+
+ store_uint4_L1WB_L3WB( pNode, 0, block0 );
+ store_uint4_L1WB_L3WB( pNode, 1, block1 );
+ store_uint4_L1WB_L3WB( pNode, 2, block2 );
+ store_uint4_L1WB_L3WB( pNode, 3, block3 );
+ }
+
+}
+
+
+
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(256, 1, 1)))
+void kernel
+traverse_aabbs_quad(
+ global struct BVHBase* bvh,
+ global struct RefitScratch* scratch,
+ global GRL_RAYTRACING_GEOMETRY_DESC* geomDesc
+ )
+{
+
+ uniform uint num_nodes = BVHBase_GetNumInternalNodes(bvh);
+ varying ushort lane = get_sub_group_local_id();
+
+ uniform uint num_leaves = bvh->fatLeafCount;
+
+ local struct RefitScratch local_scratch[256];
+ if( get_local_id(0) < min(num_nodes,256u) )
+ {
+ for( uint i=0; i<3; i++ ){
+ local_scratch[get_local_id(0)].lower[i] = FLT_MAX;
+ local_scratch[get_local_id(0)].upper[i] = FLT_MAX;
+ }
+ }
+
+ barrier( CLK_LOCAL_MEM_FENCE );
+
+
+ ushort SIMD8_PER_SG = get_sub_group_size()/8;
+ ushort SIMD8_PER_WG = get_num_sub_groups()*SIMD8_PER_SG;
+ ushort simd8_local_id = get_sub_group_local_id()/8;
+ ushort simd8_id = get_sub_group_id()*SIMD8_PER_SG + simd8_local_id;
+ ushort logical_lane = lane%8;
+
+ uniform uint fatleaf_index = simd8_id + get_group_id(0)*SIMD8_PER_WG;
+
+
+ if ( fatleaf_index < num_leaves )
+ {
+ LeafTableEntry* leaf = BVHBase_GetFatLeafTable(bvh)+fatleaf_index;
+ uint innerNodeIdx = leaf->inner_node_index;
+ uint bp = leaf->backpointer;
+ uint leaf_index = leaf->leaf_index;
+
+ varying InternalNode* curNode = BVHBase_GetInternalNodes(bvh)+innerNodeIdx;
+ varying QuadLeaf* quad = BVHBase_GetQuadLeaves(bvh) + leaf_index;
+
+ uint childOffs = (((char*)quad) - ((char*)curNode))/64;
+
+ varying struct AABB childrenBox;
+ AABB_init(&childrenBox);
+
+ uint numChildren = (bp >> 3) & 0x7;
+ if (logical_lane < numChildren)
+ {
+ refit_bottom_child_quad_WB(
+ (global struct QuadLeaf*) &quad[logical_lane],
+ geomDesc,
+ &childrenBox );
+ }
+
+ struct AABB reduce_bounds0 = AABB_sub_group_reduce_N6(&childrenBox);
+ struct AABB reduce_bounds = AABB_sub_group_broadcast(&reduce_bounds0,0);
+ for (uint i = 1; i < SIMD8_PER_SG; i++)
+ {
+ struct AABB reduce_bounds1 = AABB_sub_group_broadcast(&reduce_bounds0, 8*i);
+ int3 is_upper_lane = ((uint3)(i)) == simd8_local_id;
+ reduce_bounds.lower.xyz = select( reduce_bounds.lower.xyz, reduce_bounds1.lower.xyz, is_upper_lane );
+ reduce_bounds.upper.xyz = select( reduce_bounds.upper.xyz, reduce_bounds1.upper.xyz, is_upper_lane );
+ }
+
+ sg_InternalNode_setFields(
+ curNode,
+ reduce_bounds,
+ childOffs,
+ NODE_TYPE_QUAD,
+ &childrenBox,
+ numChildren,
+ 0xff );
+
+ // atomic min operation vectorized across 6 lanes
+ // [ lower.xyz ][-][upper.xyz][-]
+ //
+ // Lanes 3 and 7 are inactive. 'upper' is negated
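+        // (illustrative) logical_lane 0..2 carry lower.x/y/z, lanes 4..6 carry -upper.x/y/z;
+        // lanes 3 and 7 are padding and masked out of the atomic propagation below.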
+ bool atomic_mask = (1<<logical_lane) & 0x77;
+
+ uint lmod = logical_lane % 4;
+ uint ldiv = logical_lane / 4;
+ float vlo = reduce_bounds.lower.x;
+ float vhi = reduce_bounds.upper.x;
+ vlo = (lmod == 1) ? reduce_bounds.lower.y : vlo;
+ vhi = (lmod == 1) ? reduce_bounds.upper.y : vhi;
+ vlo = (lmod == 2) ? reduce_bounds.lower.z : vlo;
+ vhi = (lmod == 2) ? reduce_bounds.upper.z : vhi;
+ float v = (ldiv == 0) ? vlo : -vhi;
+
+
+ global float* pv = (global float*) &scratch[innerNodeIdx];
+
+ store_uint_L1WB_L3WB( (global uint*)(pv+logical_lane), 0, as_uint(v));
+
+ BackPointers* backPointers = BVHBase_GetBackPointers(bvh);
+ uint parent = (bp >> 6);
+
+ // check for parent != 0x03FFFFFF once to be sure we don't enter parent >= 256
+ if(atomic_mask && parent != 0x03FFFFFF)
+ {
+ while( parent >= 256 )
+ {
+ innerNodeIdx = parent;
+ bp = *InnerNode_GetBackPointer(backPointers, innerNodeIdx);
+ atomic_min( ((global float*) &(scratch[innerNodeIdx]))+logical_lane, v );
+ parent = bp >> 6;
+ }
+ while( parent != 0x03FFFFFF )
+ {
+ innerNodeIdx = parent;
+ bp = *InnerNode_GetBackPointer(backPointers, innerNodeIdx);
+ atomic_min( ((local float*) &(local_scratch[innerNodeIdx]))+logical_lane, v );
+ parent = bp >> 6;
+ }
+ }
+
+ }
+
+
+ barrier( CLK_LOCAL_MEM_FENCE );
+ num_nodes = min(num_nodes,256u);
+
+ local float* in = (local float*)&local_scratch[0];
+ global float* out = (global float*)&scratch[0];
+
+ for (uint i = get_local_id(0); i < num_nodes*6; i += 256 )
+ {
+ // since we want to save [ lower.xyz ][-][upper.xyz][-] i.e 0,1,2, 4,5,6 etc. we need to offset +1 for every triplet
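+        // e.g. (illustrative) i = 0,1,2 -> idx = 0,1,2 (node 0 lower), i = 3,4,5 -> idx = 4,5,6 (node 0 upper),
+        // i = 6 -> idx = 8 (node 1 lower.x)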
+ uint idx = i + (i/3);
+
+ float v = in[idx];
+ if( v != FLT_MAX )
+ atomic_min( out + idx , v );
+ }
+}
+
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(64, 1, 1)))
+void kernel
+write_inner_nodes(
+ global struct BVHBase* bvh,
+ global struct RefitScratch* scratch
+ )
+{
+ uint SIMD8_PER_SG = get_sub_group_size()/8;
+ uniform uint node_id = SIMD8_PER_SG * get_sub_group_global_id() + (get_sub_group_local_id()/8);
+ varying ushort lane = get_sub_group_local_id() % 8;
+ varying uint num_inners = bvh->innerCount;
+
+ if ( node_id < num_inners )
+ {
+ InnerNodeTableEntry* entry = BVHBase_GetInnerNodeTable(bvh) + node_id;
+ uint node_index = entry->node_index_and_numchildren>>3;
+ uint numChildren = entry->node_index_and_numchildren & 7;
+ uint first_child = entry->first_child;
+
+ varying InternalNode* curNode = BVHBase_GetInternalNodes(bvh)+node_index;
+
+ varying struct AABB childAABB;
+ AABB_init(&childAABB);
+
+ if( lane < numChildren )
+ {
+ uint child = first_child + lane;
+ childAABB.lower.x = scratch[child].lower[0];
+ childAABB.lower.y = scratch[child].lower[1];
+ childAABB.lower.z = scratch[child].lower[2];
+ childAABB.upper.x = -scratch[child].upper[0];
+ childAABB.upper.y = -scratch[child].upper[1];
+ childAABB.upper.z = -scratch[child].upper[2];
+ }
+
+ varying struct AABB reduce_bounds0 = AABB_sub_group_reduce_N6(&childAABB);
+ struct AABB reduce_bounds = AABB_sub_group_broadcast(&reduce_bounds0,0);
+ for (uint i = 1; i < SIMD8_PER_SG; i++)
+ {
+ struct AABB reduce_bounds1 = AABB_sub_group_broadcast(&reduce_bounds0, 8*i);
+ int3 is_upper_lane = ((uint3)(i)) == (get_sub_group_local_id()/8);
+ reduce_bounds.lower.xyz = select( reduce_bounds.lower.xyz, reduce_bounds1.lower.xyz, is_upper_lane );
+ reduce_bounds.upper.xyz = select( reduce_bounds.upper.xyz, reduce_bounds1.upper.xyz, is_upper_lane );
+ }
+
+ sg_InternalNode_setFields(
+ curNode,
+ reduce_bounds,
+ first_child - node_index,
+ NODE_TYPE_INTERNAL,
+ &childAABB,
+ numChildren,
+ 0xff );
+
+ }
+
+ if (node_id == 0 && lane == 0 )
+ {
+ bvh->Meta.bounds.lower[0] = scratch[0].lower[0];
+ bvh->Meta.bounds.lower[1] = scratch[0].lower[1];
+ bvh->Meta.bounds.lower[2] = scratch[0].lower[2];
+ bvh->Meta.bounds.upper[0] = -scratch[0].upper[0];
+ bvh->Meta.bounds.upper[1] = -scratch[0].upper[1];
+ bvh->Meta.bounds.upper[2] = -scratch[0].upper[2];
+ }
+
+}
+
+
+
+#if 1
+#define SLM_BOX_COUNT 1024
+
+struct AABB load_box( uint place, local struct AABB* local_boxes, global struct AABB* extra_boxes )
+{
+ if( place < SLM_BOX_COUNT )
+ return local_boxes[place];
+ else
+ return extra_boxes[place-SLM_BOX_COUNT];
+}
+
+void store_box( struct AABB box, uint place, local struct AABB* local_boxes, global struct AABB* extra_boxes )
+{
+ if (place < SLM_BOX_COUNT)
+ {
+ local_boxes[place] = box;
+ }
+ else
+ {
+ global uint4* ptr = (global uint4*)&extra_boxes[place-SLM_BOX_COUNT];
+ store_uint4_L1WB_L3WB( ptr, 0, as_uint4(box.lower) );
+ store_uint4_L1WB_L3WB( ptr+1, 0, as_uint4(box.upper) );
+ }
+}
+
+
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(512, 1, 1)))
+__attribute__((intel_reqd_sub_group_size(16)))
+void kernel
+update_single_group_quads(
+ global struct BVHBase* bvh,
+ global GRL_RAYTRACING_GEOMETRY_DESC* geomDesc,
+ global struct AABB* extra_boxes
+)
+{
+ uniform uint tid = get_sub_group_global_id();
+ uniform uint num_nodes = BVHBase_GetNumInternalNodes(bvh);
+ uniform uint num_leaves = bvh->fatLeafCount;
+ uniform uint num_inners = bvh->innerCount;
+
+ varying ushort lane = get_sub_group_local_id();
+
+ local struct AABB local_boxes[SLM_BOX_COUNT]; // == 32KB
+
+ // initialize nodes
+ for (uint i = get_local_id( 0 ); i < num_nodes; i+= get_local_size(0))
+ {
+ struct AABB tmp;
+ AABB_init(&tmp);
+ tmp.upper = -tmp.upper;
+ store_box( tmp, i, local_boxes, extra_boxes );
+ }
+
+
+ if( num_nodes > SLM_BOX_COUNT )
+ mem_fence_workgroup_default();
+
+ barrier( CLK_LOCAL_MEM_FENCE );
+
+
+ ushort SIMD8_PER_SG = get_sub_group_size()/8;
+ ushort NUM_SIMD8 = get_num_sub_groups()*SIMD8_PER_SG;
+ ushort simd8_local_id = get_sub_group_local_id()/8;
+ ushort simd8_id = get_sub_group_id()*SIMD8_PER_SG + simd8_local_id;
+ ushort logical_lane = lane%8;
+
+
+ for ( uint i = simd8_id; i < num_leaves; i+= NUM_SIMD8 )
+ {
+ LeafTableEntry* leaf = BVHBase_GetFatLeafTable(bvh)+i;
+ uint innerNodeIdx = leaf->inner_node_index;
+ uint bp = leaf->backpointer;
+ uint leaf_index = leaf->leaf_index;
+
+ varying InternalNode* curNode = BVHBase_GetInternalNodes(bvh)+innerNodeIdx;
+ QuadLeaf* quad = BVHBase_GetQuadLeaves(bvh) + leaf_index;
+
+ uint childOffs = (((char*)quad) - ((char*)curNode))/64;
+
+ varying struct AABB childrenBox;
+ AABB_init(&childrenBox);
+
+ uint numChildren = (bp >> 3) & 0x7;
+ if (logical_lane < numChildren)
+ {
+
+ refit_bottom_child_quad_WB(
+ (global struct QuadLeaf*) &quad[logical_lane],
+ geomDesc,
+ &childrenBox );
+ }
+
+ struct AABB reduce_bounds0 = AABB_sub_group_reduce_N6(&childrenBox);
+ struct AABB reduce_bounds = AABB_sub_group_broadcast(&reduce_bounds0,0);
+ for (uint i = 1; i < SIMD8_PER_SG; i++)
+ {
+ struct AABB reduce_bounds1 = AABB_sub_group_broadcast(&reduce_bounds0, 8*i);
+ int3 is_upper_lane = ((uint3)(i)) == simd8_local_id;
+ reduce_bounds.lower.xyz = select( reduce_bounds.lower.xyz, reduce_bounds1.lower.xyz, is_upper_lane );
+ reduce_bounds.upper.xyz = select( reduce_bounds.upper.xyz, reduce_bounds1.upper.xyz, is_upper_lane );
+ }
+
+
+ if( logical_lane == 0 )
+ {
+ struct AABB negated = reduce_bounds;
+ negated.upper = -negated.upper;
+ store_box( negated, innerNodeIdx, local_boxes, extra_boxes );
+ }
+
+ sg_InternalNode_setFields(
+ curNode,
+ reduce_bounds,
+ childOffs,
+ NODE_TYPE_QUAD,
+ &childrenBox,
+ numChildren,
+ 0xff );
+
+
+ // atomic min operation vectorized across 6 lanes
+ // [ lower.xyz ][-][upper.xyz][-]
+ //
+ // Lanes 3 and 7 are inactive. 'upper' is negated
+ uint lmod = logical_lane % 4;
+ uint ldiv = logical_lane / 4;
+ float vlo = reduce_bounds.lower.x;
+ float vhi = reduce_bounds.upper.x;
+ vlo = (lmod == 1) ? reduce_bounds.lower.y : vlo;
+ vhi = (lmod == 1) ? reduce_bounds.upper.y : vhi;
+ vlo = (lmod == 2) ? reduce_bounds.lower.z : vlo;
+ vhi = (lmod == 2) ? reduce_bounds.upper.z : vhi;
+ float v = (ldiv == 0) ? vlo : -vhi;
+ bool atomic_mask = (1<<logical_lane) & 0x77;
+
+ BackPointers* backPointers = BVHBase_GetBackPointers(bvh);
+ uint parent = (bp >> 6);
+
+ // check for parent != 0x03FFFFFF once to be sure we don't enter parent >= SLM_BOX_COUNT
+ if(atomic_mask && parent != 0x03FFFFFF)
+ {
+ while( parent >= SLM_BOX_COUNT )
+ {
+ innerNodeIdx = parent;
+ bp = *InnerNode_GetBackPointer(backPointers, innerNodeIdx);
+ atomic_min( ((global float*) &(extra_boxes[innerNodeIdx-SLM_BOX_COUNT]))+logical_lane, v );
+ parent = bp >> 6;
+ }
+ while( parent != 0x03FFFFFF )
+ {
+ innerNodeIdx = parent;
+ bp = *InnerNode_GetBackPointer(backPointers, innerNodeIdx);
+ atomic_min( ((local float*) &(local_boxes[innerNodeIdx]))+logical_lane, v );
+ parent = bp >> 6;
+ }
+ }
+
+ }
+
+ if( num_nodes > SLM_BOX_COUNT )
+ mem_fence_workgroup_default();
+
+ barrier( CLK_LOCAL_MEM_FENCE );
+
+ for ( uint i = simd8_id; i < num_inners; i+= NUM_SIMD8 )
+ {
+ InnerNodeTableEntry* inner = BVHBase_GetInnerNodeTable(bvh) + i;
+ uint node_index = inner->node_index_and_numchildren>>3;
+ uint numChildren = inner->node_index_and_numchildren & 7;
+ uint first_child = inner->first_child;
+
+ varying InternalNode* curNode = BVHBase_GetInternalNodes(bvh)+ node_index;
+
+ //if (curNode->nodeType == BVH_INTERNAL_NODE) // TODO: Needs updating for traversal shaders
+ { // TODO: Consider using an inner node table or UC load to avoid polluting LSC with these reads
+ uint child = first_child + logical_lane;
+
+ bool child_valid = (logical_lane < numChildren);
+
+ struct AABB childAABB;
+ AABB_init(&childAABB);
+ if (child_valid)
+ {
+ childAABB = load_box( child, local_boxes, extra_boxes );
+ childAABB.upper = -childAABB.upper;
+ }
+
+ varying struct AABB reduce_bounds0 = AABB_sub_group_reduce_N6(&childAABB);
+ struct AABB reduce_bounds = AABB_sub_group_broadcast(&reduce_bounds0,0);
+ for (uint i = 1; i < SIMD8_PER_SG; i++)
+ {
+ struct AABB reduce_bounds1 = AABB_sub_group_broadcast(&reduce_bounds0, 8*i);
+ int3 is_upper_lane = ((uint3)(i)) == (get_sub_group_local_id()/8);
+ reduce_bounds.lower.xyz = select( reduce_bounds.lower.xyz, reduce_bounds1.lower.xyz, is_upper_lane );
+ reduce_bounds.upper.xyz = select( reduce_bounds.upper.xyz, reduce_bounds1.upper.xyz, is_upper_lane );
+ }
+
+ sg_InternalNode_setFields(
+ curNode,
+ reduce_bounds,
+ first_child - node_index,
+ NODE_TYPE_INTERNAL,
+ &childAABB,
+ numChildren,
+ 0xff );
+ }
+ }
+
+
+ if (get_sub_group_id() == 0 && lane == 0 )
+ {
+ bvh->Meta.bounds.lower[0] = local_boxes[0].lower.x;
+ bvh->Meta.bounds.lower[1] = local_boxes[0].lower.y;
+ bvh->Meta.bounds.lower[2] = local_boxes[0].lower.z;
+ bvh->Meta.bounds.upper[0] = -local_boxes[0].upper.x;
+ bvh->Meta.bounds.upper[1] = -local_boxes[0].upper.y;
+ bvh->Meta.bounds.upper[2] = -local_boxes[0].upper.z;
+ }
+
+}
+#endif
+
+GRL_INLINE void traverse_aabbs_new_update_func(
+ global struct BVHBase* bvh,
+ global char* vertices,
+ global GRL_RAYTRACING_GEOMETRY_DESC* geomDesc,
+ global struct RefitScratch* scratch,
+ uint vertex_format,
+ local struct AABB3f* children_AABBs,
+ local uint* num_fat_leaves,
+ local struct LeafTableEntry* leafTable_local,
+ const bool single_geo
+ )
+{
+    // The first part of the kernel, with the vertex loads/stores, is executed with one quad per work item,
+    // using the previously prepared QuadDataIndices to get the quad data and vertex indices.
+    // The second part of the kernel, which does the reduction, updates the fatleaf in the bvh and runs
+    // bottom-up, is executed per SIMD.
+    // Bottom-up was also tested with a local part (using local scratch), but since there is not enough SLM,
+    // additional barriers were needed to clean and reuse the SLM, which currently kills performance. Could be
+    // worth revisiting on future gens.
+
+ varying uint lid = get_local_id(0);
+ varying uint tid = lid + get_group_id(0)*get_local_size(0);
+
+ num_fat_leaves[0] = 0;
+ leafTable_local[lid].leaf_index = -1 << 3;
+
+ LeafTableEntry* leaf = (LeafTableEntry*)(((char*)bvh) + (64u * bvh->fatLeafTableStart + 12 * tid));
+ uint innerNodeIdx_mem = leaf->inner_node_index;
+ uint bp = leaf->backpointer;
+ uint leaf_index_mem = leaf->leaf_index;
+
+ uint numChildren = (bp >> 3) & 0x7;
+
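+    // inner_node_index packs (inner_node_index << 8) | slm_pos; leaf_index packs
+    // (leaf_index << 3) | child_slot (see build_fatleaf_table_new_update).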
+ uint leaf_index = leaf_index_mem >> 3;
+ uint slm_child_offset = leaf_index_mem & 0x7;
+
+ uint innerNodeIdx = innerNodeIdx_mem >> 8;
+ uint slm_pos_main = innerNodeIdx_mem & 0xFF;
+
+ uint first_el_of_group = get_group_id(0)*get_local_size(0);
+ uint quadsNum = BVHBase_GetNumQuads(bvh);
+ uint expected_tid = first_el_of_group < quadsNum ? first_el_of_group : quadsNum - 1;
+
+ // Skip writes when not all children for single fatleaf are present in this work group
+ bool skip_tid = leaf_index == 0x1FFFFFFF;
+ leaf_index = skip_tid ? expected_tid : leaf_index;
+
+ // Compute bounding box for quads
+ varying struct AABB3f childrenBox;
+
+ tid = leaf_index + slm_child_offset;
+
+ // Read vertex indices and quad header from separate buffer
+ uint quadIndicesStart = bvh->quadIndicesDataStart;
+ varying struct QuadDataIndices* vertex_indice_ptr = (QuadDataIndices*)(((char*)bvh) + (64u * quadIndicesStart + 32 * tid));
+ QuadDataIndices vertexMap = vertex_indice_ptr[0];
+
+ varying global uint4* bounds = (global uint4*)((char*)bvh + (64*bvh->quadLeafStart + 64*tid) );
+ uint4 quad_data = (uint4)(vertexMap.header_data[0], vertexMap.header_data[1], vertexMap.header_data[2], vertexMap.header_data[3]);
+ uint4 indices = (uint4)(vertexMap.vert_idx[0], vertexMap.vert_idx[1], vertexMap.vert_idx[2], vertexMap.vert_idx[3]);
+
+ global GRL_RAYTRACING_GEOMETRY_DESC* desc = geomDesc;
+
+ if(!single_geo)
+ {
+ uint geomID = vertexMap.header_data[0] & 0xFFFFFF;
+ desc += geomID;
+ vertices = (global char*)desc->Desc.Triangles.pVertexBuffer;
+ vertex_format = desc->Desc.Triangles.VertexFormat;
+ }
+
+ float3 vtx0, vtx1, vtx2, vtx3;
+ GRL_load_quad_vertices_no_stride(desc, &vtx0, &vtx1, &vtx2, &vtx3, indices, vertex_format, vertices);
+
+ for(uint i = 0; i < 3; i++)
+ childrenBox.lower[i] = min( min( vtx0[i], vtx1[i] ), min(vtx2[i],vtx3[i]) );
+
+ for(uint i = 0; i < 3; i++)
+ childrenBox.upper[i] = max( max( vtx0[i], vtx1[i] ), max(vtx2[i],vtx3[i]) );
+
+ float4 pack0 = (float4) ( vtx0.x, vtx0.y, vtx0.z, vtx1.x );
+ float4 pack1 = (float4) ( vtx1.y, vtx1.z, vtx2.x, vtx2.y );
+ float4 pack2 = (float4) ( vtx2.z, vtx3.x, vtx3.y, vtx3.z );
+
+ // Store quad data in bvh
+ // Make sure this goes without partial writes to get best perf
+ store_uint4_L1WB_L3WB( bounds, 0, quad_data );
+ store_uint4_L1WB_L3WB( bounds, 1, as_uint4(pack0) );
+ store_uint4_L1WB_L3WB( bounds, 2, as_uint4(pack1) );
+ store_uint4_L1WB_L3WB( bounds, 3, as_uint4(pack2) );
+
+ barrier( CLK_LOCAL_MEM_FENCE );
+
+ struct AABB reduce_bounds;
+
+ if(!skip_tid)
+ {
+ // Store AABB in SLM, to be used later for children quantization in fatleaf
+ children_AABBs[slm_pos_main + slm_child_offset] = childrenBox;
+
+ if(slm_child_offset == 0)
+ {
+ uint offset = atomic_inc_local(&num_fat_leaves[0]);
+ leafTable_local[offset].inner_node_index = innerNodeIdx_mem;
+ leafTable_local[offset].backpointer = bp;
+ leafTable_local[offset].leaf_index = leaf_index_mem;
+ }
+ }
+
+ barrier( CLK_LOCAL_MEM_FENCE );
+
+ varying ushort lane = get_sub_group_local_id();
+ ushort SIMD8_PER_SG = get_sub_group_size()/8;
+ ushort SIMD8_PER_WG = get_num_sub_groups()*SIMD8_PER_SG;
+ ushort simd8_local_id = get_sub_group_local_id()/8;
+ ushort simd8_id = get_sub_group_id()*SIMD8_PER_SG + simd8_local_id;
+ ushort logical_lane = lane%8;
+
+ uint fatleaves_aligned_32 = (num_fat_leaves[0] + 31) & ~31;
+
+ for(uint offset = 0; offset < fatleaves_aligned_32; offset += 32)
+ {
+ uniform uint fatleaf_index = simd8_id + offset;
+ uint innerNodeIdx_mem = leafTable_local[fatleaf_index].inner_node_index;
+ uint bp = leafTable_local[fatleaf_index].backpointer;
+ uint leaf_index_mem = leafTable_local[fatleaf_index].leaf_index;
+
+ uint numChildren = (bp >> 3) & 0x7;
+
+ uint leaf_index = leaf_index_mem >> 3;
+ uint slm_child_offset = leaf_index_mem & 0x7;
+
+ uint innerNodeIdx = innerNodeIdx_mem >> 8;
+ uint slm_pos_main = innerNodeIdx_mem & 0xFF;
+
+ bool skip_tid = leaf_index == 0x1FFFFFFF;
+ bool active_lane = (logical_lane < numChildren);
+ uint lane_children = active_lane ? logical_lane : 0;
+
+ fatleaf_index = leaf_index;
+
+ varying InternalNode* curNode = (InternalNode*)(((char*)bvh) + (BVH_ROOT_NODE_OFFSET + 64 * innerNodeIdx));
+
+ global struct Quad *quads = (global struct Quad *)((char*)bvh + 64*bvh->quadLeafStart );
+
+ varying struct AABB childrenBox_bu;
+ AABB_init(&childrenBox_bu);
+
+ if(!skip_tid)
+ childrenBox_bu = AABBfromAABB3f(children_AABBs[slm_pos_main + lane_children]);
+
+ struct AABB reduce_bounds0 = AABB_sub_group_reduce_N6(&childrenBox_bu);
+ struct AABB reduce_bounds = AABB_sub_group_broadcast(&reduce_bounds0,0);
+
+ for (uint i = 1; i < SIMD8_PER_SG; i++)
+ {
+ struct AABB reduce_bounds1 = AABB_sub_group_broadcast(&reduce_bounds0, 8*i);
+ int3 is_upper_lane = ((uint3)(i)) == simd8_local_id;
+ reduce_bounds.lower.xyz = select( reduce_bounds.lower.xyz, reduce_bounds1.lower.xyz, is_upper_lane );
+ reduce_bounds.upper.xyz = select( reduce_bounds.upper.xyz, reduce_bounds1.upper.xyz, is_upper_lane );
+ }
+
+ if(!skip_tid)
+ {
+ uint quad_offset = 64u * bvh->quadLeafStart + 64 * fatleaf_index;
+ varying QuadLeaf* quad = (QuadLeaf*)(((char*)bvh) + quad_offset);
+ uint childOffs = (((char*)quad) - ((char*)curNode))/64;
+
+ sg_InternalNode_setFields(
+ curNode,
+ reduce_bounds,
+ childOffs,
+ NODE_TYPE_QUAD,
+ &childrenBox_bu,
+ numChildren,
+ 0xff );
+
+ bool atomic_mask = (1<<logical_lane) & 0x77;
+
+ uint lmod = logical_lane % 4;
+ uint ldiv = logical_lane / 4;
+ float vlo = reduce_bounds.lower.x;
+ float vhi = reduce_bounds.upper.x;
+ vlo = (lmod == 1) ? reduce_bounds.lower.y : vlo;
+ vhi = (lmod == 1) ? reduce_bounds.upper.y : vhi;
+ vlo = (lmod == 2) ? reduce_bounds.lower.z : vlo;
+ vhi = (lmod == 2) ? reduce_bounds.upper.z : vhi;
+ float v = (ldiv == 0) ? vlo : -vhi;
+
+ global float* pv = (global float*) &scratch[innerNodeIdx];
+
+ store_uint_L1WB_L3WB( (global uint*)(pv+logical_lane), 0, as_uint(v));
+
+ BackPointers* backPointers = BVHBase_GetBackPointers(bvh);
+ uint parent = (bp >> 6);
+
+ global float* parent_v = (global float*) &(scratch[parent]) + logical_lane;
+
+ if(atomic_mask && (*parent_v >= v) && (parent != 0x03FFFFFF))
+ {
+ innerNodeIdx = parent;
+ bp = *InnerNode_GetBackPointer(backPointers, innerNodeIdx);
+ atomic_min( parent_v, v );
+ parent = bp >> 6;
+
+ if(parent != 0x03FFFFFF)
+ {
+ while( parent != 0x03FFFFFF )
+ {
+ innerNodeIdx = parent;
+ bp = *InnerNode_GetBackPointer(backPointers, innerNodeIdx);
+
+ global float* parent_v_global = (global float*) &(scratch[innerNodeIdx]) + logical_lane;
+ if(*parent_v_global >= v)
+ atomic_min( parent_v_global, v );
+ else
+ break;
+
+ parent = bp >> 6;
+ }
+ }
+ }
+ }
+ }
+}
+
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(256, 1, 1)))
+__attribute__( (intel_reqd_sub_group_size( 16 )) )
+void kernel
+traverse_aabbs_new_update(
+ global struct BVHBase* bvh,
+ global GRL_RAYTRACING_GEOMETRY_DESC* geomDesc,
+ global struct RefitScratch* scratch
+ )
+{
+ varying uint lid = get_local_id(0);
+ varying uint tid = lid + get_group_id(0)*get_local_size(0);
+
+ local struct AABB3f children_AABBs[256];
+ local struct LeafTableEntry leafTable_local[256];
+ local uint num_fat_leaves;
+
+ traverse_aabbs_new_update_func(bvh, (global char*)geomDesc /* not used */, geomDesc, scratch, (uint)-1 /* not used */,
+ &children_AABBs[0], &num_fat_leaves, &leafTable_local[0], false);
+}
+
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(256, 1, 1)))
+__attribute__( (intel_reqd_sub_group_size( 16 )) )
+void kernel
+traverse_aabbs_new_update_single_geo(
+ global struct BVHBase* bvh,
+ global char* vertices,
+ global GRL_RAYTRACING_GEOMETRY_DESC* geomDesc,
+ global struct RefitScratch* scratch,
+ const uint vertex_format
+ )
+{
+ varying uint lid = get_local_id(0);
+ varying uint tid = lid + get_group_id(0)*get_local_size(0);
+
+ local struct AABB3f children_AABBs[256];
+ local struct LeafTableEntry leafTable_local[256];
+ local uint num_fat_leaves;
+
+ if(vertex_format == VERTEX_FORMAT_R32G32B32_FLOAT)
+ traverse_aabbs_new_update_func(bvh, vertices, geomDesc, scratch, VERTEX_FORMAT_R32G32B32_FLOAT,
+ &children_AABBs[0], &num_fat_leaves, &leafTable_local[0], true);
+ else if(vertex_format == VERTEX_FORMAT_R32G32_FLOAT)
+ traverse_aabbs_new_update_func(bvh, vertices, geomDesc, scratch, VERTEX_FORMAT_R32G32_FLOAT,
+ &children_AABBs[0], &num_fat_leaves, &leafTable_local[0], true);
+ else if(vertex_format == VERTEX_FORMAT_R16G16B16A16_FLOAT)
+ traverse_aabbs_new_update_func(bvh, vertices, geomDesc, scratch, VERTEX_FORMAT_R16G16B16A16_FLOAT,
+ &children_AABBs[0], &num_fat_leaves, &leafTable_local[0], true);
+ else if(vertex_format == VERTEX_FORMAT_R16G16_FLOAT)
+ traverse_aabbs_new_update_func(bvh, vertices, geomDesc, scratch, VERTEX_FORMAT_R16G16_FLOAT,
+ &children_AABBs[0], &num_fat_leaves, &leafTable_local[0], true);
+ else if(vertex_format == VERTEX_FORMAT_R16G16B16A16_SNORM)
+ traverse_aabbs_new_update_func(bvh, vertices, geomDesc, scratch, VERTEX_FORMAT_R16G16B16A16_SNORM,
+ &children_AABBs[0], &num_fat_leaves, &leafTable_local[0], true);
+ else if(vertex_format == VERTEX_FORMAT_R16G16_SNORM)
+ traverse_aabbs_new_update_func(bvh, vertices, geomDesc, scratch, VERTEX_FORMAT_R16G16_SNORM,
+ &children_AABBs[0], &num_fat_leaves, &leafTable_local[0], true);
+ else if(vertex_format == VERTEX_FORMAT_R16G16B16A16_UNORM)
+ traverse_aabbs_new_update_func(bvh, vertices, geomDesc, scratch, VERTEX_FORMAT_R16G16B16A16_UNORM,
+ &children_AABBs[0], &num_fat_leaves, &leafTable_local[0], true);
+ else if(vertex_format == VERTEX_FORMAT_R16G16_UNORM)
+ traverse_aabbs_new_update_func(bvh, vertices, geomDesc, scratch, VERTEX_FORMAT_R16G16_UNORM,
+ &children_AABBs[0], &num_fat_leaves, &leafTable_local[0], true);
+ else if(vertex_format == VERTEX_FORMAT_R10G10B10A2_UNORM)
+ traverse_aabbs_new_update_func(bvh, vertices, geomDesc, scratch, VERTEX_FORMAT_R10G10B10A2_UNORM,
+ &children_AABBs[0], &num_fat_leaves, &leafTable_local[0], true);
+ else if(vertex_format == VERTEX_FORMAT_R8G8B8A8_UNORM)
+ traverse_aabbs_new_update_func(bvh, vertices, geomDesc, scratch, VERTEX_FORMAT_R8G8B8A8_UNORM,
+ &children_AABBs[0], &num_fat_leaves, &leafTable_local[0], true);
+ else if(vertex_format == VERTEX_FORMAT_R8G8_UNORM)
+ traverse_aabbs_new_update_func(bvh, vertices, geomDesc, scratch, VERTEX_FORMAT_R8G8_UNORM,
+ &children_AABBs[0], &num_fat_leaves, &leafTable_local[0], true);
+ else if(vertex_format == VERTEX_FORMAT_R8G8B8A8_SNORM)
+ traverse_aabbs_new_update_func(bvh, vertices, geomDesc, scratch, VERTEX_FORMAT_R8G8B8A8_SNORM,
+ &children_AABBs[0], &num_fat_leaves, &leafTable_local[0], true);
+ else if(vertex_format == VERTEX_FORMAT_R8G8_SNORM)
+ traverse_aabbs_new_update_func(bvh, vertices, geomDesc, scratch, VERTEX_FORMAT_R8G8_SNORM,
+ &children_AABBs[0], &num_fat_leaves, &leafTable_local[0], true);
+ else
+ traverse_aabbs_new_update_func(bvh, vertices, geomDesc, scratch, (uint)-1,
+ &children_AABBs[0], &num_fat_leaves, &leafTable_local[0], true);
+}
diff --git a/src/intel/vulkan/grl/gpu/atomic_update.grl b/src/intel/vulkan/grl/gpu/atomic_update.grl
new file mode 100644
index 00000000000..9e1d6923d4a
--- /dev/null
+++ b/src/intel/vulkan/grl/gpu/atomic_update.grl
@@ -0,0 +1,198 @@
+//
+// Copyright (C) 2009-2021 Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+//
+//
+
+module atomic_update;
+
+kernel_module atomic_update ("atomic_update.cl")
+{
+ links lsc_intrinsics;
+ kernel init_refit_scratch < kernelFunction = "init_refit_scratch" >;
+ kernel traverse_aabbs_quad < kernelFunction = "traverse_aabbs_quad" >;
+ kernel write_inner_nodes < kernelFunction = "write_inner_nodes" >;
+ kernel build_fatleaf_table < kernelFunction = "build_fatleaf_table" >;
+ kernel build_innernode_table < kernelFunction = "build_innernode_table" >;
+
+ kernel update_single_group_quads < kernelFunction = "update_single_group_quads" >;
+
+ kernel build_fatleaf_table_new_update < kernelFunction = "build_fatleaf_table_new_update" >;
+ kernel fixup_quad_table < kernelFunction = "fixup_quad_table" >;
+ kernel traverse_aabbs_new_update < kernelFunction = "traverse_aabbs_new_update" >;
+ kernel traverse_aabbs_new_update_single_geo < kernelFunction = "traverse_aabbs_new_update_single_geo" >;
+}
+
+import struct MKBuilderState "structs.grl";
+
+// this metakernel only initializes registers for use in a batching loop by "init_refit_scratch"
+metakernel init_refit_scratch_metakernel_registers()
+{
+ REG0.hi = 0;
+ REG1 = 3;
+ REG2 = 63;
+ REG3 = 4;
+ REG4 = 2;
+
+ DISPATCHDIM_Y = 1;
+ DISPATCHDIM_Z = 1;
+}
+
+metakernel init_refit_scratch( qword bvh_base, qword scratch)//, dword max_inner_nodes )
+{
+ REG0.lo = load_dword( bvh_base + 12 ); // TODO: DON'T HARDCODE!!
+ define C_3 REG1;
+ define C_63 REG2;
+ define C_4 REG3;
+ define C_2 REG4;
+
+ REG0 = REG0 - C_3; // nodedataCurr - fixed offset
+ REG0 = REG0 + C_63; // + 63
+ REG0 = REG0 >> C_4; // >> 4
+ REG0 = REG0 >> C_2; // >> 2 == >> 6 == /64
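+    // e.g. (illustrative) nodedataCurr = 259 gives (259 - 3 + 63) >> 6 = 4 dispatched groups of 64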
+
+ DISPATCHDIM_X = REG0.lo;
+
+ dispatch_indirect init_refit_scratch//( (max_inner_nodes+63)/64, 1, 1 )
+ args(bvh_base,scratch);
+
+}
+
+metakernel build_node_tables( qword bvh_base )
+{
+ REG0 = load_dword( bvh_base + 12 ); // TODO: DON'T HARDCODE!!
+ REG1 = 2;
+ REG2 = 63;
+ REG3 = 4;
+ REG4 = 3; // fixed offset... TODO: DON'T HARDCODE!!
+
+ REG0 = REG0 - REG4; // nodedataCurr - fixed offset
+ REG0 = REG0 + REG2; // + 63
+ REG0 = REG0 >> REG3; // >> 4
+ REG0 = REG0 >> REG1; // >> 2 == >> 6 == /64
+
+ DISPATCHDIM_X = REG0.lo;
+ DISPATCHDIM_Y = 1;
+ DISPATCHDIM_Z = 1;
+
+ dispatch_indirect build_fatleaf_table//( (max_inner_nodes+63)/64, 1, 1 )
+ args(bvh_base);
+ dispatch_indirect build_innernode_table//( (max_inner_nodes+63)/64, 1, 1 )
+ args(bvh_base);
+}
+
+metakernel build_node_tables_new_update( MKBuilderState state, qword bvh_base )
+{
+ REG0 = load_dword( bvh_base + 12 ); // TODO: DON'T HARDCODE!!
+ REG1 = 2;
+ REG2 = 63;
+ REG3 = 4;
+ REG4 = 3; // fixed offset... TODO: DON'T HARDCODE!!
+
+ REG0 = REG0 - REG4; // nodedataCurr - fixed offset
+ REG0 = REG0 + REG2; // + 63
+ REG0 = REG0 >> REG3; // >> 4
+ REG0 = REG0 >> REG1; // >> 2 == >> 6 == /64
+
+ DISPATCHDIM_X = REG0.lo;
+ DISPATCHDIM_Y = 1;
+ DISPATCHDIM_Z = 1;
+
+ dispatch_indirect build_fatleaf_table_new_update//( (max_inner_nodes+63)/64, 1, 1 )
+ args(state.build_globals, bvh_base);
+ dispatch_indirect build_innernode_table//( (max_inner_nodes+63)/64, 1, 1 )
+ args(bvh_base);
+}
+
+metakernel fixup_quad_table( qword bvh_base )
+{
+ dispatch fixup_quad_table(2,1,1)
+ args(bvh_base);
+}
+
+// this metakernel only initializes registers for use in a batching loop by "traverse_aabbs_quad" and "write_inner_nodes"
+metakernel init_traverse_aabbs_quad_and_write_inner_nodes()
+{
+ REG0.hi = 0;
+ REG1 = 1;
+ REG2 = 31;
+ REG3 = 4;
+ REG4 = 2;
+ REG5 = 7;
+ REG6 = 255;
+ DISPATCHDIM_Y = 1;
+ DISPATCHDIM_Z = 1;
+}
+
+metakernel traverse_aabbs_quad( qword bvh_base, qword scratch, qword geos)//, dword max_inner_nodes )
+{
+
+    REG0.lo = load_dword( bvh_base + 64 );  // TODO: Don't hardcode!
+ define C_1 REG1;
+ define C_31 REG2;
+ define C_4 REG3;
+
+ REG0 = REG0 + C_31; // + 31
+ REG0 = REG0 >> C_4; // >> 4
+ REG0 = REG0 >> C_1; // >> 1 == >> 5 == /32
+
+ DISPATCHDIM_X = REG0.lo;
+
+    dispatch_indirect traverse_aabbs_quad//( (max_inner_nodes+31)/32, 1, 1 )
+ args(bvh_base,scratch,geos);
+}
+
+metakernel write_inner_nodes( qword bvh_base, qword scratch )//, dword max_inner_nodes )
+{
+    REG0.lo = load_dword( bvh_base + 68 );  // TODO: Don't hardcode!
+ define C_1 REG1;
+ define C_2 REG4;
+ define C_7 REG5;
+
+ REG0 = REG0 + C_7; // + 7
+ REG0 = REG0 >> C_2; // >> 2
+ REG0 = REG0 >> C_1; // >> 1 ==> >> 3 (/8)
+ DISPATCHDIM_X = REG0.lo;
+
+ dispatch_indirect write_inner_nodes//( (max_inner_nodes+7)/8, 1, 1 )
+ args(bvh_base,scratch);
+}
+
+metakernel update_single_group_quads( qword bvh_base, qword geos, qword aabbs )
+{
+ dispatch update_single_group_quads(1,1,1) //( (max_inner_nodes+1)/2, 1, 1 )
+ args(bvh_base,geos,aabbs);
+}
+
+metakernel traverse_aabbs_new_update( qword bvh_base, qword geos, qword scratch )
+{
+ REG0.lo = load_dword( bvh_base + 84 ); // TODO: Don't hardcode!
+ define C_255 REG6;
+ define C_4 REG3;
+
+ REG0 = REG0 + C_255; // + 255
+ REG0 = REG0 >> C_4; // >> 4
+ REG0 = REG0 >> C_4; // >> 4 == >> 8 == /256
+
+ DISPATCHDIM_X = REG0.lo;
+
+ dispatch_indirect traverse_aabbs_new_update//( (max_inner_nodes+255)/256, 1, 1 )
+ args(bvh_base, geos, scratch);
+}
+
+metakernel traverse_aabbs_new_update_single_geo( qword bvh_base, qword vertices, qword geos, qword scratch, dword vertex_format )
+{
+ REG0.lo = load_dword( bvh_base + 84 ); // TODO: Don't hardcode!
+ define C_255 REG6;
+ define C_4 REG3;
+
+ REG0 = REG0 + C_255; // + 255
+ REG0 = REG0 >> C_4; // >> 4
+ REG0 = REG0 >> C_4; // >> 4 == >> 8 == /256
+
+ DISPATCHDIM_X = REG0.lo;
+
+ dispatch_indirect traverse_aabbs_new_update_single_geo//( (max_inner_nodes+255)/256, 1, 1 )
+ args(bvh_base, vertices, geos, scratch, vertex_format);
+} \ No newline at end of file
diff --git a/src/intel/vulkan/grl/gpu/binned_sah_shared.h b/src/intel/vulkan/grl/gpu/binned_sah_shared.h
new file mode 100644
index 00000000000..8b22f6612cd
--- /dev/null
+++ b/src/intel/vulkan/grl/gpu/binned_sah_shared.h
@@ -0,0 +1,265 @@
+//
+// Copyright (C) 2009-2021 Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+//
+//
+
+//
+// This file contains structure definitions shared by GRL OCL kernels and host code
+//
+
+#include "GRLGen12.h"
+#pragma once
+
+#define BFS_NUM_BINS 16
+#define BFS_NUM_VCONTEXTS 256
+#define BFS_MAX_DEPTH 32
+
+#define TRIVIAL_BUILD_THRESHOLD 6
+#define SINGLE_WG_BUILD_THRESHOLD 256
+
+#define QNODE_GLOBAL_ROOT_BUFFER_MIN_ENTRIES_NUM 16384
+
+
+typedef uchar vcontext_id_t;
+
+GRL_NAMESPACE_BEGIN(GRL)
+GRL_NAMESPACE_BEGIN(RTAS)
+GRL_NAMESPACE_BEGIN(GPUBVHBuilder)
+
+struct BFS_Split
+{
+ float sah;
+ int dim;
+ int pos;
+};
+
+
+struct BFS_BinInfo
+{
+ float min_max[18 * BFS_NUM_BINS]; // layout: bins[axis][num_bins][6]
+ // The 6 are lower(xyz) and -upper(xyz)
+ // bins use negated-max so that we can use vectorized mins instead of min/max pairs
+ uint counts[3 * BFS_NUM_BINS];
+};
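+
+// Indexing sketch for min_max (matches BinInfo_get_AABB in bvh_build_BFS.cl):
+// each (axis, bin) slot is six consecutive floats, the first three holding
+// lower.xyz and the next three holding -upper.xyz, so a single atomic_min can
+// update either end of the box:
+//
+//   float* slot = &bin_info->min_max[6 * (bin + axis * BFS_NUM_BINS)];
+//   float  lo_x =  slot[0];   // lower.x
+//   float  hi_x = -slot[3];   // upper.x, stored negated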
+
+enum_uint8(SAHBuildFlags)
+{
+ SAH_FLAG_NEED_BACKPOINTERS = 1, // identifies a mixed internal node where each child can have a different type
+ SAH_FLAG_NEED_MASKS = 2
+};
+
+struct SAHBuildGlobals
+{
+ qword p_primref_index_buffers;
+ qword p_primrefs_buffer;
+ qword p_bvh2;
+ qword p_globals; // TODO: deprecate this
+ qword p_bvh_base;
+ gpuva_t p_qnode_root_buffer;
+
+ dword flags; // bit 1 is 'alloc_backpointers'. bit 2 is 'need_masks'
+ dword num_primrefs;
+ dword leaf_size;
+ dword leaf_type;
+
+ dword root_buffer_num_produced;
+ dword root_buffer_num_produced_hi;
+ dword root_buffer_num_consumed;
+ dword root_buffer_num_consumed_hi;
+ dword root_buffer_num_to_consume;
+ dword root_buffer_num_to_consume_hi;
+};
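+
+// 'flags' carries the SAHBuildFlags bits defined above; a minimal sketch of how
+// the builder kernels test them (see SAHBuildGlobals_NeedBackPointers/NeedMasks
+// in bvh_build_BFS.cl):
+//
+//   bool need_backpointers = (globals->flags & SAH_FLAG_NEED_BACKPOINTERS) != 0;
+//   bool need_masks        = (globals->flags & SAH_FLAG_NEED_MASKS) != 0;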
+
+struct SAHBuildBuffersInfo
+{
+ gpuva_t p_globals;
+ gpuva_t p_primref_index_buffers;
+ gpuva_t p_primrefs_buffer;
+ gpuva_t p_bvh2;
+ gpuva_t p_bvh_base;
+ gpuva_t p_qnode_root_buffer;
+ dword sah_globals_flags;
+ dword _pad;
+ gpuva_t _pad2;
+};
+
+typedef union LRBounds
+{
+ struct
+ {
+ struct AABB3f left_centroid_bounds;
+ struct AABB3f left_geom_bounds;
+ struct AABB3f right_centroid_bounds;
+ struct AABB3f right_geom_bounds;
+ } boxes;
+ struct
+ {
+ float Array[24];
+ } scalars;
+} LRBounds;
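+
+// Note on the layout: the 'upper' fields are stored negated so that every
+// element of 'scalars.Array' can be merged with atomic_min alone.  A sketch of
+// reading a usable box back (mirrors the LRBounds_get_* helpers in
+// bvh_build_BFS.cl):
+//
+//   float3 lower =  AABB3f_load_lower(&b->boxes.left_geom_bounds);
+//   float3 upper = -AABB3f_load_upper(&b->boxes.left_geom_bounds);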
+
+
+struct VContext
+{
+ uint dispatch_primref_begin; // range of primrefs for this task
+ uint dispatch_primref_end;
+ uint bvh2_root; // BVH2 root node for this task
+ uint tree_depth; // depth of this node in the tree
+ uint num_left; // primref counts
+ uint num_right;
+ uint lr_mask; // lower 8b : left mask. upper 8b : right mask
+ uint batch_index;
+
+ // pass1 global working state and output
+ struct BFS_Split split;
+ struct BFS_BinInfo global_bin_info;
+
+ // pass2 global working state and output
+ LRBounds lr_bounds;
+};
+
+
+
+struct BFSDispatchRecord
+{
+ ushort batch_index;
+ ushort context_id;
+};
+
+
+struct BFSDispatchQueue
+{
+ uint num_dispatches;
+ uint wg_count[BFS_NUM_VCONTEXTS];
+ struct BFSDispatchRecord records[BFS_NUM_VCONTEXTS];
+};
+
+struct BFS1SpillStackEntry
+{
+ uint primref_begin;
+ uint primref_end;
+ uint bvh2_root;
+ ushort tree_depth;
+ ushort batch_index;
+};
+
+struct BFS1SpillStack
+{
+ uint size;
+ struct BFS1SpillStackEntry entries[BFS_NUM_VCONTEXTS * BFS_MAX_DEPTH];
+};
+
+struct QNodeGlobalRootBufferEntry
+{
+ uint bvh2_node;
+ uint qnode;
+ uint build_idx;
+ uint _pad;
+};
+
+struct QNodeGlobalRootBuffer
+{
+ uint curr_entries_offset; // we use "entries" as two buffers, so offset is either 0 or QNODE_GLOBAL_ROOT_BUFFER_MIN_ENTRIES_NUM
+ struct QNodeGlobalRootBufferEntry entries[QNODE_GLOBAL_ROOT_BUFFER_MIN_ENTRIES_NUM * 2];
+};
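+
+// Usage sketch (an assumption based on the comment above, not a definitive API):
+// 'entries' acts as two ping-pong halves, so switching halves is just toggling
+// the offset between 0 and QNODE_GLOBAL_ROOT_BUFFER_MIN_ENTRIES_NUM:
+//
+//   buf->curr_entries_offset = (buf->curr_entries_offset == 0)
+//       ? QNODE_GLOBAL_ROOT_BUFFER_MIN_ENTRIES_NUM : 0;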
+
+struct DFSDispatchRecord
+{
+ uint primref_base;
+ uint bvh2_base;
+ uint batch_index;
+ ushort num_primrefs;
+ ushort tree_depth;
+};
+
+
+struct DFSDispatchQueue
+{
+ struct DFSDispatchRecord records[BFS_NUM_VCONTEXTS * 2];
+};
+
+#define VCONTEXT_STATE_EXECUTING 0
+#define VCONTEXT_STATE_UNALLOCATED 1
+
+union SchedulerUnion
+{
+ struct VContextScheduler
+ {
+ /////////////////////////////////////////////////////////////
+ // State data used for communication with command streamer
+ // NOTE: This part must match definition in 'new_sah_builder.grl'
+ /////////////////////////////////////////////////////////////
+
+ dword num_bfs_wgs;
+ dword num_dfs_wgs;
+
+ dword scheduler_postsync;
+ dword _pad1;
+
+ dword num_trivial_builds; // number of trivial builds (#primrefs < leaf_size).
+ dword num_single_builds; // number of single-wg builds (#primrefs < threshold)
+
+ dword batched_build_wg_count; // number of wgs to dispatch for initial BFS pass
+ dword batched_build_loop_mask; // value is 0 if #builds <= #contexts, else 1. The command streamer uses this as a loop condition
+
+ /////////////////////////////////////////////////////////////
+
+ dword batched_build_count; // number of batched builds in the SAHBuildGlobals buffer
+ dword batched_build_offset; // location of the first batched-build in the SAHBuildGlobals buffer
+
+ dword vcontext_state[BFS_NUM_VCONTEXTS];
+
+ struct BFSDispatchQueue bfs_queue;
+ struct DFSDispatchQueue dfs_queue;
+
+ struct VContext contexts[BFS_NUM_VCONTEXTS];
+
+ struct BFS1SpillStack bfs2_spill_stack;
+ } vContextScheduler;
+
+ struct QnodeScheduler
+ {
+ dword num_qnode_grb_curr_entries;
+ dword num_qnode_grb_new_entries;
+
+ dword scheduler_postsync;
+ dword _pad1;
+
+ dword num_trivial_builds; // number of trivial builds (#primrefs < leaf_size).
+ dword num_single_builds; // number of single-wg builds (#primrefs < threshold)
+
+ dword batched_builds_to_process;
+ dword num_max_qnode_global_root_buffer_entries; // number of maximum entries to global root buffer
+
+ /////////////////////////////////////////////////////////////
+
+ dword batched_build_count; // number of batched builds in the SAHBuildGlobals buffer
+ dword batched_build_offset; // location of the first batched-build in the SAHBuildGlobals buffer
+
+ struct QNodeGlobalRootBuffer qnode_global_root_buffer;
+ } qnodeScheduler;
+};
+
+
+struct BVH2Node
+{
+ struct AABB3f box;
+ uint meta_u; // leaf: primref start. inner: offset from node to its first child
+ uint meta_ss;
+ //ushort meta_s; // leaf: primref count. inner: offset from first to second child, in nodes
+ //uchar is_inner; // 1 if inner, 0 if leaf
+ //uchar mask;
+};
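+
+// Bit layout of meta_ss as written by BVH2_WriteInnerNode / BVH2_WriteLeafNode
+// in bvh_build_BFS.cl (summary; the short fields below are commented out):
+//   bits  0..15 : leaf: primref count.  inner: offset from first to second child
+//   bit  16     : 1 for inner nodes, 0 for leaves
+//   bits 24..31 : traversal mask
+// e.g. an inner node is encoded as:
+//   meta_ss = 0x10000 + (child1_index - child0_index) + (mask << 24);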
+
+struct BVH2
+{
+ uint num_nodes;
+ uint _pad[7]; // align to 32B
+};
+
+
+GRL_NAMESPACE_END(GPUBVHBuilder)
+GRL_NAMESPACE_END(RTAS)
+GRL_NAMESPACE_END(GRL)
diff --git a/src/intel/vulkan/grl/gpu/build_leaf.grl b/src/intel/vulkan/grl/gpu/build_leaf.grl
new file mode 100644
index 00000000000..7b154d03b43
--- /dev/null
+++ b/src/intel/vulkan/grl/gpu/build_leaf.grl
@@ -0,0 +1,206 @@
+//
+// Copyright (C) 2009-2021 Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+//
+//
+
+module leaf_builder;
+
+kernel_module leaf_kernels ("bvh_build_leaf.cl")
+{
+ links lsc_intrinsics;
+
+ kernel opencl_kernel_primref_to_quads < kernelFunction="primref_to_quads" >;
+ kernel opencl_kernel_primref_to_procedurals < kernelFunction="primref_to_procedurals" >;
+ kernel opencl_kernel_create_HW_instance_nodes < kernelFunction="create_HW_instance_nodes" >;
+ kernel opencl_kernel_create_HW_instance_nodes_pointers < kernelFunction="create_HW_instance_nodes_pointers" >;
+}
+
+import struct MKBuilderState "structs.grl";
+import struct MKSizeEstimate "structs.grl";
+
+const Instances_GROUPSIZE = 16;
+
+metakernel buildLeafDXR_instances(
+ MKBuilderState state,
+ qword build_primref_index_buffers,
+ qword srcInstanceDescrArray,
+ dword stride,
+ dword offset,
+ dword numPrims)
+{
+ define num_groups (numPrims+Instances_GROUPSIZE-1)/Instances_GROUPSIZE;
+ dispatch opencl_kernel_create_HW_instance_nodes(num_groups,1,1) args(
+ state.build_globals,
+ build_primref_index_buffers,
+ state.build_primref_buffer,
+ state.bvh_buffer,
+ srcInstanceDescrArray,
+ stride,
+ offset);
+}
+
+metakernel buildLeafDXR_instances_indirect(
+ MKBuilderState state,
+ qword build_primref_index_buffers,
+ qword srcInstanceDescrArray,
+ qword indirectBuildRangeInfo,
+ dword stride,
+ dword offset)
+{
+ define num_groups REG0;
+ define groupsize_1 REG1; // groupsize - 1
+ define C_4 REG2;
+
+ // init with primitiveCount
+ num_groups = load_dword(indirectBuildRangeInfo);
+ groupsize_1 = 15; // Instances_GROUPSIZE - 1
+ C_4 = 4; // log_2(Instances_GROUPSIZE)
+
+ num_groups = num_groups + groupsize_1;
+ num_groups = num_groups >> C_4; // num_groups / Instances_GROUPSIZE;
+
+ DISPATCHDIM_X = num_groups.lo;
+ DISPATCHDIM_Y = 1;
+ DISPATCHDIM_Z = 1;
+
+ dispatch_indirect opencl_kernel_create_HW_instance_nodes args(
+ state.build_globals,
+ build_primref_index_buffers,
+ state.build_primref_buffer,
+ state.bvh_buffer,
+ srcInstanceDescrArray,
+ stride,
+ offset);
+}
+
+metakernel buildLeafDXR_instances_pointers(
+ MKBuilderState state,
+ qword build_primref_index_buffers,
+ qword srcInstanceDescrArrayPtr,
+ dword stride,
+ dword offset,
+ dword numPrims)
+{
+ define num_groups (numPrims+Instances_GROUPSIZE-1)/Instances_GROUPSIZE;
+ dispatch opencl_kernel_create_HW_instance_nodes_pointers(num_groups,1,1) args(
+ state.build_globals,
+ build_primref_index_buffers,
+ state.build_primref_buffer,
+ state.bvh_buffer,
+ srcInstanceDescrArrayPtr,
+ stride,
+ offset);
+}
+
+metakernel buildLeafDXR_instances_pointers_indirect(
+ MKBuilderState state,
+ qword build_primref_index_buffers,
+ qword srcInstanceDescrArrayPtr,
+ qword indirectBuildRangeInfo,
+ dword stride,
+ dword offset)
+{
+ define num_groups REG0;
+ define groupsize_1 REG1; // groupsize - 1
+ define C_4 REG2;
+
+ // init with primitiveCount
+ num_groups = load_dword(indirectBuildRangeInfo);
+ groupsize_1 = 15; // Instances_GROUPSIZE - 1
+ C_4 = 4; // log_2(Instances_GROUPSIZE)
+
+ num_groups = num_groups + groupsize_1;
+ num_groups = num_groups >> C_4; // num_groups / Instances_GROUPSIZE;
+
+ DISPATCHDIM_X = num_groups.lo;
+ DISPATCHDIM_Y = 1;
+ DISPATCHDIM_Z = 1;
+
+ dispatch_indirect opencl_kernel_create_HW_instance_nodes_pointers args(
+ state.build_globals,
+ build_primref_index_buffers,
+ state.build_primref_buffer,
+ state.bvh_buffer,
+ srcInstanceDescrArrayPtr,
+ stride,
+ offset);
+}
+
+metakernel buildLeafDXR_procedurals(
+ MKBuilderState state,
+ qword build_primref_index_buffers,
+ dword stride,
+ dword offset,
+ qword p_numPrimitives)
+{
+ define C_1 REG0;
+ define REG_PRIMS_PER_WG REG1;
+ define REG_PRIMS_PER_WG_SHR REG2;
+
+ C_1 = 1;
+ REG_PRIMS_PER_WG = 16;
+ REG_PRIMS_PER_WG_SHR = 4;// We cannot use div, so we use shift right instead (shift by 4 = div by 16 elements)
+
+ define reg_numPrimitives REG3;
+ define reg_num_wgs REG4;
+
+ reg_numPrimitives = load_dword(p_numPrimitives);
+ reg_num_wgs = reg_numPrimitives + REG_PRIMS_PER_WG;
+ reg_num_wgs = reg_num_wgs - C_1;
+ reg_num_wgs = reg_num_wgs >> REG_PRIMS_PER_WG_SHR;
+
+ DISPATCHDIM_X = reg_num_wgs;
+ DISPATCHDIM_Y = 1;
+ DISPATCHDIM_Z = 1;
+
+ dispatch_indirect opencl_kernel_primref_to_procedurals args(
+ state.build_globals,
+ state.build_primref_buffer,
+ build_primref_index_buffers,
+ state.bvh_buffer,
+ state.geomDesc_buffer,
+ stride,
+ offset);
+}
+
+metakernel buildLeafDXR_quads(
+ MKBuilderState state,
+ qword build_primref_index_buffers,
+ dword stride,
+ dword offset,
+ qword p_numPrimitives,
+ dword allow_update)
+{
+ define C_1 REG0;
+ define REG_PRIMS_PER_WG REG1;
+ define SHIFT REG2;
+
+ C_1 = 1;
+ REG_PRIMS_PER_WG = 32;
+ SHIFT = 4;// We cannot use div, so we use shift right instead (shift by 4, then by 1 below = div by 32 elements)
+
+ define reg_numPrimitives REG3;
+ define reg_num_wgs REG4;
+
+ reg_numPrimitives = load_dword(p_numPrimitives);
+ reg_num_wgs = reg_numPrimitives + REG_PRIMS_PER_WG;
+ reg_num_wgs = reg_num_wgs - C_1;
+ reg_num_wgs = reg_num_wgs >> SHIFT;
+ reg_num_wgs = reg_num_wgs >> C_1;
+
+ DISPATCHDIM_X = reg_num_wgs;
+ DISPATCHDIM_Y = 1;
+ DISPATCHDIM_Z = 1;
+
+ dispatch_indirect opencl_kernel_primref_to_quads args(
+ state.build_globals,
+ state.build_primref_buffer,
+ build_primref_index_buffers,
+ state.bvh_buffer,
+ state.geomDesc_buffer,
+ stride,
+ offset,
+ allow_update);
+}
diff --git a/src/intel/vulkan/grl/gpu/build_primref.grl b/src/intel/vulkan/grl/gpu/build_primref.grl
new file mode 100644
index 00000000000..33728bd01f6
--- /dev/null
+++ b/src/intel/vulkan/grl/gpu/build_primref.grl
@@ -0,0 +1,229 @@
+//
+// Copyright (C) 2009-2021 Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+//
+//
+
+module build_primref;
+
+kernel_module primref_kernels ("bvh_build_primref.cl")
+{
+ links lsc_intrinsics;
+
+ kernel opencl_kernel_primrefs_from_DXR_instances < kernelFunction="primrefs_from_DXR_instances" >;
+ kernel opencl_kernel_primrefs_from_DXR_instances_indirect < kernelFunction="primrefs_from_DXR_instances_indirect" >;
+ kernel opencl_kernel_primrefs_from_DXR_instances_pointers < kernelFunction="primrefs_from_DXR_instances_pointers" >;
+ kernel opencl_kernel_primrefs_from_DXR_instances_pointers_indirect < kernelFunction="primrefs_from_DXR_instances_pointers_indirect" >;
+
+ kernel opencl_kernel_triangles_to_primrefs < kernelFunction="triangles_to_primrefs" >;
+ kernel opencl_kernel_triangles_to_primrefs_indirect < kernelFunction="triangles_to_primrefs_indirect" >;
+ kernel opencl_kernel_procedurals_to_primrefs < kernelFunction="procedurals_to_primrefs" >;
+ kernel opencl_kernel_procedurals_to_primrefs_indirect < kernelFunction="procedurals_to_primrefs_indirect" >;
+}
+
+import struct MKBuilderState "structs.grl";
+import struct MKSizeEstimate "structs.grl";
+
+
+const PrimirefsFromInstances_GROUPSIZE = 16;
+
+metakernel buildPrimirefsFromInstances(
+ qword instanceDescBuff,
+ MKSizeEstimate estimate,
+ MKBuilderState build_state,
+ dword allowUpdate)
+{
+ define num_groups ((estimate.numPrimitives + PrimirefsFromInstances_GROUPSIZE-1)/PrimirefsFromInstances_GROUPSIZE);
+ dispatch opencl_kernel_primrefs_from_DXR_instances(num_groups,1,1) args(
+ build_state.build_globals,
+ build_state.bvh_buffer,
+ instanceDescBuff,
+ estimate.numPrimitives,
+ build_state.build_primref_buffer,
+ allowUpdate);
+}
+
+metakernel buildPrimirefsFromInstancesIndirect(
+ qword instanceDescBuff,
+ qword indirectBuildRangeInfo,
+ MKBuilderState build_state,
+ dword allowUpdate)
+{
+ define num_groups REG0;
+ define groupsize_1 REG1; // groupsize - 1
+ define C_4 REG2;
+
+ // init with primitiveCount
+ num_groups = load_dword(indirectBuildRangeInfo);
+ groupsize_1 = 15; // PrimirefsFromInstances_GROUPSIZE - 1
+ C_4 = 4; // log_2(PrimirefsFromInstances_GROUPSIZE)
+
+ num_groups = num_groups + groupsize_1;
+ num_groups = num_groups >> C_4; // num_groups / PrimirefsFromInstances_GROUPSIZE;
+
+ DISPATCHDIM_X = num_groups.lo;
+ DISPATCHDIM_Y = 1;
+ DISPATCHDIM_Z = 1;
+
+ dispatch_indirect opencl_kernel_primrefs_from_DXR_instances_indirect args(
+ build_state.build_globals,
+ build_state.bvh_buffer,
+ instanceDescBuff,
+ indirectBuildRangeInfo,
+ build_state.build_primref_buffer,
+ allowUpdate);
+}
+
+metakernel buildPrimirefsFromInstancesArrOfPtrs(
+ qword instanceDescPtrArrayBuff,
+ MKSizeEstimate estimate,
+ MKBuilderState build_state,
+ dword allowUpdate)
+{
+ define num_groups ((estimate.numPrimitives + PrimirefsFromInstances_GROUPSIZE-1)/PrimirefsFromInstances_GROUPSIZE);
+ dispatch opencl_kernel_primrefs_from_DXR_instances_pointers(num_groups,1,1) args(
+ build_state.build_globals,
+ build_state.bvh_buffer,
+ instanceDescPtrArrayBuff,
+ estimate.numPrimitives,
+ build_state.build_primref_buffer,
+ allowUpdate);
+}
+
+metakernel buildPrimirefsFromInstancesArrOfPtrsIndirect(
+ qword instanceDescPtrArrayBuff,
+ qword indirectBuildRangeInfo,
+ MKSizeEstimate estimate,
+ MKBuilderState build_state,
+ dword allowUpdate)
+{
+ define num_groups REG0;
+ define groupsize_1 REG1; // groupsize - 1
+ define C_4 REG2;
+
+ // init with primitiveCount
+ num_groups = load_dword(indirectBuildRangeInfo);
+ groupsize_1 = 15; // PrimirefsFromInstances_GROUPSIZE - 1
+ C_4 = 4; // log_2(PrimirefsFromInstances_GROUPSIZE)
+
+ num_groups = num_groups + groupsize_1;
+ num_groups = num_groups >> C_4; // num_groups / PrimirefsFromInstances_GROUPSIZE;
+
+ DISPATCHDIM_X = num_groups.lo;
+ DISPATCHDIM_Y = 1;
+ DISPATCHDIM_Z = 1;
+
+ dispatch_indirect opencl_kernel_primrefs_from_DXR_instances_pointers_indirect args(
+ build_state.build_globals,
+ build_state.bvh_buffer,
+ instanceDescPtrArrayBuff,
+ build_state.build_primref_buffer,
+ indirectBuildRangeInfo,
+ allowUpdate);
+}
+
+
+
+
+metakernel primrefs_from_tris(
+ MKBuilderState build_state,
+ MKSizeEstimate estimate,
+ qword geo_ptr,
+ dword geom_id,
+ dword geom_flags,
+ dword num_prims)
+{
+ define num_threads ((num_prims+15)/16);
+ dispatch opencl_kernel_triangles_to_primrefs(num_threads,1,1) args(
+ build_state.build_globals,
+ build_state.bvh_buffer,
+ build_state.build_primref_buffer,
+ geo_ptr,
+ (geom_id & 0x00ffffff) + (geom_flags<<24),
+ num_prims);
+}
+
+metakernel primrefs_from_tris_indirect(
+ MKBuilderState build_state,
+ MKSizeEstimate estimate,
+ qword geo_ptr,
+ qword indirectBuildRangeInfo,
+ dword geom_id,
+ dword geom_flags)
+{
+ define num_groups REG0;
+ define groupsize_1 REG1; // groupsize - 1
+ define C_4 REG2;
+
+ // init with primitiveCount
+ num_groups = load_dword(indirectBuildRangeInfo);
+ groupsize_1 = 15; // PrimirefsFromInstances_GROUPSIZE - 1
+ C_4 = 4; // log_2(PrimirefsFromInstances_GROUPSIZE)
+
+ num_groups = num_groups + groupsize_1;
+ num_groups = num_groups >> C_4; // num_groups / PrimirefsFromInstances_GROUPSIZE;
+
+ DISPATCHDIM_X = num_groups.lo;
+ DISPATCHDIM_Y = 1;
+ DISPATCHDIM_Z = 1;
+
+ dispatch_indirect opencl_kernel_triangles_to_primrefs_indirect args(
+ build_state.build_globals,
+ build_state.bvh_buffer,
+ build_state.build_primref_buffer,
+ geo_ptr,
+ indirectBuildRangeInfo,
+ (geom_id & 0x00ffffff) + (geom_flags << 24));
+}
+
+metakernel primrefs_from_proc(
+ MKBuilderState build_state,
+ MKSizeEstimate estimate,
+ qword geo_ptr,
+ dword geom_id,
+ dword geom_flags,
+ dword num_prims)
+{
+ define num_threads ((num_prims+15)/16);
+ dispatch opencl_kernel_procedurals_to_primrefs(num_threads,1,1) args(
+ build_state.build_globals,
+ build_state.bvh_buffer,
+ build_state.build_primref_buffer,
+ geo_ptr,
+ (geom_id & 0x00ffffff) + (geom_flags<<24),
+ num_prims);
+}
+
+metakernel primrefs_from_proc_indirect(
+ MKBuilderState build_state,
+ MKSizeEstimate estimate,
+ qword geo_ptr,
+ qword indirectBuildRangeInfo,
+ dword geom_id,
+ dword geom_flags)
+{
+ define num_groups REG0;
+ define groupsize_1 REG1; // groupsize - 1
+ define C_4 REG2;
+
+ // init with primitiveCount
+ num_groups = load_dword(indirectBuildRangeInfo);
+ groupsize_1 = 15; // PrimirefsFromInstances_GROUPSIZE - 1
+ C_4 = 4; // log_2(PrimirefsFromInstances_GROUPSIZE)
+
+ num_groups = num_groups + groupsize_1;
+ num_groups = num_groups >> C_4; // num_groups / PrimirefsFromInstances_GROUPSIZE;
+
+ DISPATCHDIM_X = num_groups.lo;
+ DISPATCHDIM_Y = 1;
+ DISPATCHDIM_Z = 1;
+
+ dispatch_indirect opencl_kernel_procedurals_to_primrefs_indirect args(
+ build_state.build_globals,
+ build_state.bvh_buffer,
+ build_state.build_primref_buffer,
+ geo_ptr,
+ indirectBuildRangeInfo,
+ (geom_id & 0x00ffffff) + (geom_flags<<24));
+}
diff --git a/src/intel/vulkan/grl/gpu/build_refit.grl b/src/intel/vulkan/grl/gpu/build_refit.grl
new file mode 100644
index 00000000000..46d6e76add2
--- /dev/null
+++ b/src/intel/vulkan/grl/gpu/build_refit.grl
@@ -0,0 +1,324 @@
+//
+// Copyright (C) 2009-2021 Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+//
+//
+
+module build_refit;
+
+kernel_module morton_kernels ("bvh_build_refit.cl")
+{
+ links lsc_intrinsics;
+
+ kernel update_instance_leaves < kernelFunction="update_instance_leaves" >;
+ kernel refit_indirect_sg < kernelFunction="Refit_indirect_sg" >;
+ kernel update_instance_leaves_indirect < kernelFunction="update_instance_leaves_indirect" >;
+
+
+}
+
+const INSTANCE_LEAF_GROUP_SIZE = 16;
+const REFIT_GROUP_SIZE = 8;
+
+metakernel update_instance_leaves(
+ qword bvh,
+ qword dxrInstancesArray,
+ qword dxrInstancesPtrArray,
+ qword instance_leaf_aabbs,
+ dword num_instances )
+{
+ define num_groups (num_instances + INSTANCE_LEAF_GROUP_SIZE - 1) / INSTANCE_LEAF_GROUP_SIZE;
+
+ dispatch update_instance_leaves(num_groups, 1, 1) args(
+ bvh,
+ dxrInstancesArray,
+ dxrInstancesPtrArray,
+ instance_leaf_aabbs);
+}
+
+metakernel update_instance_leaves_indirect(
+ qword bvh,
+ qword dxrInstancesArray,
+ qword dxrInstancesPtrArray,
+ qword instance_leaf_aabbs,
+ qword indirectBuildRangeInfo)
+{
+ define num_groups REG0;
+ define groupsize_1 REG1; // groupsize - 1
+ define C_4 REG2;
+
+ // init with primitiveCount
+ num_groups = load_dword(indirectBuildRangeInfo);
+ groupsize_1 = 15; // INSTANCE_LEAF_GROUP_SIZE - 1
+ C_4 = 4; // log_2(INSTANCE_LEAF_GROUP_SIZE)
+
+ num_groups = num_groups + groupsize_1;
+ num_groups = num_groups >> C_4; // num_groups / INSTANCE_LEAF_GROUP_SIZE;
+
+ DISPATCHDIM_X = num_groups.lo;
+ DISPATCHDIM_Y = 1;
+ DISPATCHDIM_Z = 1;
+
+ dispatch_indirect update_instance_leaves_indirect args(
+ bvh,
+ dxrInstancesArray,
+ dxrInstancesPtrArray,
+ instance_leaf_aabbs,
+ indirectBuildRangeInfo);
+}
+
+/*
+metakernel refit(
+ qword bvh,
+ qword geomDesc,
+ qword instance_aabbs,
+ dword dispatchSize )
+{
+ define num_groups (dispatchSize + REFIT_GROUP_SIZE - 1) / REFIT_GROUP_SIZE;
+
+ dispatch refit(num_groups, 1, 1) args(
+ bvh,
+ geomDesc,
+ instance_aabbs);
+}
+
+const REFIT_SIMD_SIZE = 8;
+const REFIT_SIMD_SIZE_SHIFT = 3;
+
+metakernel refit_indirect(
+ qword bvh,
+ qword bvh_inner_nodes_start_value,
+ qword bvh_inner_nodes_end,
+ qword geomDesc,
+ qword instance_aabbs )
+{
+ define cRoundingSIMD REG4;
+ define TWO REG3;
+ define ONE REG5;
+ cRoundingSIMD = (REFIT_SIMD_SIZE - 1);
+
+ TWO = 2;
+ ONE = 1;
+
+ REG0 = bvh_inner_nodes_start_value;
+ REG1 = load_dword(bvh_inner_nodes_end);
+ REG1.hi = 0;
+ REG2 = REG1 - REG0;
+ REG2 = REG2 + cRoundingSIMD;
+ REG2 = REG2 >> TWO; // JDB: >>3 must be implemented as >>2 then >>1 because command streamer
+ REG2 = REG2 >> ONE; // only supports pow2 shifts because somebody wanted to save area.
+
+ DISPATCHDIM_X = REG2.lo;
+ DISPATCHDIM_Y = 1;
+ DISPATCHDIM_Z = 1;
+
+ dispatch_indirect refit_indirect args(
+ bvh,
+ geomDesc,
+ instance_aabbs);
+
+}
+*/
+
+metakernel refit_indirect_sg(
+ qword bvh,
+ qword bvh_inner_nodes_start_value,
+ qword bvh_inner_nodes_end,
+ qword geomDesc,
+ qword instance_aabbs )
+{
+
+ REG0 = bvh_inner_nodes_start_value;
+ REG1.lo = load_dword(bvh_inner_nodes_end);
+ REG1.hi = 0;
+ REG2 = REG1 - REG0;
+
+ DISPATCHDIM_X = REG2.lo;
+ DISPATCHDIM_Y = 1;
+ DISPATCHDIM_Z = 1;
+
+ dispatch_indirect refit_indirect_sg args(
+ bvh,
+ geomDesc,
+ instance_aabbs);
+
+}
+/*
+////////////////////////////////////////////////////////////////
+// constructing treelets
+// phase 1: mark nodes that will be roots of bottom treelets
+// also for each node leave a number of startpoints that are under it and max depth of the path from the node
+metakernel find_refit_treelets(
+ qword bvh,
+ qword treelet_node_data,
+ qword scratch_startpoints,
+ qword startpointAlloc,
+ qword bvh_inner_nodes_start_value,
+ qword bvh_inner_nodes_end )
+{
+ define cRoundingSIMD REG4;
+ define TWO REG3;
+ define ONE REG5;
+ cRoundingSIMD = (REFIT_SIMD_SIZE - 1);
+
+ TWO = 2;
+ ONE = 1;
+
+ REG0 = bvh_inner_nodes_start_value;
+ REG1.lo = load_dword(bvh_inner_nodes_end);
+ REG1.hi = 0;
+ REG2 = REG1 - REG0;
+ REG2 = REG2 + cRoundingSIMD;
+ REG2 = REG2 >> TWO; // JDB: >>3 must be implemented as >>2 then >>1 because command streamer
+ REG2 = REG2 >> ONE; // only supports pow2 shifts because somebody wanted to save area.
+
+ DISPATCHDIM_X = REG2.lo;
+ DISPATCHDIM_Y = 1;
+ DISPATCHDIM_Z = 1;
+
+ dispatch_indirect find_refit_treelets args(
+ bvh,
+ treelet_node_data,
+ scratch_startpoints,
+ startpointAlloc);
+}
+
+
+////////////////////////////////////////////////////////////////
+// constructing treelets
+// phase 2 totally parallel, run threads up to assign startpoints to given treelet
+//
+metakernel assign_refit_startpoints_to_treelets(
+ qword bvh,
+ qword treelet_node_data,
+ qword scratch_startpoints,
+ qword bvh_inner_nodes_start_value,
+ qword bvh_inner_nodes_end )
+{
+ define cRoundingSIMD REG4;
+ define TWO REG3;
+ define ONE REG5;
+ cRoundingSIMD = (REFIT_SIMD_SIZE - 1);
+
+ TWO = 2;
+ ONE = 1;
+
+ REG0 = bvh_inner_nodes_start_value;
+ REG1.lo = load_dword(bvh_inner_nodes_end);
+ REG1.hi = 0;
+ REG2 = REG1 - REG0;
+ REG2 = REG2 + cRoundingSIMD;
+ REG2 = REG2 >> TWO; // JDB: >>3 must be implemented as >>2 then >>1 because command streamer
+ REG2 = REG2 >> ONE; // only supports pow2 shifts because somebody wanted to save area.
+
+ DISPATCHDIM_X = REG2.lo;
+ DISPATCHDIM_Y = 1;
+ DISPATCHDIM_Z = 1;
+
+ dispatch_indirect assign_refit_startpoints_to_treelets args(
+ bvh,
+ treelet_node_data,
+ scratch_startpoints);
+}
+
+
+////////////////////////////////////////////////////////////////
+// constructing treelets
+// phase 3 local work: group per treelet, sort the startpoints in treelets by length of the path
+metakernel finalize_treelets_in_groups(
+ qword bvh,
+ qword scratch_startpoints,
+ qword ptrNumTreelets )
+{
+ REG0 = load_qword(ptrNumTreelets);
+
+ DISPATCHDIM_X = REG0.lo;
+ DISPATCHDIM_Y = 1;
+ DISPATCHDIM_Z = 1;
+
+ dispatch_indirect finalize_treelets_in_groups args(
+ bvh,
+ scratch_startpoints);
+}
+
+
+////////////////////////////////////////////////////////////////
+// Updating treelets
+// phase 1 update vertex and generate boxes for vertices
+//
+
+const PER_GROUP_ELEMENTS_ROUNDING = 15;
+const PER_GROUP_ELEMENTS_SHIFT = 4;
+
+metakernel init_treelets_refit(qword pSquashGroupsCountToReset)
+{
+ REG1 = 0;
+ store_qword(pSquashGroupsCountToReset, REG1);
+ DISPATCHDIM_Y = 1;
+ DISPATCHDIM_Z = 1;
+ //REG4 = PER_GROUP_ELEMENTS_SHIFT;
+ //REG5.hi = PER_GROUP_ELEMENTS_ROUNDING;
+ //REG5.lo = 0;
+}
+
+metakernel update_quads(
+ qword scratch_box,
+ qword bvh,
+ qword input,
+ dword numPrimsDividedBy32,
+ qword bigSquashInput)
+{
+ //REG0 = load_qword(quads_nodes_begin_end_pair);
+ //REG1.hi = REG0.lo; // this holds inner nodes begin
+ //REG2 = REG0 - REG1;
+ //REG2 = REG2 + REG5;
+ //REG2 = REG2 >> REG4;
+ //DISPATCHDIM_X = REG2.hi;
+
+ dispatch refit_quads(numPrimsDividedBy32, 1, 1) args(
+ bvh,
+ input,
+ scratch_box,
+ numPrimsDividedBy32,
+ bigSquashInput );
+}
+
+//
+////////////////////////////////////////////////////////////////
+
+
+////////////////////////////////////////////////////////////////
+//
+// phase 1 or 2 - update primitives as well as bottom up refit internal nodes
+// in single dispatch (in single group per tree)
+metakernel refit_tree_by_group_including_quads(
+ qword squashed_inputs,
+ dword numBuilds
+)
+{
+ dispatch refit_tree_per_group(numBuilds, 1, 1) args(
+ squashed_inputs);
+}
+//
+////////////////////////////////////////////////////////////////
+
+
+////////////////////////////////////////////////////////////////
+//
+// phase 2 bottom up refit internal nodes
+//
+metakernel refit_treelet_per_group(
+ qword bigSquashInput,
+ qword ptrNumTreelets)
+{
+ DISPATCHDIM_X = load_dword(ptrNumTreelets);
+
+ dispatch_indirect refit_treelet_per_group args(
+ bigSquashInput);
+}
+//
+////////////////////////////////////////////////////////////////
+
+#endif
+*/
diff --git a/src/intel/vulkan/grl/gpu/bvh_build_BFS.cl b/src/intel/vulkan/grl/gpu/bvh_build_BFS.cl
new file mode 100644
index 00000000000..d72f192056e
--- /dev/null
+++ b/src/intel/vulkan/grl/gpu/bvh_build_BFS.cl
@@ -0,0 +1,4823 @@
+//
+// Copyright (C) 2009-2021 Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+//
+//
+
+#include "binned_sah_shared.h"
+
+#include "libs/lsc_intrinsics.h"
+#include "intrinsics.h"
+#include "AABB.h"
+#include "AABB3f.h"
+
+#include "qbvh6.h"
+#include "common.h"
+
+#include "libs/lsc_intrinsics.h"
+
+#define SGPRINT_16x(prefix,fmt,type,val) {\
+ type v0 = sub_group_broadcast( val, 0 );\
+ type v1 = sub_group_broadcast( val, 1 );\
+ type v2 = sub_group_broadcast( val, 2 );\
+ type v3 = sub_group_broadcast( val, 3 );\
+ type v4 = sub_group_broadcast( val, 4 );\
+ type v5 = sub_group_broadcast( val, 5 );\
+ type v6 = sub_group_broadcast( val, 6 );\
+ type v7 = sub_group_broadcast( val, 7 );\
+ type v8 = sub_group_broadcast( val, 8 );\
+ type v9 = sub_group_broadcast( val, 9 );\
+ type v10 = sub_group_broadcast( val, 10 );\
+ type v11 = sub_group_broadcast( val, 11 );\
+ type v12 = sub_group_broadcast( val, 12 );\
+ type v13 = sub_group_broadcast( val, 13 );\
+ type v14 = sub_group_broadcast( val, 14 );\
+ type v15 = sub_group_broadcast( val, 15 );\
+ sub_group_barrier(CLK_LOCAL_MEM_FENCE); \
+ if( get_sub_group_local_id() == 0 ) { \
+ printf(prefix fmt fmt fmt fmt fmt fmt fmt fmt \
+ fmt fmt fmt fmt fmt fmt fmt fmt"\n" , \
+ v0,v1,v2,v3,v4,v5,v6,v7,v8,v9,v10,v11,v12,v13,v14,v15);}}
+
+
+#define SGPRINT_6x(prefix,fmt,type,val) {\
+ type v0 = sub_group_broadcast( val, 0 );\
+ type v1 = sub_group_broadcast( val, 1 );\
+ type v2 = sub_group_broadcast( val, 2 );\
+ type v3 = sub_group_broadcast( val, 3 );\
+ type v4 = sub_group_broadcast( val, 4 );\
+ type v5 = sub_group_broadcast( val, 5 );\
+ sub_group_barrier(CLK_LOCAL_MEM_FENCE); \
+ if( get_sub_group_local_id() == 0 ) { \
+ printf(prefix fmt fmt fmt fmt fmt fmt "\n" , \
+ v0,v1,v2,v3,v4,v5);}}
+
+#define BFS_WG_SIZE 512
+
+#define BFS_NUM_VCONTEXTS 256 // must be multiple of 64
+
+#define TREE_ARITY 6
+
+#define DFS_WG_SIZE 256
+#define DFS_THRESHOLD 256
+
+
+void BFSDispatchQueue_print(struct BFSDispatchQueue* q, uint n)
+{
+ for (uint i = 0; i < q->num_dispatches; i++)
+ printf(" %u,ctx=%u,batch=%u\n", q->wg_count[i], q->records[i].context_id, q->records[i].batch_index);
+}
+
+void VContextScheduler_print(struct VContextScheduler* scheduler)
+{
+ if (get_local_id(0) == 0)
+ {
+ printf("SCHEDULER:\n");
+ printf(" bfs=%u dfs=%u\n", scheduler->num_bfs_wgs, scheduler->num_dfs_wgs);
+
+ printf("BFS QUEUE:\n");
+ BFSDispatchQueue_print(&scheduler->bfs_queue, scheduler->num_bfs_wgs);
+
+
+ printf("DFS QUEUE\n");
+ for (uint i = 0; i < scheduler->num_dfs_wgs; i++)
+ {
+ struct DFSDispatchRecord* r = &scheduler->dfs_queue.records[i];
+ printf(" (%u-%u) root=%u depth=%u batch_index=%u\n",
+ r->primref_base, r->primref_base + r->num_primrefs,
+ r->bvh2_base, r->tree_depth, r->batch_index);
+ }
+
+ printf("CONTEXTS:\n");
+ for (uint i = 0; i < BFS_NUM_VCONTEXTS; i++)
+ {
+ if (scheduler->vcontext_state[i] != VCONTEXT_STATE_UNALLOCATED)
+ {
+ printf(" context: %u state=%u\n", i, scheduler->vcontext_state[i]);
+ printf(" prims: %u-%u\n", scheduler->contexts[i].dispatch_primref_begin, scheduler->contexts[i].dispatch_primref_end);
+ printf(" depth: %u\n", scheduler->contexts[i].tree_depth);
+ printf(" root: %u\n", scheduler->contexts[i].bvh2_root);
+ printf(" batch: %u\n", scheduler->contexts[i].batch_index);
+ }
+ }
+
+
+
+ }
+
+}
+
+
+inline float3 select_min(float3 v, bool mask)
+{
+ return (float3)(mask ? v.x : (float)(INFINITY),
+ mask ? v.y : (float)(INFINITY),
+ mask ? v.z : (float)(INFINITY));
+}
+inline float3 select_max(float3 v, bool mask)
+{
+ return (float3)(mask ? v.x : -(float)(INFINITY),
+ mask ? v.y : -(float)(INFINITY),
+ mask ? v.z : -(float)(INFINITY));
+}
+
+///////////////////////////////////////////////////////////////////////////
+
+// The 'LRBounds' structure uses negated-max to allow
+// both atomic_min and atomic_max to be issued fused into one message
+
+struct AABB3f LRBounds_get_left_centroid( LRBounds* b )
+{
+ struct AABB3f* pbox = &b->boxes.left_centroid_bounds;
+ return AABB3f_construct( AABB3f_load_lower(pbox), -AABB3f_load_upper(pbox) );
+}
+struct AABB3f LRBounds_get_right_centroid( LRBounds* b )
+{
+ struct AABB3f* pbox = &b->boxes.right_centroid_bounds;
+ return AABB3f_construct( AABB3f_load_lower(pbox), -AABB3f_load_upper(pbox) );
+}
+struct AABB3f LRBounds_get_left_geom( LRBounds* b )
+{
+ struct AABB3f* pbox = &b->boxes.left_geom_bounds;
+ return AABB3f_construct( AABB3f_load_lower(pbox), -AABB3f_load_upper(pbox) );
+}
+struct AABB3f LRBounds_get_right_geom( LRBounds* b )
+{
+ struct AABB3f* pbox = &b->boxes.right_geom_bounds;
+ return AABB3f_construct( AABB3f_load_lower(pbox), -AABB3f_load_upper(pbox) );
+}
+
+
+void LRBounds_merge_left( local LRBounds* b, float3 CMin, float3 CMax, float3 GMin, float3 GMax )
+{
+ // All of the input vectors have come from sub-group reductions and are thus uniform
+ // Using atomic_min calls as below results in IGC generating 12 atomic_min messages and a large stack of movs
+ // The code below should result in 1 atomic_min message and a similarly large stack of movs
+
+ float mergeVal0 = INFINITY;
+ float mergeVal1 = INFINITY;
+ uint i = get_sub_group_local_id();
+
+ // insert the various merge values into one register
+ // We use two parallel variables here to enable some ILP
+
+ uint imod = (i>=6) ? (i-6) : i;
+ mergeVal0 = (imod==0) ? CMin.x : mergeVal0;
+ mergeVal1 = (imod==0) ? GMin.x : mergeVal1;
+
+ mergeVal0 = (imod==1) ? CMin.y : mergeVal0;
+ mergeVal1 = (imod==1) ? GMin.y : mergeVal1;
+
+ mergeVal0 = (imod==2) ? CMin.z : mergeVal0;
+ mergeVal1 = (imod==2) ? GMin.z : mergeVal1;
+
+ mergeVal0 = (imod==3) ? -CMax.x : mergeVal0;
+ mergeVal1 = (imod==3) ? -GMax.x : mergeVal1;
+
+ mergeVal0 = (imod==4) ? -CMax.y : mergeVal0;
+ mergeVal1 = (imod==4) ? -GMax.y : mergeVal1;
+
+ mergeVal0 = (imod==5) ? -CMax.z : mergeVal0;
+ mergeVal1 = (imod==5) ? -GMax.z : mergeVal1;
+
+ float merge = (i<6) ? mergeVal0 : mergeVal1;
+ if( i < 12 )
+ atomic_min( &b->scalars.Array[i], merge );
+
+ //atomic_min( &b->boxes.left_centroid_bounds.lower[0], CMin.x );
+ //atomic_min( &b->boxes.left_centroid_bounds.lower[1], CMin.y );
+ //atomic_min( &b->boxes.left_centroid_bounds.lower[2], CMin.z );
+ //atomic_min( &b->boxes.left_centroid_bounds.upper[0], -CMax.x );
+ //atomic_min( &b->boxes.left_centroid_bounds.upper[1], -CMax.y );
+ //atomic_min( &b->boxes.left_centroid_bounds.upper[2], -CMax.z );
+ //atomic_min( &b->boxes.left_geom_bounds.lower[0], GMin.x );
+ //atomic_min( &b->boxes.left_geom_bounds.lower[1], GMin.y );
+ //atomic_min( &b->boxes.left_geom_bounds.lower[2], GMin.z );
+ //atomic_min( &b->boxes.left_geom_bounds.upper[0], -GMax.x );
+ //atomic_min( &b->boxes.left_geom_bounds.upper[1], -GMax.y );
+ //atomic_min( &b->boxes.left_geom_bounds.upper[2], -GMax.z );
+}
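+
+// Lane layout used by the merge above (illustrative; lanes 12+ are masked off):
+//   lanes 0..5  : { CMin.x, CMin.y, CMin.z, -CMax.x, -CMax.y, -CMax.z }
+//   lanes 6..11 : { GMin.x, GMin.y, GMin.z, -GMax.x, -GMax.y, -GMax.z }
+// which lines up with scalars.Array[0..11], i.e. left_centroid_bounds followed
+// by left_geom_bounds.  LRBounds_merge_right does the same against Array[12..23].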
+
+void LRBounds_merge_right( local LRBounds* b, float3 CMin, float3 CMax, float3 GMin, float3 GMax )
+{
+ // All of the input vectors have come from sub-group reductions and are thus uniform
+ // Using atomic_min calls as below results in IGC generating 12 atomic_min messages and a large stack of movs
+ // The code below should result in 1 atomic_min message and a similarly large stack of movs
+
+ float mergeVal0 = INFINITY;
+ float mergeVal1 = INFINITY;
+ uint i = get_sub_group_local_id();
+
+ // insert the various merge values into one register
+ // We use two parallel variables here to enable some ILP
+
+ uint imod = (i>=6) ? (i-6) : i;
+ mergeVal0 = (imod==0) ? CMin.x : mergeVal0;
+ mergeVal1 = (imod==0) ? GMin.x : mergeVal1;
+
+ mergeVal0 = (imod==1) ? CMin.y : mergeVal0;
+ mergeVal1 = (imod==1) ? GMin.y : mergeVal1;
+
+ mergeVal0 = (imod==2) ? CMin.z : mergeVal0;
+ mergeVal1 = (imod==2) ? GMin.z : mergeVal1;
+
+ mergeVal0 = (imod==3) ? -CMax.x : mergeVal0;
+ mergeVal1 = (imod==3) ? -GMax.x : mergeVal1;
+
+ mergeVal0 = (imod==4) ? -CMax.y : mergeVal0;
+ mergeVal1 = (imod==4) ? -GMax.y : mergeVal1;
+
+ mergeVal0 = (imod==5) ? -CMax.z : mergeVal0;
+ mergeVal1 = (imod==5) ? -GMax.z : mergeVal1;
+
+ float merge = (i<6) ? mergeVal0 : mergeVal1;
+ if( i < 12 )
+ atomic_min( &b->scalars.Array[i+12], merge );
+
+ //atomic_min( &b->boxes.right_centroid_bounds.lower[0], CMin.x );
+ //atomic_min( &b->boxes.right_centroid_bounds.lower[1], CMin.y );
+ //atomic_min( &b->boxes.right_centroid_bounds.lower[2], CMin.z );
+ //atomic_min( &b->boxes.right_centroid_bounds.upper[0], -CMax.x );
+ //atomic_min( &b->boxes.right_centroid_bounds.upper[1], -CMax.y );
+ //atomic_min( &b->boxes.right_centroid_bounds.upper[2], -CMax.z );
+ //atomic_min( &b->boxes.right_geom_bounds.lower[0], GMin.x );
+ //atomic_min( &b->boxes.right_geom_bounds.lower[1], GMin.y );
+ //atomic_min( &b->boxes.right_geom_bounds.lower[2], GMin.z );
+ //atomic_min( &b->boxes.right_geom_bounds.upper[0], -GMax.x );
+ //atomic_min( &b->boxes.right_geom_bounds.upper[1], -GMax.y );
+ //atomic_min( &b->boxes.right_geom_bounds.upper[2], -GMax.z );
+}
+
+void LRBounds_merge( global LRBounds* globalBounds, local LRBounds* localBounds )
+{
+ uint i = get_local_id(0);
+ if( i < 24 )
+ atomic_min(&globalBounds->scalars.Array[i], localBounds->scalars.Array[i] );
+}
+
+
+void LRBounds_init( LRBounds* bounds )
+{
+ uint i = get_local_id(0) * 4;
+ if( i < 24 )
+ {
+ // compiler should merge it into a 4xdword send
+ bounds->scalars.Array[i+0] = INFINITY;
+ bounds->scalars.Array[i+1] = INFINITY;
+ bounds->scalars.Array[i+2] = INFINITY;
+ bounds->scalars.Array[i+3] = INFINITY;
+ }
+
+}
+
+
+inline void LRBounds_init_subgroup( LRBounds* bounds)
+{
+ uint sg_size = get_sub_group_size();
+ uint lane = get_sub_group_local_id();
+
+ for (uint i = lane * 4; i < 24; i += sg_size * 4)
+ {
+ // compiler should merge it into a 4xdword send
+ bounds->scalars.Array[i+0] = INFINITY;
+ bounds->scalars.Array[i+1] = INFINITY;
+ bounds->scalars.Array[i+2] = INFINITY;
+ bounds->scalars.Array[i+3] = INFINITY;
+ }
+
+}
+
+///////////////////////////////////////////////////////////////////////////
+
+inline void BinInfo_init(struct BFS_BinInfo* bin_info)
+{
+ for (uint id = get_local_id(0) * 4; id < 18 * BFS_NUM_BINS; id += get_local_size(0) * 4)
+ {
+ float inf = INFINITY;
+ // compiler should merge it into a 4xdword send
+ bin_info->min_max[id+0] = inf;
+ bin_info->min_max[id+1] = inf;
+ bin_info->min_max[id+2] = inf;
+ bin_info->min_max[id+3] = inf;
+ }
+ for (uint id = get_local_id(0) * 4; id < 3 * BFS_NUM_BINS; id += get_local_size(0) * 4)
+ {
+ // compiler should merge it into a 4xdword send
+ bin_info->counts[id+0] = 0;
+ bin_info->counts[id+1] = 0;
+ bin_info->counts[id+2] = 0;
+ bin_info->counts[id+3] = 0;
+ }
+}
+
+
+// copy global to local
+inline void BinInfo_copy( local struct BFS_BinInfo* local_bin_info, global struct BFS_BinInfo* global_bin_info )
+{
+ for (uint id = get_local_id(0); id < 18 * BFS_NUM_BINS; id += get_local_size(0))
+ {
+ float f = global_bin_info->min_max[id];
+ local_bin_info->min_max[id] = f;
+ }
+ for (uint id = get_local_id(0); id < 3 * BFS_NUM_BINS; id += get_local_size(0))
+ {
+ local_bin_info->counts[id] = global_bin_info->counts[id];
+ }
+}
+
+inline void BinInfo_init_subgroup(struct BFS_BinInfo* bin_info)
+{
+ uint sg_size = get_sub_group_size();
+ uint lane = get_sub_group_local_id();
+
+ for (uint i = lane * 4; i < 3 * BFS_NUM_BINS; i += sg_size * 4)
+ {
+ // compiler should merge it into a 4xdword send
+ bin_info->counts[i+0] = 0;
+ bin_info->counts[i+1] = 0;
+ bin_info->counts[i+2] = 0;
+ bin_info->counts[i+3] = 0;
+ }
+
+
+ for (uint i = lane * 4; i < 18 * BFS_NUM_BINS; i += sg_size * 4)
+ {
+ // compiler should merge it into a 4xdword send
+ bin_info->min_max[i+0] = INFINITY;
+ bin_info->min_max[i+1] = INFINITY;
+ bin_info->min_max[i+2] = INFINITY;
+ bin_info->min_max[i+3] = INFINITY;
+ }
+
+}
+
+float3 shuffle_down_float3( float3 a, float3 b, uint delta )
+{
+ return (float3)(
+ intel_sub_group_shuffle_down( a.x, b.x, delta ),
+ intel_sub_group_shuffle_down( a.y, b.y, delta ),
+ intel_sub_group_shuffle_down( a.z, b.z, delta )
+ );
+}
+
+
+
+
+void BinInfo_primref_ballot_loop( local struct BFS_BinInfo* bin_info, uint axis, uint bin, float3 lower, float3 upper, bool active_lane )
+{
+ local float* bins_min = &bin_info->min_max[0];
+ local float* bins_max = &bin_info->min_max[3];
+
+ varying uint place = (bin + axis*BFS_NUM_BINS);
+ varying uint lane = get_sub_group_local_id();
+
+ uniform uint active_mask = intel_sub_group_ballot(active_lane);
+
+ while( active_mask )
+ {
+ uniform uint leader = ctz( active_mask );
+ uniform uint lead_place = intel_sub_group_shuffle( place, leader );
+ varying bool matching_bin = lead_place == place && active_lane;
+
+ varying float3 lo = (float3)(INFINITY,INFINITY,INFINITY);
+ varying float3 hi = (float3)(-INFINITY,-INFINITY,-INFINITY);
+ if (matching_bin)
+ {
+ lo = lower.xyz;
+ hi = upper.xyz;
+ }
+
+ lo = sub_group_reduce_min_float3( lo );
+ hi = sub_group_reduce_max_float3( hi );
+
+ {
+ // atomic min operation vectorized across 6 lanes
+ // [ lower.xyz ][-][upper.xyz][-]
+ //
+ // Lanes 3 and 7 are inactive
+
+ uint lmod = lane % 4;
+ uint ldiv = lane / 4;
+ float vlo = lo.x;
+ float vhi = hi.x;
+ vlo = (lmod == 1) ? lo.y : vlo;
+ vhi = (lmod == 1) ? hi.y : vhi;
+ vlo = (lmod == 2) ? lo.z : vlo;
+ vhi = (lmod == 2) ? hi.z : vhi;
+
+ float v = (ldiv == 0) ? vlo : -vhi;
+
+ if( (1<<lane) & 0x77 )
+ atomic_min( &bin_info->min_max[ 6*lead_place + lmod + 3*ldiv ], v );
+ }
+
+ //if( lane == 0 )
+ // atomic_add_local(&bin_info->counts[lead_place], popcount(active_mask & intel_sub_group_ballot(matching_bin)) );
+
+ active_mask = active_mask & intel_sub_group_ballot(!matching_bin);
+ }
+}
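+
+// How the ballot loop above converges (a summary, not extra behaviour): each
+// iteration picks the first still-active lane as the leader; every lane whose
+// (bin, axis) slot matches the leader contributes its bounds to a sub-group
+// min/max reduction; one vectorized atomic_min commits that result; and the
+// matching lanes are cleared from active_mask.  The loop therefore issues one
+// atomic per distinct bin touched by the sub-group rather than one per lane.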
+
+inline void BinInfo_add_primref(struct BinMapping* binMapping, local struct BFS_BinInfo* bin_info, PrimRef* primref, bool active_lane )
+{
+
+ const float4 lower = primref->lower;
+ const float4 upper = primref->upper;
+ const float4 p = lower + upper;
+ const uint4 i = convert_uint4( (p - binMapping->ofs) * binMapping->scale );
+
+ BinInfo_primref_ballot_loop( bin_info, 0, i.x, lower.xyz, upper.xyz, active_lane );
+ BinInfo_primref_ballot_loop( bin_info, 1, i.y, lower.xyz, upper.xyz, active_lane );
+ BinInfo_primref_ballot_loop( bin_info, 2, i.z, lower.xyz, upper.xyz, active_lane );
+
+ if (active_lane)
+ {
+ atomic_inc_local( &bin_info->counts[i.x + 0 * BFS_NUM_BINS] );
+ atomic_inc_local( &bin_info->counts[i.y + 1 * BFS_NUM_BINS] );
+ atomic_inc_local( &bin_info->counts[i.z + 2 * BFS_NUM_BINS] );
+ }
+}
+
+inline void BinInfo_merge(global struct BFS_BinInfo* global_info, local struct BFS_BinInfo* local_info)
+{
+ for (uint id = get_local_id(0); id < 18 * BFS_NUM_BINS; id += get_local_size(0))
+ {
+ float v = local_info->min_max[id];
+ if( v != INFINITY )
+ atomic_min(&global_info->min_max[id], v);
+ }
+ for (uint id = get_local_id(0); id < 3 * BFS_NUM_BINS; id += get_local_size(0))
+ {
+ uint c = local_info->counts[id];
+ if( c )
+ atomic_add_global(&global_info->counts[id], c);
+ }
+}
+
+inline struct AABB3f BinInfo_get_AABB(struct BFS_BinInfo* bin_info, ushort bin, ushort axis)
+{
+ float* min = &bin_info->min_max[6*(bin + axis*BFS_NUM_BINS)];
+ float* max = min + 3;
+ struct AABB3f box;
+ for (uint i = 0; i < 3; i++)
+ {
+ box.lower[i] = min[i];
+ box.upper[i] = -max[i];
+ }
+
+ return box;
+}
+
+inline uint3 BinInfo_get_counts(struct BFS_BinInfo* bin_info, ushort bin)
+{
+ uint3 counts;
+ counts.x = bin_info->counts[bin + 0 * BFS_NUM_BINS]; // TODO: block load these
+ counts.y = bin_info->counts[bin + 1 * BFS_NUM_BINS];
+ counts.z = bin_info->counts[bin + 2 * BFS_NUM_BINS];
+ return counts;
+}
+inline uint BinInfo_get_count(struct BFS_BinInfo* bin_info, ushort bin, ushort axis)
+{
+ return bin_info->counts[bin + axis * BFS_NUM_BINS];
+}
+
+
+void BVH2_Initialize( struct BVH2* bvh )
+{
+ bvh->num_nodes = 1;
+}
+
+inline bool BVH2_IsInnerNode( global struct BVH2* bvh, uint node_index )
+{
+ global struct BVH2Node* n = ((global struct BVH2Node*)(bvh + 1)) + node_index;
+ return (n->meta_ss & 0x10000) != 0;
+}
+inline uint BVH2_GetRoot( struct BVH2* bvh )
+{
+ return 0;
+}
+
+//////////////////////////////////////////////
+// BVH2NodeMetaData funcs
+//////////////////////////////////////////////
+struct BVH2NodeMetaData
+{
+ uint meta_u; // leaf: primref start. inner: offset from node to its first child
+ uint meta_ss;
+};
+
+inline struct BVH2NodeMetaData BVH2_GetNodeMetaData( global struct BVH2* bvh, uint node_index )
+{
+ global struct BVH2Node* n = ((global struct BVH2Node*)(bvh + 1)) + node_index;
+ struct BVH2NodeMetaData meta;
+ meta.meta_u = n->meta_u;
+ meta.meta_ss = n->meta_ss;
+ return meta;
+}
+
+inline bool BVH2NodeMetaData_IsInnerNode( struct BVH2NodeMetaData* meta )
+{
+ return (meta->meta_ss & 0x10000) != 0;
+}
+
+inline ushort BVH2NodeMetaData_GetLeafPrimCount( struct BVH2NodeMetaData* meta )
+{
+ return meta->meta_ss & 0xffff;
+}
+
+inline uint BVH2NodeMetaData_GetLeafPrimStart( struct BVH2NodeMetaData* meta )
+{
+ return meta->meta_u;
+}
+
+inline uint BVH2NodeMetaData_GetMask( struct BVH2NodeMetaData* meta )
+{
+ return (meta->meta_ss>>24);
+}
+
+//////////////////////////////////////////////
+
+inline ushort BVH2_GetLeafPrimCount( struct BVH2* bvh, uint node_index )
+{
+ struct BVH2Node* n = ((struct BVH2Node*)(bvh + 1)) + node_index;
+ return n->meta_ss & 0xffff;
+}
+inline uint BVH2_GetLeafPrimStart( struct BVH2* bvh, uint node_index )
+{
+ struct BVH2Node* n = ((struct BVH2Node*)(bvh + 1)) + node_index;
+ return n->meta_u;
+}
+inline uint2 BVH2_GetChildIndices( struct BVH2* bvh, uint node_index )
+{
+ struct BVH2Node* n = ((struct BVH2Node*)(bvh + 1)) + node_index;
+ uint2 idx;
+ idx.x = n->meta_u;
+ idx.y = idx.x + (n->meta_ss & 0xffff);
+ return idx;
+}
+
+inline float BVH2_GetNodeArea( global struct BVH2* bvh, uint node_index )
+{
+ global struct BVH2Node* n = ((global struct BVH2Node*)(bvh + 1)) + node_index;
+ return AABB3f_halfArea( &n->box );
+}
+
+
+inline struct AABB3f BVH2_GetNodeBox( global struct BVH2* bvh, uint node_index )
+{
+ global struct BVH2Node* n = ((global struct BVH2Node*)(bvh + 1)) + node_index;
+ return n->box;
+}
+inline void BVH2_SetNodeBox( global struct BVH2* bvh, uint node_index, struct AABB3f* box )
+{
+ global struct BVH2Node* n = ((global struct BVH2Node*)(bvh + 1)) + node_index;
+ n->box = *box;
+}
+
+inline void BVH2_SetNodeBox_lu( global struct BVH2* bvh, uint node_index, float3 lower, float3 upper )
+{
+ global struct BVH2Node* n = ((global struct BVH2Node*)(bvh + 1)) + node_index;
+ AABB3f_set( &n->box, lower, upper );
+}
+
+inline void BVH2_InitNodeBox( struct BVH2* bvh, uint node_index )
+{
+ struct BVH2Node* n = ((struct BVH2Node*)(bvh + 1)) + node_index;
+ AABB3f_init( &n->box );
+}
+
+inline struct AABB BVH2_GetAABB( global struct BVH2* bvh, uint node_index )
+{
+ global struct BVH2Node* n = ((global struct BVH2Node*)(bvh + 1)) + node_index;
+ struct AABB r;
+ r.lower.xyz = AABB3f_load_lower( &n->box );
+ r.upper.xyz = AABB3f_load_upper( &n->box );
+ return r;
+}
+
+inline void BVH2_WriteInnerNode( global struct BVH2* bvh, uint node_index, struct AABB3f* box, uint2 child_offsets, uint mask )
+{
+ global struct BVH2Node* n = ((global struct BVH2Node*)(bvh + 1)) + node_index;
+ n->box = *box;
+ n->meta_u = child_offsets.x;
+ n->meta_ss = 0x10000 + (child_offsets.y - child_offsets.x) + (mask<<24);
+ // n->is_inner = true;
+}
+
+inline void BVH2_WriteLeafNode( global struct BVH2* bvh, uint node_index, struct AABB3f* box, uint prim_start, uint prim_count, uint mask )
+{
+ global struct BVH2Node* n = ((global struct BVH2Node*)(bvh + 1)) + node_index;
+ n->box = *box;
+ n->meta_u = prim_start;
+ n->meta_ss = prim_count + (mask<<24);
+ // n->is_inner = false;
+}
+
+inline uint BVH2_GetMask( global struct BVH2* bvh, uint node_index )
+{
+ global struct BVH2Node* n = ((global struct BVH2Node*)(bvh + 1)) + node_index;
+ return (n->meta_ss>>24);
+}
+
+
+uint BVH2_AllocateNodes( global struct BVH2* bvh, uint num_nodes )
+{
+ return atomic_add_global( &bvh->num_nodes, num_nodes );
+}
+
+inline void BVH2_AtomicMergeNodeBox( global struct BVH2* bvh, uint node_index, float3 lower, float3 upper )
+{
+ global struct BVH2Node* n = ((global struct BVH2Node*)(bvh + 1)) + node_index;
+ AABB3f_atomic_merge_global_lu( &n->box, lower, upper );
+}
+
+
+void BVH2_print( global struct BVH2* bvh, uint start_node )
+{
+ if ( get_local_id( 0 ) == 0 && get_sub_group_id() == 0 )
+ {
+ uint num_nodes = bvh->num_nodes;
+
+ uint2 stack[BFS_MAX_DEPTH * 2];
+ uint sp = 0;
+
+ printf( "allocated_nodes=%u\n", num_nodes );
+
+ stack[sp++] = (uint2)(start_node, 0);
+ while ( sp > 0 )
+ {
+ uint2 data = stack[--sp];
+ uint node = data.x;
+ uint depth = data.y;
+
+ for ( uint i = 0; i < depth; i++ )
+ printf( " " );
+
+ if ( BVH2_IsInnerNode( bvh, node ) )
+ {
+ uint2 kids = BVH2_GetChildIndices( bvh, node );
+ printf( " %5u: inner: %u %u \n", node, kids.x, kids.y );
+ stack[sp++] = (uint2)(kids.y, depth + 1);
+ stack[sp++] = (uint2)(kids.x, depth + 1);
+
+ struct AABB3f l = BVH2_GetNodeBox( bvh, kids.x );
+ struct AABB3f r = BVH2_GetNodeBox( bvh, kids.y );
+ struct AABB3f p = BVH2_GetNodeBox( bvh, node );
+
+ float3 pl = AABB3f_load_lower( &p );
+ float3 pu = AABB3f_load_upper( &p );
+ float3 ll = AABB3f_load_lower( &l );
+ float3 lu = AABB3f_load_upper( &l );
+ float3 rl = AABB3f_load_lower( &r );
+ float3 ru = AABB3f_load_upper( &r );
+ if ( any( ll < pl ) || any( rl < pl ) ||
+ any( lu > pu ) || any( ru > pu ) )
+ {
+ for ( uint i = 0; i < depth; i++ )
+ printf( " " );
+
+ printf( "BAD_BOUNDS!!!!!!!! %u\n", node );
+ }
+
+
+ }
+ else
+ {
+
+ uint start = BVH2_GetLeafPrimStart( bvh, node );
+ uint count = BVH2_GetLeafPrimCount( bvh, node );
+ printf( " %5u: leaf: start=%u count=%u\n ",node,start,count );
+
+ }
+ }
+ }
+ barrier( CLK_LOCAL_MEM_FENCE );
+}
+
+
+global uint* SAHBuildGlobals_GetPrimrefIndices_In( struct SAHBuildGlobals* globals, bool odd_pass )
+{
+ uint num_refs = globals->num_primrefs;
+ global uint* ib = (global uint*) globals->p_primref_index_buffers;
+ return ib + (odd_pass ? num_refs : 0);
+}
+
+global uint* SAHBuildGlobals_GetPrimrefIndices_Out( struct SAHBuildGlobals* globals, bool odd_pass )
+{
+ uint num_refs = globals->num_primrefs;
+ global uint* ib = (global uint*) globals->p_primref_index_buffers;
+ return ib + (odd_pass ? 0 : num_refs);
+}
+
+global PrimRef* SAHBuildGlobals_GetPrimrefs( struct SAHBuildGlobals* globals )
+{
+ return (global PrimRef*) globals->p_primrefs_buffer;
+}
+
+global struct BVH2* SAHBuildGlobals_GetBVH2( struct SAHBuildGlobals* globals )
+{
+ return (global struct BVH2*)globals->p_bvh2;
+}
+
+uint SAHBuildGlobals_GetLeafSizeInBytes( struct SAHBuildGlobals* globals )
+{
+ return globals->leaf_size;
+}
+
+uint SAHBuildGlobals_GetLeafType( struct SAHBuildGlobals* globals )
+{
+ return globals->leaf_type;
+}
+
+uint SAHBuildGlobals_GetInternalNodeType( struct SAHBuildGlobals* globals )
+{
+ return NODE_TYPE_INTERNAL;
+}
+
+global struct BVHBase* SAHBuildGlobals_GetBVHBase( struct SAHBuildGlobals* globals )
+{
+ return (global struct BVHBase*) globals->p_bvh_base;
+}
+
+uint SAHBuildGlobals_GetTotalPrimRefs( struct SAHBuildGlobals* globals )
+{
+ return globals->num_primrefs;
+}
+
+inline bool SAHBuildGlobals_NeedBackPointers( struct SAHBuildGlobals* globals )
+{
+ return globals->flags & SAH_FLAG_NEED_BACKPOINTERS;
+}
+inline bool SAHBuildGlobals_NeedMasks( struct SAHBuildGlobals* globals )
+{
+ return globals->flags & SAH_FLAG_NEED_MASKS;
+}
+
+
+void SAHBuildGlobals_print( struct SAHBuildGlobals* globals )
+{
+ if ( get_local_id( 0 ) == 0 )
+ {
+ printf( "SAHBuildGlobals: %p\n", globals );
+ printf( " p_primref_index_buffers =%p\n", globals->p_primref_index_buffers );
+ printf( " p_primrefs_buffer =%p\n", globals->p_primrefs_buffer );
+ printf( " p_bvh2 =%p\n", globals->p_bvh2 );
+ printf( " p_globals =%p\n", globals->p_globals );
+ printf( " p_bvh_base =%p\n", globals->p_bvh_base );
+ printf( " num_primrefs = %u\n", globals->num_primrefs );
+ printf( " leaf_size = %u\n", globals->leaf_size );
+ printf( " leaf_type = %u\n", globals->leaf_type );
+ printf( " p_qnode_buffer = %p\n", globals->p_qnode_root_buffer);
+ }
+
+ barrier( CLK_LOCAL_MEM_FENCE );
+}
+
+
+uint get_num_wgs(uint thread_count, uint WG_SIZE)
+{
+ return (thread_count + WG_SIZE - 1) / WG_SIZE;
+}
+
+
+
+
+
+struct BFSDispatchArgs
+{
+ global struct VContextScheduler* scheduler;
+ global struct VContext* context;
+ global struct BVH2* bvh2;
+ global uint* primref_index_in;
+ global uint* primref_index_out;
+ global PrimRef* primref_buffer;
+
+ uint wg_primref_begin;
+ uint wg_primref_end;
+ uint dispatch_primref_begin;
+ uint dispatch_primref_end;
+ uint context_id;
+ uint num_wgs;
+ uint bvh2_root;
+ uint global_num_primrefs;
+ bool do_mask_processing;
+};
+
+
+
+
+// TODO_OPT: Enable larger WGs
+// We need a way to do this in a portable fashion.
+// Gen12 can support larger WGs than Gen9 can
+//
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__( (reqd_work_group_size( 512, 1, 1 )) )
+kernel void
+begin( global struct VContextScheduler* scheduler,
+ dword leaf_size,
+ dword leaf_type,
+ global uint* primref_index_buffers,
+ global PrimRef* primref_buffer,
+ global struct BVH2* bvh2,
+ global struct BVHBase* bvh_base,
+ global struct Globals* globals,
+ global struct SAHBuildGlobals* sah_globals,
+ global uint2* qnode_root_buffer,
+ dword sah_globals_flags
+ )
+{
+ dword num_primrefs = globals->numPrimitives;
+ if ( get_local_id( 0 ) == 0 )
+ {
+ sah_globals->p_primrefs_buffer = (qword) primref_buffer;
+ sah_globals->p_primref_index_buffers = (qword)primref_index_buffers;
+ sah_globals->p_bvh2 = (qword) bvh2;
+ sah_globals->p_bvh_base = (qword) bvh_base;
+ sah_globals->leaf_size = leaf_size;
+ sah_globals->leaf_type = leaf_type;
+ sah_globals->num_primrefs = num_primrefs;
+ sah_globals->p_globals = (qword) globals;
+ sah_globals->p_qnode_root_buffer = (gpuva_t) qnode_root_buffer;
+ sah_globals->flags = sah_globals_flags;
+
+ // initialize the spill stack
+ scheduler->bfs2_spill_stack.size = 0;
+
+ // initialize BVH2 node counter
+ BVH2_Initialize( bvh2 );
+
+ // configure first vcontext for first build
+ scheduler->contexts[0].dispatch_primref_begin = 0;
+ scheduler->contexts[0].dispatch_primref_end = num_primrefs;
+ scheduler->contexts[0].bvh2_root = BVH2_GetRoot( bvh2 );
+ scheduler->contexts[0].tree_depth = 0;
+ scheduler->contexts[0].batch_index = 0;
+
+ scheduler->bfs_queue.records[0].context_id = 0;
+
+ scheduler->contexts[0].num_left = 0;
+ scheduler->contexts[0].num_right = 0;
+ scheduler->contexts[0].lr_mask = 0;
+
+ // copy centroid bounds into the BVH2 root node
+ BVH2_SetNodeBox_lu( bvh2, BVH2_GetRoot( bvh2 ), globals->centroidBounds.lower.xyz, globals->centroidBounds.upper.xyz );
+
+ // zero the trivial build counters.. these are only used by the batch-build path
+ // but single-wg QNode path (if used) depends on them
+ scheduler->num_trivial_builds = 0;
+ scheduler->num_single_builds = 0;
+
+ // initialize the root-buffer counters
+ sah_globals->root_buffer_num_produced = 0;
+ sah_globals->root_buffer_num_produced_hi = 0;
+ sah_globals->root_buffer_num_consumed = 0;
+ sah_globals->root_buffer_num_consumed_hi = 0;
+ }
+
+ // initialize vcontext states
+ for ( uint i = get_local_id( 0 ); i < BFS_NUM_VCONTEXTS; i += get_local_size( 0 ) )
+ scheduler->vcontext_state[i] = (i==0) ? VCONTEXT_STATE_EXECUTING : VCONTEXT_STATE_UNALLOCATED;
+
+ // initialize global bin info in vcontext - only context[0] will be used in first iteration
+ BinInfo_init( &scheduler->contexts[0].global_bin_info );
+ LRBounds_init( &scheduler->contexts[0].lr_bounds );
+
+ // barrier( CLK_GLOBAL_MEM_FENCE ); // lsc flush ... driver now does these as part of COMPUTE_WALKER
+}
+
+// TODO_OPT: Enable larger WGs
+// We need a way to do this in a portable fashion.
+// Gen12 can support larger WGs than Gen9 can
+//
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(512, 1, 1)))
+kernel void
+categorize_builds_and_init_scheduler(
+ global struct VContextScheduler* scheduler,
+ global gpuva_t* globals_ptrs, // OCL-C does not allow kernel parameters to be pointer-to-pointer, so we trick it...
+ global struct SAHBuildBuffersInfo* buffers_info,
+ global struct SAHBuildGlobals* builds_out,
+ dword num_builds
+)
+{
+ local uint num_trivial;
+ local uint num_single;
+ local uint num_full;
+
+ if (get_group_id(0) == 0) // first workgroup performs build categorization
+ {
+ if (get_local_id(0) == 0)
+ {
+ num_trivial = 0;
+ num_single = 0;
+ num_full = 0;
+ }
+
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ // first pass, count builds of each type
+ uint triv = 0;
+ uint single = 0;
+ uint full = 0;
+ for (uint i = get_local_id(0); i < num_builds; i += get_local_size(0))
+ {
+ global struct Globals* globals = (global struct Globals*) globals_ptrs[i];
+ dword num_refs = globals->numPrimitives;
+
+ if (num_refs <= TRIVIAL_BUILD_THRESHOLD)
+ triv++;
+ else if (num_refs <= SINGLE_WG_BUILD_THRESHOLD)
+ single++;
+ else
+ full++;
+ }
+
+ // merge counts across the work-group. After the atomics, each of these variables holds this thread's starting offset within its category's output range
+ triv = atomic_add_local(&num_trivial, triv);
+ single = atomic_add_local(&num_single, single);
+ full = atomic_add_local(&num_full, full);
+
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ global struct SAHBuildGlobals* trivial_builds_out = builds_out;
+ global struct SAHBuildGlobals* single_builds_out = builds_out + num_trivial;
+ global struct SAHBuildGlobals* full_builds_out = builds_out + num_trivial + num_single;
+
+ for (uint i = get_local_id(0); i < num_builds; i += get_local_size(0))
+ {
+ global struct Globals* globals = (global struct Globals*) globals_ptrs[i];
+ global struct SAHBuildBuffersInfo* buffers = &buffers_info[i];
+
+ dword num_refs = globals->numPrimitives;
+ dword leaf_type = globals->leafPrimType;
+ dword leaf_size = globals->leafSize;
+
+ global struct SAHBuildGlobals* place;
+ if (num_refs <= TRIVIAL_BUILD_THRESHOLD)
+ place = trivial_builds_out + (triv++);
+ else if (num_refs <= SINGLE_WG_BUILD_THRESHOLD)
+ place = single_builds_out + (single++);
+ else
+ place = full_builds_out + (full++);
+
+ place->p_primref_index_buffers = buffers->p_primref_index_buffers;
+ place->p_primrefs_buffer = buffers->p_primrefs_buffer;
+ place->p_bvh2 = buffers->p_bvh2;
+ place->p_bvh_base = buffers->p_bvh_base;
+ place->p_globals = (gpuva_t)globals;
+ place->num_primrefs = num_refs;
+ place->leaf_size = leaf_size;
+ place->leaf_type = leaf_type;
+ place->flags = buffers->sah_globals_flags;
+ place->p_qnode_root_buffer = buffers->p_qnode_root_buffer;
+
+ // only initialize the BVH2 if it will actually be used by the build;
+ // trivial and single-WG builds do not use it
+ if( num_refs > SINGLE_WG_BUILD_THRESHOLD )
+ {
+ // initialize BVH2 node counter
+ global struct BVH2* bvh2 = SAHBuildGlobals_GetBVH2(place);
+ BVH2_Initialize(bvh2);
+
+ // copy centroid bounds into the BVH2 root node
+ BVH2_SetNodeBox_lu(bvh2, BVH2_GetRoot(bvh2), globals->centroidBounds.lower.xyz, globals->centroidBounds.upper.xyz);
+ }
+ }
+
+ if (get_local_id(0) == 0)
+ {
+ scheduler->num_trivial_builds = num_trivial;
+ scheduler->num_single_builds = num_single;
+ scheduler->batched_build_offset = num_trivial + num_single;
+ scheduler->batched_build_count = num_full;
+ }
+ }
+ else // second workgroup initializes the scheduler
+ {
+ // initialize vcontext states
+ for (uint i = get_local_id(0); i < BFS_NUM_VCONTEXTS; i += get_local_size(0))
+ scheduler->vcontext_state[i] = (i == 0) ? VCONTEXT_STATE_EXECUTING : VCONTEXT_STATE_UNALLOCATED;
+
+ // initialize global bin info in vcontexts
+ for (uint i = get_sub_group_id(); i < BFS_NUM_VCONTEXTS; i += get_num_sub_groups())
+ BinInfo_init_subgroup(&scheduler->contexts[i].global_bin_info);
+
+ // initialize the spill stack
+ if (get_local_id(0) == 0)
+ scheduler->bfs2_spill_stack.size = 0;
+ }
+
+ //barrier( CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE );// lsc flush ... driver now does these as part of COMPUTE_WALKER
+}
+
+
+
+
+
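+// Seed up to BFS_NUM_VCONTEXTS vcontexts with the next slice of batched builds, publish the
+// total BFS work-group count and a loop-termination mask for the command streamer to consume,
+// and advance batched_build_offset/batched_build_count so the next iteration picks up any
+// remaining builds.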
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(BFS_NUM_VCONTEXTS, 1, 1)))
+kernel void
+begin_batchable(
+ global struct VContextScheduler* scheduler,
+ global struct SAHBuildGlobals* sah_globals
+)
+{
+ ushort scheduler_build_offset = scheduler->batched_build_offset;
+ ushort scheduler_num_builds = scheduler->batched_build_count;
+
+ ushort num_builds = min( scheduler_num_builds, (ushort)BFS_NUM_VCONTEXTS );
+
+ uint num_wgs = 0;
+
+ ushort tid = get_local_id(0);
+ if ( tid < num_builds )
+ {
+ ushort batch_index = scheduler_build_offset + tid;
+
+ uint num_primrefs = sah_globals[batch_index].num_primrefs;
+
+ // configure this thread's vcontext for its batched build
+ scheduler->contexts[tid].dispatch_primref_begin = 0;
+ scheduler->contexts[tid].dispatch_primref_end = num_primrefs;
+ scheduler->contexts[tid].bvh2_root = BVH2_GetRoot( SAHBuildGlobals_GetBVH2(&sah_globals[batch_index]) );
+ scheduler->contexts[tid].tree_depth = 0;
+ scheduler->contexts[tid].batch_index = batch_index;
+ scheduler->vcontext_state[tid] = VCONTEXT_STATE_EXECUTING;
+
+ scheduler->contexts[tid].num_left = 0;
+ scheduler->contexts[tid].num_right = 0;
+ scheduler->contexts[tid].lr_mask = 0;
+
+ num_wgs = get_num_wgs( num_primrefs, BFS_WG_SIZE );
+
+ scheduler->bfs_queue.wg_count[tid] = num_wgs;
+ scheduler->bfs_queue.records[tid].batch_index = batch_index;
+ scheduler->bfs_queue.records[tid].context_id = tid;
+ }
+
+ num_wgs = work_group_reduce_add(num_wgs);
+
+ if (tid == 0)
+ {
+ // write out build count and offset for next BFS iteration
+ scheduler->batched_build_offset = scheduler_build_offset + num_builds;
+ scheduler->batched_build_count = scheduler_num_builds - num_builds;
+
+ // write out initial WG count and loop termination mask for command streamer to consume
+ scheduler->batched_build_wg_count = num_wgs;
+ scheduler->batched_build_loop_mask = (scheduler_num_builds > num_builds) ? 1 : 0;
+
+ scheduler->bfs_queue.num_dispatches = num_builds;
+ }
+
+ for ( uint i = get_sub_group_id(); i < num_builds; i += get_num_sub_groups() )
+ BinInfo_init_subgroup( &scheduler->contexts[i].global_bin_info );
+
+ for ( uint i = get_sub_group_id(); i < num_builds; i += get_num_sub_groups() )
+ LRBounds_init_subgroup( &scheduler->contexts[i].lr_bounds );
+}
+
+
+
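+// Build-range classification helpers:
+//  - ranges with <= TREE_ARITY primrefs become leaves,
+//  - ranges with <= DFS_THRESHOLD primrefs go to the single-WG DFS phase,
+//  - anything larger continues through breadth-first (BFS) binning passes.
+// The *_2 variants classify a (left,right) count pair at once and return OpenCL vector
+// booleans (-1 == true, 0 == false) for use with select()/any().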
+bool is_leaf( uint num_refs )
+{
+ return num_refs <= TREE_ARITY;
+}
+
+bool is_dfs( uint num_refs )
+{
+ return num_refs > TREE_ARITY && num_refs <= DFS_THRESHOLD;
+}
+
+bool is_bfs( uint num_refs )
+{
+ return num_refs > DFS_THRESHOLD;
+}
+
+int2 is_leaf_2( uint2 num_refs )
+{
+ return num_refs.xy <= TREE_ARITY;
+}
+int2 is_bfs_2( uint2 num_refs )
+{
+ return num_refs.xy > DFS_THRESHOLD;
+}
+
+int2 is_dfs_2( uint2 num_refs )
+{
+ return num_refs.xy > TREE_ARITY && num_refs.xy <= DFS_THRESHOLD;
+}
+
+#if 0
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(16, 1, 1)))
+__attribute__((intel_reqd_sub_group_size(16)))
+kernel void
+sg_scheduler( global struct VContextScheduler* scheduler )
+{
+ local struct BFS1SpillStackEntry SLM_local_spill_stack[BFS_NUM_VCONTEXTS];
+ local uchar SLM_context_state[BFS_NUM_VCONTEXTS];
+ local vcontext_id_t SLM_free_list[BFS_NUM_VCONTEXTS];
+ local vcontext_id_t SLM_exec_list[BFS_NUM_VCONTEXTS];
+
+
+ varying ushort lane = get_sub_group_local_id();
+
+ uniform uint free_list_size = 0;
+ uniform uint exec_list_size = 0;
+
+ // read context states, build lists of free and executing contexts
+ for (varying uint i = lane; i < BFS_NUM_VCONTEXTS; i += get_sub_group_size())
+ {
+ uchar state = scheduler->vcontext_state[i];
+ SLM_context_state[i] = state;
+
+ uniform ushort exec_mask = intel_sub_group_ballot(state == VCONTEXT_STATE_EXECUTING);
+
+ varying ushort prefix_exec = subgroup_bit_prefix_exclusive(exec_mask);
+ varying ushort prefix_free = lane - prefix_exec;
+ varying ushort exec_list_pos = exec_list_size + prefix_exec;
+ varying ushort free_list_pos = free_list_size + prefix_free;
+
+ if (state == VCONTEXT_STATE_EXECUTING)
+ SLM_exec_list[exec_list_pos] = i;
+ else
+ SLM_free_list[free_list_pos] = i;
+
+ uniform ushort num_exec = popcount(exec_mask);
+ exec_list_size += num_exec;
+ free_list_size += get_sub_group_size() - num_exec;
+ }
+
+ uniform uint total_bfs_dispatches = 0;
+ uniform uint total_dfs_dispatches = 0;
+ uniform uint bfs_spill_stack_size = 0;
+ uniform uint total_bfs_wgs = 0;
+
+ // process executing context. accumulate bfs/dfs dispatches and free-list entries
+ for (uint i = 0; i < exec_list_size; i+= get_sub_group_size() )
+ {
+ varying ushort num_dfs_dispatches = 0;
+ varying ushort num_bfs_spills = 0;
+
+ varying ushort num_bfs_children;
+ varying ushort context_id;
+ struct VContext* context;
+ varying uint num_left ;
+ varying uint num_right ;
+ varying uint primref_begin ;
+ varying uint primref_end ;
+ varying uint depth ;
+
+ bool active_lane = (i + lane) < exec_list_size;
+ if ( active_lane )
+ {
+ context_id = SLM_exec_list[i + lane];
+ context = &scheduler->contexts[context_id];
+
+ num_left = context->num_left;
+ num_right = context->num_right;
+ primref_begin = context->dispatch_primref_begin;
+ primref_end = context->dispatch_primref_end;
+ depth = context->tree_depth;
+
+ // get dispatch counts
+
+ num_dfs_dispatches = is_dfs(num_left) + is_dfs(num_right);
+ num_bfs_children = is_bfs(num_left) + is_bfs(num_right);
+ num_bfs_spills = (num_bfs_children == 2) ? 1 : 0;
+ }
+
+ // allocate space for DFS, BFS dispatches, and BFS spills
+ varying uint dfs_pos = total_dfs_dispatches + sub_group_scan_exclusive_add(num_dfs_dispatches);
+ varying ushort mask_bfs_spills = intel_sub_group_ballot(num_bfs_children & 2); // spill if #children == 2
+ varying ushort mask_bfs_dispatches = intel_sub_group_ballot(num_bfs_children & 3); // dispatch if #children == 1 or 2
+ varying uint bfs_spill_pos = bfs_spill_stack_size + subgroup_bit_prefix_exclusive(mask_bfs_spills);
+ varying uint bfs_dispatch_pos = total_bfs_dispatches + subgroup_bit_prefix_exclusive(mask_bfs_dispatches);
+
+ total_dfs_dispatches += sub_group_reduce_add(num_dfs_dispatches);
+ bfs_spill_stack_size += popcount(mask_bfs_spills);
+ total_bfs_dispatches += popcount(mask_bfs_dispatches);
+
+ varying uint num_bfs_wgs = 0;
+ if (active_lane)
+ {
+ if (num_dfs_dispatches)
+ {
+ if (is_dfs(num_left))
+ {
+ scheduler->dfs_queue.records[dfs_pos].primref_base = primref_begin;
+ scheduler->dfs_queue.records[dfs_pos].num_primrefs = num_left;
+ scheduler->dfs_queue.records[dfs_pos].bvh2_base = context->left_bvh2_root;
+ scheduler->dfs_queue.records[dfs_pos].tree_depth = depth + 1;
+ dfs_pos++;
+ }
+ if (is_dfs(num_right))
+ {
+ scheduler->dfs_queue.records[dfs_pos].primref_base = primref_begin + num_left;
+ scheduler->dfs_queue.records[dfs_pos].num_primrefs = num_right;
+ scheduler->dfs_queue.records[dfs_pos].bvh2_base = context->right_bvh2_root;
+ scheduler->dfs_queue.records[dfs_pos].tree_depth = depth + 1;
+ }
+ }
+
+ uint num_bfs_children = is_bfs(num_left) + is_bfs(num_right);
+ if (num_bfs_children == 2)
+ {
+ // spill the right child.. push an entry onto local spill stack
+ SLM_local_spill_stack[bfs_spill_pos].primref_begin = primref_begin + num_left;
+ SLM_local_spill_stack[bfs_spill_pos].primref_end = primref_end;
+ SLM_local_spill_stack[bfs_spill_pos].bvh2_root = context->right_bvh2_root;
+ SLM_local_spill_stack[bfs_spill_pos].tree_depth = depth + 1;
+
+ // setup BFS1 dispatch for left child
+ context->dispatch_primref_end = primref_begin + num_left;
+ context->bvh2_root = context->left_bvh2_root;
+ context->tree_depth = depth + 1;
+ num_bfs_wgs = get_num_wgs(num_left, BFS_WG_SIZE);
+
+ scheduler->bfs_queue.wg_count[bfs_dispatch_pos] = num_bfs_wgs;
+ scheduler->bfs_queue.records[bfs_dispatch_pos].context_id = context_id;
+ }
+ else if (num_bfs_children == 1)
+ {
+ // setup BFS1 dispatch for whichever child wants it
+ if (is_bfs(num_left))
+ {
+ // bfs on left child
+ context->dispatch_primref_end = context->dispatch_primref_begin + num_left;
+ context->bvh2_root = context->left_bvh2_root;
+ context->tree_depth = depth + 1;
+ num_bfs_wgs = get_num_wgs(num_left, BFS_WG_SIZE);
+ }
+ else
+ {
+ // bfs on right child
+ context->dispatch_primref_begin = context->dispatch_primref_begin + num_left;
+ context->bvh2_root = context->right_bvh2_root;
+ context->tree_depth = depth + 1;
+ num_bfs_wgs = get_num_wgs(num_right, BFS_WG_SIZE);
+ }
+
+ scheduler->bfs_queue.wg_count[bfs_dispatch_pos] = num_bfs_wgs;
+ scheduler->bfs_queue.records[bfs_dispatch_pos].context_id = context_id;
+ }
+ else
+ {
+ // no bfs dispatch.. this context is now free
+ SLM_context_state[context_id] = VCONTEXT_STATE_UNALLOCATED;
+ }
+ }
+
+ // count bfs work groups
+ total_bfs_wgs += sub_group_reduce_add(num_bfs_wgs);
+
+ // add newly deallocated contexts to the free list
+ uniform uint free_mask = intel_sub_group_ballot( active_lane && num_bfs_children == 0);
+ varying uint free_list_pos = free_list_size + subgroup_bit_prefix_exclusive(free_mask);
+ free_list_size += popcount(free_mask);
+
+ if ( free_mask & (1<<lane) )
+ SLM_free_list[free_list_pos] = context_id;
+
+ }
+
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ // if we have more free contexts than spills, read additional spills from the scheduler's spill stack
+ uniform uint memory_spill_stack_size = scheduler->bfs2_spill_stack.size;
+
+ if(bfs_spill_stack_size < free_list_size && memory_spill_stack_size > 0 )
+ {
+ uniform uint read_count = min(free_list_size - bfs_spill_stack_size, memory_spill_stack_size);
+
+ for (varying uint i = lane; i < read_count; i+= get_sub_group_size())
+ SLM_local_spill_stack[bfs_spill_stack_size + i] = scheduler->bfs2_spill_stack.entries[memory_spill_stack_size - 1 - i];
+
+ bfs_spill_stack_size += read_count;
+ memory_spill_stack_size -= read_count;
+ }
+
+ // steal pending BFS work and assign it to free contexts
+ uniform uint num_steals = min(bfs_spill_stack_size, free_list_size);
+
+ for (uniform uint i = 0; i < num_steals; i += get_sub_group_size())
+ {
+ varying uint num_bfs_wgs = 0;
+
+ if (i + lane < num_steals)
+ {
+ uint context_id = SLM_free_list[i+lane];
+ struct VContext* context = &scheduler->contexts[context_id];
+ struct BFS1SpillStackEntry entry = SLM_local_spill_stack[i+lane];
+
+ context->dispatch_primref_begin = entry.primref_begin;
+ context->dispatch_primref_end = entry.primref_end;
+ context->bvh2_root = entry.bvh2_root;
+ context->tree_depth = entry.tree_depth;
+
+ num_bfs_wgs = get_num_wgs(entry.primref_end - entry.primref_begin, BFS_WG_SIZE);
+
+ scheduler->bfs_queue.wg_count[total_bfs_dispatches + i + lane] = num_bfs_wgs;
+ scheduler->bfs_queue.records[total_bfs_dispatches + i + lane].context_id = context_id;
+
+ SLM_context_state[context_id] = VCONTEXT_STATE_EXECUTING;
+ }
+
+ total_bfs_wgs += sub_group_reduce_add( num_bfs_wgs );
+ }
+
+ total_bfs_dispatches += num_steals;
+
+ // write out excess spills to global spill stack
+ uniform uint extra_spills = bfs_spill_stack_size - num_steals;
+ for (varying uint i = lane; i < extra_spills; i += get_sub_group_size())
+ {
+ scheduler->bfs2_spill_stack.entries[memory_spill_stack_size + i] = SLM_local_spill_stack[num_steals+i];
+ }
+
+
+ // write out modified context states
+ for ( varying uint i = lane; i < BFS_NUM_VCONTEXTS; i += get_sub_group_size())
+ scheduler->vcontext_state[i] = SLM_context_state[i];
+
+
+ if (get_local_id(0) == 0)
+ {
+ // write out new memory stack size
+ scheduler->bfs2_spill_stack.size = memory_spill_stack_size + extra_spills;
+
+ // store workgroup counters
+ scheduler->bfs_queue.num_dispatches = total_bfs_dispatches;
+ scheduler->num_bfs_wgs = total_bfs_wgs;
+ scheduler->num_dfs_wgs = total_dfs_dispatches;
+ }
+
+ // barrier(CLK_GLOBAL_MEM_FENCE); // make memory writes globally visible// lsc flush ... driver now does these as part of COMPUTE_WALKER
+}
+#endif
+
+#define SCHEDULER_SG_SIZE 16
+#define SCHEDULER_WG_SIZE BFS_NUM_VCONTEXTS
+#define SCHEDULER_NUM_SGS (SCHEDULER_WG_SIZE / SCHEDULER_SG_SIZE)
+
+
+struct BFSDispatchArgs get_bfs_args_from_record_batchable(
+ struct BFSDispatchRecord* record,
+ global struct VContextScheduler* scheduler,
+ global struct SAHBuildGlobals* globals_buffer );
+
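+// Per-iteration scheduler: one work-item per vcontext (WG size == BFS_NUM_VCONTEXTS).
+// A context that just completed BFS pass 2 finalizes its BVH2 node (allocates and writes the
+// two children), emits DFS records for children small enough for the DFS phase, and pushes
+// BFS children onto a local spill stack. The spilled BFS work is then redistributed: each
+// context pulls first from the local stack, then from the global bfs2_spill_stack, and any
+// surplus is written back out to the global stack. Finally, work-group reductions produce the
+// dispatch and work-group counts that the command streamer consumes for the next BFS round.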
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(SCHEDULER_WG_SIZE, 1, 1)))
+__attribute__((intel_reqd_sub_group_size(SCHEDULER_SG_SIZE)))
+kernel void
+scheduler(global struct VContextScheduler* scheduler, global struct SAHBuildGlobals* sah_globals )
+{
+ local struct BFS1SpillStackEntry SLM_local_spill_stack[2 * BFS_NUM_VCONTEXTS];
+ local uint SLM_local_spill_stack_size;
+ local uint SLM_dfs_dispatch_count;
+
+ if (get_local_id(0) == 0)
+ {
+ SLM_local_spill_stack_size = 0;
+ SLM_dfs_dispatch_count = 0;
+ }
+
+ uint context_id = get_local_id(0);
+ uint state = scheduler->vcontext_state[context_id];
+ uint initial_state = state;
+
+ uint batch_index = 0;
+ global struct VContext* context = &scheduler->contexts[context_id];
+
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+
+ uint global_spill_stack_size = scheduler->bfs2_spill_stack.size;
+
+
+ if (state == VCONTEXT_STATE_EXECUTING)
+ {
+ uint left_bvh2_root;
+ uint right_bvh2_root;
+
+ uint num_left = context->num_left;
+ uint num_right = context->num_right;
+
+ uint primref_begin = context->dispatch_primref_begin;
+ uint primref_end = context->dispatch_primref_end;
+
+ uint depth = context->tree_depth;
+ uint batch_index = context->batch_index;
+
+ struct BFSDispatchRecord record;
+ record.context_id = context_id;
+ record.batch_index = context->batch_index;
+
+ struct BFSDispatchArgs args = get_bfs_args_from_record_batchable( &record, scheduler, sah_globals);
+
+ // do cleanup of bfs_pass2
+ {
+ // compute geom bounds
+ struct AABB3f left_geom_bounds;
+ struct AABB3f right_geom_bounds;
+ struct AABB3f left_centroid_bounds;
+ struct AABB3f right_centroid_bounds;
+ uint2 lr_counts = (uint2)(num_left, num_right);
+
+ {
+ left_centroid_bounds = LRBounds_get_left_centroid( &context->lr_bounds );
+ left_geom_bounds = LRBounds_get_left_geom( &context->lr_bounds );
+ right_centroid_bounds = LRBounds_get_right_centroid( &context->lr_bounds );
+ right_geom_bounds = LRBounds_get_right_geom( &context->lr_bounds );
+ }
+
+ int2 v_is_leaf = is_leaf_2( lr_counts );
+ int2 v_is_dfs = is_dfs_2( lr_counts );
+ int2 v_is_bfs = is_bfs_2( lr_counts );
+ uint left_mask = args.do_mask_processing ? context->lr_mask & 0xff : 0xff;
+ uint right_mask = args.do_mask_processing ? (context->lr_mask & 0xff00) >> 8 : 0xff;
+
+ // how many BVH2 nodes do we need to allocate? For DFS children we pre-allocate the full subtree (2*N-1 nodes); BFS and leaf children get a single node
+ uint2 lr_node_counts = select( (uint2)(1,1), (2*lr_counts-1), v_is_dfs );
+ uint left_node_count = lr_node_counts.x;
+ uint right_node_count = lr_node_counts.y;
+
+ // allocate the nodes
+ uint first_node = BVH2_AllocateNodes( args.bvh2, left_node_count + right_node_count );
+
+ // point our root node at its children
+ left_bvh2_root = first_node;
+ right_bvh2_root = first_node + left_node_count;
+
+ // store combined geom bounds in the root node's AABB.. we previously stored centroid bounds there
+ // but node creation requires geom bounds
+ struct AABB3f geom_bounds = left_geom_bounds;
+ AABB3f_extend(&geom_bounds, &right_geom_bounds);
+ BVH2_WriteInnerNode( args.bvh2, args.bvh2_root, &geom_bounds, (uint2)(left_bvh2_root,right_bvh2_root), left_mask | right_mask );
+
+// printf(" node: %u mask: %x\n", args.bvh2_root, left_mask|right_mask );
+
+ // store the appropriate AABBs in the child nodes
+ // - BFS passes need centroid bounds
+ // - DFS passes need geom bounds
+ // Here we also write leaf connectivity information (prim start+count)
+ // this will be overwritten later if we are creating an inner node
+ struct AABB3f left_box, right_box;
+ left_box = AABB3f_select( left_geom_bounds, left_centroid_bounds, v_is_bfs.xxx );
+ right_box = AABB3f_select( right_geom_bounds, right_centroid_bounds, v_is_bfs.yyy );
+
+ uint left_start = primref_begin;
+ uint right_start = primref_begin + num_left;
+ BVH2_WriteLeafNode( args.bvh2, left_bvh2_root, &left_box, left_start, num_left, left_mask );
+ BVH2_WriteLeafNode( args.bvh2, right_bvh2_root, &right_box, right_start, num_right, right_mask );
+
+ // make input and output primref index buffers consistent in the event we're creating a leaf
+ // There should only ever be one leaf created, otherwise we'd have done a DFS pass sooner
+ if (any( v_is_leaf.xy ))
+ {
+ uint start = v_is_leaf.x ? left_start : right_start;
+ uint num_refs = v_is_leaf.x ? num_left : num_right;
+
+ for(uint i = 0; i < num_refs; i++)
+ {
+ args.primref_index_in[start + i] = args.primref_index_out[start + i];
+ }
+ }
+ }
+
+ // when BFS2 finishes, we need to dispatch two child tasks.
+ // DFS dispatches can run free and do not need a context
+ // BFS dispatches need a context.
+ // In the case where both of the child nodes are BFS, the current context can immediately run one of the child dispatches
+ // and the other is spilled for an unallocated context to pick up
+
+ uint num_dfs_dispatches = is_dfs(num_left) + is_dfs(num_right);
+ if (num_dfs_dispatches)
+ {
+ uint dfs_pos = atomic_add_local(&SLM_dfs_dispatch_count, num_dfs_dispatches);
+ if (is_dfs(num_left))
+ {
+ scheduler->dfs_queue.records[dfs_pos].primref_base = primref_begin;
+ scheduler->dfs_queue.records[dfs_pos].num_primrefs = num_left;
+ scheduler->dfs_queue.records[dfs_pos].bvh2_base = left_bvh2_root;
+ scheduler->dfs_queue.records[dfs_pos].tree_depth = depth + 1;
+ scheduler->dfs_queue.records[dfs_pos].batch_index = batch_index;
+ dfs_pos++;
+ }
+ if (is_dfs(num_right))
+ {
+ scheduler->dfs_queue.records[dfs_pos].primref_base = primref_begin + num_left;
+ scheduler->dfs_queue.records[dfs_pos].num_primrefs = num_right;
+ scheduler->dfs_queue.records[dfs_pos].bvh2_base = right_bvh2_root;
+ scheduler->dfs_queue.records[dfs_pos].tree_depth = depth + 1;
+ scheduler->dfs_queue.records[dfs_pos].batch_index = batch_index;
+ }
+ }
+
+ uint num_bfs_children = is_bfs(num_left) + is_bfs(num_right);
+ if (num_bfs_children)
+ {
+ uint place = atomic_add_local(&SLM_local_spill_stack_size, num_bfs_children);
+ if (is_bfs(num_left))
+ {
+ SLM_local_spill_stack[place].primref_begin = primref_begin;
+ SLM_local_spill_stack[place].primref_end = primref_begin + num_left;
+ SLM_local_spill_stack[place].bvh2_root = left_bvh2_root;
+ SLM_local_spill_stack[place].tree_depth = depth + 1;
+ SLM_local_spill_stack[place].batch_index = batch_index;
+ place++;
+ }
+ if (is_bfs(num_right))
+ {
+ SLM_local_spill_stack[place].primref_begin = primref_begin + num_left;
+ SLM_local_spill_stack[place].primref_end = primref_end;
+ SLM_local_spill_stack[place].bvh2_root = right_bvh2_root;
+ SLM_local_spill_stack[place].tree_depth = depth + 1;
+ SLM_local_spill_stack[place].batch_index = batch_index;
+ place++;
+ }
+ }
+ }
+
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ uint local_spill_stack_size = SLM_local_spill_stack_size;
+
+ struct BFS1SpillStackEntry entry;
+ state = VCONTEXT_STATE_UNALLOCATED;
+ if (context_id < local_spill_stack_size)
+ {
+ // pull BFS work from the local spill stack if there's enough work there
+ entry = SLM_local_spill_stack[context_id];
+ state = VCONTEXT_STATE_EXECUTING;
+ }
+ else if ((context_id - local_spill_stack_size) < (global_spill_stack_size))
+ {
+ // if there isn't enough work on the local stack, consume from the global one
+ uint global_pos = (global_spill_stack_size - 1) - (context_id - local_spill_stack_size);
+ entry = scheduler->bfs2_spill_stack.entries[global_pos];
+ state = VCONTEXT_STATE_EXECUTING;
+ }
+
+ // contexts which received work set themselves up for the next BFS1 dispatch
+ uint num_bfs_wgs = 0;
+ uint num_bfs_dispatches = 0;
+ if (state == VCONTEXT_STATE_EXECUTING)
+ {
+ context->dispatch_primref_begin = entry.primref_begin;
+ context->dispatch_primref_end = entry.primref_end;
+ context->bvh2_root = entry.bvh2_root;
+ context->tree_depth = entry.tree_depth;
+ context->batch_index = entry.batch_index;
+
+ context->num_left = 0;
+ context->num_right = 0;
+ context->lr_mask = 0;
+
+ batch_index = entry.batch_index;
+ num_bfs_wgs = get_num_wgs(entry.primref_end - entry.primref_begin, BFS_WG_SIZE);
+ num_bfs_dispatches = 1;
+ }
+
+
+ if (local_spill_stack_size > BFS_NUM_VCONTEXTS)
+ {
+ // write out additional spills if we produced more work than we can consume
+ uint excess_spills = local_spill_stack_size - BFS_NUM_VCONTEXTS;
+ uint write_base = global_spill_stack_size;
+ uint lid = get_local_id(0);
+ if (lid < excess_spills)
+ scheduler->bfs2_spill_stack.entries[write_base + lid] = SLM_local_spill_stack[BFS_NUM_VCONTEXTS + lid];
+
+ if (lid == 0)
+ scheduler->bfs2_spill_stack.size = global_spill_stack_size + excess_spills;
+ }
+ else if (global_spill_stack_size > 0)
+ {
+ // otherwise, if we consumed any spills from the global stack, update the stack size
+ if (get_local_id(0) == 0)
+ {
+ uint global_spills_consumed = min(global_spill_stack_size, BFS_NUM_VCONTEXTS - local_spill_stack_size);
+ scheduler->bfs2_spill_stack.size = global_spill_stack_size - global_spills_consumed;
+ }
+ }
+
+
+ // Do various WG reductions.. the code below is a hand-written version of the following:
+ //
+ // uint bfs_dispatch_queue_pos = work_group_scan_exclusive_add( num_bfs_dispatches );
+ // uint reduce_num_bfs_wgs = work_group_reduce_add(num_bfs_wgs);
+ // uint reduce_num_bfs_dispatches = work_group_reduce_add(num_bfs_dispatches);
+ uint bfs_dispatch_queue_pos;
+ uint reduce_num_bfs_dispatches;
+ uint reduce_num_bfs_wgs;
+ local uint partial_dispatches[SCHEDULER_WG_SIZE / SCHEDULER_SG_SIZE];
+ local uint partial_wgs[SCHEDULER_WG_SIZE / SCHEDULER_SG_SIZE];
+ {
+ partial_dispatches[get_sub_group_id()] = sub_group_reduce_add(num_bfs_dispatches);
+ partial_wgs[get_sub_group_id()] = sub_group_reduce_add(num_bfs_wgs);
+
+ uint sg_prefix = sub_group_scan_exclusive_add(num_bfs_dispatches);
+
+ uint prefix_dispatches = 0;
+ uint total_dispatches = 0;
+ uint total_wgs = 0;
+ ushort lane = get_sub_group_local_id();
+
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ for (ushort i = 0; i < SCHEDULER_NUM_SGS; i += SCHEDULER_SG_SIZE) // this loop is intended to be fully unrolled after compilation
+ {
+ uint p_dispatch = partial_dispatches[i + lane];
+ uint p_wg = partial_wgs[i + lane];
+
+ prefix_dispatches += (i + lane < get_sub_group_id()) ? p_dispatch : 0;
+ total_dispatches += p_dispatch;
+ total_wgs += p_wg;
+ }
+
+ bfs_dispatch_queue_pos = sg_prefix + sub_group_reduce_add(prefix_dispatches);
+ reduce_num_bfs_dispatches = sub_group_reduce_add(total_dispatches);
+ reduce_num_bfs_wgs = sub_group_reduce_add(total_wgs);
+ }
+
+ // insert records into BFS queue
+ if (num_bfs_dispatches)
+ {
+ scheduler->bfs_queue.wg_count[bfs_dispatch_queue_pos] = num_bfs_wgs;
+ scheduler->bfs_queue.records[bfs_dispatch_queue_pos].context_id = context_id;
+ scheduler->bfs_queue.records[bfs_dispatch_queue_pos].batch_index = batch_index;
+ }
+
+
+ // store modified vcontext state if it has changed
+ if (initial_state != state)
+ scheduler->vcontext_state[context_id] = state;
+
+
+ // store workgroup counters
+ if (get_local_id(0) == 0)
+ {
+ scheduler->bfs_queue.num_dispatches = reduce_num_bfs_dispatches;
+ scheduler->num_bfs_wgs = reduce_num_bfs_wgs;
+ scheduler->num_dfs_wgs = SLM_dfs_dispatch_count;
+ }
+
+ const uint contexts_to_clear = min( (uint)BFS_NUM_VCONTEXTS, (uint)(local_spill_stack_size+global_spill_stack_size) );
+
+ for ( uint i = get_sub_group_id(); i < contexts_to_clear; i += get_num_sub_groups() )
+ BinInfo_init_subgroup( &scheduler->contexts[i].global_bin_info );
+
+ for ( uint i = get_sub_group_id(); i < contexts_to_clear; i += get_num_sub_groups() )
+ LRBounds_init_subgroup( &scheduler->contexts[i].lr_bounds );
+}
+
+#if 0
+uint record_search( struct BFSDispatchRecord* record_out, global struct BFSDispatchQueue* queue )
+{
+ uint group = get_group_id(0);
+ ushort lane = get_sub_group_local_id();
+ uint num_dispatches = queue->num_dispatches;
+ uint base = 0;
+ for (uint i = 0; i < num_dispatches; i += get_sub_group_size())
+ {
+ uint counts = intel_sub_group_block_read(&queue->wg_count[i]);
+
+ for (uint j = 0; j < get_sub_group_size(); j++)
+ {
+ uint n = sub_group_broadcast(counts, j);
+ if (group < n)
+ {
+ *record_out = queue->records[i + j];
+ return group;
+ }
+ group -= n;
+ }
+ }
+
+ return 0; // NOTE: unreachable in practice
+}
+#endif
+
+
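+// Map this work-group's flat group id onto (dispatch record, group index within that dispatch)
+// by walking the per-dispatch wg_count array with sub-group block reads and exclusive prefix
+// sums until the group id falls inside one dispatch's range.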
+uint record_search(struct BFSDispatchRecord* record_out, global struct BFSDispatchQueue* queue)
+{
+ uint group = get_group_id(0);
+
+ uint num_dispatches = queue->num_dispatches;
+
+ uint dispatch_id = 0;
+ uint local_id = 0;
+ uint i = 0;
+ do
+ {
+ uint counts = intel_sub_group_block_read(&queue->wg_count[i]);
+ uint prefix = sub_group_scan_exclusive_add(counts);
+
+ uint g = group - prefix;
+ uint ballot = intel_sub_group_ballot(g < counts);
+ if (ballot)
+ {
+ uint lane = ctz(ballot);
+ dispatch_id = i + lane;
+ local_id = intel_sub_group_shuffle(g, lane);
+ break;
+ }
+
+ group -= sub_group_broadcast(prefix + counts, get_sub_group_size() - 1);
+
+ i += get_sub_group_size();
+ } while (i < num_dispatches);
+
+
+ *record_out = queue->records[dispatch_id];
+ return local_id;
+}
+
+
+
+
+struct BFSDispatchArgs get_bfs_args(struct BFSDispatchRecord* record, global struct VContextScheduler* scheduler, global struct SAHBuildGlobals* globals, uint local_group_id)
+{
+ uint context_id = record->context_id;
+ struct VContext* context = &scheduler->contexts[context_id];
+ bool odd_pass = context->tree_depth & 1;
+
+ struct BFSDispatchArgs args;
+ args.scheduler = scheduler;
+ args.primref_index_in = SAHBuildGlobals_GetPrimrefIndices_In( globals, odd_pass );
+ args.primref_index_out = SAHBuildGlobals_GetPrimrefIndices_Out( globals, odd_pass );
+ args.primref_buffer = SAHBuildGlobals_GetPrimrefs( globals );
+ args.wg_primref_begin = context->dispatch_primref_begin + local_group_id * BFS_WG_SIZE;
+ args.wg_primref_end = min( args.wg_primref_begin + BFS_WG_SIZE, context->dispatch_primref_end );
+ args.dispatch_primref_begin = context->dispatch_primref_begin;
+ args.dispatch_primref_end = context->dispatch_primref_end;
+ args.context_id = context_id;
+ args.context = &scheduler->contexts[context_id];
+ args.num_wgs = ((args.dispatch_primref_end - args.dispatch_primref_begin) + BFS_WG_SIZE - 1) / BFS_WG_SIZE;
+ args.bvh2_root = context->bvh2_root;
+ args.bvh2 = SAHBuildGlobals_GetBVH2( globals );
+ args.global_num_primrefs = SAHBuildGlobals_GetTotalPrimRefs( globals );
+ args.do_mask_processing = SAHBuildGlobals_NeedMasks( globals );
+ return args;
+}
+
+struct BFSDispatchArgs get_bfs_args_queue( global struct BFSDispatchQueue* queue,
+ global struct VContextScheduler* scheduler,
+ global struct SAHBuildGlobals* globals )
+{
+
+ // TODO_OPT: Load this entire prefix array into SLM instead of searching..
+ // Or use sub-group ops
+
+ struct BFSDispatchRecord record;
+ uint local_group_id = record_search(&record, queue);
+
+ return get_bfs_args(&record, scheduler, globals, local_group_id);
+}
+
+
+struct BFSDispatchArgs get_bfs_args_from_record( struct BFSDispatchRecord* record,
+ global struct VContextScheduler* scheduler,
+ global struct SAHBuildGlobals* globals )
+{
+ return get_bfs_args(record, scheduler, globals, 0);
+}
+
+
+struct BFSDispatchArgs get_bfs_args_batchable(
+ global struct BFSDispatchQueue* queue,
+ global struct VContextScheduler* scheduler,
+ global struct SAHBuildGlobals* globals_buffer )
+{
+
+ // TODO_OPT: Load this entire prefix array into SLM instead of searching..
+ // Or use sub-group ops
+
+ struct BFSDispatchRecord record;
+ uint local_group_id = record_search(&record, queue);
+
+ global struct SAHBuildGlobals* globals = globals_buffer + record.batch_index;
+
+ return get_bfs_args(&record, scheduler, globals, local_group_id);
+}
+
+
+struct BFSDispatchArgs get_bfs_args_from_record_batchable(
+ struct BFSDispatchRecord* record,
+ global struct VContextScheduler* scheduler,
+ global struct SAHBuildGlobals* globals_buffer )
+{
+ global struct SAHBuildGlobals* globals = globals_buffer + record->batch_index;
+
+ return get_bfs_args(record, scheduler, globals, 0);
+}
+
+struct BFSDispatchArgs get_bfs_args_initial( global struct VContextScheduler* scheduler, global struct SAHBuildGlobals* globals )
+{
+ uint context_id = 0;
+
+ uint num_refs = SAHBuildGlobals_GetTotalPrimRefs( globals );
+
+ struct BFSDispatchArgs args;
+ args.scheduler = scheduler;
+ args.primref_index_in = SAHBuildGlobals_GetPrimrefIndices_In( globals, false );
+ args.primref_index_out = SAHBuildGlobals_GetPrimrefIndices_Out( globals, false );
+ args.primref_buffer = SAHBuildGlobals_GetPrimrefs( globals );
+ args.wg_primref_begin = get_group_id(0) * BFS_WG_SIZE;
+ args.wg_primref_end = min( args.wg_primref_begin + BFS_WG_SIZE, num_refs );
+ args.dispatch_primref_begin = 0;
+ args.dispatch_primref_end = num_refs;
+ args.context_id = context_id;
+ args.context = &scheduler->contexts[context_id];
+ args.num_wgs = ((args.dispatch_primref_end - args.dispatch_primref_begin) + BFS_WG_SIZE - 1) / BFS_WG_SIZE;
+ args.bvh2 = SAHBuildGlobals_GetBVH2( globals );
+ args.bvh2_root = BVH2_GetRoot( args.bvh2 );
+ args.global_num_primrefs = SAHBuildGlobals_GetTotalPrimRefs( globals );
+ args.do_mask_processing = SAHBuildGlobals_NeedMasks(globals);
+ return args;
+}
+
+
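+// Build the centroid-to-bin mapping for one node: bin = (centroid - ofs) * scale per axis,
+// with scale = 0.99 * num_bins / extent so the largest centroid still maps below num_bins.
+// Axes whose extent is degenerate (<= eps) or extremely large (>= 1e34) get scale 0, so every
+// primref lands in bin 0 and the axis is later treated as unsplittable.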
+inline void BinMapping_init( struct BinMapping* binMapping, struct AABB3f* centBounds, const uint bins )
+{
+ const float4 eps = 1E-34f;
+ const float4 omega = 1E+34f;
+ float3 l = AABB3f_load_lower( centBounds );
+ float3 u = AABB3f_load_upper( centBounds );
+ float4 diag;
+ diag.xyz = max( eps.xyz, u - l );
+ diag.w = 0;
+ float4 scale = (float4)(0.99f * (float)bins) / diag;
+ scale = select( (float4)(0.0f), scale, diag > eps );
+ scale = select( (float4)(0.0f), scale, diag < omega );
+ binMapping->scale = scale;
+ binMapping->ofs.xyz = l.xyz;
+ binMapping->ofs.w = 0;
+}
+
+
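+// Pack each candidate split into a 64-bit key: the SAH cost in the high 32 bits and
+// (bin << 2) | axis in the low bits. SAH costs are non-negative, so their IEEE-754 bit
+// patterns order the same way as the float values, and a plain integer min() across the
+// sub-group selects the cheapest split with a deterministic tie-break on (bin, axis).
+// Zero-scale (degenerate) axes are replaced by defaultSplit, i.e. infinite cost.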
+inline ulong getBestSplit( float3 sah, uint ID, const float4 scale, const ulong defaultSplit )
+{
+ ulong splitX = (((ulong)as_uint( sah.x )) << 32) | ((uint)ID << 2) | 0;
+ ulong splitY = (((ulong)as_uint( sah.y )) << 32) | ((uint)ID << 2) | 1;
+ ulong splitZ = (((ulong)as_uint( sah.z )) << 32) | ((uint)ID << 2) | 2;
+ /* ignore zero sized dimensions */
+ splitX = select( splitX, defaultSplit, (ulong)(scale.x == 0) );
+ splitY = select( splitY, defaultSplit, (ulong)(scale.y == 0) );
+ splitZ = select( splitZ, defaultSplit, (ulong)(scale.z == 0) );
+ ulong bestSplit = min( min( splitX, splitY ), splitZ );
+ bestSplit = sub_group_reduce_min( bestSplit );
+ return bestSplit;
+}
+
+
+
+inline float left_to_right_area16( struct AABB3f* low )
+{
+ struct AABB3f low_prefix = AABB3f_sub_group_scan_exclusive_min_max( low );
+ return halfArea_AABB3f( &low_prefix );
+}
+
+inline uint left_to_right_counts16( uint low )
+{
+ return sub_group_scan_exclusive_add( low );
+}
+
+inline float right_to_left_area16( struct AABB3f* low )
+{
+ const uint subgroupLocalID = get_sub_group_local_id();
+ const uint subgroup_size = get_sub_group_size();
+ const uint ID = subgroup_size - 1 - subgroupLocalID;
+ struct AABB3f low_reverse = AABB3f_sub_group_shuffle( low, ID );
+ struct AABB3f low_prefix = AABB3f_sub_group_scan_inclusive_min_max( &low_reverse );
+ const float low_area = intel_sub_group_shuffle( halfArea_AABB3f( &low_prefix ), ID );
+ return low_area;
+}
+
+inline uint right_to_left_counts16( uint low )
+{
+ const uint subgroupLocalID = get_sub_group_local_id();
+ const uint subgroup_size = get_sub_group_size();
+ const uint ID = subgroup_size - 1 - subgroupLocalID;
+ const uint low_reverse = intel_sub_group_shuffle( low, ID );
+ const uint low_prefix = sub_group_scan_inclusive_add( low_reverse );
+ return intel_sub_group_shuffle( low_prefix, ID );
+}
+
+inline float2 left_to_right_area32( struct AABB3f* low, struct AABB3f* high )
+{
+ struct AABB3f low_prefix = AABB3f_sub_group_scan_exclusive_min_max( low );
+ struct AABB3f low_reduce = AABB3f_sub_group_reduce( low );
+ struct AABB3f high_prefix = AABB3f_sub_group_scan_exclusive_min_max( high );
+ AABB3f_extend( &high_prefix, &low_reduce );
+ const float low_area = halfArea_AABB3f( &low_prefix );
+ const float high_area = halfArea_AABB3f( &high_prefix );
+ return (float2)(low_area, high_area);
+}
+
+inline uint2 left_to_right_counts32( uint low, uint high )
+{
+ const uint low_prefix = sub_group_scan_exclusive_add( low );
+ const uint low_reduce = sub_group_reduce_add( low );
+ const uint high_prefix = sub_group_scan_exclusive_add( high );
+ return (uint2)(low_prefix, low_reduce + high_prefix);
+}
+
+inline float2 right_to_left_area32( struct AABB3f* low, struct AABB3f* high )
+{
+ const uint subgroupLocalID = get_sub_group_local_id();
+ const uint subgroup_size = get_sub_group_size();
+ const uint ID = subgroup_size - 1 - subgroupLocalID;
+ struct AABB3f low_reverse = AABB3f_sub_group_shuffle( high, ID );
+ struct AABB3f high_reverse = AABB3f_sub_group_shuffle( low, ID );
+ struct AABB3f low_prefix = AABB3f_sub_group_scan_inclusive_min_max( &low_reverse );
+ struct AABB3f low_reduce = AABB3f_sub_group_reduce( &low_reverse );
+ struct AABB3f high_prefix = AABB3f_sub_group_scan_inclusive_min_max( &high_reverse );
+ AABB3f_extend( &high_prefix, &low_reduce );
+ const float low_area = intel_sub_group_shuffle( halfArea_AABB3f( &high_prefix ), ID );
+ const float high_area = intel_sub_group_shuffle( halfArea_AABB3f( &low_prefix ), ID );
+ return (float2)(low_area, high_area);
+}
+
+inline uint2 right_to_left_counts32( uint low, uint high )
+{
+ const uint subgroupLocalID = get_sub_group_local_id();
+ const uint subgroup_size = get_sub_group_size();
+ const uint ID = subgroup_size - 1 - subgroupLocalID;
+ const uint low_reverse = intel_sub_group_shuffle( high, ID );
+ const uint high_reverse = intel_sub_group_shuffle( low, ID );
+ const uint low_prefix = sub_group_scan_inclusive_add( low_reverse );
+ const uint low_reduce = sub_group_reduce_add( low_reverse );
+ const uint high_prefix = sub_group_scan_inclusive_add( high_reverse ) + low_reduce;
+ return (uint2)(intel_sub_group_shuffle( high_prefix, ID ), intel_sub_group_shuffle( low_prefix, ID ));
+}
+
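+// Integer divide-by-6 without a hardware divide: halve v, then multiply by
+// 0x55555556 == ceil(2^32 / 3) and keep the high 32 bits, which yields
+// floor((v/2)/3) == floor(v/6) exactly for any 32-bit input.
+// Example: fastDivideBy6_uint(37) -> (18 * 0x55555556) >> 32 == 6.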
+inline uint fastDivideBy6_uint( uint v )
+{
+#if 1
+ const ulong u = (ulong)v >> 1;
+ return (uint)((u * 0x55555556ul) >> 32);
+#else
+ return v / 6;
+#endif
+}
+
+inline uint3 fastDivideBy6_uint3( uint3 v )
+{
+ return (uint3)(fastDivideBy6_uint( v.x ), fastDivideBy6_uint( v.y ), fastDivideBy6_uint( v.z ));
+}
+
+#define SAH_LOG_BLOCK_SHIFT 2
+
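+// Evaluate the SAH for every candidate split position (one bin per sub-group lane) on all
+// three axes inside a single 16-wide sub-group: exclusive scans give the left-hand areas and
+// counts, reversed inclusive scans give the right-hand ones, counts are rounded up to
+// ceil(count/6) primitive blocks, and getBestSplit() reduces to the cheapest (axis, bin)
+// pair. Bin 0 is marked invalid because splitting before the first bin would leave one side
+// empty.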
+inline struct BFS_Split BinInfo_reduce( struct BFS_BinInfo* binInfo, const float4 scale )
+{
+ const uint subgroupLocalID = get_sub_group_local_id();
+ const uint subgroup_size = get_sub_group_size();
+
+ struct AABB3f boundsX = BinInfo_get_AABB( binInfo, subgroupLocalID, 0 );
+
+ const float lr_areaX = left_to_right_area16( &boundsX );
+ const float rl_areaX = right_to_left_area16( &boundsX );
+
+ struct AABB3f boundsY = BinInfo_get_AABB( binInfo, subgroupLocalID, 1 );
+
+ const float lr_areaY = left_to_right_area16( &boundsY );
+ const float rl_areaY = right_to_left_area16( &boundsY );
+
+ struct AABB3f boundsZ = BinInfo_get_AABB( binInfo, subgroupLocalID, 2 );
+
+ const float lr_areaZ = left_to_right_area16( &boundsZ );
+ const float rl_areaZ = right_to_left_area16( &boundsZ );
+
+ const uint3 counts = BinInfo_get_counts( binInfo, subgroupLocalID );
+
+ const uint lr_countsX = left_to_right_counts16( counts.x );
+ const uint rl_countsX = right_to_left_counts16( counts.x );
+ const uint lr_countsY = left_to_right_counts16( counts.y );
+ const uint rl_countsY = right_to_left_counts16( counts.y );
+ const uint lr_countsZ = left_to_right_counts16( counts.z );
+ const uint rl_countsZ = right_to_left_counts16( counts.z );
+
+ const float3 lr_area = (float3)(lr_areaX, lr_areaY, lr_areaZ);
+ const float3 rl_area = (float3)(rl_areaX, rl_areaY, rl_areaZ);
+
+ const uint3 lr_count = fastDivideBy6_uint3( (uint3)(lr_countsX, lr_countsY, lr_countsZ) + 6 - 1 );
+ const uint3 rl_count = fastDivideBy6_uint3( (uint3)(rl_countsX, rl_countsY, rl_countsZ) + 6 - 1 );
+ float3 sah = fma( lr_area, convert_float3( lr_count ), rl_area * convert_float3( rl_count ) );
+
+ /* first bin is invalid */
+ sah.x = select( (float)(INFINITY), sah.x, subgroupLocalID != 0 );
+ sah.y = select( (float)(INFINITY), sah.y, subgroupLocalID != 0 );
+ sah.z = select( (float)(INFINITY), sah.z, subgroupLocalID != 0 );
+
+ const ulong defaultSplit = (((ulong)as_uint( (float)(INFINITY) )) << 32);
+
+ const ulong bestSplit = getBestSplit( sah, subgroupLocalID, scale, defaultSplit );
+
+ struct BFS_Split split;
+ split.sah = as_float( (uint)(bestSplit >> 32) );
+ split.dim = (uint)bestSplit & 3;
+ split.pos = (uint)bestSplit >> 2;
+
+ return split;
+}
+
+
+struct BFS_BinInfoReduce3_SLM
+{
+ uint sah[3*BFS_NUM_BINS];
+};
+
+
+
+inline struct BFS_Split BinInfo_reduce3( local struct BFS_BinInfoReduce3_SLM* slm, struct BFS_BinInfo* binInfo, const float4 scale )
+{
+ // process each bin/axis combination across sub-groups
+ for (uint i = get_sub_group_id(); i < 3 * BFS_NUM_BINS; i += get_num_sub_groups())
+ {
+ uint my_bin = i % BFS_NUM_BINS;
+ uint my_axis = i / BFS_NUM_BINS;
+
+ float3 left_lower = (float3)(INFINITY,INFINITY,INFINITY);
+ float3 left_upper = -left_lower;
+ float3 right_lower = (float3)(INFINITY,INFINITY,INFINITY);
+ float3 right_upper = -right_lower;
+
+ // load the other bins and assign them to the left or to the right
+ // of this subgroup's bin
+ uint lane = get_sub_group_local_id();
+ struct AABB3f sg_bins = BinInfo_get_AABB(binInfo,lane,my_axis);
+
+ bool is_left = (lane < my_bin);
+ float3 lower = AABB3f_load_lower(&sg_bins);
+ float3 upper = AABB3f_load_upper(&sg_bins);
+
+ float3 lower_l = select_min( lower, is_left );
+ float3 upper_l = select_max( upper, is_left );
+ float3 lower_r = select_min( lower, !is_left );
+ float3 upper_r = select_max( upper, !is_left );
+
+ lower_l = sub_group_reduce_min_float3( lower_l );
+ lower_r = sub_group_reduce_min_float3( lower_r );
+ upper_l = sub_group_reduce_max_float3( upper_l );
+ upper_r = sub_group_reduce_max_float3( upper_r );
+ float3 dl = upper_l - lower_l;
+ float3 dr = upper_r - lower_r;
+ float area_l = dl.x* (dl.y + dl.z) + (dl.y * dl.z);
+ float area_r = dr.x* (dr.y + dr.z) + (dr.y * dr.z);
+
+ // get the counts
+ uint sg_bin_count = BinInfo_get_count(binInfo, lane, my_axis);
+ uint count_l = (is_left) ? sg_bin_count : 0;
+ uint count_r = (is_left) ? 0 : sg_bin_count;
+ count_l = sub_group_reduce_add(count_l);
+ count_r = sub_group_reduce_add(count_r);
+
+ // compute sah
+ count_l = fastDivideBy6_uint(count_l + 6 - 1);
+ count_r = fastDivideBy6_uint(count_r + 6 - 1);
+ float lr_partial = area_l * count_l;
+ float rl_partial = area_r * count_r;
+ float sah = lr_partial + rl_partial;
+
+ // first bin is invalid
+ sah = select((float)(INFINITY), sah, my_bin != 0);
+
+ // ignore zero sized dimensions
+ sah = select( sah, (float)(INFINITY), (scale.x == 0 && my_axis == 0) );
+ sah = select( sah, (float)(INFINITY), (scale.y == 0 && my_axis == 1) );
+ sah = select( sah, (float)(INFINITY), (scale.z == 0 && my_axis == 2) );
+
+ // tuck the axis into the bottom bits of sah cost.
+ // The result is an integer between 0 and +inf (7F800000)
+ // If we have 3 axes with infinite sah cost, we will select axis 0
+ slm->sah[i] = (as_uint(sah)&~0x3) | my_axis;
+ }
+
+ barrier( CLK_LOCAL_MEM_FENCE );
+
+ // reduce split candidates down to one subgroup
+ // sah is strictly positive, so integer compares can be used
+ // which results in a faster sub_group_reduce_min()
+ //
+ uint best_sah = 0xffffffff;
+
+ uint lid = get_sub_group_local_id();
+ if (lid < BFS_NUM_BINS)
+ {
+ best_sah = slm->sah[lid];
+ lid += BFS_NUM_BINS;
+ best_sah = min( best_sah, slm->sah[lid] );
+ lid += BFS_NUM_BINS;
+ best_sah = min( best_sah, slm->sah[lid] );
+ }
+
+ uint reduced_bestsah = sub_group_reduce_min( best_sah );
+ uint best_bin = ctz(intel_sub_group_ballot(best_sah == reduced_bestsah));
+ uint best_axis = as_uint(reduced_bestsah) & 0x3;
+
+ struct BFS_Split ret;
+ ret.sah = as_float(reduced_bestsah);
+ ret.dim = best_axis;
+ ret.pos = best_bin;
+ return ret;
+}
+
+
+struct BFS_BinInfoReduce_SLM
+{
+ struct
+ {
+ float sah;
+ uint bin;
+ } axisInfo[3];
+};
+
+
+
+inline struct BFS_Split BinInfo_reduce2( local struct BFS_BinInfoReduce_SLM* slm, struct BFS_BinInfo* binInfo, const float4 scale, uint num_primrefs)
+{
+ ushort my_axis = get_sub_group_id();
+ ushort my_bin = get_sub_group_local_id();
+
+ if (my_axis < 3)
+ {
+ struct AABB3f aabb = BinInfo_get_AABB(binInfo, my_bin, my_axis);
+ uint count = BinInfo_get_count(binInfo, my_bin, my_axis);
+
+ float lr_area = left_to_right_area16(&aabb);
+ float rl_area = right_to_left_area16(&aabb);
+
+ uint lr_count = sub_group_scan_exclusive_add(count);
+ uint rl_count = num_primrefs - lr_count;
+
+ lr_count = fastDivideBy6_uint(lr_count + 6 - 1);
+ rl_count = fastDivideBy6_uint(rl_count + 6 - 1);
+ float lr_partial = lr_area * lr_count;
+ float rl_partial = rl_area * rl_count;
+ float sah = lr_partial + rl_partial;
+
+ // first bin is invalid
+ sah = select((float)(INFINITY), sah, my_bin != 0);
+
+ float best_sah = sub_group_reduce_min( sah );
+ uint best_bin = ctz(intel_sub_group_ballot(sah == best_sah));
+
+ // ignore zero sized dimensions
+ best_sah = select( best_sah, (float)(INFINITY), (scale.x == 0 && my_axis == 0) );
+ best_sah = select( best_sah, (float)(INFINITY), (scale.y == 0 && my_axis == 1) );
+ best_sah = select( best_sah, (float)(INFINITY), (scale.z == 0 && my_axis == 2) );
+
+ if (get_sub_group_local_id() == 0)
+ {
+ slm->axisInfo[my_axis].sah = best_sah;
+ slm->axisInfo[my_axis].bin = best_bin;
+ }
+ }
+ barrier( CLK_LOCAL_MEM_FENCE );
+
+ float sah = (float)(INFINITY);
+ if( get_sub_group_local_id() < 3 )
+ sah = slm->axisInfo[get_sub_group_local_id()].sah;
+
+ float bestsah = min(sub_group_broadcast(sah, 0), min(sub_group_broadcast(sah, 1), sub_group_broadcast(sah, 2)));
+ uint bestAxis = ctz( intel_sub_group_ballot(bestsah == sah) );
+
+ struct BFS_Split split;
+ split.sah = bestsah;
+ split.dim = bestAxis;
+ split.pos = slm->axisInfo[bestAxis].bin;
+ return split;
+}
+
+
+inline bool is_left( struct BinMapping* binMapping, struct BFS_Split* split, struct AABB* primref )
+{
+ const uint dim = split->dim;
+ const float lower = primref->lower[dim];
+ const float upper = primref->upper[dim];
+ const float c = lower + upper;
+ const uint pos = convert_uint_rtz( (c - binMapping->ofs[dim]) * binMapping->scale[dim] );
+ return pos < split->pos;
+}
+
+struct BFS_Pass1_SLM
+{
+ struct BFS_BinInfo bin_info;
+// struct BFS_BinInfoReduce3_SLM reduce3;
+};
+
+
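+// BFS pass 1: each work-group bins its slice of primrefs. The node's current AABB holds the
+// centroid bounds, which define the bin mapping; every thread adds its primref to a bin-info
+// table in SLM, and the local table is then merged into the context's global_bin_info.
+// Split selection happens in pass 2, once all work-groups of the dispatch have contributed.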
+void DO_BFS_pass1( local struct BFS_Pass1_SLM* slm,
+ uint thread_primref_id,
+ bool thread_primref_valid,
+ struct BFSDispatchArgs args
+ )
+{
+ local struct BFS_BinInfo* local_bin_info = &slm->bin_info;
+ global struct VContext* context = args.context;
+ struct AABB3f centroid_bounds = BVH2_GetNodeBox( args.bvh2, args.bvh2_root ); // root AABB is initialized to centroid bounds
+
+ struct BinMapping bin_mapping;
+ BinMapping_init( &bin_mapping, &centroid_bounds, BFS_NUM_BINS );
+
+ // fetch this thread's primref
+ PrimRef ref;
+ if ( thread_primref_valid )
+ ref = args.primref_buffer[thread_primref_id];
+
+ // init bin info
+ BinInfo_init( local_bin_info );
+
+ // fence on local bin-info init
+ barrier( CLK_LOCAL_MEM_FENCE );
+
+ // merge this thread's primref into local bin info
+ BinInfo_add_primref( &bin_mapping, local_bin_info, &ref, thread_primref_valid );
+
+ // fence on local bin-info update
+ barrier( CLK_LOCAL_MEM_FENCE );
+
+ BinInfo_merge(&context->global_bin_info, local_bin_info);
+}
+
+
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__( (reqd_work_group_size(BFS_WG_SIZE,1,1)))
+__attribute__((intel_reqd_sub_group_size(16)))
+kernel void
+BFS_pass1_indexed(
+ global struct VContextScheduler* scheduler,
+ global struct SAHBuildGlobals* sah_globals )
+{
+ local struct BFS_Pass1_SLM slm;
+ struct BFSDispatchArgs args = get_bfs_args_queue( &scheduler->bfs_queue, scheduler, sah_globals );
+
+ bool thread_primref_valid = (args.wg_primref_begin + get_local_id( 0 )) < args.wg_primref_end;
+ uint thread_primref_id = 0;
+ if ( thread_primref_valid )
+ thread_primref_id = args.primref_index_in[args.wg_primref_begin + get_local_id( 0 )];
+
+ DO_BFS_pass1( &slm, thread_primref_id, thread_primref_valid, args );
+}
+
+
+__attribute__( (reqd_work_group_size( BFS_WG_SIZE, 1, 1 )) )
+__attribute__((intel_reqd_sub_group_size(16)))
+kernel void
+BFS_pass1_initial( global struct VContextScheduler* scheduler, global struct SAHBuildGlobals* sah_globals )
+{
+ local struct BFS_Pass1_SLM slm;
+ struct BFSDispatchArgs args = get_bfs_args_initial( scheduler, sah_globals );
+
+ uint thread_primref_id = args.wg_primref_begin + get_local_id( 0 );
+ bool thread_primref_valid = thread_primref_id < args.wg_primref_end;
+
+ DO_BFS_pass1( &slm, thread_primref_id, thread_primref_valid, args );
+}
+
+
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(BFS_WG_SIZE, 1, 1)))
+__attribute__((intel_reqd_sub_group_size(16)))
+kernel void
+BFS_pass1_indexed_batchable(
+ global struct VContextScheduler* scheduler,
+ global struct SAHBuildGlobals* globals_buffer )
+{
+ local struct BFS_Pass1_SLM slm;
+ struct BFSDispatchArgs args = get_bfs_args_batchable( &scheduler->bfs_queue, scheduler, globals_buffer );
+
+ bool thread_primref_valid = (args.wg_primref_begin + get_local_id(0)) < args.wg_primref_end;
+ uint thread_primref_id = 0;
+ if (thread_primref_valid)
+ thread_primref_id = args.primref_index_in[args.wg_primref_begin + get_local_id(0)];
+
+ DO_BFS_pass1(&slm, thread_primref_id, thread_primref_valid, args);
+}
+
+
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(BFS_WG_SIZE, 1, 1)))
+__attribute__((intel_reqd_sub_group_size(16)))
+kernel void
+BFS_pass1_initial_batchable( global struct VContextScheduler* scheduler, global struct SAHBuildGlobals* globals_buffer )
+{
+ local struct BFS_Pass1_SLM slm;
+ struct BFSDispatchArgs args = get_bfs_args_batchable( &scheduler->bfs_queue, scheduler, globals_buffer );
+
+ uint thread_primref_id = args.wg_primref_begin + get_local_id(0);
+ bool thread_primref_valid = thread_primref_id < args.wg_primref_end;
+
+ DO_BFS_pass1(&slm, thread_primref_id, thread_primref_valid, args);
+}
+
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+///
+/// BVH2 construction -- BFS Phase Pass2
+///
+/////////////////////////////////////////////////////////////////////////////////////////////////
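+// BFS pass 2: every work-group re-derives the best split from the bins accumulated in pass 1,
+// classifies each of its primrefs as left or right of the split, accumulates left/right
+// centroid and geometry bounds (and, optionally, instance masks) in SLM, reserves output
+// slots in the shared primref index buffer with global atomics (the left side grows forward
+// from the dispatch begin, the right side grows backward from the dispatch end), and scatters
+// the primref ids to their new positions. The scheduler kernel then consumes num_left,
+// num_right and the merged bounds to finalize the node.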
+
+struct BFS_Pass2_SLM
+{
+ struct BFS_BinInfoReduce3_SLM reduce3;
+ //struct AABB3f left_centroid_bounds;
+ //struct AABB3f right_centroid_bounds;
+ //struct AABB3f left_geom_bounds;
+ //struct AABB3f right_geom_bounds;
+ LRBounds lr_bounds;
+ uint left_count;
+ uint right_count;
+ uint lr_mask;
+ uint left_primref_base;
+ uint right_primref_base;
+// uint num_wgs;
+
+// uint output_indices[BFS_WG_SIZE];
+};
+
+
+
+
+
+
+
+void DO_BFS_pass2(
+ local struct BFS_Pass2_SLM* slm,
+ uint thread_primref_id,
+ bool thread_primref_valid,
+ struct BFSDispatchArgs args
+)
+{
+ global struct VContext* context = args.context;
+
+ struct AABB3f centroid_bounds = BVH2_GetNodeBox( args.bvh2, args.bvh2_root );
+
+ // load the thread's primref
+ PrimRef ref;
+ if ( thread_primref_valid )
+ ref = args.primref_buffer[thread_primref_id];
+
+ struct BinMapping bin_mapping;
+ BinMapping_init( &bin_mapping, &centroid_bounds, BFS_NUM_BINS );
+
+ // initialize working SLM space
+ LRBounds_init(&slm->lr_bounds);
+ if(get_local_id(0) == 0)
+ {
+ slm->left_count = 0;
+ slm->right_count = 0;
+
+ if( args.do_mask_processing )
+ slm->lr_mask = 0;
+ }
+
+ // compute split - every workgroup does the same computation
+ // local barrier inside BinInfo_reduce3
+ struct BFS_Split split = BinInfo_reduce3( &slm->reduce3, &context->global_bin_info,bin_mapping.scale );
+
+ uint wg_prim_count = args.wg_primref_end - args.wg_primref_begin;
+
+ // partition primrefs into L/R subsets...
+ bool go_left = false;
+ if (split.sah == (float)(INFINITY)) // no valid split, split in the middle.. This can happen due to floating-point limit cases in huge scenes
+ go_left = get_local_id(0) < (wg_prim_count / 2);
+ else
+ go_left = is_left( &bin_mapping, &split, &ref );
+
+ // assign this primref a position in the output array, and expand corresponding centroid-bounds
+ uint local_index;
+ {
+ float3 centroid = ref.lower.xyz + ref.upper.xyz;
+
+ uint l_ballot = intel_sub_group_ballot( go_left && thread_primref_valid );
+ uint r_ballot = intel_sub_group_ballot( !go_left && thread_primref_valid );
+ if (l_ballot)
+ {
+ bool active_lane = l_ballot & (1 << get_sub_group_local_id());
+ float3 Cmin, Cmax, Gmin, Gmax;
+ Cmin = select_min( centroid.xyz, active_lane );
+ Cmax = select_max( centroid.xyz, active_lane );
+ Gmin = select_min( ref.lower.xyz, active_lane );
+ Gmax = select_max( ref.upper.xyz, active_lane );
+
+ Cmin = sub_group_reduce_min_float3( Cmin );
+ Cmax = sub_group_reduce_max_float3( Cmax );
+ Gmin = sub_group_reduce_min_float3( Gmin );
+ Gmax = sub_group_reduce_max_float3( Gmax );
+
+ LRBounds_merge_left( &slm->lr_bounds, Cmin,Cmax,Gmin,Gmax );
+ }
+
+ if (r_ballot)
+ {
+ bool active_lane = r_ballot & (1 << get_sub_group_local_id());
+ float3 Cmin, Cmax, Gmin, Gmax;
+ Cmin = select_min(centroid.xyz, active_lane);
+ Cmax = select_max(centroid.xyz, active_lane);
+ Gmin = select_min(ref.lower.xyz, active_lane);
+ Gmax = select_max(ref.upper.xyz, active_lane);
+
+ Cmin = sub_group_reduce_min_float3(Cmin);
+ Cmax = sub_group_reduce_max_float3(Cmax);
+ Gmin = sub_group_reduce_min_float3(Gmin);
+ Gmax = sub_group_reduce_max_float3(Gmax);
+
+ LRBounds_merge_right( &slm->lr_bounds, Cmin,Cmax,Gmin,Gmax );
+ }
+
+ if( args.do_mask_processing )
+ {
+ uint mask =0;
+ if (thread_primref_valid)
+ {
+ mask = PRIMREF_instanceMask(&ref) ;
+ mask = go_left ? mask : mask<<8;
+ }
+
+ // TODO OPT: there is no 'sub_group_reduce_or' and IGC does not do the reduction trick
+ // for atomics on sub-group uniform addresses
+ for( uint i= get_sub_group_size()/2; i>0; i/= 2)
+ mask = mask | intel_sub_group_shuffle_down(mask,mask,i);
+ if( get_sub_group_local_id() == 0 )
+ atomic_or_local( &slm->lr_mask, mask );
+ }
+
+ uint l_base = 0;
+ uint r_base = 0;
+ if( get_sub_group_local_id() == 0 && l_ballot )
+ l_base = atomic_add_local( &slm->left_count, popcount(l_ballot) );
+ if( get_sub_group_local_id() == 0 && r_ballot )
+ r_base = atomic_add_local( &slm->right_count, popcount(r_ballot) );
+
+ sub_group_barrier( CLK_LOCAL_MEM_FENCE );
+ l_base = sub_group_broadcast(l_base,0);
+ r_base = sub_group_broadcast(r_base,0);
+
+ l_base = l_base + subgroup_bit_prefix_exclusive( l_ballot );
+ r_base = r_base + subgroup_bit_prefix_exclusive( r_ballot );
+
+ local_index = (go_left) ? l_base : r_base;
+ }
+
+
+ barrier( CLK_LOCAL_MEM_FENCE );
+
+ // merge local into global
+ // TODO_OPT: Look at spreading some of this across subgroups
+ if ( get_sub_group_id() == 0 )
+ {
+ // allocate primref output space for this wg; also merge the local instance masks if mask processing is enabled
+ uint num_left = slm->left_count;
+ {
+ if (num_left && get_sub_group_local_id() == 0)
+ {
+ num_left = atomic_add_global( &context->num_left, num_left );
+ slm->left_primref_base = args.dispatch_primref_begin + num_left;
+ }
+ }
+ uint num_right = slm->right_count;
+ {
+ if (num_right && get_sub_group_local_id() == 0)
+ {
+ num_right = atomic_add_global( &context->num_right, num_right );
+ slm->right_primref_base = (args.dispatch_primref_end - 1) - num_right;
+ }
+ }
+
+ if( args.do_mask_processing && get_sub_group_local_id() == 0 )
+ atomic_or_global( &context->lr_mask, slm->lr_mask );
+ }
+
+ barrier( CLK_LOCAL_MEM_FENCE );
+
+ LRBounds_merge( &context->lr_bounds, &slm->lr_bounds );
+
+ // move thread's primref ID into correct position in output index buffer
+ if (thread_primref_valid)
+ {
+ uint pos = go_left ? slm->left_primref_base + local_index
+ : slm->right_primref_base - local_index;
+
+ args.primref_index_out[pos] = thread_primref_id;
+ }
+}
+
+
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__( (reqd_work_group_size( BFS_WG_SIZE, 1, 1 )) )
+__attribute__( (intel_reqd_sub_group_size( 16 )) )
+kernel void
+BFS_pass2_indexed( global struct VContextScheduler* scheduler, global struct SAHBuildGlobals* sah_globals )
+{
+ local struct BFS_Pass2_SLM slm;
+ struct BFSDispatchArgs args = get_bfs_args_queue( &scheduler->bfs_queue, scheduler, sah_globals );
+
+ bool thread_primref_valid = (args.wg_primref_begin + get_local_id( 0 )) < args.wg_primref_end;
+ uint thread_primref_id = 0;
+ if ( thread_primref_valid )
+ thread_primref_id = args.primref_index_in[args.wg_primref_begin + get_local_id( 0 )];
+
+ DO_BFS_pass2( &slm, thread_primref_id, thread_primref_valid, args );
+}
+
+
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__( (reqd_work_group_size( BFS_WG_SIZE, 1, 1 )) )
+__attribute__( (intel_reqd_sub_group_size( 16 )) )
+kernel void
+BFS_pass2_initial( global struct VContextScheduler* scheduler, global struct SAHBuildGlobals* sah_globals )
+{
+ local struct BFS_Pass2_SLM slm;
+ struct BFSDispatchArgs args = get_bfs_args_initial( scheduler, sah_globals );
+
+ uint thread_primref_id = args.wg_primref_begin + get_local_id( 0 );
+ bool thread_primref_valid = thread_primref_id < args.wg_primref_end;
+
+ DO_BFS_pass2( &slm, thread_primref_id, thread_primref_valid, args );
+}
+
+
+__attribute__((reqd_work_group_size(BFS_WG_SIZE, 1, 1)))
+__attribute__((intel_reqd_sub_group_size(16)))
+kernel void
+BFS_pass2_indexed_batchable( global struct VContextScheduler* scheduler, global struct SAHBuildGlobals* globals_buffer )
+{
+ local struct BFS_Pass2_SLM slm;
+ struct BFSDispatchArgs args = get_bfs_args_batchable(&scheduler->bfs_queue, scheduler, globals_buffer );
+
+ bool thread_primref_valid = (args.wg_primref_begin + get_local_id(0)) < args.wg_primref_end;
+ uint thread_primref_id = 0;
+ if (thread_primref_valid)
+ thread_primref_id = args.primref_index_in[args.wg_primref_begin + get_local_id(0)];
+
+ DO_BFS_pass2(&slm, thread_primref_id, thread_primref_valid, args);
+
+}
+
+
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(BFS_WG_SIZE, 1, 1)))
+__attribute__((intel_reqd_sub_group_size(16)))
+kernel void
+BFS_pass2_initial_batchable(global struct VContextScheduler* scheduler, global struct SAHBuildGlobals* globals_buffer)
+{
+ local struct BFS_Pass2_SLM slm;
+ struct BFSDispatchArgs args = get_bfs_args_batchable(&scheduler->bfs_queue, scheduler, globals_buffer );
+
+ uint thread_primref_id = args.wg_primref_begin + get_local_id(0);
+ bool thread_primref_valid = thread_primref_id < args.wg_primref_end;
+
+ DO_BFS_pass2(&slm, thread_primref_id, thread_primref_valid, args);
+}
+
+
+
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+///
+/// BVH2 construction -- DFS Phase
+///
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+struct DFSArgs
+{
+ uint primref_base;
+ uint global_bvh2_base;
+ bool do_mask_processing;
+ ushort num_primrefs;
+ global uint* primref_indices_in;
+ global uint* primref_indices_out;
+ global PrimRef* primref_buffer;
+ global struct BVH2* global_bvh2;
+};
+
+
+struct DFSPrimRefAABB
+{
+ half lower[3];
+ half upper[3];
+};
+
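+// Primref AABBs are stored in half precision, normalized to the enclosing subtree's box
+// (see PrimRefSet_SetPrimRef_FullPrecision), with lower rounded down and upper rounded up
+// so that the quantized box conservatively contains the original.  Initialization produces
+// an inverted (empty) box (lower > upper) so that the first extend() simply overwrites it.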
+void DFSPrimRefAABB_init( struct DFSPrimRefAABB* bb )
+{
+ bb->lower[0] = 1;
+ bb->lower[1] = 1;
+ bb->lower[2] = 1;
+ bb->upper[0] = 0;
+ bb->upper[1] = 0;
+ bb->upper[2] = 0;
+}
+
+void DFSPrimRefAABB_extend( struct DFSPrimRefAABB* aabb, struct DFSPrimRefAABB* v )
+{
+ aabb->lower[0] = min( aabb->lower[0], v->lower[0] );
+ aabb->lower[1] = min( aabb->lower[1], v->lower[1] );
+ aabb->lower[2] = min( aabb->lower[2], v->lower[2] );
+ aabb->upper[0] = max( aabb->upper[0], v->upper[0] );
+ aabb->upper[1] = max( aabb->upper[1], v->upper[1] );
+ aabb->upper[2] = max( aabb->upper[2], v->upper[2] );
+}
+
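+// half the surface area of the box: d.x*(d.y+d.z) + d.y*d.z == dx*dy + dx*dz + dy*dz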
+half DFSPrimRefAABB_halfArea( struct DFSPrimRefAABB* aabb )
+{
+ const half3 d = (half3)(aabb->upper[0] - aabb->lower[0], aabb->upper[1] - aabb->lower[1], aabb->upper[2] - aabb->lower[2]);
+ return fma( d.x, (d.y + d.z), d.y * d.z );
+}
+
+struct DFSPrimRef
+{
+ struct DFSPrimRefAABB aabb;
+ ushort2 meta;
+};
+
+void DFSPrimRef_SetBVH2Root( struct DFSPrimRef* ref, ushort root )
+{
+ ref->meta.y = root;
+}
+
+uint DFSPrimRef_GetInputIndex( struct DFSPrimRef* ref )
+{
+ return ref->meta.x;
+}
+
+uint DFSPrimRef_GetBVH2Parent( struct DFSPrimRef* ref )
+{
+ return ref->meta.y;
+}
+
+
+struct PrimRefSet
+{
+ struct DFSPrimRefAABB AABB[DFS_WG_SIZE];
+ ushort2 meta[DFS_WG_SIZE];
+ uint input_indices[DFS_WG_SIZE];
+};
+
+
+
+
+local struct DFSPrimRefAABB* PrimRefSet_GetAABBPointer( local struct PrimRefSet* refs, ushort id )
+{
+ return &refs->AABB[id];
+}
+struct DFSPrimRef PrimRefSet_GetPrimRef( local struct PrimRefSet* refs, ushort id )
+{
+ struct DFSPrimRef r;
+ r.aabb = refs->AABB[id];
+ r.meta = refs->meta[id];
+ return r;
+}
+void PrimRefSet_SetPrimRef( local struct PrimRefSet* refs, struct DFSPrimRef ref, ushort id )
+{
+ refs->AABB[id] = ref.aabb;
+ refs->meta[id] = ref.meta;
+}
+
+void PrimRefSet_SetPrimRef_FullPrecision( struct AABB3f* root_aabb, local struct PrimRefSet* refs, PrimRef ref, ushort id )
+{
+ float3 root_l = AABB3f_load_lower( root_aabb );
+ float3 root_u = AABB3f_load_upper( root_aabb );
+ float3 d = root_u - root_l;
+ float scale = 1.0f / max( d.x, max( d.y, d.z ) );
+
+ float3 l = ref.lower.xyz;
+ float3 u = ref.upper.xyz;
+ half3 lh = convert_half3_rtz( (l - root_l) * scale );
+ half3 uh = convert_half3_rtp( (u - root_l) * scale );
+
+ refs->AABB[id].lower[0] = lh.x;
+ refs->AABB[id].lower[1] = lh.y;
+ refs->AABB[id].lower[2] = lh.z;
+ refs->AABB[id].upper[0] = uh.x;
+ refs->AABB[id].upper[1] = uh.y;
+ refs->AABB[id].upper[2] = uh.z;
+ refs->meta[id].x = id;
+ refs->meta[id].y = 0;
+}
+
+
+
+void DFS_CreatePrimRefSet( struct DFSArgs args,
+ local struct PrimRefSet* prim_refs )
+{
+ ushort id = get_local_id( 0 );
+ ushort num_primrefs = args.num_primrefs;
+
+ struct AABB3f box = BVH2_GetNodeBox( args.global_bvh2, args.global_bvh2_base );
+ if ( id < num_primrefs )
+ {
+ PrimRef ref = args.primref_buffer[args.primref_indices_in[id]];
+ prim_refs->input_indices[id] = args.primref_indices_in[id];
+ PrimRefSet_SetPrimRef_FullPrecision( &box, prim_refs, ref, id );
+ }
+}
+
+struct ThreadRangeInfo
+{
+ uchar start;
+ uchar local_num_prims;
+ uchar bvh2_root;
+ bool active;
+};
+
+struct BVHBuildLocals // size: ~3.8K
+{
+ uchar2 axis_and_left_count[ DFS_WG_SIZE ];
+ struct ThreadRangeInfo range[ DFS_WG_SIZE ];
+ uint sah[ DFS_WG_SIZE ];
+};
+
+#define LOCAL_BVH2_NODE_COUNT (2*(DFS_WG_SIZE) -1)
+
+struct LocalBVH2
+{
+ uint nodes[LOCAL_BVH2_NODE_COUNT];
+ uint num_nodes;
+
+    // bit layout for a node (MSB to LSB) is:
+    //   uchar child_ptr;     // this is right_child_index >> 1. right child's lsb is always 0
+ // uchar primref_base; // index of the node's first primref. will be 0 at the root
+ // uchar parent_dist; // distance in nodes from this node to its parent
+ // uchar prim_counter; // number of prims in this subtree. For a complete tree (256 prims), the root may be off by 1
+
+ // for a WG size of 256, 8b is enough for parent distance, because the tree is built in level order
+ // the maximum distance between parent and child occurs for a complete tree.
+ // in this scenario the left-most leaf has index 255, its parent has index 127, the deltas to the children are 128 and 129
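+    //
+    // example: a node word of 0x05030207 decodes as child_ptr=5 (right child at index 10, left child at 9),
+    //   primref_base=3, parent_dist=2 (parent index = this index - 2), and a prim count of 7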
+};
+
+
+void LocalBVH2_Initialize( struct LocalBVH2* bvh2, ushort num_prims )
+{
+ bvh2->num_nodes = 1;
+ bvh2->nodes[0] = min(num_prims,(ushort)255);
+}
+
+
+
+void LocalBVH2_Initialize_Presplit(struct LocalBVH2* bvh2, ushort num_prims, ushort left_count, ushort right_count )
+{
+ bvh2->num_nodes = 3;
+ bvh2->nodes[0] = min(num_prims, (ushort)255);
+
+ ushort bvh2_root = 0;
+ ushort child_place = 1;
+
+ uint child_ptr = (child_place + 1) >> 1;
+ bvh2->nodes[bvh2_root] |= (child_ptr) << 24;
+
+ uint parent_dist = child_place - bvh2_root;
+
+ // initialize child nodes
+ ushort primref_base_left = 0;
+ ushort primref_base_right = left_count;
+ uint left = (primref_base_left << 16) + ((parent_dist << 8)) + left_count;
+ uint right = (primref_base_right << 16) + ((parent_dist + 1) << 8) + right_count;
+ bvh2->nodes[child_place] = left;
+ bvh2->nodes[child_place + 1] = right;
+}
+
+
+void LocalBVH2_CreateInnerNode( local struct LocalBVH2* bvh2, ushort bvh2_root, uint primref_base_left, uint primref_base_right )
+{
+ ushort child_place = atomic_add_local( &(bvh2-> num_nodes), 2 );
+
+ uint child_ptr = (child_place + 1) >> 1;
+ bvh2->nodes[bvh2_root] |= (child_ptr) << 24;
+
+ uint parent_dist = child_place - bvh2_root;
+
+ // initialize child nodes
+ uint left = (primref_base_left << 16) + ((parent_dist << 8));
+ uint right = (primref_base_right << 16) + ((parent_dist + 1) << 8);
+ bvh2->nodes[child_place] = left;
+ bvh2->nodes[child_place + 1] = right;
+}
+
+ushort2 LocalBVH2_GetChildIndices( struct LocalBVH2* bvh2, ushort bvh2_root )
+{
+ ushort right_idx = (bvh2->nodes[bvh2_root] & 0xff000000) >> 23;
+ return (ushort2)(right_idx - 1, right_idx);
+}
+
+
+ushort LocalBVH2_IncrementPrimCount( local struct LocalBVH2* bvh2, ushort bvh2_root )
+{
+ // increment only the lower 8 bits. Algorithm will not overflow by design
+ return atomic_inc_local( &bvh2->nodes[bvh2_root] ) & 0xff;
+}
+
+ushort LocalBVH2_SetLeafPrimCount(local struct LocalBVH2* bvh2, ushort bvh2_root, ushort count)
+{
+    return bvh2->nodes[bvh2_root] |= (count & 0xff);
+}
+
+bool LocalBVH2_IsRoot( struct LocalBVH2* bvh2, ushort node_id )
+{
+ return node_id == 0;
+}
+
+ushort LocalBVH2_GetLeafPrimrefStart( struct LocalBVH2* bvh2, ushort bvh2_node_id )
+{
+ return (bvh2->nodes[bvh2_node_id] >> 16) & 255;
+}
+
+bool LocalBVH2_IsLeftChild( struct LocalBVH2* bvh2, ushort parent_node, ushort current_node )
+{
+ return (current_node & 1); // nodes are allocated in pairs. first node is root, left child is an odd index
+}
+
+ushort LocalBVH2_GetParent( struct LocalBVH2* bvh2, ushort node )
+{
+ return node - ((bvh2->nodes[node] >> 8) & 255);
+}
+
+uint LocalBVH2_GetNodeCount( struct LocalBVH2* bvh2 )
+{
+ return bvh2->num_nodes;
+}
+
+bool LocalBVH2_IsLeaf( struct LocalBVH2* bvh2, ushort node_index )
+{
+ return (bvh2->nodes[node_index] & 255) <= TREE_ARITY;
+}
+
+ushort LocalBVH2_GetLeafPrimCount( struct LocalBVH2* bvh2, ushort node_index )
+{
+ return (bvh2->nodes[node_index] & 255);
+}
+
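+// Cooperative top-down sweep-SAH build in SLM.  Each thread owns one primref.  Every iteration,
+// each active thread proposes a split plane at its own primref, sweeps the primrefs of its subtree
+// to evaluate the SAH cost per axis, and the best candidate per subtree is selected with an
+// atomic_min on a packed key.  Primrefs are then partitioned into the two child subtrees, and
+// threads whose subtree has reached TREE_ARITY primitives or fewer retire (that subtree becomes
+// a leaf).  The loop ends once no active threads remain.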
+void DFS_ConstructBVH2( local struct LocalBVH2* bvh2,
+ local struct PrimRefSet* prim_refs,
+ ushort bvh2_root,
+ ushort prim_range_start,
+ ushort local_num_prims,
+ ushort global_num_prims,
+ local struct BVHBuildLocals* locals,
+ local uint* num_active_threads )
+{
+ ushort tid = get_local_id( 0 );
+ ushort primref_position = tid;
+
+ bool active_thread = tid < global_num_prims;
+
+ // Handle cases where initial binner creates leaves
+ if ( active_thread && local_num_prims <= TREE_ARITY )
+ {
+ struct DFSPrimRef ref = PrimRefSet_GetPrimRef(prim_refs, primref_position);
+ DFSPrimRef_SetBVH2Root(&ref, bvh2_root);
+ PrimRefSet_SetPrimRef(prim_refs, ref, primref_position);
+ active_thread = false;
+ if (primref_position == prim_range_start)
+ atomic_sub_local(num_active_threads, local_num_prims);
+ }
+
+ barrier( CLK_LOCAL_MEM_FENCE );
+
+ locals->range[ tid ].start = prim_range_start;
+ locals->range[ tid ].local_num_prims = local_num_prims;
+ locals->range[ tid ].bvh2_root = bvh2_root;
+ locals->range[ tid ].active = active_thread;
+
+ do
+ {
+ if(active_thread && prim_range_start == primref_position)
+ locals->sah[primref_position] = UINT_MAX;
+
+ barrier( CLK_LOCAL_MEM_FENCE );
+
+ if ( active_thread )
+ {
+ local struct DFSPrimRefAABB* my_box = PrimRefSet_GetAABBPointer( prim_refs, primref_position );
+
+ // each thread evaluates a possible split candidate. Scan primrefs and compute sah cost
+ // do this axis-by-axis to keep register pressure low
+ float best_sah = INFINITY;
+ ushort best_axis = 3;
+ ushort best_count = 0;
+
+ struct DFSPrimRefAABB box_left[3];
+ struct DFSPrimRefAABB box_right[3];
+ float CSplit[3];
+ ushort count_left[3];
+
+ for ( ushort axis = 0; axis < 3; axis++ )
+ {
+ DFSPrimRefAABB_init( &box_left[axis] );
+ DFSPrimRefAABB_init( &box_right[axis] );
+
+ CSplit[axis] = my_box->lower[axis] + my_box->upper[axis];
+ count_left[axis] = 0;
+ }
+
+ // scan primrefs in our subtree and partition using this thread's prim as a split plane
+ {
+ struct DFSPrimRefAABB box = *PrimRefSet_GetAABBPointer( prim_refs, prim_range_start );
+
+ for ( ushort p = 1; p < local_num_prims; p++ )
+ {
+ struct DFSPrimRefAABB next_box = *PrimRefSet_GetAABBPointer( prim_refs, prim_range_start + p ); //preloading box for next iteration
+
+ for( ushort axis = 0; axis < 3; axis++ )
+ {
+ float c = box.lower[axis] + box.upper[axis];
+
+ if ( c < CSplit[axis] )
+ {
+ // this primitive is to our left.
+ DFSPrimRefAABB_extend( &box_left[axis], &box );
+ count_left[axis]++;
+ }
+ else
+ {
+ // this primitive is to our right
+ DFSPrimRefAABB_extend( &box_right[axis], &box );
+ }
+ }
+
+ box = next_box;
+ }
+
+ // last iteration without preloading box
+ for( ushort axis = 0; axis < 3; axis++ )
+ {
+ float c = box.lower[axis] + box.upper[axis];
+
+ if ( c < CSplit[axis] )
+ {
+ // this primitive is to our left.
+ DFSPrimRefAABB_extend( &box_left[axis], &box );
+ count_left[axis]++;
+ }
+ else
+ {
+ // this primitive is to our right
+ DFSPrimRefAABB_extend( &box_right[axis], &box );
+ }
+ }
+
+ }
+
+ for ( ushort axis = 0; axis < 3; axis++ )
+ {
+ float Al = DFSPrimRefAABB_halfArea( &box_left[axis] );
+ float Ar = DFSPrimRefAABB_halfArea( &box_right[axis] );
+
+                // Avoid NaNs in the SAH calculation in the corner case where all prims go right.
+                // In this case we set Al=Ar, because such a split will only be selected if all primrefs
+                // are coincident. In that case, we will fall back to split-in-the-middle and both subtrees
+                // should store the same quantized area value
+ if ( count_left[axis] == 0 )
+ Al = Ar;
+
+ // compute sah cost
+ ushort count_right = local_num_prims - count_left[axis];
+ float sah = Ar * count_right + Al * count_left[axis];
+
+ // keep this split if it is better than the previous one, or if the previous one was a corner-case
+ if ( sah < best_sah || best_count == 0 )
+ {
+ // yes, keep it
+ best_axis = axis;
+ best_sah = sah;
+ best_count = count_left[axis];
+ }
+ }
+
+ // write split information to SLM
+ locals->axis_and_left_count[primref_position].x = best_axis;
+ locals->axis_and_left_count[primref_position].y = best_count;
+ uint sah = as_uint(best_sah);
+            // break ties by axis and thread id to ensure deterministic split selection;
+            // otherwise the builder can produce a different tree structure from run to run
+            // based on the ordering of primitives (which can vary due to non-determinism in atomic counters).
+            // Embed the split axis and candidate index into the low bits of the sah value: the atomic_min
+            // below then selects the lowest sah, breaking ties by largest axis, then by lowest thread id.
+ sah = ( ( sah & ~1023 ) | ( 2 - best_axis ) << 8 | tid );
+
+ // reduce on split candidates in our local subtree and decide the best one
+ atomic_min_local( &locals->sah[ prim_range_start ], sah);
+ }
+
+
+ barrier( CLK_LOCAL_MEM_FENCE );
+
+ ushort split_index = locals->sah[ prim_range_start ] & 255;
+ ushort split_axis = locals->axis_and_left_count[split_index].x;
+ ushort split_left_count = locals->axis_and_left_count[split_index].y;
+
+ if ( (primref_position == split_index) && active_thread )
+ {
+ // first thread in a given subtree creates the inner node
+ ushort start_left = prim_range_start;
+ ushort start_right = prim_range_start + split_left_count;
+ if ( split_left_count == 0 )
+ start_right = start_left + (local_num_prims / 2); // handle split-in-the-middle case
+
+ LocalBVH2_CreateInnerNode( bvh2, bvh2_root, start_left, start_right );
+ }
+
+
+ barrier( CLK_LOCAL_MEM_FENCE );
+
+ struct DFSPrimRef ref;
+ ushort new_primref_position;
+
+ if ( active_thread )
+ {
+ ushort2 kids = LocalBVH2_GetChildIndices( bvh2, bvh2_root );
+ bool go_left;
+
+ if ( split_left_count == 0 )
+ {
+ // We chose a split with no left-side prims
+ // This will only happen if all primrefs are located in the exact same position
+ // In that case, fall back to split-in-the-middle
+ split_left_count = (local_num_prims / 2);
+ go_left = (primref_position - prim_range_start < split_left_count);
+ }
+ else
+ {
+ // determine what side of the split this thread's primref belongs on
+ local struct DFSPrimRefAABB* my_box = PrimRefSet_GetAABBPointer( prim_refs, primref_position );
+ local struct DFSPrimRefAABB* split_box = PrimRefSet_GetAABBPointer( prim_refs, split_index );
+ float c = my_box->lower[split_axis] + my_box->upper[split_axis];
+ float Csplit = split_box->lower[split_axis] + split_box->upper[split_axis];
+ go_left = c < Csplit;
+ }
+
+ // adjust state variables for next loop iteration
+ bvh2_root = (go_left) ? kids.x : kids.y;
+ local_num_prims = (go_left) ? split_left_count : (local_num_prims - split_left_count);
+ prim_range_start = (go_left) ? prim_range_start : prim_range_start + split_left_count;
+
+ // determine the new primref position by incrementing a counter in the destination subtree
+ new_primref_position = prim_range_start + LocalBVH2_IncrementPrimCount( bvh2, bvh2_root );
+
+ // load our primref from its previous position
+ ref = PrimRefSet_GetPrimRef( prim_refs, primref_position );
+ }
+
+ barrier( CLK_LOCAL_MEM_FENCE );
+
+ if ( active_thread )
+ {
+ // write our primref into its sorted position and note which node it went in
+ DFSPrimRef_SetBVH2Root( &ref, bvh2_root );
+ PrimRefSet_SetPrimRef( prim_refs, ref, new_primref_position );
+ primref_position = new_primref_position;
+
+
+ // deactivate all threads whose subtrees are small enough to form a leaf
+ if ( local_num_prims <= TREE_ARITY )
+ {
+ active_thread = false;
+ if( primref_position == prim_range_start )
+ atomic_sub_local( num_active_threads, local_num_prims );
+ }
+
+ locals->range[ primref_position ].start = prim_range_start;
+ locals->range[ primref_position ].local_num_prims = local_num_prims;
+ locals->range[ primref_position ].bvh2_root = bvh2_root;
+ locals->range[ primref_position ].active = active_thread;
+ }
+
+ barrier( CLK_LOCAL_MEM_FENCE );
+
+ // if we'll have next iteration then load from SLM
+ if(*num_active_threads)
+ {
+ prim_range_start = locals->range[ tid ].start;
+ local_num_prims = locals->range[ tid ].local_num_prims;
+ bvh2_root = locals->range[ tid ].bvh2_root;
+ active_thread = locals->range[ tid ].active;
+ primref_position = tid;
+ }
+ else
+ {
+ break;
+ }
+
+ } while ( true );
+
+}
+
+
+#define REFIT_BIT_DWORDS (((LOCAL_BVH2_NODE_COUNT) - (DFS_WG_SIZE))/32)
+
+struct RefitBits
+{
+ uint bits[REFIT_BIT_DWORDS];
+};
+
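+// The unions below overlay SLM that is live in different phases of Do_DFS: u1 is the binning
+// scratch during the initial binning pass and the LocalBVH2 afterwards, while u2 holds the
+// primref set and build locals during BVH2 construction and is reused for the full-precision
+// node boxes during refit.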
+struct DFS_SLM
+{
+ union
+ {
+ struct LocalBVH2 bvh2;
+ struct {
+ struct AABB3f centroid_bounds;
+ uint left_count;
+ uint right_count;
+ struct BFS_BinInfo bins;
+ struct BFS_BinInfoReduce3_SLM reduce3;
+ } binning;
+
+ } u1;
+
+ union
+ {
+ struct {
+ struct PrimRefSet prim_refs;
+ struct BVHBuildLocals locals;
+ } pass0;
+
+ struct AABB3f node_boxes[LOCAL_BVH2_NODE_COUNT];
+
+ } u2;
+
+ union
+ {
+ uchar bytes[DFS_WG_SIZE];
+ uint dwords[DFS_WG_SIZE/4];
+ } mask_info;
+
+ struct RefitBits refit_bits;
+
+};
+
+
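+// Single binned-SAH split over all of the work-group's primrefs, used when the input is large:
+// it pre-partitions the refs into a left and a right block so that the per-thread sweep in
+// DFS_ConstructBVH2 starts from two smaller subtrees instead of one large one.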
+void DFS_InitialBinningPass(
+ local struct BFS_BinInfo* bins,
+ local struct BFS_BinInfoReduce3_SLM* reduce3,
+ uniform local struct AABB3f* centroid_bounds,
+ local struct PrimRefSet* refs,
+ local uint* left_counter,
+ local uint* right_counter,
+ ushort num_refs )
+{
+ uint tid = get_local_id(0);
+
+ // initialize SLM structures
+ if (tid == 0)
+ {
+ AABB3f_init(centroid_bounds);
+ *left_counter = 0;
+ *right_counter = 0;
+ }
+
+ BinInfo_init(bins);
+
+ PrimRef ref;
+ struct DFSPrimRef dfs_ref;
+
+ if (tid < num_refs)
+ {
+ dfs_ref = PrimRefSet_GetPrimRef(refs, tid);
+ struct DFSPrimRefAABB box = dfs_ref.aabb;
+ ref.lower.xyz = (float3)(box.lower[0], box.lower[1], box.lower[2]);
+ ref.upper.xyz = (float3)(box.upper[0], box.upper[1], box.upper[2]);
+ }
+
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ // compute centroid bounds so that we can bin
+ if (tid < num_refs)
+ {
+ float3 centroid = ref.lower.xyz + ref.upper.xyz;
+ Uniform_AABB3f_atomic_merge_local_sub_group_lu(centroid_bounds, centroid, centroid);
+ }
+
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ // add primrefs to bins
+ struct BinMapping mapping;
+ BinMapping_init(&mapping, centroid_bounds, BFS_NUM_BINS);
+
+ BinInfo_add_primref( &mapping, bins, &ref, tid<num_refs );
+
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ // compute split - every sub_group computes different bin
+ struct BFS_Split split = BinInfo_reduce3(reduce3, bins, mapping.scale);
+
+
+ bool go_left = false;
+ uint local_pos = 0;
+ if (tid < num_refs)
+ {
+ // partition primrefs into L/R subsets...
+        if (split.sah == (float)(INFINITY)) // no valid split; split in the middle. This can happen due to floating-point limit cases in huge scenes
+ go_left = tid < (num_refs / 2);
+ else
+ go_left = is_left(&mapping, &split, &ref);
+
+ if (go_left)
+ local_pos = atomic_inc_local(left_counter);
+ else
+ local_pos = num_refs - (1+ atomic_inc_local(right_counter));
+
+ PrimRefSet_SetPrimRef(refs, dfs_ref, local_pos);
+ }
+
+}
+
+
+void Do_DFS( struct DFSArgs args, local struct DFS_SLM* slm, local uint* num_active_threads )
+{
+ local struct LocalBVH2* bvh2 = &slm->u1.bvh2;
+
+ global struct BVH2* global_bvh2 = args.global_bvh2;
+
+ PrimRef ref;
+ uint parent_node;
+
+ {
+ local struct BVHBuildLocals* locals = &slm->u2.pass0.locals;
+ local struct PrimRefSet* prim_refs = &slm->u2.pass0.prim_refs;
+
+ DFS_CreatePrimRefSet(args, prim_refs);
+
+ uint local_id = get_local_id(0);
+
+ ushort bvh2_root = 0;
+ ushort prim_range_start = 0;
+ ushort local_num_prims = args.num_primrefs;
+
+ if(local_id == 0)
+ *num_active_threads = local_num_prims;
+
+ // barrier for DFS_CreatePrimRefSet and num_active_threads
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ // initial binning pass if number of primrefs is large
+ if( args.num_primrefs > 32 )
+ {
+ DFS_InitialBinningPass(&slm->u1.binning.bins, &slm->u1.binning.reduce3, &slm->u1.binning.centroid_bounds, prim_refs,
+ &slm->u1.binning.left_count, &slm->u1.binning.right_count, args.num_primrefs);
+
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ ushort left_count = slm->u1.binning.left_count;
+ ushort right_count = args.num_primrefs - left_count;
+ if (get_local_id(0) == 0)
+ LocalBVH2_Initialize_Presplit(bvh2, args.num_primrefs, left_count, right_count);
+
+ bvh2_root = (local_id < left_count) ? 1 : 2;
+ local_num_prims = (local_id < left_count) ? left_count : right_count;
+ prim_range_start = (local_id < left_count) ? 0 : left_count;
+ }
+ else
+ {
+ if (get_local_id(0) == 0)
+ LocalBVH2_Initialize(bvh2, args.num_primrefs);
+ }
+
+ DFS_ConstructBVH2( bvh2, prim_refs, bvh2_root, prim_range_start, local_num_prims, args.num_primrefs, locals, num_active_threads);
+
+ // move the prim refs into their sorted position
+ // keep this thread's primref around for later use
+ if ( local_id < args.num_primrefs )
+ {
+ struct DFSPrimRef dfs_ref = PrimRefSet_GetPrimRef( prim_refs, local_id );
+
+ uint input_id = DFSPrimRef_GetInputIndex( &dfs_ref );
+
+ parent_node = DFSPrimRef_GetBVH2Parent( &dfs_ref );
+
+ uint primref_index = prim_refs->input_indices[input_id];
+ ref = args.primref_buffer[primref_index];
+ args.primref_indices_out[local_id] = primref_index;
+ args.primref_indices_in[local_id] = primref_index;
+ // these buffers are not read again until the end of kernel
+ }
+
+ barrier( CLK_LOCAL_MEM_FENCE );
+
+ }
+
+
+ // initialize flags for determining when subtrees are done refit
+ if ( get_local_id( 0 ) < REFIT_BIT_DWORDS )
+ slm->refit_bits.bits[get_local_id( 0 )] = 0;
+
+
+ // stash full-precision primref AABBs in slm storage
+ local struct AABB3f* slm_boxes = &slm->u2.node_boxes[0];
+ bool active_thread = get_local_id( 0 ) < args.num_primrefs;
+ if( active_thread )
+ {
+ AABB3f_set( &slm_boxes[get_local_id( 0 )], ref.lower.xyz, ref.upper.xyz );
+
+ // stash instance masks in SLM storage
+ if( args.do_mask_processing )
+ slm->mask_info.bytes[get_local_id(0)] = PRIMREF_instanceMask( &ref );
+ }
+
+ barrier( CLK_LOCAL_MEM_FENCE );
+
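+    // Bottom-up refit: one surviving thread per leaf writes the leaf node, then for every inner
+    // node exactly one thread (the second of its two children to arrive, tracked with the xor'd
+    // refit bits) merges the child boxes/masks and writes the node out to the global BVH2,
+    // walking upward until it reaches the local root.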
+ // Refit leaf nodes
+ uint box_index;
+ if ( active_thread )
+ {
+ // the thread for the first primref in every leaf is the one that will ascend
+ // remaining threads merge their AABB/mask into the first one and terminate
+ uint first_ref = LocalBVH2_GetLeafPrimrefStart( bvh2, parent_node );
+ if ( first_ref != get_local_id( 0 ) )
+ {
+ AABB3f_atomic_merge_local_lu( &slm_boxes[first_ref], ref.lower.xyz, ref.upper.xyz );
+
+ if( args.do_mask_processing )
+ {
+ uint dword_index = first_ref/4;
+ uint shift = (first_ref%4)*8;
+ uint mask = PRIMREF_instanceMask(&ref) << shift;
+ atomic_or_local( &slm->mask_info.dwords[dword_index], mask );
+ }
+ active_thread = false; // switch off all primref threads except the first one
+ }
+
+ box_index = first_ref;
+ }
+
+ barrier( CLK_LOCAL_MEM_FENCE );
+
+ if ( active_thread )
+ {
+ uint current_node = parent_node;
+ parent_node = LocalBVH2_GetParent( bvh2, current_node );
+
+ // write out the leaf node's AABB
+ uint num_prims = LocalBVH2_GetLeafPrimCount( bvh2, current_node );
+ uint prim_offs = args.primref_base + LocalBVH2_GetLeafPrimrefStart( bvh2, current_node );
+
+ uint mask = 0xff;
+ if( args.do_mask_processing )
+ mask = slm->mask_info.bytes[box_index];
+
+ BVH2_WriteLeafNode( global_bvh2, args.global_bvh2_base + current_node, &slm_boxes[box_index], prim_offs, num_prims, mask );
+
+ // we no longer need the BVH2 bits for this node, so re-purpose the memory to store the AABB index
+ bvh2->nodes[current_node] = box_index;
+
+ // toggle flag bit in parent node. The second thread to flip the bit is the one that gets to proceed
+ uint thread_mask = (1 << (parent_node % 32));
+ if ( (atomic_xor_local( &slm->refit_bits.bits[parent_node / 32], thread_mask ) & thread_mask) == 0 )
+ active_thread = false;
+ }
+
+ // count how many active threads in sub_group we have and increment wg's number of active threads
+ uint sg_active = sub_group_reduce_add(active_thread ? 1 : 0);
+ if(get_sub_group_local_id() == 0)
+ {
+ atomic_add_local(num_active_threads, sg_active);
+ }
+
+ // refit internal nodes:
+ // walk up the tree and refit AABBs
+
+ do
+ {
+        barrier( CLK_LOCAL_MEM_FENCE ); // this barrier makes sure all threads have read num_active_threads before anyone modifies it
+ if ( active_thread )
+ {
+ uint current_node = parent_node;
+ parent_node = LocalBVH2_GetParent( bvh2, current_node );
+
+ // pull left/right box indices from current node
+ ushort2 kids = LocalBVH2_GetChildIndices( bvh2, current_node );
+
+ uint left_box = bvh2->nodes[kids.x];
+ uint right_box = bvh2->nodes[kids.y];
+
+ struct AABB3f left = slm_boxes[left_box];
+ struct AABB3f right = slm_boxes[right_box];
+ AABB3f_extend( &left, &right );
+
+ uint2 child_offsets = (uint2)(
+ args.global_bvh2_base + kids.x,
+ args.global_bvh2_base + kids.y);
+
+ uint mask = 0xff;
+ if( args.do_mask_processing )
+ {
+ mask = slm->mask_info.bytes[left_box]
+ | slm->mask_info.bytes[right_box];
+ slm->mask_info.bytes[left_box] = mask;
+ }
+
+ BVH2_WriteInnerNode( args.global_bvh2, args.global_bvh2_base+current_node, &left, child_offsets, mask );
+
+ slm_boxes[left_box] = left;
+ bvh2->nodes[current_node] = left_box;
+
+ // stop at the root
+ if ( LocalBVH2_IsRoot( bvh2, current_node ) )
+ {
+ active_thread = false;
+ atomic_dec_local(num_active_threads);
+ }
+ else
+ {
+ // toggle flag bit in parent node. The second thread to flip the bit is the one that gets to proceed
+ uint mask = (1 << (parent_node % 32));
+ if ( (atomic_xor_local( &slm->refit_bits.bits[parent_node / 32], mask ) & mask) == 0 )
+ {
+ active_thread = false;
+ atomic_dec_local(num_active_threads);
+ }
+ }
+ }
+
+ barrier( CLK_LOCAL_MEM_FENCE );
+ } while ( *num_active_threads > 0 );
+}
+
+
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__( (reqd_work_group_size(DFS_WG_SIZE,1,1) ))
+__attribute__( (intel_reqd_sub_group_size(16)) )
+kernel void
+DFS( global struct VContextScheduler* scheduler,
+ global struct SAHBuildGlobals* globals_buffer )
+{
+ local struct DFS_SLM slm;
+ local struct DFSDispatchRecord record;
+ local uint num_active_threads;
+
+ if ( get_local_id( 0 ) == 0 )
+ {
+ // pop an entry off the DFS dispatch queue
+ //uint wg_index = atomic_dec_global( &scheduler->num_dfs_wgs ) - 1;
+ //record = scheduler->dfs_queue.records[wg_index];
+
+ // TODO: The version above races, but is considerably faster... investigate
+ uint wg_index = get_group_id(0);
+ record = scheduler->dfs_queue.records[wg_index];
+ write_mem_fence( CLK_LOCAL_MEM_FENCE );
+ atomic_dec_global( &scheduler->num_dfs_wgs );
+ }
+
+ barrier( CLK_LOCAL_MEM_FENCE );
+
+
+ bool odd_pass = record.tree_depth & 1;
+
+ global struct SAHBuildGlobals* sah_globals = globals_buffer + record.batch_index;
+
+ struct DFSArgs args;
+ args.num_primrefs = record.num_primrefs;
+ args.primref_indices_in = SAHBuildGlobals_GetPrimrefIndices_In( sah_globals, odd_pass );
+ args.primref_indices_out = SAHBuildGlobals_GetPrimrefIndices_Out( sah_globals, odd_pass );
+ args.primref_buffer = SAHBuildGlobals_GetPrimrefs( sah_globals );
+ args.global_bvh2 = SAHBuildGlobals_GetBVH2( sah_globals );
+ args.primref_indices_in += record.primref_base;
+ args.primref_indices_out += record.primref_base;
+ args.primref_base = record.primref_base;
+ args.global_bvh2_base = record.bvh2_base;
+ args.do_mask_processing = SAHBuildGlobals_NeedMasks( sah_globals );
+
+ Do_DFS( args, &slm, &num_active_threads );
+
+}
+
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+///
+/// BVH2 to BVH6
+///
+/////////////////////////////////////////////////////////////////////////////////////////////////
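+// Flattening phase: each subgroup (or each SIMD8 half of a SIMD16 subgroup in the 2xSIMD8 variants)
+// converts one BVH2 node into one QBVH6 node.  Inner BVH2 nodes are greedily collapsed into
+// up-to-TREE_ARITY-wide internal nodes; BVH2 leaves sort their primrefs by area and write out the
+// leaf node plus the reordered primref indices.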
+
+
+
+struct BuildFlatTreeArgs
+{
+ ushort leaf_size_in_bytes;
+ ushort leaf_type;
+ ushort inner_node_type;
+ bool do_mask_processing;
+
+ global uint* primref_indices;
+ global PrimRef* primref_buffer;
+ global struct Globals* globals;
+ global struct BVHBase* bvh_base;
+ global struct BVH2* bvh2;
+};
+
+
+// lane i in the return value is the index of the ith largest primref in the input
+// the return value can be used with shuffle() to move data into its sorted position
+// the elements of 'key' must be unique; only the first 6 elements are sorted
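+//
+// example (first 6 lanes): key = {40,10,30,60,20,50}  ->  num_larger = {2,5,3,0,4,1}
+//   -> returned indices = {3,5,0,2,4,1}, and shuffle(key, result) yields {60,50,40,30,20,10}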
+varying ushort SUBGROUP_get_sort_indices_N6( varying uint key )
+{
+ // each lane computes the number of items larger than it
+ // this is its position in the descending order
+    // TODO_OPT: the compiler can vectorize these 16-bit adds by packing them into the lower and upper halves of the same GPR; verify that it does.
+    //   If the compiler does not generate optimal code, consider moving this to Cm.
+
+ varying ushort cmp0 = (sub_group_broadcast(key, 0) > key) ? 1 : 0;
+ varying ushort cmp1 = (sub_group_broadcast(key, 1) > key) ? 1 : 0;
+ varying ushort cmp2 = (sub_group_broadcast(key, 2) > key) ? 1 : 0;
+ varying ushort cmp3 = (sub_group_broadcast(key, 3) > key) ? 1 : 0;
+ varying ushort cmp4 = (sub_group_broadcast(key, 4) > key) ? 1 : 0;
+ varying ushort cmp5 = (sub_group_broadcast(key, 5) > key) ? 1 : 0;
+ varying ushort a = cmp0 + cmp2 + cmp4;
+ varying ushort b = cmp1 + cmp3 + cmp5;
+ varying ushort num_larger = a + b;
+
+ // each lane determines which of the input elements it should pull
+ varying ushort lane = get_sub_group_local_id();
+ a = (sub_group_broadcast(num_larger, 0) == lane) ? 0 : 0;
+ b = (sub_group_broadcast(num_larger, 1) == lane) ? 1 : 0;
+ a += (sub_group_broadcast(num_larger, 2) == lane) ? 2 : 0;
+ b += (sub_group_broadcast(num_larger, 3) == lane) ? 3 : 0;
+ a += (sub_group_broadcast(num_larger, 4) == lane) ? 4 : 0;
+ b += (sub_group_broadcast(num_larger, 5) == lane) ? 5 : 0;
+ return a + b;
+}
+
+uint SUBGROUP_area_to_sort_key( varying float area, uniform ushort num_children )
+{
+ varying ushort lane = get_sub_group_local_id();
+ area = (lane < num_children) ? area : 0; // put inactive nodes last
+
+ // drop LSBs and break ties by lane number to ensure unique keys
+ // use descending lane IDs to ensure that sort is stable if the upper MSBs are equal.
+ // If we do not do this it can lead to non-deterministic tree structure
+ return (as_uint(area) & 0xffffff80) + (lane^(get_sub_group_size()-1));
+}
+
+// lane i in the return value is the index of the ith largest primref in the input
+// the return value can be used with shuffle() to move data into its sorted position
+// the elements of 'key' must be unique; only the first 6 elements are sorted
+varying ushort SUBGROUP_get_sort_indices_N6_2xSIMD8_in_SIMD16( varying uint key )
+{
+ // each lane computes the number of items larger than it
+ // this is its position in the descending order
+    // TODO_OPT: the compiler can vectorize these 16-bit adds by packing them into the lower and upper halves of the same GPR; verify that it does.
+    //   If the compiler does not generate optimal code, consider moving this to Cm.
+
+ varying ushort cmp0 = (sub_group_broadcast(key, 0) > key) ? 1 : 0;
+ varying ushort cmp1 = (sub_group_broadcast(key, 1) > key) ? 1 : 0;
+ varying ushort cmp2 = (sub_group_broadcast(key, 2) > key) ? 1 : 0;
+ varying ushort cmp3 = (sub_group_broadcast(key, 3) > key) ? 1 : 0;
+ varying ushort cmp4 = (sub_group_broadcast(key, 4) > key) ? 1 : 0;
+ varying ushort cmp5 = (sub_group_broadcast(key, 5) > key) ? 1 : 0;
+ varying ushort a = cmp0 + cmp2 + cmp4;
+ varying ushort b = cmp1 + cmp3 + cmp5;
+ varying ushort num_larger = a + b;
+
+ varying ushort cmp0_1 = (sub_group_broadcast(key, 8) > key) ? 1 : 0;
+ varying ushort cmp1_1 = (sub_group_broadcast(key, 9) > key) ? 1 : 0;
+ varying ushort cmp2_1 = (sub_group_broadcast(key, 10) > key) ? 1 : 0;
+ varying ushort cmp3_1 = (sub_group_broadcast(key, 11) > key) ? 1 : 0;
+ varying ushort cmp4_1 = (sub_group_broadcast(key, 12) > key) ? 1 : 0;
+ varying ushort cmp5_1 = (sub_group_broadcast(key, 13) > key) ? 1 : 0;
+ varying ushort a_1 = cmp0_1 + cmp2_1 + cmp4_1;
+ varying ushort b_1 = cmp1_1 + cmp3_1 + cmp5_1;
+ varying ushort num_larger_1 = a_1 + b_1;
+
+ // each lane determines which of the input elements it should pull
+ varying ushort lane = get_sub_group_local_id();
+ if(lane < 8)
+ {
+ a = (sub_group_broadcast(num_larger, 0) == lane) ? 0 : 0;
+ b = (sub_group_broadcast(num_larger, 1) == lane) ? 1 : 0;
+ a += (sub_group_broadcast(num_larger, 2) == lane) ? 2 : 0;
+ b += (sub_group_broadcast(num_larger, 3) == lane) ? 3 : 0;
+ a += (sub_group_broadcast(num_larger, 4) == lane) ? 4 : 0;
+ b += (sub_group_broadcast(num_larger, 5) == lane) ? 5 : 0;
+ }
+ else
+ {
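+        // the '? 8 : 8' below is intentional: lanes 8..15 always pull from the upper half,
+        // so the base index is 8 regardless (mirroring the '? 0 : 0' base in the lower half)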
+ a = (sub_group_broadcast(num_larger_1, 8) == lane-8) ? 8 : 8;
+ b = (sub_group_broadcast(num_larger_1, 9) == lane-8) ? 1 : 0;
+ a += (sub_group_broadcast(num_larger_1, 10) == lane-8) ? 2 : 0;
+ b += (sub_group_broadcast(num_larger_1, 11) == lane-8) ? 3 : 0;
+ a += (sub_group_broadcast(num_larger_1, 12) == lane-8) ? 4 : 0;
+ b += (sub_group_broadcast(num_larger_1, 13) == lane-8) ? 5 : 0;
+ }
+
+ return a + b;
+}
+
+uint SUBGROUP_area_to_sort_key_2xSIMD8_in_SIMD16( varying float area, uniform ushort num_children )
+{
+ varying ushort lane = get_sub_group_local_id() % 8;
+ area = (lane < num_children) ? area : 0; // put inactive nodes last
+
+ // drop LSBs and break ties by lane number to ensure unique keys
+ // use descending lane IDs to ensure that sort is stable if the upper MSBs are equal.
+ // If we do not do this it can lead to non-deterministic tree structure
+ return (as_uint(area) & 0xffffff80) + (lane^7);
+}
+
+ushort SUBGROUP_BuildFlatTreeNode(
+ uniform struct BuildFlatTreeArgs args,
+ uniform uint bvh2_root,
+ uniform struct InternalNode* qnode,
+ uniform uint qnode_index,
+ varying uint3* sg_children_out // if an inner node is created, receives the indices of the 6 child nodes (X), and the QNode position (y), and num_children(z)
+ // if a leaf is created, receives number of primrefs (z)
+) // return value is the number of child nodes or 0 for a leaf
+{
+ global struct BVH2* bvh2 = args.bvh2;
+ varying ushort lane = get_sub_group_local_id();
+
+ global struct BVHBase* base = args.bvh_base;
+
+
+ if ( !BVH2_IsInnerNode( bvh2, bvh2_root ) )
+ {
+ uniform ushort num_prims = BVH2_GetLeafPrimCount( bvh2, bvh2_root );
+ uniform uint primref_start = BVH2_GetLeafPrimStart( bvh2, bvh2_root );
+ varying uint primref_index = primref_start + ((lane < num_prims) ? lane : 0);
+
+ varying uint ref_id = args.primref_indices[primref_index];
+ varying PrimRef ref = args.primref_buffer[ref_id];
+ uniform char* leaf_mem_base = (char*)BVHBase_GetQuadLeaves( args.bvh_base );
+ uniform char* leaf_mem = leaf_mem_base + primref_start * args.leaf_size_in_bytes;
+
+ uniform int offset = (int)(leaf_mem - (char*)qnode);
+ offset = offset >> 6;
+
+ varying uint key = SUBGROUP_area_to_sort_key(AABB_halfArea(&ref), num_prims );
+ varying ushort sort_index = SUBGROUP_get_sort_indices_N6(key);
+ ref = PrimRef_sub_group_shuffle(&ref, sort_index);
+ ref_id = intel_sub_group_shuffle(ref_id, sort_index);
+
+ if (lane < num_prims)
+ args.primref_indices[primref_index] = ref_id;
+
+ uint global_num_prims = args.globals->numPrimitives;
+ char* bvh_mem = (char*) args.bvh_base;
+
+ if(lane < num_prims)
+ args.primref_indices[primref_index + global_num_prims] = qnode - (struct InternalNode*)bvh_mem;
+
+ if (args.leaf_type == NODE_TYPE_INSTANCE)
+ subgroup_setInstanceQBVHNodeN( offset, &ref, num_prims, (struct QBVHNodeN*)qnode, lane < num_prims ? PRIMREF_instanceMask(&ref) : 0 );
+ else
+ subgroup_setQBVHNodeN( offset, args.leaf_type, &ref, num_prims, (struct QBVHNodeN*)qnode, BVH_NODE_DEFAULT_MASK );
+
+ sg_children_out->z = num_prims;
+ return 0;
+ }
+ else
+ {
+ // collapse BVH2 into BVH6.
+ // We will spread the root node's children across the subgroup, and keep adding SIMD lanes until we have enough
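+        // Greedy collapse: repeatedly replace the child with the largest surface area (inner nodes only)
+        // by its two BVH2 children, until we reach TREE_ARITY children or only leaves remain.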
+ uniform ushort num_children = 2;
+
+ uniform uint2 kids = BVH2_GetChildIndices( bvh2, bvh2_root );
+ varying uint sg_bvh2_node = kids.x;
+ if ( lane == 1 )
+ sg_bvh2_node = kids.y;
+
+ do
+ {
+ // choose the inner node with maximum area to replace.
+ // Its left child goes in its old location. Its right child goes in a new lane
+
+ // TODO_OPT: We re-read the AABBs again and again to compute area
+ // ... store per-lane boxes instead and pre-compute areas
+
+ varying float sg_area = BVH2_GetNodeArea( bvh2, sg_bvh2_node );
+ varying bool sg_is_inner = BVH2_IsInnerNode( bvh2, sg_bvh2_node );
+ sg_area = (sg_is_inner && lane < num_children) ? sg_area : 0; // prevent early exit if the largest child is a leaf
+
+ uniform float max_area = sub_group_reduce_max_N6( sg_area );
+ varying bool sg_reducable = max_area == sg_area && (lane < num_children) && sg_is_inner;
+ uniform uint mask = intel_sub_group_ballot( sg_reducable );
+
+ // TODO_OPT: Some of these ops seem redundant.. look at trimming further
+
+ if ( mask == 0 )
+ break;
+
+ // choose the inner node with maximum area to replace
+ uniform ushort victim_child = ctz( mask );
+ uniform uint victim_node = sub_group_broadcast( sg_bvh2_node, victim_child );
+ kids = BVH2_GetChildIndices( bvh2, victim_node );
+
+ if ( lane == victim_child )
+ sg_bvh2_node = kids.x;
+ else if ( lane == num_children )
+ sg_bvh2_node = kids.y;
+
+ num_children++;
+
+ } while ( num_children < TREE_ARITY );
+
+ // allocate inner node space
+ uniform uint kids_offset;
+ if (get_sub_group_local_id() == 0)
+ kids_offset = allocate_inner_nodes( args.bvh_base, num_children );
+ kids_offset = sub_group_broadcast(kids_offset, 0);
+
+ uniform struct QBVHNodeN* kid = (((struct QBVHNodeN*)args.bvh_base) + kids_offset);
+ uniform int offset = (int)((char*)kid - (char*)qnode) >> 6;
+
+#if 0
+ uniform uint kids_offset;
+ if ( get_sub_group_local_id() == 0 )
+ kids_offset = alloc_node_mem( args.globals, sizeof( struct QBVHNodeN ) * num_children );
+ kids_offset = sub_group_broadcast( kids_offset, 0 );
+
+
+ // create inner node
+ uniform struct QBVHNodeN* kid = (struct QBVHNodeN*) ((char*)(args.bvh_base) + kids_offset);
+ uniform int offset = (int)((char*)kid - (char*)qnode) >> 6;
+#endif
+ uniform uint child_type = args.inner_node_type;
+
+ // sort child nodes in descending order by AABB area
+ varying struct AABB box = BVH2_GetAABB( bvh2, sg_bvh2_node );
+ varying uint key = SUBGROUP_area_to_sort_key(AABB_halfArea(&box), num_children );
+ varying ushort sort_index = SUBGROUP_get_sort_indices_N6(key);
+ box = AABB_sub_group_shuffle(&box, sort_index);
+ sg_bvh2_node = intel_sub_group_shuffle(sg_bvh2_node, sort_index);
+
+ uniform uint node_mask = (args.do_mask_processing) ? BVH2_GetMask( bvh2, bvh2_root ) : 0xff;
+
+ subgroup_setQBVHNodeN( offset, child_type, &box, num_children, (struct QBVHNodeN*)qnode, node_mask );
+
+ // return child information
+ *sg_children_out = (uint3)(sg_bvh2_node, qnode_index + offset + get_sub_group_local_id(), num_children );
+ return num_children;
+ }
+}
+
+ushort SUBGROUP_BuildFlatTreeNode_2xSIMD8_in_SIMD16(
+ uniform struct BuildFlatTreeArgs args,
+ varying uint bvh2_root,
+ varying struct InternalNode* qnode_base,
+ varying uint qnode_index,
+ varying uint3* sg_children_out, // if an inner node is created, receives the indices of the 6 child nodes (X), and the QNode position (y), and num_children(z)
+ // if a leaf is created, receives number of primrefs (z)
+ bool active_lane
+) // return value is the number of child nodes or 0 for a leaf
+{
+ global struct BVH2* bvh2 = args.bvh2;
+ varying ushort SIMD16_lane = get_sub_group_local_id();
+ varying ushort SIMD8_lane = get_sub_group_local_id() % 8;
+ varying ushort SIMD8_id = get_sub_group_local_id() / 8;
+ varying ushort lane = get_sub_group_local_id();
+ global struct BVHBase* base = args.bvh_base;
+
+ struct BVH2NodeMetaData nodeMetaData = BVH2_GetNodeMetaData( bvh2, bvh2_root );
+
+ bool is_leaf = active_lane && !BVH2NodeMetaData_IsInnerNode( &nodeMetaData );
+ bool is_inner = active_lane && BVH2NodeMetaData_IsInnerNode( &nodeMetaData );
+
+ uchar mask = BVH_NODE_DEFAULT_MASK;
+ if(is_inner)
+ mask = (args.do_mask_processing) ? BVH2NodeMetaData_GetMask( &nodeMetaData ) : 0xff;
+
+ int offset;
+
+ varying struct InternalNode* qnode = qnode_base + qnode_index;
+    // TODO: the unions are not strictly needed; they are kept only for readability
+ union {
+ uint num_prims;
+ uint num_children;
+ } lane_num_data;
+
+ union {
+ PrimRef ref; // this is in fact AABB
+ struct AABB box;
+ } lane_box_data;
+
+ union {
+ uint ref_id;
+ uint sg_bvh2_node;
+ } lane_id_data;
+
+ // for leafs
+ varying uint primref_index;
+
+ if(is_leaf)
+ {
+ lane_num_data.num_prims = BVH2NodeMetaData_GetLeafPrimCount( &nodeMetaData );
+ uint primref_start = BVH2NodeMetaData_GetLeafPrimStart( &nodeMetaData );
+ primref_index = primref_start + ((SIMD8_lane < lane_num_data.num_prims) ? SIMD8_lane : 0);
+
+ lane_id_data.ref_id = args.primref_indices[primref_index];
+ lane_box_data.ref = args.primref_buffer[lane_id_data.ref_id];
+ char* leaf_mem_base = (char*)BVHBase_GetQuadLeaves( args.bvh_base );
+ char* leaf_mem = leaf_mem_base + primref_start * args.leaf_size_in_bytes;
+
+ offset = (int)(leaf_mem - (char*)qnode);
+ offset = offset >> 6;
+ }
+
+
+ if(intel_sub_group_ballot(is_inner))
+ {
+ // collapse BVH2 into BVH6.
+ // We will spread the root node's children across the subgroup, and keep adding SIMD lanes until we have enough
+
+ uint2 kids;
+ if(is_inner)
+ {
+ lane_num_data.num_children = 2;
+ kids = BVH2_GetChildIndices( bvh2, bvh2_root );
+
+ lane_id_data.sg_bvh2_node = kids.x;
+ if ( SIMD8_lane == 1 )
+ lane_id_data.sg_bvh2_node = kids.y;
+ }
+
+ bool active = is_inner;
+ do
+ {
+ // choose the inner node with maximum area to replace.
+ // Its left child goes in its old location. Its right child goes in a new lane
+
+ // TODO_OPT: We re-read the AABBs again and again to compute area
+ // ... store per-lane boxes instead and pre-compute areas
+
+ varying float sg_area = 0;
+ varying bool sg_is_inner = false;
+ if(active)
+ {
+ sg_area = BVH2_GetNodeArea( bvh2, lane_id_data.sg_bvh2_node );
+ sg_is_inner = BVH2_IsInnerNode( bvh2, lane_id_data.sg_bvh2_node );
+ sg_area = (sg_is_inner && SIMD8_lane < lane_num_data.num_children) ? sg_area : 0; // prevent early exit if the largest child is a leaf
+ }
+
+ float max_area = sub_group_reduce_max_N6_2xSIMD8_in_SIMD16( sg_area );
+ varying bool sg_reducable = max_area == sg_area && sg_is_inner && (SIMD8_lane < lane_num_data.num_children);
+            uint mask = intel_sub_group_ballot( sg_reducable ) & (0xFF << SIMD8_id * 8); // masking the ballot gives each SIMD8 half of the SIMD16 its own set of candidate bits
+
+ // TODO_OPT: Some of these ops seem redundant.. look at trimming further
+
+ if ( mask == 0 )
+ active = false;
+
+ // choose the inner node with maximum area to replace
+ ushort victim_child = ctz( mask );
+ uint victim_node = intel_sub_group_shuffle( lane_id_data.sg_bvh2_node, victim_child );
+ if(active)
+ {
+ kids = BVH2_GetChildIndices( bvh2, victim_node );
+
+                if ( SIMD16_lane == victim_child ) // use SIMD16_lane because victim_child was computed from the SIMD16 ballot, i.e. the second half's victim lies in lanes 8..13
+ lane_id_data.sg_bvh2_node = kids.x;
+ else if ( SIMD8_lane == lane_num_data.num_children )
+ lane_id_data.sg_bvh2_node = kids.y;
+
+ lane_num_data.num_children++;
+
+ if(lane_num_data.num_children >= TREE_ARITY)
+ active = false;
+ }
+
+ } while ( intel_sub_group_ballot(active) ); // if any active, then continue
+
+ // sum children from both halfs of SIMD16 to allocate nodes only once per sub_group
+ uniform ushort num_children = is_inner ? lane_num_data.num_children : 0;
+ uniform ushort first_SIMD8_num_children = sub_group_broadcast(num_children, 0);
+ uniform ushort second_SIMD8_num_children = sub_group_broadcast(num_children, 8);
+
+ num_children = first_SIMD8_num_children + second_SIMD8_num_children;
+ uint kids_offset;
+
+ // allocate inner node space
+ if(num_children && SIMD16_lane == 0)
+ kids_offset = allocate_inner_nodes( args.bvh_base, num_children );
+ kids_offset = sub_group_broadcast(kids_offset, 0);
+ if((is_inner))
+ {
+ kids_offset += SIMD8_id * first_SIMD8_num_children;
+
+ struct QBVHNodeN* kid = (((struct QBVHNodeN*)args.bvh_base) + kids_offset);
+
+ offset = (int)((char*)kid - (char*)qnode) >> 6;
+ lane_box_data.box = BVH2_GetAABB( bvh2, lane_id_data.sg_bvh2_node );
+ }
+ }
+
+ // sort child nodes in descending order by AABB area
+ varying uint key = SUBGROUP_area_to_sort_key_2xSIMD8_in_SIMD16(AABB_halfArea(&lane_box_data.box), lane_num_data.num_children );
+ varying ushort sort_index = SUBGROUP_get_sort_indices_N6_2xSIMD8_in_SIMD16(key);
+ lane_box_data.box = PrimRef_sub_group_shuffle(&lane_box_data.box, sort_index);
+ lane_id_data.sg_bvh2_node = intel_sub_group_shuffle(lane_id_data.sg_bvh2_node, sort_index);
+
+ char* bvh_mem = (char*) args.bvh_base;
+ if (is_leaf && SIMD8_lane < lane_num_data.num_prims)
+ {
+ args.primref_indices[primref_index] = lane_id_data.ref_id;
+ args.primref_indices[primref_index + args.globals->numPrimitives] = qnode - (struct InternalNode*)bvh_mem;
+ }
+
+ bool degenerated = false;
+ uint node_type = is_leaf ? args.leaf_type : args.inner_node_type;
+
+ if(args.leaf_type == NODE_TYPE_INSTANCE)
+ degenerated = subgroup_setInstanceBox_2xSIMD8_in_SIMD16(&lane_box_data.box, lane_num_data.num_children, &mask, SIMD8_lane < lane_num_data.num_prims ? PRIMREF_instanceMask(&lane_box_data.ref) : 0, is_leaf);
+
+ subgroup_setQBVHNodeN_setFields_2xSIMD8_in_SIMD16(offset, node_type, &lane_box_data.box, lane_num_data.num_children, mask, (struct QBVHNodeN*)(qnode), degenerated, active_lane);
+
+ // return child information
+ if(is_inner)
+ {
+ sg_children_out->x = lane_id_data.sg_bvh2_node;
+ sg_children_out->y = qnode_index + offset + SIMD8_lane;
+ }
+
+ sg_children_out->z = lane_num_data.num_children;
+
+ return is_inner ? lane_num_data.num_children : 0;
+}
+
+void check_primref_integrity( global struct SAHBuildGlobals* globals )
+{
+ global uint* primref_in = SAHBuildGlobals_GetPrimrefIndices_In( globals, 0 );
+ global uint* primref_out = SAHBuildGlobals_GetPrimrefIndices_Out( globals, 0 );
+ dword num_primrefs = SAHBuildGlobals_GetTotalPrimRefs( globals );
+ if ( get_local_id( 0 ) == 0 )
+ {
+ for ( uint i = 0; i < num_primrefs; i++ )
+ {
+ primref_out[i] = 0;
+ }
+
+ for ( uint i = 0; i < num_primrefs; i++ )
+ primref_out[primref_in[i]]++;
+
+ for ( uint i = 0; i < num_primrefs; i++ )
+ if ( primref_out[i] != 1 )
+ printf( "Foo: %u %u\n", i, primref_out[i] );
+ }
+}
+
+
+
+
+void check_bvh2(global struct SAHBuildGlobals* globals )
+{
+ global struct BVH2* bvh2 = SAHBuildGlobals_GetBVH2(globals);
+ global uint* primref_in = SAHBuildGlobals_GetPrimrefIndices_In(globals, 0);
+ global uint* primref_out = SAHBuildGlobals_GetPrimrefIndices_Out(globals, 0);
+ dword num_primrefs = SAHBuildGlobals_GetTotalPrimRefs(globals);
+
+ if (get_local_id(0) == 0)
+ {
+ for (uint i = 0; i < num_primrefs; i++)
+ primref_out[i] = 0;
+
+ uint stack[256];
+ uint sp=0;
+ uint r = BVH2_GetRoot(bvh2);
+ stack[sp++] = r;
+ while (sp)
+ {
+ r = stack[--sp];
+ if (BVH2_IsInnerNode(bvh2,r))
+ {
+ uint2 kids = BVH2_GetChildIndices( bvh2, r);
+ if (kids.x >= bvh2->num_nodes || kids.y >= bvh2->num_nodes)
+ {
+ printf("BVH2!! Bad node index found!\n");
+ return;
+ }
+
+ stack[sp++] = kids.x;
+ stack[sp++] = kids.y;
+ }
+ else
+ {
+ uint ref = BVH2_GetLeafPrimStart(bvh2,r);
+ uint count = BVH2_GetLeafPrimCount(bvh2,r);
+ if( count == 0 )
+ {
+ printf("BVH2!! Empty leaf found!\n");
+ return;
+ }
+ for (uint i = 0; i < count; i++)
+ {
+ if (ref + i > num_primrefs)
+ {
+ printf("BVH2!! Bad leaf range!\n");
+ return;
+ }
+ uint c = primref_out[ref+i];
+ if (c != 0)
+ {
+ printf("BVH2!! overlapped prim ranges\n");
+ return;
+ }
+ primref_out[ref+i] = 1;
+ if (primref_in[ref + i] >= num_primrefs)
+ {
+ printf("BAD PRIMREF ID FOUND!\n");
+ return;
+ }
+ }
+ }
+ }
+ }
+
+ printf("bvh2 is ok!\n");
+}
+
+
+#if 0
+// TODO_OPT: Enable larger WGs. WGSize 512 at SIMD8 hangs on Gen9, but Gen12 can go bigger
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__( (reqd_work_group_size(256,1,1)) )
+__attribute__( (intel_reqd_sub_group_size(8) ) )
+kernel void
+build_qnodes( global struct SAHBuildGlobals* globals, global struct VContextScheduler* scheduler )
+{
+ globals = globals + (scheduler->num_trivial_builds + scheduler->num_single_builds);
+ globals = globals + get_group_id(0);
+
+
+ struct BuildFlatTreeArgs args;
+ args.leaf_size_in_bytes = SAHBuildGlobals_GetLeafSizeInBytes( globals );
+ args.leaf_type = SAHBuildGlobals_GetLeafType( globals );
+ args.inner_node_type = SAHBuildGlobals_GetInternalNodeType( globals );
+ args.primref_indices = SAHBuildGlobals_GetPrimrefIndices_In( globals, 0 );
+ args.primref_buffer = SAHBuildGlobals_GetPrimrefs( globals );
+ args.bvh_base = SAHBuildGlobals_GetBVHBase( globals );
+ args.bvh2 = SAHBuildGlobals_GetBVH2( globals );
+ args.globals = (global struct Globals*) globals->p_globals;
+ args.do_mask_processing = SAHBuildGlobals_NeedMasks( globals );
+
+ dword alloc_backpointers = SAHBuildGlobals_NeedBackPointers( globals );
+ global uint2* root_buffer = (global uint2*) globals->p_qnode_root_buffer;
+ global struct InternalNode* qnodes = (global struct InternalNode*) BVHBase_GetInternalNodes( args.bvh_base );
+ global uint* back_pointers = (global uint*) BVHBase_GetBackPointers( args.bvh_base );
+
+ local uint nodes_produced;
+ if ( get_sub_group_id() == 0 )
+ {
+ // allocate first node
+ if (get_sub_group_local_id() == 0)
+ allocate_inner_nodes( args.bvh_base, 1 );
+
+ // first subgroup does first node
+ varying uint3 children_info;
+ uniform ushort num_children = SUBGROUP_BuildFlatTreeNode(args, BVH2_GetRoot(args.bvh2), qnodes, 0, &children_info );
+
+ if ( get_sub_group_local_id() < num_children )
+ root_buffer[get_sub_group_local_id()] = children_info.xy;
+
+ if ( alloc_backpointers )
+ {
+ // set root's backpointer
+ if( get_sub_group_local_id() == 0 )
+ back_pointers[0] = (0xffffffc0) | (children_info.z << 3);
+
+ // point child backpointers at the parent
+ if( get_sub_group_local_id() < num_children )
+ back_pointers[children_info.y] = 0;
+ }
+
+ if ( get_sub_group_local_id() == 0 )
+ nodes_produced = num_children;
+ }
+
+ barrier( CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE );
+
+
+ uniform uint buffer_index = get_sub_group_id();
+ uniform bool sg_active = buffer_index < nodes_produced;
+
+ while ( work_group_any( sg_active ) )
+ {
+ if( sg_active )
+ {
+ uniform uint bvh2_node = root_buffer[buffer_index].x;
+ uniform uint qnode_index = root_buffer[buffer_index].y;
+
+ // build a node
+ varying uint3 children_info;
+ uniform ushort num_children = SUBGROUP_BuildFlatTreeNode( args, bvh2_node, qnodes + qnode_index, qnode_index, &children_info );
+
+ // handle backpointers
+ if ( alloc_backpointers )
+ {
+ // update this node's backpointer with child count
+ if ( get_sub_group_local_id() == 0 )
+ back_pointers[qnode_index] |= (children_info.z << 3);
+
+ // point child backpointers at parent
+ if ( get_sub_group_local_id() < num_children )
+ back_pointers[children_info.y] = (qnode_index << 6);
+ }
+
+ if ( num_children )
+ {
+ // allocate space in the child buffer
+ uint root_buffer_position = 0;
+ if ( get_sub_group_local_id() == 0 )
+ root_buffer_position = atomic_add_local( &nodes_produced, num_children );
+ root_buffer_position = sub_group_broadcast( root_buffer_position, 0 );
+
+ // store child indices in root buffer
+ if ( get_sub_group_local_id() < num_children )
+ root_buffer[root_buffer_position + get_sub_group_local_id()] = children_info.xy;
+ }
+ }
+
+ // sync everyone
+ work_group_barrier( CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE,
+ memory_scope_work_group );
+
+
+ if( sg_active )
+ buffer_index += get_num_sub_groups();
+
+ sg_active = (buffer_index < nodes_produced);
+ }
+}
+#endif
+
+
+
+
+
+
+
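+// Conservative ring-buffer check: every node consumed in the next round frees its own slot but can
+// emit up to TREE_ARITY children, so make sure that much space is available before consuming.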
+inline bool buffer_may_overflow( uint capacity, uint current_size, uint elements_processed_per_sub_group )
+{
+ uint num_consumed = min( get_num_sub_groups() * elements_processed_per_sub_group, current_size );
+ uint space_available = (capacity - current_size) + num_consumed;
+ uint space_needed = TREE_ARITY * num_consumed;
+ return space_available < space_needed;
+}
+
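+// Producer/consumer flattening loop: the work-group expands QBVH nodes breadth-first, keeping the
+// frontier of (bvh2 node, qnode index) pairs in an SLM ring buffer.  The loop stops when the frontier
+// is empty or the ring could overflow on the next round, and returns how far the ring was consumed.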
+inline uint build_qnodes_pc(
+ global struct SAHBuildGlobals* globals,
+ bool alloc_backpointers,
+ bool process_masks,
+ uint first_qnode,
+ uint first_bvh2_node,
+
+ local uint2* SLM_local_root_buffer,
+ local uint* SLM_ring_tail,
+ const uint RING_SIZE
+)
+
+{
+ struct BuildFlatTreeArgs args;
+ args.leaf_size_in_bytes = SAHBuildGlobals_GetLeafSizeInBytes( globals );
+ args.leaf_type = SAHBuildGlobals_GetLeafType( globals );
+ args.inner_node_type = SAHBuildGlobals_GetInternalNodeType( globals );
+ args.primref_indices = SAHBuildGlobals_GetPrimrefIndices_In( globals, 0 );
+ args.primref_buffer = SAHBuildGlobals_GetPrimrefs( globals );
+ args.bvh_base = SAHBuildGlobals_GetBVHBase( globals );
+ args.bvh2 = SAHBuildGlobals_GetBVH2( globals );
+ args.globals = (global struct Globals*) globals->p_globals;
+ args.do_mask_processing = process_masks;
+
+ global struct InternalNode* qnodes = (global struct InternalNode*) BVHBase_GetInternalNodes( args.bvh_base );
+ global uint* back_pointers = (global uint*) BVHBase_GetBackPointers( args.bvh_base );
+
+ // first subgroup adds first node
+ if ( get_sub_group_id() == 0 && get_sub_group_local_id() == 0)
+ {
+ SLM_local_root_buffer[0].x = first_bvh2_node;
+ SLM_local_root_buffer[0].y = first_qnode;
+ *SLM_ring_tail = 1;
+
+ }
+
+ uint ring_head = 0;
+ uint ring_tail = 1;
+ uint ring_size = 1;
+
+ barrier( CLK_LOCAL_MEM_FENCE );
+
+ const uniform uint elements_processed_in_sg = 2;
+
+ while ( ring_size > 0 && !buffer_may_overflow( RING_SIZE, ring_size, elements_processed_in_sg ) )
+ {
+ ushort SIMD16_lane = get_sub_group_local_id();
+
+ // SIMD16 as 2xSIMD8
+ ushort SIMD8_lane = get_sub_group_local_id() % 8;
+ ushort SIMD8_id = get_sub_group_local_id() / 8;
+ bool active_lane;
+
+ uniform uint nodes_consumed = min( get_num_sub_groups() * elements_processed_in_sg, ring_size ); // times two because we process two nodes in subgroup
+ uniform bool sg_active = get_sub_group_id() * elements_processed_in_sg < nodes_consumed;
+ ushort num_children = 0;
+ varying uint3 children_info = 0;
+
+ uint bvh2_node = 0;
+ uint qnode_index = 0;
+
+ if (sg_active)
+ {
+ ushort consumed_pos = get_sub_group_id() * elements_processed_in_sg + SIMD8_id;
+ active_lane = consumed_pos < nodes_consumed ? true : false;
+ consumed_pos = consumed_pos < nodes_consumed ? consumed_pos : consumed_pos-1;
+
+ uint buffer_index = (ring_head + consumed_pos) % RING_SIZE;
+
+ bvh2_node = SLM_local_root_buffer[buffer_index].x;
+ qnode_index = SLM_local_root_buffer[buffer_index].y;
+ }
+
+ barrier( CLK_LOCAL_MEM_FENCE );
+
+ if (sg_active)
+ {
+ // build a node
+ num_children = SUBGROUP_BuildFlatTreeNode_2xSIMD8_in_SIMD16(args, bvh2_node, qnodes, qnode_index, &children_info, active_lane);
+
+ // handle backpointers
+ // TODO_OPT: This should be separate shaders not a runtime branch
+ // doing it this way for now because GRLTLK does not make dynamic shader selection on host very easy.
+ // this needs to change... GRLTLK should
+
+ if (alloc_backpointers && active_lane)
+ {
+ // update this node's backpointer with child count
+ if (SIMD8_lane == 0)
+ back_pointers[qnode_index] |= (children_info.z << 3);
+
+ // point child backpointers at parent
+ if (SIMD8_lane < num_children)
+ back_pointers[children_info.y] = (qnode_index << 6);
+ }
+
+ // save data
+
+ uniform ushort first_SIMD8_num_children = sub_group_broadcast(num_children, 0);
+ uniform ushort second_SIMD8_num_children = sub_group_broadcast(num_children, 8);
+ uniform ushort SIMD16_num_children = first_SIMD8_num_children + second_SIMD8_num_children;
+
+ uint root_buffer_position = 0;
+
+ // allocate space in the child buffer
+ if (SIMD16_lane == 0 && SIMD16_num_children)
+ root_buffer_position = atomic_add_local(SLM_ring_tail, SIMD16_num_children);
+
+ root_buffer_position = sub_group_broadcast( root_buffer_position, 0 );
+ root_buffer_position += SIMD8_id * first_SIMD8_num_children; // update offset for second half of SIMD16
+
+ // store child indices in root buffer
+ if (SIMD8_lane < num_children)
+ {
+ uint store_pos = (root_buffer_position + SIMD8_lane) % RING_SIZE;
+ SLM_local_root_buffer[store_pos] = children_info.xy;
+ }
+ }
+
+ // sync everyone
+ barrier( CLK_LOCAL_MEM_FENCE );
+
+ ring_head += nodes_consumed;
+ ring_tail = *SLM_ring_tail;
+ ring_size = ring_tail - ring_head;
+ }
+
+ return ring_head;
+}
+
+
+
+
+inline void amplify_and_spill(
+ global struct SAHBuildGlobals* globals,
+ dword alloc_backpointers,
+ uint first_qnode,
+ uint first_bvh2_node,
+ global uint2* global_root_buffer,
+ local uint* root_buffer_counter,
+ const uint RING_SIZE
+)
+
+{
+ struct BuildFlatTreeArgs args;
+ args.leaf_size_in_bytes = SAHBuildGlobals_GetLeafSizeInBytes(globals);
+ args.leaf_type = SAHBuildGlobals_GetLeafType(globals);
+ args.inner_node_type = SAHBuildGlobals_GetInternalNodeType(globals);
+ args.primref_indices = SAHBuildGlobals_GetPrimrefIndices_In(globals, 0);
+ args.primref_buffer = SAHBuildGlobals_GetPrimrefs(globals);
+ args.bvh_base = SAHBuildGlobals_GetBVHBase(globals);
+ args.bvh2 = SAHBuildGlobals_GetBVH2(globals);
+ args.globals = (global struct Globals*) globals->p_globals;
+
+ global struct InternalNode* qnodes = (global struct InternalNode*) BVHBase_GetInternalNodes(args.bvh_base);
+ global uint* back_pointers = (global uint*) BVHBase_GetBackPointers(args.bvh_base);
+
+
+ varying uint3 children_info;
+ uniform ushort num_children = SUBGROUP_BuildFlatTreeNode(args, first_bvh2_node, qnodes + first_qnode, first_qnode, &children_info);
+
+ if (alloc_backpointers)
+ {
+ // set first node's backpointer
+ if (get_sub_group_local_id() == 0)
+ {
+ // if first node is root, use root sentinel in backpointer
+ // otherwise, need to merge the child count in with the parent offset (which was already put there by the parent's thread)
+ uint bp = 0xffffffc0;
+ if (first_qnode != 0)
+ bp = back_pointers[first_qnode];
+ bp |= (children_info.z << 3);
+
+ back_pointers[first_qnode] = bp;
+ }
+
+ // point child backpointers at the parent
+ if (get_sub_group_local_id() < num_children)
+ back_pointers[children_info.y] = (first_qnode << 6);
+ }
+
+ if (num_children)
+ {
+ uint spill_pos = 0;
+ if (get_sub_group_local_id() == 0)
+ spill_pos = atomic_add_local(root_buffer_counter,num_children);
+
+ spill_pos = sub_group_broadcast(spill_pos, 0);
+
+ if (get_sub_group_local_id() < num_children)
+ global_root_buffer[spill_pos+get_sub_group_local_id()] = children_info.xy;
+ }
+
+}
+
+
+
+
+inline void build_qnodes_pc_kickoff_func(
+ global struct SAHBuildGlobals* globals,
+ global uint2* root_buffer,
+ bool alloc_backpointers,
+ bool process_masks,
+
+ local uint2* SLM_local_root_buffer,
+ local uint* SLM_spill_pos,
+ local uint* SLM_ring_tail,
+ int RING_SIZE
+)
+{
+ // allocate first node
+ if ( get_sub_group_id() == 0 && get_sub_group_local_id() == 0 )
+ allocate_inner_nodes( SAHBuildGlobals_GetBVHBase(globals), 1 );
+
+ *SLM_spill_pos=0;
+
+ uint ring_head = build_qnodes_pc( globals, alloc_backpointers, process_masks,
+ 0, BVH2_GetRoot(SAHBuildGlobals_GetBVH2(globals)), SLM_local_root_buffer, SLM_ring_tail, RING_SIZE );
+
+
+ uint n = *SLM_ring_tail - ring_head;
+ if (n > 0)
+ {
+#if 0
+ // do an additional round of amplification so we can get more nodes into the root buffer and go wider in the next phase
+ /// JDB TODO: this is causing hangs on DG2 for metro, so disabling for now...
+ for (uint i = get_sub_group_id(); i < n; i+= get_num_sub_groups() )
+ {
+ uint consume_pos = (ring_head + i) % RING_SIZE;
+ uniform uint bvh2_root = SLM_local_root_buffer[consume_pos].x;
+ uniform uint qnode_root = SLM_local_root_buffer[consume_pos].y;
+
+ amplify_and_spill( globals, alloc_backpointers, qnode_root, bvh2_root, root_buffer, SLM_spill_pos, RING_SIZE );
+ }
+
+ barrier( CLK_LOCAL_MEM_FENCE );
+#else
+ for (uint i = get_local_id(0); i < n; i += get_local_size(0))
+ root_buffer[i] = SLM_local_root_buffer[(ring_head + i) % RING_SIZE];
+#endif
+
+ if (get_local_id(0) == 0)
+ {
+ globals->root_buffer_num_produced = n;
+ globals->root_buffer_num_produced_hi = 0;
+ globals->root_buffer_num_consumed = 0;
+ globals->root_buffer_num_consumed_hi = 0;
+ }
+ }
+}
+
+
+
+
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__( (reqd_work_group_size( 256, 1, 1 )) )
+__attribute__( (intel_reqd_sub_group_size( 16 )) )
+kernel void
+build_qnodes_pc_kickoff(
+ global struct SAHBuildGlobals* globals,
+ global uint2* root_buffer,
+ dword sah_flags
+)
+{
+ bool alloc_backpointers = sah_flags & SAH_FLAG_NEED_BACKPOINTERS;
+ bool process_masks = sah_flags & SAH_FLAG_NEED_MASKS;
+
+
+ const int RING_SIZE = 64;
+
+ local uint2 SLM_local_root_buffer[RING_SIZE];
+ local uint SLM_spill_pos;
+ local uint SLM_ring_tail;
+
+ build_qnodes_pc_kickoff_func(globals,
+ root_buffer,
+ alloc_backpointers,
+ process_masks,
+ SLM_local_root_buffer,
+ &SLM_spill_pos,
+ &SLM_ring_tail,
+ RING_SIZE
+ );
+}
+
+
+
+
+inline void build_qnodes_pc_amplify_func(
+ global struct SAHBuildGlobals* globals,
+ global uint2* root_buffer,
+ bool alloc_backpointers,
+ bool process_masks,
+
+ local uint2* SLM_local_root_buffer,
+ local uint* SLM_broadcast,
+ local uint* SLM_ring_tail,
+ int RING_SIZE
+ )
+{
+    // TODO_OPT: Probably don't need this atomic; we could clear 'num_consumed' every time
+    //           and just use get_group_id()
+ //
+
+ if (get_local_id(0) == 0)
+ *SLM_broadcast = atomic_inc_global(&globals->root_buffer_num_consumed);
+
+ barrier( CLK_LOCAL_MEM_FENCE );
+
+ uniform uint consume_pos = *SLM_broadcast;
+ uniform uint bvh2_root = root_buffer[consume_pos].x;
+ uniform uint qnode_root = root_buffer[consume_pos].y;
+
+ uint ring_head = build_qnodes_pc(globals, alloc_backpointers,process_masks,
+ qnode_root, bvh2_root, SLM_local_root_buffer, SLM_ring_tail, RING_SIZE);
+
+ // TODO_OPT: Instead of spilling the nodes, do one more round of amplification and write
+ // generated children directly into the root buffer. This should allow faster amplification
+
+ // spill root buffer contents
+ uint n = *SLM_ring_tail - ring_head;
+ if (n > 0)
+ {
+
+ if (get_local_id(0) == 0)
+ *SLM_broadcast = atomic_add_global(&globals->root_buffer_num_produced, n);
+
+ barrier( CLK_LOCAL_MEM_FENCE );
+ uint produce_pos = *SLM_broadcast;
+
+ for (uint i = get_local_id(0); i < n; i += get_local_size(0))
+ root_buffer[produce_pos + i] = SLM_local_root_buffer[(ring_head + i) % RING_SIZE];
+ }
+}
+
+
+
+
+
+// Process two nodes per workgroup during the amplification phase.
+// Doing it this way ensures maximum parallelism.
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(16, 1, 1)))
+__attribute__((intel_reqd_sub_group_size(16)))
+kernel void
+build_qnodes_pc_amplify(
+ global struct SAHBuildGlobals* globals,
+ global uint2* root_buffer,
+ dword sah_flags )
+{
+ bool alloc_backpointers = sah_flags & SAH_FLAG_NEED_BACKPOINTERS;
+
+ struct BuildFlatTreeArgs args;
+ args.leaf_size_in_bytes = SAHBuildGlobals_GetLeafSizeInBytes(globals);
+ args.leaf_type = SAHBuildGlobals_GetLeafType(globals);
+ args.inner_node_type = SAHBuildGlobals_GetInternalNodeType(globals);
+ args.primref_indices = SAHBuildGlobals_GetPrimrefIndices_In(globals, 0);
+ args.primref_buffer = SAHBuildGlobals_GetPrimrefs(globals);
+ args.bvh_base = SAHBuildGlobals_GetBVHBase(globals);
+ args.bvh2 = SAHBuildGlobals_GetBVH2(globals);
+ args.globals = (global struct Globals*) globals->p_globals;
+ args.do_mask_processing = sah_flags & SAH_FLAG_NEED_MASKS;
+
+ global struct InternalNode* qnodes = (global struct InternalNode*) BVHBase_GetInternalNodes(args.bvh_base);
+ global uint* back_pointers = (global uint*) BVHBase_GetBackPointers(args.bvh_base);
+
+ ushort SIMD16_lane = get_sub_group_local_id();
+
+ // SIMD16 as 2xSIMD8
+ ushort SIMD8_lane = get_sub_group_local_id() % 8;
+ ushort SIMD8_id = get_sub_group_local_id() / 8;
+ bool active_lane = false;
+
+ uint consume_pos;
+    consume_pos = globals->root_buffer_num_consumed + get_group_id(0) * 2; // times 2 because we process two nodes per workgroup
+ consume_pos += SIMD8_id;
+
+ active_lane = consume_pos < globals->root_buffer_num_to_consume ? true : false;
+ consume_pos = consume_pos < globals->root_buffer_num_to_consume ? consume_pos : consume_pos-1;
+
+ uint first_bvh2_node = root_buffer[consume_pos].x;
+ uint first_qnode = root_buffer[consume_pos].y;
+
+ varying uint3 children_info;
+ ushort num_children = SUBGROUP_BuildFlatTreeNode_2xSIMD8_in_SIMD16(args, first_bvh2_node, qnodes, first_qnode, &children_info, active_lane);
+
+ if (alloc_backpointers && active_lane)
+ {
+ // set first node's backpointer
+ if (SIMD8_lane == 0)
+ {
+ // if first node is root, use root sentinel in backpointer
+ // otherwise, need to merge the child count in with the parent offset (which was already put there by the parent's thread)
+ uint bp = 0xffffffc0;
+ if (first_qnode != 0)
+ bp = back_pointers[first_qnode];
+ bp |= (children_info.z << 3);
+
+ back_pointers[first_qnode] = bp;
+ }
+
+ // point child backpointers at the parent
+ if (SIMD8_lane < num_children)
+ back_pointers[children_info.y] = (first_qnode << 6);
+ }
+
+ // save data
+ {
+        // sum children from both halves of the SIMD16 to do only one atomic per subgroup
+ uint produce_pos;
+ uniform ushort first_SIMD8_num_children = sub_group_broadcast(num_children, 0);
+ uniform ushort second_SIMD8_num_children = sub_group_broadcast(num_children, 8);
+ uniform ushort SIMD16_num_children = first_SIMD8_num_children + second_SIMD8_num_children;
+
+ if (SIMD16_lane == 0 && SIMD16_num_children)
+ produce_pos = atomic_add_global(&globals->root_buffer_num_produced, SIMD16_num_children);
+
+ produce_pos = sub_group_broadcast(produce_pos, 0);
+ produce_pos += SIMD8_id * first_SIMD8_num_children; // update offset for second half of SIMD16
+
+ if (SIMD8_lane < num_children)
+ {
+ root_buffer[produce_pos + SIMD8_lane] = children_info.xy;
+ }
+ }
+}
+
+
+//////////
+//
+// Batched version of qnode creation
+//
+//////////
+
+
+
+
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(1, 1, 1)))
+kernel void
+build_qnodes_init_scheduler_batched(global struct QnodeScheduler* scheduler, dword num_builds, dword num_max_qnode_global_root_buffer_entries)
+{
+
+ scheduler->batched_build_offset = scheduler->num_trivial_builds + scheduler->num_single_builds;
+ scheduler->batched_build_count = num_builds - scheduler->batched_build_offset;
+ scheduler->num_max_qnode_global_root_buffer_entries = num_max_qnode_global_root_buffer_entries;
+
+ const uint num_builds_to_process = scheduler->batched_build_count;
+ const uint max_qnode_grb_entries = scheduler->num_max_qnode_global_root_buffer_entries;
+
+ scheduler->batched_builds_to_process = num_builds_to_process;
+    scheduler->num_qnode_grb_curr_entries = (num_builds_to_process + 15) / 16; // here we store the number of workgroups for the "build_qnodes_begin_batchable" kernel
+ scheduler->num_qnode_grb_new_entries = num_builds_to_process;
+ scheduler->qnode_global_root_buffer.curr_entries_offset = max_qnode_grb_entries;
+}
+
+
+
+
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(16, 1, 1)))
+__attribute__( (intel_reqd_sub_group_size( 16 )) )
+kernel void
+build_qnodes_begin_batchable(global struct QnodeScheduler* scheduler,
+ global struct SAHBuildGlobals* builds_globals)
+{
+ const uint tid = get_group_id(0) * get_local_size(0) + get_local_id(0);
+
+ const uint num_builds_to_process = scheduler->batched_builds_to_process;
+
+ if(tid < num_builds_to_process)
+ {
+ const uint build_idx = scheduler->batched_build_offset + tid;
+
+ uint bvh2_node = BVH2_GetRoot(SAHBuildGlobals_GetBVH2(&builds_globals[build_idx]));
+ uint qnode = 0;
+ struct QNodeGlobalRootBufferEntry entry = { bvh2_node, qnode, build_idx, 1};
+ scheduler->qnode_global_root_buffer.entries[tid] = entry;
+
+ builds_globals[build_idx].root_buffer_num_produced = 0;
+ builds_globals[build_idx].root_buffer_num_produced_hi = 0;
+ builds_globals[build_idx].root_buffer_num_consumed = 0;
+ builds_globals[build_idx].root_buffer_num_consumed_hi = 0;
+
+ // allocate first node for this build
+ //allocate_inner_nodes( SAHBuildGlobals_GetBVHBase(&builds_globals[build_idx]), 1 );
+ SAHBuildGlobals_GetBVHBase(&builds_globals[build_idx])->nodeDataCur++;
+ }
+}
+
+
+
+
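+// Note (informal): the qnode_global_root_buffer is used double-buffered. Entries for the
+// current round start at curr_entries_offset (either 0 or num_max_qnode_global_root_buffer_entries),
+// newly produced entries are written into the other half, and the kernel below swaps the
+// halves and publishes the entry count for the next round.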
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__( (reqd_work_group_size( 1, 1, 1 )) )
+kernel void
+build_qnodes_scheduler(global struct QnodeScheduler* scheduler)
+{
+ const uint max_qnode_grb_entries = scheduler->num_max_qnode_global_root_buffer_entries;
+
+ uint new_entries = min(scheduler->num_qnode_grb_new_entries, max_qnode_grb_entries);
+
+ scheduler->num_qnode_grb_curr_entries = new_entries;
+ scheduler->num_qnode_grb_new_entries = 0;
+ scheduler->qnode_global_root_buffer.curr_entries_offset = scheduler->qnode_global_root_buffer.curr_entries_offset ? 0 : max_qnode_grb_entries;
+}
+
+
+
+
+// TODO_OPT: Enable larger WGs. WGSize 512 at SIMD8 hangs on Gen9, but Gen12 can go bigger
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__( (reqd_work_group_size( 32, 1, 1 )) )
+__attribute__( (intel_reqd_sub_group_size( 16 )) )
+kernel void
+build_qnodes_pc_amplify_batched(
+ global struct SAHBuildGlobals* builds_globals,
+ global struct QnodeScheduler* scheduler
+ )
+{
+ const uint group_id = get_group_id(0);
+
+ global struct QNodeGlobalRootBuffer* global_root_buffer = &scheduler->qnode_global_root_buffer;
+ const uint curr_entries_offset = global_root_buffer->curr_entries_offset;
+ struct QNodeGlobalRootBufferEntry entry = global_root_buffer->entries[curr_entries_offset + group_id];
+
+ const uint build_id = entry.build_idx;
+
+ global struct SAHBuildGlobals* globals = &builds_globals[build_id];
+ global uint2* root_buffer = (global uint2*)globals->p_qnode_root_buffer;
+ bool alloc_backpointers = SAHBuildGlobals_NeedBackPointers(globals);
+ bool process_masks = SAHBuildGlobals_NeedMasks(globals);
+
+ const int RING_SIZE = 32; // for 2 SGs, 16 should result in 2 rounds: one SG produces 6, then 2 SGs consume 2 and produce 12
+ // for 4 SGs, 32 results in 2 rounds: one SG produces 6, 4 SGs consume 4 and produce 24, resulting in 26
+
+ local uint2 SLM_local_root_buffer[RING_SIZE];
+ local uint SLM_broadcast;
+ local uint SLM_ring_tail;
+ local uint SLM_grb_broadcast;
+
+
+ //// This below can be moved to separate function if needed for TLAS ////
+
+ uniform uint bvh2_root = entry.bvh2_node;
+ uniform uint qnode_root = entry.qnode;
+
+ uint ring_head = build_qnodes_pc(globals, alloc_backpointers, process_masks,
+ qnode_root, bvh2_root, SLM_local_root_buffer, &SLM_ring_tail, RING_SIZE);
+
+ // spill root buffer contents
+ uint n = SLM_ring_tail - ring_head;
+ if (n > 0)
+ {
+ const uint max_qnode_grb_entries = scheduler->num_max_qnode_global_root_buffer_entries;
+
+ if (get_local_id(0) == 0)
+ {
+ SLM_grb_broadcast = atomic_add_global(&scheduler->num_qnode_grb_new_entries, n);
+
+ if(SLM_grb_broadcast >= max_qnode_grb_entries) // if global_root_buffer is full, then make space in build's root_buffer
+ SLM_broadcast = atomic_add_global(&globals->root_buffer_num_produced, n);
+ else if( (SLM_grb_broadcast + n) >= max_qnode_grb_entries) // if we exceed global_root_buffer with our entries, then make space in build's root_buffer
+ SLM_broadcast = atomic_add_global(&globals->root_buffer_num_produced, n - (max_qnode_grb_entries - SLM_grb_broadcast));
+ }
+
+ barrier( CLK_LOCAL_MEM_FENCE );
+
+ uint produce_pos = SLM_broadcast;
+
+ uint grb_produce_num = n; // grb stands for global_root_buffer
+ uint lrb_produce_num = 0; // lrb stands for local root buffer, meaning this build's root_buffer
+
+ if(SLM_grb_broadcast >= max_qnode_grb_entries) // if global_root_buffer is full, don't write to it
+ {
+ grb_produce_num = 0;
+ lrb_produce_num = n;
+ }
+ else if( (SLM_grb_broadcast + n) >= max_qnode_grb_entries) // if we exceed global_root_buffer with our entries, then decrease amount of entries and store rest in build's root buffer
+ {
+ grb_produce_num = max_qnode_grb_entries - SLM_grb_broadcast;
+ lrb_produce_num = n - grb_produce_num;
+ }
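+        // Illustrative numbers (not taken from the source): with max_qnode_grb_entries == 1024,
+        // SLM_grb_broadcast == 1020 and n == 16, the split is grb_produce_num = 4 and
+        // lrb_produce_num = 12; the first 4 entries go to the global root buffer and the
+        // remaining 12 spill into this build's own root buffer below.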
+
+ // save data to global_root_buffer
+ for(uint i = get_local_id(0); i < grb_produce_num; i += get_local_size(0))
+ {
+ const uint2 slm_record = SLM_local_root_buffer[(ring_head + i) % RING_SIZE];
+
+ struct QNodeGlobalRootBufferEntry new_entry;
+ new_entry.bvh2_node = slm_record.x;
+ new_entry.qnode = slm_record.y;
+ new_entry.build_idx = entry.build_idx;
+
+ const uint new_entries_offset = curr_entries_offset ? 0 : max_qnode_grb_entries;
+ global_root_buffer->entries[new_entries_offset + SLM_grb_broadcast + i] = new_entry;
+ }
+
+ // if anything left, write to build's root buffer
+ for (uint i = get_local_id(0); i < lrb_produce_num; i += get_local_size(0))
+ root_buffer[produce_pos + i] = SLM_local_root_buffer[(ring_head + i + grb_produce_num) % RING_SIZE];
+ }
+}
+
+
+
+
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__( (reqd_work_group_size( 16, 1, 1 )) )
+__attribute__( (intel_reqd_sub_group_size( 16 )) )
+kernel void
+build_qnodes_try_to_fill_grb_batched(
+ global struct SAHBuildGlobals* builds_globals,
+ global struct QnodeScheduler* scheduler
+ )
+{
+ const uint build_id = scheduler->batched_build_offset + get_group_id(0);
+ global struct SAHBuildGlobals* globals = &builds_globals[build_id];
+ global uint2* root_buffer = (global uint2*)globals->p_qnode_root_buffer;
+
+ global struct QNodeGlobalRootBuffer* qnode_root_buffer = (global struct QNodeGlobalRootBuffer*)&scheduler->qnode_global_root_buffer;
+
+ const uint num_produced = globals->root_buffer_num_produced;
+ const uint num_consumed = globals->root_buffer_num_consumed;
+ const uint entries = num_produced - num_consumed; // entries to build's root buffer
+
+ if(!entries)
+ return;
+
+ uint global_root_buffer_offset;
+ if(get_local_id(0) == 0)
+ global_root_buffer_offset = atomic_add_global(&scheduler->num_qnode_grb_new_entries, entries);
+
+ global_root_buffer_offset = sub_group_broadcast(global_root_buffer_offset, 0);
+
+ const uint max_qnode_grb_entries = scheduler->num_max_qnode_global_root_buffer_entries;
+
+ if(global_root_buffer_offset >= max_qnode_grb_entries) // if global_root_buffer is full, then return
+ return;
+
+ uint global_root_buffer_produce_num = entries;
+ if(global_root_buffer_offset + entries >= max_qnode_grb_entries) // if we exceed global_root_buffer with our entries, then reduce number of entries to push
+ global_root_buffer_produce_num = max_qnode_grb_entries - global_root_buffer_offset;
+
+ for(uint i = get_local_id(0); i < global_root_buffer_produce_num; i += get_local_size(0))
+ {
+ const uint2 entry = root_buffer[num_consumed + i];
+
+ struct QNodeGlobalRootBufferEntry new_entry;
+ new_entry.bvh2_node = entry.x;
+ new_entry.qnode = entry.y;
+ new_entry.build_idx = build_id;
+
+ const uint new_entries_offset = qnode_root_buffer->curr_entries_offset ? 0 : max_qnode_grb_entries;
+ qnode_root_buffer->entries[new_entries_offset + global_root_buffer_offset + i] = new_entry;
+ }
+
+ if(get_local_id(0) == 0)
+ globals->root_buffer_num_consumed += global_root_buffer_produce_num;
+}
diff --git a/src/intel/vulkan/grl/gpu/bvh_build_DFS.cl b/src/intel/vulkan/grl/gpu/bvh_build_DFS.cl
new file mode 100644
index 00000000000..1f64ef3fbe2
--- /dev/null
+++ b/src/intel/vulkan/grl/gpu/bvh_build_DFS.cl
@@ -0,0 +1,2025 @@
+//
+// Copyright (C) 2009-2021 Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+//
+//
+
+#include "intrinsics.h"
+#include "AABB3f.h"
+#include "AABB.h"
+#include "GRLGen12.h"
+#include "quad.h"
+#include "common.h"
+#include "instance.h"
+
+#include "api_interface.h"
+
+#include "binned_sah_shared.h"
+
+
+#if 0
+#define LOOP_TRIPWIRE_INIT uint _loop_trip=0;
+
+#define LOOP_TRIPWIRE_INCREMENT(max_iterations) \
+ _loop_trip++;\
+ if ( _loop_trip > max_iterations )\
+ {\
+ if( get_local_id(0) == 0 )\
+ printf( "@@@@@@@@@@@@@@@@@@@@ TRIPWIRE!!!!!!!!!!! group=%u\n", get_group_id(0) );\
+ break;\
+ }
+#else
+
+#define LOOP_TRIPWIRE_INIT
+#define LOOP_TRIPWIRE_INCREMENT(max_iterations)
+
+#endif
+
+
+// =========================================================
+// DFS
+// =========================================================
+
+// there are 128 threads x SIMD16 == 2048 lanes in a DSS
+// There is 128KB of SLM. Upper limit of 64KB per WG, so target is 2 groups of 1024 lanes @ 64K each
+// --> Full occupancy requires using less than 64B per lane
+//
+// Groups of 256 lanes gives us 16KB per group
+//
+
+// We use subgroups very heavily here in order to avoid
+// use of per-thread scratch space for intermediate values
+
+#define DFS_WG_SIZE 256
+#define DFS_NUM_SUBGROUPS 16
+#define DFS_BVH2_NODE_COUNT (2*(DFS_WG_SIZE)-1)
+#define TREE_ARITY 6
+
+// FlatTree node limits:
+// these are the derivations if we always collapse to one primitive and pack nodes as tightly as possible
+// If BVH2 construction is allowed to terminate early and place multiple prims in a leaf, these numbers will be too low
+#if 0
+
+// maximum flattree size is the number of inner nodes in a full M-ary tree with one leaf per primitive
+// This is given by I = (L-1)/(M-1)
+// For a 256 thread workgroup, L=256, M=6, this gives: 51
+#define DFS_MAX_FLATTREE_NODES 51
+
+
+// A flattree leaf is a node which contains only primitives.
+//
+// The maximum number of leaves is related to the number of nodes as:
+// L(N) = ((M-1)*N + 1) / M
+//
+#define DFS_MAX_FLATTREE_LEAFS 43 // = 43 for 256 thread WG (L=256, M=6)
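+// Worked check of the two formulas above for a 256-thread WG (M = 6):
+//   inner nodes: (256-1)/(6-1) = 51
+//   leaf nodes:  ((6-1)*51 + 1)/6 = 256/6 ~= 42.7, rounded up to 43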
+
+#else
+
+// This is the result of estimate_qbvh6_nodes(256)
+
+#define DFS_MAX_FLATTREE_LEAFS 256
+#define DFS_MAX_FLATTREE_NODES 307   // 256 fat-leaves + 51 inner nodes.  51 = (256-1)/(6-1)
+#define DFS_MAX_FLATTREE_DEPTH 52 // number of inner nodes in the worst-case tree
+
+#endif
+
+#define uniform
+#define varying
+
+
+struct DFSArgs
+{
+ global struct BVHBase* bvh_base;
+ global PrimRef* primref_buffer;
+ ushort leaf_node_type;
+ ushort inner_node_type;
+ ushort leaf_size_in_bytes;
+ bool need_backpointers;
+ bool need_masks;
+ ushort num_primrefs;
+ global uint* primref_index_buffer;
+};
+
+
+struct DFSPrimRefAABB
+{
+ half lower[3];
+ half upper[3];
+};
+
+GRL_INLINE void DFSPrimRefAABB_init( struct DFSPrimRefAABB* bb )
+{
+ bb->lower[0] = 1;
+ bb->lower[1] = 1;
+ bb->lower[2] = 1;
+ bb->upper[0] = 0;
+ bb->upper[1] = 0;
+ bb->upper[2] = 0;
+}
+
+GRL_INLINE void DFSPrimRefAABB_extend( struct DFSPrimRefAABB* aabb, struct DFSPrimRefAABB* v )
+{
+ aabb->lower[0] = min( aabb->lower[0], v->lower[0] );
+ aabb->lower[1] = min( aabb->lower[1], v->lower[1] );
+ aabb->lower[2] = min( aabb->lower[2], v->lower[2] );
+ aabb->upper[0] = max( aabb->upper[0], v->upper[0] );
+ aabb->upper[1] = max( aabb->upper[1], v->upper[1] );
+ aabb->upper[2] = max( aabb->upper[2], v->upper[2] );
+}
+
+GRL_INLINE float DFSPrimRefAABB_halfArea( struct DFSPrimRefAABB* aabb )
+{
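+    // halfArea(d) = d.x*d.y + d.x*d.z + d.y*d.z (half the surface area of a box with extents d),
+    // computed with a single fma below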
+ const half3 d = (half3)(aabb->upper[0] - aabb->lower[0], aabb->upper[1] - aabb->lower[1], aabb->upper[2] - aabb->lower[2]);
+ return fma( d.x, (d.y + d.z), d.y * d.z );
+}
+
+GRL_INLINE struct DFSPrimRefAABB DFSPrimRefAABB_sub_group_reduce( struct DFSPrimRefAABB* aabb )
+{
+ struct DFSPrimRefAABB bounds;
+ bounds.lower[0] = sub_group_reduce_min( aabb->lower[0] );
+ bounds.lower[1] = sub_group_reduce_min( aabb->lower[1] );
+ bounds.lower[2] = sub_group_reduce_min( aabb->lower[2] );
+ bounds.upper[0] = sub_group_reduce_max( aabb->upper[0] );
+ bounds.upper[1] = sub_group_reduce_max( aabb->upper[1] );
+ bounds.upper[2] = sub_group_reduce_max( aabb->upper[2] );
+ return bounds;
+}
+
+struct DFSPrimRef
+{
+ struct DFSPrimRefAABB aabb;
+ uint2 meta;
+};
+
+struct PrimRefMeta
+{
+ uchar2 meta;
+};
+
+GRL_INLINE uint PrimRefMeta_GetInputIndex( struct PrimRefMeta* it )
+{
+ return it->meta.x;
+}
+GRL_INLINE uint PrimRefMeta_GetInstanceMask( struct PrimRefMeta* it )
+{
+ return it->meta.y;
+}
+
+
+struct PrimRefSet
+{
+ struct AABB3f root_aabb;
+ struct DFSPrimRefAABB AABB[DFS_WG_SIZE];
+ uint2 meta[DFS_WG_SIZE];
+
+};
+
+GRL_INLINE local struct DFSPrimRefAABB* PrimRefSet_GetAABBPointer( local struct PrimRefSet* refs, ushort id )
+{
+ return &refs->AABB[id];
+}
+
+GRL_INLINE float PrimRefSet_GetMaxAABBArea( local struct PrimRefSet* refs )
+{
+ float3 root_l = AABB3f_load_lower( &refs->root_aabb );
+ float3 root_u = AABB3f_load_upper( &refs->root_aabb );
+ float3 d = root_u - root_l;
+ float scale = 1.0f / max( d.x, max( d.y, d.z ) );
+
+ half3 dh = convert_half3_rtp( d * scale );
+ return fma( dh.x, (dh.y + dh.z), dh.y * dh.z );
+}
+
+GRL_INLINE float3 ulp3( float3 v ) {
+
+ return fabs(v) * FLT_EPSILON;
+}
+
+GRL_INLINE struct AABB PrimRefSet_ConvertAABB( local struct PrimRefSet* refs, struct DFSPrimRefAABB* box )
+{
+ float3 root_l = AABB3f_load_lower( &refs->root_aabb );
+ float3 root_u = AABB3f_load_upper( &refs->root_aabb );
+ float3 d = root_u - root_l;
+ float scale = max( d.x, max( d.y, d.z ) );
+
+ float3 l = convert_float3_rtz( (half3)(box->lower[0], box->lower[1], box->lower[2]) );
+ float3 u = convert_float3_rtp( (half3)(box->upper[0], box->upper[1], box->upper[2]) );
+ l = l * scale + root_l ;
+ u = u * scale + root_l ;
+
+    // clamping is necessary in case a vertex lies exactly on the upper AABB plane.
+ // If we use unclamped values, roundoff error in the scale factor calculation can cause us
+ // to snap to a flattened AABB that lies outside of the original one, resulting in missed geometry.
+ u = min( u, root_u );
+ l = min( l, root_u );
+
+ struct AABB r;
+ r.lower.xyz = l.xyz;
+ r.upper.xyz = u.xyz;
+ return r;
+}
+
+GRL_INLINE PrimRef PrimRefSet_GetFullPrecisionAABB( local struct PrimRefSet* refs, ushort id )
+{
+ struct AABB r;
+ r = PrimRefSet_ConvertAABB( refs, &refs->AABB[id] );
+ r.lower.w = 0;
+ r.upper.w = 0;
+ return r;
+}
+
+GRL_INLINE uint PrimRefSet_GetInputIndex( local struct PrimRefSet* refs, ushort id )
+{
+ return refs->meta[id].x;
+}
+
+GRL_INLINE uint PrimRefSet_GetInstanceMask( local struct PrimRefSet* refs, ushort id )
+{
+ return refs->meta[id].y;
+}
+GRL_INLINE struct PrimRefMeta PrimRefSet_GetMeta( local struct PrimRefSet* refs, ushort id )
+{
+ struct PrimRefMeta meta;
+ meta.meta.x = refs->meta[id].x;
+ meta.meta.y = refs->meta[id].y;
+ return meta;
+}
+
+
+GRL_INLINE struct DFSPrimRef PrimRefSet_GetPrimRef( local struct PrimRefSet* refs, ushort id )
+{
+ struct DFSPrimRef r;
+ r.aabb = refs->AABB[id];
+ r.meta = refs->meta[id];
+ return r;
+}
+
+
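+// Primrefs are stored in SLM as half-precision boxes normalized to the root AABB
+// (offset by its lower corner, scaled by its largest extent). Lowers round toward zero
+// and uppers round toward +inf, so the quantized box conservatively contains the original.
+// Illustrative numbers (not from the source): with a root spanning [0,100] in x
+// (scale = 1/100), a primref with x in [33.33, 35.0] stores a half lower <= 0.3333 and a
+// half upper >= 0.35, so converting back never shrinks the box.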
+GRL_INLINE void PrimRefSet_SetPrimRef_FullPrecision( local struct PrimRefSet* refs, PrimRef ref, ushort id )
+{
+
+ float3 root_l = AABB3f_load_lower( &refs->root_aabb );
+ float3 root_u = AABB3f_load_upper( &refs->root_aabb );
+ float3 d = root_u - root_l;
+ float scale = 1.0f / max(d.x, max(d.y,d.z));
+
+ float3 l = ref.lower.xyz;
+ float3 u = ref.upper.xyz;
+ half3 lh = convert_half3_rtz( (l - root_l) * scale );
+ half3 uh = convert_half3_rtp( (u - root_l) * scale );
+
+ refs->AABB[id].lower[0] = lh.x;
+ refs->AABB[id].lower[1] = lh.y;
+ refs->AABB[id].lower[2] = lh.z;
+ refs->AABB[id].upper[0] = uh.x;
+ refs->AABB[id].upper[1] = uh.y;
+ refs->AABB[id].upper[2] = uh.z;
+ refs->meta[id].x = id;
+ refs->meta[id].y = PRIMREF_instanceMask(&ref);
+
+
+}
+
+GRL_INLINE void PrimRefSet_SetPrimRef( local struct PrimRefSet* refs, struct DFSPrimRef ref, ushort id )
+{
+ refs->AABB[id] = ref.aabb;
+ refs->meta[id] = ref.meta;
+}
+
+GRL_INLINE struct AABB3f PrimRefSet_GetRootAABB( local struct PrimRefSet* refs )
+{
+ return refs->root_aabb;
+}
+
+GRL_INLINE void SUBGROUP_PrimRefSet_Initialize( local struct PrimRefSet* refs )
+{
+ if ( get_sub_group_local_id() == 0 )
+ AABB3f_init( &refs->root_aabb ); // TODO_OPT: subgroup-vectorized version of AABB3f_init
+}
+
+
+GRL_INLINE void PrimRefSet_Printf( local struct PrimRefSet* refs, ushort num_prims )
+{
+
+ barrier( CLK_LOCAL_MEM_FENCE );
+ if ( get_local_id( 0 ) == 0 )
+ {
+ printf( "Scene AABB:\n" );
+ struct AABB3f rootBox = PrimRefSet_GetRootAABB( refs );
+ AABB3f_print( &rootBox );
+
+ float ma = PrimRefSet_GetMaxAABBArea( refs );
+
+ for ( uint i = 0; i < num_prims; i++ )
+ {
+ printf( "Ref: %u\n", i );
+ struct AABB r = PrimRefSet_GetFullPrecisionAABB( refs, i );
+ AABB_print( &r );
+
+ float a = DFSPrimRefAABB_halfArea( PrimRefSet_GetAABBPointer( refs, i ) );
+ printf( "Scaled Area: %f / %f = %f \n", a, ma, a / ma );
+
+ }
+ }
+ barrier( CLK_LOCAL_MEM_FENCE );
+}
+
+
+
+GRL_INLINE void PrimRefSet_CheckBounds( local struct PrimRefSet* refs, ushort num_prims, PrimRef* primref_buffer )
+{
+
+ barrier( CLK_LOCAL_MEM_FENCE );
+ if ( get_local_id( 0 ) == 0 )
+ {
+
+ for ( uint i = 0; i < num_prims; i++ )
+ {
+ PrimRef ref = primref_buffer[i];
+ struct AABB r2 = PrimRefSet_GetFullPrecisionAABB( refs, i );
+
+ struct DFSPrimRefAABB* box = &refs->AABB[i];
+ float3 l = convert_float3_rtz( (half3)(box->lower[0], box->lower[1], box->lower[2]) );
+ float3 u = convert_float3_rtp( (half3)(box->upper[0], box->upper[1], box->upper[2]) );
+
+ printf( " halfs:{%x,%x,%x}{%x,%x,%x}\n", as_uint(l.x), as_uint(l.y), as_uint(l.z), as_uint(u.x), as_uint(u.y), as_uint(u.z) );
+
+ printf( " {%f,%f,%f} {%f,%f,%f} {%f,%f,%f} {%f,%f,%f} {%u,%u,%u,%u,%u,%u}\n",
+ ref.lower.x, ref.lower.y, ref.lower.z, r2.lower.x, r2.lower.y, r2.lower.z,
+ ref.upper.x, ref.upper.y, ref.upper.z, r2.upper.x, r2.upper.y, r2.upper.z,
+ r2.lower.x <= ref.lower.x,
+ r2.lower.y <= ref.lower.y,
+ r2.lower.z <= ref.lower.z,
+
+ r2.upper.x >= ref.upper.x,
+ r2.upper.y >= ref.upper.y,
+ r2.upper.z >= ref.upper.z );
+
+ }
+
+ }
+ barrier( CLK_LOCAL_MEM_FENCE );
+}
+
+
+
+struct LocalBVH2
+{
+ uint num_nodes;
+ uint nodes[DFS_BVH2_NODE_COUNT];
+
+ // nodes are a bitfield:
+ // bits 8:0 (9b) ==> number of primrefs in this subtree
+ //
+ // bits 17:9 (9b) ==> for an inner node: contains offset to a pair of children
+ // ==> for a leaf node: contains index of the first primref in this leaf
+ //
+ // bits 30:18 (13b) ==> quantized AABB area (relative to root box)
+ // bit 31 (1b) ==> is_inner flag
+ //
+    // NOTE: The left child offset of any node is always odd.  Therefore, it is possible to recover a bit if we need it
+ // by storing only the 8 MSBs
+};
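+
+// Illustrative decode of a node word (value made up): 0xFFFC0300 unpacks to
+//   subtree prim count =  0xFFFC0300        & 0x1ff  = 256
+//   child pair / start = (0xFFFC0300 >> 9)  & 0x1ff  = 1
+//   quantized area     = (0xFFFC0300 >> 18) & 0x1fff = 8191
+//   is_inner           =  0xFFFC0300 >> 31           = 1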
+
+#define DFS_BVH2_AREA_QUANT 8191.0f
+
+
+
+GRL_INLINE void SUBGROUP_LocalBVH2_Initialize( local struct LocalBVH2* tree, ushort num_prims )
+{
+ tree->num_nodes = 1; // include the root node
+ tree->nodes[0] = num_prims; // initialize root node as a leaf containing the full subtree
+
+}
+
+GRL_INLINE void LocalBVH2_CreateInnerNode( local struct LocalBVH2* tree, ushort node_index,
+ ushort start_left, ushort start_right,
+ ushort quantized_left_area, ushort quantized_right_area )
+{
+ uint child_pos = atomic_add_local( &tree->num_nodes, 2 );
+
+ // set the inner node flag and child position in the parent
+ // leave the other bits intact
+ uint parent_node = tree->nodes[node_index];
+ parent_node |= 0x80000000;
+ parent_node = (parent_node & ~(0x1ff<<9)) | (child_pos << 9);
+ tree->nodes[node_index] = parent_node;
+
+ // setup children as leaf nodes with prim-count zero
+ uint left_child = (convert_uint(start_left) << 9) | (convert_uint( quantized_left_area ) << 18);
+ uint right_child = (convert_uint(start_right) << 9) | (convert_uint( quantized_right_area ) << 18);
+ tree->nodes[child_pos] = left_child;
+ tree->nodes[child_pos + 1] = right_child;
+
+}
+
+GRL_INLINE ushort LocalBVH2_IncrementPrimCount( local struct LocalBVH2* tree, ushort node_index )
+{
+ // increment only the lower bits. Given correct tree construction algorithm this will not overflow into MSBs
+ return (atomic_inc_local( &tree->nodes[node_index] )) & 0x1ff;
+}
+
+GRL_INLINE ushort LocalBVH2_GetNodeArea( local struct LocalBVH2* tree, ushort nodeID )
+{
+ return (tree->nodes[nodeID] >> 18) & 0x1FFF;
+}
+
+GRL_INLINE bool LocalBVH2_IsInnerNode( local struct LocalBVH2* tree, ushort nodeID )
+{
+ return (tree->nodes[nodeID] & 0x80000000) != 0;
+}
+
+
+GRL_INLINE ushort2 LocalBVH2_GetChildIndices( local struct LocalBVH2* tree, ushort nodeID )
+{
+ ushort idx = ((tree->nodes[nodeID] >> 9) & 0x1FF);
+ return (ushort2)(idx, idx + 1);
+}
+
+GRL_INLINE ushort LocalBVH2_GetSubtreePrimCount( local struct LocalBVH2* tree, ushort node )
+{
+ return tree->nodes[node] & 0x1FF;
+}
+
+GRL_INLINE ushort LocalBVH2_GetLeafPrimStart( local struct LocalBVH2* tree, ushort node )
+{
+ return ((tree->nodes[node] >> 9) & 0x1FF);
+}
+
+
+GRL_INLINE void LocalBVH2_Printf( local struct LocalBVH2* tree )
+{
+ barrier( CLK_LOCAL_MEM_FENCE );
+
+ if ( get_local_id( 0 ) == 0 )
+ {
+ printf( "Nodes: %u\n", tree->num_nodes );
+
+ for ( uint i = 0; i < tree->num_nodes; i++ )
+ {
+ uint num_prims = LocalBVH2_GetSubtreePrimCount( tree, i );
+ printf( "%3u : 0x%08x %3u 0x%04x ", i, tree->nodes[i], num_prims, LocalBVH2_GetNodeArea(tree,i) );
+ if ( LocalBVH2_IsInnerNode( tree, i ) )
+ {
+ ushort2 kids = LocalBVH2_GetChildIndices( tree, i );
+ printf( " INNER ( %3u %3u )\n", kids.x, kids.y );
+ }
+ else
+ {
+ printf( " LEAF {" );
+ for ( uint j = 0; j < num_prims; j++ )
+ printf( " %3u ", LocalBVH2_GetLeafPrimStart( tree, i ) + j );
+ printf( "}\n" );
+ }
+ }
+ }
+
+ barrier( CLK_LOCAL_MEM_FENCE );
+}
+
+struct FlatTreeInnerNode
+{
+    uint DW0;  // lower 16b are the index of the corresponding LocalBVH2 node.  Bits 30:16 hold a live-child mask cleared atomically during refit.  Bit 31 is a leaf marker
+ ushort parent_index;
+ ushort first_child;
+ uchar index_in_parent;
+ uchar num_children;
+
+ //struct DFSPrimRefAABB AABB;
+};
+
+struct FlatTree
+{
+ uint num_nodes;
+ uint qnode_byte_offset; // byte offset from the BVHBase to the flat-tree's first QNode
+ uint qnode_base_index;
+
+ struct FlatTreeInnerNode nodes[DFS_MAX_FLATTREE_NODES];
+ uchar primref_back_pointers[DFS_WG_SIZE];
+};
+
+GRL_INLINE void FlatTree_Printf( local struct FlatTree* flat_tree )
+{
+ barrier( CLK_LOCAL_MEM_FENCE );
+ if ( get_local_id( 0 ) == 0 )
+ {
+ printf( "NumNodes: %u\n", flat_tree->num_nodes );
+ for ( uint i = 0; i < flat_tree->num_nodes; i++ )
+ {
+ ushort bvh2_node = flat_tree->nodes[i].DW0 & 0xffff;
+ printf( "%2u Parent: %2u Index_in_parent: %u, NumKids: %u FirstKid: %3u bvh2: %3u DW0: 0x%x\n",
+ i,
+ flat_tree->nodes[i].parent_index,
+ flat_tree->nodes[i].index_in_parent,
+ flat_tree->nodes[i].num_children,
+ flat_tree->nodes[i].first_child,
+ bvh2_node,
+ flat_tree->nodes[i].DW0 );
+ }
+ }
+ barrier( CLK_LOCAL_MEM_FENCE );
+}
+
+
+
+
+GRL_INLINE ushort FlatTree_GetNodeCount( local struct FlatTree* flat_tree )
+{
+ return flat_tree->num_nodes;
+}
+
+GRL_INLINE uint FlatTree_GetParentIndex( local struct FlatTree* flat_tree, ushort id )
+{
+ return flat_tree->nodes[id].parent_index;
+}
+
+GRL_INLINE ushort FlatTree_GetBVH2Root( local struct FlatTree* flat_tree, ushort node_index )
+{
+ return (flat_tree->nodes[node_index].DW0) & 0xffff;
+}
+
+GRL_INLINE ushort FlatTree_GetNumChildren( local struct FlatTree* flat_tree, ushort node_index )
+{
+ return flat_tree->nodes[node_index].num_children;
+}
+
+GRL_INLINE bool FlatTree_IsLeafNode( local struct FlatTree* flat_tree, ushort node_index )
+{
+ return (flat_tree->nodes[node_index].DW0 & 0x80000000) != 0;
+}
+
+
+GRL_INLINE uint FlatTree_GetQNodeByteOffset( struct FlatTree* flat_tree, ushort node_index )
+{
+ return flat_tree->qnode_byte_offset + node_index * sizeof(struct QBVHNodeN);
+}
+
+GRL_INLINE uint FlatTree_GetQNodeIndex( struct FlatTree* flat_tree, ushort node_index )
+{
+ return flat_tree->qnode_base_index + node_index;
+}
+
+GRL_INLINE void FlatTree_AllocateQNodes( struct FlatTree* flat_tree, struct DFSArgs args )
+{
+ uint node_base = 64*allocate_inner_nodes( args.bvh_base, flat_tree->num_nodes );
+ flat_tree->qnode_base_index = (node_base - BVH_ROOT_NODE_OFFSET) / sizeof( struct QBVHNodeN );
+ flat_tree->qnode_byte_offset = node_base;
+}
+
+GRL_INLINE ushort FlatTree_GetFirstChild( struct FlatTree* flat_tree, ushort node_index )
+{
+ return flat_tree->nodes[node_index].first_child;
+}
+
+GRL_INLINE ushort FlatTree_GetPrimRefStart( struct FlatTree* flat_tree, ushort node_index )
+{
+ return flat_tree->nodes[node_index].first_child;
+}
+GRL_INLINE ushort FlatTree_GetPrimRefCount( struct FlatTree* flat_tree, ushort node_index )
+{
+ return flat_tree->nodes[node_index].num_children;
+}
+
+GRL_INLINE uint FlatTree_BuildBackPointer( local struct FlatTree* flat_tree, ushort node_index )
+{
+ uint parent_index = flat_tree->nodes[node_index].parent_index + flat_tree->qnode_base_index;
+ parent_index = (parent_index << 6) | (FlatTree_GetNumChildren( flat_tree, node_index ) << 3);
+ return parent_index;
+}
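+
+// Backpointer layout recap (matches the function above): bits 31:6 hold the parent QNode
+// index and bits 5:3 the child count; bits 2:0 are left at zero here.
+// Illustrative example: parent index 10 with 6 children packs to (10<<6)|(6<<3) = 0x2B0.
+// Elsewhere in these builders the root's backpointer uses the sentinel 0xffffffc0 in
+// place of a parent index.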
+
+
+GRL_INLINE void SUBGROUP_FlatTree_Initialize( uniform local struct FlatTree* flat_tree, struct DFSArgs args )
+{
+ if ( get_sub_group_local_id() == 0 )
+ {
+ flat_tree->num_nodes = 1;
+ flat_tree->nodes[0].DW0 = 0; // point first node at BVH2 root node, which is assumed to be at index zero
+ }
+
+}
+/*
+GRL_INLINE void SUBGROUP_FlatTree_ReduceAndSetAABB( uniform local struct FlatTree* flat_tree,
+ uniform ushort node_index,
+ varying local struct DFSPrimRefAABB* box )
+{
+ // TODO_OPT: Replace this with an optimized reduction which exploits the fact that we only ever have 6 active lanes
+ // Try using the "negated max" trick here to compute min/max simultaneously, with max in top 6 lanes
+ // This will replace 6 reductions with 3
+
+ // TODO_OPT: This only utilizes up to 6 SIMD lanes. We can use up to 12 of them by putting
+ // min into even lanes, and -max into odd lanes, and using a manual min-reduction on pairs of lanes
+
+ struct DFSPrimRefAABB bb = DFSPrimRefAABB_sub_group_reduce( box );
+ if( get_sub_group_local_id() )
+ flat_tree->nodes[node_index].AABB = bb;
+}
+*/
+
+GRL_INLINE void SUBGROUP_FlatTree_CreateInnerNode( uniform local struct FlatTree* flat_tree,
+ uniform ushort flat_tree_root,
+ varying ushort sg_child_bvh2_root,
+ uniform ushort num_children )
+{
+ uniform uint lane = get_sub_group_local_id();
+
+ // increment counter to allocate new nodes.. set required root node fields
+ uniform uint child_base;
+ if ( lane == 0 )
+ {
+ child_base = atomic_add_local( &flat_tree->num_nodes, num_children );
+ flat_tree->nodes[flat_tree_root].first_child = (uchar) child_base;
+ flat_tree->nodes[flat_tree_root].num_children = num_children;
+
+ // initialize mask bits for this node's live children
+ uint child_mask = ((1 << num_children) - 1) << 16;
+ flat_tree->nodes[flat_tree_root].DW0 |= child_mask;
+ }
+
+ child_base = sub_group_broadcast( child_base, 0 );
+
+ // initialize child nodes
+ if ( lane < num_children )
+ {
+ varying uint child = child_base + lane;
+ flat_tree->nodes[child].DW0 = sg_child_bvh2_root;
+ flat_tree->nodes[child].index_in_parent = lane;
+ flat_tree->nodes[child].parent_index = flat_tree_root;
+ }
+
+}
+
+
+
+GRL_INLINE void SUBGROUP_FlatTree_CreateLeafNode( uniform local struct FlatTree* flat_tree,
+ uniform ushort flat_tree_root,
+ uniform ushort primref_start,
+ uniform ushort num_prims )
+{
+ ushort lane = get_sub_group_local_id();
+ if ( lane < num_prims )
+ {
+ flat_tree->primref_back_pointers[primref_start + lane] = (uchar) flat_tree_root;
+ if ( lane == 0 )
+ {
+ flat_tree->nodes[flat_tree_root].first_child = (uchar) primref_start;
+ flat_tree->nodes[flat_tree_root].num_children = (uchar) num_prims;
+ flat_tree->nodes[flat_tree_root].DW0 |= 0x80000000;
+ }
+ }
+}
+
+
+GRL_INLINE uniform bool SUBGROUP_FlatTree_SignalRefitComplete( uniform local struct FlatTree* flat_tree, uniform ushort* p_node_index )
+{
+ uniform ushort node_index = *p_node_index;
+ uniform ushort parent = flat_tree->nodes[node_index].parent_index;
+ uniform ushort index_in_parent = flat_tree->nodes[node_index].index_in_parent;
+
+ // clear the corresponding mask bit in the parent node
+ uniform uint child_mask = (0x10000 << index_in_parent);
+ uniform uint old_mask_bits = 0;
+ if( get_sub_group_local_id() == 0 )
+ old_mask_bits = atomic_xor( &flat_tree->nodes[parent].DW0, child_mask );
+
+ old_mask_bits = sub_group_broadcast( old_mask_bits, 0 );
+
+ // if we cleared the last mask bit, this subgroup proceeds up the tree and refits the next node
+ // otherwise, it looks for something else to do
+ if ( ((old_mask_bits^child_mask) & 0xffff0000) == 0 )
+ {
+ *p_node_index = parent;
+ return true;
+ }
+
+ return false;
+}
+
+/*
+GRL_INLINE local struct DFSPrimRefAABB* FlatTree_GetChildAABB( local struct FlatTree* flat_tree,
+ local struct PrimRefSet* prim_refs,
+ ushort node_index, ushort child_index )
+{
+ ushort child_id = FlatTree_GetFirstChild( flat_tree, node_index ) + child_index;
+
+ if( !FlatTree_IsLeafNode( flat_tree, node_index ) )
+ return &flat_tree->nodes[child_id].AABB;
+ else
+ return PrimRefSet_GetAABBPointer( prim_refs, child_id );
+}
+*/
+GRL_INLINE uint FlatTree_GetPrimRefBackPointer( local struct FlatTree* flat_tree, ushort primref_index )
+{
+ return flat_tree->primref_back_pointers[primref_index] * sizeof(struct QBVHNodeN) + flat_tree->qnode_byte_offset;
+}
+
+
+GRL_INLINE void FlatTree_check_boxes(local struct FlatTree* flat_tree,
+ global struct AABB* primref_buffer,
+ local struct AABB3f* boxes,
+ local struct PrimRefMeta* meta )
+
+{
+ barrier(CLK_LOCAL_MEM_FENCE);
+ if (get_local_id(0) == 0)
+ {
+ printf("checking flattree bounds...\n");
+
+ for (uint i = 0; i < flat_tree->num_nodes; i++)
+ {
+ struct AABB rb;
+ rb.lower.xyz = AABB3f_load_lower(&boxes[i]);
+ rb.upper.xyz = AABB3f_load_upper(&boxes[i]);
+
+ uint offs = FlatTree_GetFirstChild( flat_tree, i );
+ uint count = FlatTree_GetNumChildren( flat_tree, i );
+
+ for (uint c = 0; c < count; c++)
+ {
+ struct AABB lb;
+ if (FlatTree_IsLeafNode( flat_tree, i ))
+ {
+ lb = primref_buffer[ PrimRefMeta_GetInputIndex( &meta[offs+c] ) ];
+ }
+ else
+ {
+ lb.lower.xyz = AABB3f_load_lower(&boxes[ offs+c ]);
+ lb.upper.xyz = AABB3f_load_upper(&boxes[ offs+c ]);
+ }
+
+ if( !AABB_subset( &lb, &rb ) )
+ printf("Bad bounds!! child %u of %u %f : %f %f : %f %f : %f %f : %f %f : %f %f : %f \n",
+ c, i ,
+ rb.lower.x, rb.upper.x, rb.lower.y, rb.upper.y, rb.lower.z, rb.upper.z,
+ lb.lower.x, lb.upper.x, lb.lower.y, lb.upper.y, lb.lower.z, lb.upper.z
+ );
+ }
+ }
+ }
+ barrier(CLK_LOCAL_MEM_FENCE);
+}
+
+
+struct FlatTreeScheduler
+{
+ int num_leafs;
+ uint writeout_produce_count;
+ uint writeout_consume_count;
+ uint active_subgroups;
+ uint num_built_nodes;
+ uint num_levels; // number of depth levels in the tree
+
+ //uchar leaf_indices[DFS_MAX_FLATTREE_LEAFS]; // indices of leaf FlatTree nodes to be refitted
+ //uchar writeout_indices[DFS_MAX_FLATTREE_NODES]; // indices of flattree nodes to be written out or collapsed
+
+ ushort level_ordered_nodes[DFS_MAX_FLATTREE_NODES]; // node indices sorted by depth (pre-order, high depth before low depth)
+ ushort level_start[DFS_MAX_FLATTREE_DEPTH]; // first node at given level in the level-ordered node array
+ uint level_count[DFS_MAX_FLATTREE_DEPTH]; // number of nodes at given level
+};
+
+GRL_INLINE void SUBGROUP_FlatTreeScheduler_Initialize( uniform local struct FlatTreeScheduler* scheduler )
+{
+ scheduler->num_built_nodes = 0;
+ scheduler->num_leafs = 0;
+ scheduler->writeout_produce_count = 0;
+ scheduler->writeout_consume_count = 0;
+ scheduler->active_subgroups = DFS_NUM_SUBGROUPS;
+}
+/*
+GRL_INLINE void SUBGROUP_FlatTreeScheduler_QueueLeafForRefit( uniform local struct FlatTreeScheduler* scheduler,
+ uniform ushort leaf )
+{
+ if ( get_sub_group_local_id() == 0 )
+ scheduler->leaf_indices[atomic_inc( &scheduler->num_leafs )] = leaf;
+}*/
+
+GRL_INLINE void SUBGROUP_FlatTreeScheduler_SignalNodeBuilt( uniform local struct FlatTreeScheduler* scheduler, uniform ushort node )
+{
+ if ( get_sub_group_local_id() == 0 )
+ atomic_inc_local( &scheduler->num_built_nodes );
+}
+
+GRL_INLINE uint FlatTreeScheduler_GetNumBuiltNodes( uniform local struct FlatTreeScheduler* scheduler )
+{
+ return scheduler->num_built_nodes;
+}
+
+/*
+GRL_INLINE void SUBGROUP_FlatTreeScheduler_QueueNodeForWriteOut( uniform local struct FlatTreeScheduler* scheduler, uniform ushort node )
+{
+ if ( get_sub_group_local_id() == 0 )
+ scheduler->writeout_indices[atomic_inc( &scheduler->writeout_produce_count )] = node;
+}*/
+
+/*
+GRL_INLINE bool SUBGROUP_FlatTreeScheduler_GetRefitTask( uniform local struct FlatTreeScheduler* scheduler, uniform ushort* leaf_idx )
+{
+ // schedule the leaves in reverse order to ensure that later leaves
+ // complete before earlier ones.. This prevents contention during the WriteOut stage
+ //
+ // There is a barrier between this function and 'QueueLeafForRefit' so we can safely decrement the same counter
+ // that we incremented earlier
+ varying int idx = 0;
+ if( get_sub_group_local_id() == 0 )
+ idx = atomic_dec( &scheduler->num_leafs );
+
+ sub_group_barrier( CLK_LOCAL_MEM_FENCE );
+ idx = sub_group_broadcast( idx, 0 );
+
+ if ( idx <= 0 )
+ return false;
+
+ *leaf_idx = scheduler->leaf_indices[idx-1];
+ return true;
+}*/
+
+/*
+// Signal the scheduler that a subgroup has reached the DONE state.
+// Return true if this is the last subgroup to be done
+void SUBGROUP_FlatTreeScheduler_SubGroupDone( local struct FlatTreeScheduler* scheduler )
+{
+ if ( get_sub_group_local_id() == 0 )
+ atomic_dec( &scheduler->active_subgroups );
+}
+*/
+
+/*
+
+#define STATE_SCHEDULE_REFIT 0x1234
+#define STATE_SCHEDULE_WRITEOUT 0x5679
+#define STATE_REFIT 0xabcd
+#define STATE_WRITEOUT 0xefef
+#define STATE_DONE 0xaabb
+
+// Get a flattree node to write out. Returns the new scheduler state
+GRL_INLINE ushort SUBGROUP_FlatTreeScheduler_GetWriteOutTask( uniform local struct FlatTreeScheduler* scheduler,
+ uniform ushort num_nodes,
+ uniform ushort* node_idx )
+{
+ uniform ushort return_state = STATE_WRITEOUT;
+ uniform ushort idx = 0;
+ if ( get_sub_group_local_id() == 0 )
+ {
+ idx = atomic_inc( &scheduler->writeout_consume_count );
+
+ if ( idx >= scheduler->writeout_produce_count )
+ {
+ // more consumers than there are produced tasks....
+
+ if ( scheduler->writeout_produce_count == num_nodes )
+ {
+ // if all nodes have been written out, flattening is done
+ return_state = STATE_DONE;
+ }
+ else
+ {
+ // some writeout tasks remain, and have not been produced by refit threads yet
+ // we need to put this one back
+ atomic_dec( &scheduler->writeout_consume_count );
+ return_state = STATE_SCHEDULE_WRITEOUT;
+ }
+ }
+ else
+ {
+ // scheduled successfully
+ idx = scheduler->writeout_indices[idx];
+ }
+ }
+
+ *node_idx = sub_group_broadcast( idx, 0 );
+ return sub_group_broadcast( return_state, 0 );
+
+}
+*/
+
+
+/*
+GRL_INLINE void FlatTreeScheduler_Printf( local struct FlatTreeScheduler* scheduler )
+{
+ barrier( CLK_LOCAL_MEM_FENCE );
+
+ if ( get_local_id( 0 ) == 0 )
+ {
+ printf( "***SCHEDULER***\n" );
+ printf( "built_nodes=%u active_sgs=%u leafs=%u wo_p=%u wo_c=%u\n", scheduler->num_built_nodes, scheduler->active_subgroups, scheduler->num_leafs,
+ scheduler->writeout_produce_count, scheduler->writeout_consume_count );
+ printf( "leafs for refit: {" );
+
+ int nleaf = max( scheduler->num_leafs, 0 );
+
+ for ( uint i = 0; i < nleaf; i++ )
+ printf( "%u ", scheduler->leaf_indices[i] );
+ printf( "}\n" );
+
+ printf( "writeout queue: %u:%u {", scheduler->writeout_produce_count, scheduler->writeout_consume_count );
+ for ( uint i = 0; i < scheduler->writeout_produce_count; i++ )
+ printf( "%u ", scheduler->writeout_indices[i] );
+ printf( "}\n" );
+ }
+
+ barrier( CLK_LOCAL_MEM_FENCE );
+
+}
+*/
+
+
+GRL_INLINE void SUBGROUP_BuildFlatTreeNode( local struct LocalBVH2* bvh2,
+ local struct FlatTree* flat_tree,
+ local struct FlatTreeScheduler* scheduler,
+ uniform ushort flat_tree_root )
+{
+ varying ushort lane = get_sub_group_local_id();
+ varying ushort bvh2_root = FlatTree_GetBVH2Root( flat_tree, flat_tree_root );
+
+ if ( !LocalBVH2_IsInnerNode( bvh2, bvh2_root ) )
+ {
+ uniform ushort num_prims = LocalBVH2_GetSubtreePrimCount( bvh2, bvh2_root );
+ uniform ushort primref_start = LocalBVH2_GetLeafPrimStart( bvh2, bvh2_root );
+
+ SUBGROUP_FlatTree_CreateLeafNode( flat_tree, flat_tree_root, primref_start, num_prims );
+ }
+ else
+ {
+ // collapse BVH2 into BVH6.
+ // We will spread the root node's children across the subgroup, and keep adding SIMD lanes until we have enough
+ uniform ushort num_children = 2;
+
+ uniform ushort2 kids = LocalBVH2_GetChildIndices( bvh2, bvh2_root );
+ varying ushort sg_bvh2_node = kids.x;
+ if ( lane == 1 )
+ sg_bvh2_node = kids.y;
+
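+        // Greedy collapse sketch: lanes 0..num_children-1 each hold one BVH2 child of this
+        // flat-tree node. Each iteration picks the largest-area inner child, replaces it in
+        // place with its left child, and appends its right child in a new lane, so at most
+        // TREE_ARITY-2 = 4 iterations are needed (fewer if only leaf children remain).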
+ do
+ {
+ // choose the inner node with maximum area to replace.
+ // Its left child goes in its old location. Its right child goes in a new lane
+
+ varying ushort sg_area = LocalBVH2_GetNodeArea( bvh2, sg_bvh2_node );
+ varying bool sg_is_inner = LocalBVH2_IsInnerNode( bvh2, sg_bvh2_node );
+ sg_area = (sg_is_inner && lane < num_children) ? sg_area : 0; // prevent early exit if the largest child is a leaf
+
+ uniform ushort max_area = sub_group_reduce_max( sg_area );
+ varying bool sg_reducable = max_area == sg_area && (lane < num_children) && sg_is_inner;
+ uniform uint mask = intel_sub_group_ballot( sg_reducable );
+
+            // TODO_OPT: Some of these ops seem redundant; look at trimming further.
+            // TODO_OPT: sub_group_reduce_max results in too many instructions. Unroll the loop and
+            //    specialize it, or ask IGC for a version that declares a static maximum number of subgroups to use.
+
+ if ( mask == 0 )
+ break;
+
+ // choose the inner node with maximum area to replace
+ uniform ushort victim_child = ctz( mask );
+ uniform ushort victim_node = sub_group_broadcast( sg_bvh2_node, victim_child );
+ uniform ushort2 kids = LocalBVH2_GetChildIndices( bvh2, victim_node );
+
+ if ( lane == victim_child )
+ sg_bvh2_node = kids.x;
+ else if ( lane == num_children )
+ sg_bvh2_node = kids.y;
+
+
+ num_children++;
+
+
+ }while ( num_children < TREE_ARITY );
+
+ SUBGROUP_FlatTree_CreateInnerNode( flat_tree, flat_tree_root, sg_bvh2_node, num_children );
+ }
+
+}
+
+
+GRL_INLINE void SUBGROUP_DFS_BuildFlatTree( uniform local struct LocalBVH2* bvh2,
+ uniform local struct FlatTree* flat_tree,
+ uniform local struct FlatTreeScheduler* scheduler
+ )
+{
+
+ uniform ushort flat_tree_node_index = get_sub_group_id();
+ uniform ushort num_nodes = 1;
+ uniform ushort num_built = 0;
+
+ uint tid = get_local_id(0);
+ if (tid < DFS_MAX_FLATTREE_DEPTH)
+ {
+ scheduler->level_start[tid] = DFS_MAX_FLATTREE_NODES;
+ scheduler->level_count[tid] = 0;
+ scheduler->num_levels = 0;
+ }
+
+ LOOP_TRIPWIRE_INIT;
+
+ do
+ {
+ // process one flat tree node per sub group, as many as are available
+ //
+        // The first pass will only run one sub-group, the second up to 6, the third up to 36, and so on.
+        // Nodes are processed in breadth-first order, but they are not guaranteed to be stored in this
+        // order due to the use of atomic counters for node allocation.
+ //
+ if ( flat_tree_node_index < num_nodes )
+ {
+ SUBGROUP_BuildFlatTreeNode( bvh2, flat_tree, scheduler, flat_tree_node_index );
+ SUBGROUP_FlatTreeScheduler_SignalNodeBuilt( scheduler, flat_tree_node_index );
+ flat_tree_node_index += get_num_sub_groups();
+ }
+
+ barrier( CLK_LOCAL_MEM_FENCE );
+
+ // bump up the node count if new nodes were created
+ // stop as soon as all flattree nodes have been processed
+ num_nodes = FlatTree_GetNodeCount( flat_tree );
+ num_built = FlatTreeScheduler_GetNumBuiltNodes( scheduler );
+
+ barrier( CLK_LOCAL_MEM_FENCE );
+
+ LOOP_TRIPWIRE_INCREMENT( 300 );
+
+ } while ( num_built < num_nodes );
+
+ barrier( CLK_LOCAL_MEM_FENCE );
+
+
+ // determine depth of each node, compute node ranges and counts for each depth level,
+ // and prepare a depth-ordered node index array
+ uint depth = 0;
+ uint level_pos = 0;
+ for( uint i=tid; i<num_nodes; i += get_local_size(0) )
+ {
+ // compute depth of this node
+ uint node_index = i;
+ while ( node_index != 0 )
+ {
+ node_index = FlatTree_GetParentIndex( flat_tree, node_index );
+ depth++;
+ }
+
+        // assign this node a position within its depth level
+ level_pos = atomic_inc_local( &scheduler->level_count[depth] );
+
+ // compute total number of levels
+ atomic_max_local( &scheduler->num_levels, depth+1 );
+ }
+
+ barrier( CLK_LOCAL_MEM_FENCE );
+
+ for( uint i=tid; i<num_nodes; i += get_local_size(0) )
+ {
+ // prefix-sum level start positions. Re-computed for each thread
+ // TODO: Hierarchical reduction ??
+ uint level_start=0;
+ for( uint d=0; d<depth; d++ )
+ level_start += scheduler->level_count[d];
+
+ scheduler->level_start[depth] = level_start;
+
+ // scatter node indices into level-ordered node array
+ scheduler->level_ordered_nodes[level_start + level_pos] = tid;
+ }
+
+ barrier( CLK_LOCAL_MEM_FENCE );
+
+}
+
+/*
+GRL_INLINE bool SUBGROUP_RefitNode( uniform local struct FlatTree* flat_tree,
+ uniform local struct PrimRefSet* prim_refs,
+ uniform ushort* p_node_index )
+{
+
+ // fetch and reduce child AABBs across the subgroup
+ uniform ushort node_index = *p_node_index;
+ uniform ushort num_kids = FlatTree_GetNumChildren( flat_tree, node_index );
+ varying ushort sg_child_index = (get_sub_group_local_id() < num_kids) ? get_sub_group_local_id() : 0;
+
+ varying local struct DFSPrimRefAABB* box = FlatTree_GetChildAABB( flat_tree, prim_refs, node_index, sg_child_index );
+
+ SUBGROUP_FlatTree_ReduceAndSetAABB( flat_tree, node_index, box );
+
+ if ( node_index == 0 )
+ return false; // if we just refitted the root, we can stop now
+
+ // signal the parent node that this node was refitted. If this was the last child to be refitted
+ // returns true and sets 'node_index' to the parent node, so that this thread can continue refitting
+ return SUBGROUP_FlatTree_SignalRefitComplete( flat_tree, p_node_index );
+}*/
+
+GRL_INLINE struct QBVHNodeN* qnode_ptr( BVHBase* bvh_mem, uint byte_offset )
+{
+ return (struct QBVHNodeN*)(((char*)bvh_mem) + byte_offset);
+}
+
+GRL_INLINE void SUBGROUP_WriteQBVHNode(
+ uniform local struct FlatTree* flat_tree,
+ uniform local struct PrimRefMeta* primref_meta,
+ uniform local struct AABB3f* boxes,
+ uniform ushort flat_tree_root,
+ uniform struct DFSArgs args,
+ uniform local uchar* masks
+ )
+{
+
+
+ uniform ushort num_children = FlatTree_GetNumChildren( flat_tree, flat_tree_root );
+ uniform bool is_leaf = FlatTree_IsLeafNode( flat_tree, flat_tree_root );
+
+ varying ushort lane = get_sub_group_local_id();
+ varying ushort sg_child_index = (lane < num_children) ? lane : 0;
+
+ uniform ushort child_base = FlatTree_GetFirstChild( flat_tree, flat_tree_root );
+
+ varying struct AABB sg_box4;
+ if (FlatTree_IsLeafNode( flat_tree, flat_tree_root ))
+ {
+ // fetch AABBs for primrefs
+ sg_box4 = args.primref_buffer[ PrimRefMeta_GetInputIndex( &primref_meta[child_base + sg_child_index] ) ];
+
+ }
+ else
+ {
+ // fetch AABBs for child nodes
+ sg_box4.lower.xyz = AABB3f_load_lower( &boxes[child_base+sg_child_index] );
+ sg_box4.upper.xyz = AABB3f_load_upper( &boxes[child_base+sg_child_index] );
+ }
+
+
+ struct QBVHNodeN* qnode = qnode_ptr( args.bvh_base, FlatTree_GetQNodeByteOffset( flat_tree, flat_tree_root ) );
+
+ uniform int offset;
+ uniform uint child_type;
+ if ( is_leaf )
+ {
+ char* leaf_mem = (char*)BVHBase_GetQuadLeaves( args.bvh_base );
+
+ leaf_mem += ( FlatTree_GetPrimRefStart( flat_tree, flat_tree_root )) * args.leaf_size_in_bytes;
+
+ offset = (int)(leaf_mem - (char*)qnode);
+ child_type = args.leaf_node_type;
+ }
+ else
+ {
+ struct QBVHNodeN* kid = qnode_ptr( args.bvh_base, FlatTree_GetQNodeByteOffset( flat_tree, FlatTree_GetFirstChild( flat_tree, flat_tree_root ) ) );
+ offset = (int) ((char*)kid - (char*)qnode);
+ child_type = args.inner_node_type;
+ }
+ offset = offset >> 6;
+
+ if (child_type == NODE_TYPE_INSTANCE)
+ {
+ uint instanceMask = PrimRefMeta_GetInstanceMask( &primref_meta[child_base + sg_child_index] );
+ subgroup_setInstanceQBVHNodeN( offset, &sg_box4, num_children, qnode, lane < num_children ? instanceMask : 0 );
+ }
+ else
+ {
+ uint mask = BVH_NODE_DEFAULT_MASK;
+ if( args.need_masks )
+ mask = masks[flat_tree_root];
+
+ subgroup_setQBVHNodeN( offset, child_type, &sg_box4, num_children, qnode, mask );
+ }
+
+ if ( args.need_backpointers )
+ {
+ global uint* back_pointers = (global uint*) BVHBase_GetBackPointers( args.bvh_base );
+ uint idx = FlatTree_GetQNodeIndex( flat_tree, flat_tree_root );
+ uint bp = FlatTree_BuildBackPointer( flat_tree, flat_tree_root );
+ back_pointers[idx] = bp;
+ }
+
+ /*
+ // TODO_OPT: Eventually this section should also handle leaf splitting due to mixed primref types
+ // For now this is done by the leaf creation pipeline, but that path should probably be refactored
+ // such that all inner node creation is done in one place
+
+ uniform ushort num_children = FlatTree_GetNumChildren( flat_tree, flat_tree_root );
+ uniform bool is_leaf = FlatTree_IsLeafNode( flat_tree, flat_tree_root );
+
+ varying ushort lane = get_sub_group_local_id();
+ varying ushort sg_child_index = (lane < num_children) ? lane : 0;
+
+ varying local struct DFSPrimRefAABB* sg_box = FlatTree_GetChildAABB( flat_tree, prim_refs, flat_tree_root, sg_child_index );
+
+ varying struct AABB sg_box4 = PrimRefSet_ConvertAABB( prim_refs, sg_box );
+
+ struct QBVHNodeN* qnode = qnode_ptr( args.bvh_base, FlatTree_GetQNodeByteOffset( flat_tree, flat_tree_root ) );
+
+ uniform int offset;
+ uniform uint child_type;
+ if ( is_leaf )
+ {
+ char* leaf_mem = (char*)BVHBase_GetQuadLeaves( args.bvh_base );
+
+ leaf_mem += ( FlatTree_GetPrimRefStart( flat_tree, flat_tree_root )) * args.leaf_size_in_bytes;
+
+ offset = (int)(leaf_mem - (char*)qnode);
+ child_type = args.leaf_node_type;
+ }
+ else
+ {
+ struct QBVHNodeN* kid = qnode_ptr( args.bvh_base, FlatTree_GetQNodeByteOffset( flat_tree, FlatTree_GetFirstChild( flat_tree, flat_tree_root ) ) );
+ offset = (int) ((char*)kid - (char*)qnode);
+ child_type = args.inner_node_type;
+ }
+ offset = offset >> 6;
+
+ if (child_type == NODE_TYPE_INSTANCE)
+ {
+ uint instanceMask = PrimRefSet_GetInstanceMask( prim_refs, FlatTree_GetPrimRefStart(flat_tree, flat_tree_root) + lane );
+ subgroup_setInstanceQBVHNodeN( offset, &sg_box4, num_children, qnode, lane < num_children ? instanceMask : 0 );
+ }
+ else
+ subgroup_setQBVHNodeN( offset, child_type, &sg_box4, num_children, qnode );
+
+ if ( args.need_backpointers )
+ {
+ global uint* back_pointers = (global uint*) BVHBase_GetBackPointers( args.bvh_base );
+ uint idx = FlatTree_GetQNodeIndex( flat_tree, flat_tree_root );
+ uint bp = FlatTree_BuildBackPointer( flat_tree, flat_tree_root );
+ back_pointers[idx] = bp;
+ }
+ */
+}
+
+/*
+GRL_INLINE void SUBGROUP_DFS_RefitAndWriteOutFlatTree(
+ uniform local struct FlatTree* flat_tree,
+ uniform local struct PrimRefSet* prim_refs,
+ uniform local struct FlatTreeScheduler* scheduler,
+ uniform struct DFSArgs args)
+{
+
+ uniform ushort state = STATE_SCHEDULE_REFIT;
+ uniform ushort node_index = 0;
+ uniform ushort num_nodes = FlatTree_GetNodeCount(flat_tree);
+
+ {
+ LOOP_TRIPWIRE_INIT;
+
+ bool active = true;
+ bool continue_refit = false;
+ while (1)
+ {
+ if (active)
+ {
+ if (continue_refit || SUBGROUP_FlatTreeScheduler_GetRefitTask(scheduler, &node_index))
+ {
+ continue_refit = SUBGROUP_RefitNode(flat_tree, prim_refs, &node_index);
+ }
+ else
+ {
+ active = false;
+ if (get_sub_group_local_id() == 0)
+ atomic_dec(&scheduler->active_subgroups);
+
+ sub_group_barrier(CLK_LOCAL_MEM_FENCE);
+ }
+ }
+
+ barrier(CLK_LOCAL_MEM_FENCE); // finish all atomics
+ if (scheduler->active_subgroups == 0)
+ break;
+ barrier(CLK_LOCAL_MEM_FENCE); // finish all checks.. prevent race between thread which loops around and thread which doesn't
+
+ LOOP_TRIPWIRE_INCREMENT(200);
+ }
+ }
+
+ for (uint i = get_sub_group_id(); i < num_nodes; i += get_num_sub_groups())
+ SUBGROUP_WriteQBVHInnerNodes(flat_tree, prim_refs, i, args);
+
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+
+ // JDB: Version below attempts to interleave refit and qnode write-out
+ // This could theoretically reduce thread idle time, but it is more complex and does more atomics for scheduling
+
+#if 0
+ // after we've constructed the flat tree (phase 1), there are two things that need to happen:
+    // PHASE 2: Refit the flat tree, computing all of the node AABBs
+ // PHASE 3: Write the nodes out to memory
+ //
+ // all of this is sub-group centric. Different subgroups can execute phases 2 and 3 concurrently
+ //
+
+ // TODO_OPT: The scheduling algorithm might need to be re-thought.
+ // Fused EUs are very hard to reason about. It's possible that by scheduling independent
+ // SGs in this way we would lose a lot of performance due to fused EU serialization.
+ // Needs to be tested experimentally if such a thing is possible
+
+ uniform ushort state = STATE_SCHEDULE_REFIT;
+ uniform ushort node_index = 0;
+ uniform ushort num_nodes = FlatTree_GetNodeCount(flat_tree);
+
+ LOOP_TRIPWIRE_INIT;
+
+ do
+ {
+ // barrier necessary to protect access to scheduler->active_subgroups
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ if (state == STATE_SCHEDULE_REFIT)
+ {
+ if (SUBGROUP_FlatTreeScheduler_GetRefitTask(scheduler, &node_index))
+ state = STATE_REFIT;
+ else
+ state = STATE_SCHEDULE_WRITEOUT; // fallthrough
+ }
+ if (state == STATE_SCHEDULE_WRITEOUT)
+ {
+ state = SUBGROUP_FlatTreeScheduler_GetWriteOutTask(scheduler, num_nodes, &node_index);
+ if (state == STATE_DONE)
+ SUBGROUP_FlatTreeScheduler_SubGroupDone(scheduler);
+ }
+
+
+ // A barrier is necessary to ensure that 'QueueNodeForWriteOut' is synchronized with 'GetWriteOutTask'
+ // Note that in theory we could have the write-out tasks spin until the refit tasks clear, which would make this barrier unnecessary
+ // However, we cannot do this safely on SKUs which do not support independent subgroup forward progress.
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ if (state == STATE_REFIT)
+ {
+ uniform ushort prev_node = node_index;
+ uniform bool continue_refit = SUBGROUP_RefitNode(flat_tree, prim_refs, &node_index);
+
+ SUBGROUP_FlatTreeScheduler_QueueNodeForWriteOut(scheduler, prev_node);
+
+ if (!continue_refit)
+ state = STATE_SCHEDULE_REFIT;
+ }
+ else if (state == STATE_WRITEOUT)
+ {
+ SUBGROUP_WriteQBVHInnerNodes(flat_tree, prim_refs, node_index, args);
+ state = STATE_SCHEDULE_WRITEOUT;
+ }
+ // A barrier is necessary to ensure that 'QueueNodeForWriteOut' is synchronized with 'GetWriteOutTask'
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ LOOP_TRIPWIRE_INCREMENT(200);
+
+ } while (scheduler->active_subgroups > 0);
+
+#endif
+}
+*/
+
+GRL_INLINE void DFS_CreatePrimRefSet( struct DFSArgs args,
+ local struct PrimRefSet* prim_refs )
+{
+ ushort id = get_local_id( 0 );
+ ushort num_primrefs = args.num_primrefs;
+
+
+ PrimRef ref;
+ struct AABB3f local_aabb;
+ if ( id < num_primrefs )
+ {
+ ref = args.primref_buffer[id];
+ AABB3f_set_lower( &local_aabb, ref.lower.xyz );
+ AABB3f_set_upper( &local_aabb, ref.upper.xyz );
+ }
+ else
+ {
+ AABB3f_init( &local_aabb );
+ }
+
+ AABB3f_atomic_merge_localBB_nocheck( &prim_refs->root_aabb, &local_aabb );
+
+ barrier( CLK_LOCAL_MEM_FENCE );
+
+ if ( id < num_primrefs )
+ PrimRefSet_SetPrimRef_FullPrecision( prim_refs, ref, id );
+}
+
+
+
+struct BVHBuildLocals
+{
+ float Al[DFS_WG_SIZE];
+ float Ar[DFS_WG_SIZE];
+ uchar2 axis_and_left_count[ DFS_WG_SIZE ];
+ uint sah[DFS_WG_SIZE];
+ uint num_active_threads;
+};
+
+
+GRL_INLINE void DFS_ConstructBVH2( local struct LocalBVH2* bvh2,
+ local struct PrimRefSet* prim_refs,
+ ushort num_prims,
+ local struct BVHBuildLocals* locals )
+{
+ ushort tid = get_local_id( 0 );
+
+ ushort bvh2_root = 0;
+ ushort prim_range_start = 0;
+ ushort primref_position = tid;
+
+ bool active_thread = tid < num_prims;
+ float root_area = PrimRefSet_GetMaxAABBArea( prim_refs );
+ float area_scale = DFS_BVH2_AREA_QUANT / root_area;
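+    // subtree areas are later quantized to 16 bits using this scale factor
+    // (see the convert_ushort_rtn calls below)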
+
+ locals->num_active_threads = num_prims;
+ barrier( CLK_LOCAL_MEM_FENCE );
+
+ LOOP_TRIPWIRE_INIT;
+
+ do
+ {
+ if(active_thread && prim_range_start == primref_position)
+ locals->sah[primref_position] = UINT_MAX;
+
+ if ( active_thread )
+ {
+ local struct DFSPrimRefAABB* my_box = PrimRefSet_GetAABBPointer( prim_refs, primref_position );
+
+ // each thread evaluates a possible split candidate. Scan primrefs and compute sah cost
+ // do this axis-by-axis to keep register pressure low
+ float best_sah = INFINITY;
+ ushort best_axis = 3;
+ ushort best_count = 0;
+ float best_al = INFINITY;
+ float best_ar = INFINITY;
+
+ struct DFSPrimRefAABB box_left[3];
+ struct DFSPrimRefAABB box_right[3];
+ float CSplit[3];
+ ushort count_left[3];
+
+ for ( ushort axis = 0; axis < 3; axis++ )
+ {
+ DFSPrimRefAABB_init( &box_left[axis] );
+ DFSPrimRefAABB_init( &box_right[axis] );
+
+ CSplit[axis] = my_box->lower[axis] + my_box->upper[axis];
+ count_left[axis] = 0;
+ }
+
+ // scan primrefs in our subtree and partition using this thread's prim as a split plane
+ {
+ struct DFSPrimRefAABB box = *PrimRefSet_GetAABBPointer( prim_refs, prim_range_start );
+
+ for ( ushort p = 1; p < num_prims; p++ )
+ {
+ struct DFSPrimRefAABB next_box = *PrimRefSet_GetAABBPointer( prim_refs, prim_range_start + p ); //preloading box for next iteration
+
+ for( ushort axis = 0; axis < 3; axis++ )
+ {
+ float c = box.lower[axis] + box.upper[axis];
+
+ if ( c < CSplit[axis] )
+ {
+ // this primitive is to our left.
+ DFSPrimRefAABB_extend( &box_left[axis], &box );
+ count_left[axis]++;
+ }
+ else
+ {
+ // this primitive is to our right
+ DFSPrimRefAABB_extend( &box_right[axis], &box );
+ }
+ }
+
+ box = next_box;
+ }
+
+ // last iteration without preloading box
+ for( ushort axis = 0; axis < 3; axis++ )
+ {
+ float c = box.lower[axis] + box.upper[axis];
+
+ if ( c < CSplit[axis] )
+ {
+ // this primitive is to our left.
+ DFSPrimRefAABB_extend( &box_left[axis], &box );
+ count_left[axis]++;
+ }
+ else
+ {
+ // this primitive is to our right
+ DFSPrimRefAABB_extend( &box_right[axis], &box );
+ }
+ }
+ }
+
+ for ( ushort axis = 0; axis < 3; axis++ )
+ {
+ float Al = DFSPrimRefAABB_halfArea( &box_left[axis] );
+ float Ar = DFSPrimRefAABB_halfArea( &box_right[axis] );
+
+ // Avoid NANs in SAH calculation in the corner case where all prims go right
+ // In this case we set Al=Ar, because such a split will only be selected if all primrefs
+            // are coincident. In that case, we will fall back to split-in-the-middle and both subtrees
+ // should store the same quantized area value
+ if ( count_left[axis] == 0 )
+ Al = Ar;
+
+ // compute sah cost
+ ushort count_right = num_prims - count_left[axis];
+ float sah = Ar * count_right + Al * count_left[axis];
+
+ // keep this split if it is better than the previous one, or if the previous one was a corner-case
+ if ( sah < best_sah || best_count == 0 )
+ {
+ // yes, keep it
+ best_axis = axis;
+ best_sah = sah;
+ best_count = count_left[axis];
+ best_al = Al;
+ best_ar = Ar;
+ }
+ }
+
+
+ // write split information to SLM
+ locals->Al[primref_position] = best_al;
+ locals->Ar[primref_position] = best_ar;
+ locals->axis_and_left_count[primref_position].x = best_axis;
+ locals->axis_and_left_count[primref_position].y = best_count;
+
+ uint sah = as_uint(best_sah);
+            // break ties by axis to ensure deterministic split selection;
+            // otherwise the builder can produce a non-deterministic tree structure from run to run,
+            // depending on the ordering of primitives (which can vary due to non-determinism in atomic counters)
+ // Embed split axis and index into sah value; compute min over sah and max over axis
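+            // Packed key layout (derived from the expression below): bits [31:10] keep the
+            // upper bits of the float SAH cost, bits [9:8] hold (2 - best_axis) so the larger
+            // axis wins ties under atomic_min, and bits [7:0] hold the candidate index
+            // (this assumes DFS_WG_SIZE <= 256 so the index fits in 8 bits).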
+ sah = ( ( sah & ~1023 ) | ( 2 - best_axis ) << 8 | primref_position );
+
+ // reduce on split candidates in our local subtree and decide the best one
+ atomic_min_local( &locals->sah[ prim_range_start ], sah);
+ }
+
+ barrier( CLK_LOCAL_MEM_FENCE );
+
+ ushort split_index = locals->sah[ prim_range_start ] & 255;
+ ushort split_axis = locals->axis_and_left_count[split_index].x;
+ ushort split_left_count = locals->axis_and_left_count[split_index].y;
+ float split_al = locals->Al[split_index];
+ float split_ar = locals->Ar[split_index];
+
+ if ( (primref_position == prim_range_start) && active_thread )
+ {
+ // first thread in a given subtree creates the inner node
+ ushort quantized_left_area = convert_ushort_rtn( split_al * area_scale );
+ ushort quantized_right_area = convert_ushort_rtn( split_ar * area_scale );
+ ushort start_left = prim_range_start;
+ ushort start_right = prim_range_start + split_left_count;
+ if ( split_left_count == 0 )
+ start_right = start_left + (num_prims / 2); // handle split-in-the-middle case
+
+ LocalBVH2_CreateInnerNode( bvh2, bvh2_root,
+ start_left, start_right,
+ quantized_left_area, quantized_right_area );
+ }
+
+ barrier( CLK_LOCAL_MEM_FENCE );
+
+ struct DFSPrimRef ref;
+ ushort new_primref_position;
+
+ if ( active_thread )
+ {
+ ushort2 kids = LocalBVH2_GetChildIndices( bvh2, bvh2_root );
+ bool go_left;
+
+ if ( split_left_count == 0 )
+ {
+ // We chose a split with no left-side prims
+ // This will only happen if all primrefs are located in the exact same position
+ // In that case, fall back to split-in-the-middle
+ split_left_count = (num_prims / 2);
+ go_left = (primref_position - prim_range_start < split_left_count);
+ }
+ else
+ {
+ // determine what side of the split this thread's primref belongs on
+ local struct DFSPrimRefAABB* my_box = PrimRefSet_GetAABBPointer( prim_refs, primref_position );
+ local struct DFSPrimRefAABB* split_box = PrimRefSet_GetAABBPointer( prim_refs, split_index );
+ float c = my_box->lower[split_axis] + my_box->upper[split_axis];
+ float Csplit = split_box->lower[split_axis] + split_box->upper[split_axis];
+ go_left = c < Csplit;
+ }
+
+ // adjust state variables for next loop iteration
+ bvh2_root = (go_left) ? kids.x : kids.y;
+ num_prims = (go_left) ? split_left_count : (num_prims - split_left_count);
+ prim_range_start = (go_left) ? prim_range_start : prim_range_start + split_left_count;
+
+ // determine the new primref position by incrementing a counter in the destination subtree
+ new_primref_position = prim_range_start + LocalBVH2_IncrementPrimCount( bvh2, bvh2_root );
+
+ // load our primref from its previous position
+ ref = PrimRefSet_GetPrimRef( prim_refs, primref_position );
+ }
+
+ barrier( CLK_LOCAL_MEM_FENCE );
+
+ if ( active_thread )
+ {
+ // write our primref into its sorted position
+ PrimRefSet_SetPrimRef( prim_refs, ref, new_primref_position );
+ primref_position = new_primref_position;
+
+ // deactivate all threads whose subtrees are small enough to form a leaf
+ if ( num_prims <= TREE_ARITY )
+ {
+ active_thread = false;
+ atomic_dec_local( &locals->num_active_threads );
+ }
+ }
+
+ barrier( CLK_LOCAL_MEM_FENCE );
+
+ LOOP_TRIPWIRE_INCREMENT( 50 );
+
+
+ } while ( locals->num_active_threads > 0 );
+
+
+}
+
+
+
+// fast path for #prims <= TREE_ARITY
+GRL_INLINE void Trivial_DFS( struct DFSArgs args )
+{
+
+ ushort tid = get_local_id( 0 );
+
+ PrimRef myRef;
+ AABB_init( &myRef );
+ if( tid < args.num_primrefs )
+ myRef = args.primref_buffer[tid];
+
+ uint node_offset;
+ if ( tid == 0 )
+ node_offset = 64*allocate_inner_nodes( args.bvh_base, 1 );
+ node_offset = sub_group_broadcast(node_offset,0);
+
+ char* bvh_mem = (char*) args.bvh_base;
+ struct QBVHNodeN* qnode = (struct QBVHNodeN*) (bvh_mem + node_offset);
+
+ uint child_type = args.leaf_node_type;
+ uint prim_base = args.bvh_base->quadLeafStart*64 ;
+
+ char* leaf_mem = bvh_mem + prim_base;
+ int offset = (int)( leaf_mem - (char*)qnode );
+
+ if (child_type == NODE_TYPE_INSTANCE)
+ {
+ subgroup_setInstanceQBVHNodeN( offset >> 6, &myRef, args.num_primrefs, qnode, tid < args.num_primrefs ? PRIMREF_instanceMask(&myRef) : 0 );
+ }
+ else
+ subgroup_setQBVHNodeN( offset >> 6, child_type, &myRef, args.num_primrefs, qnode, BVH_NODE_DEFAULT_MASK );
+
+ if ( tid < args.num_primrefs )
+ {
+ global uint* primref_back_pointers = args.primref_index_buffer + args.num_primrefs;
+ uint bp = node_offset;
+
+ // TODO_OPT: Leaf creation pipeline can be made simpler by having a sideband buffer containing
+ // fatleaf index + position in fatleaf for each primref, instead of forcing leaf creation shader to reconstruct it
+ // should also probably do the fat-leaf splitting here
+ args.primref_buffer[tid] = myRef;
+ args.primref_index_buffer[tid] = tid;
+
+ primref_back_pointers[tid] = bp / sizeof(struct QBVHNodeN);
+
+ if ( tid == 0 && args.need_backpointers )
+ {
+ uint bp = ((uint)-1) << 6;
+ bp |= (args.num_primrefs) << 3;
+ *(InnerNode_GetBackPointer(BVHBase_GetBackPointers( args.bvh_base ),0)) = bp;
+ }
+ }
+}
+
+
+
+
+
+void SUBGROUP_DFS_ComputeFlatTreeBoxesAndMasks( uniform local struct FlatTree* flat_tree,
+ uniform local struct FlatTreeScheduler* flat_scheduler,
+ uniform local struct AABB3f* boxes,
+ uniform local struct PrimRefMeta* primref_meta,
+ uniform global struct AABB* primref_buffer,
+ uniform local uchar* masks,
+ bool need_masks )
+
+{
+ uniform int num_levels = (int) flat_scheduler->num_levels;
+ varying ushort lane = get_sub_group_local_id();
+
+ // iterate over depth levels in the tree... deepest to shallowest
+ for (uniform int level = num_levels - 1; level >= 0; level--)
+ {
+ // loop over a range of flattree nodes at this level, one node per sub-group
+ // TODO_OPT: Try and enable this code to process two nodes in a SIMD16 subgroup
+ uniform ushort level_start = flat_scheduler->level_start[level];
+ uniform ushort level_node_count = flat_scheduler->level_count[level];
+
+ for (uniform ushort i = get_sub_group_id(); i < level_node_count; i += get_num_sub_groups())
+ {
+ uniform ushort node_index = flat_scheduler->level_ordered_nodes[ level_start + i ];
+
+ varying struct AABB box;
+ AABB_init(&box);
+
+ uniform uint child_base = FlatTree_GetFirstChild( flat_tree, node_index );
+ uniform uint num_children = FlatTree_GetNumChildren( flat_tree, node_index );
+ varying uint child_index = child_base + ((lane<num_children)?lane : 0);
+
+ varying uint mask = 0xff;
+ if (FlatTree_IsLeafNode( flat_tree, node_index ))
+ {
+ // fetch AABBs for primrefs
+ box = primref_buffer[ PrimRefMeta_GetInputIndex( &primref_meta[child_index] ) ];
+ if( need_masks )
+ mask = PRIMREF_instanceMask(&box);
+ }
+ else
+ {
+ // fetch AABBs for child nodes
+ box.lower.xyz = AABB3f_load_lower( &boxes[child_index] );
+ box.upper.xyz = AABB3f_load_upper( &boxes[child_index] );
+ if ( need_masks )
+ mask = masks[child_index];
+ }
+
+
+ // reduce and write box
+ box = AABB_sub_group_reduce_N6( &box );
+ if( lane == 0 )
+ AABB3f_set( &boxes[node_index], box.lower.xyz, box.upper.xyz );
+
+ if( need_masks )
+ {
+ mask = sub_group_reduce_or_N6(mask);
+ masks[node_index] = mask;
+ }
+
+ }
+
+ barrier( CLK_LOCAL_MEM_FENCE );
+ }
+}
+
+
+void SUBGROUP_DFS_WriteNodes(
+ uniform local struct FlatTree* flat_tree,
+ uniform local struct AABB3f* boxes,
+ uniform local struct PrimRefMeta* primref_meta,
+ uniform struct DFSArgs args,
+ uniform local uchar* masks
+ )
+
+{
+ uniform uint num_nodes = FlatTree_GetNodeCount(flat_tree);
+
+ for ( uniform uint i = get_sub_group_id(); i < num_nodes; i += get_num_sub_groups() )
+ {
+ SUBGROUP_WriteQBVHNode( flat_tree, primref_meta, boxes, i, args, masks );
+ }
+
+}
+
+
+
+
+struct Single_WG_build_SLM
+{
+ struct FlatTree flat_tree;
+ struct FlatTreeScheduler flat_scheduler;
+ struct PrimRefMeta primitive_meta[DFS_WG_SIZE];
+
+ union
+ {
+ struct{
+ struct PrimRefSet prim_refs;
+ struct LocalBVH2 bvh2;
+ struct BVHBuildLocals bvh2_locals;
+ } s1;
+
+ struct {
+ struct AABB3f boxes[DFS_MAX_FLATTREE_NODES];
+ uchar masks[DFS_MAX_FLATTREE_NODES];
+ } s2;
+ } u;
+
+};
+
+
+GRL_INLINE void execute_single_WG_build(
+ struct DFSArgs args,
+ local struct Single_WG_build_SLM* slm
+ )
+{
+
+ ushort tid = get_local_id( 0 );
+
+ //
+ // Initialize the various SLM structures. Different sub-groups take different init paths.
+ // NOTE: even numbered subgroups here to avoid the fused-EU serialization bug
+ //
+ if ( get_sub_group_id() == 0 )
+ SUBGROUP_FlatTree_Initialize( &slm->flat_tree, args );
+ else if ( get_sub_group_id() == 2 )
+ SUBGROUP_LocalBVH2_Initialize( &slm->u.s1.bvh2, args.num_primrefs );
+ else if ( get_sub_group_id() == 4 )
+ SUBGROUP_FlatTreeScheduler_Initialize( &slm->flat_scheduler );
+ else if ( get_sub_group_id() == 6 )
+ SUBGROUP_PrimRefSet_Initialize( &slm->u.s1.prim_refs );
+
+ barrier( CLK_LOCAL_MEM_FENCE );
+
+ // load the PrimRefs
+ DFS_CreatePrimRefSet( args, &slm->u.s1.prim_refs );
+
+ // build the BVH2
+ DFS_ConstructBVH2( &slm->u.s1.bvh2, &slm->u.s1.prim_refs, args.num_primrefs, &slm->u.s1.bvh2_locals );
+
+ // copy out metadata for primrefs now that they have been sorted
+ if( tid < args.num_primrefs )
+ {
+ slm->primitive_meta[tid] = PrimRefSet_GetMeta( &slm->u.s1.prim_refs, tid );
+ }
+ barrier( CLK_LOCAL_MEM_FENCE );
+
+ // collapse into a FlatTree
+ SUBGROUP_DFS_BuildFlatTree( &slm->u.s1.bvh2, &slm->flat_tree, &slm->flat_scheduler );
+
+ // allocate output QBVH6 nodes
+ if ( get_local_id( 0 ) == 0 )
+ FlatTree_AllocateQNodes( &slm->flat_tree, args );
+
+ barrier( CLK_LOCAL_MEM_FENCE );
+
+ SUBGROUP_DFS_ComputeFlatTreeBoxesAndMasks( &slm->flat_tree, &slm->flat_scheduler, &slm->u.s2.boxes[0], slm->primitive_meta, args.primref_buffer, slm->u.s2.masks, args.need_masks );
+
+ //FlatTree_Printf( &slm->flat_tree );
+ //FlatTree_check_boxes ( &slm->flat_tree, args.primref_buffer, &slm->u.s2.boxes[0], slm->primitive_meta );
+
+ SUBGROUP_DFS_WriteNodes( &slm->flat_tree, &slm->u.s2.boxes[0], slm->primitive_meta, args, slm->u.s2.masks );
+
+
+    // generate the sorted primref index buffer and backpointers to feed the leaf creation pipeline
+ if ( tid < args.num_primrefs )
+ {
+ uint input_index = PrimRefMeta_GetInputIndex(&slm->primitive_meta[tid]);
+
+ uint bp = FlatTree_GetPrimRefBackPointer( &slm->flat_tree, tid );
+ global uint* primref_back_pointers = args.primref_index_buffer + args.num_primrefs;
+
+ args.primref_index_buffer[tid] = input_index;
+
+ primref_back_pointers[tid] = bp / sizeof(struct QBVHNodeN);
+
+ if ( tid == 0 && args.need_backpointers )
+ {
+ *(InnerNode_GetBackPointer(BVHBase_GetBackPointers( args.bvh_base ),0)) |= ((uint)-1) << 6;
+ }
+ }
+}
+
+
+
+
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__( (reqd_work_group_size( DFS_WG_SIZE, 1, 1 )) )
+__attribute__( (intel_reqd_sub_group_size( 16 )) )
+kernel void DFS( global struct Globals* globals,
+ global char* bvh_mem,
+ global PrimRef* primref_buffer,
+ global uint* primref_index_buffer,
+ uint alloc_backpointers
+ )
+{
+ struct DFSArgs args;
+ args.bvh_base = (global struct BVHBase*) bvh_mem;
+ args.leaf_node_type = globals->leafPrimType;
+ args.inner_node_type = NODE_TYPE_INTERNAL;
+ args.leaf_size_in_bytes = globals->leafSize;
+ args.primref_buffer = primref_buffer;
+ args.need_backpointers = alloc_backpointers != 0;
+ args.num_primrefs = globals->numPrimitives;
+ args.primref_index_buffer = primref_index_buffer;
+ args.need_masks = args.leaf_node_type == NODE_TYPE_INSTANCE;
+
+ if ( args.num_primrefs <= TREE_ARITY )
+ {
+ // TODO_OPT: This decision should be made using indirect dispatch
+ if( get_sub_group_id() == 0 )
+ Trivial_DFS( args );
+ return;
+ }
+
+ local struct Single_WG_build_SLM slm;
+
+ execute_single_WG_build( args, &slm );
+}
+
+
+
+
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__( (reqd_work_group_size( DFS_WG_SIZE, 1, 1 )) )
+__attribute__( (intel_reqd_sub_group_size( 16 )) )
+kernel void DFS_single_wg(
+ global struct Globals* globals,
+ global char* bvh_mem,
+ global PrimRef* primref_buffer,
+ global uint* primref_index_buffer,
+ uint sah_flags
+)
+{
+ struct DFSArgs args;
+ args.bvh_base = (global struct BVHBase*) bvh_mem;
+ args.leaf_node_type = globals->leafPrimType;
+ args.inner_node_type = NODE_TYPE_INTERNAL;
+ args.leaf_size_in_bytes = globals->leafSize;
+ args.primref_buffer = primref_buffer;
+ args.need_backpointers = sah_flags & SAH_FLAG_NEED_BACKPOINTERS;
+ args.num_primrefs = globals->numPrimitives;
+ args.primref_index_buffer = primref_index_buffer;
+ args.need_masks = sah_flags & SAH_FLAG_NEED_MASKS;
+
+ local struct Single_WG_build_SLM slm;
+
+ execute_single_WG_build( args, &slm );
+}
+
+
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__( (reqd_work_group_size( 16, 1, 1 )) )
+__attribute__( (intel_reqd_sub_group_size( 16 )) )
+kernel void DFS_trivial(
+ global struct Globals* globals,
+ global char* bvh_mem,
+ global PrimRef* primref_buffer,
+ global uint* primref_index_buffer,
+ uint sah_flags
+)
+{
+ struct DFSArgs args;
+ args.bvh_base = (global struct BVHBase*) bvh_mem;
+ args.leaf_node_type = globals->leafPrimType;
+ args.inner_node_type = NODE_TYPE_INTERNAL;
+ args.leaf_size_in_bytes = globals->leafSize;
+ args.primref_buffer = primref_buffer;
+ args.need_backpointers = sah_flags & SAH_FLAG_NEED_BACKPOINTERS;
+ args.num_primrefs = globals->numPrimitives;
+ args.primref_index_buffer = primref_index_buffer;
+ args.need_masks = sah_flags & SAH_FLAG_NEED_MASKS;
+
+ Trivial_DFS( args );
+}
+
+
+struct DFSArgs dfs_args_from_sah_globals( global struct SAHBuildGlobals* sah_globals )
+{
+ struct DFSArgs args;
+ args.bvh_base = (global struct BVHBase*) sah_globals->p_bvh_base;
+ args.leaf_node_type = sah_globals->leaf_type;
+ args.inner_node_type = NODE_TYPE_INTERNAL;
+ args.leaf_size_in_bytes = sah_globals->leaf_size;
+ args.primref_buffer = (global PrimRef*) sah_globals->p_primrefs_buffer;
+ args.need_backpointers = sah_globals->flags & SAH_FLAG_NEED_BACKPOINTERS;
+ args.num_primrefs = sah_globals->num_primrefs;
+ args.primref_index_buffer = (global uint*) sah_globals->p_primref_index_buffers;
+ args.need_masks = sah_globals->flags & SAH_FLAG_NEED_MASKS;
+
+ return args;
+}
+
+
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(DFS_WG_SIZE, 1, 1)))
+__attribute__((intel_reqd_sub_group_size(16)))
+kernel void DFS_single_wg_batchable(
+ global struct SAHBuildGlobals* globals_buffer,
+ global struct VContextScheduler* scheduler
+)
+{
+ global struct SAHBuildGlobals* sah_globals = globals_buffer + scheduler->num_trivial_builds + get_group_id(0);
+
+ struct DFSArgs args = dfs_args_from_sah_globals( sah_globals );
+
+ local struct Single_WG_build_SLM slm;
+
+ execute_single_WG_build(args, &slm);
+}
+
+
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(16, 1, 1)))
+__attribute__((intel_reqd_sub_group_size(16)))
+kernel void DFS_trivial_batchable(
+ global struct SAHBuildGlobals* globals_buffer
+)
+{
+ global struct SAHBuildGlobals* sah_globals = globals_buffer + get_group_id(0);
+
+ struct DFSArgs args = dfs_args_from_sah_globals(sah_globals);
+
+ Trivial_DFS(args);
+}
\ No newline at end of file
diff --git a/src/intel/vulkan/grl/gpu/bvh_build_leaf.cl b/src/intel/vulkan/grl/gpu/bvh_build_leaf.cl
new file mode 100644
index 00000000000..bb220b30612
--- /dev/null
+++ b/src/intel/vulkan/grl/gpu/bvh_build_leaf.cl
@@ -0,0 +1,357 @@
+//
+// Copyright (C) 2009-2021 Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+//
+//
+
+#include "api_interface.h"
+#include "common.h"
+#include "instance.h"
+
+
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(32, 1, 1)))
+__attribute__((intel_reqd_sub_group_size(16)))
+void kernel
+primref_to_quads(global struct Globals *globals,
+ global struct AABB *primref,
+ global char *primref_index,
+ global char *bvh_mem,
+ global GRL_RAYTRACING_GEOMETRY_DESC *geomDesc,
+ const uint stride,
+ const uint offset,
+ const uint allow_update)
+{
+ global struct BVHBase* bvh = (global struct BVHBase*)bvh_mem;
+ global struct Quad *quads = (global struct Quad *)(bvh_mem + 64*bvh->quadLeafStart );
+ uint quadIndicesStart = bvh->quadIndicesDataStart;
+
+ const uint numPrimitives = globals->numPrimitives;
+ uint i = get_group_id( 0 ) * get_local_size( 0 ) + get_local_id(0);
+ if (i < numPrimitives)
+ {
+ global struct BVHBase* bvh = (global struct BVHBase*)bvh_mem;
+
+ const uint primrefID = *(uint *)(primref_index + i * stride + offset);
+
+ const uint geomID = PRIMREF_geomID(&primref[primrefID]);
+ const uint primID0 = PRIMREF_primID0(&primref[primrefID]);
+ const uint primID1 = PRIMREF_primID1(&primref[primrefID]);
+ const uint geomFlags = PRIMREF_geomFlags(&primref[primrefID]);
+
+ const uint3 tri0 = GRL_load_triangle(&geomDesc[geomID], primID0);
+ const uint3 tri1 = GRL_load_triangle(&geomDesc[geomID], primID1);
+
+ const struct TrianglePair q = TrianglePair_Constructor(tri0, primID0, tri1, primID1);
+
+ uint vertex_stride = geomDesc[geomID].Desc.Triangles.VertexBufferByteStride;
+
+ const uint4 indices = q.a;
+
+ const uint mask = 0xff; // FIXME: hardcoded mask
+ float3 vtx0, vtx1, vtx2, vtx3;
+ GRL_load_quad_vertices(&geomDesc[geomID], &vtx0, &vtx1, &vtx2, &vtx3, indices);
+
+ uint j0 = q.lb.x;
+ uint j1 = q.lb.y;
+ uint j2 = q.lb.z;
+ uint shaderIndex = (mask << 24) | geomID;
+ uint geomIndex = geomID | (geomFlags << 30);
+ uint primIndex0 = primID0;
+ const uint delta = primID1 - primID0;
+ const uint j = (((j0) << 0) | ((j1) << 2) | ((j2) << 4));
+ uint primIndex1Delta = delta | (j << 16) | (1 << 22);
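+        // Packing note (derived from the expressions above): primIndex1Delta holds the
+        // primID delta in its low bits, j0/j1/j2 (2 bits each, assumed to select the
+        // second triangle's vertices within the quad) in bits [21:16], and sets bit 22,
+        // whose hardware meaning is not documented here.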
+
+ uint4 pack0 = (uint4)(shaderIndex, geomIndex, primIndex0, primIndex1Delta);
+ float4 pack1 = (float4)(vtx0.x, vtx0.y, vtx0.z, vtx1.x);
+ float4 pack2 = (float4)(vtx1.y, vtx1.z, vtx2.x, vtx2.y);
+ float4 pack3 = (float4)(vtx2.z, vtx3.x, vtx3.y, vtx3.z);
+
+ global uint4* dst = (global uint4*)&quads[i];
+ store_uint4_L1WB_L3WB(dst, 0, pack0);
+ store_uint4_L1WB_L3WB(dst, 1, as_uint4(pack1));
+ store_uint4_L1WB_L3WB(dst, 2, as_uint4(pack2));
+ store_uint4_L1WB_L3WB(dst, 3, as_uint4(pack3));
+
+ if(allow_update)
+ {
+ global uint4* vertex_indice_ptr = (global uint4*)(((char*)bvh) + (64u * quadIndicesStart + 32 * i));
+
+ uint4 pack_indices = (uint4) ( indices.x , indices.y, indices.z, indices.w );
+
+ store_uint4_L1WB_L3WB( vertex_indice_ptr, 0, pack0 );
+ store_uint4_L1WB_L3WB( vertex_indice_ptr, 1, pack_indices * vertex_stride);
+ }
+
+ if (i == 0)
+ bvh->quadLeafCur += numPrimitives ;
+ }
+
+
+
+#if 0
+ global struct BVHBase* bvh = (global struct BVHBase*)bvh_mem;
+ global struct Quad *quads = (global struct Quad *)(bvh_mem + 64*bvh->quadLeafStart );
+
+ const uint numPrimitives = globals->numPrimitives;
+ const uint startID = get_group_id( 0 ) * get_local_size( 0 );
+ const uint endID = min((uint)(startID + get_local_size( 0 )), numPrimitives);
+
+ for (uint i = startID + get_local_id(0); i < endID; i += get_local_size(0))
+ {
+ const uint primrefID = *(uint *)(primref_index + i * stride + offset);
+
+ const uint geomID = PRIMREF_geomID(&primref[primrefID]);
+ const uint primID0 = PRIMREF_primID0(&primref[primrefID]);
+ const uint primID1 = PRIMREF_primID1(&primref[primrefID]);
+ const uint geomFlags = PRIMREF_geomFlags(&primref[primrefID]);
+
+ const uint3 tri0 = GRL_load_triangle(&geomDesc[geomID], primID0);
+ const uint3 tri1 = GRL_load_triangle(&geomDesc[geomID], primID1);
+
+ const struct TrianglePair q = TrianglePair_Constructor(tri0, primID0, tri1, primID1);
+
+ const uint4 indices = q.a;
+ const uint mask = 0xff; // FIXME: hardcoded mask
+ float3 vtx0, vtx1, vtx2, vtx3;
+ GRL_load_quad_vertices(&geomDesc[geomID], &vtx0, &vtx1, &vtx2, &vtx3, indices);
+
+ setQuad(&quads[i], (float4)(vtx0,0), (float4)(vtx1,0), (float4)(vtx2,0), (float4)(vtx3,0), q.lb.x, q.lb.y, q.lb.z, geomID, primID0, primID1, mask, geomFlags );
+ }
+
+ if (get_local_id(0) + get_group_id(0)*get_local_size(0) == 0)
+ bvh->quadLeafCur += numPrimitives ;
+#endif
+}
+
+GRL_INLINE void create_procedural_leaf(global struct Globals *globals,
+ global struct AABB *primref,
+ local uint *primrefids,
+ uint numProcedurals,
+ struct QBVHNodeN *qnode,
+ global char *bvh_mem,
+ global GRL_RAYTRACING_GEOMETRY_DESC *geomDesc)
+{
+ if (get_local_id(0) >= 8)
+ return;
+
+ global struct BVHBase* bvh_base = (global struct BVHBase*)bvh_mem;
+
+ /* first read geomID of all primitives */
+ uint primrefID = -1;
+ uint geomID = -1;
+ uint geomFlags = 0;
+ if (get_local_id(0) < numProcedurals)
+ {
+ primrefID = primrefids[get_local_id(0)];
+ geomID = PRIMREF_geomID(&primref[primrefID]);
+ geomFlags = PRIMREF_geomFlags( &primref[primrefID] );
+ }
+
+ // cannot sort by geomID as bounds in parent node are then wrong
+ //ulong geomID_primrefID = (((ulong)geomID) << 32) | ((ulong)primrefID);
+ //geomID_primrefID = sort8_ascending_ulong(geomID_primrefID);
+ //geomID = geomID_primrefID >> 32;
+ //primrefID = geomID_primrefID;
+
+ /* We have to split at geomID boundaries into multiple leaves. This
+ * block calculates the lane where a leaf starts and ends. */
+ const uint geomIDprev = intel_sub_group_shuffle_up(0xFFFFFFFFu, geomID, 1u);
+ const uint geomIDnext = intel_sub_group_shuffle_down(geomID, 0xFFFFFFFFu, 1u);
+ const uint leaf_start = geomIDprev != geomID;
+ const uint leaf_end = geomIDnext != geomID;
+ const uint leaf_start_next = intel_sub_group_shuffle_down(leaf_start, 0u, 1u);
+
+    /* This computes which leaf a lane processes. E.g. from geomID =
+ * [3,3,4,4,4,0] we get leaf_id = [0,0,1,1,1,2] */
+ //const uint leaf_id = sub_group_scan_inclusive_add(leaf_start); // FIXME: exclusive?
+
+ /* This computes the n'th primitive a lane processes inside its
+ * leaf. For the example above we compute leaf_prim =
+ * [0,1,0,1,2,0]. */
+ const uint leaf_prim = get_local_id(0) - sub_group_scan_inclusive_max(leaf_start ? get_local_id(0) : 0);
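+    // Worked example (illustrative): for geomID = [3,3,4,4,4,0], leaf_start = [1,0,1,0,0,1],
+    // so (leaf_start ? get_local_id(0) : 0) = [0,0,2,0,0,5], its inclusive max-scan is
+    // [0,0,2,2,2,5], and leaf_prim = get_local_id(0) - scan = [0,1,0,1,2,0], matching the example above.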
+
+ /* from here on we allocate data and write to memory, thus only
+ * lanes that process a primitive should continue. */
+ if (get_local_id(0) >= numProcedurals)
+ return;
+
+ /* Here we allocate a single memory block for each required
+ * ProceduralLeaf node. We do this from a single lane to ensure
+ * the allocation is contiguous. */
+ uint leaf_base_offset = 0;
+ uint n_leafs = sub_group_reduce_add(leaf_start);
+ if (get_local_id(0) == 0)
+ leaf_base_offset = allocate_procedural_leaves( bvh_base, n_leafs );
+ leaf_base_offset = sub_group_broadcast(leaf_base_offset, 0);
+
+ /* Compute the leaf offset for each lane. */
+ uint leaf_offset = leaf_base_offset + sub_group_scan_inclusive_add(leaf_start) - 1;
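+    // Continuing the example above: leaf_start = [1,0,1,0,0,1] gives n_leafs = 3 and
+    // sub_group_scan_inclusive_add(leaf_start) - 1 = [0,0,1,1,1,2], i.e. each lane's
+    // ProceduralLeaf slot relative to leaf_base_offset.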
+
+ struct ProceduralLeaf *pleaf = ((global struct ProceduralLeaf *)(bvh_mem)) + leaf_offset;
+
+ /* write the procedural leaf headers */
+ if (leaf_end)
+ {
+ pleaf->leafDesc.shaderIndex_geomMask = 0xFF000000 | (geomID & 0x00FFFFFF); // FIXME: use accessor function. Future extensions may have shaderIndex != geomID
+ pleaf->leafDesc.geomIndex_flags = geomID | (geomFlags<<30); // FIXME: Use setter function
+ pleaf->DW1 = 0xFFFFFFF0 | (leaf_prim + 1); // !!!
+ }
+ /* write the procedural leaf primIDs */
+ pleaf->_primIndex[leaf_prim] = PRIMREF_primID0(&primref[primrefID]);
+
+ /* update leaf node offset inside parent node */
+ if (get_local_id(0) == 0)
+ {
+ QBVH6Node_set_offset(qnode, pleaf);
+ QBVH6Node_set_type(qnode, NODE_TYPE_PROCEDURAL);
+ }
+
+ /* Let parent node children point to proper procedural leaf block
+ * and primitive. */
+ qnode->childData[get_local_id(0)] = leaf_start_next | (leaf_prim << 2);
+}
+
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+GRL_ANNOTATE_BIG_REG_REQ
+__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
+__attribute__((intel_reqd_sub_group_size(16))) void kernel
+primref_to_procedurals(global struct Globals *globals,
+ global struct AABB *primref,
+ global char *primref_index,
+ global char *bvh_mem,
+ global GRL_RAYTRACING_GEOMETRY_DESC *geomDesc,
+ const uint stride,
+ const uint offset)
+{
+ global struct BVHBase *bvh = (global struct BVHBase *)bvh_mem;
+
+ const uint numPrimitives = globals->numPrimitives;
+ uint startID = get_group_id( 0 ) * get_local_size( 0 );
+ uint endID = min((uint)(startID + get_local_size( 0 )), numPrimitives);
+
+ uint offset1 = stride * globals->numPrimitives;
+ if (stride == 8)
+ offset1 = 4;
+
+ uint prev_start_back_pointer = startID == 0 ? -1 : *(uint *)(primref_index + (startID-1) * stride + offset1);
+ /* start at leaf start */
+ while (startID < numPrimitives)
+ {
+ const uint back_pointer = *(uint *)(primref_index + startID * stride + offset1);
+ if (back_pointer != prev_start_back_pointer)
+ break;
+ startID++;
+ }
+
+ uint prev_end_back_pointer = *(uint *)(primref_index + (endID-1) * stride + offset1);
+ /* end at next leaf start */
+ while (endID < numPrimitives)
+ {
+ const uint back_pointer = *(uint *)(primref_index + endID * stride + offset1);
+ if (back_pointer != prev_end_back_pointer)
+ break;
+ endID++;
+ }
+
+ local uint procedurals[16];
+
+ for (uint lid = startID + get_local_id(0); lid < endID + get_local_id(0);)
+ {
+ /* load leaf start points and back_pointer */
+ const uint primrefID = *(uint *)(primref_index + lid * stride + offset);
+ uint back_pointer = *(uint *)(primref_index + lid * stride + offset1);
+ uint prev_back_pointer = get_local_id(0) == 0 ? -1 : *(uint *)(primref_index + (lid-1) * stride + offset1);
+
+ const uint leaf_start = back_pointer != prev_back_pointer;
+ uint leaf_start_back_pointer = sub_group_broadcast(back_pointer, 0);
+
+ /* compute number of primitives inside the leaf starting at lid */
+ const uint leaf_id = sub_group_scan_inclusive_add(leaf_start);
+ uint numPrimitives = 0;
+ if (back_pointer == leaf_start_back_pointer && lid < endID)
+ numPrimitives = sub_group_reduce_add(1);
+ numPrimitives = sub_group_broadcast(numPrimitives, 0);
+
+ procedurals[get_local_id(0)] = primrefID;
+
+ struct QBVHNodeN *qnode = (struct QBVHNodeN *)bvh_mem + back_pointer;
+
+ create_procedural_leaf(globals, primref, procedurals, numPrimitives, qnode, bvh_mem, geomDesc);
+
+ lid += numPrimitives;
+ }
+}
+
+GRL_INLINE void create_HW_instance_leaf(
+ global struct BVHBase* bvh,
+ global const struct GRL_RAYTRACING_INSTANCE_DESC* instDesc,
+ uint dstLeafId,
+ uint instanceIndex,
+ uint rootNodeByteOffset,
+ uint instanceMask)
+{
+ /* convert DXR instance to instance leaf node */
+ global struct HwInstanceLeaf* leaves = (__global struct HwInstanceLeaf*)BVHBase_quadLeaves(bvh);
+ HwInstanceLeaf_Constructor(&leaves[dstLeafId], instDesc, instanceIndex, rootNodeByteOffset, instanceMask);
+}
+
+
+
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
+__attribute__((intel_reqd_sub_group_size(16)))
+void kernel create_HW_instance_nodes(
+ global const struct Globals *globals,
+ global char *primref_index,
+ global struct AABB *primref,
+ global struct BVHBase *bvh,
+ global struct GRL_RAYTRACING_INSTANCE_DESC *src_instances,
+ uint32_t stride,
+ uint32_t offset)
+{
+ uint dstLeafId = get_group_id(0) * MAX_HW_SIMD_WIDTH + get_sub_group_local_id();
+ uint num_prims = globals->numPrimitives;
+ if (dstLeafId >= num_prims)
+ return;
+ if( dstLeafId == 0 )
+ bvh->instanceLeafEnd += 2*num_prims;
+
+ /* get instance ID */
+ const uint primrefID = *(uint *)(primref_index + dstLeafId * stride + offset);
+ const uint instIndex = PRIMREF_instanceIndex(&primref[primrefID]);
+ const uint rootByteOffset = PRIMREF_instanceRootNodeOffset(&primref[primrefID]);
+ const uint instMask = PRIMREF_instanceMask(&primref[primrefID]);
+ create_HW_instance_leaf(bvh, &src_instances[instIndex], dstLeafId, instIndex, rootByteOffset, instMask );
+}
+
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
+__attribute__((intel_reqd_sub_group_size(16)))
+void kernel create_HW_instance_nodes_pointers(
+ global const struct Globals *globals,
+ global char *primref_index,
+ global struct AABB *primref,
+ global struct BVHBase *bvh,
+ global void *instances_in,
+ uint32_t stride,
+ uint32_t offset)
+{
+ uint dstLeafId = get_group_id(0) * MAX_HW_SIMD_WIDTH + get_sub_group_local_id();
+ uint num_prims = globals->numPrimitives;
+ if (dstLeafId >= num_prims)
+ return;
+ if (dstLeafId == 0)
+ bvh->instanceLeafEnd += 2 * num_prims;
+
+ global const struct GRL_RAYTRACING_INSTANCE_DESC **instances =
+ (global const struct GRL_RAYTRACING_INSTANCE_DESC **)instances_in;
+
+ /* get instance ID */
+ const uint primrefID = *(uint *)(primref_index + dstLeafId * stride + offset);
+ const uint instIndex = PRIMREF_instanceIndex(&primref[primrefID]);
+ const uint rootByteOffset = PRIMREF_instanceRootNodeOffset(&primref[primrefID]);
+ const uint instMask = PRIMREF_instanceMask(&primref[primrefID]);
+ create_HW_instance_leaf(bvh, instances[instIndex], dstLeafId, instIndex, rootByteOffset, instMask );
+}
diff --git a/src/intel/vulkan/grl/gpu/bvh_build_presplit.cl b/src/intel/vulkan/grl/gpu/bvh_build_presplit.cl
new file mode 100644
index 00000000000..bc9cf590f51
--- /dev/null
+++ b/src/intel/vulkan/grl/gpu/bvh_build_presplit.cl
@@ -0,0 +1,556 @@
+//
+// Copyright (C) 2009-2021 Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+//
+//
+
+#include "api_interface.h"
+#include "common.h"
+
+#define GRID_SIZE 1024
+
+/*
+ A PresplitItem stores, for each primitive, a splitting priority (used to
+ derive the number of splits to perform) and the index of its primref.
+ */
+
+struct PresplitItem
+{
+ unsigned int index;
+ float priority;
+};
+
+/*
+
+ This function splits a line v0->v1 at position pos in dimension dim
+ and merges the bounds for the left and right line segments into
+ lbounds and rbounds.
+
+ */
+
+GRL_INLINE void splitLine(const uint dim,
+ const float pos,
+ const float4 v0,
+ const float4 v1,
+ struct AABB *lbounds,
+ struct AABB *rbounds)
+{
+ const float v0d = v0[dim];
+ const float v1d = v1[dim];
+
+ /* this point is on left side */
+ if (v0d <= pos)
+ AABB_extend_point(lbounds, v0);
+
+ /* this point is on right side */
+ if (v0d >= pos)
+ AABB_extend_point(rbounds, v0);
+
+ /* the edge crosses the splitting location */
+ if ((v0d < pos && pos < v1d) || (v1d < pos && pos < v0d))
+ {
+ const float f = (pos - v0d) / (v1d - v0d);
+ const float4 c = f * (v1 - v0) + v0;
+ AABB_extend_point(lbounds, c);
+ AABB_extend_point(rbounds, c);
+ }
+}
+
+/*
+
+ This function splits a clipped triangle v0,v1,v2 with bounds prim at
+ position pos in dimension dim and merges the bounds for the left and
+ right clipped triangle fragments into lbounds and rbounds.
+
+ */
+
+GRL_INLINE void splitTriangle(struct AABB *prim,
+ const uint dim,
+ const float pos,
+ const float4 v0,
+ const float4 v1,
+ const float4 v2,
+ struct AABB *lbounds,
+ struct AABB *rbounds)
+{
+ /* clip each triangle edge */
+ splitLine(dim, pos, v0, v1, lbounds, rbounds);
+ splitLine(dim, pos, v1, v2, lbounds, rbounds);
+ splitLine(dim, pos, v2, v0, lbounds, rbounds);
+
+ /* the triangle itself was clipped already, thus clip against triangle bounds */
+ AABB_intersect(lbounds, prim);
+ AABB_intersect(rbounds, prim);
+}
+
+float calculate_priority(struct AABB *prim, global GRL_RAYTRACING_GEOMETRY_DESC *geom)
+{
+ /* calculate projected area of first triangles */
+ const uint primID0 = PRIMREF_primID0(prim);
+ const uint3 tri0 = GRL_load_triangle(geom, primID0);
+ const float4 av0 = GRL_load_vertex(geom, tri0.x);
+ const float4 av1 = GRL_load_vertex(geom, tri0.y);
+ const float4 av2 = GRL_load_vertex(geom, tri0.z);
+ const float area_tri0 = areaProjectedTriangle(av0, av1, av2);
+
+ /* calculate projected area of second triangle */
+ const uint primID1 = PRIMREF_primID1(prim);
+ const uint3 tri1 = GRL_load_triangle(geom, primID1);
+ const float4 bv0 = GRL_load_vertex(geom, tri1.x);
+ const float4 bv1 = GRL_load_vertex(geom, tri1.y);
+ const float4 bv2 = GRL_load_vertex(geom, tri1.z);
+ const float area_tri1 = areaProjectedTriangle(bv0, bv1, bv2);
+
+ /* as priority we use the AABB area */
+ const float area_aabb = AABB_halfArea(prim);
+ float priority = area_aabb;
+
+ /* prefer triangles with a large potential SAH gain. */
+ const float area_tris = area_tri0 + area_tri1;
+ const float area_ratio = min(4.0f, area_aabb / max(1E-12f, area_tris));
+ priority *= area_ratio;
+
+ /* ignore too small primitives */
+ //const float4 size = AABB_size(prim);
+ //const float max_size = max(size.x,max(size.y,size.z));
+ //if (max_size < 0.5f*max_scene_size/GRID_SIZE)
+ // priority = 0.0f;
+
+ return priority;
+}
+
+/*
+
+ This kernel calculates for each primitive an estimated splitting priority.
+
+ */
+
+ GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1))) void kernel compute_num_presplits(global struct Globals *globals,
+ global struct BVHBase* bvh_base,
+ global struct AABB *primref,
+ global struct PresplitItem *presplit,
+ global GRL_RAYTRACING_GEOMETRY_DESC *geomDesc)
+{
+ //assert(sizeof(PresplitItem) == sizeof_PresplitItem);
+
+ /* calculate the range of primitives each work group should process */
+ const uint numPrimitives = globals->numPrimitives;
+ const uint startID = (get_group_id(0) + 0) * numPrimitives / get_num_groups(0);
+ const uint endID = (get_group_id(0) + 1) * numPrimitives / get_num_groups(0);
+
+ /* get scene bounding box size */
+ const float3 scene_size = AABB3f_size(&bvh_base->Meta.bounds);
+ const float max_scene_size = max(scene_size.x, max(scene_size.y, scene_size.z));
+
+ /* each work group iterates over its range of primitives */
+ for (uint i = startID + get_local_id(0); i < endID; i += get_local_size(0))
+ {
+ const uint geomID = PRIMREF_geomID(&primref[i]);
+
+ /* splitting heuristic for triangles */
+ if (GRL_is_triangle(&geomDesc[geomID]))
+ {
+ presplit[i].index = i;
+ presplit[i].priority = calculate_priority(&primref[i], &geomDesc[geomID]);
+ }
+
+ /* splitting of procedurals is not supported */
+ else if (GRL_is_procedural(&geomDesc[geomID]))
+ {
+ presplit[i].index = i;
+ presplit[i].priority = 0.0f;
+ }
+
+ else
+ {
+ //assert(false);
+ }
+ }
+
+ if (get_local_id(0) + get_group_id(0)*get_local_size(0) == 0)
+ globals->numOriginalPrimitives = globals->numPrimitives;
+}
+
+/*
+
+ This kernel computes the sum of all priorities.
+
+ */
+
+ GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(MAX_WORKGROUP_SIZE, 1, 1)))
+__attribute__((intel_reqd_sub_group_size(16))) void kernel
+priority_sum(global struct Globals *globals,
+ global struct PresplitItem *presplit,
+ uint numPrimitivesToSplit)
+{
+ const uint N = globals->numPrimitives;
+ const uint j = get_local_id(0);
+ const uint J = get_local_size(0);
+ const uint BLOCKSIZE = (N + J - 1) / J;
+ const uint start = min((j + 0) * BLOCKSIZE, N);
+ const uint end = min((j + 1) * BLOCKSIZE, N);
+
+ float prioritySum = 0;
+ for (uint i = start; i < end; i++)
+ prioritySum += presplit[i].priority;
+
+ prioritySum = work_group_reduce_add(prioritySum);
+ globals->presplitPrioritySum = prioritySum;
+
+#if 0
+ work_group_barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);
+
+ float scale = 1.0f;
+ for (uint i = 0; i < 10; i++)
+ {
+ //if (j == 0)
+ //printf("prioritySum = %f\n",scale*prioritySum);
+
+ uint numSplits = 0;
+ for (uint i = start; i < end; i++)
+ numSplits += presplit[i].priority / (scale*prioritySum)*numPrimitivesToSplit;
+
+ numSplits = work_group_reduce_add(numSplits);
+
+ if (numSplits > numPrimitivesToSplit)
+ break;
+
+ //if (j == 0)
+ // printf("numSplits = %i (%i)\n",numSplits,numPrimitivesToSplit);
+
+ globals->presplitPrioritySum = scale * prioritySum;
+ scale -= 0.05f;
+ }
+#endif
+}
+
+GRL_INLINE void heapify_down(struct AABB *array, uint size)
+{
+ /* we start at the root */
+ uint cur_node_id = 0;
+ struct AABB *cur_node = array;
+
+ while (true)
+ {
+ int larger_node_id = cur_node_id;
+ struct AABB *larger_node = cur_node;
+
+ /* check if left child is largest */
+ const int left_node_id = 2 * cur_node_id + 1;
+ struct AABB *left_node = &array[left_node_id];
+ if (left_node_id < size && AABB_halfArea(left_node) > AABB_halfArea(larger_node))
+ {
+ larger_node_id = left_node_id;
+ larger_node = left_node;
+ }
+
+ /* check if right child is largest */
+ const int right_node_id = 2 * cur_node_id + 2;
+ struct AABB *right_node = &array[right_node_id];
+ if (right_node_id < size && AABB_halfArea(right_node) > AABB_halfArea(larger_node))
+ {
+ larger_node_id = right_node_id;
+ larger_node = right_node;
+ }
+
+        /* if the current node is the largest, the heap property is fulfilled and we are done */
+ if (larger_node_id == cur_node_id)
+ break;
+
+ /* otherwise we swap cur and largest */
+ struct AABB tmp = *cur_node;
+ *cur_node = *larger_node;
+ *larger_node = tmp;
+
+ /* we continue downwards with the largest node */
+ cur_node_id = larger_node_id;
+ cur_node = larger_node;
+ }
+}
+
+GRL_INLINE void heapify_up(struct AABB *array, uint cur_node_id)
+{
+ /* stop if we start at the root */
+ if (cur_node_id == 0)
+ return;
+
+ struct AABB *cur_node = &array[cur_node_id];
+
+ /* we loop until we reach the root node */
+ while (cur_node_id)
+ {
+ /* get parent node */
+ uint parent_node_id = (cur_node_id - 1) / 2;
+ struct AABB *parent_node = &array[parent_node_id];
+
+        /* if the parent is larger than the current node, the heap property is fulfilled and we can terminate */
+ if (AABB_halfArea(parent_node) > AABB_halfArea(cur_node))
+ break;
+
+ /* otherwise we swap cur and parent */
+ struct AABB tmp = *cur_node;
+ *cur_node = *parent_node;
+ *parent_node = tmp;
+
+ /* and continue upwards */
+ cur_node_id = parent_node_id;
+ cur_node = parent_node;
+ }
+}
+
+/* splits a quad primref */
+GRL_INLINE void splitQuadPrimRef(global GRL_RAYTRACING_GEOMETRY_DESC *geom,
+ struct AABB *cur, uint dim, float fsplit,
+ struct AABB *left, struct AABB *right)
+{
+ /* left and right bounds to compute */
+ AABB_init(left);
+ AABB_init(right);
+
+ /* load first triangle and split it */
+ const uint primID0 = PRIMREF_primID0(cur);
+ const uint3 tri0 = GRL_load_triangle(geom, primID0);
+ const float4 av0 = GRL_load_vertex(geom, tri0.x);
+ const float4 av1 = GRL_load_vertex(geom, tri0.y);
+ const float4 av2 = GRL_load_vertex(geom, tri0.z);
+ splitTriangle(cur, dim, fsplit, av0, av1, av2, left, right);
+
+ /* load second triangle and split it */
+ const uint primID1 = PRIMREF_primID1(cur);
+ const uint3 tri1 = GRL_load_triangle(geom, primID1);
+ const float4 bv0 = GRL_load_vertex(geom, tri1.x);
+ const float4 bv1 = GRL_load_vertex(geom, tri1.y);
+ const float4 bv2 = GRL_load_vertex(geom, tri1.z);
+ splitTriangle(cur, dim, fsplit, bv0, bv1, bv2, left, right);
+
+ /* copy the PrimRef payload into left and right */
+ left->lower.w = cur->lower.w;
+ left->upper.w = cur->upper.w;
+ right->lower.w = cur->lower.w;
+ right->upper.w = cur->upper.w;
+}
+
+/*
+
+ This kernel performs the actual pre-splitting. It selects split
+ locations based on an implicit octree over the scene.
+
+ */
+
+#define USE_HEAP 0
+#define HEAP_SIZE 32u
+
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
+//__attribute__((intel_reqd_sub_group_size(16)))
+void kernel
+perform_presplits(global struct Globals *globals,
+ global struct BVHBase* bvh_base,
+ global struct AABB *primref,
+ global struct PresplitItem *presplit,
+ global char *bvh_mem,
+ global GRL_RAYTRACING_GEOMETRY_DESC *geomDesc,
+ uint numPrimitivesToSplit)
+{
+ /* calculate the range of primitives each work group should process */
+ const uint numPrimitives = globals->numPrimitives;
+ int pstart = globals->numOriginalPrimitives - numPrimitivesToSplit;
+ pstart = max(0, pstart);
+ const uint numPrimitivesToProcess = globals->numPrimitives - pstart;
+ const uint startID = (get_group_id(0) + 0) * numPrimitivesToProcess / get_num_groups(0);
+ const uint endID = (get_group_id(0) + 1) * numPrimitivesToProcess / get_num_groups(0);
+
+ /* calculates the 3D grid */
+ float4 grid_base;
+ grid_base.xyz = AABB3f_load_lower( &bvh_base->Meta.bounds );
+ grid_base.w = 0;
+
+ float4 grid_extend;
+ grid_extend.xyz = AABB3f_size(&bvh_base->Meta.bounds);
+ grid_extend.w=0;
+
+ grid_extend = max(grid_extend.x, max(grid_extend.y, grid_extend.z));
+ const float4 grid_scale = select(GRID_SIZE / grid_extend, 0.0f, grid_extend == 0.0f);
+ const float inv_grid_size = 1.0f / GRID_SIZE;
+
+ /* we have to update centroid bounds */
+ struct AABB centroidBounds;
+ AABB_init(&centroidBounds);
+
+ /* initialize heap */
+ struct AABB heap[HEAP_SIZE];
+ uint heap_size = 0;
+
+ /* each work group iterates over its range of primitives */
+ for (uint j = startID + get_local_id(0); j < endID; j += get_local_size(0))
+ {
+ /* array is in ascending order */
+ //const uint ID = numPrimitives-1-j;
+ const uint ID = pstart + j;
+ const float prob = presplit[ID].priority;
+ const uint i = presplit[ID].index;
+ const uint geomID = PRIMREF_geomID(&primref[i]);
+
+ /* do not split primitives with low splitting priority */
+ if (prob <= 0.0f)
+ continue;
+
+ /* we support splitting only for triangles */
+ if (!GRL_is_triangle(&geomDesc[geomID]))
+ continue;
+
+ /* compute number of split primitives to produce */
+ uint numSplitPrims = prob / globals->presplitPrioritySum * numPrimitivesToSplit;
+ numSplitPrims = min(HEAP_SIZE, numSplitPrims);
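+        // the global split budget (numPrimitivesToSplit) is distributed across primitives
+        // in proportion to their priority, capped at the local heap capacity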
+
+        /* stop if no splits need to be performed */
+ if (numSplitPrims <= 1)
+ continue;
+
+ /* add primref to heap */
+ heap[0] = primref[i];
+ heap_size = 1;
+ uint heap_pos = 0;
+
+ /* iterate until all splits are done */
+ uint prims = 1;
+ uint last_heap_size = heap_size;
+ while (prims < numSplitPrims)
+ {
+ /* map the primitive bounds to the grid */
+ const float4 lower = heap[heap_pos].lower;
+ const float4 upper = heap[heap_pos].upper;
+ const float4 glower = (lower - grid_base) * grid_scale + 0.2f;
+ const float4 gupper = (upper - grid_base) * grid_scale - 0.2f;
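+            // Note: the +/-0.2f offsets shrink the box slightly in grid coordinates,
+            // presumably so a box that only grazes a cell boundary does not select
+            // that boundary as a split plane.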
+ uint4 ilower = convert_uint4_rtz(glower);
+ uint4 iupper = convert_uint4_rtz(gupper);
+
+ /* this ignores dimensions that are empty */
+ if (glower.x >= gupper.x)
+ iupper.x = ilower.x;
+ if (glower.y >= gupper.y)
+ iupper.y = ilower.y;
+ if (glower.z >= gupper.z)
+ iupper.z = ilower.z;
+
+ /* Now we compute a morton code for the lower and upper grid
+ * coordinates. */
+ const uint lower_code = bitInterleave3D(ilower);
+ const uint upper_code = bitInterleave3D(iupper);
+
+ /* if all bits are equal then we cannot split */
+ if (lower_code == upper_code)
+ {
+#if !USE_HEAP
+ prims++; // !!!!!!!
+
+ heap_pos++;
+ if (heap_pos == last_heap_size)
+ {
+ heap_pos = 0;
+ last_heap_size = heap_size;
+ }
+ continue;
+#else
+ if (heap_size == 1)
+ break;
+
+ const uint offset = numPrimitives + atomic_add(&globals->numSplittedPrimitives, 1);
+ primref[offset] = heap[heap_pos];
+
+ presplit[offset].index = offset;
+ presplit[offset].priority = calculate_priority(&heap[heap_pos], &geomDesc[geomID]);
+
+ heap[0] = heap[--heap_size];
+ heapify_down(heap, heap_size);
+ continue;
+#endif
+ }
+
+ /* We find the bit position of the first differing bit from the
+ * top down. This bit indicates a split position inside an
+ * implicit octree. */
+ const uint diff = 31 - clz(lower_code ^ upper_code);
+
+ /* compute octree level and dimension to perform the split in */
+ const uint level = diff / 3;
+ const uint dim = diff % 3;
+
+ /* now we compute the grid position of the split */
+ const uint isplit = iupper[dim] & ~((1 << level) - 1);
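+            // Illustrative example: diff = 7 gives level = 2 and dim = 1 (the y axis),
+            // so isplit = iupper.y & ~3, i.e. the split snaps to a multiple of 4 grid
+            // cells (an octree cell boundary at that level).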
+
+ /* compute world space position of split */
+ const float fsplit = grid_base[dim] + isplit * inv_grid_size * grid_extend[dim];
+
+ /* split primref into left and right part */
+ struct AABB left, right;
+ splitQuadPrimRef(&geomDesc[geomID], &heap[heap_pos], dim, fsplit, &left, &right);
+ prims++;
+
+ /* update centroid bounds */
+ AABB_extend_point(&centroidBounds, AABB_centroid2(&left));
+ AABB_extend_point(&centroidBounds, AABB_centroid2(&right));
+
+#if !USE_HEAP
+
+ heap[heap_pos] = left;
+ heap[heap_size] = right;
+ heap_size++;
+
+ heap_pos++;
+ if (heap_pos == last_heap_size)
+ {
+ heap_pos = 0;
+ last_heap_size = heap_size;
+ }
+#else
+
+ /* insert left element into heap */
+ heap[0] = left;
+ heapify_down(heap, heap_size);
+
+ /* insert right element into heap */
+ heap[heap_size] = right;
+ heapify_up(heap, heap_size);
+
+ heap_size++;
+#endif
+ }
+
+        /* copy primitives back to the primref array */
+ primref[i] = heap[0];
+
+ presplit[ID].index = i;
+ presplit[ID].priority = calculate_priority(&heap[0], &geomDesc[geomID]);
+
+ for (uint k = 1; k < heap_size; k++)
+ {
+ const uint offset = numPrimitives + atomic_add(&globals->numSplittedPrimitives, 1);
+ primref[offset] = heap[k];
+
+ presplit[offset].index = offset;
+ presplit[offset].priority = calculate_priority(&heap[k], &geomDesc[geomID]);
+ }
+ }
+
+ /* merge centroid bounds into global bounds */
+ centroidBounds = AABB_sub_group_reduce(&centroidBounds);
+ if (get_sub_group_local_id() == 0)
+ AABB_global_atomic_merge(&globals->centroidBounds, &centroidBounds);
+
+ work_group_barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);
+
+ /* update number of primitives on finish */
+ if (Globals_OnFinish(globals))
+ {
+ globals->numPrimitives = globals->numPrimitives + globals->numSplittedPrimitives;
+ globals->numSplittedPrimitives = 0;
+
+ /* update first build record */ // FIXME: should be done in builder itself
+ global struct BuildRecord *record = (global struct BuildRecord *)(bvh_mem + bvh_base->quadLeafStart*64);
+ record->end = globals->numPrimitives;
+ }
+}
diff --git a/src/intel/vulkan/grl/gpu/bvh_build_primref.cl b/src/intel/vulkan/grl/gpu/bvh_build_primref.cl
new file mode 100644
index 00000000000..1dd9a3cdd92
--- /dev/null
+++ b/src/intel/vulkan/grl/gpu/bvh_build_primref.cl
@@ -0,0 +1,674 @@
+//
+// Copyright (C) 2009-2021 Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+//
+//
+
+#include "api_interface.h"
+#include "common.h"
+#include "instance.h"
+
+#include "bvh_build_primref.h"
+
+//#pragma OPENCL EXTENSION cl_khr_subgroup_non_uniform_vote : enable
+//int sub_group_non_uniform_any(int predicate);
+
+#define WINDOW_SIZE 16
+
+/* Representation of two merged triangles. */
+struct QuadIndices
+{
+ uint primID0, primID1;
+ uint v0, v1, v2, v3;
+};
+
+/*
+
+ This function calculates a PrimRef from a merged quad and writes
+ this PrimRef to memory.
+
+ */
+GRL_INLINE void create_prim_ref(const uint geomID,
+ const struct QuadIndices quad,
+ global GRL_RAYTRACING_GEOMETRY_DESC *geomDesc,
+ struct AABB *geometryBounds,
+ struct AABB *centroidBounds,
+ global uint *numPrimitives,
+ global struct AABB *primref)
+{
+
+ /* load quad vertices */
+ const float4 vtx0 = GRL_load_vertex(geomDesc, quad.v0); // FIXME: these multiple load_vertex calls should get merged
+ const float4 vtx1 = GRL_load_vertex(geomDesc, quad.v1);
+ const float4 vtx2 = GRL_load_vertex(geomDesc, quad.v2);
+ const float4 vtx3 = GRL_load_vertex(geomDesc, quad.v3);
+
+ /* calculate bounds for quad */
+ float4 lower = min(min(vtx0, vtx1), min(vtx2, vtx3));
+ float4 upper = max(max(vtx0, vtx1), max(vtx2, vtx3));
+
+ /* extend geometry and centroid bounds */
+ const float4 centroid2 = lower + upper;
+ AABB_extendlu(geometryBounds, lower, upper);
+ AABB_extendlu(centroidBounds, centroid2, centroid2);
+
+ PrimRef ref;
+ PRIMREF_setAABB( &ref, lower.xyz, upper.xyz );
+ PRIMREF_setQuadMetaData( &ref, quad.primID0, quad.primID1, geomID, GRL_get_Flags( geomDesc ) );
+
+ /* store primref to memory */
+ const uint offset = atomic_add_global(numPrimitives, 1);
+ primref[offset] = ref;
+}
+
+/*
+
+ This function calculates a PrimRef from a procedural primitive and writes
+ this PrimRef to memory.
+
+ */
+GRL_INLINE void create_prim_ref_procedural(global GRL_RAYTRACING_GEOMETRY_DESC *geomDesc,
+ const uint geomID,
+ const uint primID,
+ struct AABB *geometryBounds,
+ struct AABB *centroidBounds,
+ global uint *numPrimitives,
+ global struct AABB *primref)
+{
+ /* load aabb from memory */
+ struct GRL_RAYTRACING_AABB aabb = GRL_load_aabb(&geomDesc[geomID], primID);
+
+ /* extend geometry and centroid bounds */
+ float4 lower = (float4)(aabb.MinX, aabb.MinY, aabb.MinZ, 0.0f);
+ float4 upper = (float4)(aabb.MaxX, aabb.MaxY, aabb.MaxZ, 0.0f);
+ const float4 centroid2 = lower + upper;
+ AABB_extendlu(geometryBounds, lower, upper);
+ AABB_extendlu(centroidBounds, centroid2, centroid2);
+
+ /* encode geomID, primID */
+ uint geomFlags = GRL_get_Flags(&geomDesc[geomID]);
+
+ PrimRef ref;
+ PRIMREF_setAABB( &ref, lower.xyz, upper.xyz );
+ PRIMREF_setProceduralMetaData( &ref, geomID, primID, geomFlags );
+
+ /* store primref to memory */
+ const uint offset = atomic_add_global(numPrimitives, 1);
+ primref[offset] = ref;
+}
+
+/*
+
+ This function performs a binary search to calculate the geomID and
+ primID of the i'th primitive of the scene. The search uses a
+ prefix_sum array that stores at each location j the total number of
+ primitives of all meshes k with k < j.
+
+*/
+
+struct GeomPrimID
+{
+ uint geomID, primID;
+};
+
+struct GeomPrimID binary_search_geomID_primID(global uint *prefix_sum, const uint prefix_sum_size, const uint i)
+{
+ uint l = 0;
+ uint r = prefix_sum_size;
+ uint k = 0;
+
+ while (r - l > 1)
+ {
+ const uint m = (l + r) / 2;
+ k = prefix_sum[m];
+ if (k <= i)
+ {
+ l = m;
+ }
+ else if (i < k)
+ {
+ r = m;
+ }
+ }
+
+ struct GeomPrimID id;
+ id.geomID = l;
+ id.primID = i - prefix_sum[l];
+ return id;
+}
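+
+/*
+ Illustrative example (not taken from the build flow): with
+ prefix_sum = {0, 4, 10} and prefix_sum_size = 3, primitive i = 6 falls
+ into mesh 1 because prefix_sum[1] = 4 <= 6 < prefix_sum[2] = 10, so the
+ function returns geomID = 1 and primID = 6 - 4 = 2.
+ */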
+
+/*
+
+ Checks if a vertex contains only finite floating point numbers.
+
+ */
+
+GRL_INLINE bool isfinite_vertex(float4 vtx)
+{
+ return isfinite(vtx.x) && isfinite(vtx.y) && isfinite(vtx.z);
+}
+
+
+/*
+ Create primrefs from array of instance descriptors.
+ */
+ GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
+__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH))) void kernel
+primrefs_from_DXR_instances(global struct Globals *globals,
+ global struct BVHBase* bvh,
+ global __const struct GRL_RAYTRACING_INSTANCE_DESC* instances,
+ uint numInstances,
+ global struct AABB *primrefs,
+ uint allowUpdate)
+{
+ const uint instanceIndex = get_sub_group_local_id() + get_group_id(0) * MAX_HW_SIMD_WIDTH;
+ if (instanceIndex < numInstances)
+ {
+ global __const struct GRL_RAYTRACING_INSTANCE_DESC* instance = instances + instanceIndex;
+
+ primrefs_from_instances(
+ globals,
+ bvh,
+ instance,
+ instanceIndex,
+ primrefs,
+ 0,
+ allowUpdate);
+ }
+}
+
+/*
+ Create primrefs from array of instance descriptors.
+ */
+ GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
+void kernel
+primrefs_from_DXR_instances_indirect(global struct Globals *globals,
+ global struct BVHBase* bvh,
+ global __const struct GRL_RAYTRACING_INSTANCE_DESC* instances,
+ global struct IndirectBuildRangeInfo* indirect_data,
+ global struct AABB *primrefs,
+ uint allowUpdate)
+{
+ // TODO: On DG2, we have 8 dwords of 'inline data' which can be pushed
+ // directly to the kernel. The rest of the kernel args are pulled using
+ // loads from memory. It may be more efficient to put 'numInstances' and
+ // 'allowUpdate' into 'globals'
+
+ const uint instanceIndex = get_local_id(0) + get_group_id(0) * MAX_HW_SIMD_WIDTH;
+
+ if (instanceIndex < indirect_data->primitiveCount)
+ {
+ instances = (global __const struct GRL_RAYTRACING_INSTANCE_DESC*)
+ (((global char*)instances) + indirect_data->primitiveOffset);
+ global __const struct GRL_RAYTRACING_INSTANCE_DESC* instance = instances + instanceIndex;
+ primrefs_from_instances(
+ globals,
+ bvh,
+ instance,
+ instanceIndex,
+ primrefs,
+ 0,
+ allowUpdate);
+ }
+}
+
+/*
+ Create primrefs from array of pointers to instance descriptors.
+ */
+ GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
+__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH))) void kernel
+primrefs_from_DXR_instances_pointers(global struct Globals *globals,
+ global struct BVHBase* bvh,
+ global void *instances_in,
+ uint numInstances,
+ global struct AABB *primrefs,
+ uint allowUpdate)
+{
+ global const struct GRL_RAYTRACING_INSTANCE_DESC **instances =
+ (global const struct GRL_RAYTRACING_INSTANCE_DESC **)instances_in;
+
+ const uint instanceIndex = get_sub_group_local_id() + get_group_id(0) * MAX_HW_SIMD_WIDTH;
+ if (instanceIndex < numInstances)
+ {
+ global __const struct GRL_RAYTRACING_INSTANCE_DESC* instance = instances[instanceIndex];
+
+ primrefs_from_instances(
+ globals,
+ bvh,
+ instance,
+ instanceIndex,
+ primrefs,
+ 0,
+ allowUpdate);
+ }
+}
+
+/*
+ Create primrefs from array of pointers to instance descriptors.
+ */
+ GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
+void kernel
+primrefs_from_DXR_instances_pointers_indirect(global struct Globals *globals,
+ global struct BVHBase* bvh,
+ global void *instances_in,
+ global struct AABB *primrefs,
+ global struct IndirectBuildRangeInfo* indirect_data,
+ uint allowUpdate)
+{
+ global const struct GRL_RAYTRACING_INSTANCE_DESC **instances =
+ (global const struct GRL_RAYTRACING_INSTANCE_DESC **)instances_in;
+
+ const uint instanceIndex = get_local_id(0) + get_group_id(0) * MAX_HW_SIMD_WIDTH;
+
+ if (instanceIndex < indirect_data->primitiveCount)
+ {
+ instances = (global const struct GRL_RAYTRACING_INSTANCE_DESC**)
+ (((global char*)instances) + indirect_data->primitiveOffset);
+ global __const struct GRL_RAYTRACING_INSTANCE_DESC* instance = instances[instanceIndex];
+
+ primrefs_from_instances(
+ globals,
+ bvh,
+ instance,
+ instanceIndex,
+ primrefs,
+ 0,
+ allowUpdate);
+ }
+}
+
+
+///////////////////////////////////////////////////////////////////////////////////////////
+///////////////////////////////////////////////////////////////////////////////////////////
+///////////////////////////////////////////////////////////////////////////////////////////
+///////////////////////////////////////////////////////////////////////////////////////////
+
+bool can_pair( uint3 a, uint3 b )
+{
+ bool match0 = any( a.xxx == b.xyz ) ? 1 : 0;
+ bool match1 = any( a.yyy == b.xyz ) ? 1 : 0;
+ bool match2 = any( a.zzz == b.xyz ) ? 1 : 0;
+ return (match0 + match1 + match2) >= 2;
+}
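+
+/*
+ Example: triangles a = (0,1,2) and b = (2,1,3) share the two vertices
+ 1 and 2, so can_pair() returns true and the two triangles may be merged
+ into one quad; a = (0,1,2) and b = (3,4,5) share no vertex and remain
+ separate primitives.
+ */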
+
+void reduce_bounds(
+ float3 lower,
+ float3 upper,
+ global struct Globals* globals,
+ global struct BVHBase* bvh )
+{
+
+ // reduce centroid bounds... make sure to exclude lanes with invalid AABBs
+ float3 cent = lower + upper;
+ float3 cent_lower = select( (float3)(INFINITY, INFINITY, INFINITY), cent, lower <= upper);
+ float3 cent_upper = select(-(float3)(INFINITY, INFINITY, INFINITY), cent, lower <= upper);
+
+ // reduce geo bounds
+ AABB3f_atomic_merge_global_sub_group_lu( &bvh->Meta.bounds, lower, upper );
+ AABB_global_atomic_merge_sub_group_lu(&globals->centroidBounds, cent_lower, cent_upper );
+}
+
+
+struct TriState
+{
+ bool valid;
+ uint prim_index;
+ uint pairing;
+ uint3 indices;
+ float3 lower;
+ float3 upper;
+};
+
+#define NOT_PAIRED 0xffffffff
+
+void load_triangle_data(uniform global char* index_buffer,
+ uniform const uint index_format,
+ uniform global char* vertex_buffer,
+ uniform const uint vertex_format,
+ uniform const uint vertex_stride,
+ uniform global float* transform_buffer,
+ uniform uint total_vert_count,
+ struct TriState* state,
+ float4* v)
+{
+ state->indices = GRL_load_indices_from_buffer(index_buffer, index_format, state->prim_index );
+
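+ /* clamp the indices to the last valid vertex so the loads below cannot
+ read past the end of the vertex buffer; the unclamped indices are still
+ validated against total_vert_count by the caller */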
+ const uint last_vertex = total_vert_count - 1;
+ const uint x = min(state->indices.x, last_vertex);
+ const uint y = min(state->indices.y, last_vertex);
+ const uint z = min(state->indices.z, last_vertex);
+
+ GRL_load_triangle_vertices(vertex_buffer, vertex_format, vertex_stride, transform_buffer, x, y, z, v);
+}
+
+struct TriState load_triangle( uniform global GRL_RAYTRACING_GEOMETRY_DESC* geomDesc,
+ uniform uint base,
+ uniform uint num_prims,
+ uniform uint total_vert_count )
+{
+
+ struct TriState state;
+ state.pairing = NOT_PAIRED;
+ state.valid = false;
+ state.prim_index = base + get_sub_group_local_id();
+ state.lower = (float3)(INFINITY, INFINITY, INFINITY);
+ state.upper = -(float3)(INFINITY, INFINITY, INFINITY);
+
+ if (state.prim_index < num_prims)
+ {
+ state.valid = true;
+ float4 v[3];
+ load_triangle_data((global char*)geomDesc->Desc.Triangles.pIndexBuffer,
+ geomDesc->Desc.Triangles.IndexFormat,
+ (global char*)geomDesc->Desc.Triangles.pVertexBuffer,
+ geomDesc->Desc.Triangles.VertexFormat,
+ geomDesc->Desc.Triangles.VertexBufferByteStride,
+ (global float*)geomDesc->Desc.Triangles.pTransformBuffer,
+ total_vert_count,
+ &state,
+ v);
+
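+ /* reject the triangle if any index is out of range, any vertex is
+ non-finite, or two indices are equal (degenerate triangle) */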
+ if (state.indices.x >= total_vert_count || state.indices.y >= total_vert_count || state.indices.z >= total_vert_count ||
+ !isfinite_vertex(v[0]) || !isfinite_vertex(v[1]) || !isfinite_vertex(v[2]) ||
+ state.indices.x == state.indices.y || state.indices.x == state.indices.z || state.indices.y == state.indices.z)
+ {
+ state.valid = false;
+ }
+ else
+ {
+ state.lower.xyz = min(v[2].xyz, min(v[1].xyz, v[0].xyz));
+ state.upper.xyz = max(v[2].xyz, max(v[1].xyz, v[0].xyz));
+ }
+ }
+ return state;
+}
+
+void broadcast_triangles_local( struct TriState* state )
+{
+ varying uint my_prim = state->prim_index;
+ varying uint my_pairing = state->pairing;
+ varying float3 my_lower = state->lower;
+ varying float3 my_upper = state->upper;
+ varying bool valid = state->valid;
+ varying uint3 indices = state->indices;
+
+ for (uniform uint broadcast_lane = 0; broadcast_lane < get_sub_group_size(); broadcast_lane++)
+ {
+ // don't broadcast invalid prims
+ if ( !sub_group_broadcast( valid, broadcast_lane ) )
+ continue;
+
+ uint broadcast_pairing = sub_group_broadcast(my_pairing, broadcast_lane);
+ uint broadcast_prim = sub_group_broadcast(my_prim, broadcast_lane);
+
+ if (broadcast_pairing == NOT_PAIRED)
+ {
+ // if the broadcast prim is not paired already, all unpaired lanes attempt to pair with it
+ bool pairable = false;
+ uint3 other_indices = sub_group_broadcast_uint3( indices, broadcast_lane );
+ if (broadcast_prim != my_prim && my_pairing == NOT_PAIRED && valid )
+ {
+ pairable = can_pair( indices, other_indices );
+ }
+
+
+ uint pairable_lane = ctz(intel_sub_group_ballot(pairable));
+ if (valid && pairable_lane < get_sub_group_size())
+ {
+ // pair the broadcast primitive with the first lane that can accept it
+ float3 broadcast_lower = sub_group_broadcast_float3(my_lower.xyz, broadcast_lane);
+ float3 broadcast_upper = sub_group_broadcast_float3(my_upper.xyz, broadcast_lane);
+ if (get_sub_group_local_id() == pairable_lane)
+ {
+ my_pairing = broadcast_prim;
+ my_lower.xyz = min(my_lower.xyz, broadcast_lower);
+ my_upper.xyz = max(my_upper.xyz, broadcast_upper);
+ }
+
+ // pair the broadcast primitive with the lane that was paired to it
+ uint pairable_prim = sub_group_broadcast(my_pairing, pairable_lane);
+ if (get_sub_group_local_id() == broadcast_lane)
+ {
+ my_pairing = pairable_prim;
+ }
+ }
+ }
+ else
+ {
+ //
+ // if this lane was already paired with the broadcasting tri
+ // in an earlier loop iteration, then record the pairing in this lane's registers
+ float3 broadcast_lower = sub_group_broadcast_float3(my_lower.xyz, broadcast_lane);
+ float3 broadcast_upper = sub_group_broadcast_float3(my_upper.xyz, broadcast_lane);
+ if (broadcast_pairing == my_prim)
+ {
+ my_pairing = broadcast_prim;
+ my_lower.xyz = min(my_lower.xyz, broadcast_lower);
+ my_upper.xyz = max(my_upper.xyz, broadcast_upper);
+ }
+ }
+ }
+
+ state->pairing = my_pairing;
+ state->lower = my_lower;
+ state->upper = my_upper;
+}
+
+
+void broadcast_triangles_nonlocal(struct TriState* state, const struct TriState* other )
+{
+ varying uint my_prim = state->prim_index;
+ varying uint my_pairing = state->pairing;
+ varying float3 my_lower = state->lower;
+ varying float3 my_upper = state->upper;
+ varying bool valid = state->valid;
+ varying uint3 indices = state->indices;
+
+ for (uniform uint broadcast_lane = 0; broadcast_lane < get_sub_group_size(); broadcast_lane++)
+ {
+ // don't broadcast invalid prims
+ if (!sub_group_broadcast(other->valid, broadcast_lane))
+ continue;
+
+ uint broadcast_pairing = sub_group_broadcast(other->pairing, broadcast_lane);
+ uint broadcast_prim = sub_group_broadcast(other->prim_index, broadcast_lane);
+
+ if (broadcast_pairing == NOT_PAIRED)
+ {
+ // if the broadcast prim is not paired already, all unpaired lanes attempt to pair with it
+ bool pairable = false;
+ if ( my_pairing == NOT_PAIRED && valid )
+ {
+ uint3 other_indices = sub_group_broadcast_uint3(other->indices, broadcast_lane);
+ pairable = can_pair(indices, other_indices);
+ }
+
+ // pair the broadcast primitive with the first lane that can accept it
+ uint pairable_mask = intel_sub_group_ballot(pairable);
+ if (valid && (ctz(pairable_mask) == get_sub_group_local_id()))
+ {
+ my_pairing = broadcast_prim;
+ my_lower.xyz = min(my_lower.xyz, sub_group_broadcast_float3(other->lower.xyz, broadcast_lane));
+ my_upper.xyz = max(my_upper.xyz, sub_group_broadcast_float3(other->upper.xyz, broadcast_lane));
+ }
+ }
+
+ }
+
+ state->pairing = my_pairing;
+ state->lower = my_lower;
+ state->upper = my_upper;
+}
+
+GRL_INLINE void do_triangles_to_primrefs(
+ global struct Globals* globals,
+ global struct BVHBase* bvh,
+ global struct AABB* primref,
+ global GRL_RAYTRACING_GEOMETRY_DESC* geomDesc,
+ uint geomID_and_flags,
+ const uint num_prims)
+{
+ uint geomID = geomID_and_flags & 0x00ffffff;
+ uint geom_flags = geomID_and_flags >> 24;
+ uint prim_base = get_group_id(0) * get_local_size(0);
+ uint total_vert_count = GRL_get_triangles_VertexCount(geomDesc);
+
+ struct TriState tri = load_triangle( geomDesc, prim_base, num_prims, total_vert_count );
+ broadcast_triangles_local( &tri );
+
+
+ // we will produce output if the lane holds an unpaired triangle (tri.pairing == NOT_PAIRED)
+ // or, for a merged pair, on the lane whose partner has the larger prim_index
+ bool will_write = (tri.pairing > tri.prim_index) && tri.valid;
+ uint write_mask = intel_sub_group_ballot(will_write);
+ uint write_offs = subgroup_bit_prefix_exclusive( write_mask );
+ uint write_count = popcount(write_mask);
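+
+ // Worked example of the compaction above: if will_write is set on lanes
+ // {0, 2, 3} of the subgroup, write_mask has bits 0, 2 and 3 set,
+ // write_offs evaluates to 0, 1 and 2 on those lanes, write_count is 3,
+ // and the single atomic below reserves three consecutive primref slots.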
+
+ // allocate space in primref buffer
+ uint write_base;
+ if( get_sub_group_local_id() == 0 )
+ write_base = atomic_add_global( &globals->numPrimitives, write_count );
+ write_offs += sub_group_broadcast( write_base, 0 );
+
+ uint primID0 = tri.prim_index;
+ uint primID1 = (tri.pairing != NOT_PAIRED) ? tri.pairing : tri.prim_index;
+
+ if (will_write)
+ {
+ PrimRef ref;
+ PRIMREF_setAABB(&ref, tri.lower.xyz, tri.upper.xyz);
+ PRIMREF_setQuadMetaData(&ref, primID0, primID1, geomID, geom_flags);
+ uint8 val = (uint8)(
+ as_uint(ref.lower.x), as_uint(ref.lower.y), as_uint(ref.lower.z), as_uint(ref.lower.w),
+ as_uint(ref.upper.x), as_uint(ref.upper.y), as_uint(ref.upper.z), as_uint(ref.upper.w));
+ store_uint8_L1WB_L3WB((global uint8*)(primref + write_offs), 0, val);
+ }
+
+ reduce_bounds( tri.lower, tri.upper, globals, bvh );
+}
+
+
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(16, 1, 1)))
+__attribute__((intel_reqd_sub_group_size(16))) void kernel
+triangles_to_primrefs(
+ global struct Globals* globals,
+ global struct BVHBase* bvh,
+ global struct AABB* primref,
+ global GRL_RAYTRACING_GEOMETRY_DESC* geomDesc,
+ uint geomID_and_flags,
+ uint num_prims
+ )
+{
+ do_triangles_to_primrefs(globals, bvh, primref, geomDesc, geomID_and_flags, num_prims);
+}
+
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(16, 1, 1)))
+void kernel
+triangles_to_primrefs_indirect(
+ global struct Globals* globals,
+ global struct BVHBase* bvh,
+ global struct AABB* primref,
+ global GRL_RAYTRACING_GEOMETRY_DESC* geomDesc,
+ global struct IndirectBuildRangeInfo* indirect_data,
+ uint geomID_and_flags)
+{
+ const uint num_prims = indirect_data->primitiveCount;
+ do_triangles_to_primrefs(globals, bvh, primref, geomDesc, geomID_and_flags, num_prims);
+}
+
+GRL_INLINE void do_procedurals_to_primrefs(
+ global struct Globals* globals,
+ global struct BVHBase* bvh,
+ global struct AABB* primref,
+ global GRL_RAYTRACING_GEOMETRY_DESC* geomDesc,
+ uint geomID_and_flags,
+ const uint num_prims)
+{
+ uint geomID = geomID_and_flags & 0x00ffffff;
+ uint geomFlags = geomID_and_flags >> 24;
+
+ uint primID = get_group_id(0) * get_local_size(0) + get_sub_group_local_id();
+
+ bool create_primref = false;
+ float3 lower = (float3)(INFINITY, INFINITY, INFINITY);
+ float3 upper = -(float3)(INFINITY, INFINITY, INFINITY);
+ if (primID < num_prims)
+ {
+ /* check if procedural is valid */
+ struct GRL_RAYTRACING_AABB aabb = GRL_load_aabb(geomDesc, primID);
+ const bool valid_min = isfinite(aabb.MinX) && isfinite(aabb.MinY) && isfinite(aabb.MinZ);
+ const bool valid_max = isfinite(aabb.MaxX) && isfinite(aabb.MaxY) && isfinite(aabb.MaxZ);
+ if (valid_min & valid_max)
+ {
+ /* load aabb from memory */
+ float3 l = (float3)(aabb.MinX, aabb.MinY, aabb.MinZ);
+ float3 u = (float3)(aabb.MaxX, aabb.MaxY, aabb.MaxZ);
+
+ // canonicalize boxes with swapped min/max by taking the component-wise min/max
+ lower = min( l, u );
+ upper = max( l, u );
+
+ create_primref = true;
+ }
+ }
+
+ uint write_mask = intel_sub_group_ballot(create_primref);
+ uint write_offs = subgroup_bit_prefix_exclusive(write_mask);
+ uint write_count = popcount(write_mask);
+
+ // allocate space in primref buffer
+ uint write_base;
+ if (get_sub_group_local_id() == 0)
+ write_base = atomic_add_global(&globals->numPrimitives, write_count);
+ write_offs += sub_group_broadcast(write_base, 0);
+
+ // write the primref
+ if (create_primref)
+ {
+ PrimRef ref;
+ PRIMREF_setAABB(&ref, lower.xyz, upper.xyz);
+ PRIMREF_setProceduralMetaData(&ref, geomID, primID, geomFlags);
+ primref[write_offs] = ref;
+ }
+
+ reduce_bounds(lower, upper, globals, bvh);
+
+}
+
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(16, 1, 1)))
+__attribute__((intel_reqd_sub_group_size(16))) void kernel
+procedurals_to_primrefs(
+ global struct Globals* globals,
+ global struct BVHBase* bvh,
+ global struct AABB* primref,
+ global GRL_RAYTRACING_GEOMETRY_DESC* geomDesc,
+ uint geomID_and_flags,
+ uint num_prims
+ )
+{
+ do_procedurals_to_primrefs(globals, bvh, primref, geomDesc, geomID_and_flags, num_prims);
+}
+
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(16, 1, 1)))
+void kernel
+procedurals_to_primrefs_indirect(
+ global struct Globals* globals,
+ global struct BVHBase* bvh,
+ global struct AABB* primref,
+ global GRL_RAYTRACING_GEOMETRY_DESC* geomDesc,
+ global const struct IndirectBuildRangeInfo* indirect_data,
+ uint geomID_and_flags
+ )
+{
+ const uint num_prims = indirect_data->primitiveCount;
+ do_procedurals_to_primrefs(globals, bvh, primref, geomDesc, geomID_and_flags, num_prims);
+}
diff --git a/src/intel/vulkan/grl/gpu/bvh_build_primref.h b/src/intel/vulkan/grl/gpu/bvh_build_primref.h
new file mode 100644
index 00000000000..25e2d3df194
--- /dev/null
+++ b/src/intel/vulkan/grl/gpu/bvh_build_primref.h
@@ -0,0 +1,246 @@
+//
+// Copyright (C) 2009-2021 Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+//
+//
+
+#if 0
+/*
+
+Create primrefs from array of instance descriptors.
+
+*/
+
+void store_instance_primref(
+ global struct BVHBase* top_bvh,
+ global struct Globals* globals,
+ global PrimRef* primrefs,
+ bool alloc_primref,
+ PrimRef new_primref )
+{
+ uint allocatePrimref = alloc_primref ? 1 : 0;
+ uint index = 0;
+ uint numAllocations = sub_group_reduce_add(allocatePrimref);
+
+ if (get_sub_group_local_id() == 0)
+ {
+ index = atomic_add_global(&globals->numPrimitives, numAllocations);
+ }
+
+ index = sub_group_broadcast(index, 0);
+ index = index + sub_group_scan_exclusive_add(allocatePrimref);
+
+ if (allocatePrimref)
+ {
+ primrefs[index] = new_primref;
+ }
+
+ struct AABB centroidBounds;
+ centroidBounds.lower = centroidBounds.upper = AABB_centroid2(&new_primref);
+ struct AABB subgroup_bbox = AABB_sub_group_reduce(&new_primref);
+ struct AABB subgroup_CentroidBounds = AABB_sub_group_reduce(&centroidBounds);
+
+ if (get_sub_group_local_id() == 0)
+ {
+ AABB3f_atomic_merge_global_lu(&top_bvh->Meta.bounds, subgroup_bbox.lower.xyz, subgroup_bbox.upper.xyz);
+ AABB_global_atomic_merge(&globals->centroidBounds, &subgroup_CentroidBounds);
+ }
+}
+
+
+
+// Compute transformed blas AABB. Returns false if instance is degenerate
+bool create_instance_primref(
+ PrimRef* ref_out,
+ global __const struct GRL_RAYTRACING_INSTANCE_DESC* instance,
+ global struct BVHBase* bvh,
+ uint instanceMask,
+ uint instanceIndex
+ )
+{
+ struct AABB3f bbox;
+ bool alloc_primref = false;
+ uint rootNodeOffset = NO_NODE_OFFSET;
+ if (bvh != 0)
+ {
+ alloc_primref = true;
+ AABB3f AS_bounds = BVHBase_GetRootAABB(bvh);
+
+ const bool valid_min = isfinite(AS_bounds.lower[0]) && isfinite(AS_bounds.lower[1]) && isfinite(AS_bounds.lower[2]);
+ const bool valid_max = isfinite(AS_bounds.upper[0]) && isfinite(AS_bounds.upper[1]) && isfinite(AS_bounds.upper[2]);
+
+ if (!valid_min || !valid_max || instanceMask == 0)
+ {
+ // degenerate instance case
+
+ // TODO this should be under if ( allocate backpointers )
+ {
+ // we have to allocate the primref because this instance can be updated to non-degenerate
+ // take the origin of the instance as a bounding box.
+
+ bbox.lower[0] = instance->Transform[3];
+ bbox.lower[1] = instance->Transform[7];
+ bbox.lower[2] = instance->Transform[11];
+ bbox.upper[0] = instance->Transform[3];
+ bbox.upper[1] = instance->Transform[7];
+ bbox.upper[2] = instance->Transform[11];
+ instanceMask = 0;
+ }
+ }
+ else
+ {
+ rootNodeOffset = BVH_ROOT_NODE_OFFSET;
+ float transformOverhead = 0.0f;
+ bbox = compute_xfm_bbox(instance->Transform, BVHBase_GetRootNode(bvh), XFM_BOX_NOT_REFINED_TAKE_CLIPBOX, &AS_bounds, transformOverhead);
+ }
+ }
+
+ *ref_out = PRIMREF_set_instance(AABB3f_load_lower(&bbox), AABB3f_load_upper(&bbox), instanceIndex, instanceMask, rootNodeOffset, 0);
+ return alloc_primref;
+}
+
+GRL_INLINE void primrefs_from_instances(
+ global struct Globals* globals,
+ global struct BVHBase* top_bvh,
+ global __const struct GRL_RAYTRACING_INSTANCE_DESC* instance,
+ uint instanceIndex,
+ global struct AABB* primrefs)
+{
+ bool alloc_primref = false;
+ PrimRef new_primref;
+ AABB_init(&new_primref);
+
+ if (instance)
+ {
+ uint mask = GRL_get_InstanceMask(instance);
+ global struct BVHBase* bvh = (global struct BVHBase*)instance->AccelerationStructure;
+ alloc_primref = create_instance_primref(&new_primref, instance, bvh, mask, instanceIndex);
+ }
+
+ store_instance_primref(top_bvh, globals, primrefs, alloc_primref, new_primref);
+}
+#endif
+
+#if 1
+GRL_INLINE void primrefs_from_instances(
+ global struct Globals* globals,
+ global struct BVHBase* top_bvh,
+ global __const struct GRL_RAYTRACING_INSTANCE_DESC* instance,
+ uint instanceIndex,
+ global struct AABB* primrefs,
+ global GRL_RAYTRACING_AABB* procedural_aabb,
+ uint allowUpdate
+ )
+{
+ struct AABB3f bbox;
+ uint allocatePrimref = 0;
+
+ uint rootNodeOffset = NO_NODE_OFFSET;
+ uint instanceMask = 0;
+
+ bool is_procedural = (procedural_aabb != 0);
+
+ if( instance )
+ {
+ instanceMask = GRL_get_InstanceMask(instance) ;
+ if ( is_procedural )
+ {
+ // procedural instance primref
+ allocatePrimref = 1;
+
+ float3 lower = (float3)(procedural_aabb->MinX, procedural_aabb->MinY, procedural_aabb->MinZ);
+ float3 upper = (float3)(procedural_aabb->MaxX, procedural_aabb->MaxY, procedural_aabb->MaxZ);
+
+ if (instanceMask == 0 || any(lower > upper))
+ {
+ bbox.lower[0] = instance->Transform[3];
+ bbox.lower[1] = instance->Transform[7];
+ bbox.lower[2] = instance->Transform[11];
+ bbox.upper[0] = instance->Transform[3];
+ bbox.upper[1] = instance->Transform[7];
+ bbox.upper[2] = instance->Transform[11];
+ instanceMask = 0;
+ }
+ else
+ {
+ bbox = transform_aabb(lower, upper, instance->Transform);
+ }
+ }
+ else
+ {
+ // HW-instance primref
+
+ global struct BVHBase* bvh = instance ?
+ (global struct BVHBase*)instance->AccelerationStructure :
+ 0;
+
+ if (bvh != 0)
+ {
+ AABB3f AS_bounds = BVHBase_GetRootAABB(bvh);
+
+ const bool valid_min = isfinite(AS_bounds.lower[0]) && isfinite(AS_bounds.lower[1]) && isfinite(AS_bounds.lower[2]);
+ const bool valid_max = isfinite(AS_bounds.upper[0]) && isfinite(AS_bounds.upper[1]) && isfinite(AS_bounds.upper[2]);
+
+
+ if (valid_min && valid_max && instanceMask != 0)
+ {
+ allocatePrimref = 1;
+ rootNodeOffset = BVH_ROOT_NODE_OFFSET;
+ float transformOverhead = 0.0f;
+ bbox = compute_xfm_bbox(instance->Transform, BVHBase_GetRootNode(bvh), XFM_BOX_NOT_REFINED_TAKE_CLIPBOX, &AS_bounds, transformOverhead);
+ }
+ else if (allowUpdate)
+ {
+ // degenerate instance case
+ // we have to allocate the primref because this instance can be updated to non-degenerate
+ // take the origin of the instance as a bounding box.
+ allocatePrimref = 1;
+ bbox.lower[0] = instance->Transform[3];
+ bbox.lower[1] = instance->Transform[7];
+ bbox.lower[2] = instance->Transform[11];
+ bbox.upper[0] = instance->Transform[3];
+ bbox.upper[1] = instance->Transform[7];
+ bbox.upper[2] = instance->Transform[11];
+ instanceMask = 0;
+ }
+ }
+ }
+ }
+
+ uint index = 0;
+ uint numAllocations = sub_group_reduce_add(allocatePrimref);
+
+ if (get_sub_group_local_id() == 0)
+ {
+ index = atomic_add_global(&globals->numPrimitives, numAllocations);
+ }
+
+ index = sub_group_broadcast(index, 0);
+ index = index + sub_group_scan_exclusive_add(allocatePrimref);
+
+ struct AABB new_primref;
+ struct AABB centroidBounds;
+ if (allocatePrimref)
+ {
+ new_primref = PRIMREF_set_instance(AABB3f_load_lower(&bbox), AABB3f_load_upper(&bbox), instanceIndex, instanceMask, rootNodeOffset, is_procedural);
+ primrefs[index] = new_primref;
+ centroidBounds.lower = centroidBounds.upper = AABB_centroid2(&new_primref);
+ }
+ else
+ {
+ AABB_init(&new_primref);
+ AABB_init(&centroidBounds);
+ }
+
+
+ struct AABB subgroup_bbox = AABB_sub_group_reduce(&new_primref);
+ struct AABB subgroup_CentroidBounds = AABB_sub_group_reduce(&centroidBounds);
+
+ if (get_sub_group_local_id() == 0)
+ {
+ AABB3f_atomic_merge_global_lu(&top_bvh->Meta.bounds, subgroup_bbox.lower.xyz, subgroup_bbox.upper.xyz);
+ AABB_global_atomic_merge(&globals->centroidBounds, &subgroup_CentroidBounds);
+ }
+}
+#endif
diff --git a/src/intel/vulkan/grl/gpu/bvh_build_refit.cl b/src/intel/vulkan/grl/gpu/bvh_build_refit.cl
new file mode 100644
index 00000000000..bcda2fa54ec
--- /dev/null
+++ b/src/intel/vulkan/grl/gpu/bvh_build_refit.cl
@@ -0,0 +1,491 @@
+//
+// Copyright (C) 2009-2021 Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+//
+//
+
+#include "bvh_build_refit.h"
+#include "api_interface.h"
+#include "common.h"
+
+
+
+
+
+#if 0
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__( (reqd_work_group_size( 16, 1, 1 )) )
+void kernel
+update_instance_leaves( global struct BVHBase* bvh,
+ uint64_t dxrInstancesArray,
+ uint64_t dxrInstancesPtr,
+ global struct AABB3f* instance_aabb_scratch
+)
+{
+ uint num_leaves = BVHBase_GetNumHWInstanceLeaves( bvh );
+ uint id = get_local_id( 0 ) + get_local_size( 0 ) * get_group_id( 0 );
+ if ( id >= num_leaves )
+ return;
+
+ global struct GRL_RAYTRACING_INSTANCE_DESC* instancesArray =
+ (global struct GRL_RAYTRACING_INSTANCE_DESC*)dxrInstancesArray;
+ global struct GRL_RAYTRACING_INSTANCE_DESC** instancesPtrArray =
+ (global struct GRL_RAYTRACING_INSTANCE_DESC**)dxrInstancesPtr;
+
+ global struct HwInstanceLeaf* leafs = (global struct HwInstanceLeaf*) BVHBase_GetHWInstanceLeaves( bvh );
+
+ /* iterate over all children of the instance node and get their bounds */
+
+ uint32_t instanceIdx = HwInstanceLeafPart1_getInstanceIndex( &leafs[id] );
+ global struct GRL_RAYTRACING_INSTANCE_DESC* instance = NULL;
+ if ( dxrInstancesArray != NULL )
+ instance = &instancesArray[instanceIdx];
+ else
+ instance = instancesPtrArray[instanceIdx];
+
+ struct AffineSpace3f xfm = AffineSpace3f_load_row_major( instance->Transform );
+ global struct BVHBase* instanceBvh = (global struct BVHBase*)instance->AccelerationStructure;
+ struct AABB3f newSubtreeBounds = instanceBvh->Meta.bounds;
+ struct AABB3f bbox = AABB3f_transform( xfm, newSubtreeBounds ); // JDB TODO: Use faster abs-matrix method
+
+ const bool valid_min = isfinite( bbox.lower[0] ) && isfinite( bbox.lower[1] ) && isfinite( bbox.lower[2] );
+ const bool valid_max = isfinite( bbox.upper[0] ) && isfinite( bbox.upper[1] ) && isfinite( bbox.upper[2] );
+
+ uint mask = GRL_get_InstanceMask(instance);
+
+ uint offset = instanceBvh->rootNodeOffset;
+ if ( !valid_min || !valid_max )
+ {
+ bbox.lower[0] = xfm.p.x;
+ bbox.lower[1] = xfm.p.y;
+ bbox.lower[2] = xfm.p.z;
+ bbox.upper[0] = xfm.p.x;
+ bbox.upper[1] = xfm.p.y;
+ bbox.upper[2] = xfm.p.z;
+ offset = NO_NODE_OFFSET;
+ mask = 0;
+ }
+
+ instance_aabb_scratch[id] = bbox;
+
+ HwInstanceLeaf_Constructor( &leafs[id], instance, instanceIdx, offset, mask ); // TODO: No instance opening for refittable BVH
+}
+#endif
+
+
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(16, 1, 1)))
+void kernel
+update_instance_leaves(global struct BVHBase* bvh,
+ uint64_t dxrInstancesArray,
+ uint64_t dxrInstancesPtr,
+ global struct AABB3f* instance_aabb_scratch
+)
+{
+ uint num_leaves = BVHBase_GetNumHWInstanceLeaves(bvh);
+ uint id = get_local_id(0) + get_local_size(0) * get_group_id(0);
+ if (id >= num_leaves)
+ return;
+
+ DO_update_instance_leaves(
+ bvh,
+ dxrInstancesArray,
+ dxrInstancesPtr,
+ instance_aabb_scratch,
+ id,
+ 0 );
+}
+
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(16, 1, 1)))
+void kernel
+update_instance_leaves_indirect(global struct BVHBase* bvh,
+ uint64_t dxrInstancesArray,
+ uint64_t dxrInstancesPtr,
+ global struct AABB3f* instance_aabb_scratch,
+ global struct IndirectBuildRangeInfo* indirect_data)
+{
+ uint num_leaves = BVHBase_GetNumHWInstanceLeaves(bvh);
+ uint id = get_local_id(0) + get_local_size(0) * get_group_id(0);
+ if (id >= num_leaves)
+ return;
+
+ DO_update_instance_leaves(
+ bvh,
+ dxrInstancesArray + indirect_data->primitiveOffset,
+ dxrInstancesPtr,
+ instance_aabb_scratch,
+ id,
+ 0 );
+}
+
+#if 0
+/*
+
+ This kernel refits a BVH. The algorithm iterates over all BVH nodes
+ to find the leaf nodes, which is where refitting starts. For these
+ leaf nodes the bounds get recalculated and then propagated up the tree.
+
+ One kernel instance considers a range of inner nodes as startpoints.
+ */
+ GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(8, 1, 1))) void kernel refit(
+ global struct BVHBase *bvh,
+ global GRL_RAYTRACING_GEOMETRY_DESC* geosArray,
+ global struct AABB3f* instance_leaf_aabbs )
+{
+ /* here we temporarily store the bounds for the children of a node */
+ struct AABB childrenAABB[BVH_NODE_N6];
+
+ /* get pointer to inner nodes and back pointers */
+ global struct QBVHNodeN *inner_nodes = BVHBase_rootNode(bvh);
+ BackPointers* backPointers = BVHBase_GetBackPointers(bvh);
+
+ /* construct range of nodes that each work group will process */
+ const uint numInnerNodes = BVHBase_numNodes(bvh);
+ const uint startID = (get_group_id(0) + 0) * numInnerNodes / get_num_groups(0);
+ const uint endID = (get_group_id(0) + 1) * numInnerNodes / get_num_groups(0);
+
+ /* each workgroup iterates over its range of nodes */
+ for (uint i = startID + get_local_id(0); i < endID; i += get_local_size(0))
+ {
+ global struct QBVHNodeN* curNode = &inner_nodes[i];
+ uint numChildren = refit_bottom(bvh, geosArray,
+ instance_leaf_aabbs,
+ curNode,
+ childrenAABB,
+ *InnerNode_GetBackPointer(backPointers, i));
+ if (numChildren != 0)
+ {
+ /* update bounds of node */
+ QBVHNodeN_setBounds(curNode, childrenAABB, numChildren);
+
+ /* refit upper parts of the BVH */
+ // TODO: this will not work for mixed nodes
+ refit_bottom_up(curNode, bvh, childrenAABB, numChildren);
+ }
+ }
+}
+
+
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(8, 1, 1)))
+void kernel Find_refit_treelets(
+ global struct BVHBase* bvh,
+ global TreeletNodeData* treelets,
+ global uint* scratchStartpoints,
+ global uint* startpointAlloc)
+{
+ find_refit_treelets(bvh,
+ treelets,
+ scratchStartpoints,
+ startpointAlloc);
+}
+
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(16, 1, 1)))
+void kernel Assign_refit_startpoints_to_treelets(
+ global struct BVHBase* bvh,
+ global TreeletNodeData* treelets,
+ global uint* scratchStartpoints)
+{
+ assign_refit_startpoints_to_treelets(bvh, treelets, scratchStartpoints);
+}
+
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(128, 1, 1)))
+__attribute__((intel_reqd_sub_group_size(16)))
+void kernel Finalize_treelets_in_groups(
+ global struct BVHBase* bvh,
+ global uint* scratchStartpoints )
+{
+ local uint depths[FINALIZE_TREELETS_SLM_DEPTHS_SPACE];
+
+ finalize_treelets_in_groups(bvh, scratchStartpoints, depths);
+}
+
+
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(256, 1, 1)))
+__attribute__((intel_reqd_sub_group_size(16)))
+void kernel Refit_quads_tree_per_group(global SquashedInput* psqinputs)
+{
+ uint group_id = get_group_id(0);
+ SquashedInput sqinput = psqinputs[group_id];
+ global struct BVHBase* bvh = sqinput.pBvh;
+ uint numLeaves = BVHBase_GetNumQuads(bvh);
+ global QuadLeaf* leafs = (global QuadLeaf*)BVHBase_GetQuadLeaves(bvh);
+
+ global void* input = sqinput.pInput;
+ global struct AABB* bbox_scratch = sqinput.bbox_scratch;
+
+ uint leafsIndexOffset = bvh->quadLeafStart - BVH_ROOT_NODE_OFFSET / 64;
+ global GRL_RAYTRACING_GEOMETRY_DESC* geosArray = (global GRL_RAYTRACING_GEOMETRY_DESC*) input;
+ uint id = get_local_id(0);
+
+ for (uint leaf_id = id; leaf_id < numLeaves; leaf_id += get_local_size(0))
+ {
+ struct AABB theAABB;
+ refit_bottom_child_quad(leafs + leaf_id, geosArray, &theAABB);
+ theAABB.lower.w = as_float(0xABBADEFFu);
+ bbox_scratch[leafsIndexOffset + leaf_id] = theAABB;
+ }
+}
+
+
+
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(32, 1, 1)))
+__attribute__((intel_reqd_sub_group_size(16)))
+void kernel Refit_quads(
+ global struct BVHBase* bvh,
+ global void* input,
+ global struct AABB* bbox_scratch,
+ uint numGroupsExecuted,
+ global SquashedInputGroupDesc* sqinput)
+{
+ uint numLeafs = BVHBase_GetNumQuads(bvh);
+ if (numLeafs == 0) return;
+ global QuadLeaf* leafs = (global QuadLeaf*)BVHBase_GetQuadLeaves(bvh);
+
+ global GRL_RAYTRACING_GEOMETRY_DESC* geosArray = (global GRL_RAYTRACING_GEOMETRY_DESC*) input;
+ uint leafsIndexOffset = bvh->quadLeafStart - BVH_ROOT_NODE_OFFSET / 64;
+
+ uint numLeafsPerGr = (numLeafs + (numGroupsExecuted - 1)) / numGroupsExecuted;
+
+ uint id_start = get_group_id(0) * numLeafsPerGr + get_local_id(0);
+ uint id_end = min(id_start + numLeafsPerGr, numLeafs);
+ for (uint id = id_start; id < id_end; id+= get_local_size(0))
+ {
+ struct AABB theAABB;
+ refit_bottom_child_quad(leafs + id, geosArray, &theAABB);
+ theAABB.lower.w = as_float(0xABBADEFFu);
+ bbox_scratch[leafsIndexOffset + id] = theAABB;
+ }
+
+ if (get_group_id(0) == 0 && get_local_id(0) < 16)
+ {
+
+ uint groupnr;
+ uint treeletCnt = *BVHBase_GetRefitTreeletCntPtr(bvh);
+ if (get_sub_group_local_id() == 0) {
+ groupnr = atomic_add_global(&sqinput->totalNumGroups, treeletCnt);
+ }
+ groupnr = sub_group_broadcast(groupnr, 0);
+ for (uint subtree = get_sub_group_local_id(); subtree < treeletCnt; subtree += get_sub_group_size())
+ {
+ uint gr = groupnr + subtree;
+ //printf("tree %llx, treelet %d/%d, grId %d, numStartpoints %d\n", bvh, subtree,treeletCnt, gr, BVHBase_GetRefitTreeletDescs(bvh)[subtree].numStartpoints);
+ sqinput[gr].bvh = (qword)bvh;
+ sqinput[gr].scratch = (qword)bbox_scratch;
+ sqinput[gr].groupInTree = subtree;
+ }
+ //if (get_local_id(0)==0 && treeletCnt > 1)
+ //{
+ // printf("tree %llx, tip treelet %d/%d = numStartpoints %d depth %d\n", bvh, treeletCnt, treeletCnt, BVHBase_GetRefitTreeletDescs(bvh)[treeletCnt].numStartpoints, BVHBase_GetRefitTreeletDescs(bvh)[treeletCnt].maxDepth);
+ //}
+ }
+}
+
+
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(256, 1, 1)))
+__attribute__((intel_reqd_sub_group_size(16)))
+void kernel
+Refit_tree_per_group_quad(
+ global SquashedInput* psqinputs)
+{
+ uint group_id = get_group_id(0);
+ SquashedInput sqinput = psqinputs[group_id];
+ global struct BVHBase* bvh = sqinput.pBvh;
+ global struct AABB* bbox_scratch = sqinput.bbox_scratch;
+ global void* pInput = sqinput.pInput;
+ local Treelet_by_single_group_locals loc;
+
+ if (*BVHBase_GetRefitTreeletCntPtr(bvh) == 0)
+ return;
+
+#if REFIT_DEBUG_CHECKS
+ uint bottoms_cnt = *BVHBase_GetRefitTreeletCntPtr(bvh);
+ if (bottoms_cnt != 1) {
+ if (get_local_id(0) == 0)
+ {
+ printf("Error: this tree has more than 1 treelets!\n");
+ }
+ return;
+ }
+#endif
+
+ /* get pointer to inner nodes and back pointers */
+ uniform global struct QBVHNodeN* inner_nodes = BVHBase_rootNode(bvh);
+
+ // uniform per group
+ uniform RefitTreelet* pTrltDsc = BVHBase_GetRefitTreeletDescs(bvh);
+
+ uint numLeafs = bvh->quadLeafCur - bvh->quadLeafStart;
+
+ if (numLeafs == 0) { return; }
+
+ uint numLeafsByOneThread = (numLeafs + (get_local_size(0) - 1)) / get_local_size(0);
+
+ update_quads(bvh, pInput, bbox_scratch, get_local_id(0), numLeafsByOneThread);
+
+ mem_fence_workgroup_default(); work_group_barrier(0);
+
+ RefitTreelet trltDsc = *pTrltDsc;
+
+ refit_treelet_by_single_group(
+ bbox_scratch,
+ &loc,
+ bvh,
+ trltDsc,
+ false,
+ true);
+
+ if (trltDsc.maxDepth > 0)
+ {
+ mem_fence_workgroup_default(); work_group_barrier(0);
+ post_refit_encode_qnode_tree_per_group(bbox_scratch,bvh);
+ }
+}
+
+
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(256, 1, 1)))
+__attribute__((intel_reqd_sub_group_size(16)))
+void kernel
+Refit_treelet_per_group(
+ global SquashedInputGroupDesc* sqinput)
+{
+ uint group_id = get_group_id(0);
+ global struct AABB* bbox_scratch = (global struct AABB* )sqinput[group_id].scratch;
+ global struct BVHBase* bvh = (global struct BVHBase* )sqinput[group_id].bvh;
+ group_id = sqinput[group_id].groupInTree;
+
+ /* get pointer to inner nodes and back pointers */
+ uniform global struct QBVHNodeN* inner_nodes = BVHBase_rootNode(bvh);
+
+ uint bottoms_cnt = *BVHBase_GetRefitTreeletCntPtr(bvh);
+
+ // uniform per group
+ uniform RefitTreelet* pTrltDsc = BVHBase_GetRefitTreeletDescs(bvh);
+
+ bool should_we_process_treetip = true;
+ local Treelet_by_single_group_locals loc;
+ local bool* l_should_we_process_treetip = (local bool*)&loc;
+#if REFIT_VERBOSE_LOG
+ if (group_id != 0) return;
+#endif
+
+ if (bottoms_cnt > 1)
+ {
+#if REFIT_VERBOSE_LOG
+ for (; group_id < bottoms_cnt; group_id++)
+ {
+ if (get_local_id(0) == 0) { printf("\n ====== treelet %d ====== \n", group_id); }
+ work_group_barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE, memory_scope_device);
+#endif
+ bool rootProcThread = refit_treelet_by_single_group(
+ bbox_scratch,
+ &loc,
+ bvh,
+ pTrltDsc[group_id],
+ true,
+ false);
+
+ // the last group that finishes has to go up and process the treetip
+ if (rootProcThread)
+ {
+
+ mem_fence_gpu_invalidate();
+ uint finished_cnt = atomic_inc_global((global uint*) & bvh->refitTreeletCnt2);
+ should_we_process_treetip = finished_cnt + 1 == bottoms_cnt;
+
+ * l_should_we_process_treetip = should_we_process_treetip;
+
+ if (should_we_process_treetip) mem_fence_gpu_invalidate();
+ }
+#if REFIT_VERBOSE_LOG
+ }
+#endif
+ work_group_barrier(CLK_LOCAL_MEM_FENCE, memory_scope_work_group);
+
+ should_we_process_treetip = *l_should_we_process_treetip;
+ }
+
+ if (should_we_process_treetip)
+ {
+ //this group will process treetip
+ if (get_local_id(0) == 0) { bvh->refitTreeletCnt2 = 0; }
+ if (bottoms_cnt == 1) { bottoms_cnt = 0; }
+ refit_treelet_by_single_group(
+ bbox_scratch,
+ &loc,
+ bvh,
+ pTrltDsc[bottoms_cnt],
+ true,
+ true);
+ }
+}
+
+/*
+ This kernel refits a BVH. The algorithm iterates over all BVH nodes
+ to find the leaf nodes, which is where refitting starts. For these
+ leaf nodes the bounds get recalculated and then propagated up the tree.
+
+ One kernel instance considers exactly one inner node as a startpoint,
+ not a range of inner nodes.
+ */
+ GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(8, 1, 1))) void kernel
+Refit_per_one_startpoint(
+ global struct BVHBase* bvh,
+ global GRL_RAYTRACING_GEOMETRY_DESC* geosArray,
+ global struct AABB3f* instance_leaf_aabbs )
+{
+ /* here we temporarily store the bounds for the children of a node */
+ struct AABB childrenAABB[BVH_NODE_N6];
+
+ /* get pointer to inner nodes and back pointers */
+ global struct QBVHNodeN* inner_nodes = BVHBase_rootNode(bvh);
+ BackPointers* backPointers = BVHBase_GetBackPointers(bvh);
+
+ /* get the inner node that we will consider as a bottom startpoint */
+ const uint numInnerNodes = BVHBase_numNodes(bvh);
+ const uint innerNodeIdx = (get_group_id(0) + 0) * get_local_size(0) + get_local_id(0);
+
+ if (innerNodeIdx >= numInnerNodes) return;
+
+ global struct QBVHNodeN* curNode = &inner_nodes[innerNodeIdx];
+ uint numChildren = refit_bottom(
+ bvh,
+ geosArray,
+ instance_leaf_aabbs,
+ curNode,
+ childrenAABB,
+ *InnerNode_GetBackPointer(backPointers, innerNodeIdx));
+
+ if (numChildren != 0)
+ {
+ /* update bounds of node */
+ QBVHNodeN_setBounds(curNode, childrenAABB, numChildren);
+
+ /* refit upper parts of the BVH */
+ /* TODO: this will not work for mixed nodes */
+ refit_bottom_up(curNode, bvh, childrenAABB, numChildren);
+ }
+}
+
+#endif
+
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(SG_REFIT_WG_SIZE, 1, 1))) void kernel
+Refit_indirect_sg(
+ global struct BVHBase* bvh,
+ global GRL_RAYTRACING_GEOMETRY_DESC* geosArray,
+ global struct AABB3f* instance_leaf_aabbs)
+{
+ DO_Refit_per_one_startpoint_sg(bvh, geosArray, instance_leaf_aabbs, 0);
+
+}
diff --git a/src/intel/vulkan/grl/gpu/bvh_build_refit.h b/src/intel/vulkan/grl/gpu/bvh_build_refit.h
new file mode 100644
index 00000000000..522a44b23a7
--- /dev/null
+++ b/src/intel/vulkan/grl/gpu/bvh_build_refit.h
@@ -0,0 +1,546 @@
+//
+// Copyright (C) 2009-2021 Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+//
+//
+
+#pragma once
+
+#include "common.h"
+#include "api_interface.h"
+#include "instance.h"
+#include "GRLGen12.h"
+#include "libs/lsc_intrinsics.h"
+
+
+__attribute__((reqd_work_group_size(16, 1, 1)))
+void kernel
+DO_update_instance_leaves(global struct BVHBase* bvh,
+ uint64_t dxrInstancesArray,
+ uint64_t dxrInstancesPtr,
+ global struct AABB3f* instance_aabb_scratch,
+ uint id ,
+ global struct GRL_RAYTRACING_AABB* procedural_box
+)
+{
+
+ global struct GRL_RAYTRACING_INSTANCE_DESC* instancesArray =
+ (global struct GRL_RAYTRACING_INSTANCE_DESC*)dxrInstancesArray;
+ global struct GRL_RAYTRACING_INSTANCE_DESC** instancesPtrArray =
+ (global struct GRL_RAYTRACING_INSTANCE_DESC**)dxrInstancesPtr;
+
+ global struct HwInstanceLeaf* leafs = (global struct HwInstanceLeaf*) BVHBase_GetHWInstanceLeaves(bvh);
+
+
+ /* iterate over all children of the instance node and get their bounds */
+
+ uint32_t instanceIdx = HwInstanceLeafPart1_getInstanceIndex(&leafs[id]);
+ global struct GRL_RAYTRACING_INSTANCE_DESC* instance = NULL;
+ if (dxrInstancesArray != NULL)
+ instance = &instancesArray[instanceIdx];
+ else
+ instance = instancesPtrArray[instanceIdx];
+
+ uint mask = GRL_get_InstanceMask(instance);
+ uint offset = NO_NODE_OFFSET;
+
+ struct AffineSpace3f xfm = AffineSpace3f_load_row_major(instance->Transform);
+ struct AABB3f bbox;
+
+ if (procedural_box != 0)
+ {
+ bbox.lower[0] = procedural_box->MinX;
+ bbox.lower[1] = procedural_box->MinY;
+ bbox.lower[2] = procedural_box->MinZ;
+ bbox.upper[0] = procedural_box->MaxX;
+ bbox.upper[1] = procedural_box->MaxY;
+ bbox.upper[2] = procedural_box->MaxZ;
+ }
+ else
+ {
+ global struct BVHBase* instanceBvh = (global struct BVHBase*)instance->AccelerationStructure;
+ bbox = instanceBvh->Meta.bounds;
+ offset = BVH_ROOT_NODE_OFFSET;
+ }
+
+
+ const bool valid_min = isfinite(bbox.lower[0]) && isfinite(bbox.lower[1]) && isfinite(bbox.lower[2]);
+ const bool valid_max = isfinite(bbox.upper[0]) && isfinite(bbox.upper[1]) && isfinite(bbox.upper[2]);
+
+ if (!valid_min || !valid_max )
+ {
+ bbox.lower[0] = xfm.p.x;
+ bbox.lower[1] = xfm.p.y;
+ bbox.lower[2] = xfm.p.z;
+ bbox.upper[0] = xfm.p.x;
+ bbox.upper[1] = xfm.p.y;
+ bbox.upper[2] = xfm.p.z;
+ offset = NO_NODE_OFFSET;
+ mask = 0;
+ }
+ else
+ {
+ bbox = AABB3f_transform(xfm, bbox); // JDB TODO: Use faster abs-matrix method
+ }
+
+ instance_aabb_scratch[id] = bbox;
+
+ HwInstanceLeaf_Constructor(&leafs[id], instance, instanceIdx, offset, mask); // TODO: No instance opening for refittable BVH
+}
+
+/*
+ This function starts at some BVH node and refits all nodes upwards
+ to the root. At each node the algorithm only proceeds upwards once
+ all children of the current node have been processed. This is
+ tracked with a per-node atomic counter that is incremented every
+ time the node is reached; traversal continues upwards only once the
+ counter reaches the number of children of the node.
+ */
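+
+/*
+ Back-pointer encoding assumed by the code below (as read from the bit
+ operations in this file): bits [2:0] count the already-refitted children,
+ bits [5:3] store the total number of children, and bits [31:6] store the
+ index of the parent node, with 0x03FFFFFF marking the root.
+ */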
+
+GRL_INLINE void refit_bottom_up(global struct QBVHNodeN *qnode_start, // start node to refit (already processed)
+ global struct BVHBase *bvh, // pointer to BVH
+ struct AABB *childrenAABB, // temporary data to use
+ uint numChildrenTotal)
+{
+ global struct QBVHNodeN *nodeData = BVHBase_nodeData(bvh);
+ BackPointers* backPointers = BVHBase_GetBackPointers(bvh);
+
+ /* compute the index of the start node */
+ uint curNodeIndex = qnode_start - nodeData;
+
+ /* the start node was already processed, so go to its parent node */
+ curNodeIndex = *InnerNode_GetBackPointer(backPointers,curNodeIndex) >> 6;
+
+ /* end at root node */
+ while (curNodeIndex != 0x03FFFFFF)
+ {
+ /* increment refit counter that counts refitted children of current node */
+ const uint parentPointer = 1 + atomic_inc_global( (__global uint *) InnerNode_GetBackPointer(backPointers, curNodeIndex));
+
+ /* if all children got refitted, then continue */
+ const uint numChildrenRefitted = (parentPointer >> 0) & 0x7;
+ numChildrenTotal = (parentPointer >> 3) & 0x7;
+ if (numChildrenRefitted != numChildrenTotal)
+ return;
+
+ /* reset refit counter for next refit */
+ *InnerNode_GetBackPointer(backPointers, curNodeIndex) &= 0xfffffff8;
+
+ /* get bounds of all children from child nodes directly */
+ global struct QBVHNodeN *qnode = nodeData + curNodeIndex;
+ global struct QBVHNodeN *qnode_child = (global struct QBVHNodeN *)QBVHNodeN_childrenPointer(qnode);
+ for (uint k = 0; k < numChildrenTotal; k++)
+ childrenAABB[k] = getAABB_QBVHNodeN(qnode_child + k);
+
+ /* update node bounds of all children */
+ QBVHNodeN_setBounds(qnode, childrenAABB, numChildrenTotal);
+
+ write_mem_fence(CLK_GLOBAL_MEM_FENCE);
+
+ /* make parent node the current node */
+ curNodeIndex = parentPointer >> 6;
+ }
+
+ /* update QBVH6 bounds */
+ struct AABB bounds;
+ AABB_init(&bounds);
+
+ for (uint i = 0; i < numChildrenTotal; i++)
+ AABB_extend(&bounds, &childrenAABB[i]);
+
+ setBVHBaseBounds(bvh, &bounds);
+}
+
+
+GRL_INLINE void SUBGROUP_refit_bottom_up(
+ uniform global struct QBVHNodeN* qnode_start, // start node to refit (already processed)
+ uniform global struct BVHBase* bvh, // pointer to BVH
+ varying struct AABB reduce_bounds,
+ uniform uint numChildrenTotal,
+ varying ushort lane,
+ varying ushort head_lane)
+{
+ uniform global struct QBVHNodeN* nodeData = BVHBase_nodeData( bvh );
+ uniform BackPointers* backPointers = BVHBase_GetBackPointers( bvh );
+
+ /* compute the index of the start node */
+ uniform uint curNodeIndex = qnode_start - nodeData;
+
+ /* the start node was already processed, so go to its parent node */
+ uniform curNodeIndex = *InnerNode_GetBackPointer(backPointers, curNodeIndex) >> 6;
+
+ varying struct AABB childrenAABB;
+
+ /* end at root node */
+ while ( curNodeIndex != 0x03FFFFFF )
+ {
+ mem_fence_gpu_invalidate();
+
+ /* increment refit counter that counts refitted children of current node */
+ uniform uint parentPointer = 1;
+ if (lane == 0)
+ {
+ // acquire fence ensures that all previous writes complete before the atomic starts
+ parentPointer += atomic_inc_global((__global uint *)InnerNode_GetBackPointer(backPointers, curNodeIndex));
+ }
+
+ parentPointer = intel_sub_group_shuffle( parentPointer, head_lane );
+
+ /* if all children got refitted, then continue */
+ uniform uint numChildrenRefitted = (parentPointer >> 0) & 0x7;
+ numChildrenTotal = (parentPointer >> 3) & 0x7;
+ if ( numChildrenRefitted != numChildrenTotal )
+ return;
+
+ /* reset refit counter for next refit */
+ if (lane == 0)
+ {
+ *InnerNode_GetBackPointer(backPointers, curNodeIndex) = (parentPointer & 0xfffffff8);
+ }
+
+ /* get bounds of all children from child nodes directly */
+ global struct QBVHNodeN* qnode = nodeData + curNodeIndex;
+ global struct QBVHNodeN* qnode_child = (global struct QBVHNodeN*)QBVHNodeN_childrenPointer( qnode );
+
+ varying ushort child_idx = (lane < numChildrenTotal) ? lane : 0;
+ childrenAABB = getAABB_QBVHNodeN( qnode_child + child_idx );
+
+ /* update node bounds of all children */
+ reduce_bounds = AABB_sub_group_reduce_N6( &childrenAABB );
+ reduce_bounds = AABB_sub_group_shuffle( &reduce_bounds, head_lane );
+
+ subgroup_QBVHNodeN_setBounds(qnode, reduce_bounds, childrenAABB, numChildrenTotal, lane);
+
+ /* update node mask */
+ uchar childrenMask = qnode_child[child_idx].instMask;
+
+ qnode->instMask = sub_group_reduce_or_N6(childrenMask);
+
+ /* make parent node the current node */
+ curNodeIndex = parentPointer >> 6;
+ }
+
+ /* update QBVH6 bounds */
+
+ if( lane == 0 )
+ setBVHBaseBounds( bvh, &reduce_bounds );
+}
+
+
+GRL_INLINE void quadCopyVertices(
+ const struct QuadLeaf* pQuad,
+ struct QuadLeaf* newQuad)
+{
+ const uint4* s = (const uint4*) & (pQuad->v[0][0]);
+ uint4* d = (uint4*) & (newQuad->v[0][0]);
+ const uint8* s2 = (const uint8*)(s+1);
+ uint8* d2 = (uint8*)(d+1);
+ *d = *s;
+ *d2 = *s2;
+}
+
+
+GRL_INLINE void get_updated_quad(
+ global const struct QuadLeaf* pQuad,
+ global GRL_RAYTRACING_GEOMETRY_DESC* geomDescs,
+ struct QuadLeaf* newQuad)
+{
+ struct QuadLeaf tempQuad;
+
+ // fetch non vtx data;
+ {
+ uint4* tempQuad4U = (uint4*)&tempQuad;
+ global const uint4* pQuad4U = (global const uint4*)pQuad;
+ *tempQuad4U = *pQuad4U;
+ }
+
+ /* get the geomID and primID0/1 for both quad triangles */
+ const uint geomID = PrimLeaf_GetGeoIndex(&tempQuad.leafDesc);
+ const uint primID0 = tempQuad.primIndex0;
+ const uint primID1 = tempQuad.primIndex0 + QuadLeaf_GetPrimIndexDelta(&tempQuad);
+ ushort fourth_vert = 0;
+
+ if (primID1 != primID0)
+ {
+ ushort packed_indices = QuadLeaf_GetSecondTriangleIndices(&tempQuad);
+ fourth_vert = ((packed_indices & 0x0C) == 0x0C) ? 1 : fourth_vert;
+ fourth_vert = ((packed_indices & 0x30) == 0x30) ? 2 : fourth_vert;
+ }
+
+ global GRL_RAYTRACING_GEOMETRY_DESC* desc = geomDescs + geomID;
+
+ uint4 indices = GRL_load_quad_indices(desc, primID0, primID1, fourth_vert);
+
+ // read the indices of the 4 verts we want
+ float3 vtx0, vtx1, vtx2, vtx3;
+ GRL_load_quad_vertices(desc, &vtx0, &vtx1, &vtx2, &vtx3, indices);
+
+ QuadLeaf_SetVertices(&tempQuad, vtx0, vtx1, vtx2, vtx3);
+
+ *newQuad = tempQuad;
+}
+
+// This calculates the children BBs for inner nodes whose children are *all* leaves.
+// Mixed nodes will be updated by the bottom-up pass instead.
+GRL_INLINE uint refit_bottom( global struct BVHBase* bvh,
+ global GRL_RAYTRACING_GEOMETRY_DESC* geomDesc,
+ global struct AABB3f* instance_leaf_aabbs,
+ global struct QBVHNodeN* curNode,
+ struct AABB *childrenAABB,
+ uint backPointer)
+{
+ uint numChildren = 0;
+
+ /* we start refit at leaf nodes, this case is for quad nodes */
+ if (curNode->type == BVH_QUAD_NODE)
+ {
+ global struct QuadLeaf* quads = (global struct QuadLeaf*)QBVHNodeN_childrenPointer(curNode);
+
+ /* iterate over all quads of the quad node and get their bounds */
+ numChildren = (backPointer >> 3) & 0x7;
+ for (uint k = 0; k < numChildren; k++)
+ {
+ struct QuadLeaf Q;
+ get_updated_quad(&quads[k], geomDesc, &Q);
+ quadCopyVertices(&Q, &quads[k]);
+ childrenAABB[k] = getAABB_Quad((struct Quad*)&Q); // FIXME: support leaves with more than one quad
+ }
+ }
+
+ /* we start refit at leaf nodes, this case is for procedural nodes */
+ else if (curNode->type == BVH_PROCEDURAL_NODE)
+ {
+ global struct ProceduralLeaf* leaf = (global struct ProceduralLeaf*)QBVHNodeN_childrenPointer(curNode);
+
+ /* iterate over all children of the procedural node and get their bounds */
+ numChildren = (backPointer >> 3) & 0x7;
+ for (uint k = 0; k < numChildren; k++)
+ {
+ /* extract geomID and primID from leaf */
+ const uint startPrim = QBVHNodeN_startPrim(curNode, k);
+ const uint geomID = ProceduralLeaf_geomIndex(leaf);
+ const uint primID = ProceduralLeaf_primIndex(leaf, startPrim); // FIXME: have to iterate over all primitives of leaf!
+
+ /* read bounds from geometry descriptor */
+ struct GRL_RAYTRACING_AABB aabb = GRL_load_aabb(&geomDesc[geomID], primID);
+ childrenAABB[k].lower.x = aabb.MinX;
+ childrenAABB[k].lower.y = aabb.MinY;
+ childrenAABB[k].lower.z = aabb.MinZ;
+ childrenAABB[k].upper.x = aabb.MaxX;
+ childrenAABB[k].upper.y = aabb.MaxY;
+ childrenAABB[k].upper.z = aabb.MaxZ;
+
+ /* advance leaf pointer to next child */
+ leaf += QBVHNodeN_blockIncr(curNode, k);
+ }
+ }
+
+ /* we start refit at leaf nodes, this case is for instance nodes */
+ else if (curNode->type == BVH_INSTANCE_NODE)
+ {
+ global struct HwInstanceLeaf* instancesLeaves = (global struct HwInstanceLeaf*)QBVHNodeN_childrenPointer(curNode);
+ global struct HwInstanceLeaf* leafBase = (global struct HwInstanceLeaf*) BVHBase_GetHWInstanceLeaves( bvh );
+
+ /* iterate over all children of the instance node and get their bounds */
+ numChildren = (backPointer >> 3) & 0x7;
+ for (uint k = 0; k < numChildren; k++)
+ {
+ uint leafindex = (instancesLeaves + k) - leafBase;
+ childrenAABB[k].lower.xyz = AABB3f_load_lower( &instance_leaf_aabbs[leafindex] );
+ childrenAABB[k].upper.xyz = AABB3f_load_upper( &instance_leaf_aabbs[leafindex] );
+ }
+ }
+
+ return numChildren;
+}
+
+
+
+
+
+// This calculates the children BBs for inner nodes whose children are *all* leaves.
+// Mixed nodes will be updated by the bottom-up pass instead.
+GRL_INLINE uint SUBGROUP_refit_bottom(
+ uniform global struct BVHBase* bvh,
+ uniform global GRL_RAYTRACING_GEOMETRY_DESC* geomDesc,
+ uniform global struct AABB3f* instance_leaf_aabbs,
+ uniform global struct QBVHNodeN* curNode,
+ uniform uint backPointer,
+ varying struct AABB* childrenAABB,
+ varying uchar* childrenMask,
+ varying ushort lane,
+ global uchar* is_procedural_instance
+ )
+{
+ uniform uint numChildren = 0;
+ bool enable_procedural_instance = (is_procedural_instance != 0);
+
+ /* we start refit at leaf nodes, this case is for quad nodes */
+ if (curNode->type == BVH_QUAD_NODE)
+ {
+ /* iterate over all quads of the quad node and get their bounds */
+ numChildren = (backPointer >> 3) & 0x7;
+
+ uniform global struct QuadLeaf* quads = (global struct QuadLeaf*)QBVHNodeN_childrenPointer(curNode);
+
+ struct QuadLeaf Q;
+ if (lane < numChildren)
+ {
+ get_updated_quad(&quads[lane], geomDesc, &Q);
+
+ *childrenAABB = getAABB_Quad((struct Quad*) & Q); // FIXME: support leaves with more than one quad
+
+ quadCopyVertices(&Q, &quads[lane]);
+ *childrenMask = 0xff;
+ }
+ // FIXME: support leaves with more than one quad
+ }
+
+ /* we start refit at leaf nodes, this case is for procedural nodes */
+ else if (curNode->type == BVH_PROCEDURAL_NODE)
+ {
+ uniform global struct ProceduralLeaf* leaf = (global struct ProceduralLeaf*)QBVHNodeN_childrenPointer(curNode);
+
+
+
+ /* iterate over all children of the procedural node and get their bounds */
+ numChildren = (backPointer >> 3) & 0x7;
+
+ varying uint incr = (lane < numChildren) ? InternalNode_GetChildBlockIncr((struct InternalNode*)curNode, lane) : 0;
+ incr = sub_group_scan_exclusive_add(incr);
+
+ if( lane < numChildren )
+ {
+ /* extract geomID and primID from leaf */
+ varying uint start_prim = InternalNode_GetChildStartPrim((struct InternalNode*)curNode, lane );
+ varying global struct ProceduralLeaf* my_leaf = leaf + incr;
+ const uint geomID = ProceduralLeaf_geomIndex(my_leaf);
+ const uint primID = ProceduralLeaf_primIndex(my_leaf, start_prim);
+
+ /* read bounds from geometry descriptor */
+ struct GRL_RAYTRACING_AABB aabb = GRL_load_aabb(&geomDesc[geomID], primID);
+ childrenAABB->lower.x = aabb.MinX;
+ childrenAABB->lower.y = aabb.MinY;
+ childrenAABB->lower.z = aabb.MinZ;
+ childrenAABB->upper.x = aabb.MaxX;
+ childrenAABB->upper.y = aabb.MaxY;
+ childrenAABB->upper.z = aabb.MaxZ;
+ *childrenMask = 0xff;
+ }
+ }
+
+ /* we start refit at leaf nodes, this case is for instance nodes */
+ else if ( !enable_procedural_instance && curNode->type == BVH_INSTANCE_NODE)
+ {
+ uniform global struct HwInstanceLeaf* instancesLeaves = (global struct HwInstanceLeaf*)QBVHNodeN_childrenPointer(curNode);
+ uniform global struct HwInstanceLeaf* leafBase = (global struct HwInstanceLeaf*) BVHBase_GetHWInstanceLeaves(bvh);
+
+ /* iterate over all children of the instance node and get their bounds and masks */
+ numChildren = (backPointer >> 3) & 0x7;
+ if( lane < numChildren )
+ {
+ uint leafindex = (instancesLeaves + lane) - leafBase;
+ childrenAABB->lower.xyz = AABB3f_load_lower(&instance_leaf_aabbs[leafindex]);
+ childrenAABB->upper.xyz = AABB3f_load_upper(&instance_leaf_aabbs[leafindex]);
+ *childrenMask = HwInstanceLeaf_GetInstanceMask(&leafBase[leafindex]);
+ }
+ }
+ else if (enable_procedural_instance && curNode->type == BVH_INTERNAL_NODE)
+ {
+ // Handle procedural-instance leaves.
+ // TODO: Generalize this! The kernel should be rewritten to work with arbitrary mixed-mode leaves.
+
+ numChildren = (backPointer >> 3) & 0x7;
+ uint childType = BVH_INTERNAL_NODE;
+ if ( lane < numChildren )
+ {
+ childType = InternalNode_GetChildType( (struct InternalNode*)curNode, lane );
+ if (childType != BVH_INTERNAL_NODE)
+ {
+ uniform global struct HwInstanceLeaf* instancesLeaves = (global struct HwInstanceLeaf*)QBVHNodeN_childrenPointer( curNode );
+ uniform global struct HwInstanceLeaf* leafBase = (global struct HwInstanceLeaf*) BVHBase_GetHWInstanceLeaves( bvh );
+ uint leafindex = (instancesLeaves + lane) - leafBase;
+ childrenAABB->lower.xyz = AABB3f_load_lower( &instance_leaf_aabbs[leafindex] );
+ childrenAABB->upper.xyz = AABB3f_load_upper( &instance_leaf_aabbs[leafindex] );
+ *childrenMask = HwInstanceLeaf_GetInstanceMask( &leafBase[leafindex] );
+
+ // see if the child has flipped from procedural to non-procedural and update the child type field as needed
+ uint instanceIndex = HwInstanceLeaf_GetInstanceIndex( &leafBase[leafindex] );
+ uint newChildType = is_procedural_instance[instanceIndex] ? BVH_PROCEDURAL_NODE : BVH_INSTANCE_NODE;
+ if (newChildType != childType)
+ {
+ InternalNode_SetChildType( (struct InternalNode*)curNode, lane, newChildType );
+ }
+ }
+ }
+
+
+ // don't ascend the tree if all of the children are true internal nodes
+ if (sub_group_all(childType == BVH_INTERNAL_NODE))
+ numChildren = 0;
+ }
+
+ return numChildren;
+}
+
+#define SG_REFIT_WG_SIZE 8
+
+void DO_Refit_per_one_startpoint_sg(
+ global struct BVHBase* bvh,
+ global GRL_RAYTRACING_GEOMETRY_DESC* geosArray,
+ global struct AABB3f* instance_leaf_aabbs,
+ global uchar* is_procedural_instance )
+{
+ /* get pointer to inner nodes and back pointers */
+ global struct QBVHNodeN* inner_nodes = BVHBase_rootNode(bvh);
+ BackPointers* backPointers = BVHBase_GetBackPointers(bvh);
+
+ /* get the inner node that we will consider as a bottom startpoint */
+ const uint numInnerNodes = BVHBase_numNodes(bvh);
+ const uint innerNodeIdx = get_sub_group_global_id();
+
+ varying ushort lane = get_sub_group_local_id();
+
+ if (innerNodeIdx >= numInnerNodes) return;
+
+ varying struct AABB childrenAABB; // one child AABB per lane
+ AABB_init(&childrenAABB);
+
+ varying uchar childrenMask = 0; // one child mask per lane
+
+ global struct QBVHNodeN* curNode = &inner_nodes[innerNodeIdx];
+ uint backPointer = *InnerNode_GetBackPointer(backPointers, innerNodeIdx);
+ uint numChildren = SUBGROUP_refit_bottom(
+ bvh,
+ geosArray,
+ instance_leaf_aabbs,
+ curNode,
+ backPointer,
+ &childrenAABB,
+ &childrenMask,
+ lane,
+ is_procedural_instance
+ );
+
+
+ if (numChildren != 0)
+ {
+ /* update bounds of node */
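+ /* reduce the per-lane child boxes to a single box and broadcast it to all lanes */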
+ struct AABB reduce_bounds = AABB_sub_group_reduce_N6(&childrenAABB);
+ reduce_bounds = AABB_sub_group_shuffle(&reduce_bounds, 0);
+ subgroup_QBVHNodeN_setBounds(curNode, reduce_bounds, childrenAABB, numChildren, lane);
+
+ /* update mask of node */
+ uchar mask = sub_group_reduce_or_N6(childrenMask);
+ curNode->instMask = mask;
+
+ /* Leave this fence in place for all threads: if the WG size is increased (128 was tried) and the
+ fence is issued only by the first thread (similar to morton phase1), the machine hangs. */
+ mem_fence_gpu_invalidate();
+
+ /* refit upper parts of the BVH */
+ /* TODO: this is not going to work for mixed nodes */
+ SUBGROUP_refit_bottom_up(curNode, bvh, reduce_bounds, numChildren, lane, 0);
+ }
+} \ No newline at end of file
diff --git a/src/intel/vulkan/grl/gpu/bvh_build_sah_experimental.cl b/src/intel/vulkan/grl/gpu/bvh_build_sah_experimental.cl
new file mode 100644
index 00000000000..0a4bd3466af
--- /dev/null
+++ b/src/intel/vulkan/grl/gpu/bvh_build_sah_experimental.cl
@@ -0,0 +1,1917 @@
+//
+// Copyright (C) 2009-2021 Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+//
+//
+
+#include "common.h"
+#include "instance.h"
+
+#define DBG(x)
+
+#define ENABLE_CHECKS 0
+
+#define ENABLE_32BINS_IN_BREADTH_FIRST_PHASE 1
+
+/* todo: */
+/* - new cross WG code path for first splits */
+/* - optimize find best child loop sequence */
+/* - subgroup_setQBVHNodeN needs work on 6 slots in parallel */
+
+#define DIVIDE_BY_6 1
+
+inline uint getNumPrims(struct BuildRecord *buildRecord)
+{
+ return buildRecord->end - buildRecord->start;
+}
+
+inline void printBuildRecord(struct BuildRecord *record)
+{
+ printf("centroidBounds\n");
+ AABB_print(&record->centroidBounds);
+ printf("start %d end %d size %d depth %d \n", record->start, record->end, record->end - record->start, getBuildRecursionDepth(record));
+}
+
+inline void printBinInfo2(struct BinInfo2 *record)
+{
+ printf("boundsX[%d]\n", BINS * 2);
+ for (uint b = 0; b < BINS * 2; b++)
+ {
+ AABB3f_print(&record->boundsX[b]);
+ printf("counts.x = %d\n", record->counts[b].x);
+ }
+ printf("boundsY[%d]\n", BINS * 2);
+ for (uint b = 0; b < BINS * 2; b++)
+ {
+ AABB3f_print(&record->boundsY[b]);
+ printf("counts.y = %d\n", record->counts[b].y);
+ }
+ printf("boundsZ[%d]\n", BINS * 2);
+ for (uint b = 0; b < BINS * 2; b++)
+ {
+ AABB3f_print(&record->boundsZ[b]);
+ printf("counts.z = %d\n", record->counts[b].z);
+ }
+}
+
+inline void initBinMapping(struct BinMapping *binMapping, struct AABB *centBounds, const uint bins)
+{
+ const float4 eps = 1E-34f;
+ const float4 diag = max(eps, centBounds->upper - centBounds->lower);
+ const float4 scale = (float4)(0.99f * (float)bins) / diag;
+ binMapping->scale = select((float4)(0.0f), scale, diag > eps);
+ binMapping->ofs = centBounds->lower;
+}
+
+inline void atomicExtendLocalBuildRecord(local struct BuildRecord *buildRecord, global struct AABB *primref)
+{
+ const float4 centroid2 = primref->lower + primref->upper;
+ AABB_local_atomic_merge(&buildRecord->centroidBounds, centroid2, centroid2);
+}
+
+// ----------------------------------------------------------------------------------------
+// ----------------------------------------------------------------------------------------
+// ----------------------------------------------------------------------------------------
+
+inline void initBinInfo(struct BinInfo *binInfo)
+{
+ for (uint i = 0; i < BINS; i++)
+ {
+ AABB3f_init(&binInfo->boundsX[i]);
+ AABB3f_init(&binInfo->boundsY[i]);
+ AABB3f_init(&binInfo->boundsZ[i]);
+ binInfo->counts[i] = (uint3)(0);
+ }
+}
+
+inline void subgroup_initBinInfo(struct BinInfo *binInfo)
+{
+ const uint subgroupLocalID = get_sub_group_local_id();
+ const uint subgroup_size = get_sub_group_size();
+
+ for (uint i = subgroupLocalID; i < BINS; i += subgroup_size)
+ {
+ AABB3f_init(&binInfo->boundsX[i]);
+ AABB3f_init(&binInfo->boundsY[i]);
+ AABB3f_init(&binInfo->boundsZ[i]);
+ binInfo->counts[i] = (uint3)(0);
+ }
+}
+
+inline void parallel_initBinInfo(struct BinInfo *binInfo)
+{
+ const uint localID = get_local_id(0);
+ if (localID < BINS)
+ {
+ AABB3f_init(&binInfo->boundsX[localID]);
+ AABB3f_init(&binInfo->boundsY[localID]);
+ AABB3f_init(&binInfo->boundsZ[localID]);
+ binInfo->counts[localID] = (uint3)(0);
+ }
+}
+
+inline void atomicUpdateLocalBinInfo(struct BinMapping *binMapping, local struct BinInfo *binInfo, global struct AABB *primref)
+{
+ const float4 lower = primref->lower;
+ const float4 upper = primref->upper;
+ const float4 p = lower + upper;
+ const uint4 i = convert_uint4((p - binMapping->ofs) * binMapping->scale);
+ AABB3f_atomic_merge_local(&binInfo->boundsX[i.x], lower, upper);
+ AABB3f_atomic_merge_local(&binInfo->boundsY[i.y], lower, upper);
+ AABB3f_atomic_merge_local(&binInfo->boundsZ[i.z], lower, upper);
+ atomic_add((local uint *)&binInfo->counts[i.x] + 0, 1);
+ atomic_add((local uint *)&binInfo->counts[i.y] + 1, 1);
+ atomic_add((local uint *)&binInfo->counts[i.z] + 2, 1);
+}
+
+inline void atomicUpdateLocalBinInfo_nocheck(struct BinMapping *binMapping, local struct BinInfo *binInfo, global struct AABB *primref)
+{
+ const float4 lower = primref->lower;
+ const float4 upper = primref->upper;
+ const float4 p = lower + upper;
+ const uint4 i = convert_uint4((p - binMapping->ofs) * binMapping->scale);
+ AABB3f_atomic_merge_local_nocheck(&binInfo->boundsX[i.x], lower, upper);
+ AABB3f_atomic_merge_local_nocheck(&binInfo->boundsY[i.y], lower, upper);
+ AABB3f_atomic_merge_local_nocheck(&binInfo->boundsZ[i.z], lower, upper);
+ atomic_add((local uint *)&binInfo->counts[i.x] + 0, 1);
+ atomic_add((local uint *)&binInfo->counts[i.y] + 1, 1);
+ atomic_add((local uint *)&binInfo->counts[i.z] + 2, 1);
+}
+
+inline void updateBins(struct BinMapping *binMapping, struct BinInfo *binInfo, global struct AABB *primref)
+{
+ const float4 lower = primref->lower;
+ const float4 upper = primref->upper;
+ const float4 p = lower + upper;
+ const uint4 i = convert_uint4((p - binMapping->ofs) * binMapping->scale);
+ AABB3f_extendlu(&binInfo->boundsX[i.x], lower.xyz, upper.xyz);
+ AABB3f_extendlu(&binInfo->boundsY[i.y], lower.xyz, upper.xyz);
+ AABB3f_extendlu(&binInfo->boundsZ[i.z], lower.xyz, upper.xyz);
+ binInfo->counts[i.x].x++;
+ binInfo->counts[i.y].y++;
+ binInfo->counts[i.z].z++;
+}
+
+// =====================================================================================================================
+// =====================================================================================================================
+// =====================================================================================================================
+
+inline void parallel_initBinInfo2(struct BinInfo2 *binInfo, const uint bins)
+{
+ const uint localID = get_local_id(0);
+ if (localID < bins)
+ {
+ AABB3f_init(&binInfo->boundsX[localID]);
+ AABB3f_init(&binInfo->boundsY[localID]);
+ AABB3f_init(&binInfo->boundsZ[localID]);
+ binInfo->counts[localID] = (uint3)(0);
+ }
+}
+
+inline void atomicUpdateLocalBinInfo2(struct BinMapping *binMapping, local struct BinInfo2 *binInfo, global struct AABB *primref)
+{
+ const float4 lower = primref->lower;
+ const float4 upper = primref->upper;
+ const float4 p = lower + upper;
+ const uint4 i = convert_uint4((p - binMapping->ofs) * binMapping->scale);
+ AABB3f_atomic_merge_local(&binInfo->boundsX[i.x], lower, upper);
+ AABB3f_atomic_merge_local(&binInfo->boundsY[i.y], lower, upper);
+ AABB3f_atomic_merge_local(&binInfo->boundsZ[i.z], lower, upper);
+ atomic_add((local uint *)&binInfo->counts[i.x] + 0, 1);
+ atomic_add((local uint *)&binInfo->counts[i.y] + 1, 1);
+ atomic_add((local uint *)&binInfo->counts[i.z] + 2, 1);
+}
+
+inline void atomicUpdateGlobalFromLocalBinInfo2(global struct BinInfo2 *dest, local struct BinInfo2 *source, const uint bins)
+{
+ const uint localID = get_local_id(0);
+ if (localID < bins)
+ {
+ AABB3f_atomic_merge_global_local(&dest->boundsX[localID], &source->boundsX[localID]);
+ AABB3f_atomic_merge_global_local(&dest->boundsY[localID], &source->boundsY[localID]);
+ AABB3f_atomic_merge_global_local(&dest->boundsZ[localID], &source->boundsZ[localID]);
+ atomic_add((global uint *)&dest->counts[localID] + 0, source->counts[localID].x);
+ atomic_add((global uint *)&dest->counts[localID] + 1, source->counts[localID].y);
+ atomic_add((global uint *)&dest->counts[localID] + 2, source->counts[localID].z);
+ }
+}
+
+inline uint subgroup_getMaxAreaChild(struct AABB *childrenAABB, const uint numChildren)
+{
+ const uint subgroupLocalID = get_sub_group_local_id();
+#if 0
+ /*! find best child to split */
+ const float area = (subgroupLocalID < numChildren) & (as_uint(childrenAABB[subgroupLocalID].upper.w) > cfg_minLeafSize) ? childrenAABB[subgroupLocalID].lower.w : -(float)INFINITY;
+ const float maxArea = sub_group_reduce_max(area);
+ const uint mask = intel_sub_group_ballot(area == maxArea);
+ const uint bestChild = maxArea != -(float)INFINITY ? ctz(mask) : -1;
+#else
+ float bestArea = -(float)INFINITY;
+ int bestChild = -1;
+ for (int i = 0; i < numChildren; i++)
+ {
+ /* ignore leaves as they cannot get split */
+ if (as_uint(childrenAABB[i].upper.w) <= cfg_minLeafSize)
+ continue;
+
+ /* find child with largest surface area */
+ if (childrenAABB[i].lower.w > bestArea)
+ {
+ bestChild = i;
+ bestArea = childrenAABB[i].lower.w;
+ }
+ }
+#endif
+ return bestChild;
+}
+
+inline bool AABB_verifyBounds(struct BuildRecord *buildRecord, struct AABB *geometryBounds, struct AABB *primref)
+{
+ const float4 centroid2 = primref->lower + primref->upper;
+
+ if (centroid2.x < buildRecord->centroidBounds.lower.x)
+ return false;
+ if (centroid2.y < buildRecord->centroidBounds.lower.y)
+ return false;
+ if (centroid2.z < buildRecord->centroidBounds.lower.z)
+ return false;
+
+ if (centroid2.x > buildRecord->centroidBounds.upper.x)
+ return false;
+ if (centroid2.y > buildRecord->centroidBounds.upper.y)
+ return false;
+ if (centroid2.z > buildRecord->centroidBounds.upper.z)
+ return false;
+
+ if (primref->lower.x < geometryBounds->lower.x)
+ return false;
+ if (primref->lower.y < geometryBounds->lower.y)
+ return false;
+ if (primref->lower.z < geometryBounds->lower.z)
+ return false;
+
+ if (primref->upper.x > geometryBounds->upper.x)
+ return false;
+ if (primref->upper.y > geometryBounds->upper.y)
+ return false;
+ if (primref->upper.z > geometryBounds->upper.z)
+ return false;
+
+ return true;
+}
+
+/* initialize primref index array */
+__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
+__attribute__((intel_reqd_sub_group_size(16))) void kernel
+create_primref_index(global struct Globals *globals,
+ global struct AABB *primref,
+ global unsigned int *primref_index)
+{
+ const uint local_size = get_local_size(0);
+ const uint taskID = get_group_id(0);
+ const uint numTasks = get_num_groups(0);
+ const uint localID = get_local_id(0);
+
+ const uint startID = (taskID + 0) * globals->numPrimitives / numTasks;
+ const uint endID = (taskID + 1) * globals->numPrimitives / numTasks;
+ for (uint primID = startID + localID; primID < endID; primID += local_size)
+ primref_index[primID] = primID;
+}
+
+// ==========================================================================================================
+// ==========================================================================================================
+// ==========================================================================================================
+
+inline float left_to_right_area16(struct AABB3f *low)
+{
+ struct AABB3f low_prefix = AABB3f_sub_group_scan_exclusive_min_max(low);
+ return halfArea_AABB3f(&low_prefix);
+}
+
+inline uint left_to_right_counts16(uint low)
+{
+ return sub_group_scan_exclusive_add(low);
+}
+
+inline float right_to_left_area16(struct AABB3f *low)
+{
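+ /* right-to-left (suffix) min/max scan: reverse the lane order, run an inclusive
+ left-to-right scan, then shuffle each lane's result back to its original position */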
+ const uint subgroupLocalID = get_sub_group_local_id();
+ const uint subgroup_size = get_sub_group_size();
+ const uint ID = subgroup_size - 1 - subgroupLocalID;
+ struct AABB3f low_reverse = AABB3f_sub_group_shuffle(low, ID);
+ struct AABB3f low_prefix = AABB3f_sub_group_scan_inclusive_min_max(&low_reverse);
+ const float low_area = sub_group_broadcast(halfArea_AABB3f(&low_prefix), ID);
+ return low_area;
+}
+
+inline uint right_to_left_counts16(uint low)
+{
+ const uint subgroupLocalID = get_sub_group_local_id();
+ const uint subgroup_size = get_sub_group_size();
+ const uint ID = subgroup_size - 1 - subgroupLocalID;
+ const uint low_reverse = sub_group_broadcast(low, ID);
+ const uint low_prefix = sub_group_scan_inclusive_add(low_reverse);
+ return sub_group_broadcast(low_prefix, ID);
+}
+
+inline float2 left_to_right_area32(struct AABB3f *low, struct AABB3f *high)
+{
+ struct AABB3f low_prefix = AABB3f_sub_group_scan_exclusive_min_max(low);
+ struct AABB3f low_reduce = AABB3f_sub_group_reduce(low);
+ struct AABB3f high_prefix = AABB3f_sub_group_scan_exclusive_min_max(high);
+ AABB3f_extend(&high_prefix, &low_reduce);
+ const float low_area = halfArea_AABB3f(&low_prefix);
+ const float high_area = halfArea_AABB3f(&high_prefix);
+ return (float2)(low_area, high_area);
+}
+
+inline uint2 left_to_right_counts32(uint low, uint high)
+{
+ const uint low_prefix = sub_group_scan_exclusive_add(low);
+ const uint low_reduce = sub_group_reduce_add(low);
+ const uint high_prefix = sub_group_scan_exclusive_add(high);
+ return (uint2)(low_prefix, low_reduce + high_prefix);
+}
+
+inline float2 right_to_left_area32(struct AABB3f *low, struct AABB3f *high)
+{
+ const uint subgroupLocalID = get_sub_group_local_id();
+ const uint subgroup_size = get_sub_group_size();
+ const uint ID = subgroup_size - 1 - subgroupLocalID;
+ struct AABB3f low_reverse = AABB3f_sub_group_shuffle(high, ID);
+ struct AABB3f high_reverse = AABB3f_sub_group_shuffle(low, ID);
+ struct AABB3f low_prefix = AABB3f_sub_group_scan_inclusive_min_max(&low_reverse);
+ struct AABB3f low_reduce = AABB3f_sub_group_reduce(&low_reverse);
+ struct AABB3f high_prefix = AABB3f_sub_group_scan_inclusive_min_max(&high_reverse);
+ AABB3f_extend(&high_prefix, &low_reduce);
+ const float low_area = sub_group_broadcast(halfArea_AABB3f(&high_prefix), ID);
+ const float high_area = sub_group_broadcast(halfArea_AABB3f(&low_prefix), ID);
+ return (float2)(low_area, high_area);
+}
+
+inline uint2 right_to_left_counts32(uint low, uint high)
+{
+ const uint subgroupLocalID = get_sub_group_local_id();
+ const uint subgroup_size = get_sub_group_size();
+ const uint ID = subgroup_size - 1 - subgroupLocalID;
+ const uint low_reverse = sub_group_broadcast(high, ID);
+ const uint high_reverse = sub_group_broadcast(low, ID);
+ const uint low_prefix = sub_group_scan_inclusive_add(low_reverse);
+ const uint low_reduce = sub_group_reduce_add(low_reverse);
+ const uint high_prefix = sub_group_scan_inclusive_add(high_reverse) + low_reduce;
+ return (uint2)(sub_group_broadcast(high_prefix, ID), sub_group_broadcast(low_prefix, ID));
+}
+
+inline ulong getBestSplit(float3 sah, uint ID, const float4 scale, const ulong defaultSplit)
+{
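+ /* pack each candidate split as (SAH bits << 32) | (binID << 2) | dim; with the SAH in the
+ high word, a single 64-bit min-reduction picks the lowest-SAH split, ties resolving to
+ the smaller bin/dim */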
+ ulong splitX = (((ulong)as_uint(sah.x)) << 32) | ((uint)ID << 2) | 0;
+ ulong splitY = (((ulong)as_uint(sah.y)) << 32) | ((uint)ID << 2) | 1;
+ ulong splitZ = (((ulong)as_uint(sah.z)) << 32) | ((uint)ID << 2) | 2;
+ /* ignore zero sized dimensions */
+ splitX = select(splitX, defaultSplit, (ulong)(scale.x == 0));
+ splitY = select(splitY, defaultSplit, (ulong)(scale.y == 0));
+ splitZ = select(splitZ, defaultSplit, (ulong)(scale.z == 0));
+ ulong bestSplit = min(min(splitX, splitY), splitZ);
+ bestSplit = sub_group_reduce_min(bestSplit);
+ return bestSplit;
+}
+
+inline uint fastDivideBy6_uint(uint v)
+{
+#if 1
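+ /* divide by 6 without an integer division: v/6 == (v>>1)/3, and multiplying by
+ 0x55555556 (= ceil(2^32 / 3)) and taking the high 32 bits gives u/3 for the
+ range of u used here (u = v>>1 < 2^31) */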
+ const ulong u = (ulong)v >> 1;
+ return (uint)((u * 0x55555556ul) >> 32);
+#else
+ return v / 6;
+#endif
+}
+
+inline uint3 fastDivideBy6_uint3(uint3 v)
+{
+ return (uint3)(fastDivideBy6_uint(v.x), fastDivideBy6_uint(v.y), fastDivideBy6_uint(v.z));
+}
+
+inline struct Split reduceBinsAndComputeBestSplit16(struct BinInfo *binInfo, const float4 scale, uint startID, uint endID)
+{
+ const uint subgroupLocalID = get_sub_group_local_id();
+ const uint subgroup_size = get_sub_group_size();
+
+ struct AABB3f boundsX = binInfo->boundsX[subgroupLocalID];
+
+ const float lr_areaX = left_to_right_area16(&boundsX);
+ const float rl_areaX = right_to_left_area16(&boundsX);
+
+ struct AABB3f boundsY = binInfo->boundsY[subgroupLocalID];
+
+ const float lr_areaY = left_to_right_area16(&boundsY);
+ const float rl_areaY = right_to_left_area16(&boundsY);
+
+ struct AABB3f boundsZ = binInfo->boundsZ[subgroupLocalID];
+
+ const float lr_areaZ = left_to_right_area16(&boundsZ);
+ const float rl_areaZ = right_to_left_area16(&boundsZ);
+
+ const uint3 counts = binInfo->counts[subgroupLocalID];
+
+ const uint lr_countsX = left_to_right_counts16(counts.x);
+ const uint rl_countsX = right_to_left_counts16(counts.x);
+ const uint lr_countsY = left_to_right_counts16(counts.y);
+ const uint rl_countsY = right_to_left_counts16(counts.y);
+ const uint lr_countsZ = left_to_right_counts16(counts.z);
+ const uint rl_countsZ = right_to_left_counts16(counts.z);
+
+ const float3 lr_area = (float3)(lr_areaX, lr_areaY, lr_areaZ);
+ const float3 rl_area = (float3)(rl_areaX, rl_areaY, rl_areaZ);
+
+#if DIVIDE_BY_6 == 0
+ const uint blocks_shift = SAH_LOG_BLOCK_SHIFT;
+ uint3 blocks_add = (uint3)((1 << blocks_shift) - 1);
+ const uint3 lr_count = ((uint3)(lr_countsX, lr_countsY, lr_countsZ) + blocks_add) >> blocks_shift;
+ const uint3 rl_count = ((uint3)(rl_countsX, rl_countsY, rl_countsZ) + blocks_add) >> blocks_shift;
+#else
+ const uint3 lr_count = fastDivideBy6_uint3((uint3)(lr_countsX, lr_countsY, lr_countsZ) + BVH_NODE_N6 - 1);
+ const uint3 rl_count = fastDivideBy6_uint3((uint3)(rl_countsX, rl_countsY, rl_countsZ) + BVH_NODE_N6 - 1);
+#endif
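+ /* binned SAH for all split positions at once: leftArea * leftBlocks + rightArea * rightBlocks,
+ evaluated per lane for the X/Y/Z dimensions in parallel */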
+ float3 sah = fma(lr_area, convert_float3(lr_count), rl_area * convert_float3(rl_count));
+
+ /* first bin is invalid */
+
+ sah.x = select((float)(INFINITY), sah.x, subgroupLocalID != 0);
+ sah.y = select((float)(INFINITY), sah.y, subgroupLocalID != 0);
+ sah.z = select((float)(INFINITY), sah.z, subgroupLocalID != 0);
+
+ const uint mid = (startID + endID) / 2;
+ const ulong defaultSplit = (((ulong)as_uint((float)(INFINITY))) << 32) | ((uint)mid << 2) | 0;
+
+ const ulong bestSplit = getBestSplit(sah, subgroupLocalID, scale, defaultSplit);
+
+ struct Split split;
+ split.sah = as_float((uint)(bestSplit >> 32));
+ split.dim = (uint)bestSplit & 3;
+ split.pos = (uint)bestSplit >> 2;
+
+ return split;
+}
+
+inline struct Split reduceBinsAndComputeBestSplit32(struct BinInfo2 *binInfo, const float4 scale, uint startID, uint endID)
+{
+ const uint subgroupLocalID = get_sub_group_local_id();
+ const uint subgroup_size = get_sub_group_size();
+
+ struct AABB3f boundsX_low = binInfo->boundsX[subgroupLocalID];
+ struct AABB3f boundsX_high = binInfo->boundsX[subgroupLocalID + subgroup_size];
+
+ const float2 lr_areaX = left_to_right_area32(&boundsX_low, &boundsX_high);
+ const float2 rl_areaX = right_to_left_area32(&boundsX_low, &boundsX_high);
+
+ struct AABB3f boundsY_low = binInfo->boundsY[subgroupLocalID];
+ struct AABB3f boundsY_high = binInfo->boundsY[subgroupLocalID + subgroup_size];
+
+ const float2 lr_areaY = left_to_right_area32(&boundsY_low, &boundsY_high);
+ const float2 rl_areaY = right_to_left_area32(&boundsY_low, &boundsY_high);
+
+ struct AABB3f boundsZ_low = binInfo->boundsZ[subgroupLocalID];
+ struct AABB3f boundsZ_high = binInfo->boundsZ[subgroupLocalID + subgroup_size];
+
+ const float2 lr_areaZ = left_to_right_area32(&boundsZ_low, &boundsZ_high);
+ const float2 rl_areaZ = right_to_left_area32(&boundsZ_low, &boundsZ_high);
+
+ const uint3 counts_low = binInfo->counts[subgroupLocalID];
+ const uint3 counts_high = binInfo->counts[subgroupLocalID + subgroup_size];
+
+ const uint2 lr_countsX = left_to_right_counts32(counts_low.x, counts_high.x);
+ const uint2 rl_countsX = right_to_left_counts32(counts_low.x, counts_high.x);
+ const uint2 lr_countsY = left_to_right_counts32(counts_low.y, counts_high.y);
+ const uint2 rl_countsY = right_to_left_counts32(counts_low.y, counts_high.y);
+ const uint2 lr_countsZ = left_to_right_counts32(counts_low.z, counts_high.z);
+ const uint2 rl_countsZ = right_to_left_counts32(counts_low.z, counts_high.z);
+
+ const uint blocks_shift = SAH_LOG_BLOCK_SHIFT;
+ uint3 blocks_add = (uint3)((1 << blocks_shift) - 1);
+
+ /* low part: bins 0..15 */
+ const float3 lr_area_low = (float3)(lr_areaX.x, lr_areaY.x, lr_areaZ.x);
+ const float3 rl_area_low = (float3)(rl_areaX.x, rl_areaY.x, rl_areaZ.x);
+
+#if DIVIDE_BY_6 == 0
+ const uint3 lr_count_low = ((uint3)(lr_countsX.x, lr_countsY.x, lr_countsZ.x) + blocks_add) >> blocks_shift;
+ const uint3 rl_count_low = ((uint3)(rl_countsX.x, rl_countsY.x, rl_countsZ.x) + blocks_add) >> blocks_shift;
+
+#else
+ //const uint3 lr_count_low = ((uint3)(lr_countsX.x,lr_countsY.x,lr_countsZ.x)+BVH_NODE_N6-1) / BVH_NODE_N6;
+ //const uint3 rl_count_low = ((uint3)(rl_countsX.x,rl_countsY.x,rl_countsZ.x)+BVH_NODE_N6-1) / BVH_NODE_N6;
+
+ /* skip blocks for breadth-first phase */
+ const uint3 lr_count_low = ((uint3)(lr_countsX.x, lr_countsY.x, lr_countsZ.x));
+ const uint3 rl_count_low = ((uint3)(rl_countsX.x, rl_countsY.x, rl_countsZ.x));
+
+#endif
+
+ float3 sah_low = fma(lr_area_low, convert_float3(lr_count_low), rl_area_low * convert_float3(rl_count_low));
+
+ /* first bin is invalid */
+ // sah_low.x = (subgroupLocalID == 0) ? (float)(INFINITY) : sah_low.x;
+ // sah_low.y = (subgroupLocalID == 0) ? (float)(INFINITY) : sah_low.y;
+ // sah_low.z = (subgroupLocalID == 0) ? (float)(INFINITY) : sah_low.z;
+
+ sah_low.x = select((float)(INFINITY), sah_low.x, subgroupLocalID != 0);
+ sah_low.y = select((float)(INFINITY), sah_low.y, subgroupLocalID != 0);
+ sah_low.z = select((float)(INFINITY), sah_low.z, subgroupLocalID != 0);
+
+ /* high part: bins 16..31 */
+
+ const float3 lr_area_high = (float3)(lr_areaX.y, lr_areaY.y, lr_areaZ.y);
+ const float3 rl_area_high = (float3)(rl_areaX.y, rl_areaY.y, rl_areaZ.y);
+#if DIVIDE_BY_6 == 0
+ const uint3 lr_count_high = ((uint3)(lr_countsX.y, lr_countsY.y, lr_countsZ.y) + blocks_add) >> blocks_shift;
+ const uint3 rl_count_high = ((uint3)(rl_countsX.y, rl_countsY.y, rl_countsZ.y) + blocks_add) >> blocks_shift;
+#else
+ //const uint3 lr_count_high = ((uint3)(lr_countsX.y,lr_countsY.y,lr_countsZ.y)+BVH_NODE_N6-1) / BVH_NODE_N6;
+ //const uint3 rl_count_high = ((uint3)(rl_countsX.y,rl_countsY.y,rl_countsZ.y)+BVH_NODE_N6-1) / BVH_NODE_N6;
+
+ /* skip blocks for breadth-first phase */
+ const uint3 lr_count_high = ((uint3)(lr_countsX.y, lr_countsY.y, lr_countsZ.y));
+ const uint3 rl_count_high = ((uint3)(rl_countsX.y, rl_countsY.y, rl_countsZ.y));
+
+#endif
+ const float3 sah_high = fma(lr_area_high, convert_float3(lr_count_high), rl_area_high * convert_float3(rl_count_high));
+
+ const uint mid = (startID + endID) / 2;
+ const ulong defaultSplit = (((ulong)as_uint((float)(INFINITY))) << 32) | ((uint)mid << 2) | 0;
+
+ const ulong bestSplit_low = getBestSplit(sah_low, subgroupLocalID, scale, defaultSplit);
+ const ulong bestSplit_high = getBestSplit(sah_high, subgroupLocalID + subgroup_size, scale, defaultSplit);
+ const ulong bestSplit = min(bestSplit_low, bestSplit_high);
+
+ struct Split split;
+ split.sah = as_float((uint)(bestSplit >> 32));
+ split.dim = (uint)bestSplit & 3;
+ split.pos = (uint)bestSplit >> 2;
+
+ return split;
+}
+
+// =====================================================================
+
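+/* SAH leaf cost: surface area times the number of primitive blocks, i.e. ceil(prims / 2^block_shift) */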
+inline float leafSAH(float geometryArea, uint prims, uint block_shift)
+{
+ return geometryArea * convert_float((prims + (1 << block_shift) - 1) >> block_shift);
+}
+
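+/* a primitive goes to the left side if its centroid falls into a bin strictly below the split position */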
+inline bool is_left(struct BinMapping *binMapping, struct Split *split, struct AABB *primref)
+{
+ const uint dim = split->dim;
+ const float lower = primref->lower[dim];
+ const float upper = primref->upper[dim];
+ const float c = lower + upper;
+ const uint pos = convert_uint_rtz((c - binMapping->ofs[dim]) * binMapping->scale[dim]);
+ return pos < split->pos;
+}
+
+inline void serial_find_split(global struct AABB *primref,
+ struct BinMapping *binMapping,
+ struct BuildRecord *buildRecord,
+ local struct Split *split,
+ local struct BinInfo *binInfo,
+ global uint *primref_index0,
+ global uint *primref_index1)
+{
+ const uint subgroupLocalID = get_sub_group_local_id();
+ const uint subgroup_size = get_sub_group_size();
+
+ const uint startID = buildRecord->start;
+ const uint endID = buildRecord->end;
+
+ subgroup_initBinInfo(binInfo);
+
+ for (uint t = startID + subgroupLocalID; t < endID; t += subgroup_size)
+ {
+ const uint index = primref_index0[t];
+ primref_index1[t] = index;
+ atomicUpdateLocalBinInfo_nocheck(binMapping, binInfo, &primref[index]);
+ }
+}
+
+inline void serial_partition_index(global struct AABB *primref,
+ struct BinMapping *binMapping,
+ struct BuildRecord *buildRecord,
+ struct Split *inSplit,
+ struct BuildRecord *outLeft,
+ struct BuildRecord *outRight,
+ struct AABB *outGeometryBoundsLeft,
+ struct AABB *outGeometryBoundsRight,
+ global uint *primref_index0,
+ global uint *primref_index1)
+{
+ const uint localID = get_local_id(0);
+ const uint subgroupLocalID = get_sub_group_local_id();
+ const uint subgroupID = get_sub_group_id();
+ const uint subgroup_size = get_sub_group_size();
+
+ const uint begin = buildRecord->start;
+ const uint end = buildRecord->end;
+ struct Split split = *inSplit;
+
+ struct BuildRecord left;
+ struct BuildRecord right;
+ initBuildRecord(&left, begin, end);
+ initBuildRecord(&right, begin, end);
+
+ struct AABB leftAABB;
+ struct AABB rightAABB;
+ AABB_init(&leftAABB);
+ AABB_init(&rightAABB);
+
+ global uint *l = primref_index0 + begin;
+ global uint *r = primref_index0 + end;
+
+ /* no valid split, just split in the middle */
+ if (split.sah == (float)(INFINITY))
+ {
+ for (uint i = begin + subgroupLocalID; i < split.pos; i += subgroup_size)
+ {
+ const uint index = primref_index1[i];
+ const uint count = sub_group_reduce_add(1);
+ extendBuildRecord(&left, &primref[index]);
+ AABB_extendlu(&leftAABB, primref[index].lower, primref[index].upper);
+ l[subgroupLocalID] = index;
+ l += count;
+ }
+
+ for (uint i = split.pos + subgroupLocalID; i < end; i += subgroup_size)
+ {
+ const uint index = primref_index1[i];
+ const uint count = sub_group_reduce_add(1);
+ extendBuildRecord(&right, &primref[index]);
+ AABB_extendlu(&rightAABB, primref[index].lower, primref[index].upper);
+ r -= count;
+ r[subgroupLocalID] = index;
+ }
+ }
+ else
+ {
+ for (uint i = begin + subgroupLocalID; i < end; i += subgroup_size)
+ {
+ const uint index = primref_index1[i];
+ const uint isLeft = is_left(binMapping, &split, &primref[index]) ? 1 : 0;
+ const uint isRight = 1 - isLeft;
+ const uint countLeft = sub_group_reduce_add(isLeft);
+ const uint countRight = sub_group_reduce_add(isRight);
+ const uint prefixLeft = sub_group_scan_exclusive_add(isLeft);
+ const uint prefixRight = sub_group_scan_exclusive_add(isRight);
+
+ r -= countRight;
+
+ if (isLeft)
+ {
+ extendBuildRecord(&left, &primref[index]);
+ AABB_extendlu(&leftAABB, primref[index].lower, primref[index].upper);
+ l[prefixLeft] = index;
+ }
+ else
+ {
+ extendBuildRecord(&right, &primref[index]);
+ AABB_extendlu(&rightAABB, primref[index].lower, primref[index].upper);
+ r[prefixRight] = index;
+ }
+ l += countLeft;
+ }
+ }
+
+ left.centroidBounds = AABB_sub_group_reduce(&left.centroidBounds);
+ right.centroidBounds = AABB_sub_group_reduce(&right.centroidBounds);
+ leftAABB = AABB_sub_group_reduce(&leftAABB);
+ rightAABB = AABB_sub_group_reduce(&rightAABB);
+
+ if (subgroupLocalID == 0)
+ {
+ uint pos = l - primref_index0; // only the first lane needs to compute "pos"
+ left.end = pos;
+ right.start = pos;
+
+ leftAABB.lower.w = AABB_halfArea(&leftAABB);
+ rightAABB.lower.w = AABB_halfArea(&rightAABB);
+
+ leftAABB.upper.w = as_float(getNumPrimsBuildRecord(&left));
+ rightAABB.upper.w = as_float(getNumPrimsBuildRecord(&right));
+
+ *outLeft = left;
+ *outRight = right;
+ *outGeometryBoundsLeft = leftAABB;
+ *outGeometryBoundsRight = rightAABB;
+ }
+
+ work_group_barrier(CLK_LOCAL_MEM_FENCE);
+
+#if ENABLE_CHECKS == 1
+ if (subgroupLocalID == 0)
+ {
+ if (AABB_verify(outLeft))
+ {
+ printf("outLeft:\n");
+ printBuildRecord(outLeft);
+ }
+ if (AABB_verify(outRight))
+ {
+ printf("outRight:\n");
+ printBuildRecord(outRight);
+ }
+ if (AABB_verify(outGeometryBoundsLeft))
+ {
+ printf("outGeometryBoundsLeft:\n");
+ AABB_print(outGeometryBoundsLeft);
+ }
+ if (AABB_verify(outGeometryBoundsRight))
+ {
+ printf("outGeometryBoundsRight:\n");
+ AABB_print(outGeometryBoundsRight);
+ }
+
+ for (uint i = outLeft->start; i < outLeft->end; i++)
+ {
+ const uint index = primref_index0[i];
+ if (split.sah != (float)(INFINITY) && !is_left(binMapping, inSplit, &primref[index]))
+ printf("check left %d \n", i);
+ if (!AABB_verifyBounds(outLeft, outGeometryBoundsLeft, &primref[index]))
+ printf("check prim ref bounds left %d \n", i);
+ }
+ for (uint i = outRight->start; i < outRight->end; i++)
+ {
+ const uint index = primref_index0[i];
+ if (split.sah != (float)(INFINITY) && is_left(binMapping, inSplit, &primref[index]))
+ printf("check right %d \n", i);
+ if (!AABB_verifyBounds(outRight, outGeometryBoundsRight, &primref[index]))
+ printf("check prim ref bounds right %d \n", i);
+ }
+ }
+#endif
+}
+
+inline uint subgroup_createLeaf_index(global struct BlockAllocator *allocator,
+ const uint start,
+ const uint end,
+ global struct AABB *primref,
+ uint primID,
+ global char *bvh_mem,
+ unsigned leafSize)
+{
+ const uint subgroupLocalID = get_sub_group_local_id();
+ const uint subgroup_size = get_sub_group_size();
+ const uint items = end - start;
+
+#if ENABLE_CHECKS == 1
+ if (items > BVH_LEAF_N_MAX)
+ printf("error items %d \n", items);
+#endif
+
+ // JDB TODO: Why was this code commented out??
+ //uint offset = (subgroupLocalID == 0) ? alloc_leaf_mem(globals,sizeof(struct Quad)*items) : 0;
+ //offset = sub_group_broadcast(offset,0);
+
+ //uint offset = globals->leaf_mem_allocator_start + start * leafSize;
+ uint offset = allocator->start + start * leafSize;
+ return offset;
+}
+
+inline uint get_qnode_index_for_backptr(void *qnode_base, void *qnode)
+{
+ size_t offset = ((size_t)qnode - (size_t)qnode_base) / sizeof(struct QBVHNodeN);
+ uint offset_u = (uint)offset;
+#if ENABLE_CHECKS
+ if ((size_t)((offset_u << 6) >> 6) != offset)
+ {
+ printf("get_qnode_index_for_backptr - index out of reach");
+ }
+#endif
+ return offset_u;
+}
+
+struct SerialBuildRecurseTemplateConst
+{
+ unsigned leafSize;
+ unsigned leafType;
+ bool allocateBackpointers;
+};
+
+// ====================================================================================
+// ====================================================================================
+// ====================================================================================
+// ====================================================================================
+// ====================================================================================
+
+inline void parallel_find_split(global struct AABB *primref,
+ local struct BuildRecord *buildRecord,
+ local struct Split *bestSplit,
+ local struct BinInfo *binInfo,
+ global uint *primref_index0,
+ global uint *primref_index1)
+{
+ const uint localID = get_local_id(0);
+ const uint local_size = get_local_size(0);
+ const uint subgroupID = get_sub_group_id();
+
+ const uint startID = buildRecord->start;
+ const uint endID = buildRecord->end;
+
+ struct BinMapping binMapping;
+ initBinMapping(&binMapping, &buildRecord->centroidBounds, BINS);
+
+ /* init bininfo */
+ parallel_initBinInfo(binInfo);
+
+ work_group_barrier(CLK_LOCAL_MEM_FENCE);
+
+ for (uint t = startID + localID; t < endID; t += local_size)
+ {
+ const uint index = primref_index0[t];
+ primref_index1[t] = index;
+ atomicUpdateLocalBinInfo(&binMapping, binInfo, &primref[index]);
+ }
+
+ work_group_barrier(CLK_LOCAL_MEM_FENCE);
+
+ /* find best dimension */
+
+ if (subgroupID == 0)
+ {
+ *bestSplit = reduceBinsAndComputeBestSplit16(binInfo, binMapping.scale, startID, endID);
+ }
+
+ work_group_barrier(CLK_LOCAL_MEM_FENCE);
+}
+
+inline void parallel_find_split32(local uint *local_sync,
+ global struct AABB *primref,
+ local struct BuildRecord *buildRecord,
+ local struct Split *bestSplit,
+ local struct BinInfo2 *binInfo2,
+ global uint *primref_index0,
+ global uint *primref_index1)
+{
+
+ const uint localID = get_local_id(0);
+ const uint local_size = get_local_size(0);
+ const uint subgroupID = get_sub_group_id();
+ const uint numSubGroups = get_num_sub_groups();
+ const uint subgroupLocalID = get_sub_group_local_id();
+
+ const uint startID = buildRecord->start;
+ const uint endID = buildRecord->end;
+
+ struct BinMapping binMapping;
+ initBinMapping(&binMapping, &buildRecord->centroidBounds, 2 * BINS);
+
+ /* init bininfo */
+ parallel_initBinInfo2(binInfo2, 2 * BINS);
+
+ if (localID == 0)
+ *local_sync = 0;
+
+ work_group_barrier(CLK_LOCAL_MEM_FENCE);
+
+ for (uint t = startID + localID; t < endID; t += local_size)
+ {
+ const uint index = primref_index0[t];
+ primref_index1[t] = index;
+ atomicUpdateLocalBinInfo2(&binMapping, binInfo2, &primref[index]);
+ }
+
+ /* the last subgroup to finish binning (tracked via local_sync) computes the best split position */
+ sub_group_barrier(CLK_LOCAL_MEM_FENCE);
+ uint syncID = subgroupLocalID == 0 ? generic_atomic_add(local_sync, 1) : 0;
+ syncID = sub_group_broadcast(syncID, 0);
+
+ if (syncID + 1 == numSubGroups)
+ {
+ *bestSplit = reduceBinsAndComputeBestSplit32(binInfo2, binMapping.scale, startID, endID);
+ DBG(if (localID == 0) printSplit(bestSplit));
+ }
+
+ work_group_barrier(CLK_LOCAL_MEM_FENCE);
+}
+
+inline void parallel_partition_index(local uint *local_sync,
+ global struct AABB *primref,
+ struct BinMapping *binMapping,
+ const uint begin,
+ const uint end,
+ struct Split *inSplit,
+ local struct BuildRecord *outLeft,
+ local struct BuildRecord *outRight,
+ local struct AABB *outGeometryBoundsLeft,
+ local struct AABB *outGeometryBoundsRight,
+ global uint *primref_index0,
+ global uint *primref_index1,
+ uint *atomicCountLeft,
+ uint *atomicCountRight)
+{
+ const uint localID = get_local_id(0);
+ const uint local_size = get_local_size(0);
+ const uint subgroupID = get_sub_group_id();
+ const uint numSubGroups = get_num_sub_groups();
+ const uint subgroup_size = get_sub_group_size();
+ const uint subgroupLocalID = get_sub_group_local_id();
+
+ const uint size = end - begin;
+ struct Split split = *inSplit;
+
+ /* init bin bounds */
+ if (localID == 0)
+ {
+ initBuildRecord(outLeft, begin, end);
+ initBuildRecord(outRight, begin, end);
+ AABB_init(outGeometryBoundsLeft);
+ AABB_init(outGeometryBoundsRight);
+ *atomicCountLeft = 0;
+ *atomicCountRight = 0;
+ *local_sync = 0;
+ }
+
+ work_group_barrier(CLK_LOCAL_MEM_FENCE); // remove ?
+
+ struct BuildRecord left;
+ struct BuildRecord right;
+ initBuildRecord(&left, begin, end);
+ initBuildRecord(&right, begin, end);
+
+ struct AABB leftAABB;
+ struct AABB rightAABB;
+ AABB_init(&leftAABB);
+ AABB_init(&rightAABB);
+
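+ /* no valid split found: fall back to splitting the range in the middle at split.pos */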
+ if (split.sah == (float)(INFINITY))
+ {
+ if (subgroupID == 0)
+ {
+ for (uint i = begin + subgroupLocalID; i < split.pos; i += subgroup_size)
+ {
+ const uint index = primref_index1[i];
+ extendBuildRecord(&left, &primref[index]);
+ AABB_extendlu(&leftAABB, primref[index].lower, primref[index].upper);
+ primref_index0[i] = index;
+ }
+
+ for (uint i = split.pos + subgroupLocalID; i < end; i += subgroup_size)
+ {
+ const uint index = primref_index1[i];
+ extendBuildRecord(&right, &primref[index]);
+ AABB_extendlu(&rightAABB, primref[index].lower, primref[index].upper);
+ primref_index0[i] = index;
+ }
+
+ left.centroidBounds = AABB_sub_group_reduce(&left.centroidBounds);
+ right.centroidBounds = AABB_sub_group_reduce(&right.centroidBounds);
+ leftAABB = AABB_sub_group_reduce(&leftAABB);
+ rightAABB = AABB_sub_group_reduce(&rightAABB);
+
+ if (localID == 0)
+ {
+ outLeft->centroidBounds = left.centroidBounds;
+ outRight->centroidBounds = right.centroidBounds;
+
+ *outGeometryBoundsLeft = leftAABB;
+ *outGeometryBoundsRight = rightAABB;
+
+ outLeft->end = split.pos;
+ outRight->start = split.pos;
+
+ outGeometryBoundsLeft->lower.w = AABB_halfArea(outGeometryBoundsLeft);
+ outGeometryBoundsRight->lower.w = AABB_halfArea(outGeometryBoundsRight);
+ outGeometryBoundsLeft->upper.w = as_float(getNumPrimsBuildRecord(outLeft));
+ outGeometryBoundsRight->upper.w = as_float(getNumPrimsBuildRecord(outRight));
+ }
+ }
+ }
+ else
+ {
+
+ const int startID = begin + ((subgroupID + 0) * size / numSubGroups);
+ const int endID = begin + ((subgroupID + 1) * size / numSubGroups);
+
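+ /* each subgroup compacts its share of the range: the subgroup prefix sums give each lane
+ its slot, and the atomic counters give the subgroup's base offset on either side */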
+ for (uint i = startID + subgroupLocalID; i < endID; i += subgroup_size)
+ {
+ const uint index = primref_index1[i];
+ const uint isLeft = is_left(binMapping, &split, &primref[index]) ? 1 : 0;
+ const uint isRight = 1 - isLeft;
+ const uint countLeft = sub_group_reduce_add(isLeft);
+ const uint countRight = sub_group_reduce_add(isRight);
+ const uint prefixLeft = sub_group_scan_exclusive_add(isLeft);
+ const uint prefixRight = sub_group_scan_exclusive_add(isRight);
+
+ uint offsetLeft = subgroupLocalID == 0 ? generic_atomic_add(atomicCountLeft, countLeft) : 0;
+ offsetLeft = sub_group_broadcast(offsetLeft, 0);
+ uint offsetRight = subgroupLocalID == 0 ? generic_atomic_add(atomicCountRight, countRight) : 0;
+ offsetRight = sub_group_broadcast(offsetRight, 0);
+
+ if (isLeft)
+ {
+ extendBuildRecord(&left, &primref[index]);
+ AABB_extendlu(&leftAABB, primref[index].lower, primref[index].upper);
+ primref_index0[begin + offsetLeft + prefixLeft] = index;
+ }
+ else
+ {
+ extendBuildRecord(&right, &primref[index]);
+ AABB_extendlu(&rightAABB, primref[index].lower, primref[index].upper);
+ primref_index0[end - (offsetRight + countRight) + prefixRight] = index;
+ }
+ }
+ left.centroidBounds = AABB_sub_group_reduce(&left.centroidBounds);
+ right.centroidBounds = AABB_sub_group_reduce(&right.centroidBounds);
+ leftAABB = AABB_sub_group_reduce(&leftAABB);
+ rightAABB = AABB_sub_group_reduce(&rightAABB);
+
+ AABB_local_atomic_merge(&outLeft->centroidBounds, left.centroidBounds.lower, left.centroidBounds.upper);
+ AABB_local_atomic_merge(&outRight->centroidBounds, right.centroidBounds.lower, right.centroidBounds.upper);
+
+ AABB_local_atomic_merge(outGeometryBoundsLeft, leftAABB.lower, leftAABB.upper);
+ AABB_local_atomic_merge(outGeometryBoundsRight, rightAABB.lower, rightAABB.upper);
+
+ sub_group_barrier(CLK_LOCAL_MEM_FENCE);
+
+ if (subgroupLocalID == 0)
+ {
+ const uint sync = atomic_add(local_sync, 1);
+ if (sync + 1 == numSubGroups)
+ {
+ uint pos = begin + *atomicCountLeft; // a single lane of the last subgroup to finish computes "pos"
+ outLeft->end = pos;
+ outRight->start = pos;
+
+ outGeometryBoundsLeft->lower.w = AABB_halfArea(outGeometryBoundsLeft);
+ outGeometryBoundsRight->lower.w = AABB_halfArea(outGeometryBoundsRight);
+ outGeometryBoundsLeft->upper.w = as_float(getNumPrimsBuildRecord(outLeft));
+ outGeometryBoundsRight->upper.w = as_float(getNumPrimsBuildRecord(outRight));
+ }
+ }
+ }
+
+ work_group_barrier(CLK_LOCAL_MEM_FENCE);
+
+#if ENABLE_CHECKS == 1
+ if (localID == 0)
+ {
+ if (outLeft->end <= begin)
+ printf("pos begin error\n");
+ if (outLeft->end > end)
+ printf("pos end error\n");
+
+ for (uint i = outLeft->start; i < outLeft->end; i++)
+ {
+ const uint index = primref_index0[i];
+ //printf("left %d -> %d \n",i,index);
+ if (!is_left(binMapping, inSplit, &primref[index]))
+ printf("check left %d \n", i);
+ if (!AABB_verifyBounds(outLeft, outGeometryBoundsLeft, &primref[index]))
+ printf("check prim ref bounds left %d \n", i);
+ }
+ for (uint i = outRight->start; i < outRight->end; i++)
+ {
+ const uint index = primref_index0[i];
+ //printf("right %d -> %d \n",i,index);
+ if (is_left(binMapping, inSplit, &primref[index]))
+ printf("check right %d \n", i);
+ if (!AABB_verifyBounds(outRight, outGeometryBoundsRight, &primref[index]))
+ printf("check prim ref bounds right %d \n", i);
+ }
+ }
+#endif
+}
+
+
+#define ENABLE_LOOP_BREADTH_FIRST 0
+#if ENABLE_LOOP_BREADTH_FIRST
+// TBD: the layout of this struct might impact performance.
+struct BreadthFirstLoopLocals
+{
+ struct BuildRecord local_current;
+#if ENABLE_32BINS_IN_BREADTH_FIRST_PHASE == 0
+ struct BinInfo binInfo;
+#else
+ struct BinInfo2 binInfo;
+#endif
+ struct Split split;
+ struct BuildRecord children[BVH_NODE_N + 1];
+ struct AABB childrenAABB[BVH_NODE_N + 1];
+ uint atomicCountLeft;
+ uint atomicCountRight;
+ uint local_sync;
+ uint recordID;
+ uint buildRecordIDs[BUILDRECORD_STACK_SIZE];
+ uint numBuildRecordIDs;
+ bool exit;
+};
+
+
+inline void parallel_build_breadth_first_loopT(global struct Globals *globals,
+ global struct AABB *primref,
+ global uint *primref_index,
+ global char *bvh_mem,
+ uint subtreeThreshold,
+ local struct BreadthFirstLoopLocals *L,
+ struct BreadthFirstTemplateConst T)
+{
+ const uint global_size = get_global_size(0);
+ const uint local_size = get_local_size(0);
+ const uint localID = get_local_id(0);
+ const uint taskID = get_group_id(0);
+ const uint numTasks = get_num_groups(0);
+
+ const uint subgroupID = get_sub_group_id();
+ const uint subgroupLocalID = get_sub_group_local_id();
+
+ /* double buffered primref index array */
+ global uint *primref_index0 = primref_index;
+ global uint *primref_index1 = primref_index + globals->numPrimitives;
+
+ global struct BuildRecord *records = getBuildRecords(bvh_mem, globals);
+
+#if ENABLE_32BINS_IN_BREADTH_FIRST_PHASE == 0
+ const uint bins = BINS;
+#else
+ const uint bins = 2 * BINS;
+#endif
+
+ if (localID == 0)
+ {
+ L->numBuildRecordIDs = 0;
+ L->exit = false;
+ }
+
+ work_group_barrier(CLK_LOCAL_MEM_FENCE);
+
+ while (1)
+ {
+ if (localID == 0)
+ {
+ if (L->numBuildRecordIDs == 0)
+ {
+ L->recordID = generic_atomic_add(&globals->counter, 1);
+ if (L->recordID >= globals->numBuildRecords)
+ L->exit = true;
+ }
+ else
+ {
+ L->numBuildRecordIDs--;
+ L->recordID = L->buildRecordIDs[L->numBuildRecordIDs];
+ }
+ L->local_current = records[L->recordID];
+ }
+
+ work_group_barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);
+
+ /* no more buildrecords available ? */
+
+ if (L->exit)
+ break;
+
+ local struct BuildRecord *current = &L->local_current;
+ const uint items = getNumPrims(current);
+ const uint depth = getBuildRecursionDepth(current);
+
+ global unsigned int *num_records_output = &globals->numBuildRecords_extended;
+
+ struct QBVHNodeN *qnode = (struct QBVHNodeN *)current->current;
+
+ /* ignore small buildrecords */
+ if (items < max(subtreeThreshold, cfg_minLeafSize))
+ {
+ // do nothing
+ }
+ else
+ {
+ /*! find best split */
+#if ENABLE_32BINS_IN_BREADTH_FIRST_PHASE == 0
+ parallel_find_split(primref, current, &L->split, &L->binInfo, primref_index0, primref_index1);
+#else
+ parallel_find_split32(&L->local_sync, primref, current, &L->split, &L->binInfo, primref_index0, primref_index1);
+#endif
+ uint numChildren = 2;
+
+ /*! find best split */
+ struct BinMapping binMapping;
+ initBinMapping(&binMapping, &current->centroidBounds, bins);
+
+ parallel_partition_index(&L->local_sync, primref, &binMapping, current->start, current->end, &L->split, &L->children[0], &L->children[1], &L->childrenAABB[0], &L->childrenAABB[1], primref_index0, primref_index1, &L->atomicCountLeft, &L->atomicCountRight);
+
+ while (numChildren < BVH_NODE_N6)
+ {
+ /*! find best child to split */
+ const uint bestChild = subgroup_getMaxAreaChild(L->childrenAABB, numChildren);
+ if (bestChild == -1)
+ break;
+
+ /* perform best found split */
+ local struct BuildRecord *brecord = &L->children[bestChild];
+ local struct BuildRecord *lrecord = &L->children[numChildren + 0];
+ local struct BuildRecord *rrecord = &L->children[numChildren + 1];
+
+#if ENABLE_32BINS_IN_BREADTH_FIRST_PHASE == 0
+ parallel_find_split(primref, brecord, &L->split, &L->binInfo, primref_index0, primref_index1);
+#else
+ parallel_find_split32(&L->local_sync, primref, brecord, &L->split, &L->binInfo, primref_index0, primref_index1);
+#endif
+
+ initBinMapping(&binMapping, &brecord->centroidBounds, bins);
+
+ parallel_partition_index(&L->local_sync, primref, &binMapping, brecord->start, brecord->end, &L->split, lrecord, rrecord, &L->childrenAABB[numChildren + 0], &L->childrenAABB[numChildren + 1], primref_index0, primref_index1, &L->atomicCountLeft, &L->atomicCountRight);
+
+ *brecord = *rrecord;
+ L->childrenAABB[bestChild] = L->childrenAABB[numChildren + 1];
+
+ work_group_barrier(CLK_LOCAL_MEM_FENCE);
+
+ numChildren++;
+ }
+
+ //sub_group_barrier(CLK_LOCAL_MEM_FENCE);
+
+ if (localID <= 16 && subgroupID == 0)
+ {
+ global struct BVHBase *bvh_base = (global struct BVHBase *)bvh_mem;
+ global struct QBVHNodeN *nodes_start = BVHBase_nodeData(bvh_base);
+ global uint *back_pointers = BVHBase_backPointers(bvh_base);
+ uint qnode_index = 0;
+ if (T.allocateBackpointers)
+ {
+ /* index of the internal node; this is the key into the backpointers map */
+ qnode_index = get_qnode_index_for_backptr(nodes_start, qnode);
+ // the backpointer (parent index) is already set, but we still need to add/encode the number of children
+ // TODO: we would rather not have to read the value here just to add to it; maybe pass the grandparent pointer in the record..., or use an atomic...
+ back_pointers[qnode_index] += (numChildren << 3);
+ }
+
+ /* sort children based on range size */
+ const uint numPrimsIDs = select((uint)0, (as_uint(L->childrenAABB[subgroupLocalID].upper.w) << 3) | subgroupLocalID, subgroupLocalID < numChildren);
+ //const uint IDs = sortBVHChildrenIDs(numPrimsIDs) & (BVH_NODE_N-1);
+ const uint IDs = numPrimsIDs & 7;
+ const uint pushIDs = convertToPushIndices8(IDs);
+
+ /* alloc #numChildren nodes at once */
+ const uint node_offset = alloc_single_node_mem(globals, sizeof(struct QBVHNodeN) * numChildren);
+
+ /* update single relative node pointer and type */
+ const int offset = encodeOffset(bvh_mem, (global void *)qnode, node_offset) >> 6;
+ const uint type = BVH_INTERNAL_NODE;
+
+ /* set parent pointer in child build records */
+ if (subgroupLocalID < numChildren)
+ {
+ setBuildRecursionDepth(&L->children[subgroupLocalID], depth + 1);
+ global uchar *child_data_ptr = (global uchar *)bvh_mem + node_offset + pushIDs * sizeof(struct QBVHNodeN);
+ L->children[subgroupLocalID].current = child_data_ptr;
+ if (T.allocateBackpointers)
+ {
+ uint child_index = get_qnode_index_for_backptr(nodes_start, child_data_ptr);
+ back_pointers[child_index] = qnode_index << 6;
+ }
+ }
+
+ /* write out qbvh node */
+ subgroup_setQBVHNodeN(offset, type, &L->childrenAABB[IDs], numChildren, qnode);
+
+ /* write out child buildrecords to memory */
+
+ uint global_records_offset = (subgroupLocalID == 0) ? atomic_add(num_records_output, numChildren - 1) : 0;
+ global_records_offset = sub_group_broadcast(global_records_offset, 0);
+
+ if (localID == 0)
+ {
+ records[L->recordID] = L->children[0];
+ L->buildRecordIDs[L->numBuildRecordIDs++] = L->recordID;
+ for (uint i = 1; i < numChildren; i++)
+ {
+ const uint ID = globals->numBuildRecords + global_records_offset + i - 1;
+ records[ID] = L->children[i];
+ L->buildRecordIDs[L->numBuildRecordIDs++] = ID;
+ }
+ }
+ }
+ }
+ work_group_barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);
+ }
+
+ /* last active HW thread ? */
+ if (localID == 0)
+ {
+ const uint sync = atomic_add(&globals->sync, 1);
+ if (sync + 1 == numTasks)
+ {
+ globals->sync = 0;
+ /* set final number of buildrecords */
+ globals->numBuildRecords += globals->numBuildRecords_extended;
+ globals->numBuildRecords_extended = 0;
+ globals->counter = 0;
+ }
+ }
+}
+
+__attribute__((reqd_work_group_size(MAX_WORKGROUP_SIZE / 2, 1, 1)))
+__attribute__((intel_reqd_sub_group_size(16))) void kernel
+parallel_build_breadth_first_loop(global struct Globals *globals,
+ global struct AABB *primref,
+ global uint *primref_index,
+ global char *bvh_mem,
+ uint subtreeThreshold)
+{
+ local struct BreadthFirstLoopLocals L;
+ static const struct BreadthFirstTemplateConst T = {
+ false // bool allocateBackpointers;
+ };
+
+ parallel_build_breadth_first_loopT(globals,
+ primref,
+ primref_index,
+ bvh_mem,
+ subtreeThreshold,
+ &L,
+ T);
+}
+
+__attribute__((reqd_work_group_size(MAX_WORKGROUP_SIZE / 2, 1, 1)))
+__attribute__((intel_reqd_sub_group_size(16))) void kernel
+parallel_build_breadth_first_loop_backpointers(global struct Globals *globals,
+ global struct AABB *primref,
+ global uint *primref_index,
+ global char *bvh_mem,
+ uint subtreeThreshold)
+{
+ local struct BreadthFirstLoopLocals L;
+ static const struct BreadthFirstTemplateConst T = {
+ true // bool allocateBackpointers;
+ };
+
+ parallel_build_breadth_first_loopT(globals,
+ primref,
+ primref_index,
+ bvh_mem,
+ subtreeThreshold,
+ &L,
+ T);
+}
+// ===================================================
+// =============== experimental code =================
+// ===================================================
+#endif
+
+#define ENABLE_GLOBAL_SPLIT 0
+#if ENABLE_GLOBAL_SPLIT
+inline void parallel_partition_segment_index(local uint *local_sync,
+ global struct AABB *primref,
+ struct BinMapping *binMapping,
+ const uint begin,
+ const uint end,
+ const uint global_begin,
+ const uint global_end,
+ struct Split *inSplit,
+ local struct AABB *outLeft,
+ local struct AABB *outRight,
+ local struct AABB *outGeometryBoundsLeft,
+ local struct AABB *outGeometryBoundsRight,
+ global uint *primref_index0,
+ global uint *primref_index1,
+ uint *atomicCountLeft,
+ uint *atomicCountRight)
+{
+ const uint localID = get_local_id(0);
+ const uint local_size = get_local_size(0);
+ const uint subgroupID = get_sub_group_id();
+ const uint numSubGroups = get_num_sub_groups();
+ const uint subgroup_size = get_sub_group_size();
+ const uint subgroupLocalID = get_sub_group_local_id();
+
+ const uint size = end - begin;
+ struct Split split = *inSplit;
+
+ /* init bin bounds */
+ if (localID == 0)
+ {
+ AABB_init(outLeft);
+ AABB_init(outRight);
+ AABB_init(outGeometryBoundsLeft);
+ AABB_init(outGeometryBoundsRight);
+ *local_sync = 0;
+ }
+
+ work_group_barrier(CLK_LOCAL_MEM_FENCE);
+
+ struct AABB left;
+ struct AABB right;
+ AABB_init(&left);
+ AABB_init(&right);
+
+ struct AABB leftAABB;
+ struct AABB rightAABB;
+ AABB_init(&leftAABB);
+ AABB_init(&rightAABB);
+
+ const int startID = begin + ((subgroupID + 0) * size / numSubGroups);
+ const int endID = begin + ((subgroupID + 1) * size / numSubGroups);
+
+ for (uint i = startID + subgroupLocalID; i < endID; i += subgroup_size)
+ {
+ const uint index = primref_index1[i];
+ const uint isLeft = is_left(binMapping, &split, &primref[index]) ? 1 : 0;
+ const uint isRight = 1 - isLeft;
+ const uint countLeft = sub_group_reduce_add(isLeft);
+ const uint countRight = sub_group_reduce_add(isRight);
+ const uint prefixLeft = sub_group_scan_exclusive_add(isLeft);
+ const uint prefixRight = sub_group_scan_exclusive_add(isRight);
+
+ uint offsetLeft = subgroupLocalID == 0 ? generic_atomic_add(atomicCountLeft, countLeft) : 0;
+ offsetLeft = sub_group_broadcast(offsetLeft, 0);
+ uint offsetRight = subgroupLocalID == 0 ? generic_atomic_add(atomicCountRight, countRight) : 0;
+ offsetRight = sub_group_broadcast(offsetRight, 0);
+
+ if (isLeft)
+ {
+ AABB_extend_point(&left, AABB_centroid2(&primref[index]));
+ AABB_extendlu(&leftAABB, primref[index].lower, primref[index].upper);
+ primref_index0[global_begin + offsetLeft + prefixLeft] = index;
+ }
+ else
+ {
+ AABB_extend_point(&right, AABB_centroid2(&primref[index]));
+ AABB_extendlu(&rightAABB, primref[index].lower, primref[index].upper);
+ primref_index0[global_end - (offsetRight + countRight) + prefixRight] = index;
+ }
+ }
+ left = AABB_sub_group_reduce(&left);
+ right = AABB_sub_group_reduce(&right);
+ leftAABB = AABB_sub_group_reduce(&leftAABB);
+ rightAABB = AABB_sub_group_reduce(&rightAABB);
+
+ AABB_local_atomic_merge(outLeft, left.lower, left.upper);
+ AABB_local_atomic_merge(outRight, right.lower, right.upper);
+
+ AABB_local_atomic_merge(outGeometryBoundsLeft, leftAABB.lower, leftAABB.upper);
+ AABB_local_atomic_merge(outGeometryBoundsRight, rightAABB.lower, rightAABB.upper);
+
+ work_group_barrier(CLK_LOCAL_MEM_FENCE);
+}
+
+__attribute__((reqd_work_group_size(BINS * 2, 1, 1)))
+__attribute__((intel_reqd_sub_group_size(16)))
+void kernel global_init_split_iteration(global struct Globals *globals,
+ global struct GlobalBuildRecord *global_record,
+ global char *bvh_mem,
+ const uint subTreeThreshold)
+{
+ const uint localID = get_local_id(0);
+ const uint taskID = get_group_id(0);
+ const uint numTasks = get_num_groups(0);
+
+ global struct BuildRecord *records = getBuildRecords(bvh_mem, globals);
+
+ /* for each build record with size > subTreeThreshold initialize a global build record */
+
+ const uint startID = (taskID + 0) * globals->numBuildRecords / numTasks;
+ const uint endID = (taskID + 1) * globals->numBuildRecords / numTasks;
+
+ for (uint i = startID; i < endID; i++)
+ {
+ global struct BuildRecord *buildRecord = &records[i];
+ DBG(if (localID == 0) printf("i %d subTreeThreshold %d size %d \n", i, subTreeThreshold, buildRecord->end - buildRecord->start));
+
+ if ((buildRecord->end - buildRecord->start) > subTreeThreshold)
+ {
+ uint ID = localID == 0 ? generic_atomic_add(&globals->numGlobalBuildRecords, 1) : 0;
+
+ ID = work_group_broadcast(ID, 0);
+ global struct BinInfo2 *binInfo = &global_record[ID].binInfo;
+ global struct BinMapping *binMapping = &global_record[ID].binMapping;
+ initBinMapping(binMapping, &buildRecord->centroidBounds, 2 * BINS);
+ parallel_initBinInfo2(binInfo, 2 * BINS);
+ if (localID == 0)
+ {
+ global_record[ID].range.start = buildRecord->start;
+ global_record[ID].range.end = buildRecord->end;
+ global_record[ID].atomicCountLeft = 0;
+ global_record[ID].atomicCountRight = 0;
+ global_record[ID].buildRecordID = i;
+ AABB_init(&global_record[ID].leftCentroid);
+ AABB_init(&global_record[ID].rightCentroid);
+ AABB_init(&global_record[ID].leftGeometry);
+ AABB_init(&global_record[ID].rightGeometry);
+ }
+ }
+ }
+ DBG(
+ work_group_barrier(CLK_LOCAL_MEM_FENCE);
+ if (localID == 0)
+ printf("globals->numGlobalBuildRecords %d \n", globals->numGlobalBuildRecords););
+}
+
+__attribute__((reqd_work_group_size(MAX_WORKGROUP_SIZE, 1, 1)))
+__attribute__((intel_reqd_sub_group_size(16)))
+void kernel global_bin_iteration(global struct Globals *globals,
+ global struct AABB *primref,
+ global uint *primref_index,
+ global char *bvh_mem,
+ global struct GlobalBuildRecord *global_record)
+{
+ const uint localID = get_local_id(0);
+ const uint blockSize = get_local_size(0);
+ const uint taskID = get_group_id(0);
+ const uint numTasks = get_num_groups(0);
+
+ const uint numGlobalBuildRecords = globals->numGlobalBuildRecords;
+
+ /* early out */
+ if (numGlobalBuildRecords == 0)
+ return;
+
+ /* double buffered primref index array */
+ global uint *primref_index0 = primref_index;
+ global uint *primref_index1 = primref_index + globals->numPrimitives;
+
+ uint numBlocks = 0;
+
+ /* get total number of blocks, size of block == WG size */
+ for (uint i = 0; i < numGlobalBuildRecords; i++)
+ numBlocks += (global_record[i].range.end - global_record[i].range.start + blockSize - 1) / blockSize;
+
+ const uint startBlockID = (taskID + 0) * numBlocks / numTasks;
+ const uint endBlockID = (taskID + 1) * numBlocks / numTasks;
+ uint numBlockIDs = endBlockID - startBlockID;
+
+ uint splitRecordID = 0;
+ uint offset_start = 0;
+ uint offset_end = 0;
+ uint cur_blocks = 0;
+
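+    /* Map this workgroup's [startBlockID, endBlockID) range onto the global build
+     * records: find the record that contains startBlockID, plus the offsets of the
+     * first/last element this group bins within it. The remaining blocks spill
+     * over into the following records inside the while loop below. */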
+ for (uint blockCounter = 0; splitRecordID < numGlobalBuildRecords; splitRecordID++)
+ {
+ const uint sizeRecord = global_record[splitRecordID].range.end - global_record[splitRecordID].range.start;
+ const uint blocks = (sizeRecord + blockSize - 1) / blockSize;
+ if (startBlockID >= blockCounter && startBlockID < blockCounter + blocks)
+ {
+ const uint preBlocks = startBlockID - blockCounter;
+ cur_blocks = min(numBlockIDs, blocks - preBlocks);
+ offset_start = preBlocks * blockSize;
+ offset_end = min(offset_start + cur_blocks * blockSize, sizeRecord);
+ break;
+ }
+ blockCounter += blocks;
+ }
+
+ if (localID == 0)
+ DBG(printf("taskID %d numBlocks %d splitRecordID %d numBlockIDs %d offset_start %d offset_end %d cur_blocks %d \n", taskID, numBlocks, splitRecordID, numBlockIDs, offset_start, offset_end, cur_blocks));
+
+ local struct BinInfo2 local_binInfo;
+ parallel_initBinInfo2(&local_binInfo, 2 * BINS);
+ struct BinMapping binMapping = global_record[splitRecordID].binMapping;
+
+ while (1)
+ {
+ work_group_barrier(CLK_LOCAL_MEM_FENCE);
+
+ const uint startID = global_record[splitRecordID].range.start + offset_start;
+ const uint endID = global_record[splitRecordID].range.start + offset_end;
+
+ if (localID == 0)
+ DBG(printf("taskID %d startID %d endID %d \n", taskID, startID, endID));
+
+ for (uint i = startID + localID; i < endID; i += blockSize)
+ {
+ const uint index = primref_index0[i];
+ primref_index1[i] = index;
+ atomicUpdateLocalBinInfo2(&binMapping, &local_binInfo, &primref[index]);
+ }
+
+ work_group_barrier(CLK_LOCAL_MEM_FENCE); //FIXME: remove, do local sync
+ atomicUpdateGlobalFromLocalBinInfo2(&global_record[splitRecordID].binInfo, &local_binInfo, 2 * BINS);
+ work_group_barrier(CLK_LOCAL_MEM_FENCE);
+
+ numBlockIDs -= cur_blocks;
+ if (numBlockIDs == 0)
+ break;
+
+ splitRecordID++;
+ parallel_initBinInfo2(&local_binInfo, 2 * BINS);
+ binMapping = global_record[splitRecordID].binMapping;
+
+ const uint sizeRecord = global_record[splitRecordID].range.end - global_record[splitRecordID].range.start;
+ const uint blocks = (sizeRecord + blockSize - 1) / blockSize;
+ cur_blocks = min(numBlockIDs, blocks);
+ offset_start = 0;
+ offset_end = min(cur_blocks * blockSize, sizeRecord);
+
+ if (localID == 0)
+ DBG(printf("taskID %d numBlocks %d splitRecordID %d numBlockIDs %d offset_start %d offset_end %d cur_blocks %d \n", taskID, numBlocks, splitRecordID, numBlockIDs, offset_start, offset_end, cur_blocks));
+ }
+}
+
+__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
+__attribute__((intel_reqd_sub_group_size(16))) void kernel
+global_compute_best_split_iteration(global struct Globals *globals,
+ global char *bvh_mem,
+ global struct GlobalBuildRecord *global_record)
+{
+ const uint localID = get_local_id(0);
+ const uint blockSize = get_local_size(0);
+ const uint taskID = get_group_id(0);
+ const uint numTasks = get_num_groups(0);
+
+ const uint numGlobalBuildRecords = globals->numGlobalBuildRecords;
+
+ /* early out */
+ if (numGlobalBuildRecords == 0)
+ return;
+
+ const uint startRecordID = (taskID + 0) * numGlobalBuildRecords / numTasks;
+ const uint endRecordID = (taskID + 1) * numGlobalBuildRecords / numTasks;
+ for (uint i = startRecordID; i < endRecordID; i++)
+ {
+ struct Split split = reduceBinsAndComputeBestSplit32(&global_record[i].binInfo,
+ global_record[i].binMapping.scale,
+ global_record[i].range.start,
+ global_record[i].range.end);
+ if (localID == 0)
+ {
+ global_record[i].split = split;
+ global_record[i].atomicCountLeft = 0;
+ global_record[i].atomicCountRight = 0;
+ DBG(printSplit(&global_record[i].split));
+ }
+ }
+}
+
+__attribute__((reqd_work_group_size(MAX_WORKGROUP_SIZE, 1, 1)))
+__attribute__((intel_reqd_sub_group_size(16))) void kernel
+global_partition_iteration(global struct Globals *globals,
+ global struct AABB *primref,
+ global uint *primref_index,
+ global char *bvh_mem,
+ global struct GlobalBuildRecord *global_record)
+{
+
+ const uint localID = get_local_id(0);
+ const uint blockSize = get_local_size(0);
+ const uint taskID = get_group_id(0);
+ const uint numTasks = get_num_groups(0);
+
+ const uint numGlobalBuildRecords = globals->numGlobalBuildRecords;
+
+ /* early out */
+ if (numGlobalBuildRecords == 0)
+ return;
+
+ /* double buffered primref index array */
+ global uint *primref_index0 = primref_index;
+ global uint *primref_index1 = primref_index + globals->numPrimitives;
+
+ uint numBlocks = 0;
+
+ /* get total number of blocks, size of block == WG size */
+ for (uint i = 0; i < numGlobalBuildRecords; i++)
+ numBlocks += (global_record[i].range.end - global_record[i].range.start + blockSize - 1) / blockSize;
+
+ const uint startBlockID = (taskID + 0) * numBlocks / numTasks;
+ const uint endBlockID = (taskID + 1) * numBlocks / numTasks;
+ uint numBlockIDs = endBlockID - startBlockID;
+
+ uint splitRecordID = 0;
+ uint offset_start = 0;
+ uint offset_end = 0;
+ uint cur_blocks = 0;
+
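+    /* Same block-to-record mapping as in global_bin_iteration: locate the record
+     * that contains this group's first block and the offsets within it. */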
+ for (uint blockCounter = 0; splitRecordID < numGlobalBuildRecords; splitRecordID++)
+ {
+ const uint sizeRecord = global_record[splitRecordID].range.end - global_record[splitRecordID].range.start;
+ const uint blocks = (sizeRecord + blockSize - 1) / blockSize;
+ if (startBlockID >= blockCounter && startBlockID < blockCounter + blocks)
+ {
+ const uint preBlocks = startBlockID - blockCounter;
+ cur_blocks = min(numBlockIDs, blocks - preBlocks);
+ offset_start = preBlocks * blockSize;
+ offset_end = min(offset_start + cur_blocks * blockSize, sizeRecord);
+ break;
+ }
+ blockCounter += blocks;
+ }
+
+ if (localID == 0)
+ DBG(printf("partition taskID %d numBlocks %d splitRecordID %d numBlockIDs %d offset_start %d offset_end %d cur_blocks %d \n", taskID, numBlocks, splitRecordID, numBlockIDs, offset_start, offset_end, cur_blocks));
+
+ local struct AABB centroidAABB[2];
+ local struct AABB geometryAABB[2];
+ local uint local_sync;
+
+ while (1)
+ {
+
+ const uint startID = global_record[splitRecordID].range.start + offset_start;
+ const uint endID = global_record[splitRecordID].range.start + offset_end;
+
+ struct BinMapping binMapping = global_record[splitRecordID].binMapping;
+ struct Split split = global_record[splitRecordID].split;
+
+ const uint global_start = global_record[splitRecordID].range.start;
+ const uint global_end = global_record[splitRecordID].range.end;
+
+ if (localID == 0)
+ DBG(printf("partition taskID %d startID %d endID %d numBlocks %d splitRecordID %d numBlockIDs %d offset_start %d offset_end %d cur_blocks %d \n", taskID, startID, endID, numBlocks, splitRecordID, numBlockIDs, offset_start, offset_end, cur_blocks));
+
+        parallel_partition_segment_index(&local_sync, primref, &binMapping,
+                                         startID, endID, global_start, global_end, &split,
+                                         &centroidAABB[0], &centroidAABB[1],
+                                         &geometryAABB[0], &geometryAABB[1],
+                                         primref_index0, primref_index1,
+                                         &global_record[splitRecordID].atomicCountLeft,
+                                         &global_record[splitRecordID].atomicCountRight);
+
+ /* update global structures */
+ if (localID == 0)
+ {
+ AABB_global_atomic_merge(&global_record[splitRecordID].leftCentroid, &centroidAABB[0]);
+ AABB_global_atomic_merge(&global_record[splitRecordID].rightCentroid, &centroidAABB[1]);
+ AABB_global_atomic_merge(&global_record[splitRecordID].leftGeometry, &geometryAABB[0]);
+ AABB_global_atomic_merge(&global_record[splitRecordID].rightGeometry, &geometryAABB[1]);
+ }
+
+ numBlockIDs -= cur_blocks;
+ if (numBlockIDs == 0)
+ break;
+
+ splitRecordID++;
+
+ const uint sizeRecord = global_record[splitRecordID].range.end - global_record[splitRecordID].range.start;
+ const uint blocks = (sizeRecord + blockSize - 1) / blockSize;
+ cur_blocks = min(numBlockIDs, blocks);
+ offset_start = 0;
+ offset_end = min(cur_blocks * blockSize, sizeRecord);
+ }
+}
+
+inline void printBinaryNode(struct AABB *aabb)
+{
+ printf("lower %f upper %f lower.w %d upper.w %d \n", aabb->lower, aabb->upper, as_uint(aabb->lower.w), as_uint(aabb->upper.w));
+}
+
+__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
+__attribute__((intel_reqd_sub_group_size(16)))
+void kernel global_finalize_iteration(global struct Globals *globals,
+ global struct GlobalBuildRecord *global_record,
+ global char *bvh_mem,
+ global struct AABB *binary_nodes)
+{
+ const uint localID = get_local_id(0);
+ const uint localSize = get_local_size(0);
+ const uint groupID = get_group_id(0);
+ const uint numGroups = get_num_groups(0);
+
+ global struct BuildRecord *records = getBuildRecords(bvh_mem, globals);
+
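+    /* For every partitioned global record: shrink the original build record to the
+     * left half, append a new build record for the right half, and hang two child
+     * binary nodes under the record's binary node; lower.w carries the build record
+     * / child index and upper.w == -1 marks a binary leaf. */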
+ for (uint i = localID; i < globals->numGlobalBuildRecords; i += localSize)
+ {
+ const uint buildRecordID = global_record[i].buildRecordID;
+ const uint binaryNodeID = as_uint(records[buildRecordID].centroidBounds.lower.w);
+ /* left child buildrecord */
+ const uint leftID = buildRecordID;
+ records[leftID].start = global_record[i].range.start;
+ records[leftID].end = global_record[i].range.start + global_record[i].atomicCountLeft;
+ records[leftID].centroidBounds = global_record[i].leftCentroid;
+ /* right child buildrecord */
+ const uint rightID = generic_atomic_add(&globals->numBuildRecords, 1);
+ records[rightID].start = global_record[i].range.start + global_record[i].atomicCountLeft;
+ records[rightID].end = global_record[i].range.end;
+ records[rightID].centroidBounds = global_record[i].rightCentroid;
+ /* two binary nodes */
+ const uint binaryChildID = generic_atomic_add(&globals->numGlobalBinaryNodes, 2);
+ binary_nodes[binaryNodeID].lower.w = as_float(binaryChildID + 0);
+ binary_nodes[binaryNodeID].upper.w = as_float(binaryChildID + 1);
+ binary_nodes[binaryChildID + 0] = global_record[i].leftGeometry;
+ binary_nodes[binaryChildID + 1] = global_record[i].rightGeometry;
+ binary_nodes[binaryChildID + 0].lower.w = as_float(leftID);
+ binary_nodes[binaryChildID + 0].upper.w = as_float(-1);
+ binary_nodes[binaryChildID + 1].lower.w = as_float(rightID);
+ binary_nodes[binaryChildID + 1].upper.w = as_float(-1);
+ records[leftID].centroidBounds.lower.w = as_float(binaryChildID + 0);
+ records[rightID].centroidBounds.lower.w = as_float(binaryChildID + 1);
+ }
+
+ sub_group_barrier(CLK_LOCAL_MEM_FENCE);
+
+ if (localID == 0)
+ {
+ const uint sync = atomic_add(&globals->sync, 1);
+ if (sync + 1 == numGroups)
+ {
+ globals->sync = 0;
+ DBG(printf("globals->numBuildRecords %d \n", globals->numBuildRecords));
+ DBG(
+ for (uint i = 0; i < globals->numBuildRecords; i++) {
+ printf("i %d \n", i);
+ printBuildRecord(&records[i]);
+ } printf("Binary Tree \n");
+ for (uint i = 0; i < globals->numGlobalBinaryNodes; i++) {
+ printf("i %d \n", i);
+ printBinaryNode(&binary_nodes[i]);
+ }
+
+ );
+ globals->numGlobalBuildRecords = 0;
+ }
+ }
+}
+
+__attribute__((reqd_work_group_size(1, 1, 1))) void kernel global_build_top_level(global struct Globals *globals,
+ global struct GlobalBuildRecord *global_record,
+ global char *bvh_mem,
+ global struct AABB *binary_nodes)
+{
+#define MAX_TOP_LEVEL_STACK_DEPTH 32
+ struct AABB stack[MAX_TOP_LEVEL_STACK_DEPTH];
+ global uchar *stackParentPtrs[MAX_TOP_LEVEL_STACK_DEPTH];
+ struct AABB childrenAABB[BVH_NODE_N6];
+ float childrenHalfArea[BVH_NODE_N6];
+
+ /* build records */
+ global struct BuildRecord *record = getBuildRecords(bvh_mem, globals);
+
+ struct BVHBase *base = (struct BVHBase *)bvh_mem;
+ struct QBVHNodeN *qnode_root = (global struct QBVHNodeN *)(bvh_mem + base->rootNodeOffset);
+
+ uint stack_index = 1;
+ stack[0] = binary_nodes[0];
+ stackParentPtrs[0] = (global uchar *)qnode_root;
+
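+    /* Collapse the binary tree into BVH_NODE_N6-wide nodes: pop a binary node,
+     * repeatedly open the child with the largest surface area until up to
+     * BVH_NODE_N6 children are collected, emit one internal node and push the
+     * children; binary leaves just record their parent pointer in the build
+     * record. */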
+ while (stack_index != 0)
+ {
+ stack_index--;
+
+ childrenAABB[0] = stack[stack_index];
+ struct QBVHNodeN *qnode = (struct QBVHNodeN *)stackParentPtrs[stack_index];
+ childrenHalfArea[0] = AABB_halfArea(&childrenAABB[0]);
+
+ /* buildrecord leaf => set parent pointer and continue*/
+ DBG(
+ printf("stack_index %d \n", stack_index);
+ printf("as_uint(childrenAABB[0].upper.w) %d \n", as_uint(childrenAABB[0].upper.w)););
+
+ if (as_uint(childrenAABB[0].upper.w) == -1)
+ {
+ const uint buildRecordID = as_uint(childrenAABB[0].lower.w);
+ DBG(
+ printf("leaf buildRecordID %d \n", buildRecordID);
+ printBuildRecord(&record[buildRecordID]);)
+
+ record[buildRecordID].current = (global uchar *)qnode;
+ continue;
+ }
+
+ childrenHalfArea[0] = AABB_halfArea(&childrenAABB[0]);
+
+ uint numChildren = 1;
+ while (numChildren < BVH_NODE_N6)
+ {
+ // FIXME
+
+ /*! find best child to split */
+ float bestArea = -(float)INFINITY;
+ int bestChild = -1;
+ for (int i = 0; i < numChildren; i++)
+ {
+ /* ignore leaves as they cannot get split */
+ if (as_uint(childrenAABB[i].upper.w) == -1)
+ continue;
+
+ /* find child with largest surface area */
+ if (childrenHalfArea[i] > bestArea)
+ {
+ bestChild = i;
+                    bestArea = childrenHalfArea[i];
+ }
+ }
+ if (bestChild == -1)
+ break;
+ const uint leftID = as_uint(childrenAABB[bestChild].lower.w);
+ const uint rightID = as_uint(childrenAABB[bestChild].upper.w);
+ childrenAABB[bestChild] = binary_nodes[leftID];
+ childrenAABB[numChildren] = binary_nodes[rightID];
+ childrenHalfArea[bestChild] = AABB_halfArea(&childrenAABB[bestChild]);
+ childrenHalfArea[numChildren] = AABB_halfArea(&childrenAABB[numChildren]);
+ numChildren++;
+ }
+
+ const uint child_node_offset = alloc_single_node_mem(globals, sizeof(struct QBVHNodeN) * numChildren);
+
+ /* update single relative node pointer */
+ const int offset = encodeOffset(bvh_mem, (global void *)qnode, child_node_offset) >> 6;
+ const uint type = BVH_INTERNAL_NODE;
+
+ setQBVHNodeN(offset, type, childrenAABB, numChildren, qnode);
+
+ DBG(
+ printQBVHNodeN(qnode);
+ printf("numChildren %d \n", numChildren);
+ for (uint i = 0; i < numChildren; i++)
+ AABB_print(&childrenAABB[i]););
+
+ /* update parent pointer of build records of all children */
+ for (uint ID = 0; ID < numChildren; ID++)
+ {
+ stack[stack_index] = childrenAABB[ID];
+ stackParentPtrs[stack_index] = (global uchar *)bvh_mem + child_node_offset + ID * sizeof(struct QBVHNodeN);
+ stack_index++;
+ }
+ }
+}
+
+#endif
diff --git a/src/intel/vulkan/grl/gpu/bvh_build_treelet_refit.h b/src/intel/vulkan/grl/gpu/bvh_build_treelet_refit.h
new file mode 100644
index 00000000000..b8cf7288f6a
--- /dev/null
+++ b/src/intel/vulkan/grl/gpu/bvh_build_treelet_refit.h
@@ -0,0 +1,1507 @@
+//
+// Copyright (C) 2009-2021 Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+//
+//
+
+#pragma once
+
+#include "bvh_build_refit.h"
+#include "libs/lsc_intrinsics.h"
+
+
+#define REFIT_DEBUG_CHECKS 0
+#define REFIT_VERBOSE_LOG 0
+
+#define NUM_STARTPOINTS_IN_SLM (1024)
+
+GRL_INLINE void storeAABBToL1(struct AABB aabb, struct AABB* ptr)
+{
+ uint8 val = (uint8)(
+ as_uint(aabb.lower.x), as_uint(aabb.lower.y), as_uint(aabb.lower.z), as_uint(aabb.lower.w),
+ as_uint(aabb.upper.x), as_uint(aabb.upper.y), as_uint(aabb.upper.z), as_uint(aabb.upper.w));
+
+ store_uint8_L1WB_L3WB((__global uint8*) ptr, 0, val);
+}
+
+GRL_INLINE void storeAABBToL3(struct AABB aabb, struct AABB* ptr)
+{
+ uint8 val = (uint8)(
+ as_uint(aabb.lower.x), as_uint(aabb.lower.y), as_uint(aabb.lower.z), as_uint(aabb.lower.w),
+ as_uint(aabb.upper.x), as_uint(aabb.upper.y), as_uint(aabb.upper.z), as_uint(aabb.upper.w));
+
+ store_uint8_L1UC_L3WB((__global uint8*) ptr, 0, val);
+}
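+// The L1WB variant keeps the box resident in L1 for reuse within the same
+// group; the L1UC variant bypasses L1 and writes back to L3 so that other
+// workgroups (e.g. the tip-treelet pass) can observe the box.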
+
+typedef struct Treelet_by_single_group_locals
+{
+ uint startpoints[NUM_STARTPOINTS_IN_SLM];
+} Treelet_by_single_group_locals;
+
+typedef struct SquashedInputGroupDesc {
+ qword bvh;
+ qword scratch;
+ uint groupInTree;
+    uint totalNumGroups; // valid only for the 0th element in the array; otherwise it's just padding
+} SquashedInputGroupDesc;
+
+//
+//
+// update primitives
+//
+//
+
+typedef struct SquashedInput {
+ global struct BVHBase* pBvh;
+ global void* pInput;
+ global struct AABB* bbox_scratch;
+} SquashedInput;
+
+
+
+// updates one quad leaf and gets the BBOX containing it
+GRL_INLINE void refit_bottom_child_quad(
+ global struct QuadLeaf* quad,
+ global GRL_RAYTRACING_GEOMETRY_DESC* geomDesc,
+ struct AABB* childAABB)
+{
+ struct QuadLeaf Q;
+ get_updated_quad(quad, geomDesc, &Q);
+ quadCopyVertices(&Q, quad);
+ *childAABB = getAABB_Quad((struct Quad*) & Q); // FIXME: support leaves with more than one quad
+}
+
+// procedurals will have to take the old path at first
+#if 0
+// updates one procedural leaf and gets the BBOX containing it
+GRL_INLINE void refit_bottom_child_procedural(
+ global struct ProceduralLeaf** pleaf,
+ global GRL_RAYTRACING_GEOMETRY_DESC* geomDesc,
+ struct AABB* childAABB)
+{
+ global struct ProceduralLeaf* leaf = *pleaf;
+ /* extract geomID and primID from leaf */
+ const uint startPrim = QBVHNodeN_startPrim(curNode, child_idx);
+ const uint geomID = ProceduralLeaf_geomIndex(leaf);
+ const uint primID = ProceduralLeaf_primIndex(leaf, startPrim); // FIXME: have to iterate over all primitives of leaf!
+
+ /* read bounds from geometry descriptor */
+ struct GRL_RAYTRACING_AABB aabb = GRL_load_aabb(&geomDesc[geomID], primID);
+ childAABB->lower.x = aabb.MinX;
+ childAABB->lower.y = aabb.MinY;
+ childAABB->lower.z = aabb.MinZ;
+ childAABB->upper.x = aabb.MaxX;
+ childAABB->upper.y = aabb.MaxY;
+ childAABB->upper.z = aabb.MaxZ;
+
+ /* advance leaf pointer to next child */
+ *pleaf = leaf + QBVHNodeN_blockIncr(curNode, child_idx);
+}
+
+
+GRL_INLINE void update_procedural_leafs(
+ global struct BVHBase* bvh,
+ global void* input,
+ global struct AABB* bbox_scratch,
+ uint id,
+ uint num_done_by_one_thread)
+{
+ uint numLeaves = BVHBase_GetNumQuads(bvh);
+ uint leafsIndexOffset = bvh->proceduralDataStart - BVH_ROOT_NODE_OFFSET / 64;
+    global ProceduralLeaf* leafs = (global ProceduralLeaf*)BVHBase_GetProceduralLeaves(bvh);
+ uint start_leaf = id * num_done_by_one_thread;
+ uint end_leaf = min(start_leaf + num_done_by_one_thread, numLeaves);
+
+ global GRL_RAYTRACING_GEOMETRY_DESC* geosArray = (global GRL_RAYTRACING_GEOMETRY_DESC*) input;
+
+ for (uint leaf_id = start_leaf; leaf_id < end_leaf; leaf_id++)
+ {
+ struct AABB theAABB;
+ refit_bottom_child_procedural(leafs + leaf_id, geosArray, &theAABB);
+ theAABB.lower.w = as_float(0xABBADEFF);
+ theAABB.upper.w = 0x00;
+        storeAABBToL1(theAABB, &bbox_scratch[leafsIndexOffset + leaf_id]);
+ }
+}
+#endif
+
+GRL_INLINE void update_quads(
+ global struct BVHBase* bvh,
+ global void* input,
+ global struct AABB* bbox_scratch,
+ uint id,
+ uint num_done_by_one_thread)
+{
+ uint numLeaves = BVHBase_GetNumQuads(bvh);
+ uint leafsIndexOffset = bvh->quadLeafStart - BVH_ROOT_NODE_OFFSET / 64;
+ global QuadLeaf* leafs = (global QuadLeaf*)BVHBase_GetQuadLeaves(bvh);
+ uint start_leaf = id * num_done_by_one_thread;
+ uint end_leaf = min(start_leaf + num_done_by_one_thread, numLeaves);
+
+ global GRL_RAYTRACING_GEOMETRY_DESC* geosArray = (global GRL_RAYTRACING_GEOMETRY_DESC*) input;
+
+ for (uint leaf_id = start_leaf; leaf_id < end_leaf; leaf_id++)
+ {
+ struct AABB theAABB;
+ refit_bottom_child_quad(leafs + leaf_id, geosArray, &theAABB);
+ theAABB.lower.w = as_float(0xABBADEFF);
+ theAABB.upper.w = 0x00;
+ storeAABBToL1(theAABB, &bbox_scratch[leafsIndexOffset + leaf_id]);
+ }
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////////
+//
+// core bottom-up update functions
+//
+//
+
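+// Quantizes an AABB into the 8-bit child bounds of an internal node: lower
+// bounds are rounded down and upper bounds rounded up relative to the node
+// origin/exponent, so the quantized box always encloses the input box.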
+GRL_INLINE void quantise_bounds(
+ struct AABB* input_aabb, float3 len, float3 mant, float3 org, int3 exp,
+ uchar3* lower_uchar,
+ uchar3* upper_uchar)
+{
+ const float up = 1.0f + ulp;
+ const float down = 1.0f - ulp;
+
+ struct AABB child_aabb = conservativeAABB(input_aabb); // conservative ???
+
+ float3 lower = floor(bitShiftLdexp3((child_aabb.lower.xyz - org) * down, -exp + 8));
+ lower = clamp(lower, (float)(QUANT_MIN), (float)(QUANT_MAX));
+ float3 upper = ceil(bitShiftLdexp3((child_aabb.upper.xyz - org) * up, -exp + 8));
+ upper = clamp(upper, (float)(QUANT_MIN), (float)(QUANT_MAX));
+
+ *lower_uchar = convert_uchar3_rtn(lower);
+ *upper_uchar = convert_uchar3_rtp(upper);
+}
+
+typedef struct Qbounds_as_DW {
+ uint32_t xLL; uint32_t xLU; uint32_t xUU;
+ uint32_t yLL; uint32_t yLU; uint32_t yUU;
+ uint32_t zLL; uint32_t zLU; uint32_t zUU;
+} Qbounds_as_DW;
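+// DWORD view of the per-axis lower_x[6]/upper_x[6] byte arrays of an internal
+// node: xLL holds the lower bytes of children 0-3, xLU the lower bytes of
+// children 4-5 plus the upper bytes of children 0-1, and xUU the upper bytes
+// of children 2-5 (likewise for y/z). Assembling the node this way in
+// registers allows full-DWORD stores instead of byte-granular writes.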
+
+GRL_INLINE void encodeQuantisedDataAsDW(
+ uchar3 lower_uchar,
+ uchar3 upper_uchar,
+ uint idx,
+ Qbounds_as_DW* qbounds)
+{
+ uint shift_init = idx * 8;
+ if (idx >= 4) {
+ uint shift = (shift_init - 32);
+ qbounds->xLU |= ((uint)lower_uchar.x) << shift;
+ qbounds->yLU |= ((uint)lower_uchar.y) << shift;
+ qbounds->zLU |= ((uint)lower_uchar.z) << shift;
+ }
+ else {
+ qbounds->xLL |= ((uint)lower_uchar.x) << shift_init;
+ qbounds->yLL |= ((uint)lower_uchar.y) << shift_init;
+ qbounds->zLL |= ((uint)lower_uchar.z) << shift_init;
+ }
+
+ if (idx < 2) {
+ uint shift = (shift_init + 16);
+ qbounds->xLU |= ((uint)upper_uchar.x) << shift;
+ qbounds->yLU |= ((uint)upper_uchar.y) << shift;
+ qbounds->zLU |= ((uint)upper_uchar.z) << shift;
+ }
+ else {
+ uint shift = (shift_init - 16);
+
+ qbounds->xUU |= ((uint)upper_uchar.x) << shift;
+ qbounds->yUU |= ((uint)upper_uchar.y) << shift;
+ qbounds->zUU |= ((uint)upper_uchar.z) << shift;
+ }
+}
+
+GRL_INLINE void encodeChildBounds(uchar3 lower_uchar, uchar3 upper_uchar, uint ch, struct InternalNode* qnode)
+{
+ qnode->lower_x[ch] = lower_uchar.x; qnode->upper_x[ch] = upper_uchar.x;
+ qnode->lower_y[ch] = lower_uchar.y; qnode->upper_y[ch] = upper_uchar.y;
+ qnode->lower_z[ch] = lower_uchar.z; qnode->upper_z[ch] = upper_uchar.z;
+}
+
+
+GRL_INLINE GRL_OVERLOADABLE void InternalNode_setBounds_skip_prev(struct InternalNode* qbvh_node, uint prevChildIdx, struct AABB* prev_input_aabb, struct AABB* input_aabb, uint childrenIndex, const uint numChildren, struct AABB* aabb_reduced)
+{
+
+ int3 exp;
+ const float up = 1.0f + ulp;
+ struct AABB conservative_aabb = conservativeAABB(aabb_reduced);
+ const float3 len = AABB_size(&conservative_aabb).xyz * up;
+ const float3 mant = frexp_vec3(len, &exp);
+ const float3 org = conservative_aabb.lower.xyz;
+
+ exp += (mant > (float3)QUANT_MAX_MANT ? (int3)1 : (int3)0);
+
+ qbvh_node->lower[0] = org.x; qbvh_node->lower[1] = org.y; qbvh_node->lower[2] = org.z;
+
+ qbvh_node->exp_x = exp.x; qbvh_node->exp_y = exp.y; qbvh_node->exp_z = exp.z;
+
+ Qbounds_as_DW qbounds = { 0x0 };
+
+
+ {
+ uchar3 lower_uchar, upper_uchar;
+ quantise_bounds(prev_input_aabb, len, mant, org, exp, &lower_uchar, &upper_uchar);
+
+        // encode invalid children; it's enough to set 0x80 in the lower_x bytes
+ uint shift = numChildren * 8;
+ uint shift2 = min(shift, 31u);
+ qbounds.xLL = (0x80808080u << shift2);
+ uint shift3 = max(shift, 32u) - 32;
+ qbounds.xLU = (ushort)(((ushort)0x8080) << (ushort)shift3);
+
+ encodeQuantisedDataAsDW(lower_uchar, upper_uchar, prevChildIdx, &qbounds);
+ //encodeChildBounds(lower_uchar, upper_uchar, prevChildIdx, qbvh_node);
+ }
+
+ uint ch = prevChildIdx == 0;
+ while (ch < numChildren) {
+ uchar3 lower_uchar, upper_uchar;
+ quantise_bounds(input_aabb + ch, len, mant, org, exp, &lower_uchar, &upper_uchar);
+ encodeQuantisedDataAsDW(lower_uchar, upper_uchar, ch, &qbounds);
+ //encodeChildBounds(lower_uchar, upper_uchar, ch, qbvh_node);
+ ch += 1 + (prevChildIdx == (ch + 1));
+ }
+ Qbounds_as_DW* qbounds_dst = (Qbounds_as_DW*)(&qbvh_node->lower_x[0]);
+ *qbounds_dst = qbounds;
+ return;
+}
+
+GRL_INLINE struct AABB refitReduce2Boxes(struct AABB A, struct AABB B)
+{
+ AABB_extend(&A, &B);
+ // to make it work for TLAS node masks change to this:
+ // A.lower.w = as_float(as_uint(A.lower.w) | as_uint(B.lower.w));
+ A.lower.w = as_float(0xABBADE00u);
+ return A;
+}
+
+GRL_INLINE void refitReduceNodePrev(
+ uint prevIdx,
+ uint leadChildIdx,
+ uint numChildren,
+ struct AABB* globalBox,
+ struct AABB* reduceBox,
+ uint depth,
+ uint NodeIndex)
+{
+ uint8_t childIgnored = (prevIdx - leadChildIdx);
+
+# if REFIT_DEBUG_CHECKS
+ bool err = false;
+ if ((as_uint(reduceBox->lower.w) & 0xFFFFFF00) != 0xABBADE00u)
+ {
+ printf("refitReduceNode6 (loc_id %d): prev (used as child %d) not updated! NodeIndex %d, child nodeIdx %d at depth %d\n",
+ get_local_id(0),
+ childIgnored,
+ NodeIndex,
+ prevIdx,
+ depth);
+ err = true;
+ }
+
+ if ((as_uint(globalBox[NodeIndex].lower.w) & 0xFFFFFF00) == 0xABBADE00u)
+ {
+ printf("refitReduceNode6 (loc_id %d): dst node already updated. NodeIndex %d depth %d\n",
+ get_local_id(0),
+ NodeIndex,
+ depth);
+ }
+
+ bool fail = false;
+ for (uint k = 0; (k < numChildren) && !err; ++k) {
+ if (k != childIgnored) {
+ if ((as_uint(globalBox[leadChildIdx + k].lower.w) & 0xFFFFFF00) != 0xABBADE00u) {
+ printf("refitReduceNode6 (loc_id %d): child %d not updated! use prev %d, NodeIndex %d, child nodeIdx %d at depth %d\n",
+ get_local_id(0),
+ k,
+ prevIdx - leadChildIdx,
+ NodeIndex,
+ leadChildIdx + k,
+ depth);
+ fail = true;
+ }
+ }
+ }
+ err |= fail;
+# endif
+
+ // for each child 3 bits contains load index
+ const uint32_t indicesEncoded =
+ (1 << 0) +
+ (2 << 3) +
+ (3 << 6) +
+ (4 << 9) +
+ (5 << 12) +
+ (0 << 15) +
+ (1 << 18) +
+ (2 << 21) +
+ (3 << 24) +
+ (4 << 27);
+ // 1,2,3,4,5
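+    // Shifting the packed table right by childIgnored*3 rotates the load order so
+    // that the child equal to prevIdx is never fetched; its box already seeds
+    // *reduceBox, so it is skipped entirely.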
+
+
+ uint32_t indicesEncodedShifted = indicesEncoded >> (childIgnored * 3);
+
+ struct AABB* childAABB = globalBox + leadChildIdx;
+ struct AABB temp = childAABB[indicesEncodedShifted & 7];
+ indicesEncodedShifted >>= 3;
+ struct AABB* nextChild = childAABB + (indicesEncodedShifted & 7);
+ struct AABB backlog = temp;
+
+ for (uint child = 2; child < numChildren; child++)
+ {
+ temp = *nextChild;
+ *reduceBox = refitReduce2Boxes(*reduceBox, backlog);
+ indicesEncodedShifted >>= 3;
+ nextChild = childAABB + (indicesEncodedShifted & 7);
+ backlog = temp;
+ }
+
+ *reduceBox = refitReduce2Boxes(*reduceBox, backlog);
+
+#if REFIT_DEBUG_CHECKS
+ for (uint k = 0; (k < numChildren) && !err; ++k) {
+ if (k != childIgnored) {
+ if (!AABB_subset(&globalBox[leadChildIdx + k], reduceBox)) {
+ printf("refitReduceNode6 (loc_id %d): child AABB %d/%d reduction went wrong! skipped prev %d, NodeIndex %d, child nodeIdx %d at depth %d\n",
+ get_local_id(0),
+ k, numChildren,
+ prevIdx - leadChildIdx,
+ NodeIndex,
+ leadChildIdx + k,
+ depth);
+
+ err = true;
+ }
+ }
+ }
+ if (!err && ((as_uint(reduceBox->lower.w) & 0xFFFFFF00) != 0xABBADE00u)) {
+        printf("refitReduceNode6: haven't set the 0xABBADEXXu marker in result node %d at depth %d!\n",
+ NodeIndex,
+ depth);
+ }
+#endif
+}
+
+
+GRL_INLINE uint hash_local_id()
+{
+ return get_sub_group_local_id() * get_num_sub_groups() + get_sub_group_id();
+}
+
+//===============================================================
+//
+// Core update function
+//
+//===============================================================
+GRL_INLINE bool refit_treelet_by_single_group(
+ global struct AABB* bbox,
+ local Treelet_by_single_group_locals* loc,
+ uniform global BVHBase* pBvh,
+ uniform RefitTreelet trltDsc,
+ bool encodeQnodes,
+ bool isTipTreelet)
+{
+ BackPointers* backpointers = BVHBase_GetBackPointers(pBvh);
+ InternalNode* internalNodes = BVHBase_GetInternalNodes(pBvh);
+ uint local_id = get_local_id(0);
+ StartPoint* startPoints = BVHBase_GetRefitStartPoints(pBvh) + trltDsc.startpoint_offset;
+
+    // special case for single-path treelets; TODO: rewrite it as subgroup based
+ if (trltDsc.numStartpoints == 1) {
+ if (local_id == 0) {
+ RefitTreeletTrivial desc = *((RefitTreeletTrivial*)& trltDsc);
+ uint innerNodeIdx = desc.theOnlyNodeIndex;
+ uint numChildren = desc.numChildrenOfTheNode;
+ uint childIndex = desc.childrenOffsetOfTheNode;
+ uint maxDepth = desc.maxDepth;
+
+ uint prevIdx = childIndex;
+ struct AABB myBox = bbox[childIndex];
+ struct AABB prevAABB;
+ uint backpointer = maxDepth > 0 ? *InnerNode_GetBackPointer(backpointers, innerNodeIdx) : 0;
+ InternalNode* curNode = internalNodes + innerNodeIdx;
+ uint currDepth = 0;
+
+ while (1)
+ {
+ prevAABB = myBox;
+ if (numChildren > 1) { refitReduceNodePrev(prevIdx, childIndex, numChildren, bbox, &myBox, 0, innerNodeIdx); }
+
+                if (!encodeQnodes) { myBox.upper.w = as_float(numChildren + (childIndex << 4)); }
+
+ if (++currDepth > maxDepth) { break; }
+
+ if (encodeQnodes) {
+ InternalNode_setBounds_skip_prev(curNode, prevIdx - childIndex, &prevAABB, bbox + childIndex, childIndex, numChildren, &myBox);
+ }
+#if !REFIT_DEBUG_CHECKS
+ else
+#endif
+ { storeAABBToL1(myBox, &bbox[innerNodeIdx]); }
+
+ prevIdx = innerNodeIdx;
+ innerNodeIdx = BackPointer_GetParentIndex(backpointer);
+ backpointer = *InnerNode_GetBackPointer(backpointers, innerNodeIdx);
+ numChildren = BackPointer_GetNumChildren(backpointer);
+ curNode = internalNodes + innerNodeIdx;
+ childIndex = innerNodeIdx + curNode->childOffset;
+ }
+
+ if (isTipTreelet) {
+ AABB3f reduced3f = AABB3fFromAABB(myBox);
+ pBvh->Meta.bounds = reduced3f;
+ }
+ else {
+ storeAABBToL3(myBox, &bbox[innerNodeIdx]);
+ }
+
+ if (encodeQnodes || isTipTreelet) {
+ InternalNode_setBounds_skip_prev(curNode, prevIdx - childIndex, &prevAABB, bbox + childIndex, childIndex, numChildren, &myBox);
+ }
+
+#if REFIT_VERBOSE_LOG
+ printf("single node treelet: storing node idx %d \n", innerNodeIdx);
+#endif
+ }
+
+ return local_id == 0;
+ }
+
+ local uint* loc_startpoints = loc->startpoints;
+
+
+#if REFIT_DEBUG_CHECKS
+ if ((trltDsc.numNonTrivialStartpoints > NUM_STARTPOINTS_IN_SLM)) {
+        if(local_id == 0) printf("out of SLM space, trltDsc.numNonTrivialStartpoints > NUM_STARTPOINTS_IN_SLM\n");
+ return local_id == 0;
+ }
+#endif
+
+ uint SLMedStartpointsOffset = trltDsc.numStartpoints - trltDsc.numNonTrivialStartpoints;
+
+ /*=====================================================================
+       first phase: update the startpoint nodes only
+ ----------------------------------------------------------------------*/
+ for (uint startpoint_i = local_id; startpoint_i < trltDsc.numStartpoints; startpoint_i += get_local_size(0)) {
+ uint startpoint = (uint)intel_sub_group_block_read_ui((global uint*)(startPoints + startpoint_i));
+ uint innerNodeIdx = StartPoint_GetNodeIdx(startpoint);
+ uint backpointer = *InnerNode_GetBackPointer(backpointers, innerNodeIdx);
+ if (startpoint_i >= SLMedStartpointsOffset) {
+ uint idx = startpoint_i - SLMedStartpointsOffset;
+ loc_startpoints[idx] = (BackPointer_GetParentIndex(backpointer) << 6) | StartPoint_GetDepth(startpoint);
+ }
+
+ uint numChildren = BackPointer_GetNumChildren(backpointer);
+ InternalNode* curNode = internalNodes + innerNodeIdx;
+ uint childIndex = innerNodeIdx + curNode->childOffset;
+
+ uint prevIdx = childIndex;
+ struct AABB myBox = bbox[childIndex];
+ struct AABB prevAABB = myBox;
+
+# if REFIT_DEBUG_CHECKS
+ if (numChildren == 0) {
+            printf("this node has no children!\n");
+ AABB_init(&myBox);
+ }
+# endif
+
+ if (numChildren > 1) { refitReduceNodePrev(prevIdx, childIndex, numChildren, bbox, &myBox, 0, innerNodeIdx); }
+ myBox.upper.w = encodeQnodes ? 0 : as_float(numChildren + (childIndex << 4));
+
+#if REFIT_VERBOSE_LOG
+ printf("init phase: at depth 0 storing node idx %d \n", innerNodeIdx);
+#endif
+ storeAABBToL1(myBox, &bbox[innerNodeIdx]);
+
+ if (encodeQnodes) {
+ InternalNode_setBounds_skip_prev(curNode, 0, &prevAABB, bbox + childIndex, childIndex, numChildren, &myBox);
+ }
+ }
+
+ uniform uint CurrPeeledDepth = 1;
+ uniform uint numStartpoints = trltDsc.numNonTrivialStartpoints;
+ uint nextFloorStartpoint = hash_local_id();
+
+ uint depthOnionEnd = trltDsc.depthLess64;
+ if (get_local_size(0) == 128) { depthOnionEnd = trltDsc.depthLess128; }
+ if (get_local_size(0) == 256) { depthOnionEnd = trltDsc.depthLess256; }
+
+ /*=====================================================================
+       second phase: we update horizontally until the number of
+       active paths drops below the group size
+ ----------------------------------------------------------------------*/
+ while (CurrPeeledDepth < depthOnionEnd) {
+ mem_fence_workgroup_default();
+
+ work_group_barrier(CLK_LOCAL_MEM_FENCE, memory_scope_work_group);
+ uint start = nextFloorStartpoint;
+ nextFloorStartpoint = numStartpoints;
+
+ for (uint startpoint_i = start; startpoint_i < numStartpoints; startpoint_i += get_local_size(0)) {
+ uint startpoint = loc_startpoints[startpoint_i];
+ uint innerNodeIdx = StartPoint_GetNodeIdx(startpoint);
+ uint backpointer = *InnerNode_GetBackPointer(backpointers, innerNodeIdx);
+
+ if (StartPoint_GetDepth(startpoint) > CurrPeeledDepth) {
+ StartPoint newSP = (BackPointer_GetParentIndex(backpointer) << 6) | StartPoint_GetDepth(startpoint);
+ loc_startpoints[startpoint_i] = newSP;
+ nextFloorStartpoint = min(nextFloorStartpoint, startpoint_i);
+ }
+
+ InternalNode* curNode = internalNodes + innerNodeIdx;
+ uint childIndex = innerNodeIdx + curNode->childOffset;
+ uint numChildren = BackPointer_GetNumChildren(backpointer);
+
+ uint prevIdx = childIndex;
+ struct AABB myBox = bbox[childIndex];
+ struct AABB prevAABB = myBox;
+ refitReduceNodePrev(prevIdx, childIndex, numChildren, bbox, &myBox, CurrPeeledDepth, innerNodeIdx);
+
+ myBox.upper.w = encodeQnodes ? 0 : as_float(numChildren + (childIndex << 4));
+
+#if REFIT_VERBOSE_LOG
+ printf("onion: startpoint %d <n=%d , d=%d> at depth %d storing node idx %d \n", startpoint_i, StartPoint_GetNodeIdx(startpoint), StartPoint_GetDepth(startpoint), CurrPeeledDepth, innerNodeIdx);
+#endif
+ storeAABBToL1(myBox, &bbox[innerNodeIdx]);
+ if (encodeQnodes) {
+ InternalNode_setBounds_skip_prev(curNode, 0, &prevAABB, bbox + childIndex, childIndex, numChildren, &myBox);
+ }
+ }
+ CurrPeeledDepth++;
+ }
+
+ uint startpoint_idx = nextFloorStartpoint;
+ bool active = startpoint_idx < numStartpoints;
+
+ work_group_barrier(CLK_LOCAL_MEM_FENCE, memory_scope_work_group);
+ StartPoint startpoint = loc_startpoints[startpoint_idx];
+
+ struct AABB myBox;
+ uint prevIdx = 0;
+ uint innerNodeIdx = StartPoint_GetNodeIdx(startpoint);
+
+ /*=====================================================================
+       last phase: each thread just continues its path to the end
+
+       only the thread that computes the longest path leaves prematurely
+       (that's why the while condition isn't <=); the code for finalizing the root
+       of the treelet is special and handled afterwards
+
+       TODO: with proper assignment of paths to lanes we should quite soon reach
+       only three active lanes per physical thread, for which subgroups could be used
+ ----------------------------------------------------------------------*/
+ bool prevActive = active;
+ while (CurrPeeledDepth < trltDsc.maxDepth) {
+ uint backpointer;
+ uint childIndex;
+ InternalNode* curNode = internalNodes + innerNodeIdx;
+ if (active) {
+ childIndex = innerNodeIdx + curNode->childOffset;
+ backpointer = *InnerNode_GetBackPointer(backpointers, innerNodeIdx);
+ } else if(prevActive){
+ mem_fence_workgroup_default();
+ }
+
+ prevActive = active;
+
+ work_group_barrier(0, memory_scope_work_group);
+ //printf("Start node %d at depth %d, innerNodeIdx %d dying! \n", StartPoint_GetNodeIdx(startpoint), CurrPeeledDepth, innerNodeIdx);
+ if (active) {
+
+#if REFIT_DEBUG_CHECKS
+ if (CurrPeeledDepth > StartPoint_GetDepth(startpoint))
+ {
+ printf("uppath: startpoint %d <n=%d , d=%d> at depth %d shouldn't be active!\n", startpoint_idx, StartPoint_GetNodeIdx(startpoint), StartPoint_GetDepth(startpoint), CurrPeeledDepth);
+ }
+#endif
+ if (prevIdx == 0) {
+ myBox = bbox[childIndex];
+ prevIdx = childIndex;
+ }
+ uint numChildren = BackPointer_GetNumChildren(backpointer);
+
+ struct AABB prevAABB = myBox;
+ refitReduceNodePrev(prevIdx, childIndex, numChildren, bbox, &myBox, CurrPeeledDepth, innerNodeIdx);
+ myBox.upper.w = encodeQnodes ? 0 : as_float(numChildren + (childIndex << 4));
+#if REFIT_VERBOSE_LOG
+ printf("uppath: startpoint %d <n=%d , d=%d> at depth %d storing node idx %d \n", startpoint_idx, StartPoint_GetNodeIdx(startpoint), StartPoint_GetDepth(startpoint), CurrPeeledDepth, innerNodeIdx);
+#endif
+ active = CurrPeeledDepth < StartPoint_GetDepth(startpoint);
+
+ if (encodeQnodes) {
+#if !REFIT_DEBUG_CHECKS
+ if (!active)
+#endif
+ { storeAABBToL1(myBox, &bbox[innerNodeIdx]); }
+ InternalNode_setBounds_skip_prev(curNode, prevIdx - childIndex, &prevAABB, bbox + childIndex, childIndex, numChildren, &myBox);
+ } else {
+ storeAABBToL1(myBox, &bbox[innerNodeIdx]);
+ }
+
+ prevIdx = innerNodeIdx;
+ innerNodeIdx = BackPointer_GetParentIndex(backpointer);
+ }
+
+ CurrPeeledDepth++;
+ }
+
+ {
+ uint backpointer;
+ uint childIndex;
+ InternalNode* curNode = internalNodes + innerNodeIdx;
+ if (active) {
+ childIndex = innerNodeIdx + curNode->childOffset;
+ backpointer = *InnerNode_GetBackPointer(backpointers, innerNodeIdx);
+ } else if(prevActive) {
+ mem_fence_workgroup_default();
+ }
+
+ work_group_barrier(0, memory_scope_work_group);
+
+ /*=====================================================================
+           final step: special processing of the root;
+           it is different, since its box is transferred cross-group (written to L3),
+           or it is the root of the whole tree and hence fills the global box in the bvh metadata
+           TODO: this should be done in a SG as only one thread is active
+ ----------------------------------------------------------------------*/
+ if (active) {
+ if (prevIdx == 0) {
+ myBox = bbox[childIndex];
+ prevIdx = childIndex;
+ }
+ uint numChildren = BackPointer_GetNumChildren(backpointer);
+ struct AABB prevAABB = myBox;
+ refitReduceNodePrev(prevIdx, childIndex, numChildren, bbox, &myBox, CurrPeeledDepth, innerNodeIdx);
+ myBox.upper.w = encodeQnodes ? 0 : as_float(numChildren + (childIndex << 4));
+
+#if REFIT_VERBOSE_LOG
+ printf("root: startpoint %d <n=%d , d=%d> at depth %d storing node idx %d \n", startpoint_idx, StartPoint_GetNodeIdx(startpoint), StartPoint_GetDepth(startpoint), CurrPeeledDepth, innerNodeIdx/*,WeReInSIMD*/);
+#endif
+ if (isTipTreelet) {
+ AABB3f reduced3f = AABB3fFromAABB(myBox);
+ pBvh->Meta.bounds = reduced3f;
+ InternalNode_setBounds_skip_prev(curNode, prevIdx - childIndex, &prevAABB, bbox + childIndex, childIndex, numChildren, &myBox);
+ } else {
+ storeAABBToL3(myBox, &bbox[innerNodeIdx]);
+ if (encodeQnodes) {
+ InternalNode_setBounds_skip_prev(curNode, prevIdx - childIndex, &prevAABB, bbox + childIndex, childIndex, numChildren, &myBox);
+ }
+ }
+ }
+ }
+
+ return active;
+}
+
+
+//////////////////////////////////////////////////////////////////////////////////////
+//
+// Internal nodes encoding as a separate dispatch
+//
+//
+
+// encode qnodes as a separate pass
+GRL_INLINE void post_refit_encode_qnode_tree_per_group(
+ global struct AABB* bbox_scratch,
+ global struct BVHBase* bvh)
+{
+ uint numInnerNodes = BVHBase_GetNumInternalNodes(bvh);
+ InternalNode* internalNodes = BVHBase_GetInternalNodes(bvh);
+
+ for (uint nodeIdx = get_local_id(0) + 1 /*+1 because node 0 is already updated*/; nodeIdx < numInnerNodes; nodeIdx += get_local_size(0))
+ {
+ struct AABB reduced = bbox_scratch[nodeIdx];
+# if REFIT_DEBUG_CHECKS
+ if ((as_uint(reduced.lower.w) & 0xFFFFFF00) != 0xABBADE00u) {
+ printf("qnode enc group: NodeIndex %d not updated! \n", nodeIdx);
+ return;
+ }
+ for (uint k = 0; k < (as_uint(reduced.upper.w) & 7); ++k) {
+ uint childIdx = (as_uint(reduced.upper.w) >> 4) + k;
+ if ((as_uint(bbox_scratch[childIdx].lower.w) & 0xFFFFFF00) != 0xABBADE00u) {
+ printf("qnode enc group: child not updated! NodeIndex %d, child nodeIdx %d \n", nodeIdx, childIdx);
+ return;
+ }
+ }
+# endif
+ struct InternalNode* qbvh_node = internalNodes + nodeIdx;
+ uint childIndex = as_uint(reduced.upper.w) >> 4;
+ uint numChildren = as_uint(reduced.upper.w) & 7;
+ struct AABB* children = bbox_scratch + childIndex;
+ //InternalNode_setBounds(internalNodes + nodeIdx, bbox_scratch + (as_uint(reduced.upper.w) >> 4), as_uint(reduced.upper.w) & 7, &reduced);
+ InternalNode_setBounds_skip_prev(qbvh_node, 0, children, children, childIndex, numChildren, &reduced);
+ }
+}
+
+//////////////////////////////////////////////////////////////////////////////////////
+//
+// Construction of treelets and paths
+//
+//
+
+// This is a tiny bit tricky: while the bottom-up thread hasn't yet closed the treelet, this holds the number of startpoints under the node;
+// once the thread has closed the treelet, the data becomes the treelet ID.
+typedef uint TreeletNodeData;
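+// Bit layout while a node is still open: bit 31 = treelet-root flag, bit 30 =
+// tip-startpoint flag, bits 16..29 = maxDepth and bits 0..15 = numStartpoints.
+// Once the treelet is closed, bit 31 stays set and bits 0..30 hold the treelet
+// ID instead.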
+
+typedef struct TreeletsOpenNodeInfo {
+ // bool isTreeletRoot; // : 1
+ short maxDepth; // : 14
+ uint numStartpoints;// : 16
+} TreeletsOpenNodeInfo;
+
+typedef struct TreeletsClosedNodeInfo {
+ // bool isTreeletRoot; // : 1
+ uint treeletId; // : 31 (when treelet is closed)
+} TreeletsClosedNodeInfo;
+
+GRL_INLINE TreeletNodeData ClearTreeletRoot(TreeletNodeData D)
+{
+ return D & ((1u << 31u) - 1u);
+}
+
+GRL_INLINE uint isTreeletRoot(TreeletNodeData E)
+{
+ return E >> 31;
+}
+
+GRL_INLINE uint getNumStartpoints(TreeletNodeData E)
+{
+ return E & ((1 << 16) - 1);
+}
+
+GRL_INLINE uint getMaxDepth(TreeletNodeData E)
+{
+ return (E >> 16) & ((1 << 14) - 1);
+}
+
+// single startpoint treelet
+GRL_INLINE uint isTrivialTreeletRoot(TreeletNodeData E)
+{
+ return (E >> 31) && (getMaxDepth(E) == 0);
+}
+
+GRL_INLINE TreeletNodeData SetTipStartpoint(TreeletNodeData D)
+{
+ return ClearTreeletRoot(D) | (1 << 30);
+}
+
+GRL_INLINE TreeletNodeData SetTreeletRoot(TreeletNodeData D)
+{
+ return D | (1 << 31);
+}
+
+GRL_INLINE TreeletsOpenNodeInfo DecodeOpenInfo(TreeletNodeData E)
+{
+ TreeletsOpenNodeInfo I;
+ I.maxDepth = getMaxDepth(E);
+ I.numStartpoints = getNumStartpoints(E);
+ return I;
+}
+
+GRL_INLINE TreeletNodeData EncodeOpenInfo(TreeletsOpenNodeInfo I, bool isRoot)
+{
+ TreeletNodeData D = isRoot ? (1 << 31) : 0;
+ D |= (I.maxDepth & ((1 << 14) - 1)) << 16;
+ D |= I.numStartpoints & ((1 << 16) - 1);
+ return D;
+}
+
+GRL_INLINE TreeletsClosedNodeInfo DecodeClosedInfo(TreeletNodeData E)
+{
+ TreeletsClosedNodeInfo I;
+ I.treeletId = E & ((1u << 31u) - 1u);
+ return I;
+}
+
+GRL_INLINE TreeletNodeData GRL_OVERLOADABLE EncodeClosedInfo(TreeletsClosedNodeInfo I)
+{
+ TreeletNodeData D = (1u << 31u); // closed is always a root!
+ D |= I.treeletId & ((1u << 31u) - 1u);
+ return D;
+}
+
+GRL_INLINE TreeletNodeData GRL_OVERLOADABLE EncodeClosedInfo(uint treeletId)
+{
+ TreeletNodeData D = (1 << 31); // closed is always a root!
+ D |= treeletId & ((1u << 31u) - 1u);
+ return D;
+}
+
+GRL_INLINE void chk_close_Treelet(
+ RefitTreelet* TreeletDescsArr,
+ TreeletNodeData* nodeTreeletDataArr,
+ uint* StartPointBuffer,
+ uint* currStartpoint,
+ TreeletNodeData nodeData,
+ TreeletsOpenNodeInfo* nodeOpenInfo,
+ uint nodeIdx,
+ uint* treeletDescIdx)
+{
+ if (isTreeletRoot(nodeData))
+ {
+ TreeletNodeData encoded = 0;
+ if (nodeOpenInfo->numStartpoints == 1)
+ {
+ encoded = ClearTreeletRoot(SetTipStartpoint(nodeData));
+ }
+ else
+ {
+ RefitTreelet RTdesc;
+ RTdesc.startpoint_offset = *currStartpoint;
+ *currStartpoint += nodeOpenInfo->numStartpoints;
+ RTdesc.numStartpoints = nodeOpenInfo->numStartpoints;
+ RTdesc.maxDepth = nodeOpenInfo->maxDepth;
+ TreeletDescsArr[*treeletDescIdx] = RTdesc;
+ encoded = EncodeClosedInfo(*treeletDescIdx);
+ *treeletDescIdx = *treeletDescIdx + 1;
+ TreeletsOpenNodeInfo infoDefault = { 0, 0 };
+ *nodeOpenInfo = infoDefault;
+ }
+
+ nodeTreeletDataArr[nodeIdx] = encoded;
+ }
+ // printf("close_Treelet %d, nodeOpenInfo.numStartpoints %d, RTdesc.maxDepth %d, RTdesc.startpoint_offset %d\n", treeletDescIdx, nodeOpenInfo.numStartpoints, RTdesc.maxDepth, RTdesc.startpoint_offset);
+}
+
+
+// TreeletNodeData* treelets holds per node property, after running this some of them are marked as treelet root
+GRL_INLINE void treelet_bottom_up_mark_treelets(
+ global struct BVHBase* bvh,
+ global InternalNode* internalNodes,
+ global StartPoint* scratch_startpoints,
+ uint curNodeIndex,
+ BackPointers* backPointers,
+ global TreeletNodeData* treelets,
+ uint refitTreeletsDataStart,
+ uint* startpointAlloc)
+{
+ TreeletsOpenNodeInfo currInfo;
+ currInfo.maxDepth = 0;
+ currInfo.numStartpoints = 1;
+
+ global RefitTreelet* treeletDescs = (global RefitTreelet*) (((global char*)bvh) + (refitTreeletsDataStart * 64));
+
+ treelets[curNodeIndex] = EncodeOpenInfo(currInfo, true);
+
+    /* the start node has already been processed, thus go to its parent node */
+ uint parentPointer = *InnerNode_GetBackPointer(backPointers, curNodeIndex);
+ curNodeIndex = parentPointer >> 6;
+
+ bool isInTip = false;
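+    // the walk stops at 0x03FFFFFF, the all-ones 26-bit parent index that the
+    // root's backpointer uses as its "no parent" sentinel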
+ while (curNodeIndex != 0x03FFFFFF)
+ {
+ uint numChildrenTotal = 0;
+        // numChildrenTotal and parentPointer get updated below...
+        // atomic trickery on backpointers: only the last arriving thread continues upward
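+        // backpointer format: bits 0..2 count the already-refitted children,
+        // bits 3..5 hold the total child count, bits 6 and up hold the parent index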
+ {
+ /* increment refit counter that counts refitted children of current node */
+ global uint* pCurrentBackpointer = (global uint*)InnerNode_GetBackPointer(backPointers, curNodeIndex);
+ mem_fence_gpu_invalidate();
+ parentPointer = 1 + atomic_inc_global(pCurrentBackpointer);
+
+ /* if all children got refitted, then continue */
+ const uint numChildrenRefitted = (parentPointer >> 0) & 0x7;
+ numChildrenTotal = (parentPointer >> 3) & 0x7;
+
+ if (numChildrenRefitted != numChildrenTotal)
+ return;
+
+ /* reset refit counter for next refit */
+ *pCurrentBackpointer = (parentPointer & 0xfffffff8);
+ }
+
+ /* get children treelets */
+ global struct InternalNode* node = internalNodes + curNodeIndex;
+ uint childrenIndices = curNodeIndex + node->childOffset;
+ global TreeletNodeData* childrenTreelets = treelets + childrenIndices;
+
+        // yes, we may be pulling garbage here for children that don't exist, but we won't use it;
+        // this is done for the sake of a single data pull that isn't spoiled by control flow
+ TreeletNodeData dataCh0 = childrenTreelets[0]; TreeletNodeData dataCh1 = childrenTreelets[1];
+ TreeletNodeData dataCh2 = childrenTreelets[2]; TreeletNodeData dataCh3 = childrenTreelets[3];
+ TreeletNodeData dataCh4 = childrenTreelets[4]; TreeletNodeData dataCh5 = childrenTreelets[5];
+
+ // zero out the potential trash
+ if (numChildrenTotal < 3) dataCh2 = 0;
+ if (numChildrenTotal < 4) dataCh3 = 0;
+ if (numChildrenTotal < 5) dataCh4 = 0;
+ if (numChildrenTotal < 6) dataCh5 = 0;
+
+ TreeletsOpenNodeInfo infoCh0 = DecodeOpenInfo(dataCh0);
+ TreeletsOpenNodeInfo infoCh1 = DecodeOpenInfo(dataCh1);
+ TreeletsOpenNodeInfo infoCh2 = DecodeOpenInfo(dataCh2);
+ TreeletsOpenNodeInfo infoCh3 = DecodeOpenInfo(dataCh3);
+ TreeletsOpenNodeInfo infoCh4 = DecodeOpenInfo(dataCh4);
+ TreeletsOpenNodeInfo infoCh5 = DecodeOpenInfo(dataCh5);
+
+ uint numChildrenBeingRoots = isTreeletRoot(dataCh0) + isTreeletRoot(dataCh1) + isTreeletRoot(dataCh2) + isTreeletRoot(dataCh3) + isTreeletRoot(dataCh4) + isTreeletRoot(dataCh5);
+        // see if we should merge the treelets; if not, then we should move to the tip.
+ currInfo.numStartpoints = infoCh0.numStartpoints + infoCh1.numStartpoints + infoCh2.numStartpoints + infoCh3.numStartpoints + infoCh4.numStartpoints + infoCh5.numStartpoints;
+
+ bool isTipStartpoint = false;
+ if (!isInTip)
+ {
+ // TODO: threshold could be a dynamic parameter based on the number of actual inner nodes
+ bool mergeTreelets = ((currInfo.numStartpoints > 0) && (currInfo.numStartpoints < TREELET_NUM_STARTPOINTS));
+ bool allChildrenRootsCurrently = numChildrenTotal == numChildrenBeingRoots;
+ if (mergeTreelets && allChildrenRootsCurrently)
+ {
+ childrenTreelets[0] = ClearTreeletRoot(dataCh0);
+                childrenTreelets[1] = ClearTreeletRoot(dataCh1); // clearing the root flag marks these children as no longer being treelet roots
+ if (numChildrenTotal > 2) childrenTreelets[2] = ClearTreeletRoot(dataCh2);
+ if (numChildrenTotal > 3) childrenTreelets[3] = ClearTreeletRoot(dataCh3);
+ if (numChildrenTotal > 4) childrenTreelets[4] = ClearTreeletRoot(dataCh4);
+ if (numChildrenTotal > 5) childrenTreelets[5] = ClearTreeletRoot(dataCh5);
+ }
+ else
+ {
+ isInTip = true;
+ isTipStartpoint = allChildrenRootsCurrently;
+ }
+ }
+
+ // close any roots underneath
+ if (isInTip && numChildrenBeingRoots)
+ {
+ uint trivialRoots = isTrivialTreeletRoot(dataCh0) + isTrivialTreeletRoot(dataCh1) + isTrivialTreeletRoot(dataCh2) +
+ isTrivialTreeletRoot(dataCh3) + isTrivialTreeletRoot(dataCh4) + isTrivialTreeletRoot(dataCh5);
+
+ uint treeletId = 0;
+ uint bottomStartpointSpace = 0;
+
+ uint startpointsFromTiptree = trivialRoots;
+
+ if (trivialRoots) isTipStartpoint = false;
+
+ if (numChildrenBeingRoots > trivialRoots)
+ {
+ startpointsFromTiptree += // startpoint ONLY from tiptree
+ (1 - isTreeletRoot(dataCh0)) * infoCh0.numStartpoints +
+ (1 - isTreeletRoot(dataCh1)) * infoCh1.numStartpoints +
+ (1 - isTreeletRoot(dataCh2)) * infoCh2.numStartpoints +
+ (1 - isTreeletRoot(dataCh3)) * infoCh3.numStartpoints +
+ (1 - isTreeletRoot(dataCh4)) * infoCh4.numStartpoints +
+ (1 - isTreeletRoot(dataCh5)) * infoCh5.numStartpoints;
+
+ treeletId = atomic_add_global((global uint*)BVHBase_GetRefitTreeletCntPtr(bvh), numChildrenBeingRoots - trivialRoots);
+ bottomStartpointSpace = atomic_add_global((global uint*)startpointAlloc, currInfo.numStartpoints - startpointsFromTiptree);
+ }
+
+ currInfo.numStartpoints = startpointsFromTiptree;
+
+ chk_close_Treelet(treeletDescs, treelets, scratch_startpoints, &bottomStartpointSpace, dataCh0, &infoCh0, childrenIndices + 0, &treeletId);
+ chk_close_Treelet(treeletDescs, treelets, scratch_startpoints, &bottomStartpointSpace, dataCh1, &infoCh1, childrenIndices + 1, &treeletId);
+ chk_close_Treelet(treeletDescs, treelets, scratch_startpoints, &bottomStartpointSpace, dataCh2, &infoCh2, childrenIndices + 2, &treeletId);
+ chk_close_Treelet(treeletDescs, treelets, scratch_startpoints, &bottomStartpointSpace, dataCh3, &infoCh3, childrenIndices + 3, &treeletId);
+ chk_close_Treelet(treeletDescs, treelets, scratch_startpoints, &bottomStartpointSpace, dataCh4, &infoCh4, childrenIndices + 4, &treeletId);
+ chk_close_Treelet(treeletDescs, treelets, scratch_startpoints, &bottomStartpointSpace, dataCh5, &infoCh5, childrenIndices + 5, &treeletId);
+ }
+
+ if (isTipStartpoint)
+ {
+ currInfo.maxDepth = 0;
+ currInfo.numStartpoints = 1;
+ }
+ else
+ {
+            // reduce max depth and number of startpoints underneath
+ currInfo.maxDepth = max(max(max(infoCh0.maxDepth, infoCh1.maxDepth),
+ max(infoCh2.maxDepth, infoCh3.maxDepth)),
+ max(infoCh4.maxDepth, infoCh5.maxDepth)) + 1;
+ }
+
+ treelets[curNodeIndex] = EncodeOpenInfo(
+ currInfo,
+            !isInTip /* mark the merged treelet as a new root iff we are still in the bottom part */);
+
+ /* make parent node the current node */
+ curNodeIndex = parentPointer >> 6;
+ }
+
+ uint treeletId = *BVHBase_GetRefitTreeletCntPtr(bvh);
+
+ uint bottomStartpointSpace = atomic_add_global((global uint*)startpointAlloc, currInfo.numStartpoints);
+
+ treelets[0] = EncodeClosedInfo(treeletId);
+ RefitTreelet tipTreeletDesc;
+ tipTreeletDesc.startpoint_offset = bottomStartpointSpace;
+ tipTreeletDesc.numStartpoints = currInfo.numStartpoints;
+ tipTreeletDesc.maxDepth = currInfo.maxDepth;
+
+ treeletDescs[treeletId] = tipTreeletDesc;
+
+ uint realNumberOfTreelets = treeletId + 1;
+    // intentionally we set this one less, because this number is used as the group count for the dispatch,
+    // which is the number of bottom treelets, so subtract 1; except for a single-treelet tree, where it should stay 1.
+ uint numStartingTreelets = (treeletId == 0) ? 1 : treeletId;
+
+ *BVHBase_GetRefitTreeletCntPtr(bvh) = numStartingTreelets;
+
+ uint treeletDescSpaceIn64B = (realNumberOfTreelets * sizeof(RefitTreelet) + 63) >> 6;
+ uint startpointSpaceIn64B = ((bottomStartpointSpace + currInfo.numStartpoints) * sizeof(StartPoint) + 63) >> 6;
+ bvh->refitStartPointDataStart = refitTreeletsDataStart + treeletDescSpaceIn64B;
+    bvh->BVHDataEnd = refitTreeletsDataStart + treeletDescSpaceIn64B + startpointSpaceIn64B;
+ *startpointAlloc = 0;
+}
+
+
+GRL_INLINE void find_refit_treelets(
+ global struct BVHBase* bvh,
+ global TreeletNodeData* treelets,
+ global uint* scratchStartpoints,
+ global uint* startpointAlloc)
+{
+ /* get pointer to inner nodes and back pointers */
+ uniform global InternalNode* inner_nodes = (global InternalNode*) BVHBase_GetInternalNodes(bvh);
+
+ /* construct range of nodes that each work group will process */
+ uniform const uint numInnerNodes = BVHBase_numNodes(bvh);
+
+ varying ushort lane = get_sub_group_local_id();
+ varying uint global_id = get_local_id(0) + get_group_id(0) * get_local_size(0);
+
+ uint numBackpointers = BVHBase_GetNumInternalNodes(bvh);
+
+ // align to 64B and divide
+ uint treeletOffsetIn64B = ((numBackpointers * sizeof(uint)) + 63) >> 6;
+
+ uint refitTreeletsDataStart = bvh->backPointerDataStart + treeletOffsetIn64B;
+ if (global_id == 0)
+ {
+ bvh->refitTreeletsDataStart = refitTreeletsDataStart;
+ }
+
+ global struct InternalNode* curNode = &inner_nodes[global_id];
+
+ varying ushort has_startpoint = 0;
+ if (global_id < numInnerNodes) {
+ if ((curNode->nodeType != BVH_INTERNAL_NODE))
+ {
+ has_startpoint = 1;
+ }
+ }
+
+ if (has_startpoint == 0)
+ return;
+
+ treelet_bottom_up_mark_treelets(
+ bvh,
+ inner_nodes,
+ scratchStartpoints,
+ global_id,
+ BVHBase_GetBackPointers(bvh),
+ treelets,
+ refitTreeletsDataStart,
+ startpointAlloc);
+}
+
+GRL_INLINE void assign_refit_startpoints_to_treelets(
+ global struct BVHBase* bvh,
+ global TreeletNodeData* treelets,
+ global uint* scratchStartpoints)
+{
+ /* get pointer to inner nodes and back pointers */
+ uniform global struct InternalNode* inner_nodes = (global struct InternalNode*) BVHBase_GetInternalNodes(bvh);
+
+ /* construct range of nodes that each work group will process */
+ uniform const uint numInnerNodes = BVHBase_numNodes(bvh);
+
+ varying ushort lane = get_sub_group_local_id();
+ varying uint starPointNode = get_local_id(0) + get_group_id(0) * get_local_size(0);
+ varying uint curNodeIndex = starPointNode;
+ global struct InternalNode* curNode = &inner_nodes[curNodeIndex];
+
+ varying ushort is_startpoint = 0;
+
+ if (curNodeIndex < numInnerNodes)
+ {
+ if ((curNode->nodeType != BVH_INTERNAL_NODE))
+ {
+ is_startpoint = 1;
+ }
+ }
+
+ if (is_startpoint == 0)
+ {
+ return;
+ }
+
+ BackPointers* backPointers = BVHBase_GetBackPointers(bvh);
+
+ RefitTreelet* treeletDescs = BVHBase_GetRefitTreeletDescs(bvh);
+ uint numTreelets = *BVHBase_GetRefitTreeletCntPtr(bvh);
+ if (numTreelets > 1) numTreelets++;
+
+ uint myDepthWhenDead = 0;
+ uint startpointsBeforeMe = 0;
+ bool dead = false;
+
+ uint prevNodeIndex = 0x03FFFFFF;
+
+ while (curNodeIndex != 0x03FFFFFF)
+ {
+ TreeletNodeData nodeData = treelets[curNodeIndex];
+
+ uint parentPointer = *InnerNode_GetBackPointer(backPointers, curNodeIndex);
+ uint numChildren = BackPointer_GetNumChildren(parentPointer);
+
+        // this is the counterpart of the atomic-based entrance decision:
+        // the path that stays alive is the longest one; if two are equal, take the one that came through the child with the smaller index.
+ if (prevNodeIndex != 0x03FFFFFF)
+ {
+ uint leadChildOfCur = curNodeIndex + inner_nodes[curNodeIndex].childOffset;
+ uint childEnd = numChildren + leadChildOfCur;
+
+ uint longestPath = 0;
+ uint longestPathChildIdx = leadChildOfCur;
+
+ for (uint child = leadChildOfCur; child < childEnd; child++)
+ {
+ TreeletNodeData childData = treelets[child];
+ if (!isTreeletRoot(childData))
+ {
+ TreeletsOpenNodeInfo childinfo = DecodeOpenInfo(childData);
+ if (longestPath <= childinfo.maxDepth) {
+ longestPathChildIdx = child;
+ longestPath = childinfo.maxDepth + 1;
+ }
+
+ if (child < prevNodeIndex)
+ {
+ // also count how many startpoints come before me (used to place this startpoint in the proper slot)
+ startpointsBeforeMe += childinfo.numStartpoints;
+ }
+ }
+ }
+
+ if (!dead && prevNodeIndex != longestPathChildIdx)
+ {
+ dead = true;
+ //printf("starPointNode %d dies in node %d, myDepthWhenDead %d\n", starPointNode, curNodeIndex, myDepthWhenDead);
+ }
+
+ if (!dead) // this "if" is not an "else" to the above, as we might already be dead while coming through the same child index
+ {
+ myDepthWhenDead = longestPath;
+ // it is a startpoint
+ //printf("starPointNode %d in node %d lives up, its myDepthWhenDead %d\n", starPointNode, curNodeIndex, myDepthWhenDead);
+ }
+
+ if (starPointNode == (uint)-1) {
+ // we just entered the upper treelet; if we are still alive, we can become a new startpoint in that treelet
+ if (dead)
+ {
+ //printf("starPointNode %d disappears in node %d, myDepthWhenDead %d\n", starPointNode, curNodeIndex, myDepthWhenDead);
+ // and we are dead, so we are not a startpoint of the tip treelet
+ // and must disappear so that we are not added as a startpoint.
+ return;
+ }
+ else
+ {
+ // it is a startpoint
+ //printf("starPointNode %d in node %d becoming its new startpoint\n", starPointNode, curNodeIndex);
+ starPointNode = curNodeIndex;
+ }
+ }
+ }
+
+ if (isTreeletRoot(nodeData))
+ {
+ TreeletsClosedNodeInfo info = DecodeClosedInfo(nodeData);
+ RefitTreelet treeletDesc = treeletDescs[info.treeletId];
+ uint startpointSlot = treeletDesc.startpoint_offset + startpointsBeforeMe;
+ scratchStartpoints[startpointSlot] = (starPointNode << 6) + (myDepthWhenDead & ((1 << 6) - 1));
+
+ //printf("Adding to treeletID %d at root %d startpoint %d StartNodeIdx %d, depth %d\n", info.treeletId, curNodeIndex, startpointSlot, starPointNode, myDepthWhenDead);
+
+ if (dead) return;
+ myDepthWhenDead = 0;
+ startpointsBeforeMe = 0;
+ starPointNode = (uint)-1;
+ }
+
+ /* make parent node the current node */
+ prevNodeIndex = curNodeIndex;
+ curNodeIndex = BackPointer_GetParentIndex(parentPointer);
+ //if(!dead)
+ //printf("starPointNode %d move from node %d to %d\n", starPointNode, prevNodeIndex, curNodeIndex);
+ }
+}
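
The startpoint written above packs the node index into the upper bits and the path depth into the low 6 bits, i.e. `(starPointNode << 6) + (myDepthWhenDead & 0x3f)`. A sketch of that assumed packing; the real StartPoint helpers in the GRL headers may differ in width or field layout:

```c
#include <stdint.h>

typedef uint32_t StartPoint;   /* assumption: a 32-bit packed value */

/* Pack node index and 6-bit depth; OR is equivalent to the "+" used above
 * because the depth is masked to the low 6 bits first. */
static inline StartPoint startpoint_pack(uint32_t node_idx, uint32_t depth)
{
    return (node_idx << 6) | (depth & 0x3f);
}

static inline uint32_t startpoint_node(StartPoint s)  { return s >> 6; }
static inline uint32_t startpoint_depth(StartPoint s) { return s & 0x3f; }
```
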
+
+const uint FINALIZE_TREELETS_SLM_DEPTHS_SPACE = 32;
+
+GRL_INLINE void finalize_treelets_in_groups(
+ global struct BVHBase* bvh,
+ global uint* scratchStartpoints,
+ local uint* depths)
+{
+ uint numTreeletsExecuted = *BVHBase_GetRefitTreeletCntPtr(bvh);
+
+ uint local_id = get_local_id(0);
+
+ uint numTreelets = (numTreeletsExecuted > 1) ? numTreeletsExecuted + 1 : numTreeletsExecuted;
+
+ RefitTreelet* treeletDescs = BVHBase_GetRefitTreeletDescs(bvh);
+
+ for (uint treeletId = get_group_id(0); treeletId < numTreelets; treeletId += numTreeletsExecuted)
+ {
+ if (treeletId == numTreeletsExecuted && treeletId != 0) { work_group_barrier(CLK_LOCAL_MEM_FENCE); }
+
+ RefitTreelet treeletDesc = treeletDescs[treeletId];
+ StartPoint* srcStartpoints = scratchStartpoints + treeletDesc.startpoint_offset;
+ if (treeletDesc.numStartpoints <= 1)
+ {
+ // for lower latency we store single-startpoint treelets as RefitTreeletTrivial;
+ // this is usually the case for the tip treelet
+ if (local_id == 0)
+ {
+ RefitTreeletTrivial tr = { 0, treeletDesc.numStartpoints, 0, treeletDesc.maxDepth, 0 };
+ if (treeletDesc.numStartpoints == 1)
+ {
+ StartPoint sp = srcStartpoints[0];
+
+ tr.theOnlyNodeIndex = StartPoint_GetNodeIdx(sp);
+ uint backpointer = *InnerNode_GetBackPointer(BVHBase_GetBackPointers(bvh), tr.theOnlyNodeIndex);
+ tr.numChildrenOfTheNode = BackPointer_GetNumChildren(backpointer);
+ tr.childrenOffsetOfTheNode = BVHBase_GetInternalNodes(bvh)[tr.theOnlyNodeIndex].childOffset + tr.theOnlyNodeIndex;
+ }
+ RefitTreeletTrivial* trivial = (RefitTreeletTrivial*)(treeletDescs + treeletId);
+ *trivial = tr;
+#if REFIT_VERBOSE_LOG
+ printf("treelet trivial %d {\n theOnlyNodeIndex = %d;\n numStartpoints = %d;\n childrenOffsetOfTheNode = %d;\n maxDepth =%d;\n numChildrenOfTheNode = %d;\n}\n",
+ treeletId,
+ tr.theOnlyNodeIndex,
+ tr.numStartpoints,
+ tr.childrenOffsetOfTheNode,
+ tr.maxDepth,
+ tr.numChildrenOfTheNode);
+#endif
+ }
+ }
+ else
+ {
+#define SKIP_PATHS_SORTING 0
+#if SKIP_PATHS_SORTING
+ StartPoint* dstStartpoints = BVHBase_GetRefitStartPoints(bvh) + treeletDesc.startpoint_offset;
+ for (uint startpointID = local_id; startpointID < treeletDesc.numStartpoints; startpointID += get_local_size(0))
+ {
+ dstStartpoints[startpointID] = srcStartpoints[startpointID];
+ }
+#else
+ //if (local_id == 0) { printf("treelet %d, numStartpoints = %d\n", treeletId, numStartpoints); }
+
+ if (local_id <= treeletDesc.maxDepth) {
+ depths[local_id] = 0;
+ // printf("initializing slm treelet %d, depths[%d] = 0\n", treeletId, local_id);
+ }
+ work_group_barrier(CLK_LOCAL_MEM_FENCE);
+
+ uint loopSize = ((treeletDesc.numStartpoints + (get_sub_group_size() - 1)) / get_sub_group_size()) * get_sub_group_size();
+
+ // collect histogram of how many paths of given length we have
+
+ // keep count of depth 0
+ uint val = 0;
+
+ // optimization: load each StartPoint from memory only once and cache it in registers (S_c)
+ uint S_c[8];
+ // optimize: keep accumulated numbers in registers to limit number of atomic ops
+ uint D_c[8] = { 0 };
+
+ uint cached_threshold = 8 * get_local_size(0);
+ cached_threshold = min(cached_threshold, treeletDesc.numStartpoints);
+
+ uint loop_turn = 0;
+ uint sgid = get_sub_group_local_id();
+
+ for (uint startpointID = local_id + cached_threshold; startpointID < treeletDesc.numStartpoints; startpointID += get_local_size(0))
+ {
+ uint dstSlot = StartPoint_GetDepth(srcStartpoints[startpointID]);
+ atomic_inc((volatile local uint*) (depths + dstSlot));
+ }
+
+ uint HistogramSG = 0;
+ if (treeletDesc.maxDepth < 8)
+ {
+ for (uint startpointID = local_id; startpointID < cached_threshold; startpointID += get_local_size(0))
+ {
+ StartPoint S = srcStartpoints[startpointID];
+ S_c[loop_turn++] = S;
+ uint dstSlot = StartPoint_GetDepth(S);
+ D_c[dstSlot]++;
+ }
+
+ for (uint d = 0; d <= treeletDesc.maxDepth; d++)
+ {
+ val = sub_group_reduce_add(D_c[d]);
+ if (sgid == d)
+ {
+ HistogramSG = val;
+ }
+ }
+ if (sgid <= treeletDesc.maxDepth && HistogramSG != 0)
+ {
+ atomic_add((volatile local uint*) (depths + sgid), HistogramSG);
+ }
+ }
+ else
+ {
+ for (uint startpointID = local_id; startpointID < cached_threshold; startpointID += get_local_size(0))
+ {
+ StartPoint S = srcStartpoints[startpointID];
+ S_c[loop_turn++] = S;
+ uint dstSlot = StartPoint_GetDepth(S);
+ atomic_inc((volatile local uint*) (depths + dstSlot));
+ }
+ }
+
+ work_group_barrier(CLK_LOCAL_MEM_FENCE);
+
+#if REFIT_VERBOSE_LOG
+ if (local_id == 0)
+ {
+ for (uint d = 0; d <= treeletDesc.maxDepth; d++)
+ {
+ printf("treelet %d depths[%d] = %d\n", treeletId, d, depths[d]);
+ }
+ }
+#endif
+
+ if (treeletDesc.maxDepth < get_sub_group_size())
+ {
+ if (get_sub_group_id() == 0)
+ {
+
+ uint cntOfDepth = 0;
+ if (sgid <= treeletDesc.maxDepth) {
+ cntOfDepth = depths[sgid];
+ }
+ uint pref_sum = sub_group_scan_exclusive_add(cntOfDepth);
+ depths[sgid] = pref_sum;
+
+ uint numLeft = treeletDesc.numStartpoints - (pref_sum);
+ uint depthLess64 = (numLeft < 64 ) ? (uint)sgid : (uint)treeletDesc.maxDepth;
+ uint depthLess128 = (numLeft < 128) ? (uint)sgid : (uint)treeletDesc.maxDepth;
+ uint depthLess256 = (numLeft < 256) ? (uint)sgid : (uint)treeletDesc.maxDepth;
+
+ // fill in the data for lane 0, which will store it to memory
+ treeletDesc.depthLess64 = sub_group_reduce_min(depthLess64);
+ treeletDesc.depthLess128 = sub_group_reduce_min(depthLess128);
+ treeletDesc.depthLess256 = sub_group_reduce_min(depthLess256);
+ treeletDesc.numNonTrivialStartpoints = treeletDesc.numStartpoints - cntOfDepth;
+
+ if (sgid == 0) {
+ treeletDescs[treeletId] = treeletDesc;
+#if REFIT_VERBOSE_LOG
+ printf("treelet %d {\n startpoint_offset = %d;\n numStartpoints = %d;\n numNonTrivialStartpoints = %d; \n maxDepth = %d;\n depthLess64 = %d;\n depthLess128 = %d;\n depthLess256 = %d;\n}\n",
+ treeletId,
+ treeletDesc.startpoint_offset,
+ treeletDesc.numStartpoints,
+ treeletDesc.numNonTrivialStartpoints,
+ treeletDesc.maxDepth,
+ treeletDesc.depthLess64,
+ treeletDesc.depthLess128,
+ treeletDesc.depthLess256);
+#endif
+ }
+ }
+ }
+ else if (local_id <= treeletDesc.maxDepth) {
+ uint thisdepthcount = depths[local_id];
+ treeletDesc.depthLess64 = 0;
+ treeletDesc.depthLess128 = 0;
+ treeletDesc.depthLess256 = 0;
+ uint numLeft = treeletDesc.numStartpoints;
+ uint pref_sum = 0;
+
+ for (uint d = 0; d < local_id; d++)
+ {
+ uint depthCnt = depths[d];
+ if (numLeft > 64) { treeletDesc.depthLess64 = d + 1; }
+ if (numLeft > 128) { treeletDesc.depthLess128 = d + 1; }
+ if (numLeft > 256) { treeletDesc.depthLess256 = d + 1; }
+ pref_sum += depthCnt;
+ numLeft -= depthCnt;
+ if (d == 0) { treeletDesc.numNonTrivialStartpoints = numLeft; }
+ }
+
+ if (local_id == treeletDesc.maxDepth)
+ {
+ treeletDescs[treeletId] = treeletDesc;
+#if REFIT_VERBOSE_LOG
+ printf("treelet %d {\n startpoint_offset = %d;\n numStartpoints = %d;\n numNonTrivialStartpoints = %d; maxDepth = %d;\n depthLess64 = %d; depthLess128 = %d; depthLess256 = %d;\n}\n",
+ treeletId,
+ treeletDesc.startpoint_offset,
+ treeletDesc.numStartpoints,
+ treeletDesc.numNonTrivialStartpoints,
+ treeletDesc.maxDepth,
+ treeletDesc.depthLess64,
+ treeletDesc.depthLess128,
+ treeletDesc.depthLess256);
+#endif
+ }
+ }
+
+ StartPoint* dstStartpoints = BVHBase_GetRefitStartPoints(bvh) + treeletDesc.startpoint_offset;
+
+ work_group_barrier(CLK_LOCAL_MEM_FENCE);
+
+ loop_turn = 0;
+ if (treeletDesc.maxDepth < 8)
+ {
+ uint prefixSG = 0;
+
+ // make prefixSG hold the start of the output interval reserved for this subgroup's paths of depth == sgid
+ if (sgid <= treeletDesc.maxDepth && HistogramSG != 0)
+ {
+ prefixSG = atomic_add((volatile local uint*) (depths + sgid), HistogramSG);
+ }
+
+ // from now on all sgs run independently
+
+ // make D_c[d] hold this lane's output offset within the interval reserved for the subgroup
+ for (uint d = 0; d <= treeletDesc.maxDepth; d++)
+ {
+ uint thisDPrefixSg = sub_group_broadcast(prefixSG, d);
+ uint thisLaneCount = D_c[d];
+ uint laneOffset = sub_group_scan_exclusive_add(thisLaneCount);
+ D_c[d] = laneOffset + thisDPrefixSg;
+ }
+
+ for (uint startpointID = local_id; startpointID < cached_threshold; startpointID += get_local_size(0))
+ {
+ StartPoint S = S_c[loop_turn++];
+ uint d = StartPoint_GetDepth(S);
+ uint dstSlot = D_c[d]++;
+ dstStartpoints[dstSlot] = S;
+ }
+ }
+ else
+ {
+ for (uint startpointID = local_id; startpointID < cached_threshold; startpointID += get_local_size(0))
+ {
+ StartPoint S = S_c[loop_turn++];
+ uint d = StartPoint_GetDepth(S);
+ uint dstSlot = atomic_inc((volatile local uint*) (depths + d));
+ dstStartpoints[dstSlot] = S;
+ }
+ }
+
+ for (uint srcStartpointID = local_id + cached_threshold; srcStartpointID < treeletDesc.numStartpoints; srcStartpointID += get_local_size(0))
+ {
+ StartPoint S = srcStartpoints[srcStartpointID];
+ uint d = StartPoint_GetDepth(S);
+ uint dstSlot = atomic_inc((volatile local uint*) (depths + d));
+ dstStartpoints[dstSlot] = S;
+ }
+#endif //skip sorting
+ }
+ }
+}
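
finalize_treelets_in_groups orders each treelet's startpoints by path depth using a shared-local histogram followed by an exclusive prefix sum, i.e. a counting sort keyed on the 6-bit depth. A single-threaded reference of the same ordering (no SLM, subgroups or atomics), under the same packed-StartPoint assumption as the sketch above:

```c
#include <stdint.h>
#include <string.h>

#define MAX_DEPTH 64   /* the depth field is 6 bits */

/* Counting sort of packed startpoints by depth; dst must hold n elements. */
static void sort_startpoints_by_depth(const uint32_t *src, uint32_t *dst, uint32_t n)
{
    uint32_t hist[MAX_DEPTH];
    memset(hist, 0, sizeof(hist));

    for (uint32_t i = 0; i < n; i++)             /* histogram of depths */
        hist[src[i] & 0x3f]++;

    uint32_t sum = 0;                            /* exclusive prefix sum */
    for (uint32_t d = 0; d < MAX_DEPTH; d++) {
        uint32_t c = hist[d];
        hist[d] = sum;
        sum += c;
    }

    for (uint32_t i = 0; i < n; i++)             /* scatter into depth order */
        dst[hist[src[i] & 0x3f]++] = src[i];
}
```
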
diff --git a/src/intel/vulkan/grl/gpu/bvh_copy.cl b/src/intel/vulkan/grl/gpu/bvh_copy.cl
new file mode 100644
index 00000000000..6e76f195095
--- /dev/null
+++ b/src/intel/vulkan/grl/gpu/bvh_copy.cl
@@ -0,0 +1,763 @@
+//
+// Copyright (C) 2009-2021 Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+//
+//
+
+#include "api_interface.h"
+#include "d3d12.h"
+#include "common.h"
+#include "mem_utils.h"
+#include "misc_shared.h"
+
+#define offsetof(TYPE, ELEMENT) ((size_t)&(((TYPE *)0)->ELEMENT))
+
+GRL_INLINE
+uint GroupCountForCopySize(uint size)
+{
+ return (size >> 8) + 4;
+}
+
+GRL_INLINE
+uint GroupCountForCopy(BVHBase* base)
+{
+ return GroupCountForCopySize(base->Meta.allocationSize);
+}
+
+GRL_INLINE void copyInstanceDescs(InstanceDesc* instances, D3D12_RAYTRACING_INSTANCE_DESC* descs, uint64_t numInstances)
+{
+ for (uint64_t instanceIndex = get_local_id(0); instanceIndex < numInstances; instanceIndex += get_local_size(0))
+ {
+ for (uint row = 0; row < 3; row++)
+ {
+ for (uint column = 0; column < 4; column++)
+ {
+ D3D12_set_transform(&descs[instanceIndex], row, column, InstanceDesc_get_transform(&instances[instanceIndex], row, column));
+ }
+ }
+ D3D12_set_instanceID(&descs[instanceIndex], InstanceDesc_get_instanceID(&instances[instanceIndex]));
+ D3D12_set_InstanceMask(&descs[instanceIndex], InstanceDesc_get_InstanceMask(&instances[instanceIndex]));
+ D3D12_set_InstanceContributionToHitGroupIndex(&descs[instanceIndex], InstanceDesc_get_InstanceContributionToHitGroupIndex(&instances[instanceIndex]));
+ D3D12_set_InstanceFlags(&descs[instanceIndex], InstanceDesc_get_InstanceFlags(&instances[instanceIndex]));
+ D3D12_set_AccelerationStructure(&descs[instanceIndex], InstanceDesc_get_AccelerationStructure(&instances[instanceIndex]));
+ }
+}
+
+GRL_INLINE void createGeoDescs(GeoMetaData* geoMetaData, D3D12_RAYTRACING_GEOMETRY_DESC* descs, uint64_t numGeos, const uint64_t dataBufferStart)
+{
+ if (get_local_id(0) == 0)
+ {
+ uint64_t previousGeoDataBufferEnd = dataBufferStart;
+ for (uint64_t geoIndex = 0; geoIndex < numGeos; geoIndex += 1)
+ {
+ D3D12_set_Type(&descs[geoIndex], (uint8_t)(0xffff & geoMetaData[geoIndex].Type));
+ D3D12_set_Flags(&descs[geoIndex], (uint8_t)(0xffff & geoMetaData[geoIndex].Flags));
+ if (geoMetaData[geoIndex].Type == GEOMETRY_TYPE_TRIANGLES)
+ {
+ // Every triangle is stored separately
+ uint64_t vertexBufferSize = 9 * sizeof(float) * geoMetaData[geoIndex].PrimitiveCount;
+ D3D12_set_triangles_Transform(&descs[geoIndex], 0);
+ D3D12_set_triangles_IndexFormat(&descs[geoIndex], INDEX_FORMAT_NONE);
+ D3D12_set_triangles_VertexFormat(&descs[geoIndex], VERTEX_FORMAT_R32G32B32_FLOAT);
+ D3D12_set_triangles_IndexCount(&descs[geoIndex], 0);
+ D3D12_set_triangles_VertexCount(&descs[geoIndex], geoMetaData[geoIndex].PrimitiveCount * 3);
+ D3D12_set_triangles_IndexBuffer(&descs[geoIndex], (D3D12_GPU_VIRTUAL_ADDRESS)previousGeoDataBufferEnd);
+ D3D12_set_triangles_VertexBuffer_StartAddress(&descs[geoIndex], (D3D12_GPU_VIRTUAL_ADDRESS)previousGeoDataBufferEnd);
+ D3D12_set_triangles_VertexBuffer_StrideInBytes(&descs[geoIndex], 3 * sizeof(float));
+ previousGeoDataBufferEnd += vertexBufferSize;
+ }
+ else
+ {
+ D3D12_set_procedurals_AABBCount(&descs[geoIndex], geoMetaData[geoIndex].PrimitiveCount);
+ D3D12_set_procedurals_AABBs_StartAddress(&descs[geoIndex], (D3D12_GPU_VIRTUAL_ADDRESS)previousGeoDataBufferEnd);
+ D3D12_set_procedurals_AABBs_StrideInBytes(&descs[geoIndex], sizeof(D3D12_RAYTRACING_AABB));
+ previousGeoDataBufferEnd += sizeof(D3D12_RAYTRACING_AABB) * geoMetaData[geoIndex].PrimitiveCount;
+ }
+ }
+ }
+}
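
createGeoDescs lays the decoded data for all geometries out back to back starting at dataBufferStart: 9 floats per triangle, one AABB per procedural primitive. A host-side sketch of that running-offset layout; GeoInfo is a simplified stand-in for the real GeoMetaData, and the AABB size is passed in rather than taken from d3d12.h:

```c
#include <stdint.h>

/* Simplified per-geometry metadata for this sketch only. */
typedef struct { uint32_t type; uint32_t prim_count; } GeoInfo;

enum { GEO_TRIANGLES = 0, GEO_PROCEDURAL = 1 };

/* Compute where each geometry's decoded vertex/AABB data begins, mirroring
 * the sequential layout built above. aabb_size would be
 * sizeof(D3D12_RAYTRACING_AABB) (24 bytes) in the real code. */
static void layout_geo_data(const GeoInfo *geos, uint32_t n,
                            uint64_t data_start, uint64_t aabb_size,
                            uint64_t *out_offsets)
{
    uint64_t cursor = data_start;
    for (uint32_t i = 0; i < n; i++) {
        out_offsets[i] = cursor;
        if (geos[i].type == GEO_TRIANGLES)
            cursor += 9ull * sizeof(float) * geos[i].prim_count;  /* 3 verts * 3 floats */
        else
            cursor += aabb_size * geos[i].prim_count;
    }
}
```
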
+
+GRL_INLINE void copyIndiciesAndVerticies(D3D12_RAYTRACING_GEOMETRY_DESC* desc, QuadLeaf* quad)
+{
+ float* vertices = (float*)D3D12_get_triangles_VertexBuffer_StartAddress(desc);
+ uint64_t firstTriangleIndex = quad->primIndex0;
+ uint64_t numTriangles = QuadLeaf_IsSingleTriangle(quad) ? 1 : 2;
+
+ vertices[firstTriangleIndex * 9] = quad->v[0][0];
+ vertices[firstTriangleIndex * 9 + 1] = quad->v[0][1];
+ vertices[firstTriangleIndex * 9 + 2] = quad->v[0][2];
+
+ vertices[firstTriangleIndex * 9 + 3] = quad->v[1][0];
+ vertices[firstTriangleIndex * 9 + 4] = quad->v[1][1];
+ vertices[firstTriangleIndex * 9 + 5] = quad->v[1][2];
+
+ vertices[firstTriangleIndex * 9 + 6] = quad->v[2][0];
+ vertices[firstTriangleIndex * 9 + 7] = quad->v[2][1];
+ vertices[firstTriangleIndex * 9 + 8] = quad->v[2][2];
+
+ if (numTriangles == 2)
+ {
+ uint64_t secondTriangleIndex = firstTriangleIndex + QuadLeaf_GetPrimIndexDelta(quad);
+ uint32_t packed_indices = QuadLeaf_GetSecondTriangleIndices(quad);
+ for( size_t i=0; i<3; i++ )
+ {
+ uint32_t idx = packed_indices & 3 ; packed_indices >>= 2;
+ for( size_t j=0; j<3; j++ )
+ vertices[secondTriangleIndex * 9 + i * 3 + j] = quad->v[idx][j];
+ }
+ }
+}
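
A quad leaf stores the first triangle's three vertices explicitly; the second triangle reuses the quad's four vertices through three 2-bit selectors packed into one word (QuadLeaf_GetSecondTriangleIndices above). A sketch of that unpacking, assuming exactly the packing consumed by the loop above:

```c
#include <stdint.h>

/* Expand a packed word of 2-bit vertex selectors into the second
 * triangle's vertices, copying from the quad's 4-entry vertex array. */
static void expand_second_triangle(const float quad_verts[4][3],
                                   uint32_t packed_indices,
                                   float out_tri[3][3])
{
    for (int i = 0; i < 3; i++) {
        uint32_t idx = packed_indices & 3;   /* which of the 4 quad vertices */
        packed_indices >>= 2;
        for (int j = 0; j < 3; j++)
            out_tri[i][j] = quad_verts[idx][j];
    }
}
```
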
+
+GRL_INLINE
+void storeProceduralDesc(
+ struct AABB procAABB,
+ uint32_t primId,
+ D3D12_RAYTRACING_GEOMETRY_DESC* geoDesc)
+{
+ D3D12_RAYTRACING_AABB* proceduralDescs = (D3D12_RAYTRACING_AABB*)D3D12_get_procedurals_AABBs_StartAddress(geoDesc);
+ D3D12_set_raytracing_aabb(&proceduralDescs[primId], &procAABB);
+}
+
+GRL_INLINE
+void copyDataFromLProcedurals(
+ BVHBase* base,
+ D3D12_RAYTRACING_GEOMETRY_DESC* descs)
+{
+ unsigned numProcedurals = BVHBase_GetNumProcedurals(base);
+ InternalNode* innerNodes = BVHBase_GetInternalNodes(base);
+ unsigned numInnerNodes = BVHBase_GetNumInternalNodes(base);
+
+ if (BVHBase_GetNumProcedurals(base) > 0) //< there's no point entering here if there are no procedurals
+ {
+
+ // iterate over all inner nodes to identify those with procedural children; we have to take the AABBs from them
+ for (uint32_t nodeI = get_local_id(0); nodeI < numInnerNodes; nodeI += get_local_size(0))
+ {
+ InternalNode* innerNode = innerNodes + nodeI;
+
+ if (innerNode->nodeType == NODE_TYPE_PROCEDURAL)
+ {
+ float* origin = innerNode->lower;
+
+ global struct ProceduralLeaf* leaf = (global struct ProceduralLeaf*)QBVHNodeN_childrenPointer((struct QBVHNodeN*)innerNode);
+
+ for (uint k = 0; k < 6; k++)
+ {
+ if (InternalNode_IsChildValid(innerNode, k))
+ {
+ struct AABB3f qbounds = {
+ (float)(innerNode->lower_x[k]), (float)(innerNode->lower_y[k]), (float)(innerNode->lower_z[k]),
+ (float)(innerNode->upper_x[k]), (float)(innerNode->upper_y[k]), (float)(innerNode->upper_z[k]) };
+
+ struct AABB dequantizedAABB;
+
+ dequantizedAABB.lower[0] = origin[0] + bitShiftLdexp(qbounds.lower[0], innerNode->exp_x - 8);
+ dequantizedAABB.lower[1] = origin[1] + bitShiftLdexp(qbounds.lower[1], innerNode->exp_y - 8);
+ dequantizedAABB.lower[2] = origin[2] + bitShiftLdexp(qbounds.lower[2], innerNode->exp_z - 8);
+ dequantizedAABB.upper[0] = origin[0] + bitShiftLdexp(qbounds.upper[0], innerNode->exp_x - 8);
+ dequantizedAABB.upper[1] = origin[1] + bitShiftLdexp(qbounds.upper[1], innerNode->exp_y - 8);
+ dequantizedAABB.upper[2] = origin[2] + bitShiftLdexp(qbounds.upper[2], innerNode->exp_z - 8);
+
+ dequantizedAABB = conservativeAABB(&dequantizedAABB);
+ /* extract geomID and primID from leaf */
+ const uint startPrim = QBVHNodeN_startPrim((struct QBVHNodeN*) innerNode, k);
+ const uint geomID = ProceduralLeaf_geomIndex(leaf);
+ const uint primID = ProceduralLeaf_primIndex(leaf, startPrim); // FIXME: have to iterate over all primitives of leaf!
+
+ storeProceduralDesc(dequantizedAABB, primID, descs + geomID);
+ }
+ /* advance leaf pointer to next child */
+ leaf += QBVHNodeN_blockIncr((struct QBVHNodeN*)innerNode, k);
+ }
+
+ }
+ else if (innerNode->nodeType == NODE_TYPE_MIXED) { ERROR(); }
+ else {/* do nothing for other internal node types, they can't have procedural child (directly)*/; }
+ }
+ }
+}
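
Child bounds in an internal node are quantized to 8 bits per axis relative to the node origin, with a per-axis power-of-two scale, and bitShiftLdexp(q, exp - 8) rescales them as q * 2^(exp - 8). A standalone sketch of the same dequantization for one axis, using ldexpf instead of the bit-shift variant the GRL code uses:

```c
#include <math.h>

/* Dequantize one 8-bit child bound along a single axis.
 * origin: node anchor for that axis, exponent: the per-axis exponent byte. */
static inline float dequantize_bound(float origin, unsigned char q, int exponent)
{
    return origin + ldexpf((float)q, exponent - 8);   /* origin + q * 2^(exp-8) */
}
```
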
+
+GRL_INLINE
+void copyDataFromQuadLeaves(BVHBase* base,
+ D3D12_RAYTRACING_GEOMETRY_DESC* descs)
+{
+ QuadLeaf* quads = BVHBase_GetQuadLeaves(base);
+ uint64_t numQuads = BVHBase_GetNumQuads(base);
+ for (uint64_t quadIdx = get_local_id(0); quadIdx < numQuads; quadIdx += get_local_size(0))
+ {
+ uint64_t descIdx = PrimLeaf_GetGeoIndex(&quads[quadIdx].leafDesc);
+ copyIndiciesAndVerticies(&descs[descIdx], &quads[quadIdx]);
+ }
+}
+
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
+__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH)))
+void kernel clone_indirect(global char* dest,
+ global char* src)
+{
+ BVHBase* base = (BVHBase*)src;
+ uint64_t bvhSize = base->Meta.allocationSize;
+
+ uint numGroups = GroupCountForCopy(base);
+ CopyMemory(dest, src, bvhSize, numGroups);
+}
+
+GRL_INLINE void compactT(global char* dest, global char* src, uint64_t compactedSize, uint skipCopy, uint groupCnt)
+{
+ global BVHBase* baseSrc = (global BVHBase*)src;
+ global BVHBase* baseDest = (global BVHBase*)dest;
+
+ uint32_t offset = sizeof(BVHBase);
+ uint32_t numNodes = BVHBase_GetNumInternalNodes(baseSrc);
+ uint32_t nodeSize = numNodes * sizeof(InternalNode);
+ offset += nodeSize;
+
+ int quadChildFix = baseSrc->quadLeafStart;
+ int procChildFix = baseSrc->proceduralDataStart;
+ int instChildFix = baseSrc->instanceLeafStart;
+
+ // serialization already copies part of bvh base so skip this part
+ CopyMemory(dest + skipCopy, src + skipCopy, sizeof(BVHBase) - skipCopy, groupCnt);
+ baseDest->Meta.allocationSize = compactedSize;
+
+ if (baseSrc->Meta.instanceCount)
+ {
+ const uint32_t instLeafsSize = BVHBase_GetNumHWInstanceLeaves(baseSrc) * sizeof(HwInstanceLeaf);
+ CopyMemory(dest + offset, (global char*)BVHBase_GetHWInstanceLeaves(baseSrc), instLeafsSize, groupCnt);
+ const uint instanceLeafStart = (uint)(offset / 64);
+ baseDest->instanceLeafStart = instanceLeafStart;
+ instChildFix -= instanceLeafStart;
+ offset += instLeafsSize;
+ baseDest->instanceLeafEnd = (uint)(offset / 64);
+ }
+ if (baseSrc->Meta.geoCount)
+ {
+ const uint quadLeafsSize = BVHBase_GetNumQuads(baseSrc) * sizeof(QuadLeaf);
+ if (quadLeafsSize)
+ {
+ CopyMemory(dest + offset, (global char*)BVHBase_GetQuadLeaves(baseSrc), quadLeafsSize, groupCnt);
+ const uint quadLeafStart = (uint)(offset / 64);
+ baseDest->quadLeafStart = quadLeafStart;
+ quadChildFix -= quadLeafStart;
+ offset += quadLeafsSize;
+ baseDest->quadLeafCur = (uint)(offset / 64);
+ }
+
+ const uint procLeafsSize = BVHBase_GetNumProcedurals(baseSrc) * sizeof(ProceduralLeaf);
+ if (procLeafsSize)
+ {
+ CopyMemory(dest + offset, (global char*)BVHBase_GetProceduralLeaves(baseSrc), procLeafsSize, groupCnt);
+ const uint proceduralDataStart = (uint)(offset / 64);
+ baseDest->proceduralDataStart = proceduralDataStart;
+ procChildFix -= proceduralDataStart;
+ offset += procLeafsSize;
+ baseDest->proceduralDataCur = (uint)(offset / 64);
+ }
+ }
+ // copy the nodes, fixing up their child offsets as we go
+ global uint* nodeDest = (global uint*)(dest + sizeof(BVHBase));
+ global InternalNode* nodeSrc = (global InternalNode*)BVHBase_GetInternalNodes(baseSrc);
+ // used in mixed case
+ char* instanceLeavesBegin = (char*)BVHBase_GetHWInstanceLeaves(baseSrc);
+ char* instanceLeavesEnd = (char*)BVHBase_GetHWInstanceLeaves_End(baseSrc);
+ uint localId = get_sub_group_local_id();
+ for (uint i = get_group_id(0); i < numNodes; i += groupCnt)
+ {
+ uint nodePart = CacheLineSubgroupRead((const global char*)&nodeSrc[i]);
+ char nodeType = as_char4(sub_group_broadcast(nodePart, offsetof(InternalNode, nodeType) / 4))[0];
+ if (localId * 4 == offsetof(InternalNode, childOffset))
+ {
+ int childOffset = as_int(nodePart);
+ if (nodeType == NODE_TYPE_MIXED)
+ {
+ char* childPtr = (char*)&nodeSrc[i] + 64 * childOffset;
+ if (childPtr > instanceLeavesBegin && childPtr < instanceLeavesEnd)
+ nodePart = as_int(childOffset - instChildFix);
+ }
+ else if (nodeType == NODE_TYPE_INSTANCE)
+ nodePart = as_int(childOffset - instChildFix);
+ else if (nodeType == NODE_TYPE_QUAD)
+ nodePart = as_int(childOffset - quadChildFix);
+ else if (nodeType == NODE_TYPE_PROCEDURAL)
+ nodePart = as_int(childOffset - procChildFix);
+ }
+ nodeDest[i * 16 + localId] = nodePart;
+ }
+
+ if (baseSrc->Meta.instanceCount)
+ {
+ const uint32_t instanceDescSize = baseSrc->Meta.instanceCount * sizeof(InstanceDesc);
+ CopyMemory(dest + offset, src + baseSrc->Meta.instanceDescsStart, instanceDescSize, groupCnt);
+ baseDest->Meta.instanceDescsStart = offset;
+ offset += instanceDescSize;
+ }
+ if (baseSrc->Meta.geoCount)
+ {
+ const uint32_t geoMetaSize = baseSrc->Meta.geoCount * sizeof(GeoMetaData);
+ CopyMemory(dest + offset, src + baseSrc->Meta.geoDescsStart, geoMetaSize, groupCnt);
+ baseDest->Meta.geoDescsStart = offset;
+ offset += (geoMetaSize + 63) & ~63; // align to 64
+ }
+
+ uint backPointerDataStart = offset / 64;
+ uint refitTreeletsDataStart = backPointerDataStart;
+ uint refitStartPointDataStart = backPointerDataStart;
+ uint dataEnd = backPointerDataStart;
+ uint fatLeafTableStart = dataEnd;
+ uint fatLeafCount = baseSrc->fatLeafCount;
+ uint innerTableStart = dataEnd;
+ uint innerCount = baseSrc->innerCount;
+
+ uint quadLeftoversCountNewAtomicUpdate = baseSrc->quadLeftoversCountNewAtomicUpdate;
+ uint quadTableSizeNewAtomicUpdate = baseSrc->quadTableSizeNewAtomicUpdate;
+ uint quadIndicesDataStart = dataEnd;
+
+ if (BVHBase_HasBackPointers(baseSrc))
+ {
+#if 0 //
+ const uint oldbackpontersDataStart = baseSrc->backPointerDataStart;
+ const uint shift = oldbackpontersDataStart - backPointerDataStart;
+ const uint refitStructsSize = ((BVHBase_GetRefitStructsDataSize(baseSrc)) + 63) & ~63;
+
+ CopyMemory(dest + offset, (global char*)BVHBase_GetBackPointers(baseSrc), refitStructsSize, groupCnt);
+
+ refitTreeletsDataStart = baseSrc->refitTreeletsDataStart - shift;
+ refitStartPointDataStart = baseSrc->refitStartPointDataStart - shift;
+ dataEnd = baseSrc->BVHDataEnd - shift;
+#else // compacting version
+ const uint backpointersSize = ((numNodes*sizeof(uint)) + 63) & ~63;
+ CopyMemory(dest + offset, (global char*)BVHBase_GetBackPointers(baseSrc), backpointersSize, groupCnt);
+ offset += backpointersSize;
+
+ refitTreeletsDataStart = offset / 64;
+ refitStartPointDataStart = offset / 64;
+
+ // TODO: remove treelets from .... everywhere
+ const uint treeletExecutedCnt = *BVHBase_GetRefitTreeletCntPtr(baseSrc);
+
+ if (treeletExecutedCnt)
+ {
+ const uint treeletCnt = treeletExecutedCnt > 1 ? treeletExecutedCnt + 1 : 1;
+
+ refitTreeletsDataStart = offset / 64;
+ const uint treeletsSize = ((treeletCnt * sizeof(RefitTreelet)) + 63) & ~63;
+ RefitTreelet* destTreelets = (RefitTreelet*)(dest + offset);
+ RefitTreelet* srcTreelets = BVHBase_GetRefitTreeletDescs(baseSrc);
+
+ uint numThreads = groupCnt * get_local_size(0);
+ uint globalID = (get_group_id(0) * get_local_size(0)) + get_local_id(0);
+
+ for (uint i = globalID; i < treeletCnt; i += numThreads)
+ {
+ RefitTreelet dsc = srcTreelets[i];
+ RefitTreeletTrivial* trivial_dsc = (RefitTreeletTrivial*)&dsc;
+ if (trivial_dsc->numStartpoints == 1 && trivial_dsc->childrenOffsetOfTheNode > numNodes) {
+ trivial_dsc->childrenOffsetOfTheNode -= quadChildFix;
+ }
+ destTreelets[i] = dsc;
+ }
+
+ offset += treeletsSize;
+
+ refitStartPointDataStart = offset / 64;
+ const uint startPointsSize = (BVHBase_GetRefitStartPointsSize(baseSrc) + 63) & ~63;
+ CopyMemory(dest + offset, (global char*)BVHBase_GetRefitStartPoints(baseSrc), startPointsSize, groupCnt);
+ offset += startPointsSize;
+ dataEnd = offset / 64;
+ }
+
+ uint fatleafEntriesSize = ((fatLeafCount * sizeof(LeafTableEntry) + 63) & ~63);
+ fatLeafTableStart = offset / 64;
+ if (fatleafEntriesSize) {
+ CopyMemory(dest + offset, (global char*)BVHBase_GetFatLeafTable(baseSrc), fatleafEntriesSize, groupCnt);
+ }
+ offset += fatleafEntriesSize;
+
+ // New atomic update
+ if(baseSrc->quadIndicesDataStart > baseSrc->backPointerDataStart)
+ {
+ uint numQuads = BVHBase_GetNumQuads(baseSrc);
+ uint quadTableMainBufferSize = (numQuads + 255) & ~255;
+ uint quadLeftoversSize = (quadLeftoversCountNewAtomicUpdate + 255) & ~255;
+ uint quadTableEntriesSize = (((quadTableMainBufferSize + quadLeftoversSize) * sizeof(LeafTableEntry) + 63) & ~63);
+ if (quadTableEntriesSize) {
+ CopyMemory(dest + offset, (global char*)BVHBase_GetFatLeafTable(baseSrc), quadTableEntriesSize, groupCnt);
+ }
+ offset += quadTableEntriesSize;
+
+ uint quadIndicesDataSize = ((numQuads * sizeof(QuadDataIndices) + 63) & ~63);
+ quadIndicesDataStart = offset / 64;
+ if (quadIndicesDataSize) {
+ CopyMemory(dest + offset, (global char*)BVHBase_GetQuadDataIndicesTable(baseSrc), quadIndicesDataSize, groupCnt);
+ }
+ offset += quadIndicesDataSize;
+ }
+
+ uint innerEntriesSize = ((innerCount * sizeof(InnerNodeTableEntry) + 63) & ~63);
+ innerTableStart = offset / 64;
+ if (innerEntriesSize) {
+ CopyMemory(dest + offset, (global char*)BVHBase_GetInnerNodeTable(baseSrc), innerEntriesSize, groupCnt);
+ }
+ offset += innerEntriesSize;
+
+ dataEnd = offset / 64;
+#endif
+ }
+
+ baseDest->backPointerDataStart = backPointerDataStart;
+ baseDest->refitTreeletsDataStart = refitTreeletsDataStart;
+ baseDest->refitStartPointDataStart = refitStartPointDataStart;
+ baseDest->fatLeafTableStart = fatLeafTableStart;
+ baseDest->fatLeafCount = fatLeafCount;
+ baseDest->innerTableStart = innerTableStart;
+ baseDest->innerCount = innerCount;
+
+ baseDest->quadLeftoversCountNewAtomicUpdate = quadLeftoversCountNewAtomicUpdate;
+ baseDest->quadTableSizeNewAtomicUpdate = quadTableSizeNewAtomicUpdate;
+ baseDest->quadIndicesDataStart = quadIndicesDataStart;
+ baseDest->BVHDataEnd = dataEnd;
+}
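
compactT moves each leaf section to a new 64-byte block offset and then rewrites the child offsets of the copied internal nodes. Because child offsets are expressed in 64-byte blocks relative to the node, and the inner nodes keep their position right after the header, the fix-up is just the difference between the old and new section starts. A scalar sketch of that adjustment:

```c
#include <stdint.h>

/* Child offsets are in 64B blocks, relative to the node itself; assuming the
 * node's own block index is unchanged, only the section delta is applied. */
static inline int32_t fix_child_offset(int32_t old_child_offset,
                                       uint32_t old_section_start_64B,
                                       uint32_t new_section_start_64B)
{
    int32_t section_delta = (int32_t)old_section_start_64B -
                            (int32_t)new_section_start_64B;
    return old_child_offset - section_delta;
}
```
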
+
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH)))
+__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
+void kernel compact(global char* dest,
+ global char* src,
+ uint groupCnt)
+{
+ uint64_t compactedSize = compute_compacted_size((BVHBase*)src);
+ compactT(dest, src, compactedSize, 0, groupCnt);
+}
+
+// build the serialization header across all lanes: each lane gets one dword of the header, and the last two lanes carry the 64-bit remainder data
+GRL_INLINE
+unsigned prepare_header(
+ uint64_t headerSize,
+ uint64_t instancePtrSize,
+ uint64_t numInstances,
+ uint64_t bvhSize,
+ uint8_t* driverID,
+ uint64_t reminder)
+{
+
+ unsigned loc_id = get_sub_group_local_id();
+
+ uint64_t SerializedSizeInBytesIncludingHeader = headerSize + instancePtrSize * numInstances + bvhSize;
+ uint64_t DeserializedSizeInBytes = bvhSize;
+ uint64_t InstanceHandleCount = numInstances;
+
+ char bvh_magic_str[] = BVH_MAGIC_MACRO;
+ uint* bvh_magic_uint = (uint*)bvh_magic_str;
+
+ unsigned headerTempLanePiece;
+ if (loc_id < 4) { headerTempLanePiece = *((unsigned*)&driverID[4*loc_id]); }
+ else if (loc_id == 4) { headerTempLanePiece = bvh_magic_uint[0]; }
+ else if (loc_id == 5) { headerTempLanePiece = bvh_magic_uint[1]; }
+ else if (loc_id == 6) { headerTempLanePiece = bvh_magic_uint[2]; }
+ else if (loc_id == 7) { headerTempLanePiece = bvh_magic_uint[3]; }
+ else if (loc_id == 8) { headerTempLanePiece = (uint)SerializedSizeInBytesIncludingHeader; }
+ else if (loc_id == 9) { headerTempLanePiece = (uint)(SerializedSizeInBytesIncludingHeader >> 32ul); }
+ else if (loc_id == 10) { headerTempLanePiece = (uint)DeserializedSizeInBytes; }
+ else if (loc_id == 11) { headerTempLanePiece = (uint)(DeserializedSizeInBytes >> 32ul); }
+ else if (loc_id == 12) { headerTempLanePiece = (uint)InstanceHandleCount; }
+ else if (loc_id == 13) { headerTempLanePiece = (uint)(InstanceHandleCount >> 32ul); }
+ else if (loc_id == 14) { headerTempLanePiece = (uint)reminder; }
+ else if (loc_id == 15) { headerTempLanePiece = (uint)(reminder >> 32ul); }
+
+ return headerTempLanePiece;
+}
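
prepare_header distributes one dword per lane across a 64-byte line: 16 bytes of driver identifier, a 16-byte magic, three 64-bit sizes/counts, and 8 bytes borrowed from whatever immediately follows the 56-byte header (the TRICK A/B patch). A host-side view of that layout as assumed from the lane assignments above; the first two field names are illustrative, not the definition from the GRL headers:

```c
#include <stdint.h>

/* Assumed layout of the first 64 bytes written by the serialization path:
 * a 56-byte header plus the first 8 bytes of the payload that follows it. */
typedef struct {
    uint8_t  driver_identifier[16];                 /* lanes 0-3  */
    uint8_t  bvh_magic[16];                         /* lanes 4-7  */
    uint64_t SerializedSizeInBytesIncludingHeader;  /* lanes 8-9  */
    uint64_t DeserializedSizeInBytes;               /* lanes 10-11 */
    uint64_t InstanceHandleCount;                   /* lanes 12-13 */
    uint64_t first_payload_qword;                   /* lanes 14-15, not part of the header */
} SerializedHeaderLine;                             /* 64 bytes total */
```
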
+
+
+
+
+GRL_INLINE
+void serializeT(
+ global byte_align64B* dest,
+ global byte_align64B* src,
+ global uint8_t* driverID,
+ uint groups_count)
+{
+ SerializationHeader* header = (SerializationHeader*)dest;
+ BVHBase* base = (BVHBase*)src;
+
+ const uint headerSize = sizeof(SerializationHeader);
+ const uint numInstances = base->Meta.instanceCount;
+ const uint instancePtrSize = sizeof(gpuva_t);
+ const uint compactedSize = compute_compacted_size(base);
+ uint local_id = get_sub_group_local_id();
+
+ // this is not 64byte aligned :(
+ const uint offsetToBvh = headerSize + instancePtrSize * numInstances;
+
+ global InstanceDesc* src_instances = 0;
+
+ if (numInstances) {
+ src_instances = (global InstanceDesc*)((uint64_t)base + base->Meta.instanceDescsStart);
+ }
+
+ // effectively this part should end up as one 64B aligned 64B write
+ if (get_group_id(0) == groups_count - 1)
+ {
+ Block64B headerPlus;
+
+ // we patch the missing piece with the first instance pointer or the bvh beginning (TRICK A and B);
+ // we assume the header is 56B.
+ global uint64_t* srcPiece = (numInstances != 0) ? &src_instances[0].AccelerationStructureGPUVA : (global uint64_t*)src;
+
+ unsigned headerTemp;
+
+ headerTemp = prepare_header(
+ headerSize,
+ instancePtrSize,
+ numInstances,
+ compactedSize,
+ driverID,
+ *srcPiece);
+
+ CacheLineSubgroupWrite((global byte_align64B*)dest, headerTemp);
+ }
+
+ if (numInstances > 0)
+ {
+ uint instancesOffset = headerSize;
+ uint aligned_instance_ptrs_offset = ((instancesOffset + 63) >> 6) << 6;
+ uint unaligned_prefixing_instance_cnt = (aligned_instance_ptrs_offset - instancesOffset) >> 3;
+ unaligned_prefixing_instance_cnt = min(unaligned_prefixing_instance_cnt, numInstances);
+
+ global uint64_t* dst_instances = (global uint64_t*)(dest + instancesOffset);
+
+ // we've already copied the first instance pointer(s) together with the header line (see TRICK A),
+ // so the remaining instance pointers start at aligned memory
+ uint numAlignedInstances = numInstances - unaligned_prefixing_instance_cnt;
+ dst_instances += unaligned_prefixing_instance_cnt;
+ src_instances += unaligned_prefixing_instance_cnt;
+
+ if (numAlignedInstances)
+ {
+ // each 8 instances form a cacheline
+ uint numCachelines = numAlignedInstances >> 3; //qwords -> 64Bs
+ // qwords beyond the last full group of 8;
+ uint startReminder = numAlignedInstances & ~((1 << 3) - 1);
+ uint numreminder = numAlignedInstances & ((1 << 3) - 1);
+
+ uint task_id = get_group_id(0);
+
+ while (task_id < numCachelines)
+ {
+ uint src_id = task_id * 8 + (local_id >> 1);
+ uint* src_uncorected = (uint*)& src_instances[src_id].AccelerationStructureGPUVA;
+ uint* src = ((local_id & 1) != 0) ? src_uncorected + 1 : src_uncorected;
+ uint data = *src;
+
+ global char* dst = (global byte_align64B*)(dst_instances + (8 * task_id));
+ CacheLineSubgroupWrite(dst, data);
+ task_id += groups_count;
+ }
+
+ if (task_id == numCachelines && local_id < 8 && numreminder > 0)
+ {
+ // this should write full cacheline
+
+ uint index = startReminder + local_id;
+ // data will be taken from instances for lanes (local_id < numreminder)
+ // copy srcbvh beginning as uint64_t for remaining lanes (TRICK B)
+ global uint64_t* srcData = (local_id < numreminder) ?
+ &src_instances[index].AccelerationStructureGPUVA :
+ ((global uint64_t*)src) + (local_id - numreminder);
+ dst_instances[index] = *srcData;
+ }
+ }
+ }
+
+ // the code above already copied the unaligned beginning of the destination bvh (see TRICK B)
+ uint32_t unalignedPartCopiedElsewhere = (64u - (offsetToBvh & (64u - 1u)))&(64u - 1u);
+
+ compactT(dest + offsetToBvh, src, compactedSize, unalignedPartCopiedElsewhere, groups_count);
+}
+
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH)))
+__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
+void kernel serialize_indirect(
+ global char* dest,
+ global char* src,
+ global uint8_t* driverID)
+{
+ BVHBase* base = (BVHBase*)src;
+ uint groups_count = GroupCountForCopy(base);
+ serializeT(dest, src, driverID, groups_count);
+}
+
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH)))
+__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
+void kernel serialize_for_input_dump_indirect(
+ global struct OutputBatchPtrs* batchPtrs,
+ global dword* dstOffset,
+ global char* src,
+ global uint8_t* driverID)
+{
+ BVHBase* base = (BVHBase*)src;
+ uint groups_count = GroupCountForCopy(base);
+ global char* dest = (global char*)(batchPtrs->dataStart + *dstOffset);
+ dest += (sizeof(OutputData) + 127) & ~127;
+ serializeT(dest, src, driverID, groups_count);
+}
+
+GRL_INLINE
+void deserializeT(
+ global char* dest,
+ global char* src,
+ unsigned groupCnt)
+{
+ SerializationHeader* header = (SerializationHeader*)src;
+
+ const uint64_t headerSize = sizeof(struct SerializationHeader);
+ const uint64_t instancePtrSize = sizeof(gpuva_t);
+ const uint64_t numInstances = header->InstanceHandleCount;
+ const uint64_t offsetToBvh = headerSize + instancePtrSize * numInstances;
+ const uint64_t bvhSize = header->DeserializedSizeInBytes;
+
+ if (numInstances)
+ {
+ const bool instances_mixed_with_inner_nodes = false;
+ if (instances_mixed_with_inner_nodes)
+ {
+ // not implemented !
+ // copy each node with 64byte granularity if node is instance, patch it mid-copy
+ }
+ else
+ {
+ BVHBase* srcBvhBase = (BVHBase*)(src + offsetToBvh);
+
+ // numHWInstances can be bigger (because of rebraiding) or smaller (because of inactive instances) than
+ // numInstances (count of pointers and descriptors).
+ uint offsetToHwInstances = srcBvhBase->instanceLeafStart << 6;
+ uint numHwInstances = (srcBvhBase->instanceLeafEnd - srcBvhBase->instanceLeafStart) >> 1;
+
+ //
+ // instances live in separate memory intervals,
+ // so copy all the other data the simple way
+ //
+ uint nodesEnd = srcBvhBase->Meta.instanceDescsStart;
+ // copy the range before the instance leaves
+ CopyMemory(dest, (global char*)(src + offsetToBvh), offsetToHwInstances, groupCnt);
+
+ uint offsetPostInstances = srcBvhBase->instanceLeafEnd << 6;
+ uint instanceDescStart = srcBvhBase->Meta.instanceDescsStart;
+ uint sizePostInstances = instanceDescStart - offsetPostInstances;
+ // copy the range after the instance leaves and before the instance descs
+ CopyMemory(dest + offsetPostInstances, (global char*)(src + offsetToBvh + offsetPostInstances), sizePostInstances, groupCnt);
+
+ uint instanceDescEnd = instanceDescStart + numInstances * sizeof(InstanceDesc);
+ uint sizePostInstanceDescs = bvhSize - instanceDescEnd;
+ // copy after instance desc
+ CopyMemory(dest + instanceDescEnd, (global char*)(src + offsetToBvh + instanceDescEnd), sizePostInstanceDescs, groupCnt);
+
+ global gpuva_t* newInstancePtrs = (global gpuva_t*)(src + headerSize);
+ global InstanceDesc* dstDesc = (global InstanceDesc*)(dest + instanceDescStart);
+ global InstanceDesc* srcDesc = (global InstanceDesc*)(src + offsetToBvh + instanceDescStart);
+
+ // copy and patch instance descriptors
+ for (uint64_t instanceIndex = get_group_id(0); instanceIndex < numInstances; instanceIndex += groupCnt)
+ {
+ InstanceDesc desc = srcDesc[instanceIndex];
+ uint64_t newInstancePtr = newInstancePtrs[instanceIndex];
+ desc.AccelerationStructureGPUVA = newInstancePtr; // patch it with new ptr;
+
+ dstDesc[instanceIndex] = desc;
+ }
+
+ // copy and patch hw instance leaves
+ global HwInstanceLeaf* dstInstleafs = (global HwInstanceLeaf*)(dest + offsetToHwInstances);
+ global HwInstanceLeaf* srcInstleafs = (global HwInstanceLeaf*)(src + offsetToBvh + offsetToHwInstances);
+
+ for (uint hwLeafIndex = get_group_id(0); hwLeafIndex < numHwInstances; hwLeafIndex += groupCnt)
+ {
+ // pull the instance from srcBVH
+ HwInstanceLeaf tmpInstleaf = srcInstleafs[hwLeafIndex];
+
+ uint swInstanceIndex = HwInstanceLeaf_GetInstanceIndex(&tmpInstleaf);
+ uint64_t childBvhPtr = (uint64_t)newInstancePtrs[swInstanceIndex];
+ uint64_t originalBvhPtr = (uint64_t)HwInstanceLeaf_GetBVH(&tmpInstleaf);
+
+ HwInstanceLeaf_SetBVH(&tmpInstleaf, childBvhPtr);
+ uint64_t startNode = HwInstanceLeaf_GetStartNode(&tmpInstleaf);
+
+ if (startNode != 0) {
+ uint64_t rootNodeOffset = startNode - originalBvhPtr;
+ HwInstanceLeaf_SetStartNode(&tmpInstleaf, childBvhPtr + rootNodeOffset);
+ }
+
+ dstInstleafs[hwLeafIndex] = tmpInstleaf;
+ }
+ }
+ }
+ else
+ {
+ CopyMemory(dest, (global char*)(src + offsetToBvh), bvhSize, groupCnt);
+ }
+}
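
During deserialization each HW instance leaf gets its BLAS pointer replaced with the caller-supplied handle, and its start-node pointer is rebased so that it keeps the same offset inside the relocated BLAS. A scalar sketch of that rebase:

```c
#include <stdint.h>

/* Rebase an instance leaf's start-node pointer onto a relocated BLAS.
 * A start node of 0 is left alone (e.g. an inactive instance). */
static inline uint64_t rebase_start_node(uint64_t old_start_node,
                                         uint64_t old_blas_va,
                                         uint64_t new_blas_va)
{
    if (old_start_node == 0)
        return 0;
    return new_blas_va + (old_start_node - old_blas_va);
}
```
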
+
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
+__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH)))
+void kernel deserialize_indirect(
+ global char* dest,
+ global char* src)
+{
+ SerializationHeader* header = (SerializationHeader*)src;
+ const uint64_t bvhSize = header->DeserializedSizeInBytes;
+ unsigned groupCnt = GroupCountForCopySize(bvhSize);
+ deserializeT(dest, src, groupCnt);
+}
+
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1))) void kernel dxr_decode(global char* dest,
+ global char* src)
+{
+
+ DecodeHeader* header = (DecodeHeader*)dest;
+ BVHBase* base = (BVHBase*)src;
+
+ uint32_t numGeos = base->Meta.geoCount;
+ uint32_t numInstances = base->Meta.instanceCount;
+
+ if (numInstances > 0)
+ {
+ header->Type = TOP_LEVEL;
+ header->NumDesc = numInstances;
+
+ D3D12_RAYTRACING_INSTANCE_DESC* instanceDesc = (D3D12_RAYTRACING_INSTANCE_DESC*)(dest + sizeof(DecodeHeader));
+ copyInstanceDescs((InstanceDesc*)((uint64_t)base + (uint64_t)base->Meta.instanceDescsStart),
+ instanceDesc,
+ numInstances);
+ }
+ else if (numGeos > 0)
+ {
+ header->Type = BOTTOM_LEVEL;
+ header->NumDesc = numGeos;
+
+ D3D12_RAYTRACING_GEOMETRY_DESC* geomDescs = (D3D12_RAYTRACING_GEOMETRY_DESC*)(dest + sizeof(DecodeHeader));
+ uint64_t data = (uint64_t)geomDescs + sizeof(D3D12_RAYTRACING_GEOMETRY_DESC) * numGeos;
+ createGeoDescs((GeoMetaData*)((uint64_t)base + (uint64_t)base->Meta.geoDescsStart),
+ geomDescs,
+ numGeos,
+ data);
+
+ work_group_barrier(CLK_GLOBAL_MEM_FENCE);
+
+ copyDataFromQuadLeaves(base,
+ geomDescs);
+
+ copyDataFromLProcedurals(base,
+ geomDescs);
+ }
+ else
+ {
+ header->Type = BOTTOM_LEVEL;
+ header->NumDesc = 0;
+ }
+}
diff --git a/src/intel/vulkan/grl/gpu/bvh_debug.cl b/src/intel/vulkan/grl/gpu/bvh_debug.cl
new file mode 100644
index 00000000000..bce75fec3ff
--- /dev/null
+++ b/src/intel/vulkan/grl/gpu/bvh_debug.cl
@@ -0,0 +1,208 @@
+//
+// Copyright (C) 2009-2021 Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+//
+//
+
+// @file bvh_debug.cl
+//
+// @brief routines to do basic integrity checks
+//
+// Notes:
+//
+
+#include "GRLGen12.h"
+#include "intrinsics.h"
+#include "libs/lsc_intrinsics.h"
+#include "GRLGen12IntegrityChecks.h"
+#include "api_interface.h"
+
+#define ERROR_PRINTF 0
+GRL_INLINE bool commit_err(
+ global uint* some_null,
+ global BVHBase* bvh,
+ global ERROR_INFO* err_info_slot,
+ ERROR_INFO err)
+{
+ if (err.type != error_t_no_error) {
+ uint expected = error_t_no_error;
+ atomic_compare_exchange_global(&err_info_slot->type, &expected, err.type);
+ if (expected == error_t_no_error)
+ {
+ err_info_slot->offset_in_BVH = err.offset_in_BVH;
+ err_info_slot->when = err.when;
+ err_info_slot->reserved = 0xAAACCAAA;
+ mem_fence_evict_to_memory();
+#if ERROR_PRINTF
+ printf("bvh = 0x%llX, err.type = %X, err.offset_in_BVH = %d\n", bvh, err.type, err.offset_in_BVH);
+#else
+ // This is meant to trigger a page fault. Note we have to write directly to memory:
+ // if the write stayed in L3 it would not fault until it got evicted to memory.
+ store_uint_L1UC_L3UC(some_null, 0, 0x0EEE0000 + err.type);
+#endif
+ return true;
+ }
+ }
+ return false;
+}
+
+__attribute__((reqd_work_group_size(16, 1, 1)))
+void kernel check_tree_topology(
+ global uint* some_null,
+ global BVHBase* bvh,
+ global ERROR_INFO* err,
+ uint phase)
+{
+ uint globalID = get_local_id(0) + get_group_id(0) * get_local_size(0);
+
+ if (err->type != error_t_no_error) return;
+
+ uint dummy1, dummy2, dummy3;
+ ERROR_INFO reterr = check_tree_topology_helper(bvh, globalID, &dummy1, &dummy2, &dummy3, false);
+ if (reterr.type == error_t_no_error)
+ {
+ reterr = check_backpointers(bvh, globalID);
+ }
+ if (reterr.type == error_t_no_error)
+ {
+ reterr = validate_atomic_update_structs(bvh, globalID);
+ }
+ reterr.when = phase;
+ commit_err(some_null, bvh, err, reterr);
+}
+
+GRL_INLINE bool IsValid48bPtr(qword ptr)
+{
+ qword CANONIZED_BITS = 0xFFFFul << 48ul;
+ qword canonized_part = ptr & CANONIZED_BITS;
+ bool isIt = ptr != 0 && (
+ canonized_part == 0 || canonized_part == CANONIZED_BITS);
+ return isIt;
+}
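
IsValid48bPtr accepts a pointer only if it is non-zero and its upper 16 bits are either all zeros or all ones, i.e. it looks like a canonical 48-bit address. A standalone equivalent of the same check with a couple of example values:

```c
#include <stdbool.h>
#include <stdint.h>

/* Mirrors the check above: non-zero, and bits 63:48 all clear or all set. */
static inline bool is_canonical_48b(uint64_t ptr)
{
    uint64_t upper = ptr & (0xFFFFull << 48);
    return ptr != 0 && (upper == 0 || upper == (0xFFFFull << 48));
}

/* is_canonical_48b(0x00007fffdeadbeefull) -> true
 * is_canonical_48b(0x12345678deadbeefull) -> false (garbage upper bits) */
```
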
+
+__attribute__((reqd_work_group_size(16, 1, 1)))
+void kernel check_geos_before_quad_update(
+ global BVHBase* bvh, //dest bvh
+ global GRL_RAYTRACING_GEOMETRY_DESC* geomDesc,
+ global uint* some_null,
+ global ERROR_INFO* err,
+ uint phase,
+ uint numGeos,
+ uint numThreads)
+{
+ uint globalID = get_local_id(0) + get_group_id(0) * get_local_size(0);
+
+ if (err->type != error_t_no_error) return;
+
+ // first check sanity of geos
+ ERROR_INFO geo_insanity_error = { error_t_input_geo_insane, 0 };
+
+ for (uint ID = globalID; ID < numGeos; ID += numThreads * get_sub_group_size())
+ {
+ bool IsSane = IsValid48bPtr((qword)geomDesc);
+
+ if (IsSane) {
+ GRL_RAYTRACING_GEOMETRY_DESC geo = geomDesc[ID];
+ IsSane = geo.Type < NUM_GEOMETRY_TYPES;
+ if (IsSane) {
+ if (geo.Type == GEOMETRY_TYPE_TRIANGLES) {
+ if (geo.Desc.Triangles.IndexFormat >= INDEX_FORMAT_END) {
+ IsSane = false;
+ }
+ else
+ {
+ if (geo.Desc.Triangles.IndexFormat != INDEX_FORMAT_NONE && geo.Desc.Triangles.IndexCount > 2)
+ {
+ IsSane = (geo.Desc.Triangles.VertexFormat < VERTEX_FORMAT_END) &&
+ IsValid48bPtr((qword)geo.Desc.Triangles.pVertexBuffer) &&
+ IsValid48bPtr((qword)geo.Desc.Triangles.pIndexBuffer);
+ }
+ else if (geo.Desc.Triangles.VertexCount > 2)
+ {
+ IsSane =
+ geo.Desc.Triangles.VertexFormat < VERTEX_FORMAT_END &&
+ IsValid48bPtr((qword)geo.Desc.Triangles.pVertexBuffer) != 0;
+ }
+ }
+ }
+ }
+ }
+
+ geo_insanity_error.offset_in_BVH = ID;
+ geo_insanity_error.when = phase;
+ if (!IsSane) {
+ commit_err(some_null, bvh, err, geo_insanity_error);
+ }
+ return;
+ }
+}
+
+__attribute__((reqd_work_group_size(16, 1, 1)))
+void kernel check_geos_vs_quads(
+ global BVHBase* bvh,
+ global GRL_RAYTRACING_GEOMETRY_DESC* geomDesc,
+ global uint* some_null,
+ global ERROR_INFO* err,
+ uint phase,
+ uint numGeos,
+ uint numThreads)
+{
+ uint numQuads = BVHBase_GetNumQuads(bvh);
+
+ QuadLeaf* quads = BVHBase_GetQuadLeaves(bvh);
+
+ uint globalID = get_local_id(0) + get_group_id(0) * get_local_size(0);
+ uint qoffset = bvh->quadLeafStart;
+
+ if (err->type != error_t_no_error) return;
+
+ ERROR_INFO theErr = { error_t_no_error, 0 };
+
+ for (uint ID = globalID; ID < numQuads; ID += numThreads * get_sub_group_size())
+ {
+ ERROR_INFO quadErr = { error_t_quad_leaf_broken, qoffset + ID, phase };
+
+ QuadLeaf quad = quads[ID];
+
+ uint geoIdx = PrimLeaf_GetGeoIndex(&quad.leafDesc);
+
+ if (geoIdx >= numGeos) { commit_err(some_null, bvh, err, quadErr); return; }
+
+ uint numPrimsInGeo = geomDesc[geoIdx].Desc.Triangles.IndexFormat != INDEX_FORMAT_NONE ?
+ geomDesc[geoIdx].Desc.Triangles.IndexCount / 3 :
+ geomDesc[geoIdx].Desc.Triangles.VertexCount / 3;
+
+ if(quad.primIndex0 >= numPrimsInGeo) {
+ commit_err(some_null, bvh, err, quadErr);
+ return;
+ }
+
+ if(!QuadLeaf_IsSingleTriangle(&quad) &&
+ (quad.primIndex0 + QuadLeaf_GetPrimIndexDelta(&quad) >= numPrimsInGeo))
+ {
+ commit_err(some_null, bvh, err, quadErr);
+ return;
+ }
+ }
+}
+
+__attribute__((reqd_work_group_size(16, 1, 1)))
+void kernel check_instances_linked_bvhs(
+ global uint* some_null,
+ global BVHBase* bvh,
+ global ERROR_INFO* err,
+ uint phase)
+{
+ if (err->type != error_t_no_error) return;
+
+ uint instanceLeafStart = bvh->instanceLeafStart;
+ uint instanceLeafEnd = bvh->instanceLeafEnd;
+ uint numInstances = (instanceLeafEnd - instanceLeafStart) / 2;
+
+ uint globalID = get_local_id(0) + get_group_id(0) * get_local_size(0);
+
+ ERROR_INFO reterr = check_instances_linked_bvhs_helper(bvh, globalID, /*touchBlas*/true);
+ reterr.when = phase;
+ commit_err(some_null, bvh, err, reterr);
+}
diff --git a/src/intel/vulkan/grl/gpu/bvh_debug.grl b/src/intel/vulkan/grl/gpu/bvh_debug.grl
new file mode 100644
index 00000000000..28008ab09ce
--- /dev/null
+++ b/src/intel/vulkan/grl/gpu/bvh_debug.grl
@@ -0,0 +1,107 @@
+//
+// Copyright (C) 2009-2021 Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+//
+//
+
+module bvh_on_gpu_checks;
+
+kernel_module debug_kernels ("bvh_debug.cl")
+{
+ links lsc_intrinsics;
+ kernel opencl_check_tree_topology < kernelFunction="check_tree_topology">;
+ kernel opencl_check_instances_linked_bvhs < kernelFunction="check_instances_linked_bvhs">;
+ kernel opencl_check_geos_before_quad_update < kernelFunction="check_geos_before_quad_update">;
+ kernel opencl_check_geos_vs_quads < kernelFunction="check_geos_vs_quads">;
+}
+
+
+metakernel debug_checks_prepare_const_regs()
+{
+ define cRoundingSIMD REG4;
+ define cInit0 REG5;
+ define cShiftForSIMD REG3;
+ cRoundingSIMD = (16-1);
+ cShiftForSIMD = 4;
+ DISPATCHDIM_Y = 1;
+ DISPATCHDIM_Z = 1;
+}
+
+metakernel debug_checks_bvh_topology(
+ qword some_null_ptr,
+ qword bvh,
+ qword bvh_inner_nodes_end,
+ qword error_struct,
+ dword when,
+ dword bvh_inner_nodes_start_value )
+{
+ define cRoundingSIMD REG4;
+ define cShiftForSIMD REG3;
+ REG1.lo = load_dword(bvh_inner_nodes_end);
+ REG0 = bvh_inner_nodes_start_value;
+ REG1.hi = 0;
+ REG2 = REG1 - REG0;
+ REG2 = REG2 + cRoundingSIMD;
+ REG2 = REG2 >> cShiftForSIMD;
+
+ DISPATCHDIM_X = REG2.lo;
+
+ dispatch_indirect opencl_check_tree_topology args(
+ some_null_ptr,
+ bvh,
+ error_struct,
+ when);
+}
+
+metakernel debug_check_instances_linked_bvhs(
+ qword some_null_ptr,
+ qword bvh,
+ qword error_struct,
+ dword numHWThreads,
+ dword when)
+{
+ dispatch opencl_check_instances_linked_bvhs(numHWThreads,1,1) args(
+ some_null_ptr,
+ bvh,
+ error_struct,
+ when);
+}
+
+metakernel debug_check_geos_before_quad_update(
+ qword bvh,
+ qword geos,
+ qword some_null_ptr,
+ qword error_struct,
+ dword when,
+ dword numGeos,
+ dword numHWThreads )
+{
+ dispatch opencl_check_geos_before_quad_update(numHWThreads,1,1) args(
+ bvh,
+ geos,
+ some_null_ptr,
+ error_struct,
+ when,
+ numGeos,
+ numHWThreads );
+}
+
+metakernel debug_check_geos_vs_quads(
+ qword bvh,
+ qword geos,
+ qword some_null_ptr,
+ qword error_struct,
+ dword when,
+ dword numGeos,
+ dword numHWThreads )
+{
+ dispatch opencl_check_geos_vs_quads(numHWThreads,1,1) args(
+ bvh,
+ geos,
+ some_null_ptr,
+ error_struct,
+ when,
+ numGeos,
+ numHWThreads );
+}
diff --git a/src/intel/vulkan/grl/gpu/bvh_postbuild_info.cl b/src/intel/vulkan/grl/gpu/bvh_postbuild_info.cl
new file mode 100644
index 00000000000..4fa222b53eb
--- /dev/null
+++ b/src/intel/vulkan/grl/gpu/bvh_postbuild_info.cl
@@ -0,0 +1,97 @@
+//
+// Copyright (C) 2009-2021 Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+//
+//
+
+#include "api_interface.h"
+#include "d3d12.h"
+#include "common.h"
+
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(1, 1, 1))) void kernel compacted_size(global char *bvh_mem,
+ global char *postbuild_info)
+{
+ BVHBase *base = (BVHBase *)bvh_mem;
+ PostbuildInfoCompactedSize *postbuildInfoCompacted = (PostbuildInfoCompactedSize *)postbuild_info;
+
+ postbuildInfoCompacted->CompactedSizeInBytes = compute_compacted_size(base);
+}
+
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(1, 1, 1))) void kernel current_size(global char *bvh_mem,
+ global char *postbuild_info)
+{
+
+ BVHBase *base = (BVHBase *)bvh_mem;
+ PostbuildInfoCurrentSize *postbuildInfoCurrent = (PostbuildInfoCurrentSize *)postbuild_info;
+
+ postbuildInfoCurrent->CurrentSizeInBytes = base->Meta.allocationSize;
+}
+
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(1, 1, 1))) void kernel serialized_size(global char *bvh_mem,
+ global char *postbuild_info)
+{
+
+ BVHBase *base = (BVHBase *)bvh_mem;
+ PostbuildInfoSerializationDesc *postbuildInfoSerialization = (PostbuildInfoSerializationDesc *)postbuild_info;
+
+ uint64_t headerSize = sizeof(SerializationHeader);
+ uint64_t numInstances = base->Meta.instanceCount;
+
+ postbuildInfoSerialization->SerializedSizeInBytes = sizeof(SerializationHeader) +
+ numInstances * sizeof(gpuva_t) +
+ compute_compacted_size(base);
+ //base->Meta.allocationSize;
+ postbuildInfoSerialization->NumBottomLevelAccelerationStructurePointers = numInstances;
+}
+
+void countTrianglesAndProcedurals(GeoMetaData *geoMetaData,
+ uint64_t numGeos,
+ uint64_t *numTriangles,
+ uint64_t *numProcedurals)
+{
+ uint64_t numTrianglesLoc = 0;
+ uint64_t numProceduralsLoc = 0;
+
+ for (uint64_t geoIndex = get_local_id(0); geoIndex < numGeos; geoIndex += get_local_size(0))
+ {
+ if (geoMetaData[geoIndex].Type == GEOMETRY_TYPE_TRIANGLES)
+ {
+ *numTriangles += geoMetaData[geoIndex].PrimitiveCount;
+ }
+ else
+ {
+ *numProcedurals += geoMetaData[geoIndex].PrimitiveCount;
+ }
+ }
+}
+
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(1, 1, 1))) void kernel decoded_size(global char *bvh_mem,
+ global char *postbuild_info)
+{
+ BVHBase *base = (BVHBase *)bvh_mem;
+ PostbuildInfoToolsVisualizationDesc *postbuildInfoDecoded = (PostbuildInfoToolsVisualizationDesc *)postbuild_info;
+
+ uint64_t numTriangles = 0;
+ uint64_t numProcedurals = 0;
+ countTrianglesAndProcedurals((GeoMetaData *)((uint64_t)base + (uint64_t)base->Meta.geoDescsStart),
+ base->Meta.geoCount,
+ &numTriangles,
+ &numProcedurals);
+ uint64_t numInstances = base->Meta.instanceCount;
+ uint64_t numDescs = base->Meta.geoCount;
+ uint64_t headerSize = sizeof(DecodeHeader);
+ uint64_t descsSize = numDescs * sizeof(D3D12_RAYTRACING_GEOMETRY_DESC) +
+ numInstances * sizeof(D3D12_RAYTRACING_INSTANCE_DESC);
+
+ // Each triangle is stored separately - 3 vertices (9 floats) per triangle
+ uint64_t triangleDataSize = 9 * sizeof(float);
+ uint64_t proceduralDataSize = sizeof(D3D12_RAYTRACING_AABB);
+ uint64_t geoDataSize = numTriangles * triangleDataSize + numProcedurals * proceduralDataSize;
+
+ postbuildInfoDecoded->DecodedSizeInBytes = headerSize + descsSize + geoDataSize;
+}
diff --git a/src/intel/vulkan/grl/gpu/bvh_rebraid.cl b/src/intel/vulkan/grl/gpu/bvh_rebraid.cl
new file mode 100644
index 00000000000..ab0f891acee
--- /dev/null
+++ b/src/intel/vulkan/grl/gpu/bvh_rebraid.cl
@@ -0,0 +1,1683 @@
+//
+// Copyright (C) 2009-2021 Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+//
+//
+
+#include "AABB.h"
+#include "GRLGen12.h"
+#include "api_interface.h"
+#include "common.h"
+#include "qbvh6.h"
+
+#define MAX_SPLITS_PER_INSTANCE 64
+#define NUM_REBRAID_BINS 32
+
+#define NUM_CHILDREN 6
+#define MAX_NODE_OFFSET 65535 // can't open nodes whose offsets exceed this
+
+// OCL/DPC++ *SHOULD* have a uniform keyword... but they don't... so I'm making my own
+#define uniform
+#define varying
+
+#define SGPRINT_UNIFORM(fmt,val) {sub_group_barrier(CLK_LOCAL_MEM_FENCE); if( get_sub_group_local_id() == 0 ) { printf(fmt,val); }}
+
+#define SGPRINT_6x(prefix,fmt,type,val) {\
+ type v0 = sub_group_broadcast( val, 0 );\
+ type v1 = sub_group_broadcast( val, 1 );\
+ type v2 = sub_group_broadcast( val, 2 );\
+ type v3 = sub_group_broadcast( val, 3 );\
+ type v4 = sub_group_broadcast( val, 4 );\
+ type v5 = sub_group_broadcast( val, 5 );\
+ sub_group_barrier(CLK_LOCAL_MEM_FENCE); \
+ if( get_sub_group_local_id() == 0 ) { \
+ printf(prefix fmt fmt fmt fmt fmt fmt "\n" , \
+ v0,v1,v2,v3,v4,v5);}}
+
+
+#define SGPRINT_16x(prefix,fmt,type,val) {\
+ type v0 = sub_group_broadcast( val, 0 );\
+ type v1 = sub_group_broadcast( val, 1 );\
+ type v2 = sub_group_broadcast( val, 2 );\
+ type v3 = sub_group_broadcast( val, 3 );\
+ type v4 = sub_group_broadcast( val, 4 );\
+ type v5 = sub_group_broadcast( val, 5 );\
+ type v6 = sub_group_broadcast( val, 6 );\
+ type v7 = sub_group_broadcast( val, 7 );\
+ type v8 = sub_group_broadcast( val, 8 );\
+ type v9 = sub_group_broadcast( val, 9 );\
+ type v10 = sub_group_broadcast( val, 10 );\
+ type v11 = sub_group_broadcast( val, 11 );\
+ type v12 = sub_group_broadcast( val, 12 );\
+ type v13 = sub_group_broadcast( val, 13 );\
+ type v14 = sub_group_broadcast( val, 14 );\
+ type v15 = sub_group_broadcast( val, 15 );\
+ sub_group_barrier(CLK_LOCAL_MEM_FENCE); \
+ if( get_sub_group_local_id() == 0 ) { \
+ printf(prefix fmt fmt fmt fmt fmt fmt fmt fmt \
+ fmt fmt fmt fmt fmt fmt fmt fmt"\n" , \
+ v0,v1,v2,v3,v4,v5,v6,v7,v8,v9,v10,v11,v12,v13,v14,v15);}}
+
+#if 1
+#define GRL_ATOMIC_INC(addr) atomic_add(addr, 1);
+#else
+#define GRL_ATOMIC_INC(addr) atomic_inc(addr);
+#endif
+
+#if 0
+#define LOOP_TRIPWIRE_INIT uint _loop_trip=0;
+
+#define LOOP_TRIPWIRE_INCREMENT(max_iterations,name) \
+ _loop_trip++;\
+ if ( _loop_trip > max_iterations )\
+ {\
+ printf( "@@@@@@@@@@@@@@@@@@@@ TRIPWIRE!!!!!!!!!!!\n" );\
+ printf( name"\n");\
+ break;\
+ }
+#else
+
+#define LOOP_TRIPWIRE_INIT
+#define LOOP_TRIPWIRE_INCREMENT(max_iterations,name)
+
+#endif
+
+
+
+typedef struct SGHeap
+{
+ uint32_t key_value;
+ bool lane_mask;
+} SGHeap;
+
+GRL_INLINE void SGHeap_init(uniform SGHeap *h)
+{
+ h->lane_mask = false;
+ h->key_value = 0xbaadf00d;
+}
+
+GRL_INLINE bool SGHeap_full(uniform SGHeap *h)
+{
+ return sub_group_all(h->lane_mask);
+}
+GRL_INLINE bool SGHeap_empty(uniform SGHeap *h)
+{
+ return sub_group_all(!h->lane_mask);
+}
+
+GRL_INLINE bool SGHeap_get_lane_mask(uniform SGHeap *h)
+{
+ return h->lane_mask;
+}
+GRL_INLINE uint16_t SGHeap_get_lane_values(uniform SGHeap *h)
+{
+ return (h->key_value & 0xffff);
+}
+
+GRL_INLINE ushort isolate_lowest_bit( ushort m )
+{
+ return m & ~(m - 1);
+}
+
+
+// lane i receives the index of the ith set bit in mask.
+GRL_INLINE ushort subgroup_bit_rank( uniform ushort mask )
+{
+ varying ushort lane = get_sub_group_local_id();
+ ushort idx = 16;
+ for ( uint i = 0; i < NUM_CHILDREN; i++ )
+ {
+ ushort lo = isolate_lowest_bit( mask );
+ mask = mask ^ lo;
+ idx = (lane == i) ? lo : idx;
+ }
+
+ return ctz( idx );
+}
+
+// push a set of elements spread across a subgroup. Return mask of elements that were not pushed
+GRL_INLINE uint16_t SGHeap_vectorized_push(uniform SGHeap *h, varying uint16_t key, varying uint16_t value, uniform ushort push_mask)
+{
+
+#if 0 // an attempt to make this algorithm branchless
+ varying uint key_value = (((uint)key) << 16) | ((uint)value);
+ uniform ushort free_mask = intel_sub_group_ballot( !h->lane_mask );
+
+ varying ushort free_slot_idx = subgroup_bit_prefix_exclusive( free_mask ); // for each heap slot, what is its position in a compacted list of free slots (prefix sum)
+ varying ushort push_idx = subgroup_bit_prefix_exclusive( push_mask ); // for each lane, what is its position in a compacted list of pushing lanes (prefix sum)
+
+ uniform ushort num_pushes = min( popcount( free_mask ), popcount( push_mask ) );
+
+ varying ushort push_index = subgroup_bit_rank( push_mask ); // lane i gets the index of the i'th set bit in push_mask
+
+ varying uint shuffled = intel_sub_group_shuffle( key_value, intel_sub_group_shuffle( push_index, free_slot_idx ) );
+ varying bool pushed = false;
+ if ( !h->lane_mask && free_slot_idx < num_pushes )
+ {
+ h->lane_mask = true;
+ h->key_value = shuffled;
+ pushed = true;
+ }
+
+ return push_mask & intel_sub_group_ballot( push_idx >= num_pushes );
+#else
+
+ varying uint lane = get_sub_group_local_id();
+
+ varying uint key_value = (((uint)key) << 16) | ((uint)value);
+ uniform ushort free_mask = intel_sub_group_ballot(!h->lane_mask);
+
+ // TODO_OPT: Look for some clever way to remove this loop
+ while (free_mask && push_mask)
+ {
+ // insert first active child into first available lane
+ uniform uint child_id = ctz(push_mask);
+ uniform uint victim_lane = ctz(free_mask);
+ uniform uint kv = sub_group_broadcast( key_value, child_id );
+ if (victim_lane == lane)
+ {
+ h->lane_mask = true;
+ h->key_value = kv;
+ }
+ push_mask ^= (1 << child_id);
+ free_mask ^= (1 << victim_lane);
+ }
+
+ return push_mask;
+
+#endif
+}
+
+// push an item onto a heap that is full except for one slot
+GRL_INLINE void SGHeap_push_and_fill(uniform SGHeap *h, uniform uint16_t key, uniform uint16_t value)
+{
+ uniform uint32_t key_value = (((uint)key) << 16) | value;
+ if (!h->lane_mask)
+ {
+ h->lane_mask = true;
+ h->key_value = key_value; // only one lane will be active at this point
+ }
+}
+
+// pop the min item from a full heap
+GRL_INLINE void SGHeap_full_pop_min(uniform SGHeap *h, uniform uint16_t *key_out, uniform uint16_t *value_out)
+{
+ varying uint lane = get_sub_group_local_id();
+ uniform uint kv = sub_group_reduce_min(h->key_value);
+ if (h->key_value == kv)
+ h->lane_mask = false;
+
+ *key_out = (kv >> 16);
+ *value_out = (kv & 0xffff);
+}
+
+// pop the max item from a heap
+GRL_INLINE void SGHeap_pop_max(uniform SGHeap *h, uniform uint16_t *key_out, uniform uint16_t *value_out)
+{
+ uniform uint lane = get_sub_group_local_id();
+ uniform uint kv = sub_group_reduce_max(h->lane_mask ? h->key_value : 0);
+ if (h->key_value == kv)
+ h->lane_mask = false;
+
+ *key_out = (kv >> 16);
+ *value_out = (kv & 0xffff);
+}
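+
+// The heap stores one packed entry per SIMD lane: the 16-bit key (quantized
+// area) lives in the upper half of key_value and the 16-bit payload (node
+// offset) in the lower half, e.g. key 0x1234 with value 0x0040 packs to
+// 0x12340040. Because the key occupies the high bits, the uint min/max
+// reductions above order entries by key first and use the payload only to
+// break ties.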
+
+GRL_INLINE void SGHeap_printf( SGHeap* heap )
+{
+ uint key = heap->key_value >> 16;
+ uint value = heap->key_value & 0xffff;
+
+ if ( get_sub_group_local_id() == 0)
+ printf( "HEAP: \n" );
+ SGPRINT_16x( " mask: ", "%6u ", bool, heap->lane_mask );
+ SGPRINT_16x( " key : ", "0x%04x ", uint, key );
+ SGPRINT_16x( " val : ", "0x%04x ", uint, value );
+
+}
+
+GRL_INLINE float transformed_aabb_halfArea(float3 lower, float3 upper, const float *Transform)
+{
+ // Compute transformed extent per 'transform_aabb'. Various terms cancel
+ float3 Extent = upper - lower;
+ float ex = Extent.x * fabs(Transform[0]) + Extent.y * fabs(Transform[1]) + Extent.z * fabs(Transform[2]);
+ float ey = Extent.x * fabs(Transform[4]) + Extent.y * fabs(Transform[5]) + Extent.z * fabs(Transform[6]);
+ float ez = Extent.x * fabs(Transform[8]) + Extent.y * fabs(Transform[9]) + Extent.z * fabs(Transform[10]);
+
+ return (ex * ey) + (ey * ez) + (ex * ez);
+}
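+
+// With an identity Transform the fabs() terms reduce to Extent.x/y/z, so the
+// result is Ex*Ey + Ey*Ez + Ex*Ez, i.e. half the surface area of the box; for
+// a general affine Transform the absolute row sums give the extent of the
+// world-space bounding box of the transformed box.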
+
+GRL_INLINE uint16_t quantize_area(float relative_area)
+{
+ // clamp relative area at 0.25 (1/4 of root area)
+ // and apply a non-linear distribution because most things in real scenes are small
+ relative_area = pow(min(1.0f, relative_area * 4.0f), 0.125f);
+ return convert_ushort_rtn( relative_area * 65535.0f );
+}
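+
+// Illustrative values for the mapping above: relative_area >= 0.25 clamps to
+// 1.0 and quantizes to 65535; relative_area == 1/1024 gives
+// pow(1/256, 0.125) == 0.5 -> 32767; relative_area == 0 gives 0.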
+
+GRL_INLINE varying uint16_t SUBGROUP_get_child_areas(uniform InternalNode *n,
+ uniform const float *Transform,
+ uniform float relative_area_scale)
+{
+ varying uint16_t area;
+ varying uint16_t lane = get_sub_group_local_id();
+ varying int exp_x = n->exp_x;
+ varying int exp_y = n->exp_y;
+ varying int exp_z = n->exp_z;
+
+ {
+ // decode the AABB positions. Lower in the bottom 6 lanes, upper in the top
+ uniform uint8_t *px = &n->lower_x[0];
+ uniform uint8_t *py = &n->lower_y[0];
+ uniform uint8_t *pz = &n->lower_z[0];
+
+ varying float fx = convert_float(px[lane]);
+ varying float fy = convert_float(py[lane]);
+ varying float fz = convert_float(pz[lane]);
+ fx = n->lower[0] + bitShiftLdexp(fx, exp_x - 8);
+ fy = n->lower[1] + bitShiftLdexp(fy, exp_y - 8);
+ fz = n->lower[2] + bitShiftLdexp(fz, exp_z - 8);
+
+ // transform the AABBs to world space
+ varying float3 lower = (float3)(fx, fy, fz);
+ varying float3 upper = intel_sub_group_shuffle(lower, lane + 6);
+
+ {
+
+ // TODO_OPT: This is only utilizing 6 lanes.
+ // We might be able to do better by vectorizing the calculation differently
+ float a1 = transformed_aabb_halfArea( lower, upper, Transform );
+ float a2 = a1 * relative_area_scale;
+ area = quantize_area( a2 );
+ }
+ }
+
+ return area;
+}
+
+
+
+GRL_INLINE ushort get_child_area(
+ InternalNode* n,
+ ushort child,
+ const float* Transform,
+ float relative_area_scale )
+{
+ uint16_t area;
+ uint16_t lane = get_sub_group_local_id();
+ int exp_x = n->exp_x;
+ int exp_y = n->exp_y;
+ int exp_z = n->exp_z;
+
+ // decode the AABB positions. Lower in the bottom 6 lanes, upper in the top
+ uint8_t* px = &n->lower_x[0];
+ uint8_t* py = &n->lower_y[0];
+ uint8_t* pz = &n->lower_z[0];
+
+ float3 lower, upper;
+ lower.x = convert_float( n->lower_x[child] );
+ lower.y = convert_float( n->lower_y[child] );
+ lower.z = convert_float( n->lower_z[child] );
+ upper.x = convert_float( n->upper_x[child] );
+ upper.y = convert_float( n->upper_y[child] );
+ upper.z = convert_float( n->upper_z[child] );
+
+ lower.x = bitShiftLdexp( lower.x, exp_x - 8 ); // NOTE: the node's 'lower' field cancels out, so don't add it
+    lower.y = bitShiftLdexp( lower.y, exp_y - 8 ); // see transformed_aabb_halfArea
+ lower.z = bitShiftLdexp( lower.z, exp_z - 8 );
+ upper.x = bitShiftLdexp( upper.x, exp_x - 8 );
+ upper.y = bitShiftLdexp( upper.y, exp_y - 8 );
+ upper.z = bitShiftLdexp( upper.z, exp_z - 8 );
+
+ float a1 = transformed_aabb_halfArea( lower, upper, Transform );
+ float a2 = a1 * relative_area_scale;
+ area = quantize_area( a2 );
+
+ return area;
+}
+
+
+GRL_INLINE varying int SUBGROUP_get_child_offsets(uniform InternalNode *n)
+{
+ varying uint lane = get_sub_group_local_id();
+ varying uint child = (lane < NUM_CHILDREN) ? lane : 0;
+
+ varying uint block_incr = InternalNode_GetChildBlockIncr( n, child );
+
+ //varying uint prefix = sub_group_scan_exclusive_add( block_incr );
+ varying uint prefix;
+ if ( NUM_CHILDREN == 6 )
+ {
+ prefix = block_incr + intel_sub_group_shuffle_up( 0u, block_incr, 1u );
+ prefix = prefix + intel_sub_group_shuffle_up( 0u, prefix, 2 );
+ prefix = prefix + intel_sub_group_shuffle_up( 0u, prefix, 4 );
+ prefix = prefix - block_incr;
+ }
+
+ return n->childOffset + prefix;
+}
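+
+// The three shuffle_up steps above form a log-step inclusive prefix sum
+// (valid for the first 8 lanes), and subtracting block_incr makes it
+// exclusive. E.g. with per-lane block_incr = {1,2,1,1,2,1} for the 6 children
+// the exclusive prefix is {0,1,3,4,5,7}, so child i sits at childOffset plus
+// the blocks consumed by children 0..i-1, relative to the current node.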
+
+
+// compute the maximum number of leaf nodes that will be produced given 'num_splits' node openings
+GRL_INLINE uint get_num_nodes(uint num_splits, uint max_children)
+{
+ // each split consumes one node and replaces it with N nodes
+ // there is initially one node
+ // number of nodes is thus: N*s + 1 - s ==> (N-1)*s + 1
+ return (max_children - 1) * num_splits + 1;
+}
+
+// compute the number of node openings that can be performed given a fixed extra node budget
+GRL_INLINE uint get_num_splits(uint num_nodes, uint max_children)
+{
+ // inverse of get_num_nodes: x = (n-1)s + 1
+ // s = (x-1)/(n-1)
+ if (num_nodes == 0)
+ return 0;
+
+ return (num_nodes - 1) / (max_children - 1);
+}
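+
+// Example with max_children == NUM_CHILDREN == 6: each split nets 5 extra
+// nodes, so get_num_nodes(3, 6) == 16 and the inverse get_num_splits(16, 6)
+// == (16 - 1) / 5 == 3; integer division drops any partial split, e.g.
+// get_num_splits(14, 6) == 2.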
+
+GRL_INLINE uint get_rebraid_bin_index(uint16_t quantized_area, uint NUM_BINS)
+{
+ // arrange bins in descending order by size
+ float relative_area = quantized_area * (1.0f/65535.0f);
+ relative_area = 1.0f - relative_area; // arrange bins largest to smallest
+ size_t bin = round(relative_area * (NUM_BINS - 1));
+ return bin;
+}
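+
+// With NUM_BINS == NUM_REBRAID_BINS == 32: quantized_area == 65535 (largest)
+// maps to bin 0, quantized_area == 0 maps to bin 31, and a mid-range value
+// such as 32768 lands in bin 15, so lower-numbered bins hold the larger
+// instances, which are considered first when the split budget is handed out.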
+
+GRL_INLINE global InternalNode *get_node(global BVHBase *base, int incr)
+{
+ global char *ptr = (((global char *)base) + BVH_ROOT_NODE_OFFSET); // NOTE: Assuming this will be hoisted out of inner loops
+
+ return (global InternalNode *)(ptr + incr * 64);
+}
+
+GRL_INLINE bool is_aabb_valid(float3 lower, float3 upper)
+{
+ return all(isfinite(lower)) &&
+ all(isfinite(upper)) &&
+ all(lower <= upper);
+}
+
+GRL_INLINE bool is_node_openable(InternalNode *n)
+{
+ // TODO_OPT: Optimize me by fetching dwords instead of looping over bytes
+    // TODO_OPT: Pre-compute openability and pack into the pad byte next to the nodeType field??
+ bool openable = n->nodeType == NODE_TYPE_INTERNAL;
+ if ( openable )
+ {
+ for ( uint i = 0; i < NUM_CHILDREN; i++ )
+ {
+ bool valid = InternalNode_IsChildValid( n, i );
+ uint childType = InternalNode_GetChildType( n, i );
+ openable = openable & (!valid || (childType == NODE_TYPE_INTERNAL));
+ }
+ }
+
+ return openable;
+}
+
+
+GRL_INLINE bool SUBGROUP_can_open_root(
+ uniform global BVHBase *bvh_base,
+ uniform const struct GRL_RAYTRACING_INSTANCE_DESC* instance
+ )
+{
+ if (bvh_base == 0 || GRL_get_InstanceMask(instance) == 0)
+ return false;
+
+ // TODO_OPT: SG-vectorize this AABB test
+ uniform float3 root_lower = AABB3f_load_lower(&bvh_base->Meta.bounds);
+ uniform float3 root_upper = AABB3f_load_upper(&bvh_base->Meta.bounds);
+ if (!is_aabb_valid(root_lower, root_upper))
+ return false;
+
+ uniform global InternalNode *node = get_node(bvh_base, 0);
+ if ( node->nodeType != NODE_TYPE_INTERNAL )
+ return false;
+
+ varying bool openable = true;
+ varying uint lane = get_sub_group_local_id();
+ if (lane < NUM_CHILDREN)
+ {
+ varying uint childType = InternalNode_GetChildType(node, lane);
+ varying bool valid = InternalNode_IsChildValid(node, lane);
+ openable = childType == NODE_TYPE_INTERNAL || !valid;
+ }
+
+ return sub_group_all(openable);
+}
+
+
+
+GRL_INLINE
+varying uint2
+SUBGROUP_count_instance_splits(uniform global struct AABB3f *geometry_bounds,
+ uniform global __const struct GRL_RAYTRACING_INSTANCE_DESC *instance)
+{
+ uniform global BVHBase *bvh_base = (global BVHBase *)instance->AccelerationStructure;
+ if (!SUBGROUP_can_open_root(bvh_base, instance))
+ return (uint2)(0, 0);
+
+ uniform float relative_area_scale = 1.0f / AABB3f_halfArea(geometry_bounds);
+ uniform float3 root_lower = AABB3f_load_lower(&bvh_base->Meta.bounds);
+ uniform float3 root_upper = AABB3f_load_upper(&bvh_base->Meta.bounds);
+
+ uniform uint16_t quantized_area = quantize_area(transformed_aabb_halfArea(root_lower, root_upper, instance->Transform) * relative_area_scale);
+ uniform uint16_t node_offs = 0;
+
+ uniform SGHeap heap;
+ uniform uint num_splits = 0;
+
+ SGHeap_init(&heap);
+ varying uint sg_split_counts_hi = 0; // cross-subgroup bin counters
+ varying uint sg_split_counts_lo = 0;
+
+ uniform global InternalNode* node_array = get_node( bvh_base, 0 );
+
+ LOOP_TRIPWIRE_INIT;
+
+ while (1)
+ {
+ uniform global InternalNode* node = node_array + node_offs;
+
+ // count this split
+ uniform uint bin = get_rebraid_bin_index(quantized_area, NUM_REBRAID_BINS);
+ varying uint lane = get_sub_group_local_id();
+
+ sg_split_counts_hi += ((lane + 16) == bin) ? 1 : 0;
+ sg_split_counts_lo += (lane == bin) ? 1 : 0;
+
+ // open this node and push all of its openable children to heap
+ varying uint sg_offs = node_offs + SUBGROUP_get_child_offsets(node);
+ varying bool sg_openable = 0;
+        if (lane < NUM_CHILDREN && sg_offs <= MAX_NODE_OFFSET)
+ if (InternalNode_IsChildValid(node, lane))
+ sg_openable = is_node_openable( node_array + sg_offs);
+
+ uniform uint openable_children = intel_sub_group_ballot(sg_openable);
+
+ if ( openable_children )
+ {
+ varying uint16_t sg_area = SUBGROUP_get_child_areas( node, instance->Transform, relative_area_scale );
+
+ if ( !SGHeap_full( &heap ) )
+ {
+ openable_children = SGHeap_vectorized_push( &heap, sg_area, sg_offs, openable_children );
+ }
+
+ while ( openable_children )
+ {
+ // pop min element
+ uniform uint16_t min_area;
+ uniform uint16_t min_offs;
+ SGHeap_full_pop_min( &heap, &min_area, &min_offs );
+
+ // eliminate all children smaller than heap minimum
+ openable_children &= intel_sub_group_ballot( sg_area > min_area );
+
+ if ( openable_children )
+ {
+ // if any children survived,
+ // kick out heap minimum and replace with first child.. otherwise we will re-push the minimum
+ uniform uint child_id = ctz( openable_children );
+ openable_children ^= (1 << child_id);
+ min_area = sub_group_broadcast( sg_area, child_id );
+ min_offs = sub_group_broadcast( sg_offs, child_id );
+ }
+
+ // re-insert onto heap
+ SGHeap_push_and_fill( &heap, min_area, min_offs );
+
+                // repeat until all children are accounted for. Multiple children may still
+                // fit in the heap, because the heap minimum has changed and must be recomputed
+ }
+ }
+
+ num_splits++;
+ if (num_splits == MAX_SPLITS_PER_INSTANCE)
+ break;
+
+ if (SGHeap_empty(&heap))
+ break;
+
+ // get next node from heap
+ SGHeap_pop_max(&heap, &quantized_area, &node_offs);
+
+ LOOP_TRIPWIRE_INCREMENT( 500, "rebraid_count_splits" );
+
+ }
+
+ return (uint2)(sg_split_counts_lo, sg_split_counts_hi);
+}
+
+typedef struct RebraidBuffers
+{
+ global uint *bin_split_counts; // [num_bins]
+ global uint *bin_instance_counts; // [num_bins]
+    global uint *instance_bin_counts;  // num_instances * num_bins
+} RebraidBuffers;
+
+GRL_INLINE RebraidBuffers cast_rebraid_buffers(global uint *scratch, uint instanceID)
+{
+ RebraidBuffers b;
+ b.bin_split_counts = scratch;
+ b.bin_instance_counts = scratch + NUM_REBRAID_BINS;
+ b.instance_bin_counts = scratch + (2 + instanceID) * NUM_REBRAID_BINS;
+ return b;
+}
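+
+// Resulting layout of 'scratch', in uints (NUM_REBRAID_BINS == 32):
+//   [ 0 .. 31]                   bin_split_counts    (summed over instances)
+//   [32 .. 63]                   bin_instance_counts (summed over instances)
+//   [64 + 32*instanceID .. +31]  this instance's per-bin split counts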
+
+///////////////////////////////////////////////////////////////////////////////////////////
+// Compute AABB
+// Dispatch one work item per instance
+///////////////////////////////////////////////////////////////////////////////////////////
+
+GRL_INLINE void rebraid_compute_AABB(
+ global struct BVHBase* bvh,
+ global __const struct GRL_RAYTRACING_INSTANCE_DESC *instance)
+{
+ // don't open null rtas
+ global BVHBase *bvh_base = (global BVHBase *)instance->AccelerationStructure;
+
+ struct AABB new_primref;
+ if (bvh_base != 0)
+ {
+ float3 root_lower = AABB3f_load_lower(&bvh_base->Meta.bounds);
+ float3 root_upper = AABB3f_load_upper(&bvh_base->Meta.bounds);
+ const float *Transform = instance->Transform;
+
+ if (is_aabb_valid(root_lower, root_upper))
+ {
+ new_primref = AABBfromAABB3f(transform_aabb(root_lower, root_upper, Transform));
+ }
+ else
+ {
+ // degenerate instance which might be updated to be non-degenerate
+ // use AABB position to guide BVH construction
+ //
+ new_primref.lower.x = Transform[3];
+ new_primref.lower.y = Transform[7];
+ new_primref.lower.z = Transform[11];
+ new_primref.upper = new_primref.lower;
+ }
+ }
+ else
+ {
+ AABB_init(&new_primref);
+ }
+
+ struct AABB subgroup_bbox = AABB_sub_group_reduce(&new_primref);
+
+ if (get_sub_group_local_id() == 0)
+ {
+ AABB3f_atomic_merge_global_lu(&bvh->Meta.bounds, subgroup_bbox.lower.xyz, subgroup_bbox.upper.xyz );
+ }
+}
+
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(1, 1, 1)))
+__attribute__((intel_reqd_sub_group_size(16))) void kernel
+rebraid_computeAABB_DXR_instances(
+ global struct BVHBase* bvh,
+ global __const struct GRL_RAYTRACING_INSTANCE_DESC *instances)
+{
+ const uint instanceID = get_local_id(0) + get_group_id(0)*get_local_size(0);
+ rebraid_compute_AABB(bvh, instances + instanceID);
+}
+
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(1, 1, 1)))
+__attribute__((intel_reqd_sub_group_size(16))) void kernel
+rebraid_computeAABB_DXR_instances_indirect(
+ global struct BVHBase* bvh,
+ global __const struct GRL_RAYTRACING_INSTANCE_DESC *instances,
+ global struct IndirectBuildRangeInfo const * const indirect_data)
+{
+ const uint instanceID = get_local_id(0) + get_group_id(0)*get_local_size(0);
+ instances = (global __const struct GRL_RAYTRACING_INSTANCE_DESC*)
+ (((global char*)instances) + indirect_data->primitiveOffset);
+ rebraid_compute_AABB(bvh, instances + instanceID);
+}
+
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(1, 1, 1)))
+__attribute__((intel_reqd_sub_group_size(16))) void kernel
+rebraid_computeAABB_DXR_instances_pointers(
+ global struct BVHBase* bvh,
+ global void *instances_in)
+{
+ global const struct GRL_RAYTRACING_INSTANCE_DESC **instances =
+ (global const struct GRL_RAYTRACING_INSTANCE_DESC **)instances_in;
+
+ const uint instanceID = get_local_id(0) + get_group_id(0)*get_local_size(0);
+ rebraid_compute_AABB(bvh, instances[instanceID]);
+}
+
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(1, 1, 1)))
+__attribute__((intel_reqd_sub_group_size(16))) void kernel
+rebraid_computeAABB_DXR_instances_pointers_indirect(
+ global struct BVHBase* bvh,
+ global void *instances_in,
+ global struct IndirectBuildRangeInfo const * const indirect_data)
+{
+ instances_in = ((global char*)instances_in) + indirect_data->primitiveOffset;
+ global const struct GRL_RAYTRACING_INSTANCE_DESC **instances =
+ (global const struct GRL_RAYTRACING_INSTANCE_DESC **)instances_in;
+
+ const uint instanceID = get_local_id(0) + get_group_id(0)*get_local_size(0);
+ rebraid_compute_AABB(bvh, instances[instanceID]);
+}
+
+///////////////////////////////////////////////////////////////////////////////////////////
+// Init scratch: Dispatch one work group
+///////////////////////////////////////////////////////////////////////////////////////////
+
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(64, 1, 1))) void kernel rebraid_init_scratch(global uint *scratch)
+{
+ scratch[get_local_id(0) + get_group_id(0)*get_local_size(0)] = 0;
+}
+
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(1, 1, 1))) void kernel rebraid_chase_instance_pointers(global struct GRL_RAYTRACING_INSTANCE_DESC *instances_out,
+ global void *instance_buff)
+{
+ global const struct GRL_RAYTRACING_INSTANCE_DESC **instances_in =
+ (global const struct GRL_RAYTRACING_INSTANCE_DESC **)instance_buff;
+
+ instances_out[get_local_id(0)] = *instances_in[get_local_id(0)];
+}
+
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(1, 1, 1)))
+void kernel rebraid_chase_instance_pointers_indirect(
+ global struct GRL_RAYTRACING_INSTANCE_DESC* instances_out,
+ global void* instance_buff,
+ global struct IndirectBuildRangeInfo const* const indirect_data)
+{
+ instance_buff = ((global char*)instance_buff) + indirect_data->primitiveOffset;
+ global const struct GRL_RAYTRACING_INSTANCE_DESC**
+ instances_in = (global const struct GRL_RAYTRACING_INSTANCE_DESC**)instance_buff;
+
+ instances_out[get_local_id(0)] = *instances_in[get_local_id(0)];
+}
+
+///////////////////////////////////////////////////////////////////////////////////////////
+// Count splits
+///////////////////////////////////////////////////////////////////////////////////////////
+
+GRL_INLINE void DEBUG_SUBGROUP_print_split_counts( uniform uint instanceID, varying uint split_counts_lo, varying uint split_counts_hi )
+{
+ uniform uint vals[32] = {
+ sub_group_broadcast( split_counts_lo, 0 ), sub_group_broadcast( split_counts_lo, 1 ),
+ sub_group_broadcast( split_counts_lo, 2 ), sub_group_broadcast( split_counts_lo, 3 ),
+ sub_group_broadcast( split_counts_lo, 4 ), sub_group_broadcast( split_counts_lo, 5 ),
+ sub_group_broadcast( split_counts_lo, 6 ), sub_group_broadcast( split_counts_lo, 7 ),
+ sub_group_broadcast( split_counts_lo, 8 ), sub_group_broadcast( split_counts_lo, 9 ),
+ sub_group_broadcast( split_counts_lo, 10 ), sub_group_broadcast( split_counts_lo, 11 ),
+ sub_group_broadcast( split_counts_lo, 12 ), sub_group_broadcast( split_counts_lo, 13 ),
+ sub_group_broadcast( split_counts_lo, 14 ), sub_group_broadcast( split_counts_lo, 15 ),
+
+ sub_group_broadcast( split_counts_hi, 0 ), sub_group_broadcast( split_counts_hi, 1 ),
+ sub_group_broadcast( split_counts_hi, 2 ), sub_group_broadcast( split_counts_hi, 3 ),
+ sub_group_broadcast( split_counts_hi, 4 ), sub_group_broadcast( split_counts_hi, 5 ),
+ sub_group_broadcast( split_counts_hi, 6 ), sub_group_broadcast( split_counts_hi, 7 ),
+ sub_group_broadcast( split_counts_hi, 8 ), sub_group_broadcast( split_counts_hi, 9 ),
+ sub_group_broadcast( split_counts_hi, 10 ), sub_group_broadcast( split_counts_hi, 11 ),
+ sub_group_broadcast( split_counts_hi, 12 ), sub_group_broadcast( split_counts_hi, 13 ),
+ sub_group_broadcast( split_counts_hi, 14 ), sub_group_broadcast( split_counts_hi, 15 )
+ };
+
+ if ( get_sub_group_local_id() == 0 )
+ {
+ printf(
+ "Instance: %4u "
+ "%2u %2u %2u %2u %2u %2u %2u %2u %2u %2u %2u %2u %2u %2u %2u %2u "
+ "%2u %2u %2u %2u %2u %2u %2u %2u %2u %2u %2u %2u %2u %2u %2u %2u \n"
+ ,
+ instanceID,
+ vals[0], vals[1], vals[2], vals[3], vals[4], vals[5], vals[6], vals[7],
+ vals[8], vals[9], vals[10], vals[11], vals[12], vals[13], vals[14], vals[15],
+ vals[16], vals[17], vals[18], vals[19], vals[20], vals[21], vals[22], vals[23],
+ vals[24], vals[25], vals[26], vals[27], vals[28], vals[29], vals[30], vals[31]
+ );
+ }
+}
+
+GRL_INLINE void do_rebraid_count_splits_SG(
+ uniform global struct BVHBase* bvh,
+ uniform global __const struct GRL_RAYTRACING_INSTANCE_DESC *instances,
+ uniform global uint *rebraid_scratch)
+{
+ uniform const uint instanceID = get_sub_group_global_id();
+ uniform RebraidBuffers buffers = cast_rebraid_buffers(rebraid_scratch,instanceID);
+
+ varying uint lane = get_sub_group_local_id();
+ varying uint2 splits = SUBGROUP_count_instance_splits(&bvh->Meta.bounds, instances + instanceID);
+ varying uint split_counts_lo = splits.x;
+ varying uint split_counts_hi = splits.y;
+
+ // write this instance's per-bin counts
+ global uint* counts = buffers.instance_bin_counts;
+ intel_sub_group_block_write2( counts, splits );
+
+ // update the per-bin split and instance counters
+ if (split_counts_lo > 0)
+ {
+ atomic_add(&buffers.bin_split_counts[lane], split_counts_lo);
+ GRL_ATOMIC_INC(&buffers.bin_instance_counts[lane]);
+ }
+ if (split_counts_hi > 0)
+ {
+ atomic_add(&buffers.bin_split_counts[lane + 16], split_counts_hi);
+ GRL_ATOMIC_INC(&buffers.bin_instance_counts[lane + 16]);
+ }
+}
+
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(16, 1, 1)))
+__attribute__((intel_reqd_sub_group_size(16))) void kernel
+rebraid_count_splits_SG(
+ uniform global struct BVHBase* bvh,
+ uniform global __const struct GRL_RAYTRACING_INSTANCE_DESC *instances,
+ uniform global uint *rebraid_scratch)
+{
+ do_rebraid_count_splits_SG(bvh, instances, rebraid_scratch);
+}
+
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(16, 1, 1)))
+__attribute__((intel_reqd_sub_group_size(16))) void kernel
+rebraid_count_splits_SG_indirect(
+ uniform global struct BVHBase* bvh,
+ uniform global __const struct GRL_RAYTRACING_INSTANCE_DESC *instances,
+ uniform global uint *rebraid_scratch,
+ global struct IndirectBuildRangeInfo const * const indirect_data)
+{
+ instances = (global __const struct GRL_RAYTRACING_INSTANCE_DESC*)
+ (((global char*)instances) + indirect_data->primitiveOffset);
+ do_rebraid_count_splits_SG(bvh, instances, rebraid_scratch);
+}
+
+
+#define HEAP_SIZE 16
+#define COUNT_SPLITS_WG_SIZE 16
+
+struct SLMHeapNode
+{
+ short offs;
+ ushort area;
+};
+
+struct SLMHeap
+{
+ struct SLMHeapNode nodes[HEAP_SIZE];
+ ushort size;
+ ushort min_key;
+};
+
+GRL_INLINE bool SLMHeapNode_Greater( struct SLMHeapNode a, struct SLMHeapNode b )
+{
+ return a.area > b.area;
+}
+
+GRL_INLINE ushort SLMHeapNode_UnpackKey( struct SLMHeapNode a )
+{
+ return a.area;
+}
+
+GRL_INLINE void SLMHeapNode_Unpack( struct SLMHeapNode a, ushort* area_out, short* offs_out )
+{
+ *area_out = a.area;
+ *offs_out = a.offs;
+}
+
+GRL_INLINE struct SLMHeapNode SLMHeapNode_Pack( ushort area, short offs )
+{
+ struct SLMHeapNode n;
+ n.offs = offs;
+ n.area = area;
+ return n;
+}
+
+
+GRL_INLINE void SLMHeap_Init( struct SLMHeap* heap )
+{
+ heap->size = 0;
+ heap->min_key = 0xffff;
+}
+
+GRL_INLINE bool SLMHeap_empty( struct SLMHeap* heap )
+{
+ return heap->size == 0;
+}
+
+GRL_INLINE bool SLMHeap_full( struct SLMHeap* heap )
+{
+ return heap->size == HEAP_SIZE;
+}
+
+
+GRL_INLINE void SLMHeap_push( struct SLMHeap* heap, ushort area, short offs )
+{
+ ushort insert_pos;
+ if ( SLMHeap_full( heap ) )
+ {
+ ushort current_min_key = heap->min_key;
+ if ( area <= current_min_key )
+ return; // don't push stuff that's smaller than the current minimum
+
+ // search for the minimum element
+ // The heap is laid out in level order, so it is sufficient to search only the last half
+ ushort last_leaf = HEAP_SIZE - 1;
+ ushort first_leaf = (last_leaf / 2) + 1;
+
+ // as we search, keep track of what the new min-key will be so we can cull future pushes
+ ushort new_min_key = area;
+ ushort min_pos = 0;
+
+ do
+ {
+ ushort idx = first_leaf++;
+
+ ushort current_key = SLMHeapNode_UnpackKey( heap->nodes[idx] );
+ bool found_min_pos = (min_pos == 0) && (current_key == current_min_key);
+
+ if ( found_min_pos )
+ min_pos = idx;
+ else
+ new_min_key = min( current_key, new_min_key );
+
+ } while ( first_leaf != last_leaf );
+
+ heap->min_key = new_min_key;
+ insert_pos = min_pos;
+ }
+ else
+ {
+ insert_pos = heap->size++;
+ heap->min_key = min( area, heap->min_key );
+ }
+
+ heap->nodes[insert_pos] = SLMHeapNode_Pack( area, offs );
+
+ // heap-up
+ while ( insert_pos )
+ {
+ ushort parent = insert_pos / 2;
+
+ struct SLMHeapNode parent_node = heap->nodes[parent];
+ struct SLMHeapNode current_node = heap->nodes[insert_pos];
+ if ( SLMHeapNode_Greater( parent_node, current_node ) )
+ break;
+
+ heap->nodes[insert_pos] = parent_node;
+ heap->nodes[parent] = current_node;
+ insert_pos = parent;
+ }
+
+}
+
+bool SLMHeap_pop_max( struct SLMHeap* heap, ushort* area_out, short* offs_out )
+{
+ if ( SLMHeap_empty( heap ) )
+ return false;
+
+ SLMHeapNode_Unpack( heap->nodes[0], area_out, offs_out );
+
+ // heap down
+ ushort size = heap->size;
+ ushort idx = 0;
+ do
+ {
+ ushort left = 2 * idx + 1;
+ ushort right = 2 * idx + 2;
+ if ( left >= size )
+ break;
+
+ if ( right >= size )
+ {
+ heap->nodes[idx] = heap->nodes[left];
+ break;
+ }
+
+ struct SLMHeapNode left_node = heap->nodes[left];
+ struct SLMHeapNode right_node = heap->nodes[right];
+ bool go_left = SLMHeapNode_Greater( left_node, right_node );
+ heap->nodes[idx] = go_left ? left_node : right_node;
+ idx = go_left ? left : right;
+
+ } while ( 1 );
+
+ heap->size = size - 1;
+ return true;
+}
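+
+// Small usage sketch (illustrative values): pushing areas 5, 9 and 3 leaves 9
+// at nodes[0], so SLMHeap_pop_max returns 9 first; a push onto a full heap is
+// dropped unless its area exceeds min_key, in which case it overwrites the
+// leaf currently holding the tracked minimum.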
+
+void SLMHeap_Print( struct SLMHeap* heap )
+{
+ printf( " size=%u min=%u {", heap->size, heap->min_key );
+ for ( uint i = 0; i < heap->size; i++ )
+ printf( "%04x:%04x", heap->nodes[i].area, heap->nodes[i].offs );
+}
+
+
+GRL_INLINE bool can_open_root(
+ global struct BVHBase* bvh_base,
+ const struct GRL_RAYTRACING_INSTANCE_DESC* instance
+ )
+{
+ float3 root_lower = AABB3f_load_lower( &bvh_base->Meta.bounds );
+ float3 root_upper = AABB3f_load_upper( &bvh_base->Meta.bounds );
+ if ( !is_aabb_valid( root_lower, root_upper ) || GRL_get_InstanceMask(instance) == 0 )
+ return false;
+
+ global InternalNode* node = get_node( bvh_base, 0 );
+ if ( node->nodeType != NODE_TYPE_INTERNAL )
+ return false;
+
+ return is_node_openable( node );
+}
+
+
+GRL_INLINE void count_instance_splits(
+ global struct AABB3f* geometry_bounds,
+ global __const struct GRL_RAYTRACING_INSTANCE_DESC* instance,
+ local ushort* bin_split_counts,
+ local struct SLMHeap* heap
+)
+{
+ global BVHBase* bvh_base = (global BVHBase*)instance->AccelerationStructure;
+
+ SLMHeap_Init( heap );
+
+ float relative_area_scale = 1.0f / AABB3f_halfArea( geometry_bounds );
+ float3 root_lower = AABB3f_load_lower( &bvh_base->Meta.bounds );
+ float3 root_upper = AABB3f_load_upper( &bvh_base->Meta.bounds );
+
+ ushort quantized_area = quantize_area( transformed_aabb_halfArea( root_lower, root_upper, instance->Transform ) * relative_area_scale );
+ short node_offs = 0;
+ ushort num_splits = 0;
+
+ global InternalNode* node_array = get_node( bvh_base, 0 );
+
+ while ( 1 )
+ {
+ global InternalNode* node = node_array + node_offs;
+
+ // count this split
+ uint bin = get_rebraid_bin_index( quantized_area, NUM_REBRAID_BINS );
+ bin_split_counts[bin]++;
+
+ // open this node and push children to heap
+
+        // TODO_OPT: Restructure this control flow to prevent different lanes from skipping different loop iterations and diverging
+        // TODO_OPT: Precompute openability masks in BLAS nodes at build time... one bit for self and one bit per child
+ int offs = node->childOffset;
+ for ( ushort i = 0; i < NUM_CHILDREN; i++ )
+ {
+ if ( InternalNode_IsChildValid( node, i ) )
+ {
+ if ( offs >= SHRT_MIN && offs <= SHRT_MAX )
+ {
+ if ( is_node_openable( node_array + offs ) )
+ {
+ ushort area = get_child_area( node, i, instance->Transform, relative_area_scale );
+ SLMHeap_push( heap, area, (short)offs );
+ }
+ }
+ }
+ offs += InternalNode_GetChildBlockIncr( node, i );
+ }
+
+ num_splits++;
+ if ( num_splits == MAX_SPLITS_PER_INSTANCE )
+ break;
+
+ if ( !SLMHeap_pop_max( heap, &quantized_area, &node_offs ) )
+ break;
+ }
+
+}
+
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__( (reqd_work_group_size( COUNT_SPLITS_WG_SIZE, 1, 1 )) )
+void kernel
+rebraid_count_splits(
+ global struct BVHBase* bvh_base,
+ global __const struct GRL_RAYTRACING_INSTANCE_DESC* instances,
+ global uint* rebraid_scratch,
+ uint num_instances
+ )
+{
+ local struct SLMHeap heap[COUNT_SPLITS_WG_SIZE];
+ local ushort split_counts[COUNT_SPLITS_WG_SIZE][NUM_REBRAID_BINS];
+
+    // zero this work-item's per-bin split counters
+ // TODO_OPT: transpose this and subgroup-vectorize it so that
+ // block-writes can be used
+ for ( uint i = 0; i < NUM_REBRAID_BINS; i++ )
+ split_counts[get_local_id( 0 )][i] = 0;
+
+
+ // count splits for this thread's instance
+ uniform uint base_instance = get_group_id( 0 ) * get_local_size( 0 );
+ uint instanceID = base_instance + get_local_id( 0 );
+
+ if ( instanceID < num_instances )
+ {
+ global BVHBase* bvh_base = (global BVHBase*)instances[instanceID].AccelerationStructure;
+ if ( can_open_root( bvh_base, &instances[instanceID] ) )
+ {
+ count_instance_splits( &bvh_base->Meta.bounds,
+ &instances[instanceID],
+ &split_counts[get_local_id( 0 )][0],
+ &heap[get_local_id(0)] );
+ }
+ }
+
+ barrier( CLK_LOCAL_MEM_FENCE );
+
+ RebraidBuffers buffers = cast_rebraid_buffers( rebraid_scratch, instanceID );
+
+
+ // reduce bins
+ for ( uint bin = get_local_id( 0 ); bin < NUM_REBRAID_BINS; bin += get_local_size( 0 ) )
+ {
+ // TODO_OPT: There's probably a better way to arrange this computation
+ uint bin_split_count = 0;
+ uint bin_instance_count = 0;
+ for ( uint i = 0; i < COUNT_SPLITS_WG_SIZE; i++ )
+ {
+ uint s = split_counts[i][bin];
+ bin_split_count += s;
+ bin_instance_count += (s > 0) ? 1 : 0;
+ }
+
+ if ( bin_split_count > 0 )
+ {
+ atomic_add( &buffers.bin_split_counts[bin], bin_split_count );
+ atomic_add( &buffers.bin_instance_counts[bin], bin_instance_count );
+ }
+ }
+
+ // write out bin counts for each instance
+ for ( uniform uint i = get_sub_group_id(); i < COUNT_SPLITS_WG_SIZE; i += get_num_sub_groups() )
+ {
+ uniform uint iid = base_instance + i;
+        if ( iid >= num_instances )
+ break;
+
+ global uint* instance_bin_counts = cast_rebraid_buffers( rebraid_scratch, iid ).instance_bin_counts;
+
+ for ( uniform ushort j = 0; j < NUM_REBRAID_BINS; j += get_sub_group_size() )
+ {
+ uint count = split_counts[i][j + get_sub_group_local_id() ];
+ intel_sub_group_block_write( instance_bin_counts + j, count );
+ }
+ }
+
+}
+
+
+
+
+///////////////////////////////////////////////////////////////////////////////////////////
+// Build PrimRefs
+///////////////////////////////////////////////////////////////////////////////////////////
+
+GRL_INLINE uint get_instance_split_count(RebraidBuffers buffers, uint instanceID, uint available_splits)
+{
+ global uint* instance_desired_split_count = buffers.instance_bin_counts;
+ global uint *bin_split_counts = buffers.bin_split_counts;
+ global uint *bin_instance_counts = buffers.bin_instance_counts;
+
+ uint total_splits = 0;
+ uint remaining_available_splits = available_splits;
+ uint max_bin = 0;
+ uint desired_splits_this_bin = 0;
+ uint instance_splits = 0;
+
+ do
+ {
+ // stop when we reach a level where we can't satisfy the demand
+ desired_splits_this_bin = instance_desired_split_count[max_bin];
+ uint total_bin_splits = bin_split_counts[max_bin];
+
+ if (total_bin_splits > remaining_available_splits)
+ break;
+
+ // we have enough budget to give all instances everything they want at this level, so do it
+ remaining_available_splits -= total_bin_splits;
+ instance_splits += desired_splits_this_bin;
+ desired_splits_this_bin = 0;
+ max_bin++;
+
+ } while (max_bin < NUM_REBRAID_BINS);
+
+ if (max_bin < NUM_REBRAID_BINS)
+ {
+ // we have more split demand than we have splits available. The current bin is the last one that gets any splits
+ // distribute the leftovers as evenly as possible to instances that want them
+ if (desired_splits_this_bin > 0)
+ {
+ // this instance wants splits. how many does it want?
+ uint desired_total = instance_splits + desired_splits_this_bin;
+
+ // distribute to all instances as many as possible
+ uint count = bin_instance_counts[max_bin];
+ uint whole = remaining_available_splits / count;
+ remaining_available_splits -= whole * count;
+
+ // distribute remainder to lower numbered instances
+ size_t partial = (instanceID < remaining_available_splits) ? 1 : 0;
+
+ // give the instance its share.
+ instance_splits += whole + partial;
+ instance_splits = min(instance_splits, desired_total); // don't give it more than it needs
+ }
+ }
+
+ return instance_splits;
+}
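+
+// Worked example with illustrative numbers: available_splits == 100, per-bin
+// totals bin_split_counts == {40, 30, 50, ...} and this instance's demand
+// instance_bin_counts == {10, 5, 20, ...}. Bins 0 and 1 fit the budget
+// (remaining 100 -> 60 -> 30, instance_splits == 15); bin 2 does not
+// (50 > 30), so the 30 leftover splits are spread over the
+// bin_instance_counts[2] instances wanting bin-2 splits: with 25 such
+// instances each gets 30/25 == 1, instanceIDs 0..4 get one extra, and the
+// result is capped at this instance's total demand of 35.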
+
+GRL_INLINE void build_unopened_primref(
+ struct AABB3f* centroid_bounds,
+ global __const BVHBase *bvh_base,
+ global volatile uint *primref_counter,
+ global struct AABB *primref_buffer,
+ global __const float *Transform,
+ uint instanceID,
+ float matOverhead,
+ ushort instanceMask)
+{
+ float3 root_lower = AABB3f_load_lower(&bvh_base->Meta.bounds);
+ float3 root_upper = AABB3f_load_upper(&bvh_base->Meta.bounds);
+
+ struct AABB primRef;
+ AABB_init( &primRef );
+
+ uint bvhoffset = (uint)BVH_ROOT_NODE_OFFSET;
+ if (is_aabb_valid(root_lower, root_upper) && instanceMask != 0)
+ {
+ primRef = AABBfromAABB3f(compute_xfm_bbox(Transform, BVHBase_GetRootNode(bvh_base), XFM_BOX_NOT_REFINED_TAKE_CLIPBOX, &bvh_base->Meta.bounds, matOverhead));
+ }
+ else
+ {
+ primRef.lower.x = Transform[3];
+ primRef.lower.y = Transform[7];
+ primRef.lower.z = Transform[11];
+ primRef.upper.xyz = primRef.lower.xyz;
+
+ instanceMask = 0;
+ bvhoffset = NO_NODE_OFFSET;
+ }
+
+ primRef.lower.w = as_float(instanceID | (instanceMask << 24));
+ primRef.upper.w = as_float(bvhoffset);
+
+ float3 centroid = primRef.lower.xyz + primRef.upper.xyz;
+ centroid_bounds->lower[0] = centroid.x;
+ centroid_bounds->upper[0] = centroid.x;
+ centroid_bounds->lower[1] = centroid.y;
+ centroid_bounds->upper[1] = centroid.y;
+ centroid_bounds->lower[2] = centroid.z;
+ centroid_bounds->upper[2] = centroid.z;
+
+ uint place = GRL_ATOMIC_INC(primref_counter);
+ primref_buffer[place] = primRef;
+}
+
+GRL_INLINE void build_opened_primrefs(
+ varying bool lane_mask,
+ varying uint offset,
+ varying InternalNode* node,
+ varying struct AABB3f* centroid_bounds,
+ uniform global BVHBase *bvh_base,
+ uniform volatile global uint *primref_counter,
+ uniform global struct AABB *primref_buffer,
+ uniform uint instanceID,
+ uniform const float *Transform,
+ uniform float matOverhead,
+ varying ushort instanceMask)
+{
+ // TODO_OPT: This function is often called with <= 6 active lanes
+ // If lanes are sparse, consider jumping to a sub-group vectorized variant...
+
+ if (lane_mask)
+ {
+ varying uint place = GRL_ATOMIC_INC(primref_counter);
+
+ struct AABB box = AABBfromAABB3f(compute_xfm_bbox(Transform, node, XFM_BOX_NOT_REFINED_CLIPPED, &bvh_base->Meta.bounds, matOverhead));
+
+ box.lower.w = as_float(instanceID | (instanceMask << 24));
+ box.upper.w = as_float(offset * 64 + (uint)BVH_ROOT_NODE_OFFSET);
+ primref_buffer[place] = box;
+
+ AABB3f_extend_point( centroid_bounds, box.lower.xyz + box.upper.xyz );
+ }
+}
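+
+// PrimRef encoding shared by the two builders above: lower.w packs the
+// instance index in bits 0..23 and the instance mask in bits 24..31 (e.g.
+// instanceID 7 with mask 0xFF gives 0xFF000007), while upper.w holds the byte
+// offset of the entry node inside the BLAS: BVH_ROOT_NODE_OFFSET for an
+// unopened root, 64*offset past it for an opened child, or NO_NODE_OFFSET for
+// a degenerate/null instance.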
+
+
+GRL_INLINE void SUBGROUP_open_nodes(
+ uniform global struct AABB3f *geometry_bounds,
+ uniform uint split_limit,
+ uniform global __const struct GRL_RAYTRACING_INSTANCE_DESC *instance,
+ uniform uint instanceID,
+ uniform volatile global uint *primref_counter,
+ uniform global struct AABB *primref_buffer,
+ varying struct AABB3f* centroid_bounds,
+ float transformOverhead)
+{
+ uniform SGHeap heap;
+ SGHeap_init(&heap);
+
+ uniform float relative_area_scale = 1.0f / AABB3f_halfArea(geometry_bounds);
+ uniform global BVHBase *bvh_base = (global BVHBase *)instance->AccelerationStructure;
+
+ uniform uint16_t node_offs = 0;
+ varying uint lane = get_sub_group_local_id();
+
+ uniform InternalNode* node_array = get_node( bvh_base, 0 );
+
+ LOOP_TRIPWIRE_INIT;
+
+ while ( 1 )
+ {
+ uniform InternalNode *node = node_array + node_offs;
+
+ varying uint sg_offs = node_offs + SUBGROUP_get_child_offsets(node);
+ varying bool sg_valid = false;
+ varying bool sg_openable = false;
+ if (lane < NUM_CHILDREN)
+ {
+ sg_valid = InternalNode_IsChildValid(node, lane);
+ if (sg_valid && (sg_offs <= MAX_NODE_OFFSET))
+ {
+ sg_openable = is_node_openable( node_array + sg_offs);
+ }
+ }
+
+ uniform uint16_t valid_children = intel_sub_group_ballot(sg_valid);
+ uniform uint16_t openable_children = intel_sub_group_ballot(sg_openable);
+ uniform uint16_t unopenable_children = valid_children & (~openable_children);
+
+ if ( openable_children )
+ {
+ varying uint16_t sg_area = SUBGROUP_get_child_areas( node, instance->Transform, relative_area_scale );
+
+ // try to push all openable children to the heap
+ if ( !SGHeap_full( &heap ) )
+ {
+ openable_children = SGHeap_vectorized_push( &heap, sg_area, sg_offs, openable_children );
+ }
+
+ // we have more openable children than will fit in the heap
+ // process these one by one.
+ // TODO: Try re-writing with sub_group_any() and see if compiler does a better job
+ while ( openable_children )
+ {
+ // pop min element
+ uniform uint16_t min_area;
+ uniform uint16_t min_offs;
+ SGHeap_full_pop_min( &heap, &min_area, &min_offs );
+
+ // eliminate all children smaller than heap minimum.
+ // mark eliminated children as unopenable
+ varying uint culled_children = openable_children & intel_sub_group_ballot( sg_area <= min_area );
+ unopenable_children ^= culled_children;
+ openable_children &= ~culled_children;
+
+ if ( openable_children )
+ {
+ // if any children survived the purge
+ // find the first such child and swap its offset for the one from the heap
+ //
+ uniform uint child_id = ctz( openable_children );
+ uniform uint16_t old_min_offs = min_offs;
+ min_area = sub_group_broadcast( sg_area, child_id );
+ min_offs = sub_group_broadcast( sg_offs, child_id );
+
+ if ( lane == child_id )
+ sg_offs = old_min_offs;
+
+ openable_children ^= (1 << child_id);
+ unopenable_children ^= (1 << child_id);
+ }
+
+ SGHeap_push_and_fill( &heap, min_area, min_offs );
+
+ }
+ }
+
+ if (unopenable_children)
+ {
+ varying bool sg_create_primref = ((1 << lane) & unopenable_children);
+ build_opened_primrefs(sg_create_primref, sg_offs, node_array + sg_offs, centroid_bounds, bvh_base, primref_counter, primref_buffer, instanceID, instance->Transform, transformOverhead, GRL_get_InstanceMask(instance));
+ }
+
+ --split_limit;
+ if (split_limit == 0)
+ {
+ // split limit exceeded
+ // create primrefs for all remaining openable nodes in heap
+ varying bool sg_mask = SGHeap_get_lane_mask(&heap);
+ sg_offs = SGHeap_get_lane_values(&heap);
+ build_opened_primrefs(sg_mask, sg_offs, node_array + sg_offs, centroid_bounds, bvh_base, primref_counter, primref_buffer, instanceID, instance->Transform, transformOverhead, GRL_get_InstanceMask(instance));
+
+ break;
+ }
+
+
+ // NOTE: the heap should never be empty. If it is, the instance was given too many splits.
+
+ // get next node from heap
+ uint16_t quantized_area;
+ SGHeap_pop_max(&heap, &quantized_area, &node_offs);
+
+ LOOP_TRIPWIRE_INCREMENT( 500, "rebraid_build_primrefs" );
+
+ }
+}
+
+
+#define OPEN_QUEUE_SIZE 256
+#define OPEN_QUEUE_NUM_SGS 16
+
+typedef struct OpenQueueEntry
+{
+ uint instanceID;
+ ushort num_splits;
+} OpenQueueEntry;
+
+typedef struct OpenQueue
+{
+ uint num_produced;
+ uint num_consumed;
+ OpenQueueEntry Q[OPEN_QUEUE_SIZE];
+} OpenQueue;
+
+uniform uint SUBGROUP_GetNextQueueEntry( local OpenQueue* queue )
+{
+ uint next = 0;
+ if ( get_sub_group_local_id() == 0 )
+ next = GRL_ATOMIC_INC( &queue->num_consumed );
+ return sub_group_broadcast( next, 0 );
+}
+
+
+GRL_INLINE void do_rebraid_build_primrefs(
+ local struct AABB3f* SLM_CentroidBounds,
+ local OpenQueue* SLM_Q,
+ global struct Globals* globals,
+ global struct BVHBase* base,
+ global __const struct GRL_RAYTRACING_INSTANCE_DESC* instance_buffer,
+ global uint* rebraid_scratch,
+ global struct AABB* primref_buffer,
+ uint extra_primref_count,
+ uint num_instances)
+{
+ varying uint instanceID = get_sub_group_size() * get_sub_group_global_id() + get_sub_group_local_id();
+
+ uniform volatile global uint* primref_counter = &globals->numPrimitives;
+ uniform RebraidBuffers buffers = cast_rebraid_buffers( rebraid_scratch, instanceID );
+ uniform uint available_splits = get_num_splits( extra_primref_count, NUM_CHILDREN );
+
+
+
+ varying struct AABB3f centroidBounds;
+ AABB3f_init( &centroidBounds );
+
+ if ( get_local_id( 0 ) == 0 )
+ {
+ SLM_Q->num_produced = 0;
+ SLM_Q->num_consumed = 0;
+ AABB3f_init( SLM_CentroidBounds );
+ }
+
+ barrier( CLK_LOCAL_MEM_FENCE );
+
+    // assign a split budget to each instance; build primrefs in vectorized form for instances that receive no splits
+ varying uint num_splits = 0;
+ if ( instanceID < num_instances )
+ {
+ num_splits = get_instance_split_count( buffers, instanceID, available_splits );
+ if ( num_splits == 0 )
+ {
+ varying global const struct GRL_RAYTRACING_INSTANCE_DESC* instance = instance_buffer + instanceID;
+ varying global BVHBase* bvh_base = (global BVHBase*)instance->AccelerationStructure;
+ if ( bvh_base != 0 )
+ {
+ build_unopened_primref( &centroidBounds, bvh_base, primref_counter, primref_buffer, instance->Transform, instanceID, 0.0f, GRL_get_InstanceMask(instance));
+ }
+ }
+ else
+ {
+ // defer opened instances
+ uint place = GRL_ATOMIC_INC( &SLM_Q->num_produced );
+ SLM_Q->Q[place].instanceID = instanceID;
+ SLM_Q->Q[place].num_splits = (ushort)num_splits;
+ }
+ }
+
+ barrier( CLK_LOCAL_MEM_FENCE );
+
+ // if there were opened instances, process them, one per subgroup
+ uniform uint num_produced = SLM_Q->num_produced;
+ uniform uint next = SUBGROUP_GetNextQueueEntry( SLM_Q );
+
+ while ( next < num_produced )
+ {
+ uniform uint instanceID = SLM_Q->Q[next].instanceID;
+ uniform uint num_splits = SLM_Q->Q[next].num_splits;
+
+ uniform global const struct GRL_RAYTRACING_INSTANCE_DESC* instance = instance_buffer + instanceID;
+
+ float transformOverhead =
+#if FINE_TRANSFORM_NODE_BOX
+ transformation_bbox_surf_overhead(instance->Transform);
+#else
+ 0.0f;
+#endif
+
+ SUBGROUP_open_nodes(
+ &base->Meta.bounds,
+ num_splits,
+ instance,
+ instanceID,
+ primref_counter,
+ primref_buffer,
+ &centroidBounds,
+ transformOverhead);
+
+ next = SUBGROUP_GetNextQueueEntry( SLM_Q );
+ }
+
+ // reduce the centroid bounds AABB
+ struct AABB3f reduced = AABB3f_sub_group_reduce( &centroidBounds );
+ if ( get_sub_group_local_id() == 0 )
+ AABB3f_atomic_merge_localBB_nocheck( SLM_CentroidBounds, &reduced );
+
+ barrier( CLK_LOCAL_MEM_FENCE );
+
+ if( get_local_id(0) == 0 )
+ {
+ atomic_min( (global float*) (&globals->centroidBounds.lower) + 0, SLM_CentroidBounds->lower[0] );
+ atomic_min( (global float*) (&globals->centroidBounds.lower) + 1, SLM_CentroidBounds->lower[1] );
+ atomic_min( (global float*) (&globals->centroidBounds.lower) + 2, SLM_CentroidBounds->lower[2] );
+ atomic_max( (global float*) (&globals->centroidBounds.upper) + 0, SLM_CentroidBounds->upper[0] );
+ atomic_max( (global float*) (&globals->centroidBounds.upper) + 1, SLM_CentroidBounds->upper[1] );
+ atomic_max( (global float*) (&globals->centroidBounds.upper) + 2, SLM_CentroidBounds->upper[2] );
+ }
+}
+
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__( (reqd_work_group_size( OPEN_QUEUE_SIZE, 1, 1 )) )
+__attribute__( (intel_reqd_sub_group_size( 16 )) )
+void kernel rebraid_build_primrefs(
+ global struct Globals* globals,
+ global struct BVHBase* base,
+ global __const struct GRL_RAYTRACING_INSTANCE_DESC* instance_buffer,
+ global uint* rebraid_scratch,
+ global struct AABB* primref_buffer,
+ uint extra_primref_count,
+ uint num_instances)
+{
+ local struct AABB3f SLM_CentroidBounds;
+ local OpenQueue SLM_Q;
+ do_rebraid_build_primrefs(
+ &SLM_CentroidBounds,
+ &SLM_Q,
+ globals,
+ base,
+ instance_buffer,
+ rebraid_scratch,
+ primref_buffer,
+ extra_primref_count,
+ num_instances);
+}
+
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__( (reqd_work_group_size( OPEN_QUEUE_SIZE, 1, 1 )) )
+__attribute__( (intel_reqd_sub_group_size( 16 )) )
+void kernel rebraid_build_primrefs_indirect(
+ global struct Globals* globals,
+ global struct BVHBase* base,
+ global __const struct GRL_RAYTRACING_INSTANCE_DESC* instance_buffer,
+ global uint* rebraid_scratch,
+ global struct AABB* primref_buffer,
+ global struct IndirectBuildRangeInfo const * const indirect_data,
+ uint extra_primref_count )
+{
+ local struct AABB3f SLM_CentroidBounds;
+ local OpenQueue SLM_Q;
+
+ instance_buffer = (global __const struct GRL_RAYTRACING_INSTANCE_DESC*)
+ (((global char*)instance_buffer) + indirect_data->primitiveOffset);
+
+ do_rebraid_build_primrefs(
+ &SLM_CentroidBounds,
+ &SLM_Q,
+ globals,
+ base,
+ instance_buffer,
+ rebraid_scratch,
+ primref_buffer,
+ extra_primref_count,
+ indirect_data->primitiveCount);
+}
+
+
+///////////////////////////////////////////////////////////////////////////////////////////
+// Misc
+///////////////////////////////////////////////////////////////////////////////////////////
+
+
+
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(16, 1, 1)))
+__attribute__((intel_reqd_sub_group_size(16))) void kernel
+ISA_TEST(global InternalNode *n, global uint *out, global float *xform, float scale)
+{
+
+ out[get_sub_group_local_id()] = InternalNode_IsChildValid(n, get_sub_group_local_id());
+}
+
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__( (reqd_work_group_size( 1, 1, 1 )) ) void kernel
+DEBUG_PRINT(
+ global struct Globals* globals,
+ global __const struct GRL_RAYTRACING_INSTANCE_DESC* instance_buffer,
+ global uint* rebraid_scratch,
+ global struct AABB* primref_buffer,
+ dword num_extra,
+ dword input_instances )
+{
+#if 0
+ // validate primrefs
+ if ( (get_local_id(0) + get_group_id(0)*get_local_size(0)) == 0 )
+ {
+ uint refs = globals->numPrimitives;
+ for ( uint i = 0; i < refs; i++ )
+ {
+ if ( any( primref_buffer[i].lower.xyz < globals->geometryBounds.lower.xyz ) ||
+ any( primref_buffer[i].upper.xyz > globals->geometryBounds.upper.xyz ) ||
+ any( isnan(primref_buffer[i].lower.xyz) ) ||
+ any( isnan(primref_buffer[i].upper.xyz) ) )
+ {
+ struct AABB box = primref_buffer[i];
+ printf( "BAD BOX: %u {%f,%f,%f} {%f,%f,%f} %u\n", as_uint( box.lower.w ),
+ box.lower.x, box.lower.y, box.lower.z,
+ box.upper.x, box.upper.y, box.upper.z,
+ as_uint( box.lower.w ) );
+ }
+
+ const uint instIndex = PRIMREF_instanceID(&primref_buffer[i]); // TODO: Refactor me. We should not be using struct AABB for primRefs
+ const uint rootByteOffset = as_uint( primref_buffer[i].upper.w ); // It should be struct PrimRef
+ if ( instIndex >= input_instances )
+ printf( "BAD INSTANCE INDEX: %u", i );
+ else
+ {
+ global struct BVHBase* blas = (global struct BVHBase*)instance_buffer[instIndex].AccelerationStructure;
+ if ( blas )
+ {
+ struct InternalNode* start = BVHBase_GetInternalNodes( blas );
+ struct InternalNode* end = BVHBase_GetInternalNodesEnd( blas );
+
+ InternalNode* entryPoint = (struct InternalNode*)((char*)instance_buffer[instIndex].AccelerationStructure + rootByteOffset);
+ if ( entryPoint < start || entryPoint >= end )
+ printf( "BAD ENTRYPOINT: %u\n", i );
+ if ( (rootByteOffset & 63) != 0 )
+ printf( "MISALIGNED ENTRYPOInt: %u\n", i );
+
+ }
+ }
+ }
+ }
+#endif
+#if 0
+ if ( (get_local_id(0) + get_group_id(0)*get_local_size(0)) == 0 )
+ printf( "REBRAIDED: %u\n", globals->numPrimitives );
+
+ // print instance bin information
+ if ( (get_local_id(0) + get_group_id(0)*get_local_size(0)) == 0 )
+ {
+ printf( "REBRAIDED: %u\n", globals->numPrimitives );
+ for( uint i=0; i<231; i++ )
+ {
+ RebraidBuffers buffers = cast_rebraid_buffers( rebraid_scratch,i );
+ printf( " ID:%4u ", i );
+ for ( uint j = 0; j < NUM_REBRAID_BINS; j++ )
+ {
+ global uint* count = buffers.instance_bin_counts;
+ printf( " %2u ", count[j] );
+ }
+ printf( "\n" );
+ }
+ }
+#endif
+#if 0
+ if ( (get_local_id(0) + get_group_id(0)*get_local_size(0)) == 0 )
+ {
+ printf( "Instances: %u\n", globals->numPrimitives );
+
+ for ( uint i = 0; i < globals->numPrimitives; i++ )
+ {
+ if ( any( primref_buffer[i].lower.xyz < globals->geometryBounds.lower.xyz ) ||
+ any( primref_buffer[i].upper.xyz > globals->geometryBounds.upper.xyz ) )
+ {
+ struct AABB box = primref_buffer[i];
+ printf( " %u {%f,%f,%f} {%f,%f,%f} %u\n", as_uint( box.lower.w ),
+ box.lower.x, box.lower.y, box.lower.z,
+ box.upper.x, box.upper.y, box.upper.z,
+ as_uint( box.lower.w ) );
+ }
+
+ }
+ }
+#endif
+}
+
diff --git a/src/intel/vulkan/grl/gpu/common.h b/src/intel/vulkan/grl/gpu/common.h
new file mode 100644
index 00000000000..5fa0e117ae4
--- /dev/null
+++ b/src/intel/vulkan/grl/gpu/common.h
@@ -0,0 +1,429 @@
+//
+// Copyright (C) 2009-2021 Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+//
+//
+
+#pragma once
+
+#include "shared.h"
+#include "intrinsics.h"
+#include "AABB.h"
+#include "AABB3f.h"
+#include "qbvh6.h"
+
+/* ====== BVH_BUILDER config ====== */
+
+__constant const float cfg_intCost = 4.0f;
+__constant const float cfg_travCost = 1.0f;
+__constant const uint cfg_minLeafSize = BVH_LEAF_N_MIN;
+__constant const uint cfg_maxLeafSize = BVH_LEAF_N_MAX;
+__constant const uint cfg_maxDepth = BUILDRECORD_STACK_SIZE;
+
+#define ENABLE_CONVERSION_CHECKS 0
+
+#ifdef ENABLE_BIG_REG_ANNOTATION
+#define GRL_ANNOTATE_BIG_REG_REQ __attribute__((annotate("num-thread-per-eu 4")))
+#else
+#define GRL_ANNOTATE_BIG_REG_REQ
+#endif
+
+#ifdef ENABLE_IGC_DO_NOT_SPILL
+#define GRL_ANNOTATE_IGC_DO_NOT_SPILL __attribute__((annotate("igc-do-not-spill")))
+#else
+#define GRL_ANNOTATE_IGC_DO_NOT_SPILL
+#endif
+
+#define ERROR()
+
+/* =================================================================================================================================================== */
+/* =================================================================================================================================================== */
+/* =================================================================================================================================================== */
+/* =================================================================================================================================================== */
+
+GRL_INLINE unsigned int getNumLeafPrims(unsigned int offset)
+{
+ return (offset & 0x7) - 3;
+}
+
+GRL_INLINE unsigned int getLeafOffset(unsigned int offset)
+{
+ return offset & (~0x7);
+}
+
+GRL_INLINE float4 triangleNormal(const float4 v0, const float4 v1, const float4 v2)
+{
+ const float4 a = v1 - v0;
+ const float4 b = v2 - v0;
+ return cross(a, b);
+}
+
+GRL_INLINE float areaTriangle(const float4 v0, const float4 v1, const float4 v2)
+{
+ const float4 normal = triangleNormal(v0, v1, v2);
+ return length((float3)(normal.x, normal.y, normal.z)) * 0.5f;
+}
+
+GRL_INLINE float det2(const float2 a, const float2 b)
+{
+ return a.x * b.y - a.y * b.x;
+}
+
+GRL_INLINE float areaProjectedTriangle(const float4 v0, const float4 v1, const float4 v2)
+{
+ const float xy = 0.5f * fabs(det2(v1.xy - v0.xy, v2.xy - v0.xy));
+ const float yz = 0.5f * fabs(det2(v1.yz - v0.yz, v2.yz - v0.yz));
+ const float zx = 0.5f * fabs(det2(v1.zx - v0.zx, v2.zx - v0.zx));
+ return xy + yz + zx;
+}
+
+typedef struct Block64B {
+ char data[64];
+} Block64B __attribute__((aligned(64)));
+
+typedef char byte_align64B __attribute__((aligned(64)));
+
+/* ====================================================================== */
+/* ============================== GLOBALS =============================== */
+/* ====================================================================== */
+
+GRL_INLINE bool Globals_OnFinish(global struct Globals *globals)
+{
+ /* last active HW thread ? */
+ if (get_local_id(0) == 0)
+ {
+ const uint sync = atomic_add(&globals->sync, 1);
+ if (sync + 1 == get_num_groups(0))
+ {
+ globals->sync = 0;
+ return true;
+ }
+ }
+ return false;
+}
+
+GRL_INLINE uint BlockAllocator_BytesUsed(struct BlockAllocator *p)
+{
+ return p->cur - p->start;
+};
+
+GRL_INLINE uint BlockAllocator_Alloc(__global struct BlockAllocator *p, const uint size)
+{
+ return atomic_add(&p->cur, size);
+}
+
+GRL_INLINE uint BlockAllocator_Alloc_Single(__global struct BlockAllocator *p, const uint size)
+{
+ uint offset = 0;
+ if (get_sub_group_local_id() == 0)
+ offset = atomic_add(&p->cur, size);
+ return sub_group_broadcast(offset, 0);
+}
+
+// node allocation returns an offset from beginning of BVH to allocated node
+// in multiples of 64B
+GRL_INLINE uint allocate_inner_nodes(global struct BVHBase* base, uint num_nodes )
+{
+ return atomic_add_global( &base->nodeDataCur, num_nodes );
+}
+GRL_INLINE uint allocate_procedural_leaves(global struct BVHBase* base, uint num_nodes)
+{
+ return atomic_add_global(&base->proceduralDataCur, num_nodes);
+}
+
+GRL_INLINE uint allocate_quad_leaves(global struct BVHBase* base, uint num_nodes)
+{
+ return atomic_add_global(&base->quadLeafCur, num_nodes);
+}
+
+#if 0
+GRL_INLINE uint alloc_node_mem(global struct Globals *globals, const uint size)
+{
+ const uint aligned_size = ((size + 63) / 64) * 64; /* allocate in 64 bytes blocks */
+ return BlockAllocator_Alloc(&globals->node_mem_allocator, aligned_size);
+}
+
+GRL_INLINE uint alloc_single_node_mem(global struct Globals *globals, const uint size)
+{
+ const uint aligned_size = ((size + 63) / 64) * 64; /* allocate in 64 bytes blocks */
+ return BlockAllocator_Alloc_Single(&globals->node_mem_allocator, aligned_size);
+}
+
+GRL_INLINE uint alloc_quad_leaf_mem(global struct Globals *globals, const uint size)
+{
+ const uint aligned_size = ((size + 63) / 64) * 64; /* allocate in 64 bytes blocks */
+ return BlockAllocator_Alloc(&globals->quad_mem_allocator, aligned_size);
+}
+
+GRL_INLINE uint alloc_procedural_leaf_mem(global struct Globals *globals, const uint size)
+{
+ const uint aligned_size = ((size + 63) / 64) * 64; /* allocate in 64 bytes blocks */
+ return BlockAllocator_Alloc(&globals->procedural_mem_allocator, aligned_size);
+}
+#endif
+
+GRL_INLINE global struct BuildRecord *getBuildRecords(char *bvh_mem, struct Globals *globals)
+{
+ return (global struct BuildRecord *)(bvh_mem + globals->build_record_start);
+}
+
+/* ======================================================================= */
+/* ============================== TRIANGLE =============================== */
+/* ======================================================================= */
+
+/*GRL_INLINE void printTriangle(struct Triangle *t)
+{
+ printf("vtx[0] %d vtx[1] %d vtx[2] %d primID %d geomID %d \n",t->vtx[0],t->vtx[1],t->vtx[2],t->primID,t->geomID);
+ }*/
+
+/* ==================================================================== */
+/* ============================== SPLIT =============================== */
+/* ==================================================================== */
+
+GRL_INLINE void printSplit(struct Split *split)
+{
+ printf("split sah %f dim %d pos %d \n", split->sah, split->dim, split->pos);
+}
+
+/* ========================================================================== */
+/* ============================== BUILDRECORD =============================== */
+/* ========================================================================== */
+
+GRL_INLINE void initBuildRecord(struct BuildRecord *buildRecord, uint start, uint end)
+{
+ AABB_init(&buildRecord->centroidBounds);
+ buildRecord->start = start;
+ buildRecord->end = end;
+}
+
+GRL_INLINE void extendBuildRecord(struct BuildRecord *buildRecord, struct AABB *primref)
+{
+ AABB_extend_point(&buildRecord->centroidBounds, AABB_centroid2(primref));
+}
+
+GRL_INLINE uint getBuildRecursionDepth(struct BuildRecord *buildRecord)
+{
+ return as_uint(buildRecord->centroidBounds.upper.w);
+}
+
+GRL_INLINE void setBuildRecursionDepth(struct BuildRecord *buildRecord, uint depth)
+{
+ buildRecord->centroidBounds.upper.w = as_float(depth);
+}
+
+GRL_INLINE uint getNumPrimsBuildRecord(struct BuildRecord *buildRecord)
+{
+ return buildRecord->end - buildRecord->start;
+}
+
+/* ========================================================================== */
+/* =================== BinaryMortonCodeHierarchy ============================= */
+/* ========================================================================== */
+
+GRL_INLINE void BinaryMortonCodeHierarchy_init(struct BinaryMortonCodeHierarchy *record, uint start, uint end)
+{
+ record->range.start = start;
+ record->range.end = end;
+ record->leftChild = -1;
+ record->rightChild = -1;
+// record->flag = 0;
+}
+
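+// Node IDs with bit 31 set denote leaves; the remaining bits encode the start
+// of the primitive range. All other IDs index directly into the nodes[] array.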
+GRL_INLINE uint BinaryMortonCodeHierarchy_getNumPrimitives(global struct BinaryMortonCodeHierarchy *nodes, uint nodeID)
+{
+ /* leaf case */
+ if (nodeID & (uint)(1 << 31))
+ return 1;
+
+ /* inner node case*/
+ else
+ return nodes[nodeID].range.end - nodes[nodeID].range.start + 1;
+}
+
+GRL_INLINE struct BinaryMortonCodeHierarchy BinaryMortonCodeHierarchy_getEntry(global struct BinaryMortonCodeHierarchy* nodes, uint nodeID)
+{
+ struct BinaryMortonCodeHierarchy entry;
+
+ if (nodeID & (uint)(1 << 31)) {
+ /* leaf case */
+ uint rangeStart = nodeID ^ (uint)(1 << 31);
+ BinaryMortonCodeHierarchy_init(&entry, rangeStart, rangeStart);
+ }
+ else {
+ /* inner node case*/
+ entry = nodes[nodeID];
+ }
+
+ return entry;
+}
+
+GRL_INLINE uint BinaryMortonCodeHierarchy_getRangeStart(global struct BinaryMortonCodeHierarchy *nodes, uint nodeID)
+{
+ /* leaf case */
+ if (nodeID & (uint)(1 << 31))
+ return nodeID ^ (uint)(1 << 31);
+
+ /* inner node case*/
+ else
+ return nodes[nodeID].range.start;
+}
+
+/* ==================================================================== */
+/* ============================== RANGE =============================== */
+/* ==================================================================== */
+
+GRL_INLINE void printRange(struct Range *range)
+{
+ printf("start %d end %d \n", range->start, range->end);
+}
+
+GRL_INLINE bool equalRange(struct Range *range0, struct Range *range1)
+{
+ if (range0->start == range1->start &&
+ range0->end == range1->end)
+ return true;
+ return false;
+}
+
+GRL_INLINE uint getSizeRange(struct Range *range)
+{
+ return range->end - range->start;
+}
+
+/* ==================================================================== */
+/* ========================= ProceduralLeaf =========================== */
+/* ==================================================================== */
+
+#if 0
+struct ProceduralLeaf
+{
+ uint shaderIndex_geomMask;
+ uint geomIndex_flags;
+ uint N_last;
+ uint primIndex[13];
+};
+#endif
+
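+// The low 29 bits of leafDesc.geomIndex_flags hold the geometry index;
+// judging by the field name, the top bits carry the per-geometry flags.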
+GRL_INLINE uint ProceduralLeaf_geomIndex(global struct ProceduralLeaf *This)
+{
+ return This->leafDesc.geomIndex_flags & 0x1FFFFFFF;
+}
+
+GRL_INLINE uint ProceduralLeaf_primIndex(global struct ProceduralLeaf *This, uint i)
+{
+ //assert(i < N);
+ return This->_primIndex[i];
+}
+
+/* ==================================================================== */
+/* =========================== TrianglePair =========================== */
+/* ==================================================================== */
+
+struct TrianglePair
+{
+ uint4 a; // indices of the 4 verts to store in the quad
+ uint3 lb; // index of the second triangle's verts in 'a'
+};
+
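+// Pairs two triangles into a single quad leaf:
+//  - a.xyz holds tri0's vertex indices; a.w holds the one vertex of tri1 that
+//    is not shared with tri0 (it stays tri0.z if all of tri1's verts are shared).
+//  - lb[i] records where tri1's i-th vertex lives inside 'a' (0..2 = shared
+//    with tri0, 3 = the extra vertex stored in a.w).
+//  - When primID0 == primID1 there is no second triangle and lb is forced to 0.
+// Example (traced from the code below): tri0 = (5,7,9), tri1 = (7,9,11)
+// gives a = (5,7,9,11) and lb = (1,2,3).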
+GRL_INLINE struct TrianglePair TrianglePair_Constructor(uint3 tri0, uint primID0, uint3 tri1, uint primID1)
+{
+ struct TrianglePair q;
+ q.a.x = tri0.x;
+ q.a.y = tri0.y;
+ q.a.z = tri0.z;
+ q.a.w = tri0.z;
+
+ uint3 b;
+ b.x = tri1.x;
+ b.y = tri1.y;
+ b.z = tri1.z;
+
+ q.lb = (uint3)(3);
+
+ q.lb.x = (b.x == q.a.x) ? 0 : q.lb.x;
+ q.lb.y = (b.y == q.a.x) ? 0 : q.lb.y;
+ q.lb.z = (b.z == q.a.x) ? 0 : q.lb.z;
+
+ q.lb.x = (b.x == q.a.y) ? 1 : q.lb.x;
+ q.lb.y = (b.y == q.a.y) ? 1 : q.lb.y;
+ q.lb.z = (b.z == q.a.y) ? 1 : q.lb.z;
+
+ q.lb.x = (b.x == q.a.z) ? 2 : q.lb.x;
+ q.lb.y = (b.y == q.a.z) ? 2 : q.lb.y;
+ q.lb.z = (b.z == q.a.z) ? 2 : q.lb.z;
+
+ q.lb.x = (primID0 != primID1) ? q.lb.x : 0;
+ q.lb.y = (primID0 != primID1) ? q.lb.y : 0;
+ q.lb.z = (primID0 != primID1) ? q.lb.z : 0;
+
+ q.a.w = (q.lb.x == 3) ? b.x : q.a.w;
+ q.a.w = (q.lb.y == 3) ? b.y : q.a.w;
+ q.a.w = (q.lb.z == 3) ? b.z : q.a.w;
+
+ return q;
+}
+
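+// InstanceIDAndMask packs the 24-bit instance ID into the low bits and the
+// 8-bit instance mask into the top byte; InstanceContributionToHitGroupIndexAndFlags
+// packs the 24-bit hit-group contribution and the 8-bit flags the same way.
+// The setters below clear only their own field and preserve the other one.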
+GRL_INLINE float InstanceDesc_get_transform(const InstanceDesc *d, const uint32_t row, const uint32_t column)
+{
+ return d->Transform[row][column];
+}
+
+GRL_INLINE uint32_t InstanceDesc_get_instanceID(const InstanceDesc *d)
+{
+ return d->InstanceIDAndMask & (0x00FFFFFF);
+}
+
+GRL_INLINE uint32_t InstanceDesc_get_InstanceMask(const InstanceDesc *d)
+{
+ return d->InstanceIDAndMask >> 24;
+}
+
+GRL_INLINE uint32_t InstanceDesc_get_InstanceContributionToHitGroupIndex(const InstanceDesc *d)
+{
+ return d->InstanceContributionToHitGroupIndexAndFlags & ((1 << 24) - 1);
+}
+
+GRL_INLINE uint32_t InstanceDesc_get_InstanceFlags(const InstanceDesc *d)
+{
+ return d->InstanceContributionToHitGroupIndexAndFlags >> 24;
+}
+
+GRL_INLINE gpuva_t InstanceDesc_get_AccelerationStructure(const InstanceDesc *d)
+{
+ return d->AccelerationStructureGPUVA;
+}
+
+GRL_INLINE void InstanceDesc_set_transform(InstanceDesc *d, const uint32_t row, const uint32_t column, float value)
+{
+ d->Transform[row][column] = value;
+}
+
+GRL_INLINE void InstanceDesc_set_instanceID(InstanceDesc *d, const uint32_t id)
+{
+ d->InstanceIDAndMask &= 255 << 24;
+ d->InstanceIDAndMask |= id & ((1 << 24) - 1);
+}
+
+GRL_INLINE void InstanceDesc_set_InstanceMask(InstanceDesc *d, const uint32_t mask)
+{
+ d->InstanceIDAndMask &= ((1 << 24) - 1);
+ d->InstanceIDAndMask |= mask << 24;
+}
+
+GRL_INLINE void InstanceDesc_set_InstanceContributionToHitGroupIndex(InstanceDesc *d, const uint32_t contribution)
+{
+ d->InstanceContributionToHitGroupIndexAndFlags &= 255 << 24;
+ d->InstanceContributionToHitGroupIndexAndFlags |= contribution & ((1 << 24) - 1);
+}
+
+GRL_INLINE void InstanceDesc_set_InstanceFlags(InstanceDesc *d, const uint32_t flags)
+{
+ d->InstanceContributionToHitGroupIndexAndFlags &= ((1 << 24) - 1);
+ d->InstanceContributionToHitGroupIndexAndFlags |= flags << 24;
+}
+
+GRL_INLINE void InstanceDesc_set_AccelerationStructure(InstanceDesc *d, gpuva_t address)
+{
+ d->AccelerationStructureGPUVA = address;
+}
diff --git a/src/intel/vulkan/grl/gpu/copy.grl b/src/intel/vulkan/grl/gpu/copy.grl
new file mode 100644
index 00000000000..1bb500a4ea0
--- /dev/null
+++ b/src/intel/vulkan/grl/gpu/copy.grl
@@ -0,0 +1,129 @@
+//
+// Copyright (C) 2009-2021 Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+//
+//
+
+module copy; // In copy we assume the output data structure is DXR-compatible
+
+kernel clone_indirect < source="bvh_copy.cl", kernelFunction="clone_indirect" >
+kernel compact < source="bvh_copy.cl", kernelFunction="compact" >
+kernel serialize_indirect < source="bvh_copy.cl", kernelFunction="serialize_indirect" >
+kernel serialize_for_input_dump_indirect < source="bvh_copy.cl", kernelFunction="serialize_for_input_dump_indirect" >
+kernel deserialize_indirect < source="bvh_copy.cl", kernelFunction="deserialize_indirect" >
+kernel dxr_decode < source="bvh_copy.cl", kernelFunction="dxr_decode" >
+
+metakernel clone_indirect(
+ qword dest,
+ qword src,
+ qword srcBVHsizedwordAddr)
+{
+// This has to be compatible with the in-kernel GroupCountForCopy(...)
+ define byteSize REG0;
+ define numGroupsRqd REG1;
+ define BYTE_PER_GROUP_CHUNK_ROUNDUP REG2; BYTE_PER_GROUP_CHUNK_ROUNDUP = 255;
+ define BYTE_PER_GROUP_CHUNK_SHIFT REG3; BYTE_PER_GROUP_CHUNK_SHIFT = 8;
+ define REMINDER_NUM_GROUPS REG4; REMINDER_NUM_GROUPS = 4;
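+ // One workgroup is dispatched per 256-byte chunk of the source BVH (shift by
+ // BYTE_PER_GROUP_CHUNK_SHIFT), plus REMINDER_NUM_GROUPS extra groups to cover
+ // the rounded-down remainder; e.g. byteSize = 1000 gives (1000 >> 8) + 4 = 7 groups.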
+ byteSize = load_dword(srcBVHsizedwordAddr);
+ numGroupsRqd = byteSize >> BYTE_PER_GROUP_CHUNK_SHIFT;
+ numGroupsRqd = numGroupsRqd + REMINDER_NUM_GROUPS;
+
+ DISPATCHDIM_X = numGroupsRqd.lo;
+ DISPATCHDIM_Y = 1;
+ DISPATCHDIM_Z = 1;
+
+ dispatch_indirect clone_indirect args(
+ dest,
+ src);
+}
+
+metakernel compact(
+ qword dest,
+ qword src)
+{
+ dispatch compact(32,1,1) args(
+ dest,
+ src,
+ 32);
+}
+
+metakernel serialize_indirect(
+ qword dest,
+ qword src,
+ qword driverID,
+ qword srcBVHsizedwordAddr)
+{
+ define byteSize REG0;
+ define numGroupsRqd REG1;
+ define BYTE_PER_GROUP_CHUNK_ROUNDUP REG2; BYTE_PER_GROUP_CHUNK_ROUNDUP = 255;
+ define BYTE_PER_GROUP_CHUNK_SHIFT REG3; BYTE_PER_GROUP_CHUNK_SHIFT = 8;
+ define REMINDER_NUM_GROUPS REG4; REMINDER_NUM_GROUPS = 4;
+ byteSize = load_dword(srcBVHsizedwordAddr);
+ numGroupsRqd = byteSize >> BYTE_PER_GROUP_CHUNK_SHIFT;
+ numGroupsRqd = numGroupsRqd + REMINDER_NUM_GROUPS;
+ DISPATCHDIM_X = numGroupsRqd.lo;
+ DISPATCHDIM_Y = 1;
+ DISPATCHDIM_Z = 1;
+
+ dispatch_indirect serialize_indirect args(
+ dest,
+ src,
+ driverID);
+}
+
+metakernel serialize_for_input_dump_indirect(
+ qword batchPtrs,
+ qword dstOffset,
+ qword src,
+ qword driverID,
+ qword srcBVHsizedwordAddr)
+{
+ define byteSize REG0;
+ define numGroupsRqd REG1;
+ define BYTE_PER_GROUP_CHUNK_SHIFT REG2; BYTE_PER_GROUP_CHUNK_SHIFT = 8;
+ define REMINDER_NUM_GROUPS REG3; REMINDER_NUM_GROUPS = 4;
+ byteSize = load_dword(srcBVHsizedwordAddr);
+ numGroupsRqd = byteSize >> BYTE_PER_GROUP_CHUNK_SHIFT;
+ numGroupsRqd = numGroupsRqd + REMINDER_NUM_GROUPS;
+ DISPATCHDIM_X = numGroupsRqd.lo;
+ DISPATCHDIM_Y = 1;
+ DISPATCHDIM_Z = 1;
+
+ dispatch_indirect serialize_for_input_dump_indirect args(
+ batchPtrs,
+ dstOffset,
+ src,
+ driverID);
+}
+
+metakernel deserialize_indirect(
+ qword dest,
+ qword src,
+ qword srcBVHsizedwordAddr)
+{
+ define byteSize REG0;
+ define numGroupsRqd REG1;
+ define BYTE_PER_GROUP_CHUNK_ROUNDUP REG2; BYTE_PER_GROUP_CHUNK_ROUNDUP = 255;
+ define BYTE_PER_GROUP_CHUNK_SHIFT REG3; BYTE_PER_GROUP_CHUNK_SHIFT = 8;
+ define REMINDER_NUM_GROUPS REG4; REMINDER_NUM_GROUPS = 4;
+ byteSize = load_dword(srcBVHsizedwordAddr);
+ numGroupsRqd = byteSize >> BYTE_PER_GROUP_CHUNK_SHIFT;
+ numGroupsRqd = numGroupsRqd + REMINDER_NUM_GROUPS;
+ DISPATCHDIM_X = numGroupsRqd.lo;
+ DISPATCHDIM_Y = 1;
+ DISPATCHDIM_Z = 1;
+
+ dispatch_indirect deserialize_indirect args(
+ dest,
+ src);
+}
+
+metakernel dxr_decode(
+ qword dest,
+ qword src)
+{
+ dispatch dxr_decode(1,1,1) args(
+ dest,
+ src);
+}
diff --git a/src/intel/vulkan/grl/gpu/d3d12.h b/src/intel/vulkan/grl/gpu/d3d12.h
new file mode 100644
index 00000000000..32a7654eac5
--- /dev/null
+++ b/src/intel/vulkan/grl/gpu/d3d12.h
@@ -0,0 +1,525 @@
+//
+// Copyright (C) 2009-2021 Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+//
+//
+
+#pragma once
+#include "GRLStructs.h"
+#include "shared.h"
+
+typedef global void *D3D12_GPU_VIRTUAL_ADDRESS;
+typedef void *ID3D12StateObjectPrototype;
+
+enum DXGI_FORMAT
+{
+ DXGI_FORMAT_UNKNOWN,
+ DXGI_FORMAT_R32G32B32A32_TYPELESS,
+ DXGI_FORMAT_R32G32B32A32_FLOAT,
+ DXGI_FORMAT_R32G32B32A32_UINT,
+ DXGI_FORMAT_R32G32B32A32_SINT,
+ DXGI_FORMAT_R32G32B32_TYPELESS,
+ DXGI_FORMAT_R32G32B32_FLOAT,
+ DXGI_FORMAT_R32G32B32_UINT,
+ DXGI_FORMAT_R32G32B32_SINT,
+ DXGI_FORMAT_R16G16B16A16_TYPELESS,
+ DXGI_FORMAT_R16G16B16A16_FLOAT,
+ DXGI_FORMAT_R16G16B16A16_UNORM,
+ DXGI_FORMAT_R16G16B16A16_UINT,
+ DXGI_FORMAT_R16G16B16A16_SNORM,
+ DXGI_FORMAT_R16G16B16A16_SINT,
+ DXGI_FORMAT_R32G32_TYPELESS,
+ DXGI_FORMAT_R32G32_FLOAT,
+ DXGI_FORMAT_R32G32_UINT,
+ DXGI_FORMAT_R32G32_SINT,
+ DXGI_FORMAT_R32G8X24_TYPELESS,
+ DXGI_FORMAT_D32_FLOAT_S8X24_UINT,
+ DXGI_FORMAT_R32_FLOAT_X8X24_TYPELESS,
+ DXGI_FORMAT_X32_TYPELESS_G8X24_UINT,
+ DXGI_FORMAT_R10G10B10A2_TYPELESS,
+ DXGI_FORMAT_R10G10B10A2_UNORM,
+ DXGI_FORMAT_R10G10B10A2_UINT,
+ DXGI_FORMAT_R11G11B10_FLOAT,
+ DXGI_FORMAT_R8G8B8A8_TYPELESS,
+ DXGI_FORMAT_R8G8B8A8_UNORM,
+ DXGI_FORMAT_R8G8B8A8_UNORM_SRGB,
+ DXGI_FORMAT_R8G8B8A8_UINT,
+ DXGI_FORMAT_R8G8B8A8_SNORM,
+ DXGI_FORMAT_R8G8B8A8_SINT,
+ DXGI_FORMAT_R16G16_TYPELESS,
+ DXGI_FORMAT_R16G16_FLOAT,
+ DXGI_FORMAT_R16G16_UNORM,
+ DXGI_FORMAT_R16G16_UINT,
+ DXGI_FORMAT_R16G16_SNORM,
+ DXGI_FORMAT_R16G16_SINT,
+ DXGI_FORMAT_R32_TYPELESS,
+ DXGI_FORMAT_D32_FLOAT,
+ DXGI_FORMAT_R32_FLOAT,
+ DXGI_FORMAT_R32_UINT,
+ DXGI_FORMAT_R32_SINT,
+ DXGI_FORMAT_R24G8_TYPELESS,
+ DXGI_FORMAT_D24_UNORM_S8_UINT,
+ DXGI_FORMAT_R24_UNORM_X8_TYPELESS,
+ DXGI_FORMAT_X24_TYPELESS_G8_UINT,
+ DXGI_FORMAT_R8G8_TYPELESS,
+ DXGI_FORMAT_R8G8_UNORM,
+ DXGI_FORMAT_R8G8_UINT,
+ DXGI_FORMAT_R8G8_SNORM,
+ DXGI_FORMAT_R8G8_SINT,
+ DXGI_FORMAT_R16_TYPELESS,
+ DXGI_FORMAT_R16_FLOAT,
+ DXGI_FORMAT_D16_UNORM,
+ DXGI_FORMAT_R16_UNORM,
+ DXGI_FORMAT_R16_UINT,
+ DXGI_FORMAT_R16_SNORM,
+ DXGI_FORMAT_R16_SINT,
+ DXGI_FORMAT_R8_TYPELESS,
+ DXGI_FORMAT_R8_UNORM,
+ DXGI_FORMAT_R8_UINT,
+ DXGI_FORMAT_R8_SNORM,
+ DXGI_FORMAT_R8_SINT,
+ DXGI_FORMAT_A8_UNORM,
+ DXGI_FORMAT_R1_UNORM,
+ DXGI_FORMAT_R9G9B9E5_SHAREDEXP,
+ DXGI_FORMAT_R8G8_B8G8_UNORM,
+ DXGI_FORMAT_G8R8_G8B8_UNORM,
+ DXGI_FORMAT_BC1_TYPELESS,
+ DXGI_FORMAT_BC1_UNORM,
+ DXGI_FORMAT_BC1_UNORM_SRGB,
+ DXGI_FORMAT_BC2_TYPELESS,
+ DXGI_FORMAT_BC2_UNORM,
+ DXGI_FORMAT_BC2_UNORM_SRGB,
+ DXGI_FORMAT_BC3_TYPELESS,
+ DXGI_FORMAT_BC3_UNORM,
+ DXGI_FORMAT_BC3_UNORM_SRGB,
+ DXGI_FORMAT_BC4_TYPELESS,
+ DXGI_FORMAT_BC4_UNORM,
+ DXGI_FORMAT_BC4_SNORM,
+ DXGI_FORMAT_BC5_TYPELESS,
+ DXGI_FORMAT_BC5_UNORM,
+ DXGI_FORMAT_BC5_SNORM,
+ DXGI_FORMAT_B5G6R5_UNORM,
+ DXGI_FORMAT_B5G5R5A1_UNORM,
+ DXGI_FORMAT_B8G8R8A8_UNORM,
+ DXGI_FORMAT_B8G8R8X8_UNORM,
+ DXGI_FORMAT_R10G10B10_XR_BIAS_A2_UNORM,
+ DXGI_FORMAT_B8G8R8A8_TYPELESS,
+ DXGI_FORMAT_B8G8R8A8_UNORM_SRGB,
+ DXGI_FORMAT_B8G8R8X8_TYPELESS,
+ DXGI_FORMAT_B8G8R8X8_UNORM_SRGB,
+ DXGI_FORMAT_BC6H_TYPELESS,
+ DXGI_FORMAT_BC6H_UF16,
+ DXGI_FORMAT_BC6H_SF16,
+ DXGI_FORMAT_BC7_TYPELESS,
+ DXGI_FORMAT_BC7_UNORM,
+ DXGI_FORMAT_BC7_UNORM_SRGB,
+ DXGI_FORMAT_AYUV,
+ DXGI_FORMAT_Y410,
+ DXGI_FORMAT_Y416,
+ DXGI_FORMAT_NV12,
+ DXGI_FORMAT_P010,
+ DXGI_FORMAT_P016,
+ DXGI_FORMAT_420_OPAQUE,
+ DXGI_FORMAT_YUY2,
+ DXGI_FORMAT_Y210,
+ DXGI_FORMAT_Y216,
+ DXGI_FORMAT_NV11,
+ DXGI_FORMAT_AI44,
+ DXGI_FORMAT_IA44,
+ DXGI_FORMAT_P8,
+ DXGI_FORMAT_A8P8,
+ DXGI_FORMAT_B4G4R4A4_UNORM,
+ DXGI_FORMAT_P208,
+ DXGI_FORMAT_V208,
+ DXGI_FORMAT_V408,
+ DXGI_FORMAT_FORCE_UINT
+};
+
+typedef enum D3D12_RAYTRACING_GEOMETRY_FLAGS
+{
+ D3D12_RAYTRACING_GEOMETRY_FLAG_NONE = 0,
+ D3D12_RAYTRACING_GEOMETRY_FLAG_OPAQUE = 0x1,
+ D3D12_RAYTRACING_GEOMETRY_FLAG_NO_DUPLICATE_ANYHIT_INVOCATION = 0x2
+} D3D12_RAYTRACING_GEOMETRY_FLAGS;
+
+typedef enum D3D12_RAYTRACING_GEOMETRY_TYPE
+{
+ D3D12_RAYTRACING_GEOMETRY_TYPE_TRIANGLES = 0,
+ D3D12_RAYTRACING_GEOMETRY_TYPE_PROCEDURAL_PRIMITIVE_AABBS = (D3D12_RAYTRACING_GEOMETRY_TYPE_TRIANGLES + 1)
+} D3D12_RAYTRACING_GEOMETRY_TYPE;
+
+typedef enum D3D12_RAYTRACING_INSTANCE_FLAGS
+{
+ D3D12_RAYTRACING_INSTANCE_FLAG_NONE = 0,
+ D3D12_RAYTRACING_INSTANCE_FLAG_TRIANGLE_CULL_DISABLE = 0x1,
+ D3D12_RAYTRACING_INSTANCE_FLAG_TRIANGLE_FRONT_COUNTERCLOCKWISE = 0x2,
+ D3D12_RAYTRACING_INSTANCE_FLAG_FORCE_OPAQUE = 0x4,
+ D3D12_RAYTRACING_INSTANCE_FLAG_FORCE_NON_OPAQUE = 0x8
+} D3D12_RAYTRACING_INSTANCE_FLAGS;
+
+typedef struct D3D12_GPU_VIRTUAL_ADDRESSAND_STRIDE
+{
+ D3D12_GPU_VIRTUAL_ADDRESS StartAddress;
+ unsigned long StrideInBytes;
+} D3D12_GPU_VIRTUAL_ADDRESSAND_STRIDE;
+
+typedef struct D3D12_GPU_VIRTUAL_ADDRESSRANGE
+{
+ D3D12_GPU_VIRTUAL_ADDRESS StartAddress;
+ unsigned long SizeInBytes;
+} D3D12_GPU_VIRTUAL_ADDRESSRANGE;
+
+typedef struct D3D12_GPU_VIRTUAL_ADDRESSRANGE_AND_STRIDE
+{
+ D3D12_GPU_VIRTUAL_ADDRESS StartAddress;
+ unsigned long SizeInBytes;
+ unsigned long StrideInBytes;
+} D3D12_GPU_VIRTUAL_ADDRESSRANGE_AND_STRIDE;
+
+typedef struct D3D12_RAYTRACING_GEOMETRY_TRIANGLES_DESC
+{
+ D3D12_GPU_VIRTUAL_ADDRESS Transform;
+ enum DXGI_FORMAT IndexFormat;
+ enum DXGI_FORMAT VertexFormat;
+ unsigned int IndexCount;
+ unsigned int VertexCount;
+ D3D12_GPU_VIRTUAL_ADDRESS IndexBuffer;
+ struct D3D12_GPU_VIRTUAL_ADDRESSAND_STRIDE VertexBuffer;
+} D3D12_RAYTRACING_GEOMETRY_TRIANGLES_DESC;
+
+typedef struct D3D12_RAYTRACING_AABB
+{
+ float MinX;
+ float MinY;
+ float MinZ;
+ float MaxX;
+ float MaxY;
+ float MaxZ;
+} D3D12_RAYTRACING_AABB;
+
+GRL_INLINE void D3D12_set_raytracing_aabb(D3D12_RAYTRACING_AABB* dest, struct AABB* source)
+{
+ dest->MinX = source->lower.x;
+ dest->MinY = source->lower.y;
+ dest->MinZ = source->lower.z;
+ dest->MaxX = source->upper.x;
+ dest->MaxY = source->upper.y;
+ dest->MaxZ = source->upper.z;
+}
+
+typedef struct D3D12_RAYTRACING_GEOMETRY_AABBS_DESC
+{
+ unsigned long AABBCount;
+ D3D12_GPU_VIRTUAL_ADDRESSAND_STRIDE AABBs;
+} D3D12_RAYTRACING_GEOMETRY_AABBS_DESC;
+
+typedef struct D3D12_RAYTRACING_GEOMETRY_DESC
+{
+ D3D12_RAYTRACING_GEOMETRY_TYPE Type;
+ D3D12_RAYTRACING_GEOMETRY_FLAGS Flags;
+ //unsigned int ShaderIndex : 24; // extension
+ //unsigned int Mask : 8; // extension
+ //unsigned int ShaderIndex_Mask; // extension
+ union {
+ D3D12_RAYTRACING_GEOMETRY_TRIANGLES_DESC Triangles;
+ D3D12_RAYTRACING_GEOMETRY_AABBS_DESC AABBs;
+ };
+} D3D12_RAYTRACING_GEOMETRY_DESC;
+
+GRL_INLINE void D3D12_set_Type(D3D12_RAYTRACING_GEOMETRY_DESC *geomDesc, D3D12_RAYTRACING_GEOMETRY_TYPE type)
+{
+ geomDesc->Type = type;
+}
+
+GRL_INLINE D3D12_RAYTRACING_GEOMETRY_TYPE D3D12_get_Type(D3D12_RAYTRACING_GEOMETRY_DESC *geomDesc)
+{
+ return geomDesc->Type;
+}
+
+GRL_INLINE void D3D12_set_Flags(D3D12_RAYTRACING_GEOMETRY_DESC *geomDesc, D3D12_RAYTRACING_GEOMETRY_FLAGS flags)
+{
+ geomDesc->Flags = flags;
+}
+
+GRL_INLINE D3D12_RAYTRACING_GEOMETRY_FLAGS D3D12_get_Flags(D3D12_RAYTRACING_GEOMETRY_DESC *geomDesc)
+{
+ return geomDesc->Flags;
+}
+
+GRL_INLINE void D3D12_set_triangles_Transform(D3D12_RAYTRACING_GEOMETRY_DESC *geomDesc, D3D12_GPU_VIRTUAL_ADDRESS transform)
+{
+ geomDesc->Triangles.Transform = transform;
+}
+
+GRL_INLINE D3D12_GPU_VIRTUAL_ADDRESS D3D12_get_triangles_Transform(D3D12_RAYTRACING_GEOMETRY_DESC *geomDesc)
+{
+ return geomDesc->Triangles.Transform;
+}
+
+GRL_INLINE void D3D12_set_triangles_IndexFormat(D3D12_RAYTRACING_GEOMETRY_DESC *geomDesc, IndexFormat format)
+{
+ switch (format)
+ {
+ case INDEX_FORMAT_NONE:
+ geomDesc->Triangles.IndexFormat = DXGI_FORMAT_UNKNOWN;
+ break;
+ case INDEX_FORMAT_R16_UINT:
+ geomDesc->Triangles.IndexFormat = DXGI_FORMAT_R16_UINT;
+ break;
+ case INDEX_FORMAT_R32_UINT:
+ geomDesc->Triangles.IndexFormat = DXGI_FORMAT_R32_UINT;
+ break;
+ }
+}
+
+GRL_INLINE IndexFormat D3D12_get_triangles_IndexFormat(D3D12_RAYTRACING_GEOMETRY_DESC *geomDesc)
+{
+ switch (geomDesc->Triangles.IndexFormat)
+ {
+ case DXGI_FORMAT_R16_UINT:
+ return INDEX_FORMAT_R16_UINT;
+ case DXGI_FORMAT_R32_UINT:
+ return INDEX_FORMAT_R32_UINT;
+ case DXGI_FORMAT_UNKNOWN:
+ default:
+ return INDEX_FORMAT_NONE;
+ }
+}
+
+GRL_INLINE void D3D12_set_triangles_VertexFormat(D3D12_RAYTRACING_GEOMETRY_DESC *geomDesc, VertexFormat format)
+{
+ switch (format)
+ {
+ case VERTEX_FORMAT_R32G32_FLOAT:
+ geomDesc->Triangles.VertexFormat = DXGI_FORMAT_R32G32_FLOAT;
+ break;
+ case VERTEX_FORMAT_R32G32B32_FLOAT:
+ geomDesc->Triangles.VertexFormat = DXGI_FORMAT_R32G32B32_FLOAT;
+ break;
+ case VERTEX_FORMAT_R16G16_FLOAT:
+ geomDesc->Triangles.VertexFormat = DXGI_FORMAT_R16G16_FLOAT;
+ break;
+ case VERTEX_FORMAT_R16G16B16A16_FLOAT:
+ geomDesc->Triangles.VertexFormat = DXGI_FORMAT_R16G16B16A16_FLOAT;
+ break;
+ case VERTEX_FORMAT_R16G16_SNORM:
+ geomDesc->Triangles.VertexFormat = DXGI_FORMAT_R16G16_SNORM;
+ break;
+ case VERTEX_FORMAT_R16G16B16A16_SNORM:
+ geomDesc->Triangles.VertexFormat = DXGI_FORMAT_R16G16B16A16_SNORM;
+ break;
+ case VERTEX_FORMAT_R16G16B16A16_UNORM:
+ geomDesc->Triangles.VertexFormat = DXGI_FORMAT_R16G16B16A16_UNORM;
+ break;
+ case VERTEX_FORMAT_R16G16_UNORM:
+ geomDesc->Triangles.VertexFormat = DXGI_FORMAT_R16G16_UNORM;
+ break;
+ case VERTEX_FORMAT_R10G10B10A2_UNORM:
+ geomDesc->Triangles.VertexFormat = DXGI_FORMAT_R10G10B10A2_UNORM;
+ break;
+ case VERTEX_FORMAT_R8G8B8A8_UNORM:
+ geomDesc->Triangles.VertexFormat = DXGI_FORMAT_R8G8B8A8_UNORM;
+ break;
+ case VERTEX_FORMAT_R8G8_UNORM:
+ geomDesc->Triangles.VertexFormat = DXGI_FORMAT_R8G8_UNORM;
+ break;
+ case VERTEX_FORMAT_R8G8B8A8_SNORM:
+ geomDesc->Triangles.VertexFormat = DXGI_FORMAT_R8G8B8A8_SNORM;
+ break;
+ case VERTEX_FORMAT_R8G8_SNORM:
+ geomDesc->Triangles.VertexFormat = DXGI_FORMAT_R8G8_SNORM;
+ break;
+ }
+}
+
+GRL_INLINE VertexFormat D3D12_get_triangles_VertexFormat(D3D12_RAYTRACING_GEOMETRY_DESC *geomDesc)
+{
+ switch(geomDesc->Triangles.VertexFormat)
+ {
+ case DXGI_FORMAT_R32G32_FLOAT:
+ return VERTEX_FORMAT_R32G32_FLOAT;
+ case DXGI_FORMAT_R32G32B32_FLOAT:
+ return VERTEX_FORMAT_R32G32B32_FLOAT;
+ case DXGI_FORMAT_R16G16_FLOAT:
+ return VERTEX_FORMAT_R16G16_FLOAT;
+ case DXGI_FORMAT_R16G16B16A16_FLOAT:
+ return VERTEX_FORMAT_R16G16B16A16_FLOAT;
+ case DXGI_FORMAT_R16G16_SNORM:
+ return VERTEX_FORMAT_R16G16_SNORM;
+ case DXGI_FORMAT_R16G16B16A16_SNORM:
+ return VERTEX_FORMAT_R16G16B16A16_SNORM;
+ case DXGI_FORMAT_R16G16B16A16_UNORM:
+ return VERTEX_FORMAT_R16G16B16A16_UNORM;
+ case DXGI_FORMAT_R16G16_UNORM:
+ return VERTEX_FORMAT_R16G16_UNORM;
+ case DXGI_FORMAT_R10G10B10A2_UNORM:
+ return VERTEX_FORMAT_R10G10B10A2_UNORM;
+ case DXGI_FORMAT_R8G8B8A8_UNORM:
+ return VERTEX_FORMAT_R8G8B8A8_UNORM;
+ case DXGI_FORMAT_R8G8_UNORM:
+ return VERTEX_FORMAT_R8G8_UNORM;
+ case DXGI_FORMAT_R8G8B8A8_SNORM:
+ return VERTEX_FORMAT_R8G8B8A8_SNORM;
+ case DXGI_FORMAT_R8G8_SNORM:
+ return VERTEX_FORMAT_R8G8_SNORM;
+ default:
+ return VERTEX_FORMAT_R32G32_FLOAT;
+ }
+}
+
+GRL_INLINE void D3D12_set_triangles_IndexCount(D3D12_RAYTRACING_GEOMETRY_DESC *geomDesc, unsigned int count)
+{
+ geomDesc->Triangles.IndexCount = count;
+}
+
+GRL_INLINE unsigned int D3D12_get_triangles_IndexCount(D3D12_RAYTRACING_GEOMETRY_DESC *geomDesc)
+{
+ return geomDesc->Triangles.IndexCount;
+}
+
+GRL_INLINE void D3D12_set_triangles_VertexCount(D3D12_RAYTRACING_GEOMETRY_DESC *geomDesc, unsigned int count)
+{
+ geomDesc->Triangles.VertexCount = count;
+}
+
+GRL_INLINE unsigned int D3D12_get_triangles_VertexCount(D3D12_RAYTRACING_GEOMETRY_DESC *geomDesc)
+{
+ return geomDesc->Triangles.VertexCount;
+}
+
+GRL_INLINE void D3D12_set_triangles_IndexBuffer(D3D12_RAYTRACING_GEOMETRY_DESC *geomDesc, D3D12_GPU_VIRTUAL_ADDRESS buffer)
+{
+ geomDesc->Triangles.IndexBuffer = buffer;
+}
+
+GRL_INLINE D3D12_GPU_VIRTUAL_ADDRESS D3D12_get_triangles_IndexBuffer(D3D12_RAYTRACING_GEOMETRY_DESC *geomDesc)
+{
+ return geomDesc->Triangles.IndexBuffer;
+}
+
+GRL_INLINE void D3D12_set_triangles_VertexBuffer_StartAddress(D3D12_RAYTRACING_GEOMETRY_DESC *geomDesc, D3D12_GPU_VIRTUAL_ADDRESS address)
+{
+ geomDesc->Triangles.VertexBuffer.StartAddress = address;
+}
+
+GRL_INLINE D3D12_GPU_VIRTUAL_ADDRESS D3D12_get_triangles_VertexBuffer_StartAddress(D3D12_RAYTRACING_GEOMETRY_DESC *geomDesc)
+{
+ return geomDesc->Triangles.VertexBuffer.StartAddress;
+}
+
+GRL_INLINE void D3D12_set_triangles_VertexBuffer_StrideInBytes(D3D12_RAYTRACING_GEOMETRY_DESC *geomDesc, unsigned long stride)
+{
+ geomDesc->Triangles.VertexBuffer.StrideInBytes = stride;
+}
+
+GRL_INLINE unsigned long D3D12_get_triangles_VertexBuffer_StrideInBytes(D3D12_RAYTRACING_GEOMETRY_DESC *geomDesc)
+{
+ return geomDesc->Triangles.VertexBuffer.StrideInBytes;
+}
+
+GRL_INLINE void D3D12_set_procedurals_AABBCount(D3D12_RAYTRACING_GEOMETRY_DESC *geomDesc, unsigned long count)
+{
+ geomDesc->AABBs.AABBCount = count;
+}
+
+GRL_INLINE unsigned long D3D12_get_procedurals_AABBCount(D3D12_RAYTRACING_GEOMETRY_DESC *geomDesc)
+{
+ return geomDesc->AABBs.AABBCount;
+}
+
+GRL_INLINE void D3D12_set_procedurals_AABBs_StartAddress(D3D12_RAYTRACING_GEOMETRY_DESC *geomDesc, D3D12_GPU_VIRTUAL_ADDRESS address)
+{
+ geomDesc->AABBs.AABBs.StartAddress = address;
+}
+
+GRL_INLINE D3D12_GPU_VIRTUAL_ADDRESS D3D12_get_procedurals_AABBs_StartAddress(D3D12_RAYTRACING_GEOMETRY_DESC *geomDesc)
+{
+ return geomDesc->AABBs.AABBs.StartAddress;
+}
+
+GRL_INLINE void D3D12_set_procedurals_AABBs_StrideInBytes(D3D12_RAYTRACING_GEOMETRY_DESC *geomDesc, unsigned long stride)
+{
+ geomDesc->AABBs.AABBs.StrideInBytes = stride;
+}
+
+GRL_INLINE unsigned long D3D12_get_procedurals_AABBs_StrideInBytes(D3D12_RAYTRACING_GEOMETRY_DESC *geomDesc)
+{
+ return geomDesc->AABBs.AABBs.StrideInBytes;
+}
+
+typedef struct D3D12_RAYTRACING_INSTANCE_DESC
+{
+ float Transform[12];
+ // unsigned int InstanceID : 24;
+ // unsigned int InstanceMask : 8;
+ uint32_t DW0;
+ // unsigned int InstanceContributionToHitGroupIndex : 24;
+ // unsigned int Flags : 8;
+ uint32_t DW1;
+ global char *AccelerationStructure;
+} D3D12_RAYTRACING_INSTANCE_DESC;
+
+GRL_INLINE float D3D12_get_transform(const D3D12_RAYTRACING_INSTANCE_DESC *d, const uint32_t row, const uint32_t column)
+{
+ return d->Transform[row * 4 + column];
+}
+
+GRL_INLINE uint32_t D3D12_get_instanceID(const D3D12_RAYTRACING_INSTANCE_DESC *d)
+{
+ return d->DW0 & ((1 << 24) - 1);
+}
+
+GRL_INLINE uint32_t D3D12_get_InstanceMask(const D3D12_RAYTRACING_INSTANCE_DESC *d)
+{
+ return d->DW0 >> 24;
+}
+
+GRL_INLINE uint32_t D3D12_get_InstanceContributionToHitGroupIndex(const D3D12_RAYTRACING_INSTANCE_DESC *d)
+{
+ return d->DW1 & ((1 << 24) - 1);
+}
+
+GRL_INLINE uint32_t D3D12_get_InstanceFlags(const D3D12_RAYTRACING_INSTANCE_DESC *d)
+{
+ return d->DW1 >> 24;
+}
+
+GRL_INLINE gpuva_t D3D12_get_AccelerationStructure(const D3D12_RAYTRACING_INSTANCE_DESC *d)
+{
+ return (gpuva_t)d->AccelerationStructure;
+}
+
+GRL_INLINE void D3D12_set_transform(D3D12_RAYTRACING_INSTANCE_DESC *d, const uint32_t row, const uint32_t column, float value)
+{
+ d->Transform[row * 4 + column] = value;
+}
+
+GRL_INLINE void D3D12_set_instanceID(D3D12_RAYTRACING_INSTANCE_DESC *d, const uint32_t id)
+{
+ d->DW0 &= 255 << 24;
+ d->DW0 |= id & ((1 << 24) - 1);
+}
+
+GRL_INLINE void D3D12_set_InstanceMask(D3D12_RAYTRACING_INSTANCE_DESC *d, const uint32_t mask)
+{
+ d->DW0 &= ((1 << 24) - 1);
+ d->DW0 |= mask << 24;
+}
+
+GRL_INLINE void D3D12_set_InstanceContributionToHitGroupIndex(D3D12_RAYTRACING_INSTANCE_DESC *d, const uint32_t contribution)
+{
+ d->DW1 &= 255 << 24;
+ d->DW1 |= contribution & ((1 << 24) - 1);
+}
+
+GRL_INLINE void D3D12_set_InstanceFlags(D3D12_RAYTRACING_INSTANCE_DESC *d, const uint32_t flags)
+{
+ d->DW1 &= ((1 << 24) - 1);
+ d->DW1 |= flags << 24;
+}
+
+GRL_INLINE void D3D12_set_AccelerationStructure(D3D12_RAYTRACING_INSTANCE_DESC *d, gpuva_t address)
+{
+ d->AccelerationStructure = (global char*)address;
+}
diff --git a/src/intel/vulkan/grl/gpu/grl_api_interface_verify.cl b/src/intel/vulkan/grl/gpu/grl_api_interface_verify.cl
new file mode 100644
index 00000000000..d37adbbbb2b
--- /dev/null
+++ b/src/intel/vulkan/grl/gpu/grl_api_interface_verify.cl
@@ -0,0 +1,59 @@
+//
+// Copyright (C) 2009-2021 Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+//
+//
+
+#include "api_interface.h"
+
+__attribute__((reqd_work_group_size(1, 1, 1))) void kernel CopyGeom(
+ global struct Geo *src,
+ global struct Geo *dst,
+ global float4 *vec,
+ global ushort *indices,
+ dword step)
+{
+ src = src + get_group_id(0);
+ dst = dst + get_group_id(0);
+ dst->Flags = src->Flags;
+ dst->Type = src->Type;
+ if (src->Type == GEOMETRY_TYPE_PROCEDURAL)
+ {
+ dst->Desc.Procedural.AABBByteStride = src->Desc.Procedural.AABBByteStride;
+ dst->Desc.Procedural.AABBCount = src->Desc.Procedural.AABBCount;
+ }
+ else
+ {
+ dst->Desc.Triangles.pTransformBuffer = src->Desc.Triangles.pTransformBuffer;
+ if (step == 0)
+ return;
+ dst->Desc.Triangles.IndexCount = src->Desc.Triangles.IndexCount;
+ if (step == 1)
+ return;
+ dst->Desc.Triangles.VertexCount = src->Desc.Triangles.VertexCount;
+ if (step == 2)
+ return;
+ dst->Desc.Triangles.IndexFormat = src->Desc.Triangles.IndexFormat;
+ if (step == 3)
+ return;
+ dst->Desc.Triangles.pIndexBuffer = src->Desc.Triangles.pIndexBuffer;
+ if (step == 4)
+ return;
+ dst->Desc.Triangles.pVertexBuffer = src->Desc.Triangles.pVertexBuffer;
+ if (step == 5)
+ return;
+ dst->Desc.Triangles.VertexBufferByteStride = src->Desc.Triangles.VertexBufferByteStride;
+
+ dst->Desc.Triangles.VertexFormat = src->Desc.Triangles.VertexFormat;
+
+ for (uint t = 0; t * 3 < dst->Desc.Triangles.IndexCount; t++)
+ {
+ uint3 tri = GRL_load_triangle(src, t);
+ vec[t * 3] = GRL_load_vertex(src, tri[0]);
+ vec[t * 3 + 1] = GRL_load_vertex(src, tri[1]);
+ vec[t * 3 + 2] = GRL_load_vertex(src, tri[2]);
+ }
+ }
+}
diff --git a/src/intel/vulkan/grl/gpu/grl_api_interface_verify.grl b/src/intel/vulkan/grl/gpu/grl_api_interface_verify.grl
new file mode 100644
index 00000000000..3779439c54b
--- /dev/null
+++ b/src/intel/vulkan/grl/gpu/grl_api_interface_verify.grl
@@ -0,0 +1,27 @@
+//
+// Copyright (C) 2009-2021 Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+//
+//
+
+module api_interface_verify;
+
+kernel copy_geom < source="grl_api_interface_verify.cl", kernelFunction="CopyGeom" >
+
+metakernel ifc0_copy(
+ qword src,
+ qword dst,
+ qword vec,
+ qword srcIndices,
+ dword numGroups,
+ dword step)
+{
+ dispatch copy_geom(numGroups,1,1) args(
+ src,
+ dst,
+ vec,
+ srcIndices,
+ step
+ );
+}
diff --git a/src/intel/vulkan/grl/gpu/input_dump.cl b/src/intel/vulkan/grl/gpu/input_dump.cl
new file mode 100644
index 00000000000..f668f053f1f
--- /dev/null
+++ b/src/intel/vulkan/grl/gpu/input_dump.cl
@@ -0,0 +1,723 @@
+//
+// Copyright (C) 2009-2021 Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+//
+//
+
+#include "api_interface.h"
+#include "common.h"
+#include "d3d12.h"
+#include "mem_utils.h"
+#include "misc_shared.h"
+
+/// Align value to 128
+///
+/// @param value vale to align
+/// @return aligned value
+GRL_INLINE ulong AlignTo128(ulong value) { return ((value + 127) / 128) * 128; }
+
+GRL_INLINE char* GetVertexBuffersStart(global InputBatchPtrs* batchPtrs) {
+ return (global char*)(batchPtrs->dumpDst + AlignTo128(sizeof(InputBatch)));
+}
+
+/// Finds max used byte in vertex buffer
+///
+/// @param indexBuffPtr pointer to index buffer
+/// @param vertexBufferUsedByteEnd pointer to max used byte of vertex buffers
+/// @param IndexCount number of indices in index buffer
+/// @param IndexFormat index format
+/// @param VertexCount number of vertices in vertex buffer
+/// @param VertexBufferByteStride vertex buffer byte stride
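+///
+/// Strategy: for indexed geometry each invocation reads one index, the maximum
+/// index is reduced within each sub-group, combined across sub-groups through
+/// the sgMax[] scratch, converted to a byte count via the vertex stride, and
+/// atomically max-ed into vertexBufferUsedByteEnd. Non-indexed geometry simply
+/// contributes VertexCount * VertexBufferByteStride.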
+__attribute__((reqd_work_group_size(256, 1, 1)))
+__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH)))
+void kernel find_max_used_byte_in_buff(
+ global void* indexBuffPtr,
+ global uint* vertexBufferUsedByteEnd,
+ dword IndexCount,
+ dword IndexFormat,
+ dword VertexCount,
+ qword VertexBufferByteStride)
+{
+ local uint sgMax[16];
+ uint glob_id = get_group_id(0) * get_local_size(0) + get_local_id(0);
+
+ if (IndexFormat != INDEX_FORMAT_NONE)
+ {
+ uint endByte = 0;
+ if (glob_id < IndexCount)
+ {
+ if (IndexFormat == INDEX_FORMAT_R16_UINT)
+ {
+ global ushort* indexBuffPtrShort = (global ushort*) indexBuffPtr;
+ endByte = indexBuffPtrShort[glob_id];
+ }
+ else
+ {
+ global uint* indexBuffPtrUint = (global uint*) indexBuffPtr;
+ endByte = indexBuffPtrUint[glob_id];
+ }
+ }
+
+ endByte = sub_group_reduce_max(endByte);
+
+ if (get_sub_group_local_id() == 0) { sgMax[get_sub_group_id()] = endByte; }
+
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ if (get_sub_group_id() == 0)
+ {
+ endByte = sub_group_reduce_max(sgMax[get_sub_group_local_id()]);
+ if (get_sub_group_local_id() == 0)
+ {
+ endByte = min(endByte, VertexCount);
+ if (endByte < VertexCount && IndexCount != 0)
+ ++endByte;
+ endByte *= (dword)VertexBufferByteStride;
+ atomic_max(vertexBufferUsedByteEnd, endByte);
+ }
+ }
+ }
+ else if (glob_id == 0)
+ {
+ uint endByte = VertexCount * VertexBufferByteStride;
+ atomic_max(vertexBufferUsedByteEnd, endByte);
+ }
+}
+
+/// Allocates buffer for vertices
+///
+/// @param batchPtrs batch pointers struct
+/// @param vertexBufferUsedByteEnd pointer to sizes of vertex buffers
+/// @param vertexBufferOffset pointer to offsets to vertex buffers
+/// @param numVertexBuffers number of vertex buffers
+__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
+__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH)))
+void kernel allocate_linear_offsets_for_vertex_buffers(
+ global InputBatchPtrs* batchPtrs,
+ global uint* vertexBufferUsedByteEnd,
+ global uint* vertexBufferOffset,
+ dword numVertexBuffers)
+{
+ uint glob_id = get_group_id(0) * get_local_size(0) + get_sub_group_local_id();
+
+ if (glob_id < numVertexBuffers)
+ {
+ uint numBytes = AlignTo128(vertexBufferUsedByteEnd[glob_id]);
+ uint position = atomic_add_global( &batchPtrs->vertexBuffersSize, numBytes);
+ vertexBufferOffset[glob_id] = position;
+ }
+}
+
+/// Sets the dst data space for input dump of this batch
+///
+/// @param inputDumpMainBuffer pointer to main dump buffer
+/// @param batchPtrs batch pointers struct
+/// @param nonVertexSize size of non vertex data
+/// @param batchIdPtr pointer to batch id
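+///
+/// The dump buffer is a ring shared with a CPU reader: the batch reserves
+/// [gpuHead, gpuHead + size) by CAS-ing gpuHead forward, spinning until the
+/// reserved range no longer overlaps the unconsumed region tracked by tail.
+/// If the reservation would run past totalSize, the head wraps back to
+/// headStart and an INPUT_DUMP_OP_END_BUFFER marker is written at the old
+/// head so the reader knows to wrap as well.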
+__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
+__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH)))
+void kernel allocate_data_space_for_inputs(
+ global DebugBufferHeader* inputDumpMainBuffer,
+ global InputBatchPtrs* batchPtrs,
+ uint nonVertexSize,
+ global qword* batchIdPtr)
+{
+ if (get_sub_group_local_id() == 0)
+ {
+ uint vertexBufferSize = batchPtrs->vertexBuffersSize;
+ uint sizeOfThisBatch = vertexBufferSize + AlignTo128(sizeof(InputBatch)) + nonVertexSize;
+
+ if ((sizeOfThisBatch + sizeof(InputBatch)) > ((inputDumpMainBuffer->totalSize - inputDumpMainBuffer->headStart) / 2))
+ {
+ inputDumpMainBuffer->overflow = 1;
+ batchPtrs->dumpDst = 0;
+ batchPtrs->globalDumpBuffer = 0;
+ batchPtrs->nonVertexDataStart = 0;
+ batchPtrs->totalSize = 0;
+ return;
+ }
+
+ dword prevHead = inputDumpMainBuffer->gpuHead;
+ dword newHead;
+ bool circled;
+
+ do
+ {
+ circled = false;
+ newHead = prevHead + sizeOfThisBatch;
+ dword bufferBegin = prevHead;
+ if ((newHead + sizeof(InputBatch)) > inputDumpMainBuffer->totalSize)
+ {
+ circled = true;
+ newHead = inputDumpMainBuffer->headStart + sizeOfThisBatch;
+ bufferBegin = inputDumpMainBuffer->headStart;
+ }
+ dword bufferEnd = newHead + sizeof(InputBatch);
+
+ uint tail;
+ uint tail2 = 7;
+ bool wait;
+ do
+ {
+ wait = true;
+ tail = load_uint_L1UC_L3UC(&inputDumpMainBuffer->tail, 0);
+
+ // dead code, workaround so IGC won't move tail load out of loop
+ if (tail > inputDumpMainBuffer->totalSize)
+ {
+ store_uint_L1UC_L3UC(&inputDumpMainBuffer->tail, 0, tail + tail2);
+ tail2 = tail;
+ }
+
+ if( prevHead >= tail )
+ {
+ // collision example:
+ // ----------T=======H------------
+ // -------B=====E-----------------
+ //
+ if((bufferEnd < tail) || (bufferBegin >= prevHead))
+ {
+ wait = false;
+ }
+ }
+ else
+ {
+ // collision example:
+ // ==========H-------T============
+ // B==============E---------------
+ // caution: H can never wrap all the way around and catch up so that H == T
+ if((bufferEnd < tail) && (bufferBegin >= prevHead))
+ {
+ wait = false;
+ }
+ }
+ } while (wait);
+ } while (!atomic_compare_exchange_global(&inputDumpMainBuffer->gpuHead, &prevHead, newHead));
+
+ if (circled)
+ {
+ global InputBatch* endBufferOp = (global InputBatch*)(((global char*)inputDumpMainBuffer) + prevHead);
+ endBufferOp->header.opHeader.operationType = INPUT_DUMP_OP_END_BUFFER;
+ prevHead = inputDumpMainBuffer->headStart;
+ }
+
+ global char* thisBatchDump = ((global char*)inputDumpMainBuffer) + prevHead;
+ batchPtrs->dumpDst = (qword)thisBatchDump;
+ batchPtrs->globalDumpBuffer = (qword)inputDumpMainBuffer;
+ batchPtrs->nonVertexDataStart = (qword)(thisBatchDump + AlignTo128(sizeof(InputBatch)) + vertexBufferSize);
+ batchPtrs->totalSize = sizeOfThisBatch;
+
+ global InputBatch* batchOp = (global InputBatch*) thisBatchDump;
+ batchOp->header.opHeader.operationType = INPUT_DUMP_OP_BATCH;
+ batchOp->header.opHeader.endOfData = sizeOfThisBatch;
+ batchOp->vertexBufferDataSize = vertexBufferSize;
+ batchOp->firstContainedOpOffset = AlignTo128(sizeof(InputBatch)) + vertexBufferSize;
+ batchOp->batchId = *batchIdPtr;
+ }
+}
+
+/// Sets the dst data space for output dump of this batch
+///
+/// @param outputDumpMainBuffer pointer to main dump buffer
+/// @param batchPtrs batch pointers struct
+/// @param batchIdPtr pointer to batch id
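+///
+/// Uses the same ring-buffer reservation scheme as allocate_data_space_for_inputs,
+/// here for the output (post-build) dump.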
+__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
+__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH)))
+void kernel allocate_data_space_for_outputs(
+ global DebugBufferHeader* outputDumpMainBuffer,
+ global OutputBatchPtrs* batchPtrs,
+ global qword* batchIdPtr)
+{
+ if (get_sub_group_local_id() == 0)
+ {
+ uint sizeOfThisBatch = AlignTo128(sizeof(OutputBatch)) + batchPtrs->dataSize;
+
+ if ((sizeOfThisBatch + sizeof(OutputBatch)) > ((outputDumpMainBuffer->totalSize - outputDumpMainBuffer->headStart) / 2))
+ {
+ outputDumpMainBuffer->overflow = 1;
+ batchPtrs->dumpDst = 0;
+ batchPtrs->dataStart = 0;
+ batchPtrs->totalSize = 0;
+ return;
+ }
+
+ dword prevHead = *((volatile global uint*)(&outputDumpMainBuffer->gpuHead));
+ dword newHead;
+ bool circled;
+
+ do
+ {
+ //mem_fence_gpu_invalidate();
+ //prevHead = *((volatile global uint*)(&outputDumpMainBuffer->gpuHead));
+ circled = false;
+ newHead = prevHead + sizeOfThisBatch;
+ dword bufferBegin = prevHead;
+ if ((newHead + sizeof(OutputBatch)) > outputDumpMainBuffer->totalSize)
+ {
+ circled = true;
+ newHead = outputDumpMainBuffer->headStart + sizeOfThisBatch;
+ bufferBegin = outputDumpMainBuffer->headStart;
+ }
+ dword bufferEnd = newHead + sizeof(OutputBatch);
+
+ uint tail;
+ uint tail2 = 7;
+ bool wait;
+ do
+ {
+ wait = true;
+ tail = load_uint_L1UC_L3UC(&outputDumpMainBuffer->tail, 0);
+
+ // dead code, workaround so IGC won't move tail load out of loop
+ if (tail > outputDumpMainBuffer->totalSize)
+ {
+ store_uint_L1UC_L3UC(&outputDumpMainBuffer->tail, 0, tail + tail2);
+ tail2 = tail;
+ }
+
+ if( prevHead >= tail )
+ {
+ // collision example:
+ // ----------T=======H------------
+ // -------B=====E-----------------
+ //
+ if((bufferEnd < tail) || (bufferBegin >= prevHead))
+ {
+ wait = false;
+ }
+ }
+ else
+ {
+ // collision example:
+ // ==========H-------T============
+ // B==============E---------------
+ // caution: H can never wrap all the way around and catch up so that H == T
+ if((bufferEnd < tail) && (bufferBegin >= prevHead))
+ {
+ wait = false;
+ }
+ }
+ } while (wait);
+ } while (!atomic_compare_exchange_global(&outputDumpMainBuffer->gpuHead, &prevHead, newHead));
+
+ if (circled)
+ {
+ global OutputBatch* endBufferOp = (global OutputBatch*)(((global char*)outputDumpMainBuffer) + prevHead);
+ endBufferOp->header.opHeader.operationType = OUTPUT_DUMP_OP_END_BUFFER;
+ prevHead = outputDumpMainBuffer->headStart;
+ }
+
+ global char* thisBatchDump = ((global char*)outputDumpMainBuffer) + prevHead;
+ batchPtrs->dumpDst = (qword)thisBatchDump;
+ batchPtrs->dataStart = (qword)(thisBatchDump + AlignTo128(sizeof(OutputBatch)));
+ batchPtrs->totalSize = sizeOfThisBatch;
+
+ global OutputBatch* batchOp = (global OutputBatch*) thisBatchDump;
+ batchOp->header.opHeader.operationType = OUTPUT_DUMP_OP_BATCH;
+ batchOp->header.opHeader.endOfData = sizeOfThisBatch;
+ batchOp->firstContainedOpOffset = AlignTo128(sizeof(OutputBatch));
+ batchOp->batchId = *batchIdPtr;
+ }
+}
+
+/// Calculates sum of output sizes
+///
+/// @param pbi pointer to post build infos
+/// @param destOffsets per-output offsets in dest buffer
+/// @param numOutputs number of outputs
+/// @param batchPtrs batch pointers struct
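+///
+/// A single sub-group walks the outputs in chunks of MAX_HW_SIMD_WIDTH: an
+/// exclusive scan yields each output's 128-byte-aligned offset (payload plus
+/// an OutputData header), a reduction advances the running total, and the
+/// grand total is written to batchPtrs->dataSize.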
+__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
+__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH)))
+void kernel calc_outputs_data_size(
+ global PostbuildInfoSerializationDesc* pbi,
+ global dword* destOffsets,
+ qword numOutputs,
+ global OutputBatchPtrs* batchPtrs)
+{
+ uint offset = 0;
+ for (uint i = get_sub_group_local_id(); i < numOutputs + (MAX_HW_SIMD_WIDTH - 1); i += MAX_HW_SIMD_WIDTH)
+ {
+ uint size = 0;
+ if (i < numOutputs)
+ {
+ size = AlignTo128(pbi[i].SerializedSizeInBytes);
+ size += AlignTo128(sizeof(OutputData));
+ destOffsets[i] = offset + sub_group_scan_exclusive_add(size);
+ }
+ offset += sub_group_reduce_add(size);
+ }
+ if (get_sub_group_local_id() == 0)
+ batchPtrs->dataSize = offset;
+}
+
+/// Adds output data operation to batch
+///
+/// @param batchPtrs batch pointers struct
+/// @param destOffset pointer to offset in dest buffer
+/// @param src pointer to source bvh
+/// @param pbi pointer to post build info
+__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
+__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH)))
+void kernel write_output_data_op(
+ global OutputBatchPtrs* batchPtrs,
+ global dword* destOffset,
+ qword src,
+ global PostbuildInfoSerializationDesc* pbi)
+{
+ if (batchPtrs->dataStart == 0)
+ return;
+
+ global OutputData* out = (global OutputData*)(batchPtrs->dataStart + *destOffset);
+ out->header.operationType = OUTPUT_DUMP_OP_DATA;
+ out->header.endOfData = AlignTo128(sizeof(OutputData)) + AlignTo128(pbi->SerializedSizeInBytes);
+ out->srcBvhPtr = src;
+}
+
+/// Writes index and transform data (triangles) or AABB data (procedurals)
+///
+/// @param batchPtrs batch pointers struct
+/// @param srcDesc description of source geometry
+/// @param pVertexBufferOffsetInLinearisedUniqueVertexBuffers pointer to offset to vertices in vertex buffer
+/// @param pVertexBufferSize pointer to used size of the vertex buffer
+/// @param dstDescOffset offset to dest geo desc
+/// @param dstDataOffset offset to dest geo data
+/// @param numThreads number of threads
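+///
+/// Dumped layout per geo: for triangle geos an optional 3x4 transform matrix
+/// comes first, followed by the 128-byte-aligned index data (the code uses the
+/// IndexFormat value directly as the per-index byte size); procedural geos
+/// store their AABB array instead. Buffer pointers in the stored descriptor
+/// are rewritten as offsets relative to the global dump buffer, with vertex
+/// data pointing into the shared linearised vertex-buffer area.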
+__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
+__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH)))
+void kernel write_geo_data(
+ global InputBatchPtrs* batchPtrs,
+ global GRL_RAYTRACING_GEOMETRY_DESC* srcDesc,
+ global uint* pVertexBufferOffsetInLinearisedUniqueVertexBuffers,
+ global uint* pVertexBufferSize,
+ qword dstDescOffset,
+ qword dstDataOffset,
+ dword numThreads)
+{
+ if (batchPtrs->dumpDst == 0) return;
+
+ uint glob_id = get_group_id(0) * get_sub_group_size() + get_sub_group_local_id();
+
+ GRL_RAYTRACING_GEOMETRY_DESC geoDescToStore = *srcDesc;
+
+ global char* dstDataPtr = (global char*)(
+ batchPtrs->nonVertexDataStart + dstDataOffset);
+
+ global char* srcDataPtr;
+ global char* dstTransform;
+ uint bytesToCopy = 0;
+
+ if (geoDescToStore.Type == GEOMETRY_TYPE_TRIANGLES)
+ {
+ uint sizeOfMatrix = 0;
+
+ if (geoDescToStore.Desc.Triangles.pTransformBuffer)
+ {
+ sizeOfMatrix = AlignTo128(4 * 3 * sizeof(float));
+ if (glob_id < 12)
+ {
+ global float* matrixSrc = (global float*)geoDescToStore.Desc.Triangles.pTransformBuffer;
+ global float* matrixDst = (global float*)dstDataPtr;
+ matrixDst[glob_id] = matrixSrc[glob_id];
+ if (glob_id == 0)
+ {
+ geoDescToStore.Desc.Triangles.pTransformBuffer = ((qword)matrixDst) - batchPtrs->globalDumpBuffer;
+ }
+ }
+ }
+
+ dstDataPtr += sizeOfMatrix;
+ srcDataPtr = (global char*)geoDescToStore.Desc.Triangles.pIndexBuffer;
+
+ bytesToCopy = AlignTo128(geoDescToStore.Desc.Triangles.IndexFormat * geoDescToStore.Desc.Triangles.IndexCount);
+
+ if (bytesToCopy && (glob_id == 0))
+ {
+ qword vertBuff = (qword)(GetVertexBuffersStart(batchPtrs) + *pVertexBufferOffsetInLinearisedUniqueVertexBuffers);
+ // for this we remember offset relative to global debug buffer
+ geoDescToStore.Desc.Triangles.pVertexBuffer = ((qword)vertBuff) - batchPtrs->globalDumpBuffer;
+ geoDescToStore.Desc.Triangles.pIndexBuffer = ((qword)dstDataPtr) - batchPtrs->globalDumpBuffer;
+ geoDescToStore.Desc.Triangles.VertexCount = *pVertexBufferSize / geoDescToStore.Desc.Triangles.VertexBufferByteStride;
+ }
+ else if (geoDescToStore.Desc.Triangles.IndexFormat == INDEX_FORMAT_NONE && geoDescToStore.Desc.Triangles.VertexCount > 0 && glob_id == 0)
+ {
+ if (geoDescToStore.Desc.Triangles.pVertexBuffer)
+ {
+ qword vertBuff = (qword)(GetVertexBuffersStart(batchPtrs) + *pVertexBufferOffsetInLinearisedUniqueVertexBuffers);
+ // for this we remember offset relative to global debug buffer
+ geoDescToStore.Desc.Triangles.pVertexBuffer = ((qword)vertBuff) - batchPtrs->globalDumpBuffer;
+ }
+ }
+ else if (glob_id == 0)
+ {
+ geoDescToStore.Desc.Triangles.IndexCount = 0;
+ geoDescToStore.Desc.Triangles.VertexCount = 0;
+ geoDescToStore.Desc.Triangles.pVertexBuffer = 0;
+ geoDescToStore.Desc.Triangles.pIndexBuffer = 0;
+ }
+ }
+ else
+ {
+ srcDataPtr = (global char*)geoDescToStore.Desc.Procedural.pAABBs_GPUVA;
+ bytesToCopy = AlignTo128(geoDescToStore.Desc.Procedural.AABBByteStride * geoDescToStore.Desc.Procedural.AABBCount);
+ if (glob_id == 0)
+ {
+ geoDescToStore.Desc.Procedural.pAABBs_GPUVA = ((qword)dstDataPtr) - batchPtrs->globalDumpBuffer;
+ }
+ }
+
+ if (bytesToCopy)
+ {
+ CopyMemory(dstDataPtr, srcDataPtr, bytesToCopy, numThreads);
+ }
+
+ if (glob_id == 0)
+ {
+ global GRL_RAYTRACING_GEOMETRY_DESC* dstDescPtr = (global GRL_RAYTRACING_GEOMETRY_DESC*)(
+ batchPtrs->nonVertexDataStart + dstDescOffset);
+ *dstDescPtr = geoDescToStore;
+ }
+}
+
+/// Adds build operation to batch
+///
+/// @param batchPtrs batch pointers struct
+/// @param buildOpOffset offset in dst buffer
+/// @param srcBvh address of src bvh (in case of update)
+/// @param dstBvhAddr address of dest bvh buffer
+/// @param offsetToEnd offset to end of this operation
+/// @param flags build flags
+/// @param numGeometries number of geometries in build
+/// @param numInstances number of instances in build
+__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
+__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH)))
+void kernel write_input_build_op(
+ global InputBatchPtrs* batchPtrs,
+ qword buildOpOffset,
+ qword srcBvh,
+ qword dstBvhAddr,
+ dword offsetToEnd,
+ dword flags,
+ dword numGeometries,
+ dword numInstances,
+ dword instArrayOfPtrs)
+{
+ uint glob_id = get_group_id(0) * get_sub_group_size() + get_sub_group_local_id();
+ if (batchPtrs->dumpDst == 0 || glob_id != 0) return;
+
+ global InputBuild* buildOp = (global InputBuild*)(
+ batchPtrs->nonVertexDataStart + buildOpOffset);
+ buildOp->header.operationType = srcBvh ? INPUT_DUMP_OP_UPDATE : INPUT_DUMP_OP_BUILD;
+ buildOp->header.endOfData = offsetToEnd;
+ buildOp->dstBvhPtr = dstBvhAddr;
+ buildOp->srcBvhPtr = srcBvh;
+ buildOp->flags = flags;
+ buildOp->numGeos = numGeometries;
+ buildOp->numInstances = numInstances;
+ buildOp->instArrayOfPtrs = instArrayOfPtrs;
+}
+
+/// Copies instance description
+///
+/// @param batchPtrs batch pointers struct
+/// @param instanceDescArr inst desc source
+/// @param offset offset in dst buffer
+/// @param numInstances number of instances to copy
+__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
+__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH))) void kernel
+copy_instance_descriptors_array(
+ global InputBatchPtrs* batchPtrs,
+ global GRL_RAYTRACING_INSTANCE_DESC* instanceDescArr,
+ qword offset,
+ dword numInstances)
+{
+ uint glob_id = get_group_id(0) * get_sub_group_size() + get_sub_group_local_id();
+ if (batchPtrs->dumpDst == 0) return;
+
+ global GRL_RAYTRACING_INSTANCE_DESC* dst = (global GRL_RAYTRACING_INSTANCE_DESC* )(
+ batchPtrs->nonVertexDataStart + offset);
+
+ if (glob_id < numInstances)
+ {
+ dst[glob_id] = instanceDescArr[glob_id];
+ }
+}
+
+/// Copies instance description, array of pointers version
+///
+/// @param batchPtrs batch pointers struct
+/// @param pInstanceDescPtrsArr inst desc source
+/// @param offset offset in dst buffer
+/// @param numInstances number of instances to copy
+__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
+__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH))) void kernel
+copy_instance_descriptors_array_of_ptrs(
+ global InputBatchPtrs* batchPtrs,
+ global qword* pInstanceDescPtrsArr,
+ qword offset,
+ dword numInstances)
+{
+ uint glob_id = get_group_id(0) * get_sub_group_size() + get_sub_group_local_id();
+ if (batchPtrs->dumpDst == 0) return;
+
+ // save gpuva of instance descs for debug
+ global qword* gpuvaDst = (global qword*)(batchPtrs->nonVertexDataStart + offset);
+
+ global GRL_RAYTRACING_INSTANCE_DESC* dst = (global GRL_RAYTRACING_INSTANCE_DESC*)(
+ batchPtrs->nonVertexDataStart + AlignTo128(numInstances * sizeof(qword)) + offset);
+ global GRL_RAYTRACING_INSTANCE_DESC** instanceDescPtrsArr = (global GRL_RAYTRACING_INSTANCE_DESC **)pInstanceDescPtrsArr;
+
+ if (glob_id < numInstances)
+ {
+ gpuvaDst[glob_id] = (qword)instanceDescPtrsArr[glob_id];
+ dst[glob_id] = *(instanceDescPtrsArr[glob_id]);
+ }
+}
+
+/// Adds copy operation to batch
+///
+/// @param batchPtrs batch pointers struct
+/// @param offset offset in dst buffer
+/// @param src copy source pointer
+/// @param dst copy destination pointer
+/// @param copyOpType copy type
+__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
+__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH)))
+void kernel insert_copy_op(
+ global InputBatchPtrs* batchPtrs,
+ qword offset,
+ global void* src,
+ global void* dst,
+ uint copyOpType)
+{
+ uint glob_id = get_group_id(0) * get_sub_group_size() + get_sub_group_local_id();
+ if (batchPtrs->dumpDst == 0 || glob_id != 0) return;
+
+ global InputCopy* copyOp = (global InputCopy*)(batchPtrs->nonVertexDataStart + offset);
+
+ copyOp->header.operationType = copyOpType;
+ copyOp->header.endOfData = AlignTo128(sizeof(InputCopy));
+ copyOp->srcBvhPtr = (qword)src;
+ copyOp->dstBvhPtr = (qword)dst;
+}
+
+/// Copies vertex buffer
+///
+/// @param batchPtrs batch pointers struct
+/// @param src input buffer
+/// @param offset ptr to offset in dst buffer
+/// @param size ptr to number of bytes to copy
+__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
+__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH)))
+void kernel copy_vertex_data(
+ global InputBatchPtrs* batchPtrs,
+ global const char* src,
+ global const uint* offset,
+ global const uint* size)
+{
+ if (batchPtrs->dumpDst == 0) return;
+
+ global char *dst = (global char *)(GetVertexBuffersStart(batchPtrs) + *offset);
+ uint numGroups = (*size >> 6) + 1;
+ CopyMemory(dst, src, *size, numGroups);
+}
+
+/// Generate unique batch id
+///
+/// @param batchIds array of unique batch ids
+/// @param index index of batch id to generate
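+///
+/// Each call atomically increments the upper dword of batchIds[index]
+/// (counterPtrs[index * 2 + 1] on this little-endian target) and ORs the batch
+/// index into the low bits, so ids stay unique per slot across submissions.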
+__attribute__((reqd_work_group_size(1, 1, 1)))
+void kernel generate_unique_batch_id(global unsigned long *batchIds, unsigned int index) {
+ global unsigned int *counterPtrs = (global unsigned int *)batchIds;
+ atomic_add(&counterPtrs[index * 2 + 1], 1);
+ batchIds[index] |= (unsigned long)index;
+}
+
+/// Sets batch as ready to read and moves cpuHead forward, inputs case
+///
+/// @param batchPtrs batch pointers struct
+/// @param dumpMainBuffer pointer to main dump buffer
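+///
+/// Publishes the batch in ring order: spin until cpuHead reaches this batch's
+/// offset (or, when the batch was placed at headStart after a wrap, until the
+/// entry at the current head is the END_BUFFER marker), flush the data to
+/// memory, then advance cpuHead past this batch.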
+__attribute__((reqd_work_group_size(1, 1, 1)))
+void kernel finish_batch_dump_inputs(
+ global InputBatchPtrs* batchPtrs,
+ global DebugBufferHeader* dumpMainBuffer)
+{
+ if (batchPtrs->dumpDst == 0)
+ return;
+
+ global InputBatch* myBatchOp = (global InputBatch*)batchPtrs->dumpDst;
+
+ dword myDstOffset = (batchPtrs->dumpDst - (qword)dumpMainBuffer);
+
+ dword seven = 7;
+ while (true)
+ {
+ dword currentHead = load_uint_L1UC_L3C(&dumpMainBuffer->cpuHead, 0);
+ if (currentHead > dumpMainBuffer->totalSize) // dead code - workaround so IGC won't move currentHead load out of loop
+ {
+ store_uint_L1UC_L3UC(&dumpMainBuffer->cpuHead, 0, currentHead + seven);
+ currentHead = seven;
+ }
+
+ if (currentHead == myDstOffset)
+ {
+ mem_fence_evict_to_memory();
+ dumpMainBuffer->cpuHead = currentHead + myBatchOp->header.opHeader.endOfData;
+ break;
+ }
+ else if (myDstOffset == dumpMainBuffer->headStart)
+ {
+ global InputBatch* curBatchOp = (global InputBatch*)(((global char*)dumpMainBuffer) + currentHead);
+ if (curBatchOp->header.opHeader.operationType == INPUT_DUMP_OP_END_BUFFER)
+ {
+ mem_fence_evict_to_memory();
+ dumpMainBuffer->cpuHead = dumpMainBuffer->headStart + myBatchOp->header.opHeader.endOfData;
+ break;
+ }
+ }
+ }
+}
+
+/// Sets batch as ready to read and moves cpuHead forward, outputs case
+///
+/// @param batchPtrs batch pointers struct
+/// @param dumpMainBuffer pointer to main dump buffer
+__attribute__((reqd_work_group_size(1, 1, 1)))
+void kernel finish_batch_dump_outputs(
+ global OutputBatchPtrs* batchPtrs,
+ global DebugBufferHeader* dumpMainBuffer)
+{
+ if (batchPtrs->dumpDst == 0)
+ return;
+
+ global OutputBatch* myBatchOp = (global OutputBatch*)batchPtrs->dumpDst;
+
+ dword myDstOffset = (batchPtrs->dumpDst - (qword)dumpMainBuffer);
+
+ dword seven = 7;
+ while (true)
+ {
+ dword currentHead = load_uint_L1UC_L3C(&dumpMainBuffer->cpuHead, 0);
+ if (currentHead > dumpMainBuffer->totalSize) // dead code - workaround so IGC won't move currentHead load out of loop
+ {
+ store_uint_L1UC_L3UC(&dumpMainBuffer->cpuHead, 0, currentHead + seven);
+ currentHead = seven;
+ }
+
+ if (currentHead == myDstOffset)
+ {
+ mem_fence_evict_to_memory();
+ dumpMainBuffer->cpuHead = currentHead + myBatchOp->header.opHeader.endOfData;
+ break;
+ }
+ else if (myDstOffset == dumpMainBuffer->headStart)
+ {
+ global OutputBatch* curBatchOp = (global OutputBatch*)(((global char*)dumpMainBuffer) + currentHead);
+ if (curBatchOp->header.opHeader.operationType == OUTPUT_DUMP_OP_END_BUFFER)
+ {
+ mem_fence_evict_to_memory();
+ dumpMainBuffer->cpuHead = dumpMainBuffer->headStart + myBatchOp->header.opHeader.endOfData;
+ break;
+ }
+ }
+ }
+}
diff --git a/src/intel/vulkan/grl/gpu/input_dump.grl b/src/intel/vulkan/grl/gpu/input_dump.grl
new file mode 100644
index 00000000000..7cc6e60a95d
--- /dev/null
+++ b/src/intel/vulkan/grl/gpu/input_dump.grl
@@ -0,0 +1,252 @@
+//
+// Copyright (C) 2009-2021 Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+//
+//
+
+module input_dump;
+
+kernel_module input_dumper("input_dump.cl")
+{
+ links lsc_intrinsics;
+
+ kernel opencl_kernel_find_max_used_byte_in_buff < kernelFunction="find_max_used_byte_in_buff" >;
+ kernel opencl_kernel_allocate_linear_offsets_for_vertex_buffers < kernelFunction="allocate_linear_offsets_for_vertex_buffers" >;
+ kernel opencl_kernel_allocate_data_space_for_inputs < kernelFunction="allocate_data_space_for_inputs" >;
+ kernel opencl_kernel_allocate_data_space_for_outputs < kernelFunction="allocate_data_space_for_outputs" >;
+ kernel opencl_kernel_calc_outputs_data_size < kernelFunction="calc_outputs_data_size" >;
+ kernel opencl_kernel_write_output_data_op < kernelFunction="write_output_data_op" >;
+ kernel opencl_kernel_write_geo_data < kernelFunction="write_geo_data" >;
+ kernel opencl_kernel_write_input_build_op < kernelFunction="write_input_build_op" >;
+ kernel opencl_kernel_copy_instance_descriptors_array < kernelFunction="copy_instance_descriptors_array" >;
+ kernel opencl_kernel_copy_instance_descriptors_array_of_ptrs < kernelFunction="copy_instance_descriptors_array_of_ptrs" >;
+ kernel opencl_kernel_insert_copy_op < kernelFunction="insert_copy_op" >;
+ kernel opencl_kernel_copy_vertex_data < kernelFunction="copy_vertex_data" >;
+ kernel opencl_kernel_generate_unique_batch_id < kernelFunction="generate_unique_batch_id" >;
+ kernel opencl_kernel_finish_batch_dump_inputs < kernelFunction="finish_batch_dump_inputs" >;
+ kernel opencl_kernel_finish_batch_dump_outputs < kernelFunction="finish_batch_dump_outputs" >;
+}
+
+
+metakernel find_max_used_byte_in_buff(
+ qword indexBuffPtr,
+ qword vertexBufferUsedByteEnd,
+ dword IndexCount,
+ dword IndexFormat,
+ dword VertexCount,
+ qword VertexBufferByteStride,
+ dword numPhysThreads)
+{
+ dispatch opencl_kernel_find_max_used_byte_in_buff(numPhysThreads, 1, 1) args(
+ indexBuffPtr,
+ vertexBufferUsedByteEnd,
+ IndexCount,
+ IndexFormat,
+ VertexCount,
+ VertexBufferByteStride);
+}
+
+metakernel allocate_linear_offsets_for_vertex_buffers(
+ qword batchPtrs,
+ qword m_VertexBufferUsedByteEnd,
+ qword m_VertexBufferOffset,
+ dword numVertexBuffers,
+ dword numPhysThreads)
+{
+ dispatch opencl_kernel_allocate_linear_offsets_for_vertex_buffers(numPhysThreads, 1, 1) args(
+ batchPtrs,
+ m_VertexBufferUsedByteEnd,
+ m_VertexBufferOffset,
+ numVertexBuffers);
+}
+
+metakernel allocate_data_space_for_inputs(
+ qword inputDumpMainBuffer,
+ qword batchPtrs,
+ dword nonVertexSize,
+ qword batchIdPtr)
+{
+ dispatch opencl_kernel_allocate_data_space_for_inputs(1, 1, 1) args(
+ inputDumpMainBuffer,
+ batchPtrs,
+ nonVertexSize,
+ batchIdPtr);
+}
+
+metakernel allocate_data_space_for_outputs(
+ qword inputDumpMainBuffer,
+ qword batchPtrs,
+ qword batchIdPtr)
+{
+ dispatch opencl_kernel_allocate_data_space_for_outputs(1, 1, 1) args(
+ inputDumpMainBuffer,
+ batchPtrs,
+ batchIdPtr);
+}
+
+metakernel calc_outputs_data_size(
+ qword pbi,
+ qword destOffsets,
+ qword numOutputs,
+ qword batchPtrs)
+{
+ dispatch opencl_kernel_calc_outputs_data_size(1, 1, 1) args(
+ pbi,
+ destOffsets,
+ numOutputs,
+ batchPtrs);
+}
+
+metakernel write_output_data_op(
+ qword batchPtrs,
+ qword destOffset,
+ qword src,
+ qword pbi)
+{
+ dispatch opencl_kernel_write_output_data_op(1, 1, 1) args(
+ batchPtrs,
+ destOffset,
+ src,
+ pbi);
+}
+
+metakernel write_geo_data(
+ qword batchPtrs,
+ qword srcDesc,
+ qword pVertexBufferOffsetInLinearisedUniqueVertexBuffers,
+ qword pVertexBufferSize,
+ qword dstDescOffset,
+ qword dstDataOffset,
+ dword numThreads)
+{
+ dispatch opencl_kernel_write_geo_data(numThreads, 1, 1) args(
+ batchPtrs,
+ srcDesc,
+ pVertexBufferOffsetInLinearisedUniqueVertexBuffers,
+ pVertexBufferSize,
+ dstDescOffset,
+ dstDataOffset,
+ numThreads);
+}
+
+metakernel write_input_build_op(
+ qword batchPtrs,
+ qword buildOpOffset,
+ qword srcBvh,
+ qword dstBvhAddr,
+ dword offsetToEnd,
+ dword flags,
+ dword numGeometries,
+ dword numInstances,
+ dword instArrayOfPtrs)
+{
+ dispatch opencl_kernel_write_input_build_op(1, 1, 1) args(
+ batchPtrs,
+ buildOpOffset,
+ srcBvh,
+ dstBvhAddr,
+ offsetToEnd,
+ flags,
+ numGeometries,
+ numInstances,
+ instArrayOfPtrs);
+}
+
+metakernel copy_instance_descriptors_array(
+ qword batchPtrs,
+ qword instanceDescArr,
+ qword offset,
+ dword numInstances,
+ dword numPhysThreads)
+{
+ dispatch opencl_kernel_copy_instance_descriptors_array(numPhysThreads, 1, 1) args(
+ batchPtrs,
+ instanceDescArr,
+ offset,
+ numInstances);
+}
+
+metakernel copy_instance_descriptors_array_of_ptrs(
+ qword batchPtrs,
+ qword instanceDescArrPtrs,
+ qword offset,
+ dword numInstances,
+ dword numPhysThreads)
+{
+ dispatch opencl_kernel_copy_instance_descriptors_array_of_ptrs(numPhysThreads, 1, 1) args(
+ batchPtrs,
+ instanceDescArrPtrs,
+ offset,
+ numInstances);
+}
+
+metakernel insert_copy_op(
+ qword batchPtrs,
+ qword offset,
+ qword src,
+ qword dst,
+ dword type)
+{
+ dispatch opencl_kernel_insert_copy_op(1, 1, 1) args(
+ batchPtrs,
+ offset,
+ src,
+ dst,
+ type);
+}
+
+metakernel copy_vertex_data(
+ qword desc,
+ qword src,
+ qword offset,
+ qword size)
+{
+ define byteSize REG0;
+ define numGroupsRqd REG1;
+ define shift REG2;
+ define minimum REG3;
+
+ shift = 6;
+ minimum = 1;
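+ // Indirect dispatch sizing: numGroupsRqd = (byteSize >> 6) + 1, i.e. one
+ // work group per 64-byte chunk of the source, with the +1 covering any
+ // remainder (one extra group is dispatched when byteSize is already an
+ // exact multiple of 64).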
+ byteSize = load_dword(size);
+ numGroupsRqd = byteSize >> shift;
+ numGroupsRqd = numGroupsRqd + minimum;
+ DISPATCHDIM_X = numGroupsRqd.lo;
+ DISPATCHDIM_Y = 1;
+ DISPATCHDIM_Z = 1;
+
+ dispatch_indirect opencl_kernel_copy_vertex_data args(
+ desc,
+ src,
+ offset,
+ size);
+}
+
+metakernel generate_unique_batch_id(
+ qword batchIds,
+ dword batchIndex)
+{
+ dispatch opencl_kernel_generate_unique_batch_id(1, 1, 1) args(
+ batchIds,
+ batchIndex);
+}
+
+metakernel finish_batch_dump_inputs(
+ qword batchPtrs,
+ qword dumpMainBuffer)
+{
+ dispatch opencl_kernel_finish_batch_dump_inputs(1, 1, 1) args(
+ batchPtrs,
+ dumpMainBuffer);
+}
+
+metakernel finish_batch_dump_outputs(
+ qword batchPtrs,
+ qword dumpMainBuffer)
+{
+ dispatch opencl_kernel_finish_batch_dump_outputs(1, 1, 1) args(
+ batchPtrs,
+ dumpMainBuffer);
+}
diff --git a/src/intel/vulkan/grl/gpu/instance.h b/src/intel/vulkan/grl/gpu/instance.h
new file mode 100644
index 00000000000..e463a01dc90
--- /dev/null
+++ b/src/intel/vulkan/grl/gpu/instance.h
@@ -0,0 +1,183 @@
+//
+// Copyright (C) 2009-2021 Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+//
+//
+
+#pragma once
+
+#include "shared.h"
+#include "affinespace.h"
+#include "api_interface.h"
+#include "qbvh6.h"
+#include "libs/lsc_intrinsics.h"
+
+GRL_INLINE uint32_t HwInstanceLeafPart1_getInstanceIndex(struct HwInstanceLeaf *I)
+{
+ return I->part1.instanceIndex;
+}
+
+GRL_INLINE void encodeDW0_HwInstanceLeafPart0(
+ uint32_t shaderIndex,
+ uint32_t geomMask,
+ uint4 *dst)
+{
+ (*dst).x = (shaderIndex & ((1 << 24) - 1)) |
+ (geomMask << 24);
+}
+
+GRL_INLINE void encodeDW1_HwInstanceLeafPart0(
+ uint32_t instanceContributionToHitGroupIndex,
+ uint32_t notProcedural,
+ uint32_t geomFlags,
+ uint4* dst)
+{
+ (*dst).y = (instanceContributionToHitGroupIndex & ((1 << 24) - 1)) |
+ ((notProcedural & 1) << (24 + 5)) |
+ ((geomFlags & 3) << (24 + 5 + 1));
+}
+
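+// Packs the 48-bit root node pointer into DW2 and the low 16 bits of DW3,
+// with the instance flags stored in DW3 bits 16 and up.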
+GRL_INLINE void encodeDW2DW3_HwInstanceLeafPart0(
+ uint64_t rootNodePtr,
+ uint32_t instFlags,
+ uint4* dst)
+{
+ uint64_t flags = instFlags;
+ uint DW2 = (uint)rootNodePtr;
+ uint DW3 = ((uint)(rootNodePtr >> 32ul) & 0xffff);
+ DW3 |= flags << 16ull;
+ (*dst).z = DW2;
+ (*dst).w = DW3;
+}
+
+GRL_INLINE void HwInstanceLeafPart0_setDW0(struct HwInstanceLeaf *I,
+ uint32_t shaderIndex,
+ uint32_t geomMask)
+{
+ I->part0.DW0 =
+ (shaderIndex & ((1 << 24) - 1)) |
+ (geomMask << 24);
+}
+
+GRL_INLINE void HwInstanceLeafPart0_setDW1(struct HwInstanceLeaf *I,
+ uint32_t instanceContributionToHitGroupIndex,
+ uint32_t notProcedural,
+ uint32_t geomFlags)
+{
+ I->part0.DW1 =
+ (instanceContributionToHitGroupIndex & ((1 << 24) - 1)) |
+ ((notProcedural & 1) << (24 + 5)) |
+ ((geomFlags & 3) << (24 + 5 + 1));
+}
+
+GRL_INLINE void HwInstanceLeafPart1_setDW0DW1(struct HwInstanceLeaf *I,
+ global char *pBvhPtr)
+{
+ I->part1.DW0_DW1 = ((uint64_t)pBvhPtr) & (((uint64_t)1 << 48) - 1);
+}
+
+GRL_INLINE void HwInstanceLeafPart0_setDW2DW3(struct HwInstanceLeaf *I,
+ uint64_t rootNodePtr,
+ uint32_t instFlags)
+{
+ uint64_t flags = instFlags;
+ flags = flags << 48ull;
+ uint64_t ptr = rootNodePtr & 0x0000ffffffffffff;
+ I->part0.DW2_DW3 = ptr + flags;
+}
+
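+// Writes the two 64-byte halves of the HW instance leaf in uint4 chunks using
+// L1-streaming / L3-write-back stores: part1 carries the BLAS pointer,
+// instance ID/index and transform data, while part0 carries the fields HW
+// traversal consumes (shader index, geometry mask, flags, root node pointer)
+// together with the inverse (world-to-object) transform.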
+GRL_INLINE void HwInstanceLeaf_Constructor(global struct HwInstanceLeaf* leaf,
+ global const struct GRL_RAYTRACING_INSTANCE_DESC* instDesc,
+ uint instanceIndex,
+ uint rootNodeByteOffset,
+ uint instanceMask)
+{
+ global uint4* InstanceLeaf_4DWparts = (global uint4*) (leaf);
+
+ struct AffineSpace3f obj2world = AffineSpace3f_load_row_major(instDesc->Transform);
+
+ qword accStructPtr = (qword)instDesc->AccelerationStructure;
+ uint4 p1_DW0_3 = (uint4)(
+ (uint)accStructPtr,
+ (uint)(accStructPtr >> (uint64_t)32),
+ GRL_get_instanceID(instDesc),
+ instanceIndex);
+
+ struct AffineSpace3f world2obj = AffineSpace3f_invert(obj2world);
+
+ store_uint4_L1S_L3WB(InstanceLeaf_4DWparts, 4 + 0 /*part1 + 0DW*/, p1_DW0_3);
+
+ uint4 p1_DW4_7 = (uint4)(
+ as_uint(obj2world.l.vx.x),
+ as_uint(obj2world.l.vx.y),
+ as_uint(obj2world.l.vx.z),
+ as_uint(obj2world.l.vy.x));
+
+ store_uint4_L1S_L3WB(InstanceLeaf_4DWparts, 4 + 1 /*part1 + 4DW*/, p1_DW4_7);
+
+ uint4 p1_DW8_11 = (uint4)(
+ as_uint(obj2world.l.vy.y),
+ as_uint(obj2world.l.vy.z),
+ as_uint(obj2world.l.vz.x),
+ as_uint(obj2world.l.vz.y));
+
+ store_uint4_L1S_L3WB(InstanceLeaf_4DWparts, 4 + 2 /*part1 + 8DW*/, p1_DW8_11);
+
+
+ uint4 p1_DW12_15 = (uint4)(
+ as_uint(obj2world.l.vz.z),
+ as_uint(world2obj.p.x),
+ as_uint(world2obj.p.y),
+ as_uint(world2obj.p.z));
+
+ store_uint4_L1S_L3WB(InstanceLeaf_4DWparts, 4 + 3 /*part1 + 12DW*/, p1_DW12_15);
+
+
+ uint hit_group_index = GRL_get_InstanceContributionToHitGroupIndex(instDesc);
+ global struct BVHBase* bvh = (global struct BVHBase*)instDesc->AccelerationStructure;
+
+ uint4 p0_DW0_3;
+
+ encodeDW0_HwInstanceLeafPart0(
+ hit_group_index,
+ instanceMask,
+ &p0_DW0_3);
+
+ encodeDW1_HwInstanceLeafPart0(
+ hit_group_index, // for HW instance leaf, this field is used to offset the hit-group index
+ 1, // disable opaque culling; necessary for SW instancing, don't-care for HW instancing
+ 0,
+ &p0_DW0_3);
+
+ encodeDW2DW3_HwInstanceLeafPart0(
+ rootNodeByteOffset == NO_NODE_OFFSET ? 0 : ((uint64_t)bvh) + rootNodeByteOffset, // NO_NODE_OFFSET marks a degenerate instance, so store a null root pointer
+ GRL_get_InstanceFlags(instDesc),
+ &p0_DW0_3);
+
+ store_uint4_L1S_L3WB(InstanceLeaf_4DWparts, 0 /*part0 + 0DW*/, p0_DW0_3);
+
+ uint4 p0_DW4_7 = (uint4)(
+ as_uint(world2obj.l.vx.x),
+ as_uint(world2obj.l.vx.y),
+ as_uint(world2obj.l.vx.z),
+ as_uint(world2obj.l.vy.x));
+
+ store_uint4_L1S_L3WB(InstanceLeaf_4DWparts, 1 /*part0 + 4DW*/, p0_DW4_7);
+
+ uint4 p0_DW8_11 = (uint4)(
+ as_uint(world2obj.l.vy.y),
+ as_uint(world2obj.l.vy.z),
+ as_uint(world2obj.l.vz.x),
+ as_uint(world2obj.l.vz.y));
+
+ store_uint4_L1S_L3WB(InstanceLeaf_4DWparts, 2 /*part0 + 8DW*/, p0_DW8_11);
+
+ uint4 p0_DW12_15 = (uint4)(
+ as_uint(world2obj.l.vz.z),
+ as_uint(obj2world.p.x),
+ as_uint(obj2world.p.y),
+ as_uint(obj2world.p.z));
+
+ store_uint4_L1S_L3WB(InstanceLeaf_4DWparts, 3 /*part0 + 12DW*/, p0_DW12_15);
+}
diff --git a/src/intel/vulkan/grl/gpu/intrinsics.h b/src/intel/vulkan/grl/gpu/intrinsics.h
new file mode 100644
index 00000000000..0dff3147d8a
--- /dev/null
+++ b/src/intel/vulkan/grl/gpu/intrinsics.h
@@ -0,0 +1,581 @@
+//
+// Copyright (C) 2009-2021 Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+//
+//
+
+#pragma once
+
+// TODO: AABB_work_group_reduce is super slow, remove !!!
+
+#pragma cl_intel_subgroups : enable
+#pragma cl_khr_fp16 : enable
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
+
+uint intel_sub_group_ballot(bool valid);
+
+// atom_min
+float __attribute__((overloadable)) atom_min(volatile __global float *p, float val);
+float __attribute__((overloadable)) atom_min(volatile __local float *p, float val);
+float __attribute__((overloadable)) atomic_min(volatile __global float *p, float val);
+float __attribute__((overloadable)) atomic_min(volatile __local float *p, float val);
+// atom_max
+float __attribute__((overloadable)) atom_max(volatile __global float *p, float val);
+float __attribute__((overloadable)) atom_max(volatile __local float *p, float val);
+float __attribute__((overloadable)) atomic_max(volatile __global float *p, float val);
+float __attribute__((overloadable)) atomic_max(volatile __local float *p, float val);
+// atom_cmpxchg
+float __attribute__((overloadable)) atom_cmpxchg(volatile __global float *p, float cmp, float val);
+float __attribute__((overloadable)) atom_cmpxchg(volatile __local float *p, float cmp, float val);
+float __attribute__((overloadable)) atomic_cmpxchg(volatile __global float *p, float cmp, float val);
+float __attribute__((overloadable)) atomic_cmpxchg(volatile __local float *p, float cmp, float val);
+
+
+
+inline uint subgroup_single_atomic_add(global uint *p, uint val)
+{
+ const uint subgroupLocalID = get_sub_group_local_id();
+ const int v = subgroupLocalID == 0 ? atomic_add(p, val) : 0;
+ return sub_group_broadcast(v, 0);
+}
+
+inline float halfarea(const float3 d)
+{
+ return fma(d.x, (d.y + d.z), d.y * d.z);
+}
+
+inline float area(const float3 d)
+{
+ return halfarea(d) * 2.0f;
+}
+
+inline uint maxDim(const float3 a)
+{
+ const float3 b = fabs(a);
+ const bool b_x_y = b.x > b.y;
+ const float cur_max = b_x_y ? b.x : b.y;
+ const uint cur_idx = b_x_y ? 0 : 1;
+ const bool b_x_y_z = b.z > cur_max;
+ return b_x_y_z ? 2 : cur_idx;
+}
+
+inline uint3 sortByMaxDim(const float3 a)
+{
+ const uint kz = maxDim(a);
+ const uint _kx = (kz + 1) % 3;
+ const uint _ky = (_kx + 1) % 3;
+ const bool kz_pos = a[kz] >= 0.0f;
+ const uint kx = kz_pos ? _ky : _kx;
+ const uint ky = kz_pos ? _kx : _ky;
+ return (uint3)(kx, ky, kz);
+}
+
+inline uint4 sort4_ascending(const uint4 dist)
+{
+ const uint a0 = dist.s0;
+ const uint a1 = dist.s1;
+ const uint a2 = dist.s2;
+ const uint a3 = dist.s3;
+ const uint b0 = min(a0, a2);
+ const uint b1 = min(a1, a3);
+ const uint b2 = max(a0, a2);
+ const uint b3 = max(a1, a3);
+ const uint c0 = min(b0, b1);
+ const uint c1 = max(b0, b1);
+ const uint c2 = min(b2, b3);
+ const uint c3 = max(b2, b3);
+ const uint d0 = c0;
+ const uint d1 = min(c1, c2);
+ const uint d2 = max(c1, c2);
+ const uint d3 = c3;
+ return (uint4)(d0, d1, d2, d3);
+}
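+// 5-comparator sorting network; e.g. sort4_ascending((uint4)(7, 1, 5, 3))
+// returns (1, 3, 5, 7).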
+
+__constant const uint shuffleA[8] = {1, 0, 3, 2, 5, 4, 7, 6};
+__constant const uint shuffleB[8] = {2, 3, 0, 1, 7, 6, 5, 4};
+__constant const uint shuffleC[8] = {1, 0, 3, 2, 5, 4, 7, 6};
+__constant const uint shuffleD[8] = {7, 6, 5, 4, 3, 2, 1, 0};
+__constant const uint shuffleE[8] = {2, 3, 0, 1, 6, 7, 4, 5};
+__constant const uint shuffleF[8] = {1, 0, 3, 2, 5, 4, 7, 6};
+__constant const uint shuffleG[8] = {0, 2, 1, 3, 5, 4, 7, 6};
+
+__constant const uint selAA[8] = {0, 1, 0, 1, 0, 1, 0, 1};
+__constant const uint selCC[8] = {0, 0, 1, 1, 0, 0, 1, 1};
+__constant const uint selF0[8] = {0, 0, 0, 0, 1, 1, 1, 1};
+
+__constant const uint selGG[8] = {0, 0, 1, 0, 1, 1, 1, 1};
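+// The shuffle*/sel* tables above drive a bitonic-style SIMD8 sorting network:
+// each compare_exchange pass pairs lane i with lane shuffleX[i] and keeps the
+// minimum in lane i when selXX[i] is non-zero, the maximum otherwise.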
+
+inline uint compare_exchange_descending(const uint a0, const uint shuffleMask, const uint selectMask)
+{
+ const uint a1 = intel_sub_group_shuffle(a0, shuffleMask);
+ const uint a_min = min(a0, a1);
+ const uint a_max = max(a0, a1);
+ return select(a_max, a_min, selectMask);
+}
+
+inline uint compare_exchange_ascending(const uint a0, const uint shuffleMask, const uint selectMask)
+{
+ const uint a1 = intel_sub_group_shuffle(a0, shuffleMask);
+ const uint a_min = min(a0, a1);
+ const uint a_max = max(a0, a1);
+ return select(a_min, a_max, selectMask);
+}
+
+inline uint sort8_descending(const uint aa)
+{
+ const unsigned int slotID = get_sub_group_local_id() % 8;
+ const uint bb = compare_exchange_descending(aa, shuffleA[slotID], selAA[slotID]);
+ const uint cc = compare_exchange_descending(bb, shuffleB[slotID], selCC[slotID]);
+ const uint dd = compare_exchange_descending(cc, shuffleC[slotID], selAA[slotID]);
+ const uint ee = compare_exchange_descending(dd, shuffleD[slotID], selF0[slotID]);
+ const uint ff = compare_exchange_descending(ee, shuffleE[slotID], selCC[slotID]);
+ const uint gg = compare_exchange_descending(ff, shuffleF[slotID], selAA[slotID]);
+ return gg;
+}
+
+inline uint sort8_ascending(const uint aa)
+{
+ const unsigned int slotID = get_sub_group_local_id() % 8;
+ const uint bb = compare_exchange_ascending(aa, shuffleA[slotID], selAA[slotID]);
+ const uint cc = compare_exchange_ascending(bb, shuffleB[slotID], selCC[slotID]);
+ const uint dd = compare_exchange_ascending(cc, shuffleC[slotID], selAA[slotID]);
+ const uint ee = compare_exchange_ascending(dd, shuffleD[slotID], selF0[slotID]);
+ const uint ff = compare_exchange_ascending(ee, shuffleE[slotID], selCC[slotID]);
+ const uint gg = compare_exchange_ascending(ff, shuffleF[slotID], selAA[slotID]);
+ return gg;
+}
+
+inline uint sort4_descending(const uint aa)
+{
+ const unsigned int slotID = get_sub_group_local_id() % 8;
+ const uint bb = compare_exchange_descending(aa, shuffleA[slotID], selAA[slotID]);
+ const uint cc = compare_exchange_descending(bb, shuffleB[slotID], selCC[slotID]);
+ const uint dd = compare_exchange_descending(cc, shuffleG[slotID], selGG[slotID]);
+ return dd;
+}
+
+inline ulong compare_exchange_descending_ulong(const ulong a0, const uint shuffleMask, const uint selectMask)
+{
+ const ulong a1 = intel_sub_group_shuffle(a0, shuffleMask);
+ const ulong a_min = min(a0, a1);
+ const ulong a_max = max(a0, a1);
+ return select(a_max, a_min, (ulong)selectMask);
+}
+
+inline ulong compare_exchange_ascending_ulong(const ulong a0, const uint shuffleMask, const uint selectMask)
+{
+ const ulong a1 = intel_sub_group_shuffle(a0, shuffleMask);
+ const ulong a_min = min(a0, a1);
+ const ulong a_max = max(a0, a1);
+ return select(a_min, a_max, (ulong)selectMask);
+}
+
+inline ulong sort8_ascending_ulong(const ulong aa)
+{
+ const unsigned int slotID = get_sub_group_local_id() % 8;
+ const ulong bb = compare_exchange_ascending_ulong(aa, shuffleA[slotID], selAA[slotID]);
+ const ulong cc = compare_exchange_ascending_ulong(bb, shuffleB[slotID], selCC[slotID]);
+ const ulong dd = compare_exchange_ascending_ulong(cc, shuffleC[slotID], selAA[slotID]);
+ const ulong ee = compare_exchange_ascending_ulong(dd, shuffleD[slotID], selF0[slotID]);
+ const ulong ff = compare_exchange_ascending_ulong(ee, shuffleE[slotID], selCC[slotID]);
+ const ulong gg = compare_exchange_ascending_ulong(ff, shuffleF[slotID], selAA[slotID]);
+ return gg;
+}
+
+inline uint bitInterleave3D(const uint4 in)
+{
+ uint x = in.x, y = in.y, z = in.z;
+ x = (x | (x << 16)) & 0x030000FF;
+ x = (x | (x << 8)) & 0x0300F00F;
+ x = (x | (x << 4)) & 0x030C30C3;
+ x = (x | (x << 2)) & 0x09249249;
+
+ y = (y | (y << 16)) & 0x030000FF;
+ y = (y | (y << 8)) & 0x0300F00F;
+ y = (y | (y << 4)) & 0x030C30C3;
+ y = (y | (y << 2)) & 0x09249249;
+
+ z = (z | (z << 16)) & 0x030000FF;
+ z = (z | (z << 8)) & 0x0300F00F;
+ z = (z | (z << 4)) & 0x030C30C3;
+ z = (z | (z << 2)) & 0x09249249;
+
+ return x | (y << 1) | (z << 2);
+}
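+// Morton-code interleave: bit i of x lands at output bit 3*i, with y and z
+// offset by 1 and 2, e.g. (x, y, z) = (1, 1, 1) yields 0b111 == 7.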
+
+inline uint bitInterleave4D(const uint4 in)
+{
+ uint x = in.x, y = in.y, z = in.z, w = in.w;
+
+ x = x & 0x000000ff;
+ x = (x ^ (x << 16)) & 0x00c0003f;
+ x = (x ^ (x << 8)) & 0x00c03807;
+ x = (x ^ (x << 4)) & 0x08530853;
+ x = (x ^ (x << 2)) & 0x09090909;
+ x = (x ^ (x << 1)) & 0x11111111;
+
+ y = y & 0x000000ff;
+ y = (y ^ (y << 16)) & 0x00c0003f;
+ y = (y ^ (y << 8)) & 0x00c03807;
+ y = (y ^ (y << 4)) & 0x08530853;
+ y = (y ^ (y << 2)) & 0x09090909;
+ y = (y ^ (y << 1)) & 0x11111111;
+
+ z = z & 0x000000ff;
+ z = (z ^ (z << 16)) & 0x00c0003f;
+ z = (z ^ (z << 8)) & 0x00c03807;
+ z = (z ^ (z << 4)) & 0x08530853;
+ z = (z ^ (z << 2)) & 0x09090909;
+ z = (z ^ (z << 1)) & 0x11111111;
+
+ w = w & 0x000000ff;
+ w = (w ^ (w << 16)) & 0x00c0003f;
+ w = (w ^ (w << 8)) & 0x00c03807;
+ w = (w ^ (w << 4)) & 0x08530853;
+ w = (w ^ (w << 2)) & 0x09090909;
+ w = (w ^ (w << 1)) & 0x11111111;
+
+ return (x | (y << 1) | (z << 2) | (w << 3));
+}
+
+inline ulong ulong_bitInterleave4D(const uint4 in)
+{
+ ulong x = in.x, y = in.y, z = in.z, w = in.w;
+
+ x = x & 0x0000ffff;
+ x = (x ^ (x << 32)) & 0x0000f800000007ff;
+ x = (x ^ (x << 16)) & 0x0000f80007c0003f;
+ x = (x ^ (x << 8)) & 0x00c0380700c03807;
+ x = (x ^ (x << 4)) & 0x0843084308430843;
+ x = (x ^ (x << 2)) & 0x0909090909090909;
+ x = (x ^ (x << 1)) & 0x1111111111111111;
+
+ y = y & 0x0000ffff;
+ y = (y ^ (y << 32)) & 0x0000f800000007ff;
+ y = (y ^ (y << 16)) & 0x0000f80007c0003f;
+ y = (y ^ (y << 8)) & 0x00c0380700c03807;
+ y = (y ^ (y << 4)) & 0x0843084308430843;
+ y = (y ^ (y << 2)) & 0x0909090909090909;
+ y = (y ^ (y << 1)) & 0x1111111111111111;
+
+ z = z & 0x0000ffff;
+ z = (z ^ (z << 32)) & 0x0000f800000007ff;
+ z = (z ^ (z << 16)) & 0x0000f80007c0003f;
+ z = (z ^ (z << 8)) & 0x00c0380700c03807;
+ z = (z ^ (z << 4)) & 0x0843084308430843;
+ z = (z ^ (z << 2)) & 0x0909090909090909;
+ z = (z ^ (z << 1)) & 0x1111111111111111;
+
+ w = w & 0x0000ffff;
+ w = (w ^ (w << 32)) & 0x0000f800000007ff;
+ w = (w ^ (w << 16)) & 0x0000f80007c0003f;
+ w = (w ^ (w << 8)) & 0x00c0380700c03807;
+ w = (w ^ (w << 4)) & 0x0843084308430843;
+ w = (w ^ (w << 2)) & 0x0909090909090909;
+ w = (w ^ (w << 1)) & 0x1111111111111111;
+
+ return (x | (y << 1) | (z << 2) | (w << 3));
+}
+
+inline uint bitCompact(uint x)
+{
+ x &= 0x09249249;
+ x = (x ^ (x >> 2)) & 0x030c30c3;
+ x = (x ^ (x >> 4)) & 0x0300f00f;
+ x = (x ^ (x >> 8)) & 0xff0000ff;
+ x = (x ^ (x >> 16)) & 0x000003ff;
+ return x;
+}
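+// Inverse of the 3D interleave above: gathers every third bit,
+// e.g. bitCompact(0b1001001) == 0b111.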
+
+inline uint3 bitCompact3D(const uint in)
+{
+ const uint x = bitCompact(in >> 0);
+ const uint y = bitCompact(in >> 1);
+ const uint z = bitCompact(in >> 2);
+ return (uint3)(x, y, z);
+}
+
+inline uint convertToPushIndices8(uint ID)
+{
+ const unsigned int slotID = get_sub_group_local_id();
+ uint index = 0;
+ for (uint i = 0; i < 8; i++)
+ {
+ const uint mask = intel_sub_group_ballot(ID == i);
+ const uint new_index = ctz(mask);
+ index = i == slotID ? new_index : index;
+ }
+ return index;
+}
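+// Each lane s ends up with the index of the lowest lane whose ID equals s
+// (ctz(0) if no lane carries that ID); for a permutation of IDs this inverts
+// the lane -> ID mapping.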
+
+inline uint convertToPushIndices16(uint ID)
+{
+ const unsigned int slotID = get_sub_group_local_id();
+ uint index = 0;
+ for (uint i = 0; i < 16; i++)
+ {
+ const uint mask = intel_sub_group_ballot(ID == i);
+ const uint new_index = ctz(mask);
+ index = i == slotID ? new_index : index;
+ }
+ return index;
+}
+
+#define FLOAT_EXPONENT_MASK (0x7F800000) // used to be EXPONENT_MASK
+#define FLOAT_MANTISSA_MASK (0x007FFFFF) // used to be MANTISSA_MASK
+#define FLOAT_NEG_ONE_EXP_MASK (0x3F000000)
+#define FLOAT_BIAS (127)
+#define FLOAT_MANTISSA_BITS (23)
+
+inline float3 frexp_vec3(float3 len, int3* exp)
+{
+ float3 mant = as_float3((int3)((as_int3(len) & (int3)FLOAT_MANTISSA_MASK) + (int3)FLOAT_NEG_ONE_EXP_MASK));
+ mant = select(mant, (float3)(0.5f), (int3)(mant == (float3)(1.0f)));
+ mant = copysign(mant, len);
+ *exp = ((as_int3(len) & (int3)FLOAT_EXPONENT_MASK) >> (int3)FLOAT_MANTISSA_BITS) - ((int3)FLOAT_BIAS - (int3)(1));
+ return mant;
+}
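+// Vectorized frexp following the standard convention of a mantissa in
+// [0.5, 1), e.g. len = 12.0f gives mant = 0.75f and *exp = 4 (12 == 0.75 * 2^4).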
+
+
+#ifndef uniform
+#define uniform
+#endif
+
+#ifndef varying
+#define varying
+#endif
+
+uint get_sub_group_global_id()
+{
+ return get_sub_group_id() + get_num_sub_groups() * get_group_id( 0 );
+}
+
+// each lane contains the number of 1 bits below the corresponding position in 'mask'
+uint subgroup_bit_prefix_exclusive(uniform uint mask)
+{
+ varying ushort lane = get_sub_group_local_id();
+ varying uint lane_mask = (1 << lane) - 1;
+ varying uint m = mask & lane_mask;
+ return popcount(m);
+}
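+// Illustrative: for mask = 0b1011, lanes 0..3 compute 0, 1, 2, 2 (the
+// exclusive prefix popcount of the mask).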
+
+uint bit_prefix_exclusive(uniform uint mask, varying uint lane_idx )
+{
+ varying uint lane_mask = (1 << lane_idx) - 1;
+ varying uint m = mask & lane_mask;
+ return popcount(m);
+}
+
+
+uint3 sub_group_broadcast_uint3(uint3 v, uniform ushort idx)
+{
+ return (uint3)(sub_group_broadcast(v.x,idx),
+ sub_group_broadcast(v.y,idx),
+ sub_group_broadcast(v.z,idx));
+}
+
+float3 sub_group_broadcast_float3(float3 v, uniform ushort idx)
+{
+ return (float3)(sub_group_broadcast(v.x, idx),
+ sub_group_broadcast(v.y, idx),
+ sub_group_broadcast(v.z, idx));
+}
+
+float3 sub_group_reduce_min_float3(float3 v)
+{
+ return (float3)(sub_group_reduce_min(v.x),
+ sub_group_reduce_min(v.y),
+ sub_group_reduce_min(v.z) );
+}
+float3 sub_group_reduce_max_float3(float3 v)
+{
+ return (float3)(sub_group_reduce_max(v.x),
+ sub_group_reduce_max(v.y),
+ sub_group_reduce_max(v.z));
+}
+
+float3 sub_group_shuffle_float3(float3 v, uniform ushort idx)
+{
+ return (float3)(intel_sub_group_shuffle(v.x, idx),
+ intel_sub_group_shuffle(v.y, idx),
+ intel_sub_group_shuffle(v.z, idx));
+}
+uint3 sub_group_shuffle_uint3(uint3 v, uniform ushort idx)
+{
+ return (uint3)( intel_sub_group_shuffle(v.x, idx),
+ intel_sub_group_shuffle(v.y, idx),
+ intel_sub_group_shuffle(v.z, idx));
+}
+
+
+inline uchar sub_group_reduce_or_N6(uchar val)
+{
+ val = val | intel_sub_group_shuffle_down(val, val, 4);
+ val = val | intel_sub_group_shuffle_down(val, val, 2);
+ val = val | intel_sub_group_shuffle_down(val, val, 1);
+ return sub_group_broadcast(val, 0);
+}
+
+inline uchar sub_group_reduce_or_N6_2xSIMD8_in_SIMD16(uchar val)
+{
+ uint SIMD8_id = get_sub_group_local_id() / 8;
+ val = val | intel_sub_group_shuffle_down(val, val, 4);
+ val = val | intel_sub_group_shuffle_down(val, val, 2);
+ val = val | intel_sub_group_shuffle_down(val, val, 1);
+
+ return intel_sub_group_shuffle(val, SIMD8_id * 8);
+}
+
+
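+// Convenience wrappers: relaxed-ordering, work-group-scope atomics on local
+// (SLM) memory built on the OpenCL 2.0 atomic_fetch_*_explicit builtins.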
+inline __attribute__((overloadable)) uint atomic_inc_local( local uint* p )
+{
+ return atomic_fetch_add_explicit( (volatile local atomic_uint*) p, (uint)1, memory_order_relaxed, memory_scope_work_group );
+}
+
+inline __attribute__((overloadable)) int atomic_inc_local(local int* p)
+{
+ return atomic_fetch_add_explicit( (volatile local atomic_int*) p, (int)1, memory_order_relaxed, memory_scope_work_group);
+}
+
+inline __attribute__((overloadable)) uint atomic_dec_local(local uint* p)
+{
+ return atomic_fetch_sub_explicit((volatile local atomic_uint*) p, (uint)1, memory_order_relaxed, memory_scope_work_group);
+}
+
+inline __attribute__((overloadable)) int atomic_dec_local(local int* p)
+{
+ return atomic_fetch_sub_explicit((volatile local atomic_int*) p, (int)1, memory_order_relaxed, memory_scope_work_group);
+}
+
+inline __attribute__((overloadable)) uint atomic_sub_local(local uint* p, uint n)
+{
+ return atomic_fetch_sub_explicit((volatile local atomic_uint*) p, n, memory_order_relaxed, memory_scope_work_group);
+}
+
+inline __attribute__((overloadable)) int atomic_sub_local(local int* p, int n )
+{
+ return atomic_fetch_sub_explicit( (volatile local atomic_int*) p, n, memory_order_relaxed, memory_scope_work_group);
+}
+
+inline uint atomic_add_local( local uint* p, uint n )
+{
+ return atomic_fetch_add_explicit((volatile local atomic_uint*) p, n, memory_order_relaxed, memory_scope_work_group);
+}
+
+inline uint atomic_xor_local(local uint* p, uint n)
+{
+ return atomic_fetch_xor_explicit((volatile local atomic_uint*) p, n, memory_order_relaxed, memory_scope_work_group);
+}
+
+inline uint atomic_or_local(local uint* p, uint n)
+{
+ return atomic_fetch_or_explicit((volatile local atomic_uint*) p, n, memory_order_relaxed, memory_scope_work_group);
+}
+
+inline uint atomic_min_local(local uint* p, uint n)
+{
+ return atomic_fetch_min_explicit((volatile local atomic_uint*) p, n, memory_order_relaxed, memory_scope_work_group);
+}
+
+inline uint atomic_max_local(local uint* p, uint n)
+{
+ return atomic_fetch_max_explicit((volatile local atomic_uint*) p, n, memory_order_relaxed, memory_scope_work_group);
+}
+
+
+
+
+inline uint atomic_inc_global( global uint* p )
+{
+ return atomic_fetch_add_explicit((volatile global atomic_uint*) p, (uint)1, memory_order_relaxed, memory_scope_device);
+}
+
+inline uint atomic_dec_global(global uint* p)
+{
+ return atomic_fetch_sub_explicit( (volatile global atomic_uint*) p, (uint)1, memory_order_relaxed, memory_scope_device);
+}
+
+inline bool atomic_compare_exchange_global(global uint* p, uint* expected, uint desired)
+{
+ return atomic_compare_exchange_strong_explicit((volatile global atomic_uint*) p, expected, desired, memory_order_relaxed, memory_order_relaxed, memory_scope_device);
+}
+
+inline uint atomic_add_global( global uint* p, uint n )
+{
+ return atomic_fetch_add_explicit( (volatile global atomic_uint*) p, n, memory_order_relaxed, memory_scope_device);
+}
+
+inline uint atomic_sub_global(global uint* p, uint n)
+{
+ return atomic_fetch_sub_explicit((volatile global atomic_uint*) p, n, memory_order_relaxed, memory_scope_device);
+}
+
+inline uint atomic_or_global(global uint* p, uint n)
+{
+ return atomic_fetch_or_explicit((volatile global atomic_uint*) p, n, memory_order_relaxed, memory_scope_device);
+}
+
+
+inline uint atomic_inc_global_acquire(global uint* p)
+{
+ return atomic_fetch_add_explicit((volatile global atomic_uint*) p, (uint)1, memory_order_acquire, memory_scope_device);
+}
+
+
+inline uint atomic_inc_global_release(global uint* p)
+{
+ return atomic_fetch_add_explicit((volatile global atomic_uint*) p, (uint)1, memory_order_release, memory_scope_device);
+}
+inline uint atomic_dec_global_release(global uint* p)
+{
+ return atomic_fetch_sub_explicit((volatile global atomic_uint*) p, (uint)1, memory_order_release, memory_scope_device);
+}
+
+inline uint generic_atomic_add(uint* p, uint val)
+{
+ if (to_global(p) != NULL)
+ return atomic_add_global(to_global(p), val);
+ if (to_local(p) != NULL)
+ return atomic_add_local(to_local(p), val);
+ return 0;
+}
+
+inline __attribute__((overloadable)) uint sub_group_reduce_max_N6( uint n )
+{
+ n = max( n, intel_sub_group_shuffle_down( n, n, 4 ) );
+ n = max( n, intel_sub_group_shuffle_down( n, n, 2 ) );
+ n = max( n, intel_sub_group_shuffle_down( n, n, 1 ) );
+ return sub_group_broadcast( n, 0 );
+}
+
+inline __attribute__((overloadable)) float sub_group_reduce_max_N6( float n )
+{
+ n = max( n, intel_sub_group_shuffle_down( n, n, 4 ) );
+ n = max( n, intel_sub_group_shuffle_down( n, n, 2 ) );
+ n = max( n, intel_sub_group_shuffle_down( n, n, 1 ) );
+ return sub_group_broadcast( n, 0 );
+}
+
+inline __attribute__((overloadable)) float sub_group_reduce_max_N6_2xSIMD8_in_SIMD16(float n)
+{
+ n = max(n, intel_sub_group_shuffle_down(n, n, 4));
+ n = max(n, intel_sub_group_shuffle_down(n, n, 2));
+ n = max(n, intel_sub_group_shuffle_down(n, n, 1));
+ return intel_sub_group_shuffle(n, (get_sub_group_local_id() / 8) * 8);//sub_group_broadcast(n, 0);
+}
+
+inline uint generic_atomic_inc(uint* p)
+{
+ if (to_global(p) != NULL)
+ return atomic_inc_global(to_global(p));
+ if (to_local(p) != NULL)
+ return atomic_inc(to_local(p));
+ return 0;
+}
+
+
+// Built-in GRL function which, if called in a kernel body, will force the kernel
+// to be compiled to the minimum SIMD width supported by the platform
+void GRL_UseMinimumSIMDWidth();
\ No newline at end of file
diff --git a/src/intel/vulkan/grl/gpu/libs/libraries.grl b/src/intel/vulkan/grl/gpu/libs/libraries.grl
new file mode 100644
index 00000000000..1d6c0d2c6c5
--- /dev/null
+++ b/src/intel/vulkan/grl/gpu/libs/libraries.grl
@@ -0,0 +1,13 @@
+//
+// Copyright (C) 2009-2021 Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+//
+//
+
+library lsc_intrinsics
+{
+ default "lsc_intrinsics.cl" ;
+ fallback "lsc_intrinsics_fallback.cl";
+}
+
diff --git a/src/intel/vulkan/grl/gpu/libs/lsc_intrinsics.cl b/src/intel/vulkan/grl/gpu/libs/lsc_intrinsics.cl
new file mode 100644
index 00000000000..03a76ba36f1
--- /dev/null
+++ b/src/intel/vulkan/grl/gpu/libs/lsc_intrinsics.cl
@@ -0,0 +1,1033 @@
+//
+// Copyright (C) 2009-2021 Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+//
+//
+
+// LSC Cache options
+// Load message caching control
+enum LSC_LDCC {
+ LSC_LDCC_DEFAULT,
+ LSC_LDCC_L1UC_L3UC, // Override to L1 uncached and L3 uncached
+ LSC_LDCC_L1UC_L3C, // Override to L1 uncached and L3 cached
+ LSC_LDCC_L1C_L3UC, // Override to L1 cached and L3 uncached
+ LSC_LDCC_L1C_L3C, // Override to L1 cached and L3 cached
+ LSC_LDCC_L1S_L3UC, // Override to L1 streaming load and L3 uncached
+ LSC_LDCC_L1S_L3C, // Override to L1 streaming load and L3 cached
+ LSC_LDCC_L1IAR_L3C, // Override to L1 invalidate-after-read, and L3 cached
+};
+
+// Store message caching control (also used for atomics)
+enum LSC_STCC {
+ LSC_STCC_DEFAULT,
+ LSC_STCC_L1UC_L3UC, // Override to L1 uncached and L3 uncached
+ LSC_STCC_L1UC_L3WB, // Override to L1 uncached and L3 written back
+ LSC_STCC_L1WT_L3UC, // Override to L1 written through and L3 uncached
+ LSC_STCC_L1WT_L3WB, // Override to L1 written through and L3 written back
+ LSC_STCC_L1S_L3UC, // Override to L1 streaming and L3 uncached
+ LSC_STCC_L1S_L3WB, // Override to L1 streaming and L3 written back
+ LSC_STCC_L1WB_L3WB, // Override to L1 written back and L3 written back
+};
+
+// LSC Loads
+
+// Global address space
+uint __builtin_IB_lsc_load_global_uchar_to_uint (const __global uchar *base, int immElemOff, enum LSC_LDCC cacheOpt); //D8U32
+uint __builtin_IB_lsc_load_global_ushort_to_uint(const __global ushort *base, int immElemOff, enum LSC_LDCC cacheOpt); //D16U32
+uint __builtin_IB_lsc_load_global_uint (const __global uint *base, int immElemOff, enum LSC_LDCC cacheOpt); //D32V1
+uint2 __builtin_IB_lsc_load_global_uint2 (const __global uint2 *base, int immElemOff, enum LSC_LDCC cacheOpt); //D32V2
+uint3 __builtin_IB_lsc_load_global_uint3 (const __global uint3 *base, int immElemOff, enum LSC_LDCC cacheOpt); //D32V3
+uint4 __builtin_IB_lsc_load_global_uint4 (const __global uint4 *base, int immElemOff, enum LSC_LDCC cacheOpt); //D32V4
+uint8 __builtin_IB_lsc_load_global_uint8 (const __global uint8 *base, int immElemOff, enum LSC_LDCC cacheOpt); //D32V8
+ulong __builtin_IB_lsc_load_global_ulong (const __global ulong *base, int immElemOff, enum LSC_LDCC cacheOpt); //D64V1
+ulong2 __builtin_IB_lsc_load_global_ulong2(const __global ulong2 *base, int immElemOff, enum LSC_LDCC cacheOpt); //D64V2
+ulong3 __builtin_IB_lsc_load_global_ulong3(const __global ulong3 *base, int immElemOff, enum LSC_LDCC cacheOpt); //D64V3
+ulong4 __builtin_IB_lsc_load_global_ulong4(const __global ulong4 *base, int immElemOff, enum LSC_LDCC cacheOpt); //D64V4
+ulong8 __builtin_IB_lsc_load_global_ulong8(const __global ulong8 *base, int immElemOff, enum LSC_LDCC cacheOpt); //D64V8
+
+// Local address space
+uint __builtin_IB_lsc_load_local_uchar_to_uint( const __local uchar *base, int immElemOff); //D8U32
+uint __builtin_IB_lsc_load_local_ushort_to_uint(const __local ushort *base, int immElemOff); //D16U32
+uint __builtin_IB_lsc_load_local_uint (const __local uint *base, int immElemOff); //D32V1
+uint2 __builtin_IB_lsc_load_local_uint2 (const __local uint2 *base, int immElemOff); //D32V2
+uint3 __builtin_IB_lsc_load_local_uint3 (const __local uint3 *base, int immElemOff); //D32V3
+uint4 __builtin_IB_lsc_load_local_uint4 (const __local uint4 *base, int immElemOff); //D32V4
+uint8 __builtin_IB_lsc_load_local_uint8 (const __local uint8 *base, int immElemOff); //D32V8
+ulong __builtin_IB_lsc_load_local_ulong (const __local ulong *base, int immElemOff); //D64V1
+ulong2 __builtin_IB_lsc_load_local_ulong2(const __local ulong2 *base, int immElemOff); //D64V2
+ulong3 __builtin_IB_lsc_load_local_ulong3(const __local ulong3 *base, int immElemOff); //D64V3
+ulong4 __builtin_IB_lsc_load_local_ulong4(const __local ulong4 *base, int immElemOff); //D64V4
+ulong8 __builtin_IB_lsc_load_local_ulong8(const __local ulong8 *base, int immElemOff); //D64V8
+
+// LSC Stores
+
+// Global address space
+void __builtin_IB_lsc_store_global_uchar_from_uint (__global uchar *base, int immElemOff, uint val, enum LSC_STCC cacheOpt); //D8U32
+void __builtin_IB_lsc_store_global_ushort_from_uint(__global ushort *base, int immElemOff, uint val, enum LSC_STCC cacheOpt); //D16U32
+void __builtin_IB_lsc_store_global_uint (__global uint *base, int immElemOff, uint val, enum LSC_STCC cacheOpt); //D32V1
+void __builtin_IB_lsc_store_global_uint2 (__global uint2 *base, int immElemOff, uint2 val, enum LSC_STCC cacheOpt); //D32V2
+void __builtin_IB_lsc_store_global_uint3 (__global uint3 *base, int immElemOff, uint3 val, enum LSC_STCC cacheOpt); //D32V3
+void __builtin_IB_lsc_store_global_uint4 (__global uint4 *base, int immElemOff, uint4 val, enum LSC_STCC cacheOpt); //D32V4
+void __builtin_IB_lsc_store_global_uint8 (__global uint8 *base, int immElemOff, uint8 val, enum LSC_STCC cacheOpt); //D32V8
+void __builtin_IB_lsc_store_global_ulong (__global ulong *base, int immElemOff, ulong val, enum LSC_STCC cacheOpt); //D64V1
+void __builtin_IB_lsc_store_global_ulong2(__global ulong2 *base, int immElemOff, ulong2 val, enum LSC_STCC cacheOpt); //D64V2
+void __builtin_IB_lsc_store_global_ulong3(__global ulong3 *base, int immElemOff, ulong3 val, enum LSC_STCC cacheOpt); //D64V3
+void __builtin_IB_lsc_store_global_ulong4(__global ulong4 *base, int immElemOff, ulong4 val, enum LSC_STCC cacheOpt); //D64V4
+void __builtin_IB_lsc_store_global_ulong8(__global ulong8 *base, int immElemOff, ulong8 val, enum LSC_STCC cacheOpt); //D64V8
+
+// Local address space
+void __builtin_IB_lsc_store_local_uchar_from_uint (__local uchar *base, int immElemOff, uint val); //D8U32
+void __builtin_IB_lsc_store_local_ushort_from_uint(__local ushort *base, int immElemOff, uint val); //D16U32
+void __builtin_IB_lsc_store_local_uint (__local uint *base, int immElemOff, uint val); //D32V1
+void __builtin_IB_lsc_store_local_uint2 (__local uint2 *base, int immElemOff, uint2 val); //D32V2
+void __builtin_IB_lsc_store_local_uint3 (__local uint3 *base, int immElemOff, uint3 val); //D32V3
+void __builtin_IB_lsc_store_local_uint4 (__local uint4 *base, int immElemOff, uint4 val); //D32V4
+void __builtin_IB_lsc_store_local_uint8 (__local uint8 *base, int immElemOff, uint8 val); //D32V8
+void __builtin_IB_lsc_store_local_ulong (__local ulong *base, int immElemOff, ulong val); //D64V1
+void __builtin_IB_lsc_store_local_ulong2(__local ulong2 *base, int immElemOff, ulong2 val); //D64V2
+void __builtin_IB_lsc_store_local_ulong3(__local ulong3 *base, int immElemOff, ulong3 val); //D64V3
+void __builtin_IB_lsc_store_local_ulong4(__local ulong4 *base, int immElemOff, ulong4 val); //D64V4
+void __builtin_IB_lsc_store_local_ulong8(__local ulong8 *base, int immElemOff, ulong8 val); //D64V8
+
+// LSC prefetching
+
+// LSC Pre-Fetch Load functions with CacheControls
+// Global address space
+void __builtin_IB_lsc_prefetch_global_uchar (const __global uchar *base, int immElemOff, enum LSC_LDCC cacheOpt); //D8U32
+void __builtin_IB_lsc_prefetch_global_ushort(const __global ushort *base, int immElemOff, enum LSC_LDCC cacheOpt); //D16U32
+void __builtin_IB_lsc_prefetch_global_uint (const __global uint *base, int immElemOff, enum LSC_LDCC cacheOpt); //D32V1
+void __builtin_IB_lsc_prefetch_global_uint2 (const __global uint2 *base, int immElemOff, enum LSC_LDCC cacheOpt); //D32V2
+void __builtin_IB_lsc_prefetch_global_uint3 (const __global uint3 *base, int immElemOff, enum LSC_LDCC cacheOpt); //D32V3
+void __builtin_IB_lsc_prefetch_global_uint4 (const __global uint4 *base, int immElemOff, enum LSC_LDCC cacheOpt); //D32V4
+void __builtin_IB_lsc_prefetch_global_uint8 (const __global uint8 *base, int immElemOff, enum LSC_LDCC cacheOpt); //D32V8
+void __builtin_IB_lsc_prefetch_global_ulong (const __global ulong *base, int immElemOff, enum LSC_LDCC cacheOpt); //D64V1
+void __builtin_IB_lsc_prefetch_global_ulong2(const __global ulong2 *base, int immElemOff, enum LSC_LDCC cacheOpt); //D64V2
+void __builtin_IB_lsc_prefetch_global_ulong3(const __global ulong3 *base, int immElemOff, enum LSC_LDCC cacheOpt); //D64V3
+void __builtin_IB_lsc_prefetch_global_ulong4(const __global ulong4 *base, int immElemOff, enum LSC_LDCC cacheOpt); //D64V4
+void __builtin_IB_lsc_prefetch_global_ulong8(const __global ulong8 *base, int immElemOff, enum LSC_LDCC cacheOpt); //D64V8
+
+// LSC Fence support
+
+// FS - Fence Scope
+enum LSC_FS {
+ LSC_FS_THREAD_GROUP,
+ LSC_FS_LOCAL,
+ LSC_FS_TILE,
+ LSC_FS_GPU,
+ LSC_FS_GPUs,
+ LSC_FS_SYSTEM_RELEASE,
+ LSC_FS_SYSTEM_ACQUIRE
+};
+
+// FT - Fence Type
+enum LSC_FT {
+ LSC_FT_DEFAULT,
+ LSC_FT_EVICT,
+ LSC_FT_INVALIDATE,
+ LSC_FT_DISCARD,
+ LSC_FT_CLEAN,
+ LSC_FT_L3
+};
+
+// LSC Fence functions
+void __builtin_IB_lsc_fence_global_untyped(enum LSC_FS scope, enum LSC_FT flushType); // Mem Port - UGM
+void __builtin_IB_lsc_fence_global_untyped_cross_tile(enum LSC_FS scope, enum LSC_FT flushType); // Mem Port - UGML
+void __builtin_IB_lsc_fence_global_typed(enum LSC_FS scope, enum LSC_FT flushType); // Mem Port - TGM
+void __builtin_IB_lsc_fence_local(); // Mem Port - SLM
+
+// Exported functions
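+// Naming convention: <op>_<type>_<L1 policy>_<L3 policy>; the suffix selects
+// the LSC_LDCC/LSC_STCC cache-control value passed to the corresponding
+// builtin, e.g. load_uint4_L1C_L3UC loads a uint4 with L1 cached, L3 uncached.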
+
+// LSC Loads
+// uchar
+uint load_uchar_to_uint_L1UC_L3UC(global uchar* it, int offset)
+{
+ return __builtin_IB_lsc_load_global_uchar_to_uint(it, offset, LSC_LDCC_L1UC_L3UC);
+}
+
+uint load_uchar_to_uint_L1UC_L3C(global uchar* it, int offset)
+{
+ return __builtin_IB_lsc_load_global_uchar_to_uint(it, offset, LSC_LDCC_L1UC_L3C);
+}
+
+uint load_uchar_to_uint_L1C_L3UC(global uchar* it, int offset)
+{
+ return __builtin_IB_lsc_load_global_uchar_to_uint(it, offset, LSC_LDCC_L1C_L3UC);
+}
+
+uint load_uchar_to_uint_L1C_L3C(global uchar* it, int offset)
+{
+ return __builtin_IB_lsc_load_global_uchar_to_uint(it, offset, LSC_LDCC_L1C_L3C);
+}
+
+uint load_uchar_to_uint_L1S_L3UC(global uchar* it, int offset)
+{
+ return __builtin_IB_lsc_load_global_uchar_to_uint(it, offset, LSC_LDCC_L1S_L3UC);
+}
+
+uint load_uchar_to_uint_L1S_L3C(global uchar* it, int offset)
+{
+ return __builtin_IB_lsc_load_global_uchar_to_uint(it, offset, LSC_LDCC_L1S_L3C);
+}
+
+uint load_uchar_to_uint_L1IAR_L3C(global uchar* it, int offset)
+{
+ return __builtin_IB_lsc_load_global_uchar_to_uint(it, offset, LSC_LDCC_L1IAR_L3C);
+}
+
+// ushort
+uint load_ushort_to_uint_L1UC_L3UC(global ushort* it, int offset)
+{
+ return __builtin_IB_lsc_load_global_ushort_to_uint(it, offset, LSC_LDCC_L1UC_L3UC);
+}
+
+uint load_ushort_to_uint_L1UC_L3C(global ushort* it, int offset)
+{
+ return __builtin_IB_lsc_load_global_ushort_to_uint(it, offset, LSC_LDCC_L1UC_L3C);
+}
+
+uint load_ushort_to_uint_L1C_L3UC(global ushort* it, int offset)
+{
+ return __builtin_IB_lsc_load_global_ushort_to_uint(it, offset, LSC_LDCC_L1C_L3UC);
+}
+
+uint load_ushort_to_uint_L1C_L3C(global ushort* it, int offset)
+{
+ return __builtin_IB_lsc_load_global_ushort_to_uint(it, offset, LSC_LDCC_L1C_L3C);
+}
+
+uint load_ushort_to_uint_L1S_L3UC(global ushort* it, int offset)
+{
+ return __builtin_IB_lsc_load_global_ushort_to_uint(it, offset, LSC_LDCC_L1S_L3UC);
+}
+
+uint load_ushort_to_uint_L1S_L3C(global ushort* it, int offset)
+{
+ return __builtin_IB_lsc_load_global_ushort_to_uint(it, offset, LSC_LDCC_L1S_L3C);
+}
+
+uint load_ushort_to_uint_L1IAR_L3C(global ushort* it, int offset)
+{
+ return __builtin_IB_lsc_load_global_ushort_to_uint(it, offset, LSC_LDCC_L1IAR_L3C);
+}
+
+// uint
+uint load_uint_L1UC_L3UC(global uint* it, int offset)
+{
+ return __builtin_IB_lsc_load_global_uint(it, offset, LSC_LDCC_L1UC_L3UC);
+}
+
+uint load_uint_L1UC_L3C(global uint* it, int offset)
+{
+ return __builtin_IB_lsc_load_global_uint(it, offset, LSC_LDCC_L1UC_L3C);
+}
+
+uint load_uint_L1C_L3UC(global uint* it, int offset)
+{
+ return __builtin_IB_lsc_load_global_uint(it, offset, LSC_LDCC_L1C_L3UC);
+}
+
+uint load_uint_L1C_L3C(global uint* it, int offset)
+{
+ return __builtin_IB_lsc_load_global_uint(it, offset, LSC_LDCC_L1C_L3C);
+}
+
+uint load_uint_L1S_L3UC(global uint* it, int offset)
+{
+ return __builtin_IB_lsc_load_global_uint(it, offset, LSC_LDCC_L1S_L3UC);
+}
+
+uint load_uint_L1S_L3C(global uint* it, int offset)
+{
+ return __builtin_IB_lsc_load_global_uint(it, offset, LSC_LDCC_L1S_L3C);
+}
+
+uint load_uint_L1IAR_L3C(global uint* it, int offset)
+{
+ return __builtin_IB_lsc_load_global_uint(it, offset, LSC_LDCC_L1IAR_L3C);
+}
+
+// uint2
+uint2 load_uint2_L1UC_L3UC(global uint2* it, int offset)
+{
+ return __builtin_IB_lsc_load_global_uint2(it, offset, LSC_LDCC_L1UC_L3UC);
+}
+
+uint2 load_uint2_L1UC_L3C(global uint2* it, int offset)
+{
+ return __builtin_IB_lsc_load_global_uint2(it, offset, LSC_LDCC_L1UC_L3C);
+}
+
+uint2 load_uint2_L1C_L3UC(global uint2* it, int offset)
+{
+ return __builtin_IB_lsc_load_global_uint2(it, offset, LSC_LDCC_L1C_L3UC);
+}
+
+uint2 load_uint2_L1C_L3C(global uint2* it, int offset)
+{
+ return __builtin_IB_lsc_load_global_uint2(it, offset, LSC_LDCC_L1C_L3C);
+}
+
+uint2 load_uint2_L1S_L3UC(global uint2* it, int offset)
+{
+ return __builtin_IB_lsc_load_global_uint2(it, offset, LSC_LDCC_L1S_L3UC);
+}
+
+uint2 load_uint2_L1S_L3C(global uint2* it, int offset)
+{
+ return __builtin_IB_lsc_load_global_uint2(it, offset, LSC_LDCC_L1S_L3C);
+}
+
+uint2 load_uint2_L1IAR_L3C(global uint2* it, int offset)
+{
+ return __builtin_IB_lsc_load_global_uint2(it, offset, LSC_LDCC_L1IAR_L3C);
+}
+
+// uint3
+uint3 load_uint3_L1UC_L3UC(global uint3* it, int offset)
+{
+ return __builtin_IB_lsc_load_global_uint3(it, offset, LSC_LDCC_L1UC_L3UC);
+}
+
+uint3 load_uint3_L1UC_L3C(global uint3* it, int offset)
+{
+ return __builtin_IB_lsc_load_global_uint3(it, offset, LSC_LDCC_L1UC_L3C);
+}
+
+uint3 load_uint3_L1C_L3UC(global uint3* it, int offset)
+{
+ return __builtin_IB_lsc_load_global_uint3(it, offset, LSC_LDCC_L1C_L3UC);
+}
+
+uint3 load_uint3_L1C_L3C(global uint3* it, int offset)
+{
+ return __builtin_IB_lsc_load_global_uint3(it, offset, LSC_LDCC_L1C_L3C);
+}
+
+uint3 load_uint3_L1S_L3UC(global uint3* it, int offset)
+{
+ return __builtin_IB_lsc_load_global_uint3(it, offset, LSC_LDCC_L1S_L3UC);
+}
+
+uint3 load_uint3_L1S_L3C(global uint3* it, int offset)
+{
+ return __builtin_IB_lsc_load_global_uint3(it, offset, LSC_LDCC_L1S_L3C);
+}
+
+uint3 load_uint3_L1IAR_L3C(global uint3* it, int offset)
+{
+ return __builtin_IB_lsc_load_global_uint3(it, offset, LSC_LDCC_L1IAR_L3C);
+}
+
+// uint4
+uint4 load_uint4_L1UC_L3UC(global uint4* it, int offset)
+{
+ return __builtin_IB_lsc_load_global_uint4(it, offset, LSC_LDCC_L1UC_L3UC);
+}
+
+uint4 load_uint4_L1UC_L3C(global uint4* it, int offset)
+{
+ return __builtin_IB_lsc_load_global_uint4(it, offset, LSC_LDCC_L1UC_L3C);
+}
+
+uint4 load_uint4_L1C_L3UC(global uint4* it, int offset)
+{
+ return __builtin_IB_lsc_load_global_uint4(it, offset, LSC_LDCC_L1C_L3UC);
+}
+
+uint4 load_uint4_L1C_L3C(global uint4* it, int offset)
+{
+ return __builtin_IB_lsc_load_global_uint4(it, offset, LSC_LDCC_L1C_L3C);
+}
+
+uint4 load_uint4_L1S_L3UC(global uint4* it, int offset)
+{
+ return __builtin_IB_lsc_load_global_uint4(it, offset, LSC_LDCC_L1S_L3UC);
+}
+
+uint4 load_uint4_L1S_L3C(global uint4* it, int offset)
+{
+ return __builtin_IB_lsc_load_global_uint4(it, offset, LSC_LDCC_L1S_L3C);
+}
+
+uint4 load_uint4_L1IAR_L3C(global uint4* it, int offset)
+{
+ return __builtin_IB_lsc_load_global_uint4(it, offset, LSC_LDCC_L1IAR_L3C);
+}
+
+// uint8
+uint8 load_uint8_L1UC_L3UC(global uint8* it, int offset)
+{
+ return __builtin_IB_lsc_load_global_uint8(it, offset, LSC_LDCC_L1UC_L3UC);
+}
+
+uint8 load_uint8_L1UC_L3C(global uint8* it, int offset)
+{
+ return __builtin_IB_lsc_load_global_uint8(it, offset, LSC_LDCC_L1UC_L3C);
+}
+
+uint8 load_uint8_L1C_L3UC(global uint8* it, int offset)
+{
+ return __builtin_IB_lsc_load_global_uint8(it, offset, LSC_LDCC_L1C_L3UC);
+}
+
+uint8 load_uint8_L1C_L3C(global uint8* it, int offset)
+{
+ return __builtin_IB_lsc_load_global_uint8(it, offset, LSC_LDCC_L1C_L3C);
+}
+
+uint8 load_uint8_L1S_L3UC(global uint8* it, int offset)
+{
+ return __builtin_IB_lsc_load_global_uint8(it, offset, LSC_LDCC_L1S_L3UC);
+}
+
+uint8 load_uint8_L1S_L3C(global uint8* it, int offset)
+{
+ return __builtin_IB_lsc_load_global_uint8(it, offset, LSC_LDCC_L1S_L3C);
+}
+
+uint8 load_uint8_L1IAR_L3C(global uint8* it, int offset)
+{
+ return __builtin_IB_lsc_load_global_uint8(it, offset, LSC_LDCC_L1IAR_L3C);
+}
+
+// ulong
+ulong load_ulong_L1UC_L3UC(global ulong* it, int offset)
+{
+ return __builtin_IB_lsc_load_global_ulong(it, offset, LSC_LDCC_L1UC_L3UC);
+}
+
+ulong load_ulong_L1UC_L3C(global ulong* it, int offset)
+{
+ return __builtin_IB_lsc_load_global_ulong(it, offset, LSC_LDCC_L1UC_L3C);
+}
+
+ulong load_ulong_L1C_L3UC(global ulong* it, int offset)
+{
+ return __builtin_IB_lsc_load_global_ulong(it, offset, LSC_LDCC_L1C_L3UC);
+}
+
+ulong load_ulong_L1C_L3C(global ulong* it, int offset)
+{
+ return __builtin_IB_lsc_load_global_ulong(it, offset, LSC_LDCC_L1C_L3C);
+}
+
+ulong load_ulong_L1S_L3UC(global ulong* it, int offset)
+{
+ return __builtin_IB_lsc_load_global_ulong(it, offset, LSC_LDCC_L1S_L3UC);
+}
+
+ulong load_ulong_L1S_L3C(global ulong* it, int offset)
+{
+ return __builtin_IB_lsc_load_global_ulong(it, offset, LSC_LDCC_L1S_L3C);
+}
+
+ulong load_ulong_L1IAR_L3C(global ulong* it, int offset)
+{
+ return __builtin_IB_lsc_load_global_ulong(it, offset, LSC_LDCC_L1IAR_L3C);
+}
+
+// ulong2
+ulong2 load_ulong2_L1UC_L3UC(global ulong2* it, int offset)
+{
+ return __builtin_IB_lsc_load_global_ulong2(it, offset, LSC_LDCC_L1UC_L3UC);
+}
+
+ulong2 load_ulong2_L1UC_L3C(global ulong2* it, int offset)
+{
+ return __builtin_IB_lsc_load_global_ulong2(it, offset, LSC_LDCC_L1UC_L3C);
+}
+
+ulong2 load_ulong2_L1C_L3UC(global ulong2* it, int offset)
+{
+ return __builtin_IB_lsc_load_global_ulong2(it, offset, LSC_LDCC_L1C_L3UC);
+}
+
+ulong2 load_ulong2_L1C_L3C(global ulong2* it, int offset)
+{
+ return __builtin_IB_lsc_load_global_ulong2(it, offset, LSC_LDCC_L1C_L3C);
+}
+
+ulong2 load_ulong2_L1S_L3UC(global ulong2* it, int offset)
+{
+ return __builtin_IB_lsc_load_global_ulong2(it, offset, LSC_LDCC_L1S_L3UC);
+}
+
+ulong2 load_ulong2_L1S_L3C(global ulong2* it, int offset)
+{
+ return __builtin_IB_lsc_load_global_ulong2(it, offset, LSC_LDCC_L1S_L3C);
+}
+
+ulong2 load_ulong2_L1IAR_L3C(global ulong2* it, int offset)
+{
+ return __builtin_IB_lsc_load_global_ulong2(it, offset, LSC_LDCC_L1IAR_L3C);
+}
+
+// ulong3
+ulong3 load_ulong3_L1UC_L3UC(global ulong3* it, int offset)
+{
+ return __builtin_IB_lsc_load_global_ulong3(it, offset, LSC_LDCC_L1UC_L3UC);
+}
+
+ulong3 load_ulong3_L1UC_L3C(global ulong3* it, int offset)
+{
+ return __builtin_IB_lsc_load_global_ulong3(it, offset, LSC_LDCC_L1UC_L3C);
+}
+
+ulong3 load_ulong3_L1C_L3UC(global ulong3* it, int offset)
+{
+ return __builtin_IB_lsc_load_global_ulong3(it, offset, LSC_LDCC_L1C_L3UC);
+}
+
+ulong3 load_ulong3_L1C_L3C(global ulong3* it, int offset)
+{
+ return __builtin_IB_lsc_load_global_ulong3(it, offset, LSC_LDCC_L1C_L3C);
+}
+
+ulong3 load_ulong3_L1S_L3UC(global ulong3* it, int offset)
+{
+ return __builtin_IB_lsc_load_global_ulong3(it, offset, LSC_LDCC_L1S_L3UC);
+}
+
+ulong3 load_ulong3_L1S_L3C(global ulong3* it, int offset)
+{
+ return __builtin_IB_lsc_load_global_ulong3(it, offset, LSC_LDCC_L1S_L3C);
+}
+
+ulong3 load_ulong3_L1IAR_L3C(global ulong3* it, int offset)
+{
+ return __builtin_IB_lsc_load_global_ulong3(it, offset, LSC_LDCC_L1IAR_L3C);
+}
+
+// ulong4
+ulong4 load_ulong4_L1UC_L3UC(global ulong4* it, int offset)
+{
+ return __builtin_IB_lsc_load_global_ulong4(it, offset, LSC_LDCC_L1UC_L3UC);
+}
+
+ulong4 load_ulong4_L1UC_L3C(global ulong4* it, int offset)
+{
+ return __builtin_IB_lsc_load_global_ulong4(it, offset, LSC_LDCC_L1UC_L3C);
+}
+
+ulong4 load_ulong4_L1C_L3UC(global ulong4* it, int offset)
+{
+ return __builtin_IB_lsc_load_global_ulong4(it, offset, LSC_LDCC_L1C_L3UC);
+}
+
+ulong4 load_ulong4_L1C_L3C(global ulong4* it, int offset)
+{
+ return __builtin_IB_lsc_load_global_ulong4(it, offset, LSC_LDCC_L1C_L3C);
+}
+
+ulong4 load_ulong4_L1S_L3UC(global ulong4* it, int offset)
+{
+ return __builtin_IB_lsc_load_global_ulong4(it, offset, LSC_LDCC_L1S_L3UC);
+}
+
+ulong4 load_ulong4_L1S_L3C(global ulong4* it, int offset)
+{
+ return __builtin_IB_lsc_load_global_ulong4(it, offset, LSC_LDCC_L1S_L3C);
+}
+
+ulong4 load_ulong4_L1IAR_L3C(global ulong4* it, int offset)
+{
+ return __builtin_IB_lsc_load_global_ulong4(it, offset, LSC_LDCC_L1IAR_L3C);
+}
+
+// ulong8
+ulong8 load_ulong8_L1UC_L3UC(global ulong8* it, int offset)
+{
+ return __builtin_IB_lsc_load_global_ulong8(it, offset, LSC_LDCC_L1UC_L3UC);
+}
+
+ulong8 load_ulong8_L1UC_L3C(global ulong8* it, int offset)
+{
+ return __builtin_IB_lsc_load_global_ulong8(it, offset, LSC_LDCC_L1UC_L3C);
+}
+
+ulong8 load_ulong8_L1C_L3UC(global ulong8* it, int offset)
+{
+ return __builtin_IB_lsc_load_global_ulong8(it, offset, LSC_LDCC_L1C_L3UC);
+}
+
+ulong8 load_ulong8_L1C_L3C(global ulong8* it, int offset)
+{
+ return __builtin_IB_lsc_load_global_ulong8(it, offset, LSC_LDCC_L1C_L3C);
+}
+
+ulong8 load_ulong8_L1S_L3UC(global ulong8* it, int offset)
+{
+ return __builtin_IB_lsc_load_global_ulong8(it, offset, LSC_LDCC_L1S_L3UC);
+}
+
+ulong8 load_ulong8_L1S_L3C(global ulong8* it, int offset)
+{
+ return __builtin_IB_lsc_load_global_ulong8(it, offset, LSC_LDCC_L1S_L3C);
+}
+
+ulong8 load_ulong8_L1IAR_L3C(global ulong8* it, int offset)
+{
+ return __builtin_IB_lsc_load_global_ulong8(it, offset, LSC_LDCC_L1IAR_L3C);
+}
+
+// LSC Stores
+// uchar
+void store_uchar_from_uint_L1UC_L3UC(global uchar* it, int offset, uint value)
+{
+ __builtin_IB_lsc_store_global_uchar_from_uint(it, offset, value, LSC_STCC_L1UC_L3UC);
+}
+
+void store_uchar_from_uint_L1UC_L3WB(global uchar* it, int offset, uint value)
+{
+ __builtin_IB_lsc_store_global_uchar_from_uint(it, offset, value, LSC_STCC_L1UC_L3WB);
+}
+
+void store_uchar_from_uint_L1WT_L3UC(global uchar* it, int offset, uint value)
+{
+ __builtin_IB_lsc_store_global_uchar_from_uint(it, offset, value, LSC_STCC_L1WT_L3UC);
+}
+
+void store_uchar_from_uint_L1WT_L3WB(global uchar* it, int offset, uint value)
+{
+ __builtin_IB_lsc_store_global_uchar_from_uint(it, offset, value, LSC_STCC_L1WT_L3WB);
+}
+
+void store_uchar_from_uint_L1S_L3UC(global uchar* it, int offset, uint value)
+{
+ __builtin_IB_lsc_store_global_uchar_from_uint(it, offset, value, LSC_STCC_L1S_L3UC);
+}
+
+void store_uchar_from_uint_L1S_L3WB(global uchar* it, int offset, uint value)
+{
+ __builtin_IB_lsc_store_global_uchar_from_uint(it, offset, value, LSC_STCC_L1S_L3WB);
+}
+
+void store_uchar_from_uint_L1WB_L3WB(global uchar* it, int offset, uint value)
+{
+ __builtin_IB_lsc_store_global_uchar_from_uint(it, offset, value, LSC_STCC_L1WB_L3WB);
+}
+
+// ushort
+void store_ushort_from_uint_L1UC_L3UC(global ushort* it, int offset, uint value)
+{
+ __builtin_IB_lsc_store_global_ushort_from_uint(it, offset, value, LSC_STCC_L1UC_L3UC);
+}
+
+void store_ushort_from_uint_L1UC_L3WB(global ushort* it, int offset, uint value)
+{
+ __builtin_IB_lsc_store_global_ushort_from_uint(it, offset, value, LSC_STCC_L1UC_L3WB);
+}
+
+void store_ushort_from_uint_L1WT_L3UC(global ushort* it, int offset, uint value)
+{
+ __builtin_IB_lsc_store_global_ushort_from_uint(it, offset, value, LSC_STCC_L1WT_L3UC);
+}
+
+void store_ushort_from_uint_L1WT_L3WB(global ushort* it, int offset, uint value)
+{
+ __builtin_IB_lsc_store_global_ushort_from_uint(it, offset, value, LSC_STCC_L1WT_L3WB);
+}
+
+void store_ushort_from_uint_L1S_L3UC(global ushort* it, int offset, uint value)
+{
+ __builtin_IB_lsc_store_global_ushort_from_uint(it, offset, value, LSC_STCC_L1S_L3UC);
+}
+
+void store_ushort_from_uint_L1S_L3WB(global ushort* it, int offset, uint value)
+{
+ __builtin_IB_lsc_store_global_ushort_from_uint(it, offset, value, LSC_STCC_L1S_L3WB);
+}
+
+void store_ushort_from_uint_L1WB_L3WB(global ushort* it, int offset, uint value)
+{
+ __builtin_IB_lsc_store_global_ushort_from_uint(it, offset, value, LSC_STCC_L1WB_L3WB);
+}
+
+// uint
+void store_uint_L1UC_L3UC(global uint* it, int offset, uint value)
+{
+ __builtin_IB_lsc_store_global_uint(it, offset, value, LSC_STCC_L1UC_L3UC);
+}
+
+void store_uint_L1UC_L3WB(global uint* it, int offset, uint value)
+{
+ __builtin_IB_lsc_store_global_uint(it, offset, value, LSC_STCC_L1UC_L3WB);
+}
+
+void store_uint_L1WT_L3UC(global uint* it, int offset, uint value)
+{
+ __builtin_IB_lsc_store_global_uint(it, offset, value, LSC_STCC_L1WT_L3UC);
+}
+
+void store_uint_L1WT_L3WB(global uint* it, int offset, uint value)
+{
+ __builtin_IB_lsc_store_global_uint(it, offset, value, LSC_STCC_L1WT_L3WB);
+}
+
+void store_uint_L1S_L3UC(global uint* it, int offset, uint value)
+{
+ __builtin_IB_lsc_store_global_uint(it, offset, value, LSC_STCC_L1S_L3UC);
+}
+
+void store_uint_L1S_L3WB(global uint* it, int offset, uint value)
+{
+ __builtin_IB_lsc_store_global_uint(it, offset, value, LSC_STCC_L1S_L3WB);
+}
+
+void store_uint_L1WB_L3WB(global uint* it, int offset, uint value)
+{
+ __builtin_IB_lsc_store_global_uint(it, offset, value, LSC_STCC_L1WB_L3WB);
+}
+
+// uint2
+void store_uint2_L1UC_L3UC(global uint2* it, int offset, uint2 value)
+{
+ __builtin_IB_lsc_store_global_uint2(it, offset, value, LSC_STCC_L1UC_L3UC);
+}
+
+void store_uint2_L1UC_L3WB(global uint2* it, int offset, uint2 value)
+{
+ __builtin_IB_lsc_store_global_uint2(it, offset, value, LSC_STCC_L1UC_L3WB);
+}
+
+void store_uint2_L1WT_L3UC(global uint2* it, int offset, uint2 value)
+{
+ __builtin_IB_lsc_store_global_uint2(it, offset, value, LSC_STCC_L1WT_L3UC);
+}
+
+void store_uint2_L1WT_L3WB(global uint2* it, int offset, uint2 value)
+{
+ __builtin_IB_lsc_store_global_uint2(it, offset, value, LSC_STCC_L1WT_L3WB);
+}
+
+void store_uint2_L1S_L3UC(global uint2* it, int offset, uint2 value)
+{
+ __builtin_IB_lsc_store_global_uint2(it, offset, value, LSC_STCC_L1S_L3UC);
+}
+
+void store_uint2_L1S_L3WB(global uint2* it, int offset, uint2 value)
+{
+ __builtin_IB_lsc_store_global_uint2(it, offset, value, LSC_STCC_L1S_L3WB);
+}
+
+void store_uint2_L1WB_L3WB(global uint2* it, int offset, uint2 value)
+{
+ __builtin_IB_lsc_store_global_uint2(it, offset, value, LSC_STCC_L1WB_L3WB);
+}
+
+// uint3
+void store_uint3_L1UC_L3UC(global uint3* it, int offset, uint3 value)
+{
+ __builtin_IB_lsc_store_global_uint3(it, offset, value, LSC_STCC_L1UC_L3UC);
+}
+
+void store_uint3_L1UC_L3WB(global uint3* it, int offset, uint3 value)
+{
+ __builtin_IB_lsc_store_global_uint3(it, offset, value, LSC_STCC_L1UC_L3WB);
+}
+
+void store_uint3_L1WT_L3UC(global uint3* it, int offset, uint3 value)
+{
+ __builtin_IB_lsc_store_global_uint3(it, offset, value, LSC_STCC_L1WT_L3UC);
+}
+
+void store_uint3_L1WT_L3WB(global uint3* it, int offset, uint3 value)
+{
+ __builtin_IB_lsc_store_global_uint3(it, offset, value, LSC_STCC_L1WT_L3WB);
+}
+
+void store_uint3_L1S_L3UC(global uint3* it, int offset, uint3 value)
+{
+ __builtin_IB_lsc_store_global_uint3(it, offset, value, LSC_STCC_L1S_L3UC);
+}
+
+void store_uint3_L1S_L3WB(global uint3* it, int offset, uint3 value)
+{
+ __builtin_IB_lsc_store_global_uint3(it, offset, value, LSC_STCC_L1S_L3WB);
+}
+
+void store_uint3_L1WB_L3WB(global uint3* it, int offset, uint3 value)
+{
+ __builtin_IB_lsc_store_global_uint3(it, offset, value, LSC_STCC_L1WB_L3WB);
+}
+
+// uint4
+void store_uint4_L1UC_L3UC(global uint4* it, int offset, uint4 value)
+{
+ __builtin_IB_lsc_store_global_uint4(it, offset, value, LSC_STCC_L1UC_L3UC);
+}
+
+void store_uint4_L1UC_L3WB(global uint4* it, int offset, uint4 value)
+{
+ __builtin_IB_lsc_store_global_uint4(it, offset, value, LSC_STCC_L1UC_L3WB);
+}
+
+void store_uint4_L1WT_L3UC(global uint4* it, int offset, uint4 value)
+{
+ __builtin_IB_lsc_store_global_uint4(it, offset, value, LSC_STCC_L1WT_L3UC);
+}
+
+void store_uint4_L1WT_L3WB(global uint4* it, int offset, uint4 value)
+{
+ __builtin_IB_lsc_store_global_uint4(it, offset, value, LSC_STCC_L1WT_L3WB);
+}
+
+void store_uint4_L1S_L3UC(global uint4* it, int offset, uint4 value)
+{
+ __builtin_IB_lsc_store_global_uint4(it, offset, value, LSC_STCC_L1S_L3UC);
+}
+
+void store_uint4_L1S_L3WB(global uint4* it, int offset, uint4 value)
+{
+ __builtin_IB_lsc_store_global_uint4(it, offset, value, LSC_STCC_L1S_L3WB);
+}
+
+void store_uint4_L1WB_L3WB(global uint4* it, int offset, uint4 value)
+{
+ __builtin_IB_lsc_store_global_uint4(it, offset, value, LSC_STCC_L1WB_L3WB);
+}
+
+// uint8
+void store_uint8_L1UC_L3UC(global uint8* it, int offset, uint8 value)
+{
+ __builtin_IB_lsc_store_global_uint8(it, offset, value, LSC_STCC_L1UC_L3UC);
+}
+
+void store_uint8_L1UC_L3WB(global uint8* it, int offset, uint8 value)
+{
+ __builtin_IB_lsc_store_global_uint8(it, offset, value, LSC_STCC_L1UC_L3WB);
+}
+
+void store_uint8_L1WT_L3UC(global uint8* it, int offset, uint8 value)
+{
+ __builtin_IB_lsc_store_global_uint8(it, offset, value, LSC_STCC_L1WT_L3UC);
+}
+
+void store_uint8_L1WT_L3WB(global uint8* it, int offset, uint8 value)
+{
+ __builtin_IB_lsc_store_global_uint8(it, offset, value, LSC_STCC_L1WT_L3WB);
+}
+
+void store_uint8_L1S_L3UC(global uint8* it, int offset, uint8 value)
+{
+ __builtin_IB_lsc_store_global_uint8(it, offset, value, LSC_STCC_L1S_L3UC);
+}
+
+void store_uint8_L1S_L3WB(global uint8* it, int offset, uint8 value)
+{
+ __builtin_IB_lsc_store_global_uint8(it, offset, value, LSC_STCC_L1S_L3WB);
+}
+
+void store_uint8_L1WB_L3WB(global uint8* it, int offset, uint8 value)
+{
+ __builtin_IB_lsc_store_global_uint8(it, offset, value, LSC_STCC_L1WB_L3WB);
+}
+
+// ulong
+void store_ulong_L1UC_L3UC(global ulong* it, int offset, ulong value)
+{
+ __builtin_IB_lsc_store_global_ulong(it, offset, value, LSC_STCC_L1UC_L3UC);
+}
+
+void store_ulong_L1UC_L3WB(global ulong* it, int offset, ulong value)
+{
+ __builtin_IB_lsc_store_global_ulong(it, offset, value, LSC_STCC_L1UC_L3WB);
+}
+
+void store_ulong_L1WT_L3UC(global ulong* it, int offset, ulong value)
+{
+ __builtin_IB_lsc_store_global_ulong(it, offset, value, LSC_STCC_L1WT_L3UC);
+}
+
+void store_ulong_L1WT_L3WB(global ulong* it, int offset, ulong value)
+{
+ __builtin_IB_lsc_store_global_ulong(it, offset, value, LSC_STCC_L1WT_L3WB);
+}
+
+void store_ulong_L1S_L3UC(global ulong* it, int offset, ulong value)
+{
+ __builtin_IB_lsc_store_global_ulong(it, offset, value, LSC_STCC_L1S_L3UC);
+}
+
+void store_ulong_L1S_L3WB(global ulong* it, int offset, ulong value)
+{
+ __builtin_IB_lsc_store_global_ulong(it, offset, value, LSC_STCC_L1S_L3WB);
+}
+
+void store_ulong_L1WB_L3WB(global ulong* it, int offset, ulong value)
+{
+ __builtin_IB_lsc_store_global_ulong(it, offset, value, LSC_STCC_L1WB_L3WB);
+}
+
+// ulong2
+void store_ulong2_L1UC_L3UC(global ulong2* it, int offset, ulong2 value)
+{
+ __builtin_IB_lsc_store_global_ulong2(it, offset, value, LSC_STCC_L1UC_L3UC);
+}
+
+void store_ulong2_L1UC_L3WB(global ulong2* it, int offset, ulong2 value)
+{
+ __builtin_IB_lsc_store_global_ulong2(it, offset, value, LSC_STCC_L1UC_L3WB);
+}
+
+void store_ulong2_L1WT_L3UC(global ulong2* it, int offset, ulong2 value)
+{
+ __builtin_IB_lsc_store_global_ulong2(it, offset, value, LSC_STCC_L1WT_L3UC);
+}
+
+void store_ulong2_L1WT_L3WB(global ulong2* it, int offset, ulong2 value)
+{
+ __builtin_IB_lsc_store_global_ulong2(it, offset, value, LSC_STCC_L1WT_L3WB);
+}
+
+void store_ulong2_L1S_L3UC(global ulong2* it, int offset, ulong2 value)
+{
+ __builtin_IB_lsc_store_global_ulong2(it, offset, value, LSC_STCC_L1S_L3UC);
+}
+
+void store_ulong2_L1S_L3WB(global ulong2* it, int offset, ulong2 value)
+{
+ __builtin_IB_lsc_store_global_ulong2(it, offset, value, LSC_STCC_L1S_L3WB);
+}
+
+void store_ulong2_L1WB_L3WB(global ulong2* it, int offset, ulong2 value)
+{
+ __builtin_IB_lsc_store_global_ulong2(it, offset, value, LSC_STCC_L1WB_L3WB);
+}
+
+// ulong3
+void store_ulong3_L1UC_L3UC(global ulong3* it, int offset, ulong3 value)
+{
+ __builtin_IB_lsc_store_global_ulong3(it, offset, value, LSC_STCC_L1UC_L3UC);
+}
+
+void store_ulong3_L1UC_L3WB(global ulong3* it, int offset, ulong3 value)
+{
+ __builtin_IB_lsc_store_global_ulong3(it, offset, value, LSC_STCC_L1UC_L3WB);
+}
+
+void store_ulong3_L1WT_L3UC(global ulong3* it, int offset, ulong3 value)
+{
+ __builtin_IB_lsc_store_global_ulong3(it, offset, value, LSC_STCC_L1WT_L3UC);
+}
+
+void store_ulong3_L1WT_L3WB(global ulong3* it, int offset, ulong3 value)
+{
+ __builtin_IB_lsc_store_global_ulong3(it, offset, value, LSC_STCC_L1WT_L3WB);
+}
+
+void store_ulong3_L1S_L3UC(global ulong3* it, int offset, ulong3 value)
+{
+ __builtin_IB_lsc_store_global_ulong3(it, offset, value, LSC_STCC_L1S_L3UC);
+}
+
+void store_ulong3_L1S_L3WB(global ulong3* it, int offset, ulong3 value)
+{
+ __builtin_IB_lsc_store_global_ulong3(it, offset, value, LSC_STCC_L1S_L3WB);
+}
+
+void store_ulong3_L1WB_L3WB(global ulong3* it, int offset, ulong3 value)
+{
+ __builtin_IB_lsc_store_global_ulong3(it, offset, value, LSC_STCC_L1WB_L3WB);
+}
+
+// ulong4
+void store_ulong4_L1UC_L3UC(global ulong4* it, int offset, ulong4 value)
+{
+ __builtin_IB_lsc_store_global_ulong4(it, offset, value, LSC_STCC_L1UC_L3UC);
+}
+
+void store_ulong4_L1UC_L3WB(global ulong4* it, int offset, ulong4 value)
+{
+ __builtin_IB_lsc_store_global_ulong4(it, offset, value, LSC_STCC_L1UC_L3WB);
+}
+
+void store_ulong4_L1WT_L3UC(global ulong4* it, int offset, ulong4 value)
+{
+ __builtin_IB_lsc_store_global_ulong4(it, offset, value, LSC_STCC_L1WT_L3UC);
+}
+
+void store_ulong4_L1WT_L3WB(global ulong4* it, int offset, ulong4 value)
+{
+ __builtin_IB_lsc_store_global_ulong4(it, offset, value, LSC_STCC_L1WT_L3WB);
+}
+
+void store_ulong4_L1S_L3UC(global ulong4* it, int offset, ulong4 value)
+{
+ __builtin_IB_lsc_store_global_ulong4(it, offset, value, LSC_STCC_L1S_L3UC);
+}
+
+void store_ulong4_L1S_L3WB(global ulong4* it, int offset, ulong4 value)
+{
+ __builtin_IB_lsc_store_global_ulong4(it, offset, value, LSC_STCC_L1S_L3WB);
+}
+
+void store_ulong4_L1WB_L3WB(global ulong4* it, int offset, ulong4 value)
+{
+ __builtin_IB_lsc_store_global_ulong4(it, offset, value, LSC_STCC_L1WB_L3WB);
+}
+
+// ulong8
+void store_ulong8_L1UC_L3UC(global ulong8* it, int offset, ulong8 value)
+{
+ __builtin_IB_lsc_store_global_ulong8(it, offset, value, LSC_STCC_L1UC_L3UC);
+}
+
+void store_ulong8_L1UC_L3WB(global ulong8* it, int offset, ulong8 value)
+{
+ __builtin_IB_lsc_store_global_ulong8(it, offset, value, LSC_STCC_L1UC_L3WB);
+}
+
+void store_ulong8_L1WT_L3UC(global ulong8* it, int offset, ulong8 value)
+{
+ __builtin_IB_lsc_store_global_ulong8(it, offset, value, LSC_STCC_L1WT_L3UC);
+}
+
+void store_ulong8_L1WT_L3WB(global ulong8* it, int offset, ulong8 value)
+{
+ __builtin_IB_lsc_store_global_ulong8(it, offset, value, LSC_STCC_L1WT_L3WB);
+}
+
+void store_ulong8_L1S_L3UC(global ulong8* it, int offset, ulong8 value)
+{
+ __builtin_IB_lsc_store_global_ulong8(it, offset, value, LSC_STCC_L1S_L3UC);
+}
+
+void store_ulong8_L1S_L3WB(global ulong8* it, int offset, ulong8 value)
+{
+ __builtin_IB_lsc_store_global_ulong8(it, offset, value, LSC_STCC_L1S_L3WB);
+}
+
+void store_ulong8_L1WB_L3WB(global ulong8* it, int offset, ulong8 value)
+{
+ __builtin_IB_lsc_store_global_ulong8(it, offset, value, LSC_STCC_L1WB_L3WB);
+}
+
+// LSC Fence support
+void mem_fence_gpu_default()
+{
+ __builtin_IB_lsc_fence_global_untyped(LSC_FS_TILE, LSC_FT_DEFAULT);
+}
+
+void mem_fence_workgroup_default()
+{
+ __builtin_IB_lsc_fence_global_untyped(LSC_FS_THREAD_GROUP, LSC_FT_DEFAULT);
+}
+
+void mem_fence_gpu_invalidate()
+{
+ // NOTE: 'FS_TILE' is used here to avoid a DG2 HW bug where the L3 is needlessly flushed on a 'GPU'-scope fence
+ __builtin_IB_lsc_fence_global_untyped(LSC_FS_TILE, LSC_FT_INVALIDATE);
+}
+
+void mem_fence_gpu_evict()
+{
+ __builtin_IB_lsc_fence_global_untyped(LSC_FS_TILE, LSC_FT_EVICT);
+}
+
+void mem_fence_evict_to_memory()
+{
+ __builtin_IB_lsc_fence_global_untyped(LSC_FS_GPU, LSC_FT_EVICT);
+ __builtin_IB_lsc_fence_global_untyped(LSC_FS_GPU, LSC_FT_L3);
+}
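+
+// Illustrative usage (assumption, not part of this patch): a writer that needs its
+// results to be visible past the GPU caches would typically pair a cached store with
+// an evicting fence, e.g.:
+//   store_uint_L1WB_L3WB(out, 0, value);
+//   mem_fence_evict_to_memory();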
diff --git a/src/intel/vulkan/grl/gpu/libs/lsc_intrinsics.h b/src/intel/vulkan/grl/gpu/libs/lsc_intrinsics.h
new file mode 100644
index 00000000000..a12dac00e77
--- /dev/null
+++ b/src/intel/vulkan/grl/gpu/libs/lsc_intrinsics.h
@@ -0,0 +1,207 @@
+//
+// Copyright (C) 2009-2021 Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+//
+//
+
+// LSC Loads
+uint load_uchar_to_uint_L1UC_L3UC(global uchar* it, int offset);
+uint load_uchar_to_uint_L1UC_L3C(global uchar* it, int offset);
+uint load_uchar_to_uint_L1C_L3UC(global uchar* it, int offset);
+uint load_uchar_to_uint_L1C_L3C(global uchar* it, int offset);
+uint load_uchar_to_uint_L1S_L3UC(global uchar* it, int offset);
+uint load_uchar_to_uint_L1S_L3C(global uchar* it, int offset);
+uint load_uchar_to_uint_L1IAR_L3C(global uchar* it, int offset);
+
+uint load_ushort_to_uint_L1UC_L3UC(global ushort* it, int offset);
+uint load_ushort_to_uint_L1UC_L3C(global ushort* it, int offset);
+uint load_ushort_to_uint_L1C_L3UC(global ushort* it, int offset);
+uint load_ushort_to_uint_L1C_L3C(global ushort* it, int offset);
+uint load_ushort_to_uint_L1S_L3UC(global ushort* it, int offset);
+uint load_ushort_to_uint_L1S_L3C(global ushort* it, int offset);
+uint load_ushort_to_uint_L1IAR_L3C(global ushort* it, int offset);
+
+uint load_uint_L1UC_L3UC(global uint* it, int offset);
+uint load_uint_L1UC_L3C(global uint* it, int offset);
+uint load_uint_L1C_L3UC(global uint* it, int offset);
+uint load_uint_L1C_L3C(global uint* it, int offset);
+uint load_uint_L1S_L3UC(global uint* it, int offset);
+uint load_uint_L1S_L3C(global uint* it, int offset);
+uint load_uint_L1IAR_L3C(global uint* it, int offset);
+
+uint2 load_uint2_L1UC_L3UC(global uint2* it, int offset);
+uint2 load_uint2_L1UC_L3C(global uint2* it, int offset);
+uint2 load_uint2_L1C_L3UC(global uint2* it, int offset);
+uint2 load_uint2_L1C_L3C(global uint2* it, int offset);
+uint2 load_uint2_L1S_L3UC(global uint2* it, int offset);
+uint2 load_uint2_L1S_L3C(global uint2* it, int offset);
+uint2 load_uint2_L1IAR_L3C(global uint2* it, int offset);
+
+uint3 load_uint3_L1UC_L3UC(global uint3* it, int offset);
+uint3 load_uint3_L1UC_L3C(global uint3* it, int offset);
+uint3 load_uint3_L1C_L3UC(global uint3* it, int offset);
+uint3 load_uint3_L1C_L3C(global uint3* it, int offset);
+uint3 load_uint3_L1S_L3UC(global uint3* it, int offset);
+uint3 load_uint3_L1S_L3C(global uint3* it, int offset);
+uint3 load_uint3_L1IAR_L3C(global uint3* it, int offset);
+
+uint4 load_uint4_L1UC_L3UC(global uint4* it, int offset);
+uint4 load_uint4_L1UC_L3C(global uint4* it, int offset);
+uint4 load_uint4_L1C_L3UC(global uint4* it, int offset);
+uint4 load_uint4_L1C_L3C(global uint4* it, int offset);
+uint4 load_uint4_L1S_L3UC(global uint4* it, int offset);
+uint4 load_uint4_L1S_L3C(global uint4* it, int offset);
+uint4 load_uint4_L1IAR_L3C(global uint4* it, int offset);
+
+uint8 load_uint8_L1UC_L3UC(global uint8* it, int offset);
+uint8 load_uint8_L1UC_L3C(global uint8* it, int offset);
+uint8 load_uint8_L1C_L3UC(global uint8* it, int offset);
+uint8 load_uint8_L1C_L3C(global uint8* it, int offset);
+uint8 load_uint8_L1S_L3UC(global uint8* it, int offset);
+uint8 load_uint8_L1S_L3C(global uint8* it, int offset);
+uint8 load_uint8_L1IAR_L3C(global uint8* it, int offset);
+
+ulong load_ulong_L1UC_L3UC(global ulong* it, int offset);
+ulong load_ulong_L1UC_L3C(global ulong* it, int offset);
+ulong load_ulong_L1C_L3UC(global ulong* it, int offset);
+ulong load_ulong_L1C_L3C(global ulong* it, int offset);
+ulong load_ulong_L1S_L3UC(global ulong* it, int offset);
+ulong load_ulong_L1S_L3C(global ulong* it, int offset);
+ulong load_ulong_L1IAR_L3C(global ulong* it, int offset);
+
+ulong2 load_ulong2_L1UC_L3UC(global ulong2* it, int offset);
+ulong2 load_ulong2_L1UC_L3C(global ulong2* it, int offset);
+ulong2 load_ulong2_L1C_L3UC(global ulong2* it, int offset);
+ulong2 load_ulong2_L1C_L3C(global ulong2* it, int offset);
+ulong2 load_ulong2_L1S_L3UC(global ulong2* it, int offset);
+ulong2 load_ulong2_L1S_L3C(global ulong2* it, int offset);
+ulong2 load_ulong2_L1IAR_L3C(global ulong2* it, int offset);
+
+ulong3 load_ulong3_L1UC_L3UC(global ulong3* it, int offset);
+ulong3 load_ulong3_L1UC_L3C(global ulong3* it, int offset);
+ulong3 load_ulong3_L1C_L3UC(global ulong3* it, int offset);
+ulong3 load_ulong3_L1C_L3C(global ulong3* it, int offset);
+ulong3 load_ulong3_L1S_L3UC(global ulong3* it, int offset);
+ulong3 load_ulong3_L1S_L3C(global ulong3* it, int offset);
+ulong3 load_ulong3_L1IAR_L3C(global ulong3* it, int offset);
+
+ulong4 load_ulong4_L1UC_L3UC(global ulong4* it, int offset);
+ulong4 load_ulong4_L1UC_L3C(global ulong4* it, int offset);
+ulong4 load_ulong4_L1C_L3UC(global ulong4* it, int offset);
+ulong4 load_ulong4_L1C_L3C(global ulong4* it, int offset);
+ulong4 load_ulong4_L1S_L3UC(global ulong4* it, int offset);
+ulong4 load_ulong4_L1S_L3C(global ulong4* it, int offset);
+ulong4 load_ulong4_L1IAR_L3C(global ulong4* it, int offset);
+
+ulong8 load_ulong8_L1UC_L3UC(global ulong8* it, int offset);
+ulong8 load_ulong8_L1UC_L3C(global ulong8* it, int offset);
+ulong8 load_ulong8_L1C_L3UC(global ulong8* it, int offset);
+ulong8 load_ulong8_L1C_L3C(global ulong8* it, int offset);
+ulong8 load_ulong8_L1S_L3UC(global ulong8* it, int offset);
+ulong8 load_ulong8_L1S_L3C(global ulong8* it, int offset);
+ulong8 load_ulong8_L1IAR_L3C(global ulong8* it, int offset);
+
+// LSC Stores
+void store_uchar_from_uint_L1UC_L3UC(global uchar* it, int offset, uint value);
+void store_uchar_from_uint_L1UC_L3WB(global uchar* it, int offset, uint value);
+void store_uchar_from_uint_L1WT_L3UC(global uchar* it, int offset, uint value);
+void store_uchar_from_uint_L1WT_L3WB(global uchar* it, int offset, uint value);
+void store_uchar_from_uint_L1S_L3UC(global uchar* it, int offset, uint value);
+void store_uchar_from_uint_L1S_L3WB(global uchar* it, int offset, uint value);
+void store_uchar_from_uint_L1WB_L3WB(global uchar* it, int offset, uint value);
+
+void store_ushort_from_uint_L1UC_L3UC(global ushort* it, int offset, uint value);
+void store_ushort_from_uint_L1UC_L3WB(global ushort* it, int offset, uint value);
+void store_ushort_from_uint_L1WT_L3UC(global ushort* it, int offset, uint value);
+void store_ushort_from_uint_L1WT_L3WB(global ushort* it, int offset, uint value);
+void store_ushort_from_uint_L1S_L3UC(global ushort* it, int offset, uint value);
+void store_ushort_from_uint_L1S_L3WB(global ushort* it, int offset, uint value);
+void store_ushort_from_uint_L1WB_L3WB(global ushort* it, int offset, uint value);
+
+void store_uint_L1UC_L3UC(global uint* it, int offset, uint value);
+void store_uint_L1UC_L3WB(global uint* it, int offset, uint value);
+void store_uint_L1WT_L3UC(global uint* it, int offset, uint value);
+void store_uint_L1WT_L3WB(global uint* it, int offset, uint value);
+void store_uint_L1S_L3UC(global uint* it, int offset, uint value);
+void store_uint_L1S_L3WB(global uint* it, int offset, uint value);
+void store_uint_L1WB_L3WB(global uint* it, int offset, uint value);
+
+void store_uint2_L1UC_L3UC(global uint2* it, int offset, uint2 value);
+void store_uint2_L1UC_L3WB(global uint2* it, int offset, uint2 value);
+void store_uint2_L1WT_L3UC(global uint2* it, int offset, uint2 value);
+void store_uint2_L1WT_L3WB(global uint2* it, int offset, uint2 value);
+void store_uint2_L1S_L3UC(global uint2* it, int offset, uint2 value);
+void store_uint2_L1S_L3WB(global uint2* it, int offset, uint2 value);
+void store_uint2_L1WB_L3WB(global uint2* it, int offset, uint2 value);
+
+void store_uint3_L1UC_L3UC(global uint3* it, int offset, uint3 value);
+void store_uint3_L1UC_L3WB(global uint3* it, int offset, uint3 value);
+void store_uint3_L1WT_L3UC(global uint3* it, int offset, uint3 value);
+void store_uint3_L1WT_L3WB(global uint3* it, int offset, uint3 value);
+void store_uint3_L1S_L3UC(global uint3* it, int offset, uint3 value);
+void store_uint3_L1S_L3WB(global uint3* it, int offset, uint3 value);
+void store_uint3_L1WB_L3WB(global uint3* it, int offset, uint3 value);
+
+void store_uint4_L1UC_L3UC(global uint4* it, int offset, uint4 value);
+void store_uint4_L1UC_L3WB(global uint4* it, int offset, uint4 value);
+void store_uint4_L1WT_L3UC(global uint4* it, int offset, uint4 value);
+void store_uint4_L1WT_L3WB(global uint4* it, int offset, uint4 value);
+void store_uint4_L1S_L3UC(global uint4* it, int offset, uint4 value);
+void store_uint4_L1S_L3WB(global uint4* it, int offset, uint4 value);
+void store_uint4_L1WB_L3WB(global uint4* it, int offset, uint4 value);
+
+void store_uint8_L1UC_L3UC(global uint8* it, int offset, uint8 value);
+void store_uint8_L1UC_L3WB(global uint8* it, int offset, uint8 value);
+void store_uint8_L1WT_L3UC(global uint8* it, int offset, uint8 value);
+void store_uint8_L1WT_L3WB(global uint8* it, int offset, uint8 value);
+void store_uint8_L1S_L3UC(global uint8* it, int offset, uint8 value);
+void store_uint8_L1S_L3WB(global uint8* it, int offset, uint8 value);
+void store_uint8_L1WB_L3WB(global uint8* it, int offset, uint8 value);
+
+void store_ulong_L1UC_L3UC(global ulong* it, int offset, ulong value);
+void store_ulong_L1UC_L3WB(global ulong* it, int offset, ulong value);
+void store_ulong_L1WT_L3UC(global ulong* it, int offset, ulong value);
+void store_ulong_L1WT_L3WB(global ulong* it, int offset, ulong value);
+void store_ulong_L1S_L3UC(global ulong* it, int offset, ulong value);
+void store_ulong_L1S_L3WB(global ulong* it, int offset, ulong value);
+void store_ulong_L1WB_L3WB(global ulong* it, int offset, ulong value);
+
+void store_ulong2_L1UC_L3UC(global ulong2* it, int offset, ulong2 value);
+void store_ulong2_L1UC_L3WB(global ulong2* it, int offset, ulong2 value);
+void store_ulong2_L1WT_L3UC(global ulong2* it, int offset, ulong2 value);
+void store_ulong2_L1WT_L3WB(global ulong2* it, int offset, ulong2 value);
+void store_ulong2_L1S_L3UC(global ulong2* it, int offset, ulong2 value);
+void store_ulong2_L1S_L3WB(global ulong2* it, int offset, ulong2 value);
+void store_ulong2_L1WB_L3WB(global ulong2* it, int offset, ulong2 value);
+
+void store_ulong3_L1UC_L3UC(global ulong3* it, int offset, ulong3 value);
+void store_ulong3_L1UC_L3WB(global ulong3* it, int offset, ulong3 value);
+void store_ulong3_L1WT_L3UC(global ulong3* it, int offset, ulong3 value);
+void store_ulong3_L1WT_L3WB(global ulong3* it, int offset, ulong3 value);
+void store_ulong3_L1S_L3UC(global ulong3* it, int offset, ulong3 value);
+void store_ulong3_L1S_L3WB(global ulong3* it, int offset, ulong3 value);
+void store_ulong3_L1WB_L3WB(global ulong3* it, int offset, ulong3 value);
+
+void store_ulong4_L1UC_L3UC(global ulong4* it, int offset, ulong4 value);
+void store_ulong4_L1UC_L3WB(global ulong4* it, int offset, ulong4 value);
+void store_ulong4_L1WT_L3UC(global ulong4* it, int offset, ulong4 value);
+void store_ulong4_L1WT_L3WB(global ulong4* it, int offset, ulong4 value);
+void store_ulong4_L1S_L3UC(global ulong4* it, int offset, ulong4 value);
+void store_ulong4_L1S_L3WB(global ulong4* it, int offset, ulong4 value);
+void store_ulong4_L1WB_L3WB(global ulong4* it, int offset, ulong4 value);
+
+void store_ulong8_L1UC_L3UC(global ulong8* it, int offset, ulong8 value);
+void store_ulong8_L1UC_L3WB(global ulong8* it, int offset, ulong8 value);
+void store_ulong8_L1WT_L3UC(global ulong8* it, int offset, ulong8 value);
+void store_ulong8_L1WT_L3WB(global ulong8* it, int offset, ulong8 value);
+void store_ulong8_L1S_L3UC(global ulong8* it, int offset, ulong8 value);
+void store_ulong8_L1S_L3WB(global ulong8* it, int offset, ulong8 value);
+void store_ulong8_L1WB_L3WB(global ulong8* it, int offset, ulong8 value);
+
+// LSC Fence support
+void mem_fence_gpu_default();
+void mem_fence_workgroup_default();
+void mem_fence_gpu_invalidate();
+void mem_fence_gpu_evict();
+void mem_fence_evict_to_memory();
diff --git a/src/intel/vulkan/grl/gpu/libs/lsc_intrinsics_fallback.cl b/src/intel/vulkan/grl/gpu/libs/lsc_intrinsics_fallback.cl
new file mode 100644
index 00000000000..2217618c7c5
--- /dev/null
+++ b/src/intel/vulkan/grl/gpu/libs/lsc_intrinsics_fallback.cl
@@ -0,0 +1,898 @@
+//
+// Copyright (C) 2009-2021 Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+//
+//
+
+// LSC Loads
+// uchar
+uint load_uchar_to_uint_L1UC_L3UC(global uchar* it, int offset)
+{
+ return (uint)(it[offset]);
+}
+
+uint load_uchar_to_uint_L1UC_L3C(global uchar* it, int offset)
+{
+ return (uint)(it[offset]);
+}
+
+uint load_uchar_to_uint_L1C_L3UC(global uchar* it, int offset)
+{
+ return (uint)(it[offset]);
+}
+
+uint load_uchar_to_uint_L1C_L3C(global uchar* it, int offset)
+{
+ return (uint)(it[offset]);
+}
+
+uint load_uchar_to_uint_L1S_L3UC(global uchar* it, int offset)
+{
+ return (uint)(it[offset]);
+}
+
+uint load_uchar_to_uint_L1S_L3C(global uchar* it, int offset)
+{
+ return (uint)(it[offset]);
+}
+
+uint load_uchar_to_uint_L1IAR_L3C(global uchar* it, int offset)
+{
+ return (uint)(it[offset]);
+}
+
+// ushort
+uint load_ushort_to_uint_L1UC_L3UC(global ushort* it, int offset)
+{
+ return (uint)(it[offset]);
+}
+
+uint load_ushort_to_uint_L1UC_L3C(global ushort* it, int offset)
+{
+ return (uint)(it[offset]);
+}
+
+uint load_ushort_to_uint_L1C_L3UC(global ushort* it, int offset)
+{
+ return (uint)(it[offset]);
+}
+
+uint load_ushort_to_uint_L1C_L3C(global ushort* it, int offset)
+{
+ return (uint)(it[offset]);
+}
+
+uint load_ushort_to_uint_L1S_L3UC(global ushort* it, int offset)
+{
+ return (uint)(it[offset]);
+}
+
+uint load_ushort_to_uint_L1S_L3C(global ushort* it, int offset)
+{
+ return (uint)(it[offset]);
+}
+
+uint load_ushort_to_uint_L1IAR_L3C(global ushort* it, int offset)
+{
+ return (uint)(it[offset]);
+}
+
+// uint
+uint load_uint_L1UC_L3UC(global uint* it, int offset)
+{
+ return it[offset];
+}
+
+uint load_uint_L1UC_L3C(global uint* it, int offset)
+{
+ return it[offset];
+}
+
+uint load_uint_L1C_L3UC(global uint* it, int offset)
+{
+ return it[offset];
+}
+
+uint load_uint_L1C_L3C(global uint* it, int offset)
+{
+ return it[offset];
+}
+
+uint load_uint_L1S_L3UC(global uint* it, int offset)
+{
+ return it[offset];
+}
+
+uint load_uint_L1S_L3C(global uint* it, int offset)
+{
+ return it[offset];
+}
+
+uint load_uint_L1IAR_L3C(global uint* it, int offset)
+{
+ return it[offset];
+}
+
+// uint2
+uint2 load_uint2_L1UC_L3UC(global uint2* it, int offset)
+{
+ return it[offset];
+}
+
+uint2 load_uint2_L1UC_L3C(global uint2* it, int offset)
+{
+ return it[offset];
+}
+
+uint2 load_uint2_L1C_L3UC(global uint2* it, int offset)
+{
+ return it[offset];
+}
+
+uint2 load_uint2_L1C_L3C(global uint2* it, int offset)
+{
+ return it[offset];
+}
+
+uint2 load_uint2_L1S_L3UC(global uint2* it, int offset)
+{
+ return it[offset];
+}
+
+uint2 load_uint2_L1S_L3C(global uint2* it, int offset)
+{
+ return it[offset];
+}
+
+uint2 load_uint2_L1IAR_L3C(global uint2* it, int offset)
+{
+ return it[offset];
+}
+
+// uint3
+uint3 load_uint3_L1UC_L3UC(global uint3* it, int offset)
+{
+ return it[offset];
+}
+
+uint3 load_uint3_L1UC_L3C(global uint3* it, int offset)
+{
+ return it[offset];
+}
+
+uint3 load_uint3_L1C_L3UC(global uint3* it, int offset)
+{
+ return it[offset];
+}
+
+uint3 load_uint3_L1C_L3C(global uint3* it, int offset)
+{
+ return it[offset];
+}
+
+uint3 load_uint3_L1S_L3UC(global uint3* it, int offset)
+{
+ return it[offset];
+}
+
+uint3 load_uint3_L1S_L3C(global uint3* it, int offset)
+{
+ return it[offset];
+}
+
+uint3 load_uint3_L1IAR_L3C(global uint3* it, int offset)
+{
+ return it[offset];
+}
+
+// uint4
+uint4 load_uint4_L1UC_L3UC(global uint4* it, int offset)
+{
+ return it[offset];
+}
+
+uint4 load_uint4_L1UC_L3C(global uint4* it, int offset)
+{
+ return it[offset];
+}
+
+uint4 load_uint4_L1C_L3UC(global uint4* it, int offset)
+{
+ return it[offset];
+}
+
+uint4 load_uint4_L1C_L3C(global uint4* it, int offset)
+{
+ return it[offset];
+}
+
+uint4 load_uint4_L1S_L3UC(global uint4* it, int offset)
+{
+ return it[offset];
+}
+
+uint4 load_uint4_L1S_L3C(global uint4* it, int offset)
+{
+ return it[offset];
+}
+
+uint4 load_uint4_L1IAR_L3C(global uint4* it, int offset)
+{
+ return it[offset];
+}
+
+// uint8
+uint8 load_uint8_L1UC_L3UC(global uint8* it, int offset)
+{
+ return it[offset];
+}
+
+uint8 load_uint8_L1UC_L3C(global uint8* it, int offset)
+{
+ return it[offset];
+}
+
+uint8 load_uint8_L1C_L3UC(global uint8* it, int offset)
+{
+ return it[offset];
+}
+
+uint8 load_uint8_L1C_L3C(global uint8* it, int offset)
+{
+ return it[offset];
+}
+
+uint8 load_uint8_L1S_L3UC(global uint8* it, int offset)
+{
+ return it[offset];
+}
+
+uint8 load_uint8_L1S_L3C(global uint8* it, int offset)
+{
+ return it[offset];
+}
+
+uint8 load_uint8_L1IAR_L3C(global uint8* it, int offset)
+{
+ return it[offset];
+}
+
+// ulong
+ulong load_ulong_L1UC_L3UC(global ulong* it, int offset)
+{
+ return it[offset];
+}
+
+ulong load_ulong_L1UC_L3C(global ulong* it, int offset)
+{
+ return it[offset];
+}
+
+ulong load_ulong_L1C_L3UC(global ulong* it, int offset)
+{
+ return it[offset];
+}
+
+ulong load_ulong_L1C_L3C(global ulong* it, int offset)
+{
+ return it[offset];
+}
+
+ulong load_ulong_L1S_L3UC(global ulong* it, int offset)
+{
+ return it[offset];
+}
+
+ulong load_ulong_L1S_L3C(global ulong* it, int offset)
+{
+ return it[offset];
+}
+
+ulong load_ulong_L1IAR_L3C(global ulong* it, int offset)
+{
+ return it[offset];
+}
+
+// ulong2
+ulong2 load_ulong2_L1UC_L3UC(global ulong2* it, int offset)
+{
+ return it[offset];
+}
+
+ulong2 load_ulong2_L1UC_L3C(global ulong2* it, int offset)
+{
+ return it[offset];
+}
+
+ulong2 load_ulong2_L1C_L3UC(global ulong2* it, int offset)
+{
+ return it[offset];
+}
+
+ulong2 load_ulong2_L1C_L3C(global ulong2* it, int offset)
+{
+ return it[offset];
+}
+
+ulong2 load_ulong2_L1S_L3UC(global ulong2* it, int offset)
+{
+ return it[offset];
+}
+
+ulong2 load_ulong2_L1S_L3C(global ulong2* it, int offset)
+{
+ return it[offset];
+}
+
+ulong2 load_ulong2_L1IAR_L3C(global ulong2* it, int offset)
+{
+ return it[offset];
+}
+
+// ulong3
+ulong3 load_ulong3_L1UC_L3UC(global ulong3* it, int offset)
+{
+ return it[offset];
+}
+
+ulong3 load_ulong3_L1UC_L3C(global ulong3* it, int offset)
+{
+ return it[offset];
+}
+
+ulong3 load_ulong3_L1C_L3UC(global ulong3* it, int offset)
+{
+ return it[offset];
+}
+
+ulong3 load_ulong3_L1C_L3C(global ulong3* it, int offset)
+{
+ return it[offset];
+}
+
+ulong3 load_ulong3_L1S_L3UC(global ulong3* it, int offset)
+{
+ return it[offset];
+}
+
+ulong3 load_ulong3_L1S_L3C(global ulong3* it, int offset)
+{
+ return it[offset];
+}
+
+ulong3 load_ulong3_L1IAR_L3C(global ulong3* it, int offset)
+{
+ return it[offset];
+}
+
+// ulong4
+ulong4 load_ulong4_L1UC_L3UC(global ulong4* it, int offset)
+{
+ return it[offset];
+}
+
+ulong4 load_ulong4_L1UC_L3C(global ulong4* it, int offset)
+{
+ return it[offset];
+}
+
+ulong4 load_ulong4_L1C_L3UC(global ulong4* it, int offset)
+{
+ return it[offset];
+}
+
+ulong4 load_ulong4_L1C_L3C(global ulong4* it, int offset)
+{
+ return it[offset];
+}
+
+ulong4 load_ulong4_L1S_L3UC(global ulong4* it, int offset)
+{
+ return it[offset];
+}
+
+ulong4 load_ulong4_L1S_L3C(global ulong4* it, int offset)
+{
+ return it[offset];
+}
+
+ulong4 load_ulong4_L1IAR_L3C(global ulong4* it, int offset)
+{
+ return it[offset];
+}
+
+// ulong8
+ulong8 load_ulong8_L1UC_L3UC(global ulong8* it, int offset)
+{
+ return it[offset];
+}
+
+ulong8 load_ulong8_L1UC_L3C(global ulong8* it, int offset)
+{
+ return it[offset];
+}
+
+ulong8 load_ulong8_L1C_L3UC(global ulong8* it, int offset)
+{
+ return it[offset];
+}
+
+ulong8 load_ulong8_L1C_L3C(global ulong8* it, int offset)
+{
+ return it[offset];
+}
+
+ulong8 load_ulong8_L1S_L3UC(global ulong8* it, int offset)
+{
+ return it[offset];
+}
+
+ulong8 load_ulong8_L1S_L3C(global ulong8* it, int offset)
+{
+ return it[offset];
+}
+
+ulong8 load_ulong8_L1IAR_L3C(global ulong8* it, int offset)
+{
+ return it[offset];
+}
+
+// LSC Stores
+// uchar
+void store_uchar_from_uint_L1UC_L3UC(global uchar* it, int offset, uint value)
+{
+ it[offset] = (uchar)(value);
+}
+
+void store_uchar_from_uint_L1UC_L3WB(global uchar* it, int offset, uint value)
+{
+ it[offset] = (uchar)(value);
+}
+
+void store_uchar_from_uint_L1WT_L3UC(global uchar* it, int offset, uint value)
+{
+ it[offset] = (uchar)(value);
+}
+
+void store_uchar_from_uint_L1WT_L3WB(global uchar* it, int offset, uint value)
+{
+ it[offset] = (uchar)(value);
+}
+
+void store_uchar_from_uint_L1S_L3UC(global uchar* it, int offset, uint value)
+{
+ it[offset] = (uchar)(value);
+}
+
+void store_uchar_from_uint_L1S_L3WB(global uchar* it, int offset, uint value)
+{
+ it[offset] = (uchar)(value);
+}
+
+void store_uchar_from_uint_L1WB_L3WB(global uchar* it, int offset, uint value)
+{
+ it[offset] = (uchar)(value);
+}
+
+// ushort
+void store_ushort_from_uint_L1UC_L3UC(global ushort* it, int offset, uint value)
+{
+ it[offset] = (ushort)(value);
+}
+
+void store_ushort_from_uint_L1UC_L3WB(global ushort* it, int offset, uint value)
+{
+ it[offset] = (ushort)(value);
+}
+
+void store_ushort_from_uint_L1WT_L3UC(global ushort* it, int offset, uint value)
+{
+ it[offset] = (ushort)(value);
+}
+
+void store_ushort_from_uint_L1WT_L3WB(global ushort* it, int offset, uint value)
+{
+ it[offset] = (ushort)(value);
+}
+
+void store_ushort_from_uint_L1S_L3UC(global ushort* it, int offset, uint value)
+{
+ it[offset] = (ushort)(value);
+}
+
+void store_ushort_from_uint_L1S_L3WB(global ushort* it, int offset, uint value)
+{
+ it[offset] = (ushort)(value);
+}
+
+void store_ushort_from_uint_L1WB_L3WB(global ushort* it, int offset, uint value)
+{
+ it[offset] = (ushort)(value);
+}
+
+// uint
+void store_uint_L1UC_L3UC(global uint* it, int offset, uint value)
+{
+ it[offset] = value;
+}
+
+void store_uint_L1UC_L3WB(global uint* it, int offset, uint value)
+{
+ it[offset] = value;
+}
+
+void store_uint_L1WT_L3UC(global uint* it, int offset, uint value)
+{
+ it[offset] = value;
+}
+
+void store_uint_L1WT_L3WB(global uint* it, int offset, uint value)
+{
+ it[offset] = value;
+}
+
+void store_uint_L1S_L3UC(global uint* it, int offset, uint value)
+{
+ it[offset] = value;
+}
+
+void store_uint_L1S_L3WB(global uint* it, int offset, uint value)
+{
+ it[offset] = value;
+}
+
+void store_uint_L1WB_L3WB(global uint* it, int offset, uint value)
+{
+ it[offset] = value;
+}
+
+// uint2
+void store_uint2_L1UC_L3UC(global uint2* it, int offset, uint2 value)
+{
+ it[offset] = value;
+}
+
+void store_uint2_L1UC_L3WB(global uint2* it, int offset, uint2 value)
+{
+ it[offset] = value;
+}
+
+void store_uint2_L1WT_L3UC(global uint2* it, int offset, uint2 value)
+{
+ it[offset] = value;
+}
+
+void store_uint2_L1WT_L3WB(global uint2* it, int offset, uint2 value)
+{
+ it[offset] = value;
+}
+
+void store_uint2_L1S_L3UC(global uint2* it, int offset, uint2 value)
+{
+ it[offset] = value;
+}
+
+void store_uint2_L1S_L3WB(global uint2* it, int offset, uint2 value)
+{
+ it[offset] = value;
+}
+
+void store_uint2_L1WB_L3WB(global uint2* it, int offset, uint2 value)
+{
+ it[offset] = value;
+}
+
+// uint3
+void store_uint3_L1UC_L3UC(global uint3* it, int offset, uint3 value)
+{
+ it[offset] = value;
+}
+
+void store_uint3_L1UC_L3WB(global uint3* it, int offset, uint3 value)
+{
+ it[offset] = value;
+}
+
+void store_uint3_L1WT_L3UC(global uint3* it, int offset, uint3 value)
+{
+ it[offset] = value;
+}
+
+void store_uint3_L1WT_L3WB(global uint3* it, int offset, uint3 value)
+{
+ it[offset] = value;
+}
+
+void store_uint3_L1S_L3UC(global uint3* it, int offset, uint3 value)
+{
+ it[offset] = value;
+}
+
+void store_uint3_L1S_L3WB(global uint3* it, int offset, uint3 value)
+{
+ it[offset] = value;
+}
+
+void store_uint3_L1WB_L3WB(global uint3* it, int offset, uint3 value)
+{
+ it[offset] = value;
+}
+
+// uint4
+void store_uint4_L1UC_L3UC(global uint4* it, int offset, uint4 value)
+{
+ it[offset] = value;
+}
+
+void store_uint4_L1UC_L3WB(global uint4* it, int offset, uint4 value)
+{
+ it[offset] = value;
+}
+
+void store_uint4_L1WT_L3UC(global uint4* it, int offset, uint4 value)
+{
+ it[offset] = value;
+}
+
+void store_uint4_L1WT_L3WB(global uint4* it, int offset, uint4 value)
+{
+ it[offset] = value;
+}
+
+void store_uint4_L1S_L3UC(global uint4* it, int offset, uint4 value)
+{
+ it[offset] = value;
+}
+
+void store_uint4_L1S_L3WB(global uint4* it, int offset, uint4 value)
+{
+ it[offset] = value;
+}
+
+void store_uint4_L1WB_L3WB(global uint4* it, int offset, uint4 value)
+{
+ it[offset] = value;
+}
+
+// uint8
+void store_uint8_L1UC_L3UC(global uint8* it, int offset, uint8 value)
+{
+ it[offset] = value;
+}
+
+void store_uint8_L1UC_L3WB(global uint8* it, int offset, uint8 value)
+{
+ it[offset] = value;
+}
+
+void store_uint8_L1WT_L3UC(global uint8* it, int offset, uint8 value)
+{
+ it[offset] = value;
+}
+
+void store_uint8_L1WT_L3WB(global uint8* it, int offset, uint8 value)
+{
+ it[offset] = value;
+}
+
+void store_uint8_L1S_L3UC(global uint8* it, int offset, uint8 value)
+{
+ it[offset] = value;
+}
+
+void store_uint8_L1S_L3WB(global uint8* it, int offset, uint8 value)
+{
+ it[offset] = value;
+}
+
+void store_uint8_L1WB_L3WB(global uint8* it, int offset, uint8 value)
+{
+ it[offset] = value;
+}
+
+// ulong
+void store_ulong_L1UC_L3UC(global ulong* it, int offset, ulong value)
+{
+ it[offset] = value;
+}
+
+void store_ulong_L1UC_L3WB(global ulong* it, int offset, ulong value)
+{
+ it[offset] = value;
+}
+
+void store_ulong_L1WT_L3UC(global ulong* it, int offset, ulong value)
+{
+ it[offset] = value;
+}
+
+void store_ulong_L1WT_L3WB(global ulong* it, int offset, ulong value)
+{
+ it[offset] = value;
+}
+
+void store_ulong_L1S_L3UC(global ulong* it, int offset, ulong value)
+{
+ it[offset] = value;
+}
+
+void store_ulong_L1S_L3WB(global ulong* it, int offset, ulong value)
+{
+ it[offset] = value;
+}
+
+void store_ulong_L1WB_L3WB(global ulong* it, int offset, ulong value)
+{
+ it[offset] = value;
+}
+
+// ulong2
+void store_ulong2_L1UC_L3UC(global ulong2* it, int offset, ulong2 value)
+{
+ it[offset] = value;
+}
+
+void store_ulong2_L1UC_L3WB(global ulong2* it, int offset, ulong2 value)
+{
+ it[offset] = value;
+}
+
+void store_ulong2_L1WT_L3UC(global ulong2* it, int offset, ulong2 value)
+{
+ it[offset] = value;
+}
+
+void store_ulong2_L1WT_L3WB(global ulong2* it, int offset, ulong2 value)
+{
+ it[offset] = value;
+}
+
+void store_ulong2_L1S_L3UC(global ulong2* it, int offset, ulong2 value)
+{
+ it[offset] = value;
+}
+
+void store_ulong2_L1S_L3WB(global ulong2* it, int offset, ulong2 value)
+{
+ it[offset] = value;
+}
+
+void store_ulong2_L1WB_L3WB(global ulong2* it, int offset, ulong2 value)
+{
+ it[offset] = value;
+}
+
+// ulong3
+void store_ulong3_L1UC_L3UC(global ulong3* it, int offset, ulong3 value)
+{
+ it[offset] = value;
+}
+
+void store_ulong3_L1UC_L3WB(global ulong3* it, int offset, ulong3 value)
+{
+ it[offset] = value;
+}
+
+void store_ulong3_L1WT_L3UC(global ulong3* it, int offset, ulong3 value)
+{
+ it[offset] = value;
+}
+
+void store_ulong3_L1WT_L3WB(global ulong3* it, int offset, ulong3 value)
+{
+ it[offset] = value;
+}
+
+void store_ulong3_L1S_L3UC(global ulong3* it, int offset, ulong3 value)
+{
+ it[offset] = value;
+}
+
+void store_ulong3_L1S_L3WB(global ulong3* it, int offset, ulong3 value)
+{
+ it[offset] = value;
+}
+
+void store_ulong3_L1WB_L3WB(global ulong3* it, int offset, ulong3 value)
+{
+ it[offset] = value;
+}
+
+// ulong4
+void store_ulong4_L1UC_L3UC(global ulong4* it, int offset, ulong4 value)
+{
+ it[offset] = value;
+}
+
+void store_ulong4_L1UC_L3WB(global ulong4* it, int offset, ulong4 value)
+{
+ it[offset] = value;
+}
+
+void store_ulong4_L1WT_L3UC(global ulong4* it, int offset, ulong4 value)
+{
+ it[offset] = value;
+}
+
+void store_ulong4_L1WT_L3WB(global ulong4* it, int offset, ulong4 value)
+{
+ it[offset] = value;
+}
+
+void store_ulong4_L1S_L3UC(global ulong4* it, int offset, ulong4 value)
+{
+ it[offset] = value;
+}
+
+void store_ulong4_L1S_L3WB(global ulong4* it, int offset, ulong4 value)
+{
+ it[offset] = value;
+}
+
+void store_ulong4_L1WB_L3WB(global ulong4* it, int offset, ulong4 value)
+{
+ it[offset] = value;
+}
+
+// ulong8
+void store_ulong8_L1UC_L3UC(global ulong8* it, int offset, ulong8 value)
+{
+ it[offset] = value;
+}
+
+void store_ulong8_L1UC_L3WB(global ulong8* it, int offset, ulong8 value)
+{
+ it[offset] = value;
+}
+
+void store_ulong8_L1WT_L3UC(global ulong8* it, int offset, ulong8 value)
+{
+ it[offset] = value;
+}
+
+void store_ulong8_L1WT_L3WB(global ulong8* it, int offset, ulong8 value)
+{
+ it[offset] = value;
+}
+
+void store_ulong8_L1S_L3UC(global ulong8* it, int offset, ulong8 value)
+{
+ it[offset] = value;
+}
+
+void store_ulong8_L1S_L3WB(global ulong8* it, int offset, ulong8 value)
+{
+ it[offset] = value;
+}
+
+void store_ulong8_L1WB_L3WB(global ulong8* it, int offset, ulong8 value)
+{
+ it[offset] = value;
+}
+
+// LSC Fence support
+void mem_fence_gpu_default()
+{
+ write_mem_fence(CLK_GLOBAL_MEM_FENCE);
+}
+
+void mem_fence_workgroup_default()
+{
+ write_mem_fence( CLK_GLOBAL_MEM_FENCE );
+}
+
+void mem_fence_gpu_invalidate()
+{
+ read_mem_fence(CLK_GLOBAL_MEM_FENCE);
+}
+
+void mem_fence_gpu_evict()
+{
+ read_mem_fence(CLK_GLOBAL_MEM_FENCE);
+}
+
+void mem_fence_evict_to_memory()
+{
+ mem_fence(CLK_GLOBAL_MEM_FENCE);
+}
diff --git a/src/intel/vulkan/grl/gpu/mem_utils.h b/src/intel/vulkan/grl/gpu/mem_utils.h
new file mode 100644
index 00000000000..b57a25279fd
--- /dev/null
+++ b/src/intel/vulkan/grl/gpu/mem_utils.h
@@ -0,0 +1,161 @@
+//
+// Copyright (C) 2009-2021 Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+//
+//
+
+#include "shared.h"
+
+/// Write cache line to global memory
+/// Assumes subgroup_size is 16
+///
+/// @param dst 64-byte-aligned output pointer
+/// @param val value to write
+GRL_INLINE void CacheLineSubgroupWrite(global char* dst, uint val)
+{
+ global uint* addrAligned = (global uint*)(global uint16*)dst;
+ intel_sub_group_block_write(addrAligned, val);
+}
+
+/// Read cache line from global memory
+/// Assumes subgroup_size is 16
+///
+/// @param src 64-byte-aligned input pointer
+/// @return uint read from memory
+GRL_INLINE uint CacheLineSubgroupRead(const global char* src)
+{
+ const global uint* addrAligned = (const global uint*)(global uint16*)src;
+ return intel_sub_group_block_read(addrAligned);
+}
+
+/// Copy cache line
+/// Assumes subgroup_size is 16
+///
+/// @param dst 64-byte-aligned output pointer
+/// @param src input pointer
+GRL_INLINE void CopyCacheLine(global char* dst, const global char* src)
+{
+ global const uint* usrc = (global const uint*) (src);
+
+ uint data = intel_sub_group_block_read(usrc);
+ CacheLineSubgroupWrite(dst, data);
+}
+
+/// Fast memory copy
+///
+/// @param dst output pointer
+/// @param src input pointer
+/// @param size number of bytes to copy
+/// @param numGroups number of groups that execute this function
+GRL_INLINE void CopyMemory(global char* dst, const global char* src, uint size, uint numGroups)
+{
+ const uint CACHELINE_SIZE = 64;
+
+ uint globalID = get_local_size(0) * get_group_id(0) + get_local_id(0);
+
+ // This part copies one cacheline per physical thread per write, starting from dst aligned up to the next cacheline boundary.
+ // It also copies the remainder bytes.
+ {
+ uint alignAdd = ((uint)(uint64_t)dst) & (CACHELINE_SIZE - 1);
+ alignAdd = (CACHELINE_SIZE - alignAdd) & (CACHELINE_SIZE - 1);
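+ // Worked example (illustrative): if dst & 63 == 40, then alignAdd == 24, so the
+ // 24-byte misaligned head is handled by the second block below and the copy of
+ // the aligned region starts at dst + 24.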
+
+ if (size > alignAdd)
+ {
+ uint alignedBytesCount = size - alignAdd;
+ uint alignedDWsCount = alignedBytesCount >> 2;
+ global uint* dstAlignedPart = (global uint*)(dst + alignAdd);
+ global uint* srcAlignedPart = (global uint*)(src + alignAdd);
+
+ for (uint id = globalID; id < alignedDWsCount; id += get_local_size(0) * numGroups)
+ {
+ dstAlignedPart[id] = srcAlignedPart[id];
+ }
+
+ if (globalID < alignedBytesCount - (alignedDWsCount << 2))
+ {
+ global uint8_t* dstByteRem = (global uint8_t*)(dstAlignedPart + alignedDWsCount);
+ global uint8_t* srcByteRem = (global uint8_t*)(srcAlignedPart + alignedDWsCount);
+ dstByteRem[globalID] = srcByteRem[globalID];
+ }
+ }
+ }
+
+ // copy the part of dst below the address aligned up to the cacheline boundary
+ {
+ uint misalignmentBytesSize = (4 - (((uint)dst) & /*bytes in DW*/3)) & 3;
+ if (misalignmentBytesSize)
+ {
+ if (globalID < misalignmentBytesSize)
+ {
+ dst[globalID] = src[globalID];
+ }
+ dst += misalignmentBytesSize;
+ src += misalignmentBytesSize;
+ }
+
+ uint misalignmentDWSize = (CACHELINE_SIZE - (((uint)dst) & (CACHELINE_SIZE - 1))) & (CACHELINE_SIZE - 1);
+ if (misalignmentDWSize)
+ {
+ if (globalID < (misalignmentDWSize >> 2))
+ {
+ ((global uint*)dst)[globalID] = ((global uint*)src)[globalID];
+ }
+ }
+ }
+}
+
+#define CACHELINE_SIZE 64
+#define CACHELINE_PER_BLOCK 4
+#define BLOCK_SIZE 256 // = CACHELINE_SIZE * CACHELINE_PER_BLOCK;
+
+GRL_INLINE
+global const char *getInstanceDataToCopy(global const char *array, global const uint64_t *arrayOfPtrs, const uint byteOffset)
+{
+ if (array != NULL)
+ {
+ return array + byteOffset;
+ }
+ else
+ {
+ return (global char *)arrayOfPtrs[byteOffset >> 6];
+ }
+}
+
+// Assumptions:
+// dst is always 64-byte aligned
+// size is always a multiple of 64 bytes (sizeof(InstanceDesc) is always 64 bytes)
+GRL_INLINE
+void copyInstances(global char *dst, global const char *array, global const uint64_t *arrayOfPtrs, const uint64_t size, const uint numGroups)
+{
+ uint taskId = get_group_id(0);
+
+ uint blockedSize = (size) & (~(BLOCK_SIZE - 1));
+
+ uint cachelinedTailOffset = blockedSize;
+ uint cachelinedTailSize = (size - cachelinedTailOffset) & (~(CACHELINE_SIZE - 1));
+
+ uint tailCacheLines = cachelinedTailSize >> 6; // divide by CACHELINE_SIZE
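+ // reversedTaskId == numGroups - 1 - taskId, so the highest-numbered groups handle the cacheline-granular tail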
+ uint reversedTaskId = (uint)(-(((int)taskId) - ((int)numGroups - 1)));
+ if (reversedTaskId < tailCacheLines)
+ {
+ uint byteOffset = cachelinedTailOffset + (reversedTaskId * CACHELINE_SIZE);
+ global const char *src = getInstanceDataToCopy(array, arrayOfPtrs, byteOffset);
+ CopyCacheLine(dst + byteOffset, src);
+ }
+
+ uint numBlocks = blockedSize >> 8;
+ while (taskId < numBlocks)
+ {
+ uint byteOffset = (taskId * BLOCK_SIZE);
+
+ for (uint cl = 0; cl < CACHELINE_PER_BLOCK; cl++)
+ {
+ global const char *src = getInstanceDataToCopy(array, arrayOfPtrs, byteOffset);
+ CopyCacheLine(dst + byteOffset, src);
+ byteOffset += CACHELINE_SIZE;
+ }
+
+ taskId += numGroups;
+ }
+} \ No newline at end of file
diff --git a/src/intel/vulkan/grl/gpu/misc.cl b/src/intel/vulkan/grl/gpu/misc.cl
new file mode 100644
index 00000000000..d32c8267b73
--- /dev/null
+++ b/src/intel/vulkan/grl/gpu/misc.cl
@@ -0,0 +1,367 @@
+//
+// Copyright (C) 2009-2021 Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+//
+//
+
+#include "api_interface.h"
+#include "common.h"
+#include "instance.h"
+#include "misc_shared.h"
+#include "mem_utils.h"
+
+#define DBG(x)
+#define ENABLE_CHECKS 0
+
+#define CACHELINE_SIZE 64
+#define CACHELINE_PER_BLOCK 4
+#define BLOCK_SIZE 256 // = CACHELINE_SIZE * CACHELINE_PER_BLOCK;
+
+GRL_INLINE
+uint32_t getGeomDescPrimitiveCountAsUint32t(global GRL_RAYTRACING_GEOMETRY_DESC *geomDesc, uint64_t index)
+{
+ return (uint32_t)GRL_get_primitive_count(&geomDesc[index]);
+}
+
+GRL_INLINE
+uint32_t getGeomDescTypeAndFlagsAsUint32t(global GRL_RAYTRACING_GEOMETRY_DESC *geomDesc, uint64_t index)
+{
+ return (uint32_t)GRL_get_Type(&geomDesc[index]) |
+ (((uint32_t)GRL_get_Flags(&geomDesc[index])) << 16);
+}
+
+GRL_INLINE
+uint64_t getGeomDescAsUint64t(global GRL_RAYTRACING_GEOMETRY_DESC *geomDesc, uint64_t index)
+{
+ return (uint64_t)getGeomDescPrimitiveCountAsUint32t(geomDesc, index) |
+ (((uint64_t)getGeomDescTypeAndFlagsAsUint32t(geomDesc, index)) << 32);
+}
+
+// Assumptions:
+// dst is always 64-byte aligned
+GRL_INLINE
+void copyGeoMetaData(global char* dst, global GRL_RAYTRACING_GEOMETRY_DESC *geomDesc, uint64_t size, uint numGroups)
+{
+ uint taskId = get_group_id(0);
+ uint localId = get_sub_group_local_id();
+
+ uint cachelinedSize = (size) & (~(CACHELINE_SIZE-1));
+
+ uint reminderOffset = cachelinedSize;
+ uint reminderQWSize = (size - reminderOffset) >> 3;
+
+ uint tailCacheLines = cachelinedSize >> 6; // divide by CACHELINE_SIZE
+ uint reversedTaskId = (uint)(-(((int)taskId) - ((int)numGroups-1)));
+ if (reversedTaskId == tailCacheLines && localId < reminderQWSize)
+ {
+ uint reminderOffsetQW = reminderOffset >> 3;
+ global uint64_t* dstQW = (global uint64_t*)(dst);
+ dstQW[localId + reminderOffsetQW] = getGeomDescAsUint64t(geomDesc, localId + reminderOffsetQW);
+ }
+
+ uint numCacheLines = cachelinedSize >> 6;
+ while (taskId < numCacheLines)
+ {
+ uint byteOffset = taskId * CACHELINE_SIZE;
+ uint geoIdFromOffset = (byteOffset >> 3) + (localId >> 1);
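+ // Each pair of lanes produces one 8-byte geo meta-data entry: the even lane writes the
+ // primitive-count DW and the odd lane the type/flags DW, filling a full 64B cacheline per subgroup.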
+
+ uint32_t data = 0;
+ if (localId & 1)
+ {
+ data = getGeomDescTypeAndFlagsAsUint32t(geomDesc, geoIdFromOffset);
+ }
+ else
+ {
+ data = getGeomDescPrimitiveCountAsUint32t(geomDesc, geoIdFromOffset);
+ }
+ CacheLineSubgroupWrite(dst + byteOffset, data);
+
+ taskId += numGroups;
+ }
+}
+
+GRL_INLINE
+uint groupCountForInstancesCopySize(uint size)
+{
+ return (size >> 8) + 3;
+}
+
+GRL_INLINE
+uint groupCountForGeoMetaDataCopySize(uint size)
+{
+ return (size >> 6) + 1;
+}
+
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
+__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH)))
+void kernel copy_instances(global char* dest, global char* instancesArray, uint64_t size)
+{
+ // global char *dest = (global char *)((unsigned long)bvh + bvh->Meta.instanceDescsStart);
+ copyInstances(dest, instancesArray, NULL, size, groupCountForInstancesCopySize(size));
+}
+
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
+__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH)))
+void kernel copy_instances_indirect(global char* dest, global char* instancesArray, global const struct IndirectBuildRangeInfo* const indirect_data)
+{
+ uint64_t size = indirect_data->primitiveCount * sizeof(InstanceDesc);
+ instancesArray += indirect_data->primitiveOffset;
+ uint tid = get_sub_group_local_id() + get_group_id(0) * MAX_HW_SIMD_WIDTH;
+ if (tid == 0)
+ {
+ struct BVHBase* bvh = (struct BVHBase*)dest;
+ bvh->Meta.instanceCount = indirect_data->primitiveCount;
+ }
+ copyInstances(dest, instancesArray, NULL, size, groupCountForInstancesCopySize(size));
+}
+
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
+__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH)))
+void kernel copy_instance_ptrs(global char* dest, global uint64_t* arrayOfPtrs, uint64_t size)
+{
+ //global char *dest = (global char *)((unsigned long)bvh + bvh->Meta.instanceDescsStart);
+ copyInstances(dest, NULL, arrayOfPtrs, size, groupCountForInstancesCopySize(size));
+}
+
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
+__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH)))
+void kernel copy_instance_ptrs_indirect(global char* dest, global uint64_t* arrayOfPtrs, global struct IndirectBuildRangeInfo const * const indirect_data)
+{
+ uint64_t size = indirect_data->primitiveCount * sizeof(InstanceDesc);
+ arrayOfPtrs += indirect_data->primitiveOffset;
+ uint tid = get_sub_group_local_id() + get_group_id(0) * MAX_HW_SIMD_WIDTH;
+ if (tid == 0)
+ {
+ struct BVHBase* bvh = (struct BVHBase*)dest;
+ bvh->Meta.instanceCount = indirect_data->primitiveCount;
+ }
+ copyInstances(dest, NULL, arrayOfPtrs, size, groupCountForInstancesCopySize(size));
+}
+
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
+__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH)))
+void kernel copy_instances_base_ptr(global BVHBase* bvh, global char* instancesArray, uint64_t size)
+{
+ global char *dest = (global char *)((unsigned long)bvh + bvh->Meta.instanceDescsStart);
+ copyInstances(dest, instancesArray, NULL, size, groupCountForInstancesCopySize(size));
+}
+
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
+__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH)))
+void kernel copy_instances_base_ptr_indirect(global BVHBase* bvh, global char* instancesArray, global struct IndirectBuildRangeInfo const * const indirect_data)
+{
+ global char* dest = (global char*)((unsigned long)bvh + bvh->Meta.instanceDescsStart);
+ uint64_t size = indirect_data->primitiveCount * sizeof(InstanceDesc);
+ instancesArray += indirect_data->primitiveOffset;
+ copyInstances(dest, instancesArray, NULL, size, groupCountForInstancesCopySize(size));
+}
+
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
+__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH)))
+void kernel copy_instance_ptrs_base_ptr(global BVHBase* bvh, global uint64_t* arrayOfPtrs, uint64_t size)
+{
+ global char *dest = (global char *)((unsigned long)bvh + bvh->Meta.instanceDescsStart);
+ copyInstances(dest, NULL, arrayOfPtrs, size, groupCountForInstancesCopySize(size));
+}
+
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
+__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH)))
+void kernel copy_instance_ptrs_base_ptr_indirect(global BVHBase* bvh, global uint64_t* arrayOfPtrs, global struct IndirectBuildRangeInfo const * const indirect_data)
+{
+ global char* dest = (global char*)((unsigned long)bvh + bvh->Meta.instanceDescsStart);
+ uint64_t size = indirect_data->primitiveCount * sizeof(InstanceDesc);
+ arrayOfPtrs += indirect_data->primitiveOffset;
+ copyInstances(dest, NULL, arrayOfPtrs, size, groupCountForInstancesCopySize(size));
+}
+
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
+__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH)))
+void kernel copy_geo_meta_data(global char* dest, global char* src, uint64_t size)
+{
+ //global char *dest = (global char *)((unsigned long)bvh + bvh->Meta.geoDescsStart);
+ global GRL_RAYTRACING_GEOMETRY_DESC *geomDesc = (global GRL_RAYTRACING_GEOMETRY_DESC *)((unsigned long)src);
+ copyGeoMetaData(dest, geomDesc, size, groupCountForGeoMetaDataCopySize(size));
+}
+
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__( ( reqd_work_group_size( MAX_HW_SIMD_WIDTH, 1, 1 ) ) )
+__attribute__( ( intel_reqd_sub_group_size( MAX_HW_SIMD_WIDTH ) ) )
+void kernel copy_geo_descs_indirect_build(global char* dest, global char* src, global struct IndirectBuildRangeInfo const * const indirect_data, uint numGeometries)
+{
+ uint32_t gid = get_local_id(0) + get_group_id(0) * get_local_size(0);
+ if (gid < numGeometries) {
+ global GRL_RAYTRACING_GEOMETRY_DESC* dstDesc = (global GRL_RAYTRACING_GEOMETRY_DESC*)(dest);
+ global GRL_RAYTRACING_GEOMETRY_DESC* srcDesc = (global GRL_RAYTRACING_GEOMETRY_DESC*)(src);
+
+ GRL_RAYTRACING_GEOMETRY_DESC geo = srcDesc[gid];
+
+ uint primitiveCount = indirect_data[gid].primitiveCount;
+ uint primitiveOffset = indirect_data[gid].primitiveOffset;
+ uint firstVertex = indirect_data[gid].firstVertex;
+ uint transformOffset = indirect_data[gid].transformOffset;
+
+ if (srcDesc[gid].Type == GEOMETRY_TYPE_TRIANGLES)
+ {
+ if (geo.Desc.Triangles.IndexFormat == INDEX_FORMAT_NONE)
+ {
+ geo.Desc.Triangles.VertexCount = primitiveCount * 3;
+ geo.Desc.Triangles.pVertexBuffer += primitiveOffset
+ + firstVertex * geo.Desc.Triangles.VertexBufferByteStride;
+ }
+ else
+ {
+ geo.Desc.Triangles.IndexCount = primitiveCount * 3;
+ geo.Desc.Triangles.pIndexBuffer += primitiveOffset;
+ geo.Desc.Triangles.pVertexBuffer += firstVertex * geo.Desc.Triangles.VertexBufferByteStride;
+ }
+ if (geo.Desc.Triangles.pTransformBuffer) {
+ geo.Desc.Triangles.pTransformBuffer += transformOffset;
+ }
+ } else {
+ // GEOMETRY_TYPE_PROCEDURAL
+ geo.Desc.Procedural.AABBCount = primitiveCount;
+ geo.Desc.Procedural.pAABBs_GPUVA += primitiveOffset;
+ }
+
+ dstDesc[gid] = geo;
+ }
+}
+
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(1, 1, 1))) void kernel batched_init_globals(global struct BatchedInitGlobalsData *data)
+{
+ uint groupID = get_group_id(0);
+
+ struct BatchedInitGlobalsData entry = data[groupID];
+
+ global struct Globals* globals = (global struct Globals*)entry.p_build_globals;
+ global char *bvh_mem = (global char*)entry.p_bvh_buffer;
+ uint numPrimitives = entry.numPrimitives;
+ uint numGeometries = entry.numGeometries;
+ uint numInstances = entry.numInstances;
+ uint instance_descs_start = entry.instance_descs_start;
+ uint geo_meta_data_start = entry.geo_meta_data_start;
+ uint node_data_start = entry.node_data_start;
+ uint quad_data_start = entry.leaf_data_start;
+ uint instance_data_start = entry.leaf_data_start;
+ uint procedural_data_start = entry.procedural_data_start;
+ uint back_pointer_start = entry.back_pointer_start;
+ uint build_record_start = entry.leaf_data_start;
+ uint totalBytes = entry.sizeTotal;
+ uint leafPrimType = entry.leafType;
+ uint leafSize = entry.leafSize;
+
+ uint root_node_offset = node_data_start;
+ struct BVHBase *base = (struct BVHBase *)bvh_mem;
+
+ base->Meta.instanceCount = numInstances;
+ base->Meta.geoCount = numGeometries;
+ base->Meta.instanceDescsStart = instance_descs_start;
+ base->Meta.geoDescsStart = geo_meta_data_start;
+ base->Meta.allocationSize = totalBytes;
+ // This doesn't work correctly
+ //ERROR_INFO initErr = { 0, 0, 0, 0xAAABBAAA };
+ //base->Meta.errors = initErr;
+ base->Meta.errors.type = 0;
+ base->Meta.errors.offset_in_BVH = 0; //in 64B units
+ base->Meta.errors.when = 0;
+ base->Meta.errors.reserved = 0xAAABBAAA;
+
+ base->nodeDataCur = node_data_start / 64;
+ base->quadLeafStart = quad_data_start / 64;
+ base->quadLeafCur = quad_data_start / 64;
+ base->instanceLeafStart = instance_data_start / 64;
+ base->instanceLeafEnd = instance_data_start / 64;
+ base->proceduralDataStart = procedural_data_start / 64;
+ base->proceduralDataCur = procedural_data_start / 64;
+ base->backPointerDataStart = back_pointer_start / 64;
+ base->refitTreeletsDataStart = totalBytes / 64;
+ base->refitStartPointDataStart = totalBytes / 64;
+ base->BVHDataEnd = totalBytes / 64;
+ base->refitTreeletCnt = 0;
+ base->refitTreeletCnt2 = 0;
+ base->rootNodeOffset = root_node_offset;
+
+ base->fatLeafCount = 0;
+ base->fatLeafTableStart = entry.fatleaf_table_start / 64;
+ base->innerCount = 0;
+ base->innerTableStart = entry.innernode_table_start / 64;
+ base->quadLeftoversCountNewAtomicUpdate = 0;
+ base->quadTableSizeNewAtomicUpdate = 0;
+ base->quadIndicesDataStart = entry.quad_indices_data_start / 64;
+
+ if (back_pointer_start != totalBytes)
+ {
+ BackPointers* back_pointers = BVHBase_GetBackPointers(base);
+ uint root_node_idx = root_node_offset - node_data_start;
+ global uint *root_node_backpointer = (global uint *)InnerNode_GetBackPointer(back_pointers,root_node_idx);
+ *root_node_backpointer = ((uint)-1) << 6;
+ }
+
+ AABB3f_init(&base->Meta.bounds);
+ AABB_init(&globals->centroidBounds);
+
+ globals->build_record_start = build_record_start;
+
+ globals->numBuildRecords = 0;
+ globals->numBuildRecords_extended = 0;
+ globals->numPrimitives = numPrimitives;
+ globals->numSplittedPrimitives = 0;
+ globals->sync = 0;
+ globals->probThreshold = 0.0f;
+ globals->leafPrimType = leafPrimType;
+ globals->leafSize = leafSize;
+}
+
+
+
+// This is a temporary workaround (WA) for the mock in DXR
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1))) void kernel copy_mock(global char *dest,
+ global char *src,
+ uint32_t size)
+{
+ uint32_t globalId = get_local_id(0) + get_group_id(0) * get_local_size(0);
+ uint32_t globalSize = get_num_groups(0) * get_local_size(0);
+ for (uint32_t i = globalId; i < size; i += globalSize)
+ {
+ dest[i] = src[i];
+ }
+}
+
+
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(32, 1, 1)))
+void kernel mem_set(global char *dest,
+ dword byte,
+ dword size)
+{
+ uint32_t globalId = get_local_id(0) + get_group_id(0) * get_local_size(0);
+ if (globalId < size)
+ {
+ dest[globalId] = (char)byte;
+ }
+}
+
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(32, 1, 1)))
+void kernel mem_set_size_ptr(global char *dest,
+ dword byte,
+ global qword* sizePtr)
+{
+ uint32_t globalId = get_local_id(0) + get_group_id(0) * get_local_size(0);
+ if (globalId < *sizePtr)
+ {
+ dest[globalId] = (char)byte;
+ }
+}
diff --git a/src/intel/vulkan/grl/gpu/misc.grl b/src/intel/vulkan/grl/gpu/misc.grl
new file mode 100644
index 00000000000..cb98534afb4
--- /dev/null
+++ b/src/intel/vulkan/grl/gpu/misc.grl
@@ -0,0 +1,278 @@
+//
+// Copyright (C) 2009-2021 Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+//
+//
+
+module misc;
+
+kernel_module misc("misc.cl")
+{
+ kernel opencl_kernel_batched_init_globals < kernelFunction="batched_init_globals" >;
+ kernel opencl_kernel_copy_instances < kernelFunction="copy_instances" >;
+ kernel opencl_kernel_copy_instances_indirect < kernelFunction="copy_instances_indirect" >;
+ kernel opencl_kernel_copy_instance_ptrs < kernelFunction="copy_instance_ptrs" >;
+ kernel opencl_kernel_copy_instance_ptrs_indirect < kernelFunction="copy_instance_ptrs_indirect" >;
+ kernel opencl_kernel_copy_instances_base_ptr < kernelFunction="copy_instances_base_ptr" >;
+ kernel opencl_kernel_copy_instances_base_ptr_indirect < kernelFunction="copy_instances_base_ptr_indirect" >;
+ kernel opencl_kernel_copy_instance_ptrs_base_ptr < kernelFunction="copy_instance_ptrs_base_ptr" >;
+ kernel opencl_kernel_copy_instance_ptrs_base_ptr_indirect < kernelFunction="copy_instance_ptrs_base_ptr_indirect" >;
+ kernel opencl_kernel_copy_geo_meta_data < kernelFunction="copy_geo_meta_data" >;
+ kernel opencl_kernel_copy_geo_descs_indirect_build < source="misc.cl", kernelFunction="copy_geo_descs_indirect_build" >;
+ kernel opencl_kernel_copy_mock < kernelFunction="copy_mock" >;
+ kernel opencl_kernel_memset < kernelFunction="mem_set" >;
+ kernel opencl_kernel_memset_size_ptr < kernelFunction="mem_set_size_ptr" >;
+}
+
+import struct MKBuilderState "structs.grl";
+import struct MKSizeEstimate "structs.grl";
+
+
+metakernel batched_init_globals(
+ qword p_data,
+ dword numWgs)
+{
+ dispatch opencl_kernel_batched_init_globals(numWgs,1,1) args(p_data);
+}
+
+metakernel copy_instances(
+ qword bvh_buffer,
+ qword instanceDescsBuffer,
+ qword totalSizeToCopy,
+ dword numThreads)
+{
+ dispatch opencl_kernel_copy_instances (numThreads, 1, 1) args(
+ bvh_buffer,
+ instanceDescsBuffer,
+ totalSizeToCopy);
+}
+
+metakernel
+copy_instances_indirect( qword bvh_buffer, qword instanceDescsBuffer, qword indirectBuildRangeInfo )
+{
+
+ define num_groups REG0;
+ define C_2 REG2;
+ define C_3 REG3;
+
+ C_2 = 2;
+ C_3 = 3;
+
+ // sizeof(InstanceDesc) == 64, matches DXR and Vulkan API definitions
+ // num_groups = ((num_instances << log_2(64)) >> 8) + 3 = (num_instances >> 2) + 3
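+    // Worked example (illustrative only): 100 instances -> (100 >> 2) + 3 = 28 groups.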
+ num_groups = load_dword( indirectBuildRangeInfo );
+ num_groups = num_groups >> C_2;
+ num_groups = num_groups + C_3;
+
+ DISPATCHDIM_X = num_groups.lo;
+ DISPATCHDIM_Y = 1;
+ DISPATCHDIM_Z = 1;
+
+ dispatch_indirect opencl_kernel_copy_instances_indirect args(
+ bvh_buffer,
+ instanceDescsBuffer,
+ indirectBuildRangeInfo);
+}
+
+metakernel copy_instance_ptrs(
+ qword bvh_buffer,
+ qword instanceDescPtrsBuffer,
+ qword totalSizeToCopy,
+ dword numThreads)
+{
+ dispatch opencl_kernel_copy_instance_ptrs (numThreads, 1, 1) args(
+ bvh_buffer,
+ instanceDescPtrsBuffer,
+ totalSizeToCopy);
+}
+
+metakernel copy_instance_ptrs_indirect(
+ qword bvh_buffer,
+ qword instanceDescPtrsBuffer,
+ qword indirectBuildRangeInfo)
+{
+ define num_groups REG0;
+ define C_2 REG2;
+ define C_3 REG3;
+
+ C_2 = 2;
+ C_3 = 3;
+
+ // sizeof(InstanceDesc) == 64, matches DXR and Vulkan API definitions
+ // num_groups = ((num_instances << log_2(64)) >> 8) + 3 = (num_instances >> 2) + 3
+ num_groups = load_dword( indirectBuildRangeInfo );
+ num_groups = num_groups >> C_2;
+ num_groups = num_groups + C_3;
+
+ DISPATCHDIM_X = num_groups.lo;
+ DISPATCHDIM_Y = 1;
+ DISPATCHDIM_Z = 1;
+
+ dispatch_indirect opencl_kernel_copy_instance_ptrs_indirect args(
+ bvh_buffer,
+ instanceDescPtrsBuffer,
+ indirectBuildRangeInfo);
+}
+
+metakernel copy_instances_base_ptr(
+ qword bvh_buffer,
+ qword instanceDescsBuffer,
+ qword totalSizeToCopy,
+ dword numThreads)
+{
+ dispatch opencl_kernel_copy_instances_base_ptr (numThreads, 1, 1) args(
+ bvh_buffer,
+ instanceDescsBuffer,
+ totalSizeToCopy);
+}
+
+metakernel copy_instances_base_ptr_indirect(
+ qword bvh_buffer,
+ qword instanceDescsBuffer,
+ qword indirectBuildRangeInfo)
+{
+ define num_groups REG0;
+ define C_2 REG2;
+ define C_3 REG3;
+
+ C_2 = 2;
+ C_3 = 3;
+
+ // sizeof(InstanceDesc) == 64, matches DXR and Vulkan API definitions
+ // num_groups = ((num_instances << log_2(64)) >> 8) + 3 = (num_instances >> 2) + 3
+ num_groups = load_dword( indirectBuildRangeInfo );
+ num_groups = num_groups >> C_2;
+ num_groups = num_groups + C_3;
+
+ DISPATCHDIM_X = num_groups.lo;
+ DISPATCHDIM_Y = 1;
+ DISPATCHDIM_Z = 1;
+
+ dispatch_indirect opencl_kernel_copy_instances_base_ptr_indirect args(
+ bvh_buffer,
+ instanceDescsBuffer,
+ indirectBuildRangeInfo);
+}
+
+metakernel copy_instance_ptrs_base_ptr(
+ qword bvh_buffer,
+ qword instanceDescPtrsBuffer,
+ qword totalSizeToCopy,
+ dword numThreads)
+{
+ dispatch opencl_kernel_copy_instance_ptrs_base_ptr (numThreads, 1, 1) args(
+ bvh_buffer,
+ instanceDescPtrsBuffer,
+ totalSizeToCopy);
+}
+
+metakernel copy_instance_ptrs_base_ptr_indirect(
+ qword bvh_buffer,
+ qword instanceDescPtrsBuffer,
+ qword indirectBuildRangeInfo)
+{
+ define num_groups REG0;
+ define C_2 REG2;
+ define C_3 REG3;
+
+ C_2 = 2;
+ C_3 = 3;
+
+ // sizeof(InstanceDesc) == 64, matches DXR and Vulkan API definitions
+ // num_groups = ((num_instances << log_2(64)) >> 8) + 3 = (num_instances >> 2) + 3
+ num_groups = load_dword( indirectBuildRangeInfo );
+ num_groups = num_groups >> C_2;
+ num_groups = num_groups + C_3;
+
+ DISPATCHDIM_X = num_groups.lo;
+ DISPATCHDIM_Y = 1;
+ DISPATCHDIM_Z = 1;
+
+ dispatch_indirect opencl_kernel_copy_instance_ptrs_base_ptr_indirect args(
+ bvh_buffer,
+ instanceDescPtrsBuffer,
+ indirectBuildRangeInfo);
+}
+
+metakernel copy_geo_descs(
+ qword private_dest,
+ qword transient_src,
+ qword indirectBuildRangeInfo,
+ dword numGeometries)
+{
+
+ define num_groups (numGeometries + 16 - 1) / 16;
+ dispatch opencl_kernel_copy_geo_descs_indirect_build(num_groups, 1, 1) args(
+ private_dest,
+ transient_src,
+ indirectBuildRangeInfo,
+ numGeometries);
+}
+
+metakernel copy_geo_meta_data(
+ qword bvh_buffer,
+ qword geomdesc_buffer,
+ qword totalSizeToCopy,
+ dword numThreads)
+{
+ dispatch opencl_kernel_copy_geo_meta_data (numThreads, 1, 1) args(
+ bvh_buffer,
+ geomdesc_buffer,
+ totalSizeToCopy);
+}
+
+
+const COPY_MOCK_GROUP_SIZE = 16;
+
+metakernel copy_mock(
+ qword dest,
+ qword src,
+ dword size)
+{
+ define num_groups (size + COPY_MOCK_GROUP_SIZE - 1) / COPY_MOCK_GROUP_SIZE;
+ dispatch opencl_kernel_copy_mock(num_groups, 1, 1) args(
+ dest,
+ src,
+ size);
+}
+
+metakernel memset(
+ qword dest,
+ dword byte,
+ dword size)
+{
+ define num_groups (size + 32 - 1) / 32;
+ dispatch opencl_kernel_memset(num_groups, 1, 1) args(
+ dest,
+ byte,
+ size);
+}
+
+metakernel memset_size_ptr(
+ qword dest,
+ dword byte,
+ qword sizePtr)
+{
+ define byteSize REG0;
+ define C_32 REG1; C_32 = 32;
+ define C_1 REG2; C_1 = 1;
+ define C_4 REG3; C_4 = 4;
+ define numGroupsRqd REG4;
+
+ byteSize = load_dword(sizePtr);
+
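+    // The sequence below computes numGroupsRqd = ceil(byteSize / 32): the two
+    // shifts (>> 4 then >> 1) together divide by 32, matching the work-group
+    // size of 32 used by mem_set_size_ptr.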
+ numGroupsRqd = byteSize + C_32;
+ numGroupsRqd = numGroupsRqd - C_1;
+ numGroupsRqd = numGroupsRqd >> C_4;
+ numGroupsRqd = numGroupsRqd >> C_1;
+
+ DISPATCHDIM_X = numGroupsRqd.lo;
+ DISPATCHDIM_Y = 1;
+ DISPATCHDIM_Z = 1;
+
+ dispatch_indirect opencl_kernel_memset_size_ptr args(
+ dest,
+ byte,
+ sizePtr);
+}
diff --git a/src/intel/vulkan/grl/gpu/misc_legacy.cl b/src/intel/vulkan/grl/gpu/misc_legacy.cl
new file mode 100644
index 00000000000..a464e89537c
--- /dev/null
+++ b/src/intel/vulkan/grl/gpu/misc_legacy.cl
@@ -0,0 +1,386 @@
+//
+// Copyright (C) 2009-2021 Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+//
+//
+
+#include "input_client_structs.h"
+#include "common.h"
+#include "instance.h"
+
+#define DBG(x)
+#define ENABLE_CHECKS 0
+
+/*
+
+  This kernel implements an exclusive scan addition operation. The
+  implementation currently uses only a single DSS.
+
+ */
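+// Illustrative example (not part of the original code): an exclusive scan of
+// input [3, 1, 7, 0] yields output [0, 3, 4, 11]; each output element is the
+// sum of all input elements strictly before it.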
+__attribute__((reqd_work_group_size(16, 1, 1)))
+__attribute__((intel_reqd_sub_group_size(16))) void kernel
+parallel_scan_exclusive_add(global uint *input,
+ global uint *output,
+ const uint N)
+{
+ const uint j = get_local_id(0);
+ const uint J = get_local_size(0);
+ const uint BLOCKSIZE = (N + J - 1) / J;
+ const uint start = min((j + 0) * BLOCKSIZE, N);
+ const uint end = min((j + 1) * BLOCKSIZE, N);
+
+ uint base = 0;
+ for (uint i = start; i < end; i++)
+ base += input[i];
+
+ base = work_group_scan_exclusive_add(base);
+
+ uint accu = 0;
+ for (uint i = start; i < end; i++)
+ {
+ output[i] = base + accu;
+ accu += input[i];
+ }
+}
+
+/*
+
+  This kernel implements an exclusive scan addition operation that can use the entire GPU.
+
+ */
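+// Sketch of the two-phase flow (derived from the kernels below): phase0 has
+// each work group reduce its slice of the input into prefix_sums[groupID];
+// phase1 then sums the prefix_sums of all preceding groups to obtain a global
+// base and performs the local exclusive scan on top of it.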
+__attribute__((reqd_work_group_size(16, 1, 1)))
+__attribute__((intel_reqd_sub_group_size(16))) void kernel
+parallel_scan_exclusive_add_phase0(global uint *input,
+ global uint *output,
+ global uint *prefix_sums,
+ const uint N)
+{
+ const uint local_size = get_local_size(0);
+ const uint numTasks = get_num_groups(0);
+ const uint groupID = get_group_id(0);
+ const uint localID = get_local_id(0);
+ const uint global_startID = (groupID + 0) * N / numTasks;
+ const uint global_endID = (groupID + 1) * N / numTasks;
+
+ uint base = 0;
+ for (uint i = global_startID + localID; i < global_endID; i += local_size)
+ base += input[i];
+
+ base = work_group_reduce_add(base);
+
+ if (localID == 0)
+ {
+ prefix_sums[groupID] = base;
+ printf("%d -> %d \n", groupID, base);
+ }
+}
+
+__attribute__((reqd_work_group_size(16, 1, 1)))
+__attribute__((intel_reqd_sub_group_size(16))) void kernel
+parallel_scan_exclusive_add_phase1(global uint *input,
+ global uint *output,
+ global uint *prefix_sums,
+ const uint N)
+{
+ const uint local_size = get_local_size(0);
+ const uint numTasks = get_num_groups(0);
+ const uint groupID = get_group_id(0);
+ const uint localID = get_local_id(0);
+ const uint global_startID = (groupID + 0) * N / numTasks;
+ const uint global_endID = (groupID + 1) * N / numTasks;
+ const uint local_range = global_endID - global_startID;
+
+ uint global_base = 0;
+ for (uint i = 0; i < groupID; i++)
+ global_base += prefix_sums[i];
+
+ const uint j = get_local_id(0);
+ const uint J = get_local_size(0);
+ const uint BLOCKSIZE = (local_range + J - 1) / J;
+ const uint startID = (j + 0) * local_range / J + global_startID;
+ const uint endID = (j + 1) * local_range / J + global_startID;
+
+ uint base = 0;
+ for (uint i = startID; i < endID; i++)
+ base += input[i];
+
+ base = work_group_scan_exclusive_add(base);
+
+ uint accu = 0;
+ for (uint i = startID; i < endID; i++)
+ {
+ output[i] = global_base + base + accu;
+ accu += input[i];
+ }
+}
+
+/* ========================================================================= */
+/* ============================== STATISTICS =============================== */
+/* ========================================================================= */
+
+/* ====== STATS config ====== */
+
+#define ENABLE_STAT_CHECKS 1
+#define DBG_STATS(x)
+
+__attribute__((reqd_work_group_size(256, 1, 1)))
+__attribute__((intel_reqd_sub_group_size(16))) void kernel
+printBVHStatistics(global struct Globals *globals,
+ global char *bvh_mem,
+ global struct StatStackEntry *global_stack0,
+ global struct StatStackEntry *global_stack1,
+ const uint presplit)
+{
+ const uint globalID = get_global_id(0);
+ const uint localID = get_local_id(0);
+ const uint local_size = get_local_size(0);
+
+ struct BVHBase *base = (struct BVHBase *)bvh_mem;
+ const uint root = base->rootNodeOffset;
+
+ local uint stack_items[2];
+ local uint iterations;
+
+ struct AABB root_aabb = getAABB_QBVHNodeN((global struct QBVHNodeN *)(bvh_mem + root));
+ root_aabb = conservativeAABB(&root_aabb);
+ const float root_area = AABB_halfArea(&root_aabb);
+
+ global struct QBVHNodeN *root_node = (global struct QBVHNodeN *)(bvh_mem + base->rootNodeOffset);
+
+ if (root_node->type != BVH_INTERNAL_NODE)
+ {
+ const uint numChildren = getNumChildren_QBVHNodeN(root_node);
+ const uint current = root;
+ for (uint i = 0; i < numChildren; i++)
+ {
+ struct AABB aabb = extractAABB_QBVHNodeN(root_node, i);
+ const float area = AABB_halfArea(&aabb);
+
+ global_stack0[i].node = current + root_node->offset * 64 + i * sizeof(struct Quad);
+ global_stack0[i].type = root_node->type;
+ global_stack0[i].area = area;
+ global_stack0[i].aabb = aabb;
+ global_stack0[i].depth = 0;
+ }
+ stack_items[0] = numChildren;
+ stack_items[1] = 0;
+ }
+ else
+ {
+ global_stack0[0].node = root;
+ global_stack0[0].type = root_node->type;
+ global_stack0[0].area = root_area;
+ global_stack0[0].aabb = root_aabb;
+ global_stack0[0].depth = 1;
+ stack_items[0] = 1;
+ stack_items[1] = 0;
+ }
+
+ const uint maxInnerNodeOffset = globals->node_mem_allocator.cur;
+ const uint maxLeafNodeOffset = globals->quad_mem_allocator.cur;
+
+ DBG_STATS(if (localID == 0) printf("diff %d \n", (globals->node_mem_allocator_cur - globals->node_mem_allocator_start) / 64));
+
+ iterations = 0;
+
+ work_group_barrier(CLK_LOCAL_MEM_FENCE);
+
+ float sah_nodes = 0.0f;
+ float sah_leaves = 0.0f;
+ uint leaves = 0;
+ uint inner_nodes = 0;
+ uint max_depth = 0;
+ uint leaf_items = 0;
+ uint inner_nodes_valid_children = 0;
+
+ while (1)
+ {
+ work_group_barrier(CLK_GLOBAL_MEM_FENCE);
+ const uint buffer_index = (iterations % 2) == 0 ? 0 : 1;
+ global struct StatStackEntry *input_global_stack = buffer_index == 0 ? global_stack0 : global_stack1;
+ global struct StatStackEntry *output_global_stack = buffer_index == 0 ? global_stack1 : global_stack0;
+
+ const uint local_stack_items = stack_items[buffer_index];
+ stack_items[1 - buffer_index] = 0;
+
+ DBG_STATS(if (globalID == 0) printf("iterations %d local_stack_items %d \n", iterations, local_stack_items));
+
+ if (local_stack_items == 0)
+ break;
+ //if (iterations == 5) break;
+
+ work_group_barrier(CLK_GLOBAL_MEM_FENCE);
+
+ if (globalID == 0)
+ iterations++;
+
+ for (uint sindex = localID; sindex < local_stack_items; sindex += local_size)
+ {
+
+ uint current = input_global_stack[sindex].node;
+ uint type = input_global_stack[sindex].type;
+ float current_area = input_global_stack[sindex].area;
+ struct AABB current_aabb = input_global_stack[sindex].aabb;
+ uint current_depth = input_global_stack[sindex].depth;
+
+ //printf("localID %d sindex %d current %d type %d local_stack_items %d \n",localID,sindex,current,type,local_stack_items);
+
+ max_depth = max(max_depth, current_depth);
+
+ if (type == BVH_QUAD_NODE)
+ {
+ unsigned int prims = 1; //getNumLeafPrims(current);
+ if (prims > BVH_LEAF_N_MAX)
+ printf("too many items in leaf %d \n", prims);
+ unsigned int prims_offset = current; //getLeafOffset(current);
+ //printf("prims_offset %d \n",prims_offset);
+
+ leaf_items += prims;
+ sah_leaves += current_area;
+ leaves++;
+#if ENABLE_STAT_CHECKS == 1
+ struct AABB leafAABB;
+ AABB_init(&leafAABB);
+
+ global struct Quad *quads = (global struct Quad *)(bvh_mem + prims_offset);
+ //printf("prims_offset %d \n",prims_offset);
+
+ for (uint i = 0; i < prims; i++)
+ {
+ struct AABB quadAABB = getAABB_Quad(&quads[i]);
+ AABB_extend(&leafAABB, &quadAABB);
+ }
+
+ if (!presplit && !AABB_subset(&leafAABB, &current_aabb))
+ {
+ printf("leaf error: current %d depth %d \n", current, current_depth);
+ AABB_print(&current_aabb);
+ printf("leaf bounds: \n");
+ AABB_print(&leafAABB);
+ }
+#endif
+ }
+ else if (type == BVH_INTERNAL_NODE)
+ {
+ inner_nodes++;
+ sah_nodes += current_area;
+ global struct QBVHNodeN *nodeN = (global struct QBVHNodeN *)(bvh_mem + current);
+
+ uint children = 0;
+ for (uint i = 0; i < BVH_NODE_N6; i++)
+ {
+ if (nodeN->qbounds.lower_x[i] > nodeN->qbounds.upper_x[i])
+ break;
+ children++;
+ }
+ //printf("children %d \n",children);
+
+#if ENABLE_STAT_CHECKS == 1
+ if (children > BVH_NODE_N6 || children == 0)
+ {
+ printf("#children not in valid range: %d offset %d localID %d \n", children, current, localID);
+ printQBVHNodeN(nodeN);
+ }
+
+ if (nodeN->offset > globals->totalAllocatedMem || (int)nodeN->offset < 0)
+ {
+ printf("offset error %d \n", nodeN->offset);
+ }
+#endif
+
+ uint children_offset = atomic_add(&stack_items[1 - buffer_index], children);
+
+ for (uint i = 0; i < children; i++)
+ {
+ inner_nodes_valid_children++;
+
+ struct AABB aabb = extractAABB_QBVHNodeN(nodeN, i);
+ const float area = AABB_halfArea(&aabb);
+
+ aabb = conservativeAABB(&aabb);
+
+#if 0 // ENABLE_STAT_CHECKS == 1 // FIXME: not clear whether parent child property still holds !!!!
+
+ // if (aabb.lower.x == (float)(INFINITY))
+ // {
+ // printf("aabb inf error %d current %d nodeN %d \n",i, current, children);
+ // break;
+ // }
+
+
+ if (!presplit && !AABB_subset(&aabb,&current_aabb))
+ {
+ printf("Parent: current %d depth %d children %d \n",current, current_depth, children);
+ AABB_print(&current_aabb);
+ printf("Child %d: \n",i);
+ AABB_print(&aabb);
+ }
+#endif
+
+ uint dest_index = children_offset + i;
+ if (nodeN->type == BVH_QUAD_NODE)
+ {
+ output_global_stack[dest_index].node = current + nodeN->offset * 64 + i * sizeof(struct Quad);
+ if (output_global_stack[dest_index].node >= maxLeafNodeOffset)
+ {
+ printf("stack leaf offset error %d %d current %d %d \n", output_global_stack[dest_index].node, output_global_stack[dest_index].node / 64, current, current / 64);
+ }
+ }
+ else if (nodeN->type == BVH_INTERNAL_NODE)
+ {
+ output_global_stack[dest_index].node = (current + nodeN->offset * 64 + i * sizeof(struct QBVHNodeN));
+ if (output_global_stack[dest_index].node >= maxInnerNodeOffset)
+ {
+ printf("stack inner node offset error %d %d current %d %d maxInnerNodeOffset %d \n", output_global_stack[dest_index].node, output_global_stack[dest_index].node / 64, current, current / 64, maxInnerNodeOffset);
+ }
+ }
+
+ output_global_stack[dest_index].type = nodeN->type;
+ output_global_stack[dest_index].area = area;
+ output_global_stack[dest_index].aabb = aabb;
+ output_global_stack[dest_index].depth = current_depth + 1;
+ //printf("global_stack[dest_index].node %d global_stack[dest_index].type %d \n",global_stack[dest_index].node,global_stack[dest_index].type);
+ }
+ }
+ }
+ }
+
+ sah_nodes = work_group_reduce_add(sah_nodes);
+ sah_leaves = work_group_reduce_add(sah_leaves);
+ leaves = work_group_reduce_add(leaves);
+ inner_nodes = work_group_reduce_add(inner_nodes);
+ max_depth = work_group_reduce_max(max_depth);
+ leaf_items = work_group_reduce_add(leaf_items);
+ inner_nodes_valid_children = work_group_reduce_add(inner_nodes_valid_children);
+
+ if (globalID == 0)
+ {
+ /*
+ sah_nodes *= 1.0f / root_area;
+ sah_leaves *= 1.0f / root_area;
+ float sah = sah_nodes + sah_leaves;
+
+ const uint globalLeafMemAllocatorOffset = globals->quad_mem_allocator.start;
+ const uint totalAllocatedMem = globals->totalAllocatedMem;
+
+ printf("BVH_NODE_N6 %d BVH_LEAF_N_MIN %d BVH_LEAF_N_MAX %d \n",BVH_NODE_N6,BVH_LEAF_N_MIN,BVH_LEAF_N_MAX);
+ float node_util = 100.0f * (float)inner_nodes_valid_children / (inner_nodes * BVH_NODE_N6);
+ float leaf_util = 100.0f * (float)leaf_items / (leaves);
+ printf("allocators: node %d -> %d ; leaf %d -> %d \n",globals->node_mem_allocator_cur,globals->node_mem_allocator_start,globals->leaf_mem_allocator_cur,globals->leaf_mem_allocator_start);
+ printf("inner nodes %d leaves %d sah %f sah_node %f sah_leaves %f max_depth %d leaf_items %d node util %f leaf util %f (%f) \n",inner_nodes,leaves,sah,sah_nodes,sah_leaves,max_depth,leaf_items,node_util,leaf_util,(float)leaf_items / leaves);
+ uint node_mem = globals->node_mem_allocator_cur;
+ uint max_node_mem = globalLeafMemAllocatorOffset;
+ float node_mem_ratio = 100.0f * (float)node_mem / max_node_mem;
+
+ uint leaf_mem = globals->leaf_mem_allocator.cur - globalLeafMemAllocatorOffset;
+ uint max_leaf_mem = totalAllocatedMem - globalLeafMemAllocatorOffset;
+ float leaf_mem_ratio = 100.0f * (float)leaf_mem / max_leaf_mem;
+
+ uint total_mem = node_mem + leaf_mem;
+ float total_mem_ratio = 100.0f * (float)total_mem / totalAllocatedMem;
+
+ printf("used node memory %d (%f) / used leaf memory %d (%f) / total memory used %d (%f) / total memory allocated %d \n",node_mem, node_mem_ratio, leaf_mem, leaf_mem_ratio, total_mem, total_mem_ratio, totalAllocatedMem);
+ */
+ }
+}
diff --git a/src/intel/vulkan/grl/gpu/misc_shared.h b/src/intel/vulkan/grl/gpu/misc_shared.h
new file mode 100644
index 00000000000..218f2fa4291
--- /dev/null
+++ b/src/intel/vulkan/grl/gpu/misc_shared.h
@@ -0,0 +1,196 @@
+//
+// Copyright (C) 2009-2021 Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+//
+//
+
+//
+// This file contains structure definitions shared by GRL OCL kernels and host code
+//
+
+#pragma once
+
+#include "GRLGen12.h"
+
+GRL_NAMESPACE_BEGIN(GRL)
+GRL_NAMESPACE_BEGIN(RTAS)
+GRL_NAMESPACE_BEGIN(MISC)
+
+struct BatchedInitGlobalsData
+{
+ qword p_build_globals;
+ qword p_bvh_buffer;
+ dword numPrimitives;
+ dword numGeometries;
+ dword numInstances;
+ dword instance_descs_start;
+ dword geo_meta_data_start;
+ dword node_data_start;
+ dword leaf_data_start;
+ dword procedural_data_start;
+ dword back_pointer_start;
+ dword sizeTotal;
+ dword leafType;
+ dword leafSize;
+ dword fatleaf_table_start;
+ dword innernode_table_start;
+ dword quad_indices_data_start;
+};
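+// Note: the *_start fields above appear to be byte offsets into the BVH
+// buffer; the init code in misc.cl divides them by 64 when filling the
+// corresponding BVHBase fields (an observation of that code, not a contract).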
+
+/// Header of debug buffer
+///
+/// The header is placed at the beginning of the debug buffer.
+/// After the header there is circular buffer space.
+typedef struct DebugBufferHeader
+{
+    /// Offset to the beginning of the buffer (after the header)
+    dword headStart;
+    /// Offset to free memory in the buffer (used by gpu)
+    dword gpuHead;
+    /// Offset to the end of data in the buffer that is ready to read (read on cpu, set on gpu, might be behind gpuHead)
+    dword cpuHead;
+    /// Flag for buffer overflow
+    dword overflow;
+    /// Total size of the buffer
+    dword totalSize;
+    /// Padding needed because otherwise the GPU overwrites the tail with a cacheline flush
+    dword pad[11];
+    /// Offset to the beginning of data in the buffer
+    dword tail;
+} DebugBufferHeader;
+
+enum InputDumpOperationType
+{
+ INPUT_DUMP_OP_NOP,
+ INPUT_DUMP_OP_BATCH,
+ INPUT_DUMP_OP_BUILD,
+ INPUT_DUMP_OP_UPDATE,
+ INPUT_DUMP_OP_CLONE,
+ INPUT_DUMP_OP_COMPACT,
+ INPUT_DUMP_OP_SERIALIZE,
+ INPUT_DUMP_OP_DESERIALIZE,
+ INPUT_DUMP_OP_END_BUFFER
+};
+
+// each operation starts with the same header structure and looks like this
+
+// some defined struct { <-----------------start
+// OpHeader
+// .... struct type specific data
+// }
+// ... auxiliary data of variable length
+// <-------------------------------------- end - indicated by endOfData
+typedef struct OpHeader
+{
+ dword operationType;
+ dword endOfData; // offset to end of this primitive
+} OpHeader;
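+// A dump reader can walk the stream by reading the OpHeader at the current
+// position, dispatching on operationType, and advancing to the offset given by
+// endOfData until it hits an *_OP_END_BUFFER marker (a sketch inferred from
+// these definitions, not a normative description of the dump format).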
+
+// header for batch operations
+typedef struct BatchOpHeader
+{
+ OpHeader opHeader;
+} BatchOpHeader;
+
+// interpretation for operationType INPUT_DUMP_OP_BATCH
+typedef struct InputBatch
+{
+ BatchOpHeader header;
+ qword batchId;
+ dword vertexBufferDataSize;
+ dword firstContainedOpOffset;
+
+ // layout of batch is as below, each line is 128B aligned:
+
+ //
+ // InputBatch <-------------------------------- start
+ // optional: batchVertexData
+ // InputBuildDesc/InputCopy <------------------ start + firstContainedOpOffset
+ // optional: extra data of above token
+ // InputBuildDesc/InputCopy
+ // optional: extra data of above token
+ // ...
+ // InputBuildDesc/InputCopy
+ // optional: extra data of above token
+ // <-------------------------------------------- end = start + endOfData
+} InputBatch;
+
+// for operationType:
+// INPUT_DUMP_OP_BUILD,
+// INPUT_DUMP_OP_UPDATE,
+// followed by auxiliary data of variable length
+typedef struct InputBuild
+{
+ OpHeader header;
+ qword srcBvhPtr;
+ qword dstBvhPtr;
+ dword flags;
+ dword numGeos;
+ dword numInstances;
+ dword instArrayOfPtrs;
+} InputBuild;
+
+// for operationType:
+// INPUT_DUMP_OP_CLONE,
+// INPUT_DUMP_OP_COMPACT,
+// INPUT_DUMP_OP_SERIALIZE,
+//
+// Not for INPUT_DUMP_OP_DESERIALIZE!
+typedef struct InputCopy
+{
+ OpHeader header;
+ qword srcBvhPtr;
+ qword dstBvhPtr;
+} InputCopy;
+
+// for INPUT_DUMP_OP_DESERIALIZE
+// decode for debug tools follows this format
+typedef struct InputDeserialize
+{
+ OpHeader header;
+ qword dstBvhPtr;
+} InputDeserialize;
+
+typedef struct InputBatchPtrs
+{
+ qword dumpDst;
+ qword globalDumpBuffer;
+ qword nonVertexDataStart;
+ dword vertexBuffersSize;
+ dword totalSize;
+} InputBatchPtrs;
+
+enum OutputDumpOperationType
+{
+ OUTPUT_DUMP_OP_NOP,
+ OUTPUT_DUMP_OP_BATCH,
+ OUTPUT_DUMP_OP_DATA,
+ OUTPUT_DUMP_OP_END_BUFFER
+};
+
+// interpretation for operationType OUTPUT_DUMP_OP_BATCH
+typedef struct OutputBatch {
+ BatchOpHeader header;
+ qword batchId;
+ dword firstContainedOpOffset;
+} OutputBatch;
+
+// interpretation for operationType OUTPUT_DUMP_OP_DATA
+typedef struct OutputData
+{
+ OpHeader header;
+ qword srcBvhPtr;
+} OutputData;
+
+typedef struct OutputBatchPtrs
+{
+ qword dumpDst;
+ qword dataStart;
+ dword dataSize;
+ dword totalSize;
+} OutputBatchPtrs;
+
+GRL_NAMESPACE_END(MISC)
+GRL_NAMESPACE_END(RTAS)
+GRL_NAMESPACE_END(GRL)
diff --git a/src/intel/vulkan/grl/gpu/morton/morton_common.h b/src/intel/vulkan/grl/gpu/morton/morton_common.h
new file mode 100644
index 00000000000..2beb7a1aff3
--- /dev/null
+++ b/src/intel/vulkan/grl/gpu/morton/morton_common.h
@@ -0,0 +1,245 @@
+//
+// Copyright (C) 2009-2022 Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+//
+//
+
+#include "common.h"
+
+#define MORTON_DEBUG_CHECKS 0
+#define MORTON_VERBOSE_LOG 0
+
+GRL_INLINE uint get_morton_sort_lsb_req_iterations( uint shift )
+{
+#if 0 // turn off, because current hierarchy build requires full sort
+ // Difference between max iterations needed for LSB sorting and
+ // number of iterations needed for LSB sorting without primIDs
+    // This indicates how many of the first iterations would be skipped in LSB
+ return 8 - (8 - (shift >> 3));
+#else
+ return 0;
+#endif
+}
+
+typedef struct BuildRecordLocalMortonFlattener
+{
+ unsigned int leftChild; // global
+ unsigned int rightChild; // global
+ unsigned int rangeStart; // global
+ unsigned int local_parent_index__numItems;
+} BuildRecordLocalMortonFlattener;
+
+// TODO: Currently sizeof(UPerNodeData) is 32; the AABB struct allocates more space than needed and could be reduced
+typedef union UPerNodeData {
+ float4 four_DWs;
+ BuildRecordLocalMortonFlattener buildRecord;
+ MortonFlattenedBoxlessNode boxlessNode;
+ struct AABB box;
+} UPerNodeData;
+
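+// MortonFlattenedBoxlessNode packs the node type into the low 6 bits of
+// childOffset_type and the child offset into the remaining upper bits; the two
+// accessors below decode these fields.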
+GRL_INLINE uint MortonFlattenedBoxlessNode_GetChildOffset(MortonFlattenedBoxlessNode bn)
+{
+ return bn.childOffset_type >> 6;
+}
+
+GRL_INLINE uint MortonFlattenedBoxlessNode_GetType(MortonFlattenedBoxlessNode bn)
+{
+ return bn.childOffset_type & ((1<<6) -1);
+}
+
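+// The 2xSG array helpers below keep a small array of up to 2 * sub_group_size
+// 16-bit values in registers: entry i lives in lane (i % sub_group_size), in
+// the low or high 16 bits of that lane's uint depending on (i / sub_group_size).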
+GRL_INLINE void set_2xSG_arr_first_write(uint index, uint* arr, ushort val, short lane)
+{
+ short lane_used = index % get_sub_group_size();
+ short shift = (index / get_sub_group_size()) * get_sub_group_size();
+ if (lane_used == lane) {
+ *arr |= (val << shift);
+ }
+}
+
+GRL_INLINE short get_from_2xSG_arr(uint index, uint arr, short lane)
+{
+ short r = 0;
+ short lane_used = index % get_sub_group_size();
+ short shift = (index / get_sub_group_size()) * get_sub_group_size();
+ r = arr >> shift;
+ r = sub_group_broadcast(r, lane_used);
+ return r;
+}
+
+GRL_INLINE void unpack_from_2xSG_arr(uint count, uint arr, short lane, ushort* dst)
+{
+ if (lane < count)
+ {
+ dst[lane]=(ushort)(arr & 0xFFFF);
+ short hi_idx = lane + get_sub_group_size();
+ if (hi_idx < count) {
+ dst[hi_idx] = (ushort)(arr >> 16);
+ }
+ }
+}
+
+
+GRL_INLINE void pack_from_2xSG_arr(ushort* src, uint count, uint *arr, short lane)
+{
+ if (lane < count)
+ {
+ *arr = src[lane];
+ short hi_idx = lane + get_sub_group_size();
+ if (hi_idx < count) {
+ *arr |= ((uint)(src[hi_idx])) << 16u;
+ }
+ }
+}
+
+GRL_INLINE void set_2xSG_arr(uint index, uint* arr, short val, short lane)
+{
+ short lane_used = index % get_sub_group_size();
+ short shift = (index / get_sub_group_size()) * get_sub_group_size();
+ if (lane_used == lane) {
+        uint rem_val = (*arr) & (0xFFFF0000 >> shift); // keep the remaining other half of the uint
+ *arr = (val << shift) | rem_val;
+ }
+}
+
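+// Summary of the function below: bottom-up refit over a treelet held in SLM.
+// Starting from the recorded startpoints, the subgroup reduces its children's
+// AABBs, writes the merged box and the corresponding QBVH node, and climbs to
+// the parent; the low bits of each backpointer count refitted children, so the
+// walk only continues upward once the last child has arrived.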
+GRL_INLINE void SUBGROUP_refit_bottom_up_local(
+ uniform struct QBVHNodeN* globalNodeData,
+ uniform struct BackPointers* backPointers,
+ uniform uint treeletRootGlobalIndex,
+ uniform uint globalBaseForInternalNodes,
+ varying ushort lane,
+ uniform local union UPerNodeData* local_nodes,
+ varying uint sg_bu_startpoints,
+ uniform uint sg_bu_startpoints_cnt)
+{
+ if(sg_bu_startpoints_cnt == 0)
+ return;
+
+ const uint head_lane = 0;
+ uint curNodeIndex = get_from_2xSG_arr(--sg_bu_startpoints_cnt, sg_bu_startpoints, lane);
+
+ uniform uint prev_loc_index = 0;
+ uniform struct AABB child_aabb; // this carries reduced aabb between loop turns
+
+ uniform uint backpointer = local_nodes[curNodeIndex].boxlessNode.backPointer;
+
+ while (curNodeIndex != 0)
+ {
+ uniform uint lead_child_loc_offset = MortonFlattenedBoxlessNode_GetChildOffset(local_nodes[curNodeIndex].boxlessNode);
+ uniform uint nodeType = MortonFlattenedBoxlessNode_GetType(local_nodes[curNodeIndex].boxlessNode);
+ varying uint child_loc_idx = lead_child_loc_offset + curNodeIndex + lane;
+
+ uint numChildren = BackPointer_GetNumChildren(backpointer);
+ if (child_loc_idx != prev_loc_index &&
+ lane < numChildren)
+ {
+ child_aabb = local_nodes[child_loc_idx].box;
+ }
+ else if (lane >= numChildren) {
+ AABB_init(&child_aabb);
+ child_aabb.lower.w = as_float(0u);
+ }
+
+ // TODO: perNode data could hold 7 dwords per node instead of 8 as long as we keep it in SLM
+ struct AABB reduced_bounds = AABB_sub_group_reduce_N6(&child_aabb);
+ reduced_bounds = AABB_sub_group_shuffle( &reduced_bounds, 0 );
+
+ uint instMask = (uint)sub_group_reduce_or_N6(as_uint(child_aabb.lower.w));
+ reduced_bounds.lower.w = as_float((uint)instMask);
+ uint reduce_bounds_lane = AABB_sub_group_shuffle_coordPerLane(&reduced_bounds, 0);
+ local uint* pbox = (local uint*)(local_nodes+ curNodeIndex);
+ if (lane < 8)
+ {
+ pbox[lane] = reduce_bounds_lane;
+ }
+
+ uint global_node_idx = globalBaseForInternalNodes + curNodeIndex;
+ /* get bounds of all children from child nodes directly */
+ struct QBVHNodeN* qnode = globalNodeData + global_node_idx;
+ subgroup_setQBVHNodeN_setFields(lead_child_loc_offset, nodeType, &child_aabb, numChildren, instMask, qnode, false);
+ child_aabb = reduced_bounds;
+ uint parentIndex = BackPointer_GetParentIndex(backpointer);
+
+ write_mem_fence(CLK_LOCAL_MEM_FENCE);
+
+ if (lane == 0)
+ {
+ backpointer = atomic_inc_local(&(local_nodes[parentIndex].boxlessNode.backPointer));
+ uint globalParentIndex = (parentIndex > 0) ? (parentIndex + globalBaseForInternalNodes) : treeletRootGlobalIndex;
+ uint globalBackpointer = (globalParentIndex << 6) | (numChildren << 3);
+
+ /* set global back pointer */
+ *InnerNode_GetBackPointer(backPointers, global_node_idx) = globalBackpointer;
+
+#if MORTON_VERBOSE_LOG
+ printf("BU_INNER: index: %d, first_child_id: %d, offset: %d, parent: %d, lead_child_loc_offset: %d, numChildren: %d, child_loc_idx: %d\n",
+ global_node_idx, global_node_idx + qnode->offset, qnode->offset, globalBackpointer >> 6, lead_child_loc_offset, numChildren, child_loc_idx);
+#endif
+ }
+
+ backpointer = 1 + intel_sub_group_shuffle(backpointer, head_lane);
+ prev_loc_index = curNodeIndex;
+ curNodeIndex = parentIndex;
+
+ /* if all children got refitted, then continue */
+ uniform uint numChildrenRefitted = (backpointer >> 0) & 0x7;
+ uniform uint numChildrenTotal = (backpointer >> 3) & 0x7;
+ if (numChildrenRefitted != numChildrenTotal)
+ {
+ if(sg_bu_startpoints_cnt)
+ {
+ curNodeIndex = get_from_2xSG_arr(--sg_bu_startpoints_cnt, sg_bu_startpoints, lane);
+ backpointer = local_nodes[curNodeIndex].boxlessNode.backPointer;
+ }
+ else
+ return;
+ }
+ }
+
+ // process root of the treelet
+ {
+
+#if MORTON_DEBUG_CHECKS
+ if (curNodeIndex != 0) printf("SUBGROUP_refit_bottom_up_local: this should be local node index 0\n");
+#endif
+
+ uniform uint lead_child_loc_offset = MortonFlattenedBoxlessNode_GetChildOffset(local_nodes[0].boxlessNode);
+ varying uint child_loc_idx = lead_child_loc_offset + 0 + lane;
+ uint numChildren = BackPointer_GetNumChildren(backpointer);
+
+ if (child_loc_idx != prev_loc_index &&
+ lane < numChildren)
+ {
+ child_aabb = local_nodes[child_loc_idx].box;
+ }
+ else if (lane >= numChildren) {
+ AABB_init(&child_aabb);
+ child_aabb.lower.w = as_float(0u);
+ }
+
+ // TODO: perNode data could hold 7 dwords per node instead of 8 as long as we keep it in SLM
+ uint instMask = (uint)sub_group_reduce_or_N6(as_uint(child_aabb.lower.w));
+ uint nodeType = MortonFlattenedBoxlessNode_GetType(local_nodes[curNodeIndex].boxlessNode);
+ uint global_node_idx = treeletRootGlobalIndex;
+ uint lead_child_global_idx = globalBaseForInternalNodes + lead_child_loc_offset;
+
+ /* get bounds of all children from child nodes directly */
+ struct QBVHNodeN* qnode = globalNodeData + global_node_idx;
+
+ subgroup_setQBVHNodeN_setFields(lead_child_global_idx - global_node_idx, nodeType, &child_aabb, numChildren, instMask, qnode, false);
+
+ /* reset refit counter for next refit */
+ if (lane == 0)
+ {
+ /* set global back pointer */
+ *InnerNode_GetBackPointer(backPointers, global_node_idx) = backpointer & (~7u);
+
+ // TODO: Move AABBs to separate buffer, but for now communicate bottom-tip boxes through qnodes
+
+#if MORTON_VERBOSE_LOG
+ printf("BU_ROOT: curNodeIndex: %d, index: %d, first_child_id: %d, offset: %d, parent: %d, numChildren: %d, sg_bu_startpoints_cnt: %d\n",
+ curNodeIndex, global_node_idx, global_node_idx + qnode->offset, qnode->offset, backpointer >> 6, numChildren, sg_bu_startpoints_cnt);
+#endif
+ }
+ }
+}
diff --git a/src/intel/vulkan/grl/gpu/morton/phase0.cl b/src/intel/vulkan/grl/gpu/morton/phase0.cl
new file mode 100644
index 00000000000..2fa91c214e1
--- /dev/null
+++ b/src/intel/vulkan/grl/gpu/morton/phase0.cl
@@ -0,0 +1,400 @@
+//
+// Copyright (C) 2009-2022 Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+//
+//
+
+#include "libs/lsc_intrinsics.h"
+#include "morton/morton_common.h"
+
+GRL_INLINE void SUBGROUP_create_node_phase0(
+ uniform global struct Globals* globals,
+ uniform global struct BinaryMortonCodeHierarchy* bnodes,
+ uniform global char* bvh_mem,
+ uniform global uint *global_refit_startpoints,
+ uniform uint rID,
+ uniform local uint* local_numRecords,
+ uniform local uint* local_QNodeOffset,
+ uniform global struct BuildRecordMorton* records,
+ uniform struct BuildRecordMorton current,
+ uniform local uint* local_startpoints_num)
+{
+ uniform global struct BVHBase* bvh = (global struct BVHBase*)bvh_mem;
+ uniform const uint rootNodeOffset = BVH_ROOT_NODE_OFFSET;
+ uniform global struct QBVHNodeN* nodeData = BVHBase_nodeData( bvh );
+ uniform BackPointers* backPointers = BVHBase_GetBackPointers( bvh );
+
+ varying ushort lane = get_sub_group_local_id();
+
+ /* initialize child array */
+ uniform uint numChildren = 2;
+ varying struct BuildRecordMorton sg_children;
+ sg_children.items = 0;
+ sg_children.nodeID = (lane == 0) ? bnodes[current.nodeID].leftChild : bnodes[current.nodeID].rightChild;
+
+ if ( lane < numChildren )
+ sg_children.items = BinaryMortonCodeHierarchy_getNumPrimitives( bnodes, sg_children.nodeID );
+
+ /* fill QBVH6 node with up to 6 children */
+ while ( numChildren < BVH_NODE_N6 )
+ {
+ varying bool sg_is_leaf = sg_children.items <= cfg_minLeafSize;
+ if ( sub_group_all( sg_is_leaf ) )
+ break;
+
+ uniform uint bestItems = sub_group_reduce_max_N6( sg_children.items );
+ uniform ushort bestChild = ctz( intel_sub_group_ballot( sg_children.items == bestItems ) );
+ uniform uint bestNodeID = sub_group_broadcast( sg_children.nodeID, bestChild );
+
+ varying uint nodeID = (lane == bestChild) ? bnodes[bestNodeID].leftChild : bnodes[bestNodeID].rightChild;
+
+ if ( lane == numChildren || lane == bestChild )
+ {
+ sg_children.nodeID = nodeID;
+ sg_children.items = BinaryMortonCodeHierarchy_getNumPrimitives( bnodes, nodeID );
+ }
+
+ numChildren++;
+ }
+
+ const uint current_index = current.current_index;
+ struct QBVHNodeN* qnode = nodeData + current_index;
+ SUBGROUP_QBVHNodeN_setChildIncr1( qnode );
+
+ uniform uint global_offset;
+ uniform uint child_node_offset;
+
+    // Check if all children will be roots of the local subtrees in phase1. If so, we keep the node ids to be
+    // used later in the global refit after phase1.
+ varying uchar is_children_root = (lane < numChildren) ? (sg_children.items <= MORTON_BUILDER_SUBTREE_THRESHOLD) : 0;
+ uniform uchar children_roots_num = sub_group_reduce_add(is_children_root);
+
+ if ( lane == 0 )
+ {
+ child_node_offset = atomic_add_local(local_QNodeOffset,64*numChildren);
+
+ /* create node, but to not set bounds yet as these get calculated during refit */
+ QBVH6Node_set_type( qnode, BVH_INTERNAL_NODE );
+ QBVH6Node_set_offset( qnode, (global struct QBVHNodeN*)(bvh_mem + child_node_offset) );
+ /* set back pointers */
+ uint backpointer = (current.parent_index << 6) | (numChildren << 3);
+
+ global_offset = atomic_add_local( local_numRecords, numChildren - 1 );
+
+#if MORTON_VERBOSE_LOG
+ printf("PHASE0: loc_id: %d, index: %d, first_child_id: %d, offset: %d, parent: %d, numChildren: %d\n",
+ rID, current_index, current_index + qnode->offset, qnode->offset, current.parent_index, numChildren);
+#endif
+
+ if(children_roots_num == numChildren)
+ {
+ uint startpoints_offset = atomic_inc_local( local_startpoints_num );
+ global_refit_startpoints[startpoints_offset] = current_index;
+ }
+ else
+ {
+ backpointer += children_roots_num;
+ }
+
+ *InnerNode_GetBackPointer(backPointers, current_index) = backpointer;
+ }
+
+ child_node_offset = sub_group_broadcast( child_node_offset, 0 );
+ global_offset = sub_group_broadcast( global_offset, 0 );
+
+ uniform global struct QBVHNodeN* childNodes = (global struct QBVHNodeN*)(bvh_mem + child_node_offset);
+
+ sg_children.current_index = childNodes - nodeData + lane;
+ sg_children.parent_index = current_index;
+
+ if ( lane < numChildren )
+ {
+ uint write_position = (lane == 0) ? rID : global_offset + lane - 1;
+ records[write_position] = sg_children;
+ }
+}
+
+
+GRL_INLINE void SUBGROUP_create_node_phase0_local_sync(
+ uniform global struct Globals* globals,
+ uniform global struct BinaryMortonCodeHierarchy* bnodes,
+ uniform global char* bvh_mem,
+ uniform uint rID,
+ uniform local uint* local_numRecords,
+ uniform local uint* local_QNodeOffset,
+ uniform global struct BuildRecordMorton* records,
+ uniform struct BuildRecordMorton current,
+ uniform local uint* local_p0_total,
+ uniform global struct MortonFlattenedBoxlessNode *boxless_nodes,
+ uniform uint nodeDataStart)
+{
+ uniform global struct BVHBase* bvh = (global struct BVHBase*)bvh_mem;
+ uniform const uint rootNodeOffset = bvh->rootNodeOffset;
+ uniform global struct QBVHNodeN* nodeData = BVHBase_nodeData( bvh );
+ uniform BackPointers* backPointers = BVHBase_GetBackPointers( bvh );
+
+ varying ushort lane = get_sub_group_local_id();
+
+ /* initialize child array */
+ uniform uint numChildren = 2;
+ varying struct BuildRecordMorton sg_children;
+ sg_children.items = 0;
+ sg_children.nodeID = (lane == 0) ? bnodes[current.nodeID].leftChild : bnodes[current.nodeID].rightChild;
+
+ if ( lane < numChildren )
+ sg_children.items = BinaryMortonCodeHierarchy_getNumPrimitives( bnodes, sg_children.nodeID );
+
+ /* fill QBVH6 node with up to 6 children */
+ while ( numChildren < BVH_NODE_N6 )
+ {
+ varying bool sg_is_leaf = sg_children.items <= cfg_minLeafSize;
+ if ( sub_group_all( sg_is_leaf ) )
+ break;
+
+ uniform uint bestItems = sub_group_reduce_max_N6( sg_children.items );
+ uniform ushort bestChild = ctz( intel_sub_group_ballot( sg_children.items == bestItems ) );
+ uniform uint bestNodeID = sub_group_broadcast( sg_children.nodeID, bestChild );
+
+ varying uint nodeID = (lane == bestChild) ? bnodes[bestNodeID].leftChild : bnodes[bestNodeID].rightChild;
+
+ if ( lane == numChildren || lane == bestChild )
+ {
+ sg_children.nodeID = nodeID;
+ sg_children.items = BinaryMortonCodeHierarchy_getNumPrimitives( bnodes, nodeID );
+ }
+
+ numChildren++;
+ }
+
+ const uint current_index = current.current_index;
+ uniform uint global_offset;
+ uniform uint child_node_offset;
+
+    // Check if all children will be roots of the local subtrees in phase1. If so, we keep the node ids to be
+    // used later in the global refit after phase1.
+ varying uchar is_children_root = (lane < numChildren) ? (sg_children.items <= MORTON_BUILDER_SUBTREE_THRESHOLD) : 0;
+ uniform uchar rootMask = sub_group_reduce_or_N6(is_children_root << lane);
+ uniform uchar children_roots_num = sub_group_reduce_add(is_children_root);
+
+ if ( lane == 0 )
+ {
+ child_node_offset = atomic_add_local(local_QNodeOffset,64*numChildren);
+
+ /* Do not create qnodes here */
+ uint backpointer = (current.parent_index << 6) | (numChildren << 3);
+
+ global_offset = atomic_add_local( local_numRecords, numChildren - 1 );
+
+#if MORTON_VERBOSE_LOG
+        printf("PHASE0: loc_id: %d, index: %d, first_child_id: %d, offset: %d, parent: %d, numChildren: %d, nodeDataStart: %d\n",
+               rID, current_index, child_node_offset / 64 - nodeDataStart, (child_node_offset / 64 - nodeDataStart) - current_index, current.parent_index, numChildren, nodeDataStart);
+#endif
+
+ MortonFlattenedBoxlessNode flattened_node;
+
+ if(children_roots_num != numChildren)
+ backpointer += children_roots_num;
+
+ flattened_node.binary_hierarchy_index = (current_index << 6) | rootMask;
+
+ uint loc_id = atomic_inc_local( local_p0_total );
+
+ flattened_node.childOffset_type = ((((child_node_offset - nodeDataStart * 64) / 64) - current_index) << 6) | BVH_INTERNAL_NODE;
+ flattened_node.backPointer = backpointer;
+
+        // TODO: change these writes to L1WB or streaming
+ boxless_nodes[loc_id] = flattened_node;
+
+ *InnerNode_GetBackPointer(backPointers, current_index) = backpointer;
+ }
+
+ child_node_offset = sub_group_broadcast( child_node_offset, 0 );
+ global_offset = sub_group_broadcast( global_offset, 0 );
+
+ uniform global struct QBVHNodeN* childNodes = (global struct QBVHNodeN*)(bvh_mem + child_node_offset);
+
+ sg_children.current_index = childNodes - nodeData + lane;
+ sg_children.parent_index = current_index;
+
+ if ( lane < numChildren )
+ {
+ uint write_position = (lane == 0) ? rID : global_offset + lane - 1;
+ records[write_position] = sg_children;
+ }
+}
+
+/*
+
+  In this phase a single large work group performs the construction of
+  the top of the BVH and creates a build record array.
+
+  Two variants of this kernel:
+  1. Refit with global synchronization - used for a big BVH, where the number of allocated nodes will
+     not fit in SLM in phase2. Phase0 creates qnodes in the BVH and provides startpoints for the
+     bottom-up phase that is executed after phase1. This refit uses global synchronization and
+     mem_fence_gpu_invalidate, which is not efficient.
+  2. Refit with local synchronization - flattened boxless nodes are passed via global memory, along
+     with the number of created nodes. Phase0 does not create qnodes in the BVH; that is done in
+     phase2 during refit. In phase2, the flattened boxless nodes are moved to SLM, along with the
+     bounding boxes from phase1. Refit is performed with local synchronization only.
+
+*/
+
+__attribute__((reqd_work_group_size(512, 1, 1)))
+__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH))) void kernel
+parallel_build_phase0(global struct Globals *globals,
+ global struct BinaryMortonCodeHierarchy *bnodes,
+ global char *bvh_mem,
+ global uint *global_refit_startpoints)
+{
+ global struct BVHBase *bvh = (global struct BVHBase *)bvh_mem;
+ global struct QBVHNodeN *nodeData = BVHBase_nodeData(bvh);
+
+ /* a queue of build records in global memory */
+ global struct BuildRecordMorton *records = (global struct BuildRecordMorton *)(bvh_mem + 64*bvh->quadLeafStart);
+ local uint local_numRecords;
+ local uint local_QNodeOffset;
+ local uint local_startpoints_num;
+
+ /* initialize first build record */
+ if (get_local_id(0) == 0)
+ {
+ /* allocate root node */
+ uint root_node_offset = 64*bvh->nodeDataCur;
+ global struct QBVHNodeN *rootNode = (global struct QBVHNodeN *)(bvh_mem + root_node_offset);
+
+ //assert(root_node_offset == 0);
+ records[0].nodeID = globals->binary_hierarchy_root;
+ records[0].items = globals->numPrimitives;
+ records[0].current_index = rootNode - nodeData;
+ records[0].parent_index = -1;
+
+ local_numRecords = 1;
+ local_QNodeOffset = root_node_offset + 64;
+ local_startpoints_num = 0;
+
+ mem_fence_workgroup_default();
+ }
+
+ uint num_records = 1;
+
+ /* terminate when all subtrees are under size threshold */
+ while(true)
+ {
+ work_group_barrier(CLK_LOCAL_MEM_FENCE);
+
+ /* all work items in the work group pick a subtree to build */
+ for (uint ID = get_sub_group_id(); ID < num_records; ID += get_num_sub_groups() )
+ {
+ /* small subtrees will get built in next phase */
+ if (records[ID].items <= MORTON_BUILDER_SUBTREE_THRESHOLD) // FIXME: should break at 64 leaves not 64 primitives
+ continue;
+
+ /* create QBVH node */
+ SUBGROUP_create_node_phase0(globals, bnodes, bvh_mem, global_refit_startpoints, ID, &local_numRecords, &local_QNodeOffset,
+ records, records[ID], &local_startpoints_num);
+ }
+
+ work_group_barrier( CLK_LOCAL_MEM_FENCE );
+ mem_fence_workgroup_default();
+ uint old_num_records = num_records;
+ num_records = local_numRecords;
+ if( old_num_records == num_records )
+ break;
+
+ }
+
+ /* remember number of build records for next phase */
+ if (get_local_id( 0 ) == 0)
+ {
+ globals->numBuildRecords = local_numRecords;
+ globals->p0_created_num = local_startpoints_num;
+ bvh->nodeDataCur = local_QNodeOffset / 64;
+
+#if MORTON_VERBOSE_LOG
+ printf("PHASE_0: allocated %d nodes. globals->global_refit_startpoints: %d\n", BVHBase_numNodes(bvh), globals->p0_created_num);
+#endif
+ }
+}
+
+__attribute__((reqd_work_group_size(512, 1, 1)))
+__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH))) void kernel
+parallel_build_phase0_local_sync(global struct Globals *globals,
+ global struct BinaryMortonCodeHierarchy *bnodes,
+ global char *bvh_mem,
+ global struct MortonFlattenedBoxlessNode *boxless_nodes)
+{
+ global struct BVHBase *bvh = (global struct BVHBase *)bvh_mem;
+ global struct QBVHNodeN *nodeData = BVHBase_nodeData(bvh);
+ uint nodeDataStart = BVH_ROOT_NODE_OFFSET / 64;
+
+ /* a queue of build records in global memory */
+ global struct BuildRecordMorton *records = (global struct BuildRecordMorton *)(bvh_mem + 64*bvh->quadLeafStart);
+ local uint local_numRecords;
+ local uint local_QNodeOffset;
+ local uint local_p0_total;
+
+ /* initialize first build record */
+ if (get_local_id(0) == 0)
+ {
+ /* allocate root node */
+ uint root_node_offset = 64*bvh->nodeDataCur;
+ global struct QBVHNodeN *rootNode = (global struct QBVHNodeN *)(bvh_mem + root_node_offset);
+
+ //assert(root_node_offset == 0);
+ records[0].nodeID = globals->binary_hierarchy_root;
+ records[0].items = globals->numPrimitives;
+ records[0].current_index = rootNode - nodeData;
+ records[0].parent_index = -1;
+
+ local_numRecords = 1;
+ local_QNodeOffset = root_node_offset + 64;
+ local_p0_total = 0;
+
+ mem_fence_workgroup_default();
+ }
+
+ uint num_records = 1;
+
+ /* terminate when all subtrees are under size threshold */
+ while(true)
+ {
+ work_group_barrier(CLK_LOCAL_MEM_FENCE);
+
+ /* all work items in the work group pick a subtree to build */
+ for (uint ID = get_sub_group_id(); ID < num_records; ID += get_num_sub_groups() )
+ {
+ /* small subtrees will get built in next phase */
+ if (records[ID].items <= MORTON_BUILDER_SUBTREE_THRESHOLD) // FIXME: should break at 64 leaves not 64 primitives
+ continue;
+
+ /* create QBVH node */
+ SUBGROUP_create_node_phase0_local_sync(globals, bnodes, bvh_mem, ID, &local_numRecords, &local_QNodeOffset, records,
+ records[ID], &local_p0_total, boxless_nodes, nodeDataStart);
+ }
+
+ mem_fence_workgroup_default();
+ work_group_barrier( CLK_LOCAL_MEM_FENCE );
+
+ uint old_num_records = num_records;
+ num_records = local_numRecords;
+ if( old_num_records == num_records )
+ break;
+
+ }
+
+ /* remember number of build records for next phase */
+ if (get_local_id( 0 ) == 0)
+ {
+ globals->numBuildRecords = local_numRecords;
+ bvh->nodeDataCur = local_QNodeOffset / 64;
+
+ globals->p0_allocated_num = BVHBase_numNodes(bvh);
+ globals->p0_created_num = local_p0_total;
+
+#if MORTON_VERBOSE_LOG
+ printf("PHASE_0_LOCAL_SYNC: allocated %d nodes. globals->global_refit_startpoints: %d\n", BVHBase_numNodes(bvh), globals->global_refit_startpoints);
+#endif
+ }
+}
diff --git a/src/intel/vulkan/grl/gpu/morton/phase1.cl b/src/intel/vulkan/grl/gpu/morton/phase1.cl
new file mode 100644
index 00000000000..6a1dd2aa44b
--- /dev/null
+++ b/src/intel/vulkan/grl/gpu/morton/phase1.cl
@@ -0,0 +1,785 @@
+//
+// Copyright (C) 2009-2022 Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+//
+//
+
+#include "libs/lsc_intrinsics.h"
+#include "morton/morton_common.h"
+
+// Caution: rec.local_parent_index__numItems needs to have its high 16 bits filled in afterwards.
+BuildRecordLocalMortonFlattener TranslateToLocalRecord(struct BinaryMortonCodeHierarchy srcRec)
+{
+ BuildRecordLocalMortonFlattener rec;
+ rec.leftChild = srcRec.leftChild;
+ rec.rightChild = srcRec.rightChild;
+ rec.rangeStart = srcRec.range.start;
+ rec.local_parent_index__numItems = (srcRec.range.end - srcRec.range.start) + 1;
+ return rec;
+}
+
+GRL_INLINE BuildRecordLocalMortonFlattener MortonFlattenedBoxlessNode_reinterpret_as_BR(MortonFlattenedBoxlessNode boxless)
+{
+ BuildRecordLocalMortonFlattener rec;
+ rec.leftChild = boxless.binary_hierarchy_index;
+ rec.rightChild = boxless.childOffset_type;
+ rec.rangeStart = boxless.backPointer;
+ rec.local_parent_index__numItems = 0;
+ return rec;
+}
+
+GRL_INLINE void SUBGROUP_create_boxless_node_phase1(
+ uniform global struct Globals* globals,
+ uniform global struct BinaryMortonCodeHierarchy* bnodes,
+ uniform global char* bvh_mem,
+ uniform BuildRecordLocalMortonFlattener currentRecord,
+ uniform uint currQnodeLocalId, //local index for flattened qnoode, don't mix this with nodeIndex that is in morton build record
+ uniform local uint* local_numRecords,
+ uniform uint tictoc,
+ uniform uint* sg_bu_startpoint_arr,
+ uniform uint* sg_bu_startpoint_cnt,
+ uniform uint parentOfRoot,
+ uniform bool processRoot,
+ uniform UPerNodeData* nodeData)
+{
+ varying ushort lane = get_sub_group_local_id();
+
+ /* initialize child array */
+ uniform uint numChildren = 2;
+ varying struct BuildRecordLocalMortonFlattener sg_children;
+ sg_children.local_parent_index__numItems = 0;
+
+ uint binary_hierarchy_child_idx = (lane == 0) ? currentRecord.leftChild : currentRecord.rightChild;
+ if (lane >= numChildren) binary_hierarchy_child_idx = 1 << 31;
+
+ sg_children = TranslateToLocalRecord(BinaryMortonCodeHierarchy_getEntry(bnodes, binary_hierarchy_child_idx));
+
+ /* fill QBVH6 node with up to 6 children */
+ while (numChildren < BVH_NODE_N6)
+ {
+ // we dont have to do "local_parent_index__numItems & 0xFFFF" because local_parent_index part is 0 here at this point
+ uint childNumItems = sg_children.local_parent_index__numItems;
+ varying bool sg_is_leaf = childNumItems <= cfg_minLeafSize;
+ if (sub_group_all(sg_is_leaf)) { break; }
+
+ uniform uint bestItems = sub_group_reduce_max_N6(childNumItems);
+ uniform ushort bestChild = ctz(intel_sub_group_ballot(childNumItems == bestItems));
+ varying uint leftOfBest = sg_children.leftChild; // val important only for (lane == bestChild), not valid for other lanes
+ uniform uint rightOfBest = sub_group_broadcast(sg_children.rightChild, bestChild);
+
+ varying uint nodeID = (lane == bestChild) ? leftOfBest : rightOfBest;
+
+ if (lane == numChildren || lane == bestChild)
+ {
+ sg_children = TranslateToLocalRecord(BinaryMortonCodeHierarchy_getEntry(bnodes, nodeID));
+ }
+
+ numChildren++;
+ }
+
+ uniform uint global_offset;
+ uniform uint child_node_index;
+
+ bool isFatleafChild = (sg_children.local_parent_index__numItems <= cfg_minLeafSize) && (lane < numChildren);
+ uint numFatleafChildren = popcount(intel_sub_group_ballot(isFatleafChild));
+
+ if (lane <= numChildren) {
+ uint writeIDX = 0;
+
+ if (lane == numChildren)
+ {
+ /* create nodes in local structure, to be used later in the bottom up to create nodes in actual bvh */
+ MortonFlattenedBoxlessNode flattened_node;
+ uint parentIDX;
+
+ if (processRoot)
+ {
+ *local_numRecords = numChildren + 1;
+ child_node_index = 1;
+ writeIDX = 0;
+ flattened_node.binary_hierarchy_index = 0xFFFFFFFF;
+ flattened_node.childOffset_type = (1 << 6) | BVH_INTERNAL_NODE;
+ parentIDX = parentOfRoot;
+ }
+ else
+ {
+ uint shift = (16 * tictoc);
+ uint mask = 0xFFFF;
+ uint atomicAddVal = numChildren << shift;
+ child_node_index = atomic_add_local(local_numRecords, atomicAddVal);
+ sub_group_barrier(0);
+ writeIDX = currQnodeLocalId;
+ parentIDX = currentRecord.local_parent_index__numItems >> 16;
+ flattened_node.binary_hierarchy_index = 0xFFFFFFFF;
+ sub_group_barrier(0);
+ child_node_index = (child_node_index >> 16) + (child_node_index & mask);
+ flattened_node.childOffset_type = ((child_node_index - currQnodeLocalId) << 6) | BVH_INTERNAL_NODE;
+ }
+
+#if MORTON_VERBOSE_LOG
+ printf("wg %d: SUBGROUP_create_boxless_node_phase1: writeIDX %d, child_node_index %d - %d\n", get_group_id(0), writeIDX, child_node_index, child_node_index + numChildren);
+#endif
+ flattened_node.backPointer = (parentIDX << 6) | (numChildren << 3) | numFatleafChildren;
+ sg_children = MortonFlattenedBoxlessNode_reinterpret_as_BR(flattened_node);
+ }
+
+ child_node_index = sub_group_broadcast(child_node_index, numChildren);
+
+ if (lane != numChildren)
+ {
+ writeIDX = child_node_index + lane;
+ sg_children.local_parent_index__numItems |= currQnodeLocalId << 16;
+ }
+
+ nodeData[writeIDX].buildRecord = sg_children;
+ }
+
+ if (numFatleafChildren == numChildren) {
+ uint arridx = *sg_bu_startpoint_cnt;
+ // GRL_INLINE void set_2xSG_arr_first_write(uint index, uint* arr, ushort val, short lane)
+ set_2xSG_arr_first_write(arridx, sg_bu_startpoint_arr, (ushort)currQnodeLocalId, lane);
+ *sg_bu_startpoint_cnt = arridx + 1;
+ }
+}
+
+// TODO_OPT: Consider having phase 0 bucket the build records by number of primitives, and dispatch different variants
+// of this kernel with different WG sizes. There are many records produced that generate only 1 or 2 subtrees, so 8 SGs is
+// probably often wasted
+GRL_INLINE void phase1_process_fatleaf(
+ uint globalBaseForInternalNodes, // for root node this is indexOfRoot
+ uint globalParent , // for root this should be parentOfRoot
+ bool isInstancePrimLeafType, //
+ uint leafPrimType, //
+ uint leafStride, //
+ global struct QBVHNodeN* nodeData, // per group
+ uint nodeDataStart, //
+ struct AABB* primref, //
+ BackPointers* backPointers, //
+ global struct MortonCodePrimitive* mc,//
+ uint nodesToLeafsGap, //
+ local union UPerNodeData* perNodeData,//
+ bool processRoot, //
+ short localNodeId, //
+ BuildRecordLocalMortonFlattener fatleafRecord, // per node
+ uint primID ) //
+{
+ uint lane = get_sub_group_local_id();
+ uint numChildren = (fatleafRecord.local_parent_index__numItems & 0xFFFF);
+ uniform uint mcID = fatleafRecord.rangeStart;
+ uint pseudolane = lane < numChildren ? lane : 0;
+ varying struct AABB sg_bounds = primref[primID];
+
+ uint local_parent_idx = (fatleafRecord.local_parent_index__numItems >> 16);
+ uint globalNodeId = globalBaseForInternalNodes + localNodeId;
+ uniform global struct QBVHNodeN* qnode = nodeData + globalNodeId;
+
+ uint children_offset = (mcID * leafStride + nodesToLeafsGap) - globalNodeId;
+
+ {
+ /* For all primitives in a fat leaf we store a back
+ * pointer. This way we can modify the fat leaf node at leaf construction time. */
+ uint back_pointer = globalNodeId + nodeDataStart;
+ /* Store back pointer and primID inside morton code array to
+ * be later used by leaf creation. */
+ mc[mcID + pseudolane].index_code = ((ulong)back_pointer) << 32 | (ulong)primID;
+ }
+
+ struct AABB reduce_bounds = AABB_sub_group_reduce_N6(&sg_bounds);
+ reduce_bounds = AABB_sub_group_shuffle( &reduce_bounds, 0 );
+
+ uint8_t instMask;
+ if (isInstancePrimLeafType)
+ {
+ instMask = lane < numChildren ? PRIMREF_instanceMask(&sg_bounds) : 0;
+ subgroup_setInstanceQBVHNodeN(children_offset, &sg_bounds, numChildren, qnode, instMask);
+ instMask = sub_group_reduce_or_N6(instMask);
+ }
+ else
+ {
+ instMask = 0xFF;
+ subgroup_setQBVHNodeN_setFields_reduced_bounds(children_offset, leafPrimType, &sg_bounds, numChildren, instMask, qnode, false, reduce_bounds);
+ }
+
+ reduce_bounds.lower.w = as_float((uint)instMask);
+ uint reduce_bounds_lane = AABB_sub_group_shuffle_coordPerLane(&reduce_bounds, 0);
+ local uint* boxUint = (local uint*)(perNodeData + localNodeId);
+ if (get_sub_group_size() == 8 || lane < 8)
+ {
+ boxUint[lane] = reduce_bounds_lane;
+ uint globalParentIdx;
+ if (processRoot) {
+ // for root, treeletRootGlobalIndex is index of rootsParent in global space
+ globalParentIdx = globalParent;
+ }
+ else {
+ // for non root, raw_parent_idx is in local space
+ globalParentIdx = (local_parent_idx > 0) ? (globalBaseForInternalNodes + local_parent_idx) : globalParent;
+ }
+ if (lane == 0) {
+ *InnerNode_GetBackPointer(backPointers, globalNodeId) = (globalParentIdx << 6) | (numChildren << 3);
+ }
+ }
+}
+
+GRL_INLINE void perform_phase1(global struct Globals* globals,
+ global struct MortonCodePrimitive* mc,
+ global struct AABB* primref,
+ global struct BinaryMortonCodeHierarchy* bnodes,
+ global char* bvh_mem,
+ local union UPerNodeData* perNodeData,
+ local uint* local_records_head,
+ local uint* local_globalOffsetForNodes,
+ BuildRecordLocalMortonFlattener rootRecord,
+ uint treeletRootGlobalIndex,
+ uint parentOfRootIndex,
+ const uint leafPrimType,
+ bool isInstancePrimLeafType)
+{
+ global struct BVHBase* bvh = (global struct BVHBase*)bvh_mem;
+ varying ushort lane = get_sub_group_local_id();
+
+ // array that will keep 2x8 short indices
+ varying uint sg_fatleaf_array = 0x0;
+ uniform uint8_t sg_fatleaf_cnt = 0;
+ /* terminate when all subtrees are leaves */
+
+ uint subgroupId = get_sub_group_id();
+ uint ID = subgroupId;
+
+ uint sg_bu_startpoints = 0;
+ uniform uint sg_bu_startpoints_cnt = 0;
+ const uint shift_mask = globals->shift_mask;
+
+ const uint nodeDataStart = BVH_ROOT_NODE_OFFSET / 64;
+ BackPointers* backPointers = BVHBase_GetBackPointers(bvh);
+ global struct QBVHNodeN* nodeData = BVHBase_nodeData(bvh);
+
+ uint* pLeafStart = (!isInstancePrimLeafType) ? &bvh->quadLeafStart : &bvh->instanceLeafStart;
+ uint leafStart = *pLeafStart;
+ uint leafStride = (!isInstancePrimLeafType) ? 1 : (sizeof(struct HwInstanceLeaf) / sizeof(struct InternalNode));
+ uint nodesToLeafsGap = leafStart - nodeDataStart;
+
+ if (ID == 0)
+ {
+ BuildRecordLocalMortonFlattener current = rootRecord;
+
+ if ((current.local_parent_index__numItems & 0xFFFF) <= BVH_NODE_N6)
+ {
+ *local_records_head = 1;
+#if MORTON_DEBUG_CHECKS
+ if (sg_fatleaf_cnt > 32) printf("parallel_build_phase1_Indirect_SG sg_fatleaf_array: one subgroup has more than 32 items remembered\n");
+#endif
+ BuildRecordLocalMortonFlattener fatleafRecord = current;
+ uint numChildren = (fatleafRecord.local_parent_index__numItems & 0xFFFF);
+ uint pseudolane = lane < numChildren ? lane : 0;
+ uniform const uint mcID = fatleafRecord.rangeStart;
+ varying uint primID = (uint)(mc[mcID + pseudolane].index_code & shift_mask);
+
+ phase1_process_fatleaf(
+ treeletRootGlobalIndex, parentOfRootIndex, isInstancePrimLeafType, leafPrimType, leafStride,
+ nodeData, nodeDataStart, primref, backPointers, mc, nodesToLeafsGap, perNodeData,
+ true, 0, fatleafRecord, primID);
+ }
+ else
+ {
+#if MORTON_VERBOSE_LOG
+ if (get_local_id(0) == 0) { printf("wg %d perform_phase1: starting collapsing subtree with root at node %d \n", get_group_id(0), treeletRootGlobalIndex); }
+#endif
+ //printf("local_records_head = %d\n", *local_records_head);
+ SUBGROUP_create_boxless_node_phase1(globals, bnodes, bvh_mem, current, ID, local_records_head, 0, &sg_bu_startpoints, &sg_bu_startpoints_cnt, parentOfRootIndex, true, perNodeData);
+ *local_globalOffsetForNodes = treeletRootGlobalIndex;
+ }
+
+ ID += get_num_sub_groups();
+ }
+
+ uniform uint priv_records_tail = 1;
+
+ /* wait for all work items to have updated local_records array */
+ work_group_barrier(CLK_LOCAL_MEM_FENCE);
+
+ uniform uint priv_records_head = *local_records_head & 0xFFFF;
+ treeletRootGlobalIndex = *local_globalOffsetForNodes; // propagated from subgroup 1
+ uniform uint priv_records_tail_prev = priv_records_tail;
+ uniform uint other_records_head = priv_records_head;
+
+ uint ticToc = 1;
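+ // local_records_head holds two 16-bit counters (one per half); passes alternate via ticToc
+ // which half the producer side updates, and the loop below combines them to track how far
+ // the perNodeData record array has grown.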
+
+ if (priv_records_head == priv_records_tail)
+ {
+ return;
+ }
+ else
+ {
+ do
+ {
+ for (; ID < priv_records_head; ID += get_num_sub_groups())
+ {
+ BuildRecordLocalMortonFlattener current = (perNodeData[ID].buildRecord);
+
+ if ((current.local_parent_index__numItems & 0xFFFF) <= BVH_NODE_N6)
+ {
+ set_2xSG_arr_first_write(sg_fatleaf_cnt++, &sg_fatleaf_array, ID, lane);
+#if MORTON_VERBOSE_LOG
+ if (lane == 0)printf("wg %d, sg %d, perform_phase1: node ID %d is fatleaf \n", get_group_id(0), get_sub_group_id(), ID);
+#endif
+#if MORTON_DEBUG_CHECKS
+ if (sg_fatleaf_cnt > 32) printf("parallel_build_phase1_Indirect_SG sg_fatleaf_array: one subgroup has more than 32 items remembered\n");
+#endif
+ }
+ else
+ {
+ SUBGROUP_create_boxless_node_phase1(globals, bnodes, bvh_mem, current, ID, local_records_head, ticToc, &sg_bu_startpoints, &sg_bu_startpoints_cnt, 0, 0, perNodeData);
+ }
+ }
+
+ priv_records_tail = priv_records_head;
+ /* wait for all work items to have updated local_records array */
+ work_group_barrier(CLK_LOCAL_MEM_FENCE);
+ {
+ uint records_as_in_mem = *local_records_head;
+ priv_records_head = (records_as_in_mem >> (16 * ticToc)) & 0xFFFF;
+ uint other_records_head_temp = priv_records_head;
+ priv_records_head += other_records_head;
+ other_records_head = other_records_head_temp;
+ ticToc = ticToc ^ 1;
+#if MORTON_VERBOSE_LOG
+ if(get_local_id(0) == 0)printf("wg %d, perform_phase1: priv_records_tail %d, priv_records_head %d, records_as_in_mem %x\n", get_group_id(0), get_sub_group_id(), priv_records_tail, priv_records_head, records_as_in_mem);
+#endif
+ }
+ } while (priv_records_tail != priv_records_head); // get out of the loop if the tail reached the head
+ }
+
+ bool atomicNodeAllocation = treeletRootGlobalIndex > 0;
+ bool atomicNodeAllocationProduce = (get_sub_group_id() + lane == 0) && atomicNodeAllocation;
+ uint singleTreeletBumpBVHnodeCnt = (!atomicNodeAllocation && (get_sub_group_id() + lane == 0)) ? nodeDataStart + priv_records_tail : 0;
+
+ uniform uint globalBaseForInternalNodes = 0;
+
+ // We distinguish multi-treelet from single-treelet builds here by looking at our treeletRootGlobalIndex:
+ // if the treelet's root is the whole tree root (treeletRootGlobalIndex == 0) then we are the only treelet,
+ // so there's no need to synchronize node allocations across multiple treelets with atomics.
+ if (atomicNodeAllocationProduce)
+ {
+ *local_globalOffsetForNodes = allocate_inner_nodes(bvh, priv_records_tail - 1);
+ }
+
+ // Because the root is allocated elsewhere, the first node placed in global mem is the node with local index 1.
+ // Mapping local to global:
+ //   local space                    global space
+ //   [0] - treelet root             [treeletRootGlobalIndex]
+ //   ... possibly very long distance ...
+ //   [1] - first non-root node      [globalBaseForInternalNodes + 1] - this index is returned by the atomic allocator above
+ //   [2] - second non-root node     [globalBaseForInternalNodes + 2]
+ //   ...
+ //   [numToAllocate] - last node    [globalBaseForInternalNodes + numToAllocate]
+ if (atomicNodeAllocation)
+ {
+ work_group_barrier(CLK_LOCAL_MEM_FENCE);
+ globalBaseForInternalNodes = *local_globalOffsetForNodes -(nodeDataStart+1);
+ }
+
+#if MORTON_VERBOSE_LOG
+ if (get_local_id(0) == 0) { printf("wg %d perform_phase1: globalBaseForInternalNodes %d, num local nodes %d\n", get_group_id(0), globalBaseForInternalNodes, priv_records_tail - 1); }
+#endif
+
+ if (sg_fatleaf_cnt)
+ {
+ short localNodeId = get_from_2xSG_arr(sg_fatleaf_cnt - 1, sg_fatleaf_array, lane);
+ //if (localNodeId >= MORTON_BUILDER_SUBTREE_THRESHOLD * 2) continue;
+ //if(local_startpoints_cnt > 1) return;
+ BuildRecordLocalMortonFlattener fatleafRecord = perNodeData[localNodeId].buildRecord;
+
+ varying uint primID;
+ {
+ uint numChildren = (fatleafRecord.local_parent_index__numItems & 0xFFFF);
+ uint pseudolane = lane < numChildren ? lane : 0;
+ uniform const uint mcID = fatleafRecord.rangeStart;
+ primID = (uint)(mc[mcID + pseudolane].index_code & shift_mask);
+ }
+
+ // process fatleafs, and store their boxes to SLM
+ // also put startpoints for bottom up
+ //uint fatleaf_cnt = *local_startpoints_cnt;
+ while (sg_fatleaf_cnt-- > 1)
+ {
+ short nextLocalNodeId = get_from_2xSG_arr(sg_fatleaf_cnt-1, sg_fatleaf_array, lane);
+ BuildRecordLocalMortonFlattener nextfatleafRecord = perNodeData[nextLocalNodeId].buildRecord;
+ varying uint nextPrimId;
+
+ {
+ uint numChildren = (nextfatleafRecord.local_parent_index__numItems & 0xFFFF);
+ uint pseudolane = lane < numChildren ? lane : 0;
+ uniform const uint mcID = nextfatleafRecord.rangeStart;
+ nextPrimId = (uint)(mc[mcID + pseudolane].index_code & shift_mask);
+ }
+
+ phase1_process_fatleaf(
+ globalBaseForInternalNodes, treeletRootGlobalIndex, isInstancePrimLeafType, leafPrimType, leafStride,
+ nodeData, nodeDataStart, primref, backPointers, mc, nodesToLeafsGap, perNodeData,
+ false, localNodeId, fatleafRecord, primID);
+
+ fatleafRecord = nextfatleafRecord;
+ localNodeId = nextLocalNodeId;
+ primID = nextPrimId;
+ }
+
+ phase1_process_fatleaf(
+ globalBaseForInternalNodes, treeletRootGlobalIndex, isInstancePrimLeafType, leafPrimType, leafStride,
+ nodeData, nodeDataStart, primref, backPointers, mc, nodesToLeafsGap, perNodeData,
+ false, localNodeId, fatleafRecord, primID);
+ }
+
+#if 0
+ // put collected bottom-up startpoints to a wg-shared array to later distribute the work evenly across the groups.
+ {
+ ushort myStartpointWriteSite = 0;
+
+ if (lane == 0)
+ {
+ myStartpointWriteSite = atomic_add_local((local uint*)local_startpoints_cnt, (ushort)sg_bu_startpoints_cnt);
+ }
+ myStartpointWriteSite = sub_group_broadcast(myStartpointWriteSite, 0);
+
+ unpack_from_2xSG_arr(sg_bu_startpoints_cnt, sg_bu_startpoints, lane, local_startpoints_arr + myStartpointWriteSite);
+ }
+#endif
+
+ work_group_barrier(CLK_LOCAL_MEM_FENCE);
+
+ // distribute bottom-up startpoints
+#if 0
+ {
+ short sp_count_to_divide = (*local_startpoints_cnt);
+
+ //calculate the chunk for each sg.
+ sg_bu_startpoints_cnt = sp_count_to_divide / get_num_sub_groups();
+ uint sg_bu_startpoints_cnt_reminder = sp_count_to_divide % get_num_sub_groups();
+
+ uint myReadSite = get_sub_group_id() * sg_bu_startpoints_cnt;
+ if (get_sub_group_id() < sg_bu_startpoints_cnt_reminder) {
+ // from the remainder elements: if the sg idx is < sg_bu_startpoints_cnt_reminder then this sg gets one extra idx,
+ // and all sgs before it also got one extra
+ myReadSite += get_sub_group_id();
+ sg_bu_startpoints_cnt++;
+ }
+ else
+ {
+ // all remainder elements are consumed by previous sgs
+ myReadSite += sg_bu_startpoints_cnt_reminder;
+ }
+
+ pack_from_2xSG_arr(local_startpoints_arr + myReadSite, sg_bu_startpoints_cnt, &sg_bu_startpoints, lane);
+ }
+#endif
+
+ SUBGROUP_refit_bottom_up_local(nodeData, backPointers, treeletRootGlobalIndex, globalBaseForInternalNodes, lane, perNodeData, sg_bu_startpoints, sg_bu_startpoints_cnt);
+
+ if (singleTreeletBumpBVHnodeCnt)
+ {
+ bvh->nodeDataCur = singleTreeletBumpBVHnodeCnt;
+ }
+}
+
+GRL_INLINE void update_empty_blas(global struct BVHBase* bvh, uint leafPrimType)
+{
+ if (get_sub_group_id() == 0 )
+ {
+ global struct QBVHNodeN* qnode = BVHBase_nodeData(bvh);
+ BackPointers* backPointers = BVHBase_GetBackPointers(bvh);
+
+ //set required fields to mark that blas is empty
+ uint k = (get_sub_group_local_id() < BVH_NODE_N6) ? get_sub_group_local_id() : 0;
+ qnode->type = leafPrimType;
+ qnode->instMask = 0;
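+ // lower_x > upper_x yields an invalid (empty) quantized child box, which is how the empty blas is marked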
+ qnode->qbounds.lower_x[k] = 0x80;
+ qnode->qbounds.upper_x[k] = 0;
+
+ *InnerNode_GetBackPointer(backPointers, 0) = (((uint)-1) << 6);
+ }
+}
+
+/*
+
+ POSTSORT PHASE1:
+ Two kernels here, selected by MORTON_BUILDER_SUBTREE_THRESHOLD.
+ 1. parallel_build_phase1_Indirect_SG - record[0] is set to the subtree tip
+ 2. parallel_build_phase1_Indirect_global_root - record[0] is set to the bvh root (no phase2 needed afterwards)
+
+*/
+
+__attribute__( (reqd_work_group_size( 512, 1, 1 )) )
+__attribute__((intel_reqd_sub_group_size(16))) void kernel
+parallel_build_phase1_Indirect_SG( global struct Globals* globals,
+ global struct MortonCodePrimitive* mc,
+ global struct AABB* primref,
+ global struct BinaryMortonCodeHierarchy* bnodes,
+ global char* bvh_mem)
+{
+ global struct BVHBase* bvh = (global struct BVHBase*)bvh_mem;
+ const uint leafPrimType = globals->leafPrimType;
+
+ //special case for empty blas
+ if(globals->numPrimitives == 0)
+ {
+ bvh->nodeDataCur = BVH_ROOT_NODE_OFFSET / 64 + 1;
+ update_empty_blas(bvh, leafPrimType);
+ return;
+ }
+
+ local union UPerNodeData perNodeData[(MORTON_BUILDER_SUBTREE_THRESHOLD * 2) -1];
+ local uint local_records_head;
+ // Two separate SLM variables for local_globalOffsetForNodes to remove one of the barriers
+ local uint local_globalOffsetForNodes, local_globalOffsetForNodes2;
+
+ uint rootIndex = 0;
+ uint parentOfRoot = 0;
+ BuildRecordLocalMortonFlattener rootBuildRecord;
+
+ /* add start build record to local stack */
+ if (get_sub_group_id() == 0 )
+ {
+ global struct BuildRecordMorton* records = (global struct BuildRecordMorton*)(bvh_mem + 64 * bvh->quadLeafStart);
+ uint recordID = get_group_id(0);
+ struct BuildRecordMorton mortonGlobalRecord = records[recordID];
+
+ rootBuildRecord = TranslateToLocalRecord(BinaryMortonCodeHierarchy_getEntry(bnodes, mortonGlobalRecord.nodeID));
+
+ parentOfRoot = mortonGlobalRecord.parent_index;
+ rootIndex = mortonGlobalRecord.current_index;
+
+#if MORTON_VERBOSE_LOG
+ printf("P1_STARTPOINTS: current_index: %d, buildRecord.numItems: %d, buildRecord.binary_hierarchy_index: %d, buildRecord.local_parent_index: %d\n",
+ local_globalOffsetForNodes, buildRecord.numItems, buildRecord.binary_hierarchy_index, buildRecord.local_parent_index);
+#endif
+ }
+
+ if (leafPrimType == NODE_TYPE_INSTANCE)
+ {
+ perform_phase1(globals, mc, primref, bnodes, bvh_mem, perNodeData,
+ &local_records_head, &local_globalOffsetForNodes,
+ rootBuildRecord, rootIndex, parentOfRoot, NODE_TYPE_INSTANCE, true);
+ }
+ else
+ {
+ perform_phase1(globals, mc, primref, bnodes, bvh_mem, perNodeData,
+ &local_records_head, &local_globalOffsetForNodes,
+ rootBuildRecord, rootIndex, parentOfRoot, leafPrimType, false);
+ }
+
+}
+
+__attribute__( (reqd_work_group_size( 512, 1, 1 )) )
+__attribute__((intel_reqd_sub_group_size(16))) void kernel
+parallel_build_phase1_Indirect_global_root( global struct Globals* globals,
+ global struct MortonCodePrimitive* mc,
+ global struct AABB* primref,
+ global struct BinaryMortonCodeHierarchy* bnodes,
+ global char* bvh_mem)
+{
+ global struct BVHBase* bvh = (global struct BVHBase*)bvh_mem;
+ const uint leafPrimType = globals->leafPrimType;
+ const uint nodeDataStart = BVH_ROOT_NODE_OFFSET / 64;
+
+ bvh->nodeDataCur = nodeDataStart + 1;
+
+ //special case for empty blas
+ if(globals->numPrimitives == 0)
+ {
+ update_empty_blas(bvh, leafPrimType);
+ return;
+ }
+
+ local union UPerNodeData perNodeData[MORTON_BUILDER_SUBTREE_THRESHOLD * 2 - 1];
+ local uint local_records_head;
+ local uint local_globalOffsetForNodes;
+
+ BuildRecordLocalMortonFlattener rootBuildRecord;
+
+ if (get_sub_group_id() == 0 )
+ {
+ struct BinaryMortonCodeHierarchy binaryNode = BinaryMortonCodeHierarchy_getEntry(bnodes, globals->binary_hierarchy_root);
+
+ rootBuildRecord = TranslateToLocalRecord(binaryNode);
+
+ local_globalOffsetForNodes = 0;
+ }
+
+ if (leafPrimType == NODE_TYPE_INSTANCE)
+ {
+ perform_phase1(globals, mc, primref, bnodes, bvh_mem, perNodeData,
+ &local_records_head, &local_globalOffsetForNodes, rootBuildRecord, 0, (uint)-1, NODE_TYPE_INSTANCE, true);
+ }
+ else
+ {
+ perform_phase1(globals, mc, primref, bnodes, bvh_mem, perNodeData,
+ &local_records_head, &local_globalOffsetForNodes, rootBuildRecord, 0, (uint)-1, leafPrimType, false);
+
+ }
+}
+
+#if 0
+GRL_INLINE void
+DO_OLD_PARALLEL_BUILD_PHASE1( global struct Globals* globals,
+ global struct MortonCodePrimitive* mc,
+ global struct AABB* primref,
+ global struct BinaryMortonCodeHierarchy* bnodes,
+ global char* bvh_mem,
+ uint startID, uint endID,
+ local uint* local_numRecords,
+ local uint* local_numRecordsOld,
+ local struct BuildRecordMorton* local_records
+)
+{
+ global struct BVHBase* bvh = (global struct BVHBase*)bvh_mem;
+ global struct BuildRecordMorton* records = (global struct BuildRecordMorton*)(bvh_mem + bvh->quadLeafStart*64);
+
+ /* iterate over all subtrees this workgroup should build */
+ for ( uint recordID = startID; recordID < endID; recordID++ )
+ {
+ /* add start build record to local stack */
+ if ( get_local_id( 0 ) == 0 )
+ {
+ local_records[0] = records[recordID];
+ *local_numRecords = 1;
+ *local_numRecordsOld = 0;
+ }
+ work_group_barrier( CLK_LOCAL_MEM_FENCE );
+
+ /* terminate when all subtrees are leaves */
+ while ( *local_numRecords != *local_numRecordsOld )
+ {
+ /* remember the old number of build records to detect later
+ * whether we are done */
+ if ( get_local_id( 0 ) == 0 )
+ {
+ *local_numRecordsOld = *local_numRecords;
+ }
+ work_group_barrier( CLK_LOCAL_MEM_FENCE );
+
+ /* all work items in the sub group pick a subtree to build */
+ for ( uint ID = get_local_id( 0 ); ID < *local_numRecordsOld; ID += get_local_size( 0 ) )
+ {
+ /* ignore small subtrees */
+ if ( local_records[ID].items <= BVH_NODE_N6 )
+ continue;
+
+ /* create QBVH node */
+ create_node( globals, bnodes, bvh_mem, ID, local_numRecords, local_records, &local_records[ID] );
+ }
+
+ /* wait for all work items to have updated local_records array */
+ work_group_barrier( CLK_LOCAL_MEM_FENCE );
+ }
+
+ const uint shift_mask = globals->shift_mask;
+ const uint leafPrimType = globals->leafPrimType;
+ const uint rootNodeOffset = BVH_ROOT_NODE_OFFSET;
+ BackPointers* backPointers = BVHBase_GetBackPointers( bvh );
+ global struct QBVHNodeN* nodeData = BVHBase_nodeData( bvh );
+
+ /* create all fat leaf nodes and initiate refit */
+ for ( uint ID = get_local_id( 0 ); ID < *local_numRecords; ID += get_local_size( 0 ) )
+ {
+ struct BuildRecordMorton current = local_records[ID];
+ const uint primrefID = BinaryMortonCodeHierarchy_getRangeStart( bnodes, current.nodeID );
+
+ global struct QBVHNodeN* qnode = nodeData + current.current_index;
+
+ /* get bounds of all children of the fat leaf node */
+ struct AABB bounds[BVH_NODE_N6];
+ for ( uint i = 0; i < current.items; i++ )
+ {
+ /* get primID and bounds of primitive */
+ const uint primID = (uint)(mc[primrefID + i].index_code & shift_mask);
+ bounds[i] = primref[primID];
+
+ /* For all primitives in a fat leaf we store a back
+ * pointer. This way we can modify the fat leaf node at leaf construction time. */
+ const uint back_pointer = qnode - (struct QBVHNodeN*)bvh_mem;
+
+ /* Store back pointer and primID inside morton code array to
+ * be later used by leaf creation. */
+ mc[primrefID + i].index_code = ((ulong)back_pointer) << 32 | (ulong)primID;
+ }
+
+ /* update fat leaf node */
+ QBVHNodeN_setType( qnode, leafPrimType );
+ global void* offset;
+ if ( leafPrimType != BVH_INSTANCE_NODE )
+ {
+ offset = bvh_mem + 64*bvh->quadLeafStart + primrefID * sizeof( struct Quad );
+ QBVHNodeN_setChildIncr1( qnode );
+ }
+ else
+ {
+ offset = bvh_mem + 64*bvh->instanceLeafStart + primrefID * sizeof( struct HwInstanceLeaf );
+ QBVHNodeN_setChildIncr2( qnode );
+ }
+ QBVH6Node_set_offset( qnode, offset );
+ QBVHNodeN_setBounds( qnode, bounds, current.items );
+
+ /* set back pointers for fat leaf nodes */
+ *InnerNode_GetBackPointer(backPointers, current.current_index) = (current.parent_index << 6) | (current.items << 3);
+
+ /* bottom up refit */
+ refit_bottom_up( qnode, bvh, bounds, current.items );
+ }
+ }
+}
+
+/*
+
+ This phase takes the build records calculated in phase0 as input and
+ finishes the BVH construction for all these subtrees.
+
+*/
+__attribute__((reqd_work_group_size(8, 1, 1)))
+old_parallel_build_phase1(global struct Globals *globals,
+ global struct MortonCodePrimitive *mc,
+ global struct AABB *primref,
+ global struct BinaryMortonCodeHierarchy *bnodes,
+ global char *bvh_mem)
+{
+ global struct BVHBase *bvh = (global struct BVHBase *)bvh_mem;
+ global struct BuildRecordMorton *records = (global struct BuildRecordMorton *)(bvh_mem + 64*bvh->quadLeafStart);
+
+ /* a queue of build records */
+ local struct BuildRecordMorton local_records[MORTON_BUILDER_SUBTREE_THRESHOLD];
+ local uint local_numRecords;
+ local uint local_numRecordsOld;
+
+ /* construct range of build records that each sub group will process */
+ const uint numRecords = globals->numBuildRecords;
+ const uint startID = (get_group_id(0) + 0) * numRecords / get_num_groups(0);
+ const uint endID = (get_group_id(0) + 1) * numRecords / get_num_groups(0);
+
+ DO_OLD_PARALLEL_BUILD_PHASE1( globals, mc, primref, bnodes, bvh_mem, startID, endID, &local_numRecords, &local_numRecordsOld, local_records );
+
+}
+
+__attribute__( (reqd_work_group_size( 8, 1, 1 )) )
+old_parallel_build_phase1_Indirect( global struct Globals* globals,
+ global struct MortonCodePrimitive* mc,
+ global struct AABB* primref,
+ global struct BinaryMortonCodeHierarchy* bnodes,
+ global char* bvh_mem )
+{
+ global struct BVHBase* bvh = (global struct BVHBase*)bvh_mem;
+ global struct BuildRecordMorton* records = (global struct BuildRecordMorton*)(bvh_mem + 64*bvh->quadLeafStart);
+
+ /* a queue of build records */
+ local struct BuildRecordMorton local_records[MORTON_BUILDER_SUBTREE_THRESHOLD];
+ local uint local_numRecords;
+ local uint local_numRecordsOld;
+
+ /* construct range of build records that each sub group will process */
+ const uint numRecords = globals->numBuildRecords;
+ uint startID = get_group_id( 0 );
+ uint endID = startID + 1;
+
+ DO_OLD_PARALLEL_BUILD_PHASE1( globals, mc, primref, bnodes, bvh_mem, startID, endID, &local_numRecords, &local_numRecordsOld, local_records );
+
+}
+#endif
diff --git a/src/intel/vulkan/grl/gpu/morton/phase2.cl b/src/intel/vulkan/grl/gpu/morton/phase2.cl
new file mode 100644
index 00000000000..e82d22aaacf
--- /dev/null
+++ b/src/intel/vulkan/grl/gpu/morton/phase2.cl
@@ -0,0 +1,314 @@
+//
+// Copyright (C) 2009-2022 Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+//
+//
+
+#include "bvh_build_refit.h"
+#include "libs/lsc_intrinsics.h"
+#include "morton/morton_common.h"
+
+/*
+
+ POSTSORT PHASE2:
+ Two kernels here, selected by MORTON_BUILDER_P2_SINGLE_WG_THRESHOLD, which is set to a very large value.
+ 1. parallel_build_phase2_refit - performs refit using global synchronization and mem_fence_gpu_invalidate.
+ This kernel should be used only for a very big bvh; it is faster than the non-SLM fallback
+ in parallel_build_phase2_refit_local.
+ 2. parallel_build_phase2_refit_local - should be used for most cases; the nodes allocated in phase0 usually
+ fit into SLM, but there is also a non-SLM fallback there, as the decision on
+ which kernel to run is based on node-count estimates made on the host side.
+
+*/
+
+
+GRL_INLINE void refit_bottom_up_global_sync(
+ global char* bvh_mem,
+ global uint* global_refit_startpoints,
+ uniform uint nodeId,
+ uniform ushort lane)
+{
+ global struct BVHBase* bvh = (global struct BVHBase*)bvh_mem;
+
+ BackPointers* backPointers = BVHBase_GetBackPointers( bvh );
+ global struct QBVHNodeN* nodeData = BVHBase_nodeData( bvh );
+
+ // Get the node idx that was put here in phase1
+ const uint innerNodeIdx = global_refit_startpoints[nodeId];
+
+ // Get the qnode and backpointer
+ uniform global struct QBVHNodeN* qnode = nodeData + innerNodeIdx;
+ uint backPointer = *InnerNode_GetBackPointer(backPointers, innerNodeIdx);
+
+ varying struct AABB childrenAABB; // one child AABB per lane
+ AABB_init(&childrenAABB);
+
+ uniform uint numChildren = (backPointer >> 3) & 0x7;
+ if(numChildren == 0) return;
+
+ global struct QBVHNodeN* qnode_child = (global struct QBVHNodeN*)QBVHNodeN_childrenPointer( qnode );
+ varying ushort child_idx = (lane < numChildren) ? lane : 0;
+ childrenAABB = getAABB_QBVHNodeN( qnode_child + child_idx );
+
+#if MORTON_VERBOSE_LOG
+ if(lane == 0)
+ printf("REFIT2: index: %d, child_idx: %d\n", innerNodeIdx, child_idx);
+#endif
+
+ struct AABB reduce_bounds = AABB_sub_group_reduce_N6( &childrenAABB );
+ reduce_bounds = AABB_sub_group_shuffle( &reduce_bounds, 0 );
+
+ subgroup_QBVHNodeN_setBounds(qnode, reduce_bounds, childrenAABB, numChildren, lane);
+
+ uint children_mask = qnode_child[child_idx].instMask;
+ qnode->instMask = sub_group_reduce_or_N6(children_mask);
+
+ SUBGROUP_refit_bottom_up( qnode, bvh, reduce_bounds, numChildren, lane, 0 );
+}
+
+__attribute__( (reqd_work_group_size( 16, 1, 1 )) ) void kernel
+parallel_build_phase2_refit( global char* bvh_mem,
+ global uint* global_refit_startpoints )
+{
+ refit_bottom_up_global_sync(bvh_mem, global_refit_startpoints, get_group_id(0), get_local_id(0));
+}
+
+
+GRL_INLINE void SUBGROUP_refit_bottom_up_global(
+ uniform global struct QBVHNodeN* globalNodeData,
+ uniform struct BackPointers* backPointers,
+ varying ushort lane,
+ varying uint curNodeIndex)
+{
+ uniform uint backpointer = *InnerNode_GetBackPointer(backPointers, curNodeIndex);
+
+ const uint head_lane = 0;
+ uniform struct AABB child_aabb; // this carries reduced aabb between loop turns
+
+ while (curNodeIndex != 0)
+ {
+ global struct QBVHNodeN* qnode = globalNodeData + curNodeIndex;
+ global struct QBVHNodeN* qnode_child = (global struct QBVHNodeN*)QBVHNodeN_childrenPointer( qnode );
+ uint numChildren = BackPointer_GetNumChildren(backpointer);
+
+ varying ushort child_idx = (lane < numChildren) ? lane : 0;
+ child_aabb = getAABB_QBVHNodeN( qnode_child + child_idx );
+
+ struct AABB reduced_bounds = AABB_sub_group_reduce_N6(&child_aabb);
+ reduced_bounds = AABB_sub_group_shuffle(&reduced_bounds, head_lane);
+
+ /* get bounds of all children from child nodes directly */
+ subgroup_QBVHNodeN_setBounds(qnode, reduced_bounds, child_aabb, numChildren, lane);
+
+ uchar childrenMask = qnode_child[child_idx].instMask;
+ qnode->instMask = sub_group_reduce_or_N6(childrenMask);
+
+ uint parentIndex = BackPointer_GetParentIndex(backpointer);
+
+ mem_fence_gpu_invalidate();
+
+ if (lane == 0)
+ {
+ backpointer = atomic_inc_global((__global uint *)InnerNode_GetBackPointer(backPointers, parentIndex));
+
+ uint globalBackpointer = (parentIndex << 6) | (numChildren << 3);
+
+ /* set global back pointer */
+ *InnerNode_GetBackPointer(backPointers, curNodeIndex) = globalBackpointer;
+
+#if MORTON_VERBOSE_LOG
+ printf("BU_INNER: index: %d, first_child_id: %d, offset: %d, parent: %d, numChildren: %d, child_loc_idx: %d reduced_bounds: %f\n",
+ curNodeIndex, curNodeIndex + qnode->offset, qnode->offset, backpointer >> 6, numChildren, child_idx, reduced_bounds.lower.x);
+#endif
+ }
+
+ backpointer = 1 + intel_sub_group_shuffle(backpointer, head_lane);
+ curNodeIndex = parentIndex;
+
+ /* if all children got refitted, then continue */
+ uniform uint numChildrenRefitted = (backpointer >> 0) & 0x7;
+ uniform uint numChildrenTotal = (backpointer >> 3) & 0x7;
+
+ if (numChildrenRefitted != numChildrenTotal)
+ return;
+ }
+
+ // process root of the treelet
+ {
+
+#if MORTON_DEBUG_CHECKS
+ if (curNodeIndex != 0) printf("SUBGROUP_refit_bottom_up_local: this should be local node index 0\n");
+#endif
+
+ global struct QBVHNodeN* qnode_child = (global struct QBVHNodeN*)QBVHNodeN_childrenPointer( globalNodeData );
+ uint numChildren = BackPointer_GetNumChildren(backpointer);
+
+ varying ushort child_idx = (lane < numChildren) ? lane : 0;
+ child_aabb = getAABB_QBVHNodeN( qnode_child + child_idx );
+
+ struct AABB reduced_bounds = AABB_sub_group_reduce_N6(&child_aabb);
+ reduced_bounds = AABB_sub_group_shuffle(&reduced_bounds, head_lane);
+
+ /* get bounds of all children from child nodes directly */
+ subgroup_QBVHNodeN_setBounds(globalNodeData, reduced_bounds, child_aabb, numChildren, lane);
+
+ uchar childrenMask = qnode_child[child_idx].instMask;
+ globalNodeData->instMask = sub_group_reduce_or_N6(childrenMask);
+
+ /* reset refit counter for next refit */
+ if (lane == 0)
+ {
+ /* set global back pointer */
+ *InnerNode_GetBackPointer(backPointers, 0) = backpointer & (~7u);
+
+#if MORTON_VERBOSE_LOG
+ printf("BU_ROOT: curNodeIndex: %d, index: %d, first_child_id: %d, offset: %d, parent: %d, numChildren: %d, sg_bu_startpoints_cnt: %d\n",
+ curNodeIndex, 0, 0 + globalNodeData->offset, globalNodeData->offset, backpointer >> 6, numChildren, sg_bu_startpoints_cnt);
+#endif
+ }
+ }
+}
+
+
+// TODO: Check why 512 wg size has worse performance than 256
+__attribute__( (reqd_work_group_size( 512, 1, 1 )) )
+__attribute__((intel_reqd_sub_group_size(16))) void kernel
+parallel_build_phase2_refit_local( global struct Globals* globals,
+ global char* bvh_mem,
+ global struct MortonFlattenedBoxlessNode *boxless_nodes)
+{
+ // Number of nodes created in P0, to be refitted in this stage
+ uint p0_created_num = globals->p0_created_num;
+
+ // Return immediately if host executed this kernel but there is nothing to do
+ if(p0_created_num == 0)
+ return;
+
+ global struct BVHBase* bvh = (global struct BVHBase*)bvh_mem;
+ BackPointers* backPointers = BVHBase_GetBackPointers( bvh );
+ global struct QBVHNodeN* nodeData = BVHBase_nodeData( bvh );
+ varying ushort lane = get_sub_group_local_id();
+
+ // Hardcode SLM to max here as we do not know upfront how much mem will be needed
+ local union UPerNodeData perNodeData[MORTON_BUILDER_P2_ELEMENTS_IN_SLM]; /* 16kb is max slm for 256 wg_size */
+
+ // Number of allocated nodes in phase0 (p0_created_num + children)
+ uint p0_allocated_num = globals->p0_allocated_num;
+
+ // array that will keep 2x8 short indices
+ varying uint sg_fatleaf_array = 0x0;
+ uniform uint8_t sg_bu_startpoints_cnt = 0;
+
+ // Determine if we can fit into SLM with all the nodes allocated in phase0.
+ // There are two paths here:
+ // 1. Copy all needed flattened nodes and bounding boxes to SLM and reuse the bottom-up local path,
+ // which does the refit and creates qnodes in the bvh.
+ // 2. If we do not fit into SLM, first create qnodes in the bvh, then perform the bottom-up refit with global atomic synchronization.
+ // This is not performant, so it is kept only as a guardrail here. On the host side we fall back
+ // to the old separated refit path with wg_size 8, which has better EU reuse.
+ if(p0_allocated_num < MORTON_BUILDER_P2_ELEMENTS_IN_SLM)
+ {
+ for (uint ID = get_sub_group_id(); ID < p0_created_num; ID += get_num_sub_groups() )
+ {
+ MortonFlattenedBoxlessNode boxless_node = boxless_nodes[ID];
+ uint current_id = boxless_node.binary_hierarchy_index >> 6;
+
+ // The mask of children that are subtree roots is packed into the otherwise unused low 6 bits of binary_hierarchy_index
+ uchar children_root_mask = (boxless_node.binary_hierarchy_index & 0x3F);
+
+ if(lane == 0)
+ perNodeData[current_id].boxlessNode = boxless_node;
+
+ // When no children are subtree roots, we are done and skip to the next iteration
+ if(children_root_mask == 0x0)
+ {
+ continue;
+ }
+ // When all children are subtree roots, put them to sg_fatleaf_array
+ else if(children_root_mask == 0x3F)
+ {
+ set_2xSG_arr_first_write(sg_bu_startpoints_cnt++, &sg_fatleaf_array, current_id, lane);
+ }
+
+ uniform global struct QBVHNodeN* qnode = nodeData + current_id;
+
+ uniform uint numChildren = (boxless_node.backPointer >> 3) & 0x7;
+ uint lead_child_offset = MortonFlattenedBoxlessNode_GetChildOffset(boxless_node);
+ varying ushort child_idx = (lane < numChildren) ? lane : 0;
+
+ varying struct AABB childrenAABB; // one child AABB per lane
+ AABB_init(&childrenAABB);
+
+ uint lead_child_global_id = current_id + lead_child_offset;
+
+ uniform global struct QBVHNodeN* qnode_child = nodeData + lead_child_global_id;
+ childrenAABB = getAABB_QBVHNodeN( qnode_child + child_idx );
+
+ // Get only AABBs of children that are p1 subtree roots
+ bool lane_active = boxless_node.binary_hierarchy_index & (1 << child_idx);
+ if(lane_active)
+ {
+ uint child_global_id = lead_child_global_id + child_idx;
+ perNodeData[child_global_id].box = childrenAABB;
+ perNodeData[child_global_id].box.lower.w = as_float((uint)qnode_child->instMask);
+ }
+
+#if MORTON_VERBOSE_LOG
+ if(lane == 0)
+ printf("P2_LOCAL: ID: %d, lead_child_offset: %d, child_idx: %d, lane_active: %d, boxless_node >> 6: %d, perNodeData[ID].box = %f, qnode->offset: %d\n", ID, lead_child_offset, child_idx, lane_active, boxless_node.backPointer >> 6, perNodeData[ID].box.lower.x, qnode->offset);
+#endif
+ }
+
+ work_group_barrier(CLK_LOCAL_MEM_FENCE);
+
+ SUBGROUP_refit_bottom_up_local(nodeData, backPointers, 0, 0, lane, perNodeData, sg_fatleaf_array, sg_bu_startpoints_cnt);
+ }
+ else
+ {
+ for (uint ID = get_sub_group_id(); ID < p0_created_num; ID += get_num_sub_groups() )
+ {
+ MortonFlattenedBoxlessNode boxless_node = boxless_nodes[ID];
+ uint current_id = boxless_node.binary_hierarchy_index >> 6;
+
+ // The mask of children that are subtree roots is packed into the otherwise unused low 6 bits of binary_hierarchy_index
+ uchar children_root_mask = (boxless_node.binary_hierarchy_index & 0x3F);
+ uniform uint numChildren = (boxless_node.backPointer >> 3) & 0x7;
+
+ uniform global struct QBVHNodeN* qnode = nodeData + current_id;
+ uint nodeType = MortonFlattenedBoxlessNode_GetType(boxless_node);
+ uint lead_child_offset = MortonFlattenedBoxlessNode_GetChildOffset(boxless_node);
+
+ SUBGROUP_QBVHNodeN_setChildIncr1( qnode );
+ if(lane == 0)
+ {
+ QBVH6Node_set_type( qnode, nodeType );
+ qnode->offset = lead_child_offset;
+ }
+
+ // When no children are subtree roots, we are done and skip to the next iteration
+ if(children_root_mask == 0x0)
+ {
+ continue;
+ }
+ // When all children are subtree roots, put them to sg_fatleaf_array
+ else if(children_root_mask == 0x3F)
+ {
+ set_2xSG_arr_first_write(sg_bu_startpoints_cnt++, &sg_fatleaf_array, current_id, lane);
+ }
+
+#if MORTON_VERBOSE_LOG
+ if(lane == 0)
+ printf("P2_GLOBAL: ID: %d, lead_child_offset: %d, child_idx: %d, boxless_node >> 6: %d, perNodeData[ID].box = %f, qnode->offset: %d\n", ID, lead_child_offset, child_idx, boxless_node.backPointer >> 6, reduce_bounds.lower.x, qnode->offset);
+#endif
+ }
+
+ while (sg_bu_startpoints_cnt > 0)
+ {
+ uint curNodeIndex = get_from_2xSG_arr(--sg_bu_startpoints_cnt, sg_fatleaf_array, lane);
+
+ SUBGROUP_refit_bottom_up_global(nodeData, backPointers, lane, curNodeIndex);
+ }
+ }
+}
diff --git a/src/intel/vulkan/grl/gpu/morton/post_sort.cl b/src/intel/vulkan/grl/gpu/morton/post_sort.cl
new file mode 100644
index 00000000000..c13762438a3
--- /dev/null
+++ b/src/intel/vulkan/grl/gpu/morton/post_sort.cl
@@ -0,0 +1,521 @@
+//
+// Copyright (C) 2009-2022 Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+//
+//
+
+#include "libs/lsc_intrinsics.h"
+#include "morton/morton_common.h"
+
+////////////////////////////////////////////////////////////////////////////////////////////////////////
+/*
+
+ This kernel constructs a binary hierarchy in bottom up fashion from
+ the morton codes.
+
+*/
+////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+int Delta(global struct MortonCodePrimitive* mc, const uint64_t key0, const uint i1 )
+{
+ const uint64_t key1 = mc[i1].index_code;
+ return clz(key0 ^ key1);
+}
+
+int sign( int d )
+{
+ return (d > 0) ? 1 : -1;
+}
+
+__attribute__( (reqd_work_group_size( MAX_HW_SIMD_WIDTH, 1, 1 )) )
+__attribute__( (intel_reqd_sub_group_size( MAX_HW_SIMD_WIDTH )) )
+void kernel build_bottom_up_indirect( global struct Globals* globals,
+ global struct BinaryMortonCodeHierarchy* bnodes,
+ global struct MortonCodePrimitive* mc )
+{
+ /* construct range of primitives that each work group will process */
+ const uint numPrimitives = globals->numPrimitives;
+
+ uint i = get_group_id( 0 ) * get_local_size(0) + get_local_id( 0 );
+
+ if (i == 0)
+ {
+ globals->binary_hierarchy_root = 0;
+ if (numPrimitives == 1)
+ {
+ // special kludge for 1-prim tree. Make sure the one leaf node is initialized
+ bnodes[i].range.start = 0;
+ bnodes[i].range.end = 0;
+ bnodes[i].leftChild = -1;
+ bnodes[i].rightChild = -1;
+ }
+
+ // store pointer to the binary hierarchy in the globals struct.
+ // This will be used by the later build phases.
+ globals->binary_hierarchy_buffer = (gpuva_t) bnodes;
+ }
+
+ uint num_inner_nodes = numPrimitives-1;
+ if ( i < num_inner_nodes )
+ {
+ //
+ // direction is 1 if this morton code is the node's first key, -1 if it's the last
+ // By construction every internal node is either the start or the end of a given key range
+ // direction should be towards the neighbor with the most bits in common
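+ //
+ // Example: with keys mc[i-1]=...0010, mc[i]=...0011, mc[i+1]=...0100, the key at i-1 shares
+ // more leading bits with mc[i] than the key at i+1 does, so direction = -1 and the node's
+ // range grows towards lower indices.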
+
+ uint64_t ki = mc[i].index_code;
+
+ int direction, delta_min;
+ uint lmax;
+ if( i == 0 )
+ {
+ direction = 1;
+ delta_min = -1;
+ lmax = numPrimitives;
+ }
+ else
+ {
+ direction = sign( Delta( mc, ki, i + 1 ) - Delta( mc, ki, i - 1 ) );
+ delta_min = Delta( mc, ki, i - direction );
+
+ // find upper bound for length of this node's key range
+ lmax = 8;
+ while ( (i+lmax*direction) < numPrimitives && Delta( mc, ki, i+lmax*direction ) > delta_min)
+ lmax = lmax * 2;
+ }
+
+ // clamp max length so that the binary searches are fully in-bounds
+ uint maxLen = (direction>0) ? (numPrimitives - i) : (i+1);
+ lmax = min(lmax, maxLen);
+
+ // find end of range using binary search
+ uint length = 0;
+ uint end = lmax-1;
+ while (length != end)
+ {
+ uint mid = length + ((end-length)/2) + ((end-length)%2);
+ bool bigger = Delta( mc, ki, i+mid*direction) > delta_min;
+ length = bigger ? mid : length;
+ end = bigger ? end : mid-1;
+ }
+ uint j = i + length*direction ;
+
+ // find split position using binary search
+ uint split = 0;
+ end = length-1;
+ int delta_node = Delta(mc, ki, j);
+ while (split != end)
+ {
+ uint mid = split + ((end-split)/2) + ((end-split)%2);
+ bool bigger = Delta( mc, ki, i+mid*direction) > delta_node;
+ split = bigger ? mid : split;
+ end = bigger ? end : mid-1;
+ }
+ split = i + split*direction + min(direction,0);
+
+ uint left = split;
+ uint right = split+1;
+
+ // mark leaves
+ if( min(i,j) == split )
+ left = left | (1<<31);
+ if( max(i,j) == split+1 )
+ right = right | (1<<31);
+
+ bnodes[i].range.start = min(i,j);
+ bnodes[i].range.end = max(i,j);
+ bnodes[i].leftChild = left;
+ bnodes[i].rightChild = right;
+ }
+}
+
+
+
+
+
+#if 0
+__attribute__( (reqd_work_group_size( MAX_HW_SIMD_WIDTH, 1, 1 )) )
+__attribute__( (intel_reqd_sub_group_size( MAX_HW_SIMD_WIDTH )) )
+void kernel build_bottom_up_indirect( global struct Globals* globals,
+ global struct BinaryMortonCodeHierarchy* bnodes,
+ global struct MortonCodePrimitive* mc )
+{
+ /* construct range of primitives that each work group will process */
+ const uint numPrimitives = globals->numPrimitives;
+
+ // RangeFactor determines the distance between adjacent nodeIds in work group.
+ // The aim of the nodes distribution within work group, for rangeFactor > 1
+ // is to be sure that half of the work groups will entirelly be dropped off
+ // at the bottom layer of the graph. This way the EUs can be reused faster.
+ // The factor needs to be smaller than MAX_HW_SIMD_WIDTH
+ const uint rangeFactor = 2;
+
+ const uint numGroups = ((numPrimitives + MAX_HW_SIMD_WIDTH - 1) / MAX_HW_SIMD_WIDTH);
+ const uint globalId = get_group_id( 0 ) * MAX_HW_SIMD_WIDTH + get_local_id( 0 );
+ const uint numPrimitivesAlignedToWGSize = MAX_HW_SIMD_WIDTH * numGroups;
+ const uint groupsRange = numPrimitivesAlignedToWGSize / rangeFactor;
+
+ /* iterate over all primitives the work group should process */
+ const uint i = (globalId * rangeFactor) % numPrimitivesAlignedToWGSize + globalId / groupsRange;
+
+ if ( i < numPrimitives )
+ {
+ uint node = i | ((uint)1 << 31);
+ uint start = i;
+ uint end = i;
+
+ /* bottom up */
+ while ( true )
+ {
+ /* goto parent node and link parent node to current node */
+ node = updateParent( bnodes, mc, node, start, end, numPrimitives - 1 );
+
+ /* do not continue if we reached this node the first time */
+ if ( node == -1 )
+ break;
+
+ mem_fence_gpu_invalidate();
+
+ /* update range */
+ start = bnodes[node].range.start;
+ end = bnodes[node].range.end;
+
+ /* stop when we reached the root node */
+ if ( start == 0 && end == numPrimitives - 1 )
+ {
+ globals->binary_hierarchy_root = node;
+ break;
+ }
+ }
+ }
+}
+
+#endif
+
+/*
+
+ This function builds one QBVH6 node by opening the provided binary
+ BVH nodes until the QBVH node is full.
+
+ */
+
+GRL_INLINE void create_node(global struct Globals *globals,
+ global struct BinaryMortonCodeHierarchy *bnodes,
+ global char *bvh_mem,
+ uint rID,
+ local uint *local_numRecords,
+ local uint *local_QNodeOffset,
+ struct BuildRecordMorton *records,
+ struct BuildRecordMorton *current)
+{
+ global struct BVHBase *bvh = (global struct BVHBase *)bvh_mem;
+ const uint rootNodeOffset = BVH_ROOT_NODE_OFFSET;
+ global struct QBVHNodeN *nodeData = BVHBase_nodeData(bvh);
+ BackPointers *backPointers = BVHBase_GetBackPointers(bvh);
+
+ /* initialize child array */
+ uint numChildren = 2;
+ struct BuildRecordMorton children[BVH_NODE_N6];
+ children[0].nodeID = bnodes[current->nodeID].leftChild;
+ children[0].items = BinaryMortonCodeHierarchy_getNumPrimitives(bnodes, children[0].nodeID);
+ children[1].nodeID = bnodes[current->nodeID].rightChild;
+ children[1].items = BinaryMortonCodeHierarchy_getNumPrimitives(bnodes, children[1].nodeID);
+
+ /* fill QBVH6 node with up to 6 children */
+ while (numChildren < BVH_NODE_N6)
+ {
+ /*! find best child to split */
+ uint bestItems = 0;
+ int bestChild = -1;
+ for (int i = 0; i < numChildren; i++)
+ {
+ const uint items = children[i].items;
+
+ /* ignore leaves as they cannot get split */
+ if (items <= cfg_minLeafSize)
+ continue;
+
+ /* find child with largest number of items */
+ if (items > bestItems)
+ {
+ bestItems = items;
+ bestChild = i;
+ }
+ }
+ if (bestChild == -1)
+ break;
+
+ /* perform best found split */
+ const uint bestNodeID = children[bestChild].nodeID;
+ struct BuildRecordMorton *lrecord = &children[bestChild];
+ struct BuildRecordMorton *rrecord = &children[numChildren];
+ lrecord->nodeID = bnodes[bestNodeID].leftChild;
+ lrecord->items = BinaryMortonCodeHierarchy_getNumPrimitives(bnodes, lrecord->nodeID);
+ rrecord->nodeID = bnodes[bestNodeID].rightChild;
+ rrecord->items = BinaryMortonCodeHierarchy_getNumPrimitives(bnodes, rrecord->nodeID);
+ numChildren++;
+ }
+
+ /* allocate memory for all children */
+ const uint child_node_offset = atomic_add_local(local_QNodeOffset,64*numChildren);
+ global struct QBVHNodeN *childNodes = (global struct QBVHNodeN *)(bvh_mem + child_node_offset);
+
+ /* create node, but do not set bounds yet as these get calculated during refit */
+ const uint current_index = current->current_index;
+ struct QBVHNodeN *qnode = nodeData + current_index;
+ QBVH6Node_set_type(qnode, BVH_INTERNAL_NODE);
+ QBVHNodeN_setChildIncr1(qnode);
+ QBVH6Node_set_offset(qnode, childNodes);
+
+ /* set back pointers */
+ *InnerNode_GetBackPointer(backPointers, current_index) = (current->parent_index << 6) | (numChildren << 3);
+
+ /* update parent pointer of build records of all children */
+ for (uint ID = 0; ID < numChildren; ID++)
+ {
+ children[ID].current_index = childNodes - nodeData + ID;
+ children[ID].parent_index = current_index;
+ }
+
+ /* write out child build records */
+ const uint global_offset = atomic_add_local(local_numRecords, numChildren - 1);
+ records[rID] = children[0];
+
+ for (uint i = 1; i < numChildren; i++)
+ records[global_offset + i - 1] = children[i];
+
+ mem_fence_workgroup_default();
+
+}
+
+#if 0
+/* This function calculates the similarity between two morton
+ * codes. It essentially counts how many bits of the morton codes are
+ * equal starting at the top. The more bits are equal, the more similar the
+ * codes, and the closer the primitives are located spatially. */
+
+GRL_INLINE uint64_t delta(global struct MortonCodePrimitive *mc,
+ const uint id)
+{
+ const uint64_t key0 = mc[id + 0].index_code;
+ const uint64_t key1 = mc[id + 1].index_code;
+ return clz(key0 ^ key1);
+}
+
+
+
+/* This function checks for a range [left,right] of morton codes, if
+ * it is spatially closer to the left or to the right nodes. */
+
+GRL_INLINE bool merge_to_right(global struct MortonCodePrimitive *mc,
+ const uint left,
+ const uint right,
+ const uint last)
+{
+ /* merge to right if we are at the left end of the array */
+ if (left == 0)
+ return true;
+
+ /* merge to left if we are at the right end of the array */
+ if (right == last)
+ return false;
+
+ /* otherwise merge to the side where the morton code sequence has
+ * the largest number of equal bits from the top */
+ return delta(mc, right) > delta(mc, left - 1);
+}
+
+GRL_INLINE uint updateParent(global struct BinaryMortonCodeHierarchy *bnodes,
+ global struct MortonCodePrimitive *mc,
+ const uint nodeID,
+ const uint left,
+ const uint right,
+ const uint last)
+{
+ uint parent;
+
+ /* check if we should merge this node to the left or right */
+ if (merge_to_right(mc, left, right, last))
+ {
+ parent = right;
+ bnodes[parent].leftChild = nodeID;
+ bnodes[parent].range.start = left;
+ }
+ else
+ {
+ parent = left - 1;
+ bnodes[parent].rightChild = nodeID;
+ bnodes[parent].range.end = right;
+ }
+
+ mem_fence_gpu_default();
+
+ /* stop ascending the tree if we reached this node the first time */
+ const bool first = atomic_inc_global((global uint *)&bnodes[parent].flag) == 0;
+ return first ? -1 : parent;
+}
+
+GRL_INLINE void
+DO_OLD_PARALLEL_BUILD_PHASE1( global struct Globals* globals,
+ global struct MortonCodePrimitive* mc,
+ global struct AABB* primref,
+ global struct BinaryMortonCodeHierarchy* bnodes,
+ global char* bvh_mem,
+ uint startID, uint endID,
+ local uint* local_numRecords,
+ local uint* local_numRecordsOld,
+ local struct BuildRecordMorton* local_records
+)
+{
+ global struct BVHBase* bvh = (global struct BVHBase*)bvh_mem;
+ global struct BuildRecordMorton* records = (global struct BuildRecordMorton*)(bvh_mem + bvh->quadLeafStart*64);
+
+ /* iterate over all subtrees this workgroup should build */
+ for ( uint recordID = startID; recordID < endID; recordID++ )
+ {
+ /* add start build record to local stack */
+ if ( get_local_id( 0 ) == 0 )
+ {
+ local_records[0] = records[recordID];
+ *local_numRecords = 1;
+ *local_numRecordsOld = 0;
+ }
+ work_group_barrier( CLK_LOCAL_MEM_FENCE );
+
+ /* terminate when all subtrees are leaves */
+ while ( *local_numRecords != *local_numRecordsOld )
+ {
+ /* remember the old number of build records to detect later
+ * whether we are done */
+ if ( get_local_id( 0 ) == 0 )
+ {
+ *local_numRecordsOld = *local_numRecords;
+ }
+ work_group_barrier( CLK_LOCAL_MEM_FENCE );
+
+ /* all work items in the sub group pick a subtree to build */
+ for ( uint ID = get_local_id( 0 ); ID < *local_numRecordsOld; ID += get_local_size( 0 ) )
+ {
+ /* ignore small subtrees */
+ if ( local_records[ID].items <= BVH_NODE_N6 )
+ continue;
+
+ /* create QBVH node */
+ create_node( globals, bnodes, bvh_mem, ID, local_numRecords, local_records, &local_records[ID] );
+ }
+
+ /* wait for all work items to have updated local_records array */
+ work_group_barrier( CLK_LOCAL_MEM_FENCE );
+ }
+
+ const uint shift_mask = globals->shift_mask;
+ const uint leafPrimType = globals->leafPrimType;
+ const uint rootNodeOffset = BVH_ROOT_NODE_OFFSET;
+ BackPointers* backPointers = BVHBase_GetBackPointers( bvh );
+ global struct QBVHNodeN* nodeData = BVHBase_nodeData( bvh );
+
+ /* create all fat leaf nodes and initiate refit */
+ for ( uint ID = get_local_id( 0 ); ID < *local_numRecords; ID += get_local_size( 0 ) )
+ {
+ struct BuildRecordMorton current = local_records[ID];
+ const uint primrefID = BinaryMortonCodeHierarchy_getRangeStart( bnodes, current.nodeID );
+
+ global struct QBVHNodeN* qnode = nodeData + current.current_index;
+
+ /* get bounds of all children of the fat leaf node */
+ struct AABB bounds[BVH_NODE_N6];
+ for ( uint i = 0; i < current.items; i++ )
+ {
+ /* get primID and bounds of primitive */
+ const uint primID = (uint)(mc[primrefID + i].index_code & shift_mask);
+ bounds[i] = primref[primID];
+
+ /* For all primitives in a fat leaf we store a back
+ * pointer. This way we can modify the fat leaf node at leaf construction time. */
+ const uint back_pointer = qnode - (struct QBVHNodeN*)bvh_mem;
+
+ /* Store back pointer and primID inside morton code array to
+ * be later used by leaf creation. */
+ mc[primrefID + i].index_code = ((ulong)back_pointer) << 32 | (ulong)primID;
+ }
+
+ /* update fat leaf node */
+ QBVHNodeN_setType( qnode, leafPrimType );
+ global void* offset;
+ if ( leafPrimType != BVH_INSTANCE_NODE )
+ {
+ offset = bvh_mem + 64*bvh->quadLeafStart + primrefID * sizeof( struct Quad );
+ QBVHNodeN_setChildIncr1( qnode );
+ }
+ else
+ {
+ offset = bvh_mem + 64*bvh->instanceLeafStart + primrefID * sizeof( struct HwInstanceLeaf );
+ QBVHNodeN_setChildIncr2( qnode );
+ }
+ QBVH6Node_set_offset( qnode, offset );
+ QBVHNodeN_setBounds( qnode, bounds, current.items );
+
+ /* set back pointers for fat leaf nodes */
+ *InnerNode_GetBackPointer(backPointers, current.current_index) = (current.parent_index << 6) | (current.items << 3);
+
+ /* bottom up refit */
+ refit_bottom_up( qnode, bvh, bounds, current.items );
+ }
+ }
+}
+
+/*
+
+ This phase takes the build records calculated in phase0 as input and
+ finishes the BVH construction for all these subtrees.
+
+*/
+__attribute__((reqd_work_group_size(8, 1, 1)))
+old_parallel_build_phase1(global struct Globals *globals,
+ global struct MortonCodePrimitive *mc,
+ global struct AABB *primref,
+ global struct BinaryMortonCodeHierarchy *bnodes,
+ global char *bvh_mem)
+{
+ global struct BVHBase *bvh = (global struct BVHBase *)bvh_mem;
+ global struct BuildRecordMorton *records = (global struct BuildRecordMorton *)(bvh_mem + 64*bvh->quadLeafStart);
+
+ /* a queue of build records */
+ local struct BuildRecordMorton local_records[MORTON_BUILDER_SUBTREE_THRESHOLD];
+ local uint local_numRecords;
+ local uint local_numRecordsOld;
+
+ /* construct range of build records that each sub group will process */
+ const uint numRecords = globals->numBuildRecords;
+ const uint startID = (get_group_id(0) + 0) * numRecords / get_num_groups(0);
+ const uint endID = (get_group_id(0) + 1) * numRecords / get_num_groups(0);
+
+ DO_OLD_PARALLEL_BUILD_PHASE1( globals, mc, primref, bnodes, bvh_mem, startID, endID, &local_numRecords, &local_numRecordsOld, local_records );
+
+}
+
+__attribute__( (reqd_work_group_size( 8, 1, 1 )) )
+old_parallel_build_phase1_Indirect( global struct Globals* globals,
+ global struct MortonCodePrimitive* mc,
+ global struct AABB* primref,
+ global struct BinaryMortonCodeHierarchy* bnodes,
+ global char* bvh_mem )
+{
+ global struct BVHBase* bvh = (global struct BVHBase*)bvh_mem;
+ global struct BuildRecordMorton* records = (global struct BuildRecordMorton*)(bvh_mem + 64*bvh->quadLeafStart);
+
+ /* a queue of build records */
+ local struct BuildRecordMorton local_records[MORTON_BUILDER_SUBTREE_THRESHOLD];
+ local uint local_numRecords;
+ local uint local_numRecordsOld;
+
+ /* construct range of build records that each sub group will process */
+ const uint numRecords = globals->numBuildRecords;
+ uint startID = get_group_id( 0 );
+ uint endID = startID + 1;
+
+ DO_OLD_PARALLEL_BUILD_PHASE1( globals, mc, primref, bnodes, bvh_mem, startID, endID, &local_numRecords, &local_numRecordsOld, local_records );
+
+}
+#endif
diff --git a/src/intel/vulkan/grl/gpu/morton/pre_sort.cl b/src/intel/vulkan/grl/gpu/morton/pre_sort.cl
new file mode 100644
index 00000000000..099f926e194
--- /dev/null
+++ b/src/intel/vulkan/grl/gpu/morton/pre_sort.cl
@@ -0,0 +1,117 @@
+//
+// Copyright (C) 2009-2022 Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+//
+//
+
+#include "morton/morton_common.h"
+
+GRL_INLINE uint get_morton_shift( uint numPrimitives )
+{
+ return 32 - clz( numPrimitives );
+}
+
+GRL_INLINE uint get_morton_shift_mask( uint numPrimitives )
+{
+ uint shift = get_morton_shift( numPrimitives );
+ uint mask =(uint)(((ulong)1 << shift));
+ return mask - 1; // separated due to problems in DX
+}
+
+__attribute__((reqd_work_group_size(1, 1, 1))) void kernel init( global struct Globals *globals )
+{
+ /* variable shift for putting morton code + index to 64 bit */
+ const uint shift = 32 - clz(globals->numPrimitives);
+ globals->shift = shift;
+ globals->shift_mask = (uint)(((ulong)1 << shift));
+ globals->shift_mask -= 1; // separated due to problems in DX
+ globals->binary_hierarchy_root = 0;
+ globals->morton_sort_in_flight = 0;
+ globals->sort_iterations = get_morton_sort_lsb_req_iterations(shift);
+}
+
+/*
+
+ This kernel creates a morton code array containing a morton code and
+ index into the primref array.
+
+ The code uses the maximal number of bits for the morton code, such
+ that the morton code and index can still both get stored in 64 bits.
+
+ The algorithm first maps the centroids of the primitives and their
+ bounding box diagonal into a 4D grid, and then interleaves all 4
+ grid coordinates to construct the morton code.
+
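+ For example, for numPrimitives = 1000: shift = 32 - clz(1000) = 10, so the packed 64-bit key
+ is (code << 10) | primID, shift_mask = (1 << 10) - 1 selects the index bits, and each of the
+ 4 grid axes gets (64 - 10) / 4 = 13 bits (grid_size = 1 << 13).
+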
+ */
+
+__attribute__( (reqd_work_group_size( MAX_HW_SIMD_WIDTH, 1, 1 )) )
+__attribute__( (intel_reqd_sub_group_size( 16 )) ) void kernel
+create_morton_codes_indirect( global struct Globals* globals,
+ global struct BVHBase* bvh,
+ global struct AABB* primref,
+ global struct MortonCodePrimitive* morton_codes,
+ global struct MortonCodePrimitive* morton_codes_tmp,
+ uint use_new_morton_sort)
+{
+ /* construct range of morton codes each work group should create */
+ const uint numPrimitives = globals->numPrimitives;
+ const uint startID = get_group_id( 0 ) * get_local_size( 0 );
+ const uint endID = min((uint)(startID + get_local_size(0)), numPrimitives);
+
+ /* get lower and upper bounds of geometry and length of scene diagonal */
+ const float3 lower = globals->centroidBounds.lower.xyz;
+ const float3 upper = globals->centroidBounds.upper.xyz;
+ const float diag = length( AABB3f_size( &bvh->Meta.bounds ).xyz );
+
+ /* calculates the 4D grid */
+ const uint shift = get_morton_shift( numPrimitives );
+ const uint grid_size = 1 << (64 - shift) / 4;
+ const float4 grid_base = (float4)(lower, 0.0f);
+ const float4 grid_extend = (float4)(upper - lower, diag);
+ const float4 grid_scale = select( (grid_size * 0.99f) / grid_extend, 0.0f, grid_extend == 0.0f ); // FIXME: 0.99f!!!!!
+
+ const uint req_iterations = get_morton_sort_lsb_req_iterations(shift);
+
+ /* each work group iterates over its range of morton codes to create */
+ uint primID = startID + get_local_id( 0 );
+ if( primID < endID )
+ {
+ /* calculate position inside 4D grid */
+ float4 centroid2 = AABB_centroid2( &primref[primID] );
+ centroid2.w = length( AABB_size( &primref[primID] ).xyz );
+ const uint4 gridpos = convert_uint4_rtz( (centroid2 - grid_base) * grid_scale );
+
+ /* calculate and store morton code */
+ const ulong code = ulong_bitInterleave4D( gridpos );
+ const ulong index_code = ((ulong)code << shift) | (ulong)primID;
+
+ // The morton codes are required to end up in the morton_codes buffer after the LSB sort finishes.
+ // The sort may skip some iterations, so the number of required iterations can be odd;
+ // for an odd iteration count, start with the morton_codes_tmp buffer so the ping-pong ends in morton_codes.
+ if(req_iterations & 1 && !use_new_morton_sort)
+ morton_codes_tmp[primID].index_code = index_code;
+ else
+ morton_codes[primID].index_code = index_code;
+ }
+}
+
+/*
+
+ Initialization of the binary morton code hierarchy.
+
+ */
+
+__attribute__( (reqd_work_group_size( MAX_HW_SIMD_WIDTH, 1, 1 )) ) void kernel init_bottom_up_indirect( global struct Globals* globals,
+ global struct BinaryMortonCodeHierarchy* bnodes )
+{
+ /* construct range each work group will process */
+ const uint numPrimitives = globals->numPrimitives;
+ const uint startID = get_group_id( 0 ) * get_local_size(0);
+ const uint endID = min((uint)(startID + get_local_size(0)), numPrimitives);
+
+ /* each workgroup iterates over its range to initialize the binary BVH */
+ uint i = startID + get_local_id( 0 );
+ if( i < endID )
+ BinaryMortonCodeHierarchy_init( &bnodes[i], 0, numPrimitives - 1 );
+}
diff --git a/src/intel/vulkan/grl/gpu/morton_builder.grl b/src/intel/vulkan/grl/gpu/morton_builder.grl
new file mode 100644
index 00000000000..f221fd39fed
--- /dev/null
+++ b/src/intel/vulkan/grl/gpu/morton_builder.grl
@@ -0,0 +1,335 @@
+//
+// Copyright (C) 2009-2021 Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+//
+//
+
+module morton_builder;
+
+kernel_module morton_kernels ("morton/pre_sort.cl")
+{
+ kernel opencl_build_kernel_init < kernelFunction="init" >;
+ kernel opencl_build_morton_kernel_create_morton_codes_indirect < kernelFunction="create_morton_codes_indirect" >;
+ kernel opencl_build_morton_kernel_init_bottom_up_indirect < kernelFunction="init_bottom_up_indirect" >;
+}
+
+kernel_module morton_kernels ("morton/post_sort.cl")
+{
+ links lsc_intrinsics;
+
+ kernel opencl_build_morton_kernel_build_bottom_up_indirect < kernelFunction="build_bottom_up_indirect" >;
+}
+
+kernel_module morton_kernels ("morton/phase0.cl")
+{
+ links lsc_intrinsics;
+
+ kernel opencl_build_morton_kernel_parallel_build_phase0 < kernelFunction="parallel_build_phase0" >;
+ kernel opencl_build_morton_kernel_parallel_build_phase0_local_sync < kernelFunction="parallel_build_phase0_local_sync" >;
+}
+
+kernel_module morton_kernels ("morton/phase1.cl")
+{
+ links lsc_intrinsics;
+
+ kernel opencl_build_morton_kernel_parallel_build_phase1_Indirect < kernelFunction="parallel_build_phase1_Indirect_SG" >;
+ kernel opencl_build_morton_kernel_parallel_build_phase1_root < kernelFunction="parallel_build_phase1_Indirect_global_root" >;
+}
+
+kernel_module morton_kernels ("morton/phase2.cl")
+{
+ links lsc_intrinsics;
+
+ kernel opencl_build_morton_kernel_parallel_build_phase2_refit < kernelFunction="parallel_build_phase2_refit" >;
+ kernel opencl_build_morton_kernel_parallel_build_phase2_refit_local < kernelFunction="parallel_build_phase2_refit_local" >;
+}
+
+import struct MKBuilderState "structs.grl";
+
+/*
+metakernel begin(
+ MKBuilderState state,
+ qword morton_code_buffer,
+ dword primLeafType,
+ dword numHwThreads)
+{
+ dispatch opencl_build_kernel_init(1, 1, 1) args(
+ state.build_globals
+ );
+
+ control(wait_idle);
+
+
+ dispatch opencl_build_morton_kernel_create_morton_codes(numHwThreads, 1, 1) args(
+ state.build_globals,
+ state.bvh_buffer,
+ state.build_primref_buffer,
+ morton_code_buffer);
+
+ control(wait_idle);
+
+}
+
+metakernel build_bottom_up(
+ MKBuilderState state,
+ qword buildrecords_bottom_up,
+ qword morton_code_buffer,
+ dword numHwThreads)
+{
+ dispatch opencl_build_morton_kernel_init_bottom_up(numHwThreads, 1, 1) args(
+ state.build_globals,
+ buildrecords_bottom_up);
+
+ control(wait_idle);
+
+ dispatch opencl_build_morton_kernel_build_bottom_up(numHwThreads, 1, 1) args(
+ state.build_globals,
+ buildrecords_bottom_up,
+ morton_code_buffer);
+
+ control(wait_idle);
+
+}
+
+
+metakernel parallel_build(
+ MKBuilderState state,
+ qword buildrecords_bottom_up,
+ qword morton_code_buffer,
+ dword numHwThreads)
+{
+ dispatch opencl_build_morton_kernel_parallel_build_phase0(1, 1, 1) args(
+ state.build_globals,
+ buildrecords_bottom_up,
+ state.bvh_buffer);
+
+ control(wait_idle);
+
+ dispatch opencl_build_morton_kernel_parallel_build_phase1(numHwThreads, 1, 1) args(
+ state.build_globals,
+ morton_code_buffer,
+ state.build_primref_buffer,
+ buildrecords_bottom_up,
+ state.bvh_buffer);
+
+ control(wait_idle);
+
+}
+
+*/
+
+metakernel NewMorton_pre_sort(
+ qword num_primrefs_counter,
+ MKBuilderState state,
+ qword morton_code_buffer,
+ qword morton_code_buffer_tmp,
+ qword buildrecords_bottom_up,
+ dword use_new_morton_sort)
+{
+
+
+ {
+ REG1 = 15;
+ REG2 = 4;
+ REG0 = load_dword( num_primrefs_counter );
+
+ REG0 = REG0 + REG1; // JDB TODO: TGL will need to do this computation in the EU and store it in globals
+ REG1 = ~REG1;
+ REG0 = REG0 & REG1;
+ REG0 = REG0 >> REG2;
+ }
+
+ dispatch opencl_build_kernel_init(1, 1, 1) args( state.build_globals );
+
+ DISPATCHDIM_X = REG0.lo;
+ DISPATCHDIM_Y = 1;
+ DISPATCHDIM_Z = 1;
+
+ /*
+ // new bottom-up kernel does not need this
+ dispatch_indirect opencl_build_morton_kernel_init_bottom_up_indirect args(
+ state.build_globals,
+ buildrecords_bottom_up);
+ */
+ dispatch_indirect opencl_build_morton_kernel_create_morton_codes_indirect args(
+ state.build_globals,
+ state.bvh_buffer,
+ state.build_primref_buffer,
+ morton_code_buffer,
+ morton_code_buffer_tmp,
+ use_new_morton_sort);
+
+
+}
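For illustration, the register block in the metakernel above rounds the primitive-reference count up to a multiple of 16 and shifts right by 4, i.e. it computes ceil(count / 16) for the indirect dispatch width; the divisor of 16 presumably matches the kernel's workgroup width. A minimal standalone C sketch of the same arithmetic (helper name and sample count are illustrative only, not part of the diff):

    #include <stdint.h>
    #include <stdio.h>

    static uint32_t dispatch_groups(uint32_t num_primrefs)
    {
        /* REG0 = ((num_primrefs + 15) & ~15) >> 4  ==  ceil(num_primrefs / 16) */
        return ((num_primrefs + 15u) & ~15u) >> 4;
    }

    int main(void)
    {
        printf("%u\n", dispatch_groups(1000));  /* prints 63: 1000 primrefs -> 63 groups of 16 */
        return 0;
    }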
+
+
+
+metakernel NewMorton_post_sort(
+ qword num_primrefs_counter,
+ qword num_buildrecords_counter,
+ MKBuilderState state,
+ qword buildrecords_bottom_up,
+ qword morton_code_buffer )
+{
+
+ {
+ REG1 = 15;
+ REG2 = 4;
+ REG0 = load_dword( num_primrefs_counter );
+
+ REG0 = REG0 + REG1; // JDB TODO: TGL will need to do this computation in the EU and store it in globals
+ REG1 = ~REG1;
+ REG0 = REG0 & REG1;
+ REG0 = REG0 >> REG2;
+ }
+
+ DISPATCHDIM_X = REG0.lo;
+ DISPATCHDIM_Y = 1;
+ DISPATCHDIM_Z = 1;
+
+ dispatch_indirect opencl_build_morton_kernel_build_bottom_up_indirect args(
+ state.build_globals,
+ buildrecords_bottom_up,
+ morton_code_buffer);
+
+
+ /*
+ dispatch opencl_build_morton_kernel_build_bottom_up(16, 1, 1) args(
+ state.build_globals,
+ buildrecords_bottom_up,
+ morton_code_buffer);
+ */
+
+ control(wait_idle);
+
+ dispatch opencl_build_morton_kernel_parallel_build_phase0(1, 1, 1) args(
+ state.build_globals,
+ buildrecords_bottom_up,
+ state.bvh_buffer);
+
+ control(wait_idle);
+
+ DISPATCHDIM_X = load_dword( num_buildrecords_counter );
+
+ dispatch_indirect opencl_build_morton_kernel_parallel_build_phase1_Indirect args(
+ state.build_globals,
+ morton_code_buffer,
+ state.build_primref_buffer,
+ buildrecords_bottom_up,
+ state.bvh_buffer);
+
+ control(wait_idle);
+
+}
+
+metakernel NewMorton_bottom_up(
+ qword num_primrefs_counter,
+ MKBuilderState state,
+ qword buildrecords_bottom_up,
+ qword morton_code_buffer )
+{
+
+ {
+ REG1 = 15;
+ REG2 = 4;
+ REG0 = load_dword( num_primrefs_counter );
+
+ REG0 = REG0 + REG1; // JDB TODO: TGL will need to do this computation in the EU and store it in globals
+ REG1 = ~REG1;
+ REG0 = REG0 & REG1;
+ REG0 = REG0 >> REG2;
+ }
+
+ DISPATCHDIM_X = REG0.lo;
+ DISPATCHDIM_Y = 1;
+ DISPATCHDIM_Z = 1;
+
+ dispatch_indirect opencl_build_morton_kernel_build_bottom_up_indirect args(
+ state.build_globals,
+ buildrecords_bottom_up,
+ morton_code_buffer);
+}
+
+
+metakernel NewMorton_phase0(
+ MKBuilderState state,
+ qword buildrecords_bottom_up,
+ qword morton_p0_refit_startpoints)
+{
+
+ dispatch opencl_build_morton_kernel_parallel_build_phase0(1, 1, 1) args(
+ state.build_globals,
+ buildrecords_bottom_up,
+ state.bvh_buffer,
+ morton_p0_refit_startpoints);
+}
+
+metakernel NewMorton_phase0_local_sync(
+ MKBuilderState state,
+ qword buildrecords_bottom_up,
+ qword p0_boxless_nodes)
+{
+
+ dispatch opencl_build_morton_kernel_parallel_build_phase0_local_sync(1, 1, 1) args(
+ state.build_globals,
+ buildrecords_bottom_up,
+ state.bvh_buffer,
+ p0_boxless_nodes);
+}
+
+
+metakernel NewMorton_phase1(
+ qword num_buildrecords_counter,
+ MKBuilderState state,
+ qword buildrecords_bottom_up,
+ qword morton_code_buffer)
+{
+
+ DISPATCHDIM_X = load_dword( num_buildrecords_counter );
+
+ dispatch_indirect opencl_build_morton_kernel_parallel_build_phase1_Indirect args(
+ state.build_globals,
+ morton_code_buffer,
+ state.build_primref_buffer,
+ buildrecords_bottom_up,
+ state.bvh_buffer);
+}
+
+metakernel NewMorton_phase1_root(
+ qword num_buildrecords_counter,
+ MKBuilderState state,
+ qword buildrecords_bottom_up,
+ qword morton_code_buffer)
+{
+ dispatch opencl_build_morton_kernel_parallel_build_phase1_root(1, 1, 1) args(
+ state.build_globals,
+ morton_code_buffer,
+ state.build_primref_buffer,
+ buildrecords_bottom_up,
+ state.bvh_buffer);
+}
+
+metakernel NewMorton_phase2(
+ qword num_leaves_counter,
+ MKBuilderState state,
+ qword bottom_node_ids )
+{
+
+ DISPATCHDIM_X = load_dword( num_leaves_counter );
+
+ dispatch_indirect opencl_build_morton_kernel_parallel_build_phase2_refit args(
+ state.bvh_buffer,
+ bottom_node_ids);
+}
+
+metakernel NewMorton_phase2_local(
+ MKBuilderState state,
+ qword p0_boxless_nodes)
+{
+
+ dispatch opencl_build_morton_kernel_parallel_build_phase2_refit_local(1, 1, 1) args(
+ state.build_globals,
+ state.bvh_buffer,
+ p0_boxless_nodes);
+}
diff --git a/src/intel/vulkan/grl/gpu/morton_msb_radix_bitonic_sort.cl b/src/intel/vulkan/grl/gpu/morton_msb_radix_bitonic_sort.cl
new file mode 100644
index 00000000000..075d44a51ba
--- /dev/null
+++ b/src/intel/vulkan/grl/gpu/morton_msb_radix_bitonic_sort.cl
@@ -0,0 +1,9 @@
+//
+// Copyright (C) 2009-2021 Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+//
+//
+
+// just inlines the kernels defined in the header
+#include "morton_msb_radix_bitonic_sort.h" \ No newline at end of file
diff --git a/src/intel/vulkan/grl/gpu/morton_msb_radix_bitonic_sort.h b/src/intel/vulkan/grl/gpu/morton_msb_radix_bitonic_sort.h
new file mode 100644
index 00000000000..4fb6c21b014
--- /dev/null
+++ b/src/intel/vulkan/grl/gpu/morton_msb_radix_bitonic_sort.h
@@ -0,0 +1,924 @@
+//
+// Copyright (C) 2009-2021 Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+//
+//
+
+#pragma once
+
+#include "common.h"
+#include "morton_msb_radix_bitonic_sort_shared.h"
+
+#include "libs/lsc_intrinsics.h"
+
+///////////////////////////////////////////////////////////////////////////////
+//
+// Configuration switches
+//
+///////////////////////////////////////////////////////////////////////////////
+
+#define DEBUG 0
+#define MERGE_BLS_WITHIN_SG 0
+
+///////////////////////////////////////////////////////////////////////////////
+
+
+#if DEBUG
+#define DEBUG_CODE(A) A
+#else
+#define DEBUG_CODE(A)
+#endif
+
+#define BOTTOM_LEVEL_SORT_WG_SIZE 512
+
+// this kernel is only dispatched from the metakernel for debugging, to print that execution reached this point
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(1, 1, 1)))
+void kernel debug_print_kernel(uint variable)
+{
+ if(get_local_id(0) == 0)
+ printf("I'm here! %d\n", variable);
+}
+
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(1, 1, 1)))
+void kernel check_bls_sort(global struct Globals* globals, global ulong* input)
+{
+ uint prims_num = globals->numPrimitives;
+
+ printf("in check_bls_sort kernel. Values count:: %d\n", prims_num);
+
+ ulong left = input[0];
+ ulong right;
+ for (int i = 0; i < prims_num - 1; i++)
+ {
+ right = input[i + 1];
+ printf("sorted val: %llu\n", left);
+ if (left > right)
+ {
+ printf("element %d is bigger than %d: %llu > %llu\n", i, i+1, left, right);
+ }
+ left = right;
+ }
+}
+
+inline uint wg_scan_inclusive_add_opt(local uint* tmp, uint val, uint SG_SIZE, uint WG_SIZE)
+{
+ const uint hw_thread_in_wg_id = get_local_id(0) / SG_SIZE;
+ const uint sg_local_id = get_local_id(0) % SG_SIZE;
+ const uint NUM_HW_THREADS_IN_WG = WG_SIZE / SG_SIZE;
+
+ uint acc = sub_group_scan_inclusive_add(val);
+ if (NUM_HW_THREADS_IN_WG == 1)
+ {
+ return acc;
+ }
+ tmp[hw_thread_in_wg_id] = sub_group_broadcast(acc, SG_SIZE - 1);
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ uint loaded_val = sg_local_id < NUM_HW_THREADS_IN_WG ? tmp[sg_local_id] : 0;
+ uint wgs_acc = sub_group_scan_exclusive_add(loaded_val);
+ uint acc_for_this_hw_thread = sub_group_broadcast(wgs_acc, hw_thread_in_wg_id);
+ // for > 256 workitems in SIMD16 the number of HW threads exceeds one subgroup (16 lanes), so additional iterations are needed;
+ // the same applies for > 64 workitems in SIMD8
+ uint num_iterations = (NUM_HW_THREADS_IN_WG + SG_SIZE - 1) / SG_SIZE;
+ for (int i = 1; i < num_iterations; i++)
+ {
+ // need to add tmp[] of the previous chunk's last thread, because the "exclusive" scan does not include it
+ uint prev_max_sum = sub_group_broadcast(wgs_acc, SG_SIZE - 1) + tmp[(i * SG_SIZE) - 1];
+ loaded_val = (sg_local_id + i * SG_SIZE) < NUM_HW_THREADS_IN_WG ? tmp[sg_local_id] : 0;
+ wgs_acc = sub_group_scan_exclusive_add(loaded_val);
+ wgs_acc += prev_max_sum;
+ uint new_acc_for_this_hw_thread = sub_group_broadcast(wgs_acc, hw_thread_in_wg_id % SG_SIZE);
+ if (hw_thread_in_wg_id >= i * SG_SIZE)
+ acc_for_this_hw_thread = new_acc_for_this_hw_thread;
+ }
+ return acc + acc_for_this_hw_thread;
+}
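wg_scan_inclusive_add_opt combines sub-group scans with a pass over per-HW-thread partials in SLM to produce a workgroup-wide inclusive prefix sum. A scalar C reference (not part of the diff) of the intended result, assuming one value per work-item and modelling the workgroup as a plain array:

    #include <stdio.h>

    static void inclusive_scan_reference(const unsigned *val, unsigned *out, unsigned wg_size)
    {
        unsigned acc = 0;
        for (unsigned i = 0; i < wg_size; i++) {
            acc += val[i];        /* each work-item's result includes its own value */
            out[i] = acc;
        }
    }

    int main(void)
    {
        unsigned val[8] = {1, 0, 2, 3, 0, 1, 1, 4};
        unsigned out[8];
        inclusive_scan_reference(val, out, 8);
        for (unsigned i = 0; i < 8; i++)
            printf("%u ", out[i]);   /* 1 1 3 6 6 7 8 12 */
        printf("\n");
        return 0;
    }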
+
+struct MSBDispatchArgs
+{
+ global struct MSBRadixContext* context;
+ uint num_of_wgs; // this is the number of workgroups that was dispatched for this context
+ ulong* wg_key_start; // this is where keys to process start for current workgroup
+ ulong* wg_key_end;
+ uint shift_bit;
+};
+
+
+
+
+struct MSBDispatchArgs get_msb_dispatch_args(global struct VContextScheduler* scheduler)
+{
+ global struct MSBDispatchQueue* queue = &scheduler->msb_queue;
+
+ uint group = get_group_id(0);
+ struct MSBDispatchRecord record;
+
+ // TODO_OPT: Load this entire prefix array into SLM instead of searching..
+ // Or use sub-group ops
+ uint i = 0;
+ while (i < queue->num_records)
+ {
+ uint n = queue->records[i].wgs_to_dispatch;
+
+ if (group < n)
+ {
+ record = queue->records[i];
+ break;
+ }
+
+ group -= n;
+ i++;
+ }
+
+ uint context_id = i;
+ global struct MSBRadixContext* context = &scheduler->contexts[context_id];
+
+ // moving to ulongs to avoid uint overflow
+ ulong group_id_in_dispatch = group;
+ ulong start_offset = context->start_offset;
+ ulong num_keys = context->num_keys;
+ ulong wgs_to_dispatch = record.wgs_to_dispatch;
+
+ struct MSBDispatchArgs args;
+ args.context = context;
+ args.num_of_wgs = record.wgs_to_dispatch;
+ args.wg_key_start = context->keys_in + start_offset + (group_id_in_dispatch * num_keys / wgs_to_dispatch);
+ args.wg_key_end = context->keys_in + start_offset + ((group_id_in_dispatch+1) * num_keys / wgs_to_dispatch);
+ args.shift_bit = MSB_SHIFT_BYTE_START_OFFSET - context->iteration * MSB_BITS_PER_ITERATION;
+ return args;
+}
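The 64-bit arithmetic above splits a context's key range evenly across the workgroups dispatched for it while avoiding uint overflow for large key counts. A standalone C sketch of that split, with illustrative counts (not part of the diff):

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        uint64_t num_keys = 1000, wgs = 3;
        for (uint64_t g = 0; g < wgs; g++) {
            uint64_t begin = g * num_keys / wgs;          /* wg_key_start offset */
            uint64_t end   = (g + 1) * num_keys / wgs;    /* wg_key_end offset */
            printf("wg %llu: keys [%llu, %llu)\n",
                   (unsigned long long)g,
                   (unsigned long long)begin,
                   (unsigned long long)end);  /* [0,333) [333,666) [666,1000) */
        }
        return 0;
    }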
+
+
+
+
+void BLSDispatchQueue_push(global struct BLSDispatchQueue* queue, struct BLSDispatchRecord* record)
+{
+ uint new_idx = atomic_inc_global(&queue->num_records);
+ queue->records[new_idx] = *record;
+ DEBUG_CODE(printf("adding bls of size: %d\n", record->count));
+}
+
+
+
+
+void DO_CountSort(struct BLSDispatchRecord dispatchRecord, local ulong* SLM_shared, global ulong* output)
+{
+ uint tid = get_local_id(0);
+
+ global ulong* in = ((global ulong*)(dispatchRecord.keys_in)) + dispatchRecord.start_offset;
+
+ ulong a = tid < dispatchRecord.count ? in[tid] : ULONG_MAX;
+
+ SLM_shared[tid] = a;
+
+ uint counter = 0;
+
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ ulong curr = SLM_shared[get_sub_group_local_id()];
+
+ for (uint i = 16; i < dispatchRecord.count; i += 16)
+ {
+ ulong next = SLM_shared[i + get_sub_group_local_id()];
+
+ for (uint j = 0; j < 16; j++)
+ {
+ // some older drivers have a bug when shuffling a ulong, so we shuffle it as 2x uint instead
+ uint2 curr_as_uint2 = as_uint2(curr);
+ uint2 sg_curr_as_uint2 = (uint2)(sub_group_broadcast(curr_as_uint2.x, j), sub_group_broadcast(curr_as_uint2.y, j));
+ ulong c = as_ulong(sg_curr_as_uint2);
+ if (c < a)
+ counter++;
+ }
+
+ curr = next;
+ }
+
+
+ // last iter
+ for (uint j = 0; j < 16; j++)
+ {
+ // some older drivers have a bug when shuffling a ulong, so we shuffle it as 2x uint instead
+ uint2 curr_as_uint2 = as_uint2(curr);
+ uint2 sg_curr_as_uint2 = (uint2)(sub_group_broadcast(curr_as_uint2.x, j), sub_group_broadcast(curr_as_uint2.y, j));
+ ulong c = as_ulong(sg_curr_as_uint2);
+ if (c < a)
+ counter++;
+ }
+
+ // save elements to its sorted positions
+ if (tid < dispatchRecord.count)
+ output[dispatchRecord.start_offset + counter] = a;
+}
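DO_CountSort scatters each key to a position equal to the number of keys smaller than it, which is a valid sort as long as the keys are unique; that holds here because the index codes built earlier pack the primitive ID into their low bits. A scalar C reference of the rank computation (not part of the diff; sample values are illustrative):

    #include <stdio.h>

    static void rank_sort(const unsigned long long *in, unsigned long long *out, unsigned n)
    {
        for (unsigned i = 0; i < n; i++) {
            unsigned rank = 0;
            for (unsigned j = 0; j < n; j++)
                if (in[j] < in[i])
                    rank++;               /* count keys smaller than this one */
            out[rank] = in[i];            /* scatter the key to its sorted position */
        }
    }

    int main(void)
    {
        unsigned long long in[5] = {42, 7, 99, 13, 1}, out[5];
        rank_sort(in, out, 5);
        for (unsigned i = 0; i < 5; i++)
            printf("%llu ", out[i]);      /* 1 7 13 42 99 */
        printf("\n");
        return 0;
    }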
+
+void DO_Bitonic(struct BLSDispatchRecord dispatchRecord, local ulong* SLM_shared, global ulong* output)
+{
+ uint lid = get_local_id(0);
+ uint elements_to_sort = BOTTOM_LEVEL_SORT_THRESHOLD;
+ while ((elements_to_sort >> 1) >= dispatchRecord.count && elements_to_sort >> 1 >= BOTTOM_LEVEL_SORT_WG_SIZE)
+ {
+ elements_to_sort >>= 1;
+ }
+
+ for (int i = 0; i < elements_to_sort / BOTTOM_LEVEL_SORT_WG_SIZE; i++)
+ {
+ uint tid = lid + i * BOTTOM_LEVEL_SORT_WG_SIZE;
+
+ if (tid >= dispatchRecord.count)
+ SLM_shared[tid] = ULONG_MAX;
+ else
+ SLM_shared[tid] = ((global ulong*)(dispatchRecord.keys_in))[dispatchRecord.start_offset + tid];
+ }
+
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ uint k_iterations = elements_to_sort;
+ while(k_iterations >> 1 >= dispatchRecord.count && k_iterations != 0)
+ {
+ k_iterations >>= 1;
+ }
+
+ for (unsigned int k = 2; k <= k_iterations; k *= 2)
+ {
+ for (unsigned int j = k / 2; j > 0; j /= 2)
+ {
+ // this loop is needed when we can't create a big enough workgroup, so each workitem processes multiple elements
+ for (uint i = 0; i < elements_to_sort / BOTTOM_LEVEL_SORT_WG_SIZE; i++)
+ {
+ uint tid = lid + i * BOTTOM_LEVEL_SORT_WG_SIZE;
+ unsigned int ixj = tid ^ j;
+ if (ixj > tid)
+ {
+ if ((tid & k) == 0)
+ {
+ if (SLM_shared[tid] > SLM_shared[ixj])
+ {
+ ulong tmp = SLM_shared[tid];
+ SLM_shared[tid] = SLM_shared[ixj];
+ SLM_shared[ixj] = tmp;
+ }
+ }
+ else
+ {
+ if (SLM_shared[tid] < SLM_shared[ixj])
+ {
+ ulong tmp = SLM_shared[tid];
+ SLM_shared[tid] = SLM_shared[ixj];
+ SLM_shared[ixj] = tmp;
+ }
+ }
+ }
+ }
+
+ barrier(CLK_LOCAL_MEM_FENCE);
+ }
+ }
+
+ for (int i = 0; i < elements_to_sort / BOTTOM_LEVEL_SORT_WG_SIZE; i++)
+ {
+ uint tid = lid + i * BOTTOM_LEVEL_SORT_WG_SIZE;
+
+ if (tid < dispatchRecord.count)
+ output[dispatchRecord.start_offset + tid] = SLM_shared[tid];
+ }
+}
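DO_Bitonic runs the classic k/j compare-exchange network in SLM, padding unused slots with ULONG_MAX so the element count behaves as a power of two. A scalar C reference of the same network for a power-of-two count (not part of the diff; padding is assumed to be handled by the caller, sample array is illustrative):

    #include <stdio.h>

    static void bitonic_sort(unsigned long long *a, unsigned n)   /* n must be a power of two */
    {
        for (unsigned k = 2; k <= n; k *= 2) {
            for (unsigned j = k / 2; j > 0; j /= 2) {
                for (unsigned i = 0; i < n; i++) {
                    unsigned ixj = i ^ j;
                    if (ixj > i) {
                        int ascending = (i & k) == 0;
                        if ((ascending && a[i] > a[ixj]) ||
                            (!ascending && a[i] < a[ixj])) {
                            unsigned long long t = a[i];   /* compare-exchange */
                            a[i] = a[ixj];
                            a[ixj] = t;
                        }
                    }
                }
            }
        }
    }

    int main(void)
    {
        unsigned long long a[8] = {5, 3, 8, 1, 9, 2, 7, 4};
        bitonic_sort(a, 8);
        for (unsigned i = 0; i < 8; i++)
            printf("%llu ", a[i]);   /* 1 2 3 4 5 7 8 9 */
        printf("\n");
        return 0;
    }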
+
+
+
+
+void DO_Create_Separate_BLS_Work(global struct VContextScheduler* scheduler, global struct MSBRadixContext* context, global ulong* input)
+{
+ uint lid = get_local_id(0);
+
+ uint start = context->start[lid];
+ uint count = context->count[lid];
+ uint start_offset = context->start_offset + start;
+
+ struct BLSDispatchRecord record;
+ record.start_offset = start_offset;
+ record.count = count;
+ record.keys_in = context->keys_out;
+
+ if (count == 0) // we don't have elements so don't do anything
+ {
+ }
+ else if (count == 1) // single element so just write it out
+ {
+ input[start_offset] = ((global ulong*)record.keys_in)[start_offset];
+ }
+ else if (count <= BOTTOM_LEVEL_SORT_THRESHOLD)
+ {
+ BLSDispatchQueue_push((global struct BLSDispatchQueue*)scheduler->next_bls_queue, &record);
+ }
+}
+
+
+
+
+// We try to merge small BLS into larger one within the sub_group
+void DO_Create_SG_Merged_BLS_Work_Parallel(global struct VContextScheduler* scheduler, global struct MSBRadixContext* context, global ulong* input)
+{
+ uint lid = get_local_id(0);
+ uint sid = get_sub_group_local_id();
+
+ uint create_msb_work = context->count[lid] > BOTTOM_LEVEL_SORT_THRESHOLD ? 1 : 0;
+
+ uint start = context->start[lid];
+ uint count = context->count[lid];
+ uint ctx_start_offset = context->start_offset;
+
+ if (sid == 0 || create_msb_work) // these SIMD lanes are the beginning of a merged BLS
+ {
+ struct BLSDispatchRecord record;
+ if (create_msb_work)
+ {
+ record.start_offset = ctx_start_offset + start + count;
+ record.count = 0;
+ }
+ else // SIMD lane 0 case
+ {
+ record.start_offset = ctx_start_offset + start;
+ record.count = count;
+ }
+
+ record.keys_in = context->keys_out;
+
+ uint loop_idx = 1;
+ while (sid + loop_idx < 16) // loop over subgroup
+ {
+ uint _create_msb_work = intel_sub_group_shuffle_down(create_msb_work, 0u, loop_idx);
+ uint _count = intel_sub_group_shuffle_down(count, 0u, loop_idx);
+ uint _start = intel_sub_group_shuffle_down(start, 0u, loop_idx);
+
+ if (_create_msb_work) // found the next MSB work, so the merge range ends
+ break;
+
+ // need to push record since nothing more will fit
+ if (record.count + _count > BOTTOM_LEVEL_SORT_MERGING_THRESHOLD)
+ {
+ if (record.count == 1)
+ {
+ input[record.start_offset] = record.keys_in[record.start_offset];
+ }
+ else if (record.count > 1)
+ {
+ BLSDispatchQueue_push((global struct BLSDispatchQueue*)scheduler->next_bls_queue, &record);
+ }
+ record.start_offset = ctx_start_offset + _start;
+ record.count = _count;
+ }
+ else
+ {
+ record.count += _count;
+ }
+ loop_idx++;
+ }
+ // if we have any elements left, then schedule them
+ if (record.count == 1) // only one element, so just write it out
+ {
+ input[record.start_offset] = record.keys_in[record.start_offset];
+ }
+ else if (record.count > 1)
+ {
+ BLSDispatchQueue_push((global struct BLSDispatchQueue*)scheduler->next_bls_queue, &record);
+ }
+ }
+}
+
+
+
+
+// We try to merge small BLS into larger one within the sub_group
+void DO_Create_SG_Merged_BLS_Work(global struct VContextScheduler* scheduler, global struct MSBRadixContext* context, global ulong* input)
+{
+ uint lid = get_local_id(0);
+ uint sid = get_sub_group_local_id();
+
+ uint create_msb_work = context->count[lid] > BOTTOM_LEVEL_SORT_THRESHOLD ? 1 : 0;
+
+ uint start = context->start[lid];
+ uint count = context->count[lid];
+ uint ctx_start_offset = context->start_offset;
+
+ if (sid == 0)
+ {
+ struct BLSDispatchRecord record;
+ record.start_offset = ctx_start_offset + start;
+ record.count = 0;
+ record.keys_in = context->keys_out;
+
+ for (int i = 0; i < 16; i++)
+ {
+ uint _create_msb_work = sub_group_broadcast(create_msb_work, i);
+ uint _count = sub_group_broadcast(count, i);
+ uint _start = sub_group_broadcast(start, i);
+ if (_create_msb_work)
+ {
+ if (record.count == 1) // only one element, so just write it out
+ {
+ input[record.start_offset] = record.keys_in[record.start_offset];
+ }
+ else if (record.count > 1)
+ {
+ BLSDispatchQueue_push((global struct BLSDispatchQueue*)scheduler->next_bls_queue, &record);
+ }
+ record.start_offset = ctx_start_offset + _start + _count;
+ record.count = 0;
+ continue;
+ }
+ // need to push record since nothing more will fit
+ if (record.count + _count > BOTTOM_LEVEL_SORT_MERGING_THRESHOLD)
+ {
+ BLSDispatchQueue_push((global struct BLSDispatchQueue*)scheduler->next_bls_queue, &record);
+ record.start_offset = ctx_start_offset + _start;
+ record.count = _count;
+ }
+ else
+ {
+ record.count += _count;
+ }
+ }
+ // if we have any elements left, then schedule them
+ if (record.count == 1) // only one element, so just write it out
+ {
+ input[record.start_offset] = record.keys_in[record.start_offset];
+ }
+ else if (record.count > 1)
+ {
+ BLSDispatchQueue_push((global struct BLSDispatchQueue*)scheduler->next_bls_queue, &record);
+ }
+ }
+}
+
+
+
+
+void DO_Create_Work(global struct VContextScheduler* scheduler, global struct MSBRadixContext* context, global ulong* input, local uint* slm_for_wg_scan, uint sg_size, uint wg_size)
+{
+ uint lid = get_local_id(0);
+
+ uint iteration = context->iteration + 1;
+ uint start = context->start[lid];
+ uint count = context->count[lid];
+ uint start_offset = context->start_offset + start;
+
+ uint create_msb_work = count > BOTTOM_LEVEL_SORT_THRESHOLD ? 1 : 0;
+
+#if MERGE_BLS_WITHIN_SG
+ DO_Create_SG_Merged_BLS_Work_Parallel(scheduler, context, input);
+#else
+ DO_Create_Separate_BLS_Work(scheduler, context, input);
+#endif
+
+ uint new_entry_id = wg_scan_inclusive_add_opt(slm_for_wg_scan, create_msb_work, sg_size, wg_size);//work_group_scan_inclusive_add(create_msb_work);
+ uint stack_begin_entry;
+ // the last workitem in the wg holds the total number of new entries
+ if (lid == (MSB_RADIX_NUM_BINS - 1))
+ {
+ stack_begin_entry = atomic_add_global(&scheduler->msb_stack.num_entries, new_entry_id);
+ }
+ stack_begin_entry = work_group_broadcast(stack_begin_entry, (MSB_RADIX_NUM_BINS - 1));
+ new_entry_id += stack_begin_entry - 1;
+
+
+ if (create_msb_work)
+ {
+ scheduler->msb_stack.entries[new_entry_id].start_offset = start_offset;
+ scheduler->msb_stack.entries[new_entry_id].count = count;
+ scheduler->msb_stack.entries[new_entry_id].iteration = iteration;
+ }
+
+ if (lid == 0) {
+ DEBUG_CODE(printf("num of new bls: %d\n", scheduler->next_bls_queue->num_records));
+ }
+}
+
+
+struct BatchedBLSDispatchEntry
+{
+ /////////////////////////////////////////////////////////////
+ // State data used for communication with command streamer
+ // NOTE: This part must match definition in 'msb_radix_bitonic_sort.grl'
+ /////////////////////////////////////////////////////////////
+ qword p_data_buffer;
+ qword num_elements; // number of elements in p_data_buffer
+};
+
+
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(BOTTOM_LEVEL_SORT_WG_SIZE, 1, 1)))
+__attribute__((intel_reqd_sub_group_size(16)))
+void kernel sort_morton_codes_batched_BLS_dispatch(global struct BatchedBLSDispatchEntry* bls_dispatches)
+{
+ uint dispatch_id = get_group_id(0);
+ uint lid = get_local_id(0);
+
+ local ulong SLM_shared[BOTTOM_LEVEL_SORT_THRESHOLD];
+
+ struct BatchedBLSDispatchEntry dispatchArgs = bls_dispatches[dispatch_id];
+ struct BLSDispatchRecord dispatchRecord;
+ dispatchRecord.start_offset = 0;
+ dispatchRecord.count = dispatchArgs.num_elements;
+ dispatchRecord.keys_in = (ulong*)dispatchArgs.p_data_buffer;
+
+ DEBUG_CODE(if (lid == 0) printf("running sort_morton_codes_batched_BLS_dispatch for %d elements\n", dispatchRecord.count));
+
+ if(dispatchRecord.count > 1)
+ DO_Bitonic(dispatchRecord, SLM_shared, (global ulong*)dispatchRecord.keys_in);
+}
+
+
+
+
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(BOTTOM_LEVEL_SORT_WG_SIZE, 1, 1)))
+__attribute__((intel_reqd_sub_group_size(16)))
+void kernel sort_morton_codes_bottom_level_single_wg(global struct Globals* globals, global ulong* input, global ulong* output)
+{
+ uint lid = get_local_id(0);
+
+ DEBUG_CODE(if (lid == 0) printf("running sort_morton_codes_bottom_level_single_wg for %d elements\n", globals->numPrimitives));
+
+ local ulong SLM_shared[BOTTOM_LEVEL_SORT_THRESHOLD];
+
+ struct BLSDispatchRecord dispatchRecord;
+ dispatchRecord.start_offset = 0;
+ dispatchRecord.count = globals->numPrimitives;
+ dispatchRecord.keys_in = (ulong*)input;
+
+ //TODO: count or bitonic here?
+ //DO_Bitonic(dispatchRecord, SLM_shared, output);
+ DO_CountSort(dispatchRecord, SLM_shared, output);
+}
+
+
+
+
+// This kernel initializes first context to start up the whole execution
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(MSB_RADIX_NUM_BINS, 1, 1)))
+__attribute__((intel_reqd_sub_group_size(16)))
+void kernel sort_morton_codes_msb_begin(
+ global struct Globals* globals,
+ global struct VContextScheduler* scheduler,
+ global ulong* buf0,
+ global ulong* buf1)
+{
+ uint lid = get_local_id(0);
+ uint gid = get_group_id(0);
+
+ DEBUG_CODE(if (lid == 0)printf("running sort_morton_codes_msb_begin\n"));
+
+ scheduler->contexts[gid].count[lid] = 0;
+
+ if (gid == 0 && lid == 0)
+ {
+ global struct MSBRadixContext* context = &scheduler->contexts[lid];
+ const uint num_prims = globals->numPrimitives;
+
+ scheduler->bls_queue0.num_records = 0;
+ scheduler->bls_queue1.num_records = 0;
+
+ scheduler->curr_bls_queue = &scheduler->bls_queue1;
+ scheduler->next_bls_queue = &scheduler->bls_queue0;
+
+ context->start_offset = 0;
+ context->num_wgs_in_flight = 0;
+ context->num_keys = num_prims;
+ context->iteration = 0;
+ context->keys_in = buf0;
+ context->keys_out = buf1;
+
+ uint msb_wgs_to_dispatch = (num_prims + MSB_WG_SORT_ELEMENTS_THRESHOLD - 1) / MSB_WG_SORT_ELEMENTS_THRESHOLD;
+ scheduler->msb_queue.records[0].wgs_to_dispatch = msb_wgs_to_dispatch;
+
+ scheduler->num_wgs_msb = msb_wgs_to_dispatch;
+ scheduler->num_wgs_bls = 0;
+ scheduler->msb_stack.num_entries = 0;
+ scheduler->msb_queue.num_records = 1;
+ }
+}
+
+
+
+
+__attribute__((reqd_work_group_size(MSB_RADIX_NUM_VCONTEXTS, 1, 1)))
+kernel void
+scheduler(global struct VContextScheduler* scheduler, global ulong* buf0, global ulong* buf1)
+{
+ uint lid = get_local_id(0);
+
+ DEBUG_CODE(if (lid == 0) printf("running sort_morton_codes_scheduler\n"));
+
+ uint context_idx = lid;
+
+ const uint num_of_stack_entries = scheduler->msb_stack.num_entries;
+
+ uint msb_wgs_to_dispatch = 0;
+ if (lid < num_of_stack_entries)
+ {
+ struct MSBStackEntry entry = scheduler->msb_stack.entries[(num_of_stack_entries-1) - lid];
+ global struct MSBRadixContext* context = &scheduler->contexts[lid];
+ context->start_offset = entry.start_offset;
+ context->num_wgs_in_flight = 0;
+ context->num_keys = entry.count;
+ context->iteration = entry.iteration;
+ context->keys_in = entry.iteration % 2 == 0 ? buf0 : buf1;
+ context->keys_out = entry.iteration % 2 == 0 ? buf1 : buf0;
+
+ msb_wgs_to_dispatch = (entry.count + MSB_WG_SORT_ELEMENTS_THRESHOLD - 1) / MSB_WG_SORT_ELEMENTS_THRESHOLD;
+ scheduler->msb_queue.records[lid].wgs_to_dispatch = msb_wgs_to_dispatch;
+ }
+
+ msb_wgs_to_dispatch = work_group_reduce_add(msb_wgs_to_dispatch); // TODO: if the compiler implementation is slow, consider writing this manually
+
+ if (lid == 0)
+ {
+ // swap queue for next iteration
+ struct BLSDispatchQueue* tmp = scheduler->curr_bls_queue;
+ scheduler->curr_bls_queue = scheduler->next_bls_queue;
+ scheduler->next_bls_queue = tmp;
+
+ scheduler->next_bls_queue->num_records = 0;
+
+ scheduler->num_wgs_bls = scheduler->curr_bls_queue->num_records;
+ scheduler->num_wgs_msb = msb_wgs_to_dispatch;
+
+ if (num_of_stack_entries < MSB_RADIX_NUM_VCONTEXTS)
+ {
+ scheduler->msb_queue.num_records = num_of_stack_entries;
+ scheduler->msb_stack.num_entries = 0;
+ }
+ else
+ {
+ scheduler->msb_queue.num_records = MSB_RADIX_NUM_VCONTEXTS;
+ scheduler->msb_stack.num_entries -= MSB_RADIX_NUM_VCONTEXTS;
+ }
+ }
+
+ DEBUG_CODE(if (lid == 0) printf("running sort_morton_codes_scheduler finished, to spawn %d MSB wgs in %d contexts and %d BLS wgs, MSB records on stack %d\n",
+ scheduler->num_wgs_msb, scheduler->msb_queue.num_records, scheduler->num_wgs_bls, scheduler->msb_stack.num_entries));
+}
+
+
+
+
+// this is the lowest-level sub-task, which produces the final sorted codes
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(BOTTOM_LEVEL_SORT_WG_SIZE, 1, 1)))
+__attribute__((intel_reqd_sub_group_size(16)))
+void kernel sort_morton_codes_bottom_level( global struct VContextScheduler* scheduler, global ulong* output)
+{
+ uint lid = get_local_id(0);
+
+ DEBUG_CODE(if (get_group_id(0) == 0 && lid == 0) printf("running sort_morton_codes_bottom_level\n"));
+
+ local struct BLSDispatchRecord l_dispatchRecord;
+ if (lid == 0)
+ {
+ uint record_idx = get_group_id(0);
+ l_dispatchRecord = scheduler->curr_bls_queue->records[record_idx];
+ //l_dispatchRecord = BLSDispatchQueue_pop((global struct BLSDispatchQueue*)scheduler->curr_bls_queue);
+ atomic_dec_global(&scheduler->num_wgs_bls);
+ }
+
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ struct BLSDispatchRecord dispatchRecord = l_dispatchRecord;
+
+ local ulong SLM_shared[BOTTOM_LEVEL_SORT_THRESHOLD];
+
+ // right now only a single sort variant is used here (count sort; the bitonic call is kept commented out as an alternative)
+ // TODO: maybe implement something else
+ if (1)
+ {
+ //DO_Bitonic(dispatchRecord, SLM_shared, output);
+ DO_CountSort(dispatchRecord, SLM_shared, output);
+ }
+}
+
+
+
+
+#define MSB_COUNT_WG_SIZE MSB_RADIX_NUM_BINS
+#define MSB_COUNT_SG_SIZE 16
+
+// count how many elements per bucket we have
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(MSB_COUNT_WG_SIZE, 1, 1)))
+__attribute__((intel_reqd_sub_group_size(MSB_COUNT_SG_SIZE)))
+void kernel sort_morton_codes_msb_count_items( global struct VContextScheduler* scheduler)
+{
+ uint lid = get_local_id(0);
+ uint lsz = MSB_RADIX_NUM_BINS;
+
+ DEBUG_CODE(if (lid == 0) printf("running sort_morton_codes_msb_count_items\n"));
+
+ local uint bucket_count[MSB_RADIX_NUM_BINS];
+ local uint finish_count;
+ bucket_count[lid] = 0;
+ if (lid == 0)
+ {
+ finish_count = 0;
+ }
+
+ struct MSBDispatchArgs dispatchArgs = get_msb_dispatch_args(scheduler);
+
+ global struct MSBRadixContext* context = dispatchArgs.context;
+
+ global ulong* key_start = (global ulong*)dispatchArgs.wg_key_start + lid;
+ global ulong* key_end = (global ulong*)dispatchArgs.wg_key_end;
+ uint shift_bit = dispatchArgs.shift_bit;
+ uchar shift_byte = shift_bit / 8; // convert the bit shift into a byte offset within the key
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ global uchar* ks = (global uchar*)key_start;
+ ks += shift_byte;
+ global uchar* ke = (global uchar*)key_end;
+ ke += shift_byte;
+
+ // double buffering on value loading
+ if (ks < ke)
+ {
+ uchar bucket_id = *ks;
+ ks += lsz * sizeof(ulong);
+
+ for (global uchar* k = ks; k < ke; k += lsz * sizeof(ulong))
+ {
+ uchar next_bucket_id = *k;
+ atomic_inc_local(&bucket_count[bucket_id]);
+ bucket_id = next_bucket_id;
+ }
+
+ atomic_inc_local(&bucket_count[bucket_id]);
+
+ }
+
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ //update global counters for context
+ uint count = bucket_count[lid];
+ if (count > 0)
+ atomic_add_global(&context->count[lid], bucket_count[lid]);
+
+ mem_fence_gpu_invalidate();
+ work_group_barrier(0);
+
+ bool final_wg = true;
+ // count WGs which have reached the end
+ if (dispatchArgs.num_of_wgs > 1)
+ {
+ if (lid == 0)
+ finish_count = atomic_inc_global(&context->num_wgs_in_flight) + 1;
+
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ final_wg = finish_count == dispatchArgs.num_of_wgs;
+ }
+
+ local uint partial_dispatches[MSB_COUNT_WG_SIZE / MSB_COUNT_SG_SIZE];
+ // if this is last wg for current dispatch, update context
+ if (final_wg)
+ {
+ // code below does work_group_scan_exclusive_add(context->count[lid]);
+ {
+ uint lane_val = context->count[lid];
+ uint sg_result = sub_group_scan_inclusive_add(lane_val);
+
+ partial_dispatches[get_sub_group_id()] = sub_group_broadcast(sg_result, MSB_COUNT_SG_SIZE - 1);
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ uint slm_result = sub_group_scan_exclusive_add(partial_dispatches[get_sub_group_local_id()]);
+ slm_result = sub_group_broadcast(slm_result, get_sub_group_id());
+ uint result = slm_result + sg_result - lane_val;
+ context->start[lid] = result;//work_group_scan_exclusive_add(context->count[lid]);
+ }
+
+ context->count[lid] = 0;
+ if(lid == 0)
+ context->num_wgs_in_flight = 0;
+ }
+}
+
+
+
+
+// sort elements into appropriate buckets
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(MSB_RADIX_NUM_BINS, 1, 1)))
+__attribute__((intel_reqd_sub_group_size(16)))
+void kernel sort_morton_codes_msb_bin_items(
+ global struct VContextScheduler* scheduler, global ulong* input)
+{
+ uint lid = get_local_id(0);
+ uint lsz = get_local_size(0);
+
+ DEBUG_CODE(if (lid == 0) printf("running sort_morton_codes_msb_bin_items\n"));
+
+ local uint finish_count;
+ if (lid == 0)
+ {
+ finish_count = 0;
+ }
+
+ struct MSBDispatchArgs dispatchArgs = get_msb_dispatch_args(scheduler);
+ global struct MSBRadixContext* context = dispatchArgs.context;
+
+ global ulong* key_start = (global ulong*)dispatchArgs.wg_key_start + lid;
+ global ulong* key_end = (global ulong*)dispatchArgs.wg_key_end;
+ uint shift_bit = dispatchArgs.shift_bit;
+
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ global ulong* sorted_keys = (global ulong*)context->keys_out + context->start_offset;
+
+#if MSB_RADIX_NUM_BINS == MSB_WG_SORT_ELEMENTS_THRESHOLD // special case meaning that we process exactly 1 element per workitem
+ // here we'll do local counting, then move to global
+
+ local uint slm_counters[MSB_RADIX_NUM_BINS];
+ slm_counters[lid] = 0;
+
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ uint place_in_slm_bucket;
+ uint bucket_id;
+ ulong val;
+
+ bool active_lane = key_start < key_end;
+
+ if (active_lane)
+ {
+ val = *key_start;
+
+ bucket_id = (val >> (ulong)shift_bit) & (MSB_RADIX_NUM_BINS - 1);
+ place_in_slm_bucket = atomic_inc_local(&slm_counters[bucket_id]);
+ }
+
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ // overwrite slm_counters with the bins' global base offsets - bins with 0 elements can be skipped since they won't be used anyway
+ if (slm_counters[lid])
+ slm_counters[lid] = atomic_add_global(&context->count[lid], slm_counters[lid]);
+
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ uint id_in_bucket = slm_counters[bucket_id] + place_in_slm_bucket;//atomic_inc_global(&context->count[bucket_id]);
+
+ if (active_lane)
+ sorted_keys[context->start[bucket_id] + id_in_bucket] = val;
+#else
+ // double buffering on value loading
+ if (key_start < key_end)
+ {
+ ulong val = *key_start;
+ key_start += lsz;
+
+ for (global ulong* k = key_start; k < key_end; k += lsz)
+ {
+ ulong next_val = *k;
+ uint bucket_id = (val >> (ulong)shift_bit) & (MSB_RADIX_NUM_BINS - 1);
+ uint id_in_bucket = atomic_inc_global(&context->count[bucket_id]);
+
+ //printf("dec: %llu, val: %llX bucket_id: %X", *k, *k, bucket_id);
+ sorted_keys[context->start[bucket_id] + id_in_bucket] = val;
+
+ val = next_val;
+ }
+
+ uint bucket_id = (val >> (ulong)shift_bit) & (MSB_RADIX_NUM_BINS - 1);
+ uint id_in_bucket = atomic_inc_global(&context->count[bucket_id]);
+
+ sorted_keys[context->start[bucket_id] + id_in_bucket] = val;
+ }
+#endif
+
+ // make sure all groups' "counters" and "starts" are visible to the final workgroup
+ mem_fence_gpu_invalidate();
+ work_group_barrier(0);
+
+ bool final_wg = true;
+ // count WGs which have reached the end
+ if (dispatchArgs.num_of_wgs > 1)
+ {
+ if (lid == 0)
+ finish_count = atomic_inc_global(&context->num_wgs_in_flight) + 1;
+
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ final_wg = finish_count == dispatchArgs.num_of_wgs;
+ }
+
+ local uint slm_for_wg_funcs[MSB_COUNT_WG_SIZE / MSB_COUNT_SG_SIZE];
+ // if this is last wg for current dispatch, then prepare sub-tasks
+ if (final_wg)
+ {
+ DO_Create_Work(scheduler, context, input, slm_for_wg_funcs, 16, MSB_RADIX_NUM_BINS);
+
+ // clear context's counters for future execution
+ context->count[lid] = 0;
+ }
+
+} \ No newline at end of file
diff --git a/src/intel/vulkan/grl/gpu/morton_msb_radix_bitonic_sort_shared.h b/src/intel/vulkan/grl/gpu/morton_msb_radix_bitonic_sort_shared.h
new file mode 100644
index 00000000000..c2ab0d4a2c9
--- /dev/null
+++ b/src/intel/vulkan/grl/gpu/morton_msb_radix_bitonic_sort_shared.h
@@ -0,0 +1,135 @@
+//
+// Copyright (C) 2009-2021 Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+//
+//
+
+//
+// This file contains structure definitions shared by GRL OCL kernels and host code
+//
+
+#pragma once
+
+#include "GRLGen12.h"
+
+// NOTE:
+// MSB (Most Significant Byte) - the part of the sort that performs MSB radix sorting and can spawn additional work
+// BLS (Bottom Level Sort) - the final part of sorting a particular range (currently bitonic), which cannot spawn additional work
+//
+
+#define MSB_RADIX_NUM_BINS 256
+#define MSB_BITS_PER_ITERATION 8 // how many bits are sorted per iteration
+#define MSB_SHIFT_BYTE_START_OFFSET 56 // start offset for byte shifting, first iteration will start from here
+
+#define MSB_RADIX_NUM_VCONTEXTS 8 // NOTE: mkulikow: maybe expand/shrink? A larger value means more MSB work processed in parallel but more memory used
+
+#define MSB_STACK_ENTRIES_NUM (MSB_RADIX_NUM_VCONTEXTS * MSB_RADIX_NUM_BINS * 7) // the first level doesn't get spawned, so 7 iterations must fit here;
+// since one algorithm iteration can spawn at most MSB_RADIX_NUM_VCONTEXTS * MSB_RADIX_NUM_BINS entries, we need 7 times that
+
+#define MSB_DISPATCH_QUEUE_NUM_RECORDS (MSB_RADIX_NUM_VCONTEXTS) // one per context
+
+#define BLS_DISPATCH_QUEUE_NUM_RECORDS (MSB_RADIX_NUM_VCONTEXTS * MSB_RADIX_NUM_BINS) // each context can spawn MSB_RADIX_NUM_BINS,
+// so at max one algorithm iteration can spawn MSB_RADIX_NUM_VCONTEXTS * MSB_RADIX_NUM_BINS
+
+#define MSB_WG_SORT_ELEMENTS_THRESHOLD 256 // This tells us how many elements at max we can process in a single workgroup.
+ // If a single MSB entry needs more, then it will spawn more WGs
+ // after updating this, the computation of the initial workgroup count in msb_radix_bitonic_sort.grl also needs to be updated
+
+#define BOTTOM_LEVEL_SORT_THRESHOLD 512 // TODO: is 4096 the best value? On SKL it gives the best performance
+// Right now we use 256 workitems in SIMD16, which gives us 16 hw threads; assuming 2KB per thread, we have 32KB of SLM to play with.
+// Since we use ulong (8 bytes) we can store 4096 elements.
+// This also tells us that if the number of elements to sort is less than this, we don't need to allocate the scheduler.
+// Needs to be kept in sync with the GRL constant BOTTOM_LEVEL_SORT_THRESHOLD
+
+#define BOTTOM_LEVEL_SORT_MERGING_THRESHOLD 512 // This is the limit up to which small BLSes produced by MSB are merged into a single bigger BLS
+
+GRL_NAMESPACE_BEGIN(GRL)
+
+
+
+
+GRL_NAMESPACE_BEGIN(RTAS)
+GRL_NAMESPACE_BEGIN(MORTON_MSB_RADIX_BITONIC_SORT)
+
+struct MSBStackEntry
+{
+ uint start_offset;
+ uint count;
+ uint iteration;
+};
+
+struct MSBStack
+{
+ dword num_entries;
+ struct MSBStackEntry entries[MSB_STACK_ENTRIES_NUM];
+};
+
+struct MSBRadixContext
+{
+ uint start[MSB_RADIX_NUM_BINS];
+ uint count[MSB_RADIX_NUM_BINS];
+ uint num_wgs_in_flight; // this is used to identify which msb wg is last
+ uint num_keys; // number of keys to process
+ uint iteration;
+ ulong* keys_in;
+ ulong* keys_out;
+
+ uint start_offset; //offset from the beginning of the buffer
+};
+
+struct MSBDispatchRecord
+{
+ uint wgs_to_dispatch; // number of workgroups to dispatch for this record
+};
+
+struct MSBDispatchQueue
+{
+ dword num_records;
+ struct MSBDispatchRecord records[MSB_RADIX_NUM_VCONTEXTS]; // each context have its own record
+};
+
+// BLS(Bottom Level Sort) - last stage of sorting which will not spawn any new tasks
+struct BLSDispatchRecord
+{
+ uint start_offset; // offset from the beginning of the buffer
+ uint count;
+ ulong* keys_in; // we don't need keys_out since we always write to the same output buffer
+};
+
+struct BLSDispatchQueue
+{
+ dword num_records;
+ struct BLSDispatchRecord records[BLS_DISPATCH_QUEUE_NUM_RECORDS];
+};
+
+struct VContextScheduler
+{
+ /////////////////////////////////////////////////////////////
+ // State data used for communication with command streamer
+ // NOTE: This part must match definition in 'msb_radix_bitonic_sort.grl'
+ /////////////////////////////////////////////////////////////
+
+ dword num_wgs_msb; // number of MSB workgroups being processed by current iteration
+ dword num_wgs_bls; // number of BLS workgroups being processed by current iteration
+
+ dword scheduler_postsync;
+ dword _pad1;
+
+ /////////////////////////////////////////////////////////////
+
+ struct MSBDispatchQueue msb_queue;
+ struct BLSDispatchQueue bls_queue0;
+ struct BLSDispatchQueue bls_queue1;
+
+ struct BLSDispatchQueue* curr_bls_queue;
+ struct BLSDispatchQueue* next_bls_queue;
+
+ struct MSBStack msb_stack;
+
+ struct MSBRadixContext contexts[MSB_RADIX_NUM_VCONTEXTS];
+};
+
+GRL_NAMESPACE_END(MORTON_MSB_RADIX_BITONIC_SORT)
+GRL_NAMESPACE_END(RTAS)
+GRL_NAMESPACE_END(GRL)
diff --git a/src/intel/vulkan/grl/gpu/morton_radix_sort.cl b/src/intel/vulkan/grl/gpu/morton_radix_sort.cl
new file mode 100644
index 00000000000..e123b2f46d3
--- /dev/null
+++ b/src/intel/vulkan/grl/gpu/morton_radix_sort.cl
@@ -0,0 +1,9 @@
+//
+// Copyright (C) 2009-2021 Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+//
+//
+
+// just inlines the kernels defined in the header
+#include "morton_radix_sort.h"
diff --git a/src/intel/vulkan/grl/gpu/morton_radix_sort.h b/src/intel/vulkan/grl/gpu/morton_radix_sort.h
new file mode 100644
index 00000000000..d58ec829883
--- /dev/null
+++ b/src/intel/vulkan/grl/gpu/morton_radix_sort.h
@@ -0,0 +1,855 @@
+//
+// Copyright (C) 2009-2021 Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+//
+//
+
+#pragma once
+
+#include "common.h"
+#include "libs/lsc_intrinsics.h"
+
+/* ============================================================================= */
+/* ============================== LSB RADIX SORT =============================== */
+/* ============================================================================= */
+
+#define RADIX_BINS 256
+#define SCATTER_WG_SIZE 512
+#define MORTON_LSB_SORT_NO_SHIFT_THRESHOLD 0xFFFFFFFF // turned off, because the current hierarchy build requires a full sort
+
+uint2 get_thread_range( uint numItems, uint numGroups, uint taskID )
+{
+ uint items_per_group = (numItems / numGroups);
+ uint remainder = numItems - (items_per_group * numGroups);
+ uint startID = taskID * items_per_group + min(taskID, remainder);
+ uint endID = startID + items_per_group + ((taskID < remainder) ? 1 : 0);
+
+ return (uint2)(startID,endID);
+}
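get_thread_range distributes numItems over numGroups as evenly as possible, with the first 'remainder' groups taking one extra item. A standalone C sketch with illustrative sizes (not part of the diff):

    #include <stdio.h>

    int main(void)
    {
        unsigned numItems = 10, numGroups = 4;
        unsigned per_group = numItems / numGroups;               /* 2 */
        unsigned remainder = numItems - per_group * numGroups;   /* 2 */
        for (unsigned task = 0; task < numGroups; task++) {
            unsigned start = task * per_group + (task < remainder ? task : remainder);
            unsigned end   = start + per_group + (task < remainder ? 1 : 0);
            printf("task %u: [%u, %u)\n", task, start, end);     /* [0,3) [3,6) [6,8) [8,10) */
        }
        return 0;
    }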
+
+GRL_INLINE void sort_morton_codes_bin_items_taskID_func(global struct Globals* globals,
+ global uint* global_histogram,
+ global uchar* input,
+ local uint* histogram,
+ uint iteration,
+ uint numGroups,
+ uint numItems,
+ bool shift_primID,
+ uint taskID,
+ uint startID,
+ uint endID)
+{
+ const uint shift = globals->shift;
+
+ for (uint i = get_local_id(0); i < RADIX_BINS; i += get_local_size(0))
+ histogram[i] = 0;
+
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ if (shift_primID)
+ {
+ for (uint i = startID + get_local_id(0); i < endID; i += get_local_size(0))
+ {
+ // Read the input as ulong and shift it so that the bits representing primID are not
+ // taken into account during sorting; this results in fewer sort iterations when
+ // the morton shift is larger than 8 bits
+ ulong* ptr_ul = (ulong*)&input[8 * i];
+ ulong code = *ptr_ul;
+ uchar* ptr = (uchar*)&code;
+ code >>= shift;
+
+ uchar bin = ptr[iteration];
+ atomic_inc_local(&histogram[bin]);
+ }
+ }
+ else
+ {
+ for (uint i = startID + get_local_id(0); i < endID; i += get_local_size(0))
+ {
+ uchar bin = input[8 * i + iteration];
+ atomic_inc_local(&histogram[bin]);
+ }
+ }
+
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ for (uint i = get_local_id(0); i < RADIX_BINS; i += get_local_size(0))
+ global_histogram[RADIX_BINS * taskID + i] = histogram[i];
+}
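The shift handling above relies on the index-code layout built by the morton-code kernel: the morton code sits in the high bits and the primitive ID in the low 'shift' bits, so shifting right before extracting the radix byte keeps primID out of the binning and zeroes the topmost bits, which is what allows LSB passes to be skipped. A standalone C sketch with illustrative bit counts (not part of the diff):

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        unsigned shift = 20;                        /* bits reserved for primID (illustrative) */
        uint64_t morton = 0x123456789ull;
        uint64_t primID = 0x54321ull;               /* fits in 20 bits */
        uint64_t index_code = (morton << shift) | primID;

        uint64_t for_sorting = index_code >> shift; /* == morton, primID dropped */
        unsigned byte0 = (unsigned)(for_sorting & 0xFF);
        printf("sort key 0x%llx, lowest radix byte 0x%02x\n",
               (unsigned long long)for_sorting, byte0);
        return 0;
    }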
+
+GRL_INLINE void sort_morton_codes_bin_items_func(global struct Globals* globals,
+ global uint* global_histogram,
+ global uint* wg_flags,
+ global uchar* input,
+ local uint* histogram,
+ uint iteration,
+ uint numGroups,
+ uint numItems,
+ bool shift_primID,
+ bool update_wg_flags)
+{
+ if (shift_primID)
+ {
+ // This check is present in the other LSB sort functions as well; its purpose is
+ // to skip the first n iterations, where n is the difference between the maximum number
+ // of iterations and the number actually needed to sort without primIDs
+ const uint req_iterations = globals->sort_iterations;
+ if (iteration < req_iterations)
+ return;
+
+ // iteration needs to be adjusted to reflect the skipped cycles
+ iteration -= req_iterations;
+ }
+
+ const uint taskID = get_group_id(0);
+
+ if (taskID == 0 && update_wg_flags)
+ {
+ for (uint i = get_local_id(0); i < RADIX_BINS; i += get_local_size(0))
+ wg_flags[i] = 0;
+ }
+
+ uint2 ids = get_thread_range(numItems, numGroups, taskID);
+ uint startID = ids.x;
+ uint endID = ids.y;
+
+ sort_morton_codes_bin_items_taskID_func(globals, global_histogram, input, histogram, iteration, numGroups, numItems, shift_primID,
+ taskID, startID, endID);
+}
+
+__attribute__((reqd_work_group_size(512, 1, 1)))
+void kernel
+sort_morton_codes_bin_items(
+ global struct Globals* globals,
+ global uint* global_histogram,
+ global uint* wg_flags,
+ global uchar* input,
+ uint iteration,
+ uint numGroups,
+ uint update_wg_flags
+)
+{
+ local uint histogram[RADIX_BINS];
+ const uint numItems = globals->numPrimitives;
+ if(numItems < MORTON_LSB_SORT_NO_SHIFT_THRESHOLD)
+ sort_morton_codes_bin_items_func(globals, global_histogram, wg_flags, input, histogram, iteration, numGroups, numItems, false, update_wg_flags);
+ else
+ sort_morton_codes_bin_items_func(globals, global_histogram, wg_flags, input, histogram, iteration, numGroups, numItems, true, update_wg_flags);
+}
+
+
+GRL_INLINE void sort_morton_codes_reduce_bins_func(global struct Globals* globals,
+ global uint* global_histogram,
+ local uint* partials,
+ uint numTasks,
+ uint iteration,
+ bool shift_primID)
+{
+ const uint localID = get_local_id(0);
+
+ if (shift_primID)
+ {
+ const uint req_iterations = globals->sort_iterations;
+ if (iteration < req_iterations)
+ return;
+ }
+
+ uint t = 0;
+ for (uint j = 0; j < numTasks; j++)
+ {
+ const uint count = load_uint_L1C_L3C(&global_histogram[RADIX_BINS * j + localID], 0);
+ store_uint_L1WB_L3WB(&global_histogram[RADIX_BINS * j + localID], 0, t);
+ t += count;
+ }
+
+ // each lane now contains the number of elements in the corresponding bin
+ // prefix sum this for use in the subsequent scattering pass.
+ uint global_count = t;
+
+ partials[get_sub_group_id()] = sub_group_reduce_add(global_count);
+
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ uint lane = get_sub_group_local_id();
+ uint p = partials[lane];
+ p = (lane < get_sub_group_id()) ? p : 0;
+
+ global_count = sub_group_reduce_add(p) + sub_group_scan_exclusive_add(global_count);
+
+ store_uint_L1WB_L3WB(&global_histogram[RADIX_BINS * numTasks + localID], 0, global_count);
+}
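reduce_bins converts the per-task histograms into offsets: within each bin, the per-task counts become exclusive offsets, and an extra row after the last task receives the exclusive prefix over the bin totals. A scalar C sketch with small illustrative sizes (not part of the diff):

    #include <stdio.h>

    #define BINS 4
    #define TASKS 2

    int main(void)
    {
        /* hist[task][bin] = how many keys task 'task' counted for 'bin' */
        unsigned hist[TASKS + 1][BINS] = { {1, 0, 2, 1}, {2, 1, 0, 3}, {0} };

        unsigned global_count[BINS];
        for (unsigned bin = 0; bin < BINS; bin++) {
            unsigned t = 0;
            for (unsigned task = 0; task < TASKS; task++) {
                unsigned count = hist[task][bin];
                hist[task][bin] = t;      /* per-task exclusive offset within this bin */
                t += count;
            }
            global_count[bin] = t;        /* total keys in this bin */
        }

        unsigned prefix = 0;
        for (unsigned bin = 0; bin < BINS; bin++) {
            hist[TASKS][bin] = prefix;    /* global start of this bin */
            prefix += global_count[bin];
        }

        for (unsigned bin = 0; bin < BINS; bin++)
            printf("bin %u: global start %u\n", bin, hist[TASKS][bin]);
        /* bin totals 3,1,2,4 -> global starts 0,3,4,6 */
        return 0;
    }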
+
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(256, 1, 1)))
+void kernel
+sort_morton_codes_reduce_bins(global struct Globals* globals,
+ uint numTasks,
+ global uint* global_histogram,
+ uint iteration)
+{
+ local uint partials[RADIX_BINS];
+ const uint numItems = globals->numPrimitives;
+ if (numItems < MORTON_LSB_SORT_NO_SHIFT_THRESHOLD)
+ sort_morton_codes_reduce_bins_func(globals, global_histogram, partials, numTasks, iteration, false);
+ else
+ sort_morton_codes_reduce_bins_func(globals, global_histogram, partials, numTasks, iteration, true);
+}
+
+
+#if 1
+GRL_INLINE void sort_morton_codes_scatter_items_func(
+ global struct Globals* globals,
+ global uint* global_histogram,
+ global ulong* input,
+ global ulong* output,
+ local uint* local_offset,
+ local uint* flags,
+ uint iteration,
+ uint numGroups,
+ uint numItems,
+ bool shift_primID,
+ bool update_morton_sort_in_flight)
+{
+ const uint gID = get_local_id(0) + get_group_id(0) * get_local_size(0);
+
+ const uint global_shift = globals->shift;
+ const uint localID = get_local_id(0);
+ const uint taskID = get_group_id(0);
+
+ if (gID == 0 && update_morton_sort_in_flight)
+ globals->morton_sort_in_flight = 0;
+
+ uint2 ids = get_thread_range(numItems, numGroups, taskID);
+ uint startID = ids.x;
+ uint endID = ids.y;
+
+ if (shift_primID)
+ {
+ const uint req_iterations = globals->sort_iterations;
+ if (iteration < req_iterations)
+ return;
+
+ iteration -= req_iterations;
+ }
+
+ const uint shift = 8 * iteration;
+
+ // load the global bin counts, and add each bin's global prefix
+ // to the local prefix
+ {
+ uint global_prefix = 0, local_prefix = 0;
+ if (localID < RADIX_BINS)
+ {
+ local_prefix = global_histogram[RADIX_BINS * taskID + localID];
+ global_prefix = global_histogram[RADIX_BINS * numGroups + localID];
+ local_offset[localID] = global_prefix + local_prefix;
+ }
+
+ barrier(CLK_LOCAL_MEM_FENCE);
+ }
+
+
+ // move elements in WG-sized chunks. The elements need to be moved sequentially (can't use atomics)
+ // because relative order has to be preserved for LSB radix sort to work
+
+ // For each bin, a bit vector indicating which elements are in the bin
+ for (uint block_base = startID; block_base < endID; block_base += get_local_size(0))
+ {
+ // initialize bit vectors
+ for (uint i = 4 * localID; i < RADIX_BINS * SCATTER_WG_SIZE / 32; i += 4 * get_local_size(0))
+ {
+ flags[i + 0] = 0;
+ flags[i + 1] = 0;
+ flags[i + 2] = 0;
+ flags[i + 3] = 0;
+ }
+
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ // read sort key, determine which bin it goes into, scatter into the bit vector
+ // and pre-load the local offset
+ uint ID = localID + block_base;
+ ulong key = 0;
+ uint bin_offset = 0;
+ uint bin = 0;
+ uint bin_word = localID / 32;
+ uint bin_bit = 1 << (localID % 32);
+
+ if (ID < endID)
+ {
+ key = input[ID];
+
+ if (shift_primID)
+ bin = ((key >> global_shift) >> shift) & (RADIX_BINS - 1);
+ else
+ bin = (key >> shift) & (RADIX_BINS - 1);
+
+ atomic_add_local(&flags[(SCATTER_WG_SIZE / 32) * bin + bin_word], bin_bit);
+ bin_offset = local_offset[bin];
+ }
+
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ if (ID < endID)
+ {
+ // each key reads the bit-vectors for its bin,
+ // - Computes local prefix sum to determine its output location
+ // - Computes number of items added to its bin (last thread adjusts bin position)
+ uint prefix = 0;
+ uint count = 0;
+ for (uint i = 0; i < (SCATTER_WG_SIZE / 32); i++)
+ {
+ uint bits = flags[(SCATTER_WG_SIZE / 32) * bin + i];
+ uint bc = popcount(bits);
+ uint pc = popcount(bits & (bin_bit - 1));
+ prefix += (i < bin_word) ? bc : 0;
+ prefix += (i == bin_word) ? pc : 0;
+
+ count += bc;
+ }
+
+ // store the key in its proper place..
+ output[prefix + bin_offset] = key;
+
+ // last item for each bin adjusts local offset for next outer loop iteration
+ if (prefix == count - 1)
+ local_offset[bin] += count;
+ }
+
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ }
+
+ /* uint local_offset[RADIX_BINS]; */
+ /* uint offset_global = 0; */
+ /* for (int i=0;i<RADIX_BINS;i++) */
+ /* { */
+ /* const uint count_global = global_histogram[RADIX_BINS*numTasks+i]; */
+ /* const uint offset_local = global_histogram[RADIX_BINS*taskID+i]; */
+ /* local_offset[i] = offset_global + offset_local; */
+ /* offset_global += count_global; */
+ /* } */
+
+ /* for (uint ID=startID;ID<endID;ID++) */
+ /* { */
+ /* const uint bin = (input[ID] >> shift) & (RADIX_BINS-1); */
+ /* const uint offset = local_offset[bin]; */
+ /* output[offset] = input[ID]; */
+ /* local_offset[bin]++; */
+ /* } */
+}
+
+#else
+
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(16, 1, 1)))
+__attribute__((intel_reqd_sub_group_size(16))) void kernel
+sort_morton_codes_scatter_items(
+ global struct Globals* globals,
+ uint shift,
+ global uint* global_histogram,
+ global char* input0,
+ global char* input1,
+ unsigned int input0_offset,
+ unsigned int input1_offset,
+ uint iteration)
+{
+ const uint numItems = globals->numPrimitives;
+ const uint local_size = get_local_size(0);
+ const uint taskID = get_group_id(0);
+ const uint numTasks = get_num_groups(0);
+ const uint localID = get_local_id(0);
+ const uint globalID = get_local_id(0) + get_group_id(0) * get_local_size(0);
+ const uint subgroupLocalID = get_sub_group_local_id();
+ const uint subgroup_size = get_sub_group_size();
+
+ const uint startID = (taskID + 0) * numItems / numTasks;
+ const uint endID = (taskID + 1) * numItems / numTasks;
+
+ global ulong* input = (global ulong*)((iteration % 2) == 0 ? input0 + input0_offset : input1 + input1_offset);
+ global ulong* output = (global ulong*)((iteration % 2) == 0 ? input1 + input1_offset : input0 + input0_offset);
+
+ local uint local_offset[RADIX_BINS];
+ uint off = 0;
+ for (int i = subgroupLocalID; i < RADIX_BINS; i += subgroup_size)
+ {
+ const uint count = global_histogram[RADIX_BINS * numTasks + i];
+ const uint offset_task = global_histogram[RADIX_BINS * taskID + i];
+ const uint sum = sub_group_reduce_add(count);
+ const uint prefix_sum = sub_group_scan_exclusive_add(count);
+ local_offset[i] = off + offset_task + prefix_sum;
+ off += sum;
+ }
+
+ for (uint ID = startID + subgroupLocalID; ID < endID; ID += subgroup_size)
+ {
+ const uint bin = (input[ID] >> shift) & (RADIX_BINS - 1);
+ const uint offset = atomic_add_local(&local_offset[bin], 1);
+ output[offset] = input[ID];
+ }
+
+ /* uint local_offset[RADIX_BINS]; */
+ /* uint offset_global = 0; */
+ /* for (int i=0;i<RADIX_BINS;i++) */
+ /* { */
+ /* const uint count_global = global_histogram[RADIX_BINS*numTasks+i]; */
+ /* const uint offset_local = global_histogram[RADIX_BINS*taskID+i]; */
+ /* local_offset[i] = offset_global + offset_local; */
+ /* offset_global += count_global; */
+ /* } */
+
+ /* for (uint ID=startID;ID<endID;ID++) */
+ /* { */
+ /* const uint bin = (input[ID] >> shift) & (RADIX_BINS-1); */
+ /* const uint offset = local_offset[bin]; */
+ /* output[offset] = input[ID]; */
+ /* local_offset[bin]++; */
+ /* } */
+}
+#endif
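The SLM bit-vector machinery in the enabled scatter variant above exists to keep the pass stable: keys falling into the same bin must keep their relative order between passes, otherwise LSB radix sorting breaks. A scalar C reference of one stable LSB pass, chained twice over illustrative 16-bit keys (not part of the diff):

    #include <stdint.h>
    #include <stdio.h>

    #define BINS 256

    static void lsb_pass(const uint64_t *in, uint64_t *out, unsigned n, unsigned shift)
    {
        unsigned count[BINS] = {0}, offset[BINS];
        for (unsigned i = 0; i < n; i++)
            count[(in[i] >> shift) & (BINS - 1)]++;

        unsigned prefix = 0;
        for (unsigned b = 0; b < BINS; b++) {       /* exclusive prefix sum over bins */
            offset[b] = prefix;
            prefix += count[b];
        }

        for (unsigned i = 0; i < n; i++) {          /* sequential scatter keeps stability */
            unsigned b = (in[i] >> shift) & (BINS - 1);
            out[offset[b]++] = in[i];
        }
    }

    int main(void)
    {
        uint64_t a[6] = {0x201, 0x102, 0x101, 0x202, 0x100, 0x200}, b[6];
        lsb_pass(a, b, 6, 0);   /* sort by low byte */
        lsb_pass(b, a, 6, 8);   /* then by next byte: fully sorted for 16-bit keys */
        for (unsigned i = 0; i < 6; i++)
            printf("0x%llx ", (unsigned long long)a[i]);
        printf("\n");           /* 0x100 0x101 0x102 0x200 0x201 0x202 */
        return 0;
    }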
+
+#if 1
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(SCATTER_WG_SIZE, 1, 1)))
+void kernel
+sort_morton_codes_scatter_items(
+ global struct Globals *globals,
+ global uint *global_histogram,
+ global ulong *input,
+ global ulong *output,
+ uint iteration,
+ uint numGroups,
+ uint update_morton_sort_in_flight)
+{
+ local uint local_offset[RADIX_BINS];
+ local uint flags[RADIX_BINS*SCATTER_WG_SIZE/32];
+ const uint numItems = globals->numPrimitives;
+ if (numItems < MORTON_LSB_SORT_NO_SHIFT_THRESHOLD)
+ sort_morton_codes_scatter_items_func(globals, global_histogram, input, output, local_offset,
+ flags, iteration, numGroups, numItems, false, update_morton_sort_in_flight);
+ else
+ sort_morton_codes_scatter_items_func(globals, global_histogram, input, output, local_offset,
+ flags, iteration, numGroups, numItems, true, update_morton_sort_in_flight);
+}
+
+#else
+
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(16, 1, 1)))
+__attribute__((intel_reqd_sub_group_size(16))) void kernel
+sort_morton_codes_scatter_items(
+ global struct Globals *globals,
+ uint shift,
+ global uint *global_histogram,
+ global char *input0,
+ global char *input1,
+ unsigned int input0_offset,
+ unsigned int input1_offset,
+ uint iteration)
+{
+ const uint numItems = globals->numPrimitives;
+ const uint local_size = get_local_size(0);
+ const uint taskID = get_group_id(0);
+ const uint numTasks = get_num_groups(0);
+ const uint localID = get_local_id(0);
+ const uint globalID = get_local_id(0) + get_group_id(0)*get_local_size(0);
+ const uint subgroupLocalID = get_sub_group_local_id();
+ const uint subgroup_size = get_sub_group_size();
+
+ const uint startID = (taskID + 0) * numItems / numTasks;
+ const uint endID = (taskID + 1) * numItems / numTasks;
+
+ global ulong *input = (global ulong *)((iteration % 2) == 0 ? input0 + input0_offset : input1 + input1_offset);
+ global ulong *output = (global ulong *)((iteration % 2) == 0 ? input1 + input1_offset : input0 + input0_offset);
+
+ local uint local_offset[RADIX_BINS];
+ uint off = 0;
+ for (int i = subgroupLocalID; i < RADIX_BINS; i += subgroup_size)
+ {
+ const uint count = global_histogram[RADIX_BINS * numTasks + i];
+ const uint offset_task = global_histogram[RADIX_BINS * taskID + i];
+ const uint sum = sub_group_reduce_add(count);
+ const uint prefix_sum = sub_group_scan_exclusive_add(count);
+ local_offset[i] = off + offset_task + prefix_sum;
+ off += sum;
+ }
+
+ for (uint ID = startID + subgroupLocalID; ID < endID; ID += subgroup_size)
+ {
+ const uint bin = (input[ID] >> shift) & (RADIX_BINS - 1);
+ const uint offset = atomic_add_local(&local_offset[bin], 1);
+ output[offset] = input[ID];
+ }
+
+ /* uint local_offset[RADIX_BINS]; */
+ /* uint offset_global = 0; */
+ /* for (int i=0;i<RADIX_BINS;i++) */
+ /* { */
+ /* const uint count_global = global_histogram[RADIX_BINS*numTasks+i]; */
+ /* const uint offset_local = global_histogram[RADIX_BINS*taskID+i]; */
+ /* local_offset[i] = offset_global + offset_local; */
+ /* offset_global += count_global; */
+ /* } */
+
+ /* for (uint ID=startID;ID<endID;ID++) */
+ /* { */
+ /* const uint bin = (input[ID] >> shift) & (RADIX_BINS-1); */
+ /* const uint offset = local_offset[bin]; */
+ /* output[offset] = input[ID]; */
+ /* local_offset[bin]++; */
+ /* } */
+}
+#endif
+
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(512, 1, 1)))
+__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH)))
+void kernel
+sort_morton_codes_merged(
+ global struct Globals* globals,
+ global uint* global_histogram,
+ global uchar* input,
+ uint iteration,
+ uint numGroups
+)
+{
+ const uint numItems = globals->numPrimitives;
+ const uint taskID = get_group_id(0);
+ const uint loc_id = get_local_id(0);
+ const uint lane = get_sub_group_local_id();
+
+ uint2 ids = get_thread_range(numItems, numGroups, taskID);
+ uint startID = ids.x;
+ uint endID = ids.y;
+
+ local uint histogram[RADIX_BINS];
+ local uint hist_tmp[RADIX_BINS];
+
+ if (numItems < MORTON_LSB_SORT_NO_SHIFT_THRESHOLD)
+ {
+ sort_morton_codes_bin_items_taskID_func(globals, global_histogram, input, histogram, iteration, numGroups, numItems, false,
+ taskID, startID, endID);
+ }
+ else
+ {
+ const uint req_iterations = globals->sort_iterations;
+ if (iteration < req_iterations)
+ return;
+
+ iteration -= req_iterations;
+
+ sort_morton_codes_bin_items_taskID_func(globals, global_histogram, input, histogram, iteration, numGroups, numItems, true,
+ taskID, startID, endID);
+ }
+
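+ // Elect the last work group to finish binning: every group bumps morton_sort_in_flight, and
+ // the group that observes numGroups - 1 performs the cross-group prefix sums below.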
+ uint last_group = 0;
+ if (loc_id == 0)
+ last_group = atomic_inc_global(&globals->morton_sort_in_flight);
+
+ write_mem_fence(CLK_GLOBAL_MEM_FENCE);
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ last_group = work_group_broadcast(last_group, 0);
+
+ bool isLastGroup = (loc_id < RADIX_BINS) && (last_group == numGroups - 1);
+
+ uint global_count = 0;
+
+ if (isLastGroup)
+ {
+ for (uint j = 0; j < numGroups; j++)
+ {
+ const uint count = (j == taskID) ? histogram[loc_id] : load_uint_L1C_L3C(&global_histogram[RADIX_BINS * j + loc_id], 0);
+ store_uint_L1WB_L3WB(&global_histogram[RADIX_BINS * j + loc_id], 0, global_count);
+ global_count += count;
+ }
+
+ hist_tmp[get_sub_group_id()] = (get_sub_group_id() < MAX_HW_SIMD_WIDTH) ? sub_group_reduce_add(global_count) : 0;
+ }
+
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ if (isLastGroup)
+ {
+ uint p = hist_tmp[lane];
+ p = (lane < get_sub_group_id()) ? p : 0;
+
+ global_count = sub_group_reduce_add(p) + sub_group_scan_exclusive_add(global_count);
+
+ store_uint_L1WB_L3WB(&global_histogram[RADIX_BINS * numGroups + loc_id], 0, global_count);
+ }
+}
+
+#if 0
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(16, 1, 1)))
+__attribute__((intel_reqd_sub_group_size(16))) void kernel
+sort_morton_codes_bin_items(
+ global struct Globals* globals,
+ uint shift,
+ global uint* global_histogram,
+ global char* input0,
+ global char* input1,
+ unsigned int input0_offset,
+ unsigned int input1_offset,
+ uint iteration)
+{
+ const uint numItems = globals->numPrimitives;
+ const uint local_size = get_local_size(0);
+ const uint taskID = get_group_id(0);
+ const uint numTasks = get_num_groups(0);
+ const uint localID = get_local_id(0);
+ const uint globalID = get_local_id(0) + get_group_id(0) * get_local_size(0);
+ const uint subgroupLocalID = get_sub_group_local_id();
+ const uint subgroup_size = get_sub_group_size();
+
+ const uint startID = (taskID + 0) * numItems / numTasks;
+ const uint endID = (taskID + 1) * numItems / numTasks;
+
+ global ulong* input = (global ulong*)((iteration % 2) == 0 ? input0 + input0_offset : input1 + input1_offset);
+
+#if 1
+ local uint histogram[RADIX_BINS];
+ for (uint i = subgroupLocalID; i < RADIX_BINS; i += subgroup_size)
+ histogram[i] = 0;
+
+ for (uint ID = startID + subgroupLocalID; ID < endID; ID += subgroup_size)
+ {
+ const uint bin = ((uint)(input[ID] >> (ulong)shift)) & (RADIX_BINS - 1);
+ atomic_add(&histogram[bin], 1);
+ }
+
+ for (uint i = subgroupLocalID; i < RADIX_BINS; i += subgroup_size)
+ global_histogram[RADIX_BINS * taskID + i] = histogram[i];
+
+#else
+ uint histogram[RADIX_BINS];
+ for (int i = 0; i < RADIX_BINS; i++)
+ histogram[i] = 0;
+
+ for (uint ID = startID + subgroupLocalID; ID < endID; ID += subgroup_size)
+ {
+ const uint bin = ((uint)(input[ID] >> (ulong)shift)) & (RADIX_BINS - 1);
+ histogram[bin]++;
+ }
+
+ for (uint i = 0; i < RADIX_BINS; i++)
+ {
+ const uint reduced_counter = sub_group_reduce_add(histogram[i]);
+ global_histogram[RADIX_BINS * taskID + i] = reduced_counter;
+ }
+#endif
+}
+
+#endif
+
+#define WG_SIZE_WIDE 256
+#define SG_SIZE_SCAN 16
+
+// Fast implementation of work_group_scan_exclusive_add using SLM, for a work group size of 256 and a sub-group size of 16
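+//
+// Phase 1: each 16-lane sub-group scans its own values and stores its total in SLM.
+// Phase 2: a single sub-group-wide scan over the 16 sub-group totals gives each HW thread its
+// base offset, which is broadcast back and added to the per-lane partial scan.
+// Worked example (all inputs equal to 1): lane L of HW thread T returns T*16 + L.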
+GRL_INLINE uint work_group_scan_exclusive_add_opt(local uint* tmp, uint val)
+{
+ const uint hw_thread_in_wg_id = get_local_id(0) / SG_SIZE_SCAN;
+ const uint sg_local_id = get_local_id(0) % SG_SIZE_SCAN;
+ const uint NUM_HW_THREADS_IN_WG = WG_SIZE_WIDE / SG_SIZE_SCAN;
+
+ uint acc = sub_group_scan_exclusive_add(val);
+ uint acc2 = acc + val;
+
+ tmp[hw_thread_in_wg_id] = sub_group_broadcast(acc2, SG_SIZE_SCAN - 1);
+ barrier(CLK_LOCAL_MEM_FENCE);
+ uint loaded_val = tmp[sg_local_id];
+ uint wgs_acc = sub_group_scan_exclusive_add(loaded_val);
+ uint acc_for_this_hw_thread = sub_group_broadcast(wgs_acc, hw_thread_in_wg_id);
+ return acc + acc_for_this_hw_thread;
+}
+
+// The wide reduce algorithm is split across two kernels:
+// 1. First, partial exclusive add scans are computed within each work group using SLM.
+//    Then the last work group for each histogram bin performs an exclusive add scan along the
+//    bins using the separate global_histogram_partials buffer. The last work group is
+//    determined with global atomics on the wg_flags buffer.
+// 2. The second kernel globally adds the values from global_histogram_partials to the
+//    histogram buffer holding the partial sums. Finally, the last work group performs one more
+//    work-group scan and add so the histogram buffer values are adjusted with the global ones.
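+//
+// Buffer layout used below: global_histogram holds one RADIX_BINS-wide row per task,
+// global_histogram_partials holds one block sum per (task block, bin) pair, and wg_flags
+// holds one completion counter per bin.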
+GRL_INLINE void sort_morton_codes_reduce_bins_wide_partial_sum_func(
+ global struct Globals* globals,
+ global uint* global_histogram,
+ global uint* global_histogram_partials,
+ global uint* wg_flags,
+ local uint* exclusive_scan_tmp,
+ uint numTasks,
+ uint numGroups,
+ uint iteration,
+ bool shift_primID)
+{
+ if (shift_primID)
+ {
+ const uint req_iterations = globals->sort_iterations;
+ if (iteration < req_iterations)
+ return;
+
+ iteration -= req_iterations;
+ }
+
+ const uint groupID = get_group_id(0) % RADIX_BINS;
+ const uint scanGroupID = get_group_id(0) / RADIX_BINS;
+ uint localID = get_local_id(0);
+ uint globalID = localID + (scanGroupID * WG_SIZE_WIDE);
+ const uint lastGroup = (numGroups / WG_SIZE_WIDE);
+ const uint endID = min(numTasks, (uint)(scanGroupID * WG_SIZE_WIDE + WG_SIZE_WIDE)) - 1;
+
+ uint temp = 0;
+ uint last_count = 0;
+ if (globalID < numTasks)
+ {
+ temp = global_histogram[RADIX_BINS * globalID + groupID];
+
+ // Store the last value of the work group: either the last element of the histogram or the last item in the work group
+ if (globalID == endID)
+ last_count = temp;
+ }
+
+ uint val = work_group_scan_exclusive_add_opt(exclusive_scan_tmp, temp);
+
+ if (globalID <= numTasks)
+ {
+ global_histogram[RADIX_BINS * globalID + groupID] = val;
+
+ // Store the block sum value to a separate buffer
+ if (globalID == endID)
+ global_histogram_partials[scanGroupID * WG_SIZE_WIDE + groupID] = val + last_count;
+ }
+
+ // Make sure the global_histogram_partials updates are visible to all work groups
+ write_mem_fence(CLK_GLOBAL_MEM_FENCE);
+ barrier(0);
+
+ // Now determine the last group for each histogram bin, so we know that
+ // all work groups have already updated the global_histogram_partials buffer
+ uint last_group = 0;
+ if (localID == 0)
+ last_group = atomic_inc_global(&wg_flags[groupID]);
+
+ last_group = work_group_broadcast(last_group, 0);
+ bool isLastGroup = (last_group == lastGroup - 1);
+
+ // Each of the last groups computes the scan exclusive add for each partial sum we have
+ if (isLastGroup)
+ {
+ uint temp1 = 0;
+ if (localID < lastGroup)
+ temp1 = global_histogram_partials[localID * WG_SIZE_WIDE + groupID];
+
+ uint val2 = work_group_scan_exclusive_add_opt(exclusive_scan_tmp, temp1);
+
+ if (localID < lastGroup)
+ global_histogram_partials[localID * WG_SIZE_WIDE + groupID] = val2;
+ }
+}
+
+GRL_INLINE void sort_morton_codes_reduce_bins_wide_add_reduce_func(
+ global struct Globals* globals,
+ global uint* global_histogram,
+ global uint* global_histogram_partials,
+ local uint* partials,
+ uint numTasks,
+ uint numGroups,
+ uint iteration,
+ bool shift_primID)
+{
+ if (shift_primID)
+ {
+ const uint req_iterations = globals->sort_iterations;
+ if (iteration < req_iterations)
+ return;
+
+ iteration -= req_iterations;
+ }
+
+ const uint groupID = get_group_id(0) % RADIX_BINS;
+ const uint scanGroupID = get_group_id(0) / RADIX_BINS;
+ const uint lastGroup = (numGroups / WG_SIZE_WIDE);
+ uint localID = get_local_id(0);
+ uint globalID = localID + (scanGroupID * WG_SIZE_WIDE);
+ const uint endID = min(numTasks, (uint)(scanGroupID * WG_SIZE_WIDE + WG_SIZE_WIDE)) - 1;
+
+ // Add the global block sums to the partial scans; skip the first scanGroupID since the first
+ // added value is 0 in the case of exclusive add scans
+ if (scanGroupID > 0 && globalID <= numTasks)
+ {
+ uint add_val = global_histogram_partials[scanGroupID * RADIX_BINS + groupID];
+ atomic_add_global(&global_histogram[globalID * RADIX_BINS + groupID], add_val);
+ }
+
+ // Determine whether this is the last group to finish
+ uint last_group = 0;
+ if (localID == 0)
+ last_group = atomic_inc_global(&globals->morton_sort_in_flight);
+
+ last_group = work_group_broadcast(last_group, 0);
+ bool isLastGroup = (last_group == numGroups - 1);
+
+ // Now do the exclusive scan across all bins over the global totals
+ if (isLastGroup)
+ {
+ mem_fence_gpu_invalidate();
+
+ uint global_count = global_histogram[numTasks * RADIX_BINS + localID];
+
+ partials[get_sub_group_id()] = sub_group_reduce_add(global_count);
+
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ uint lane = get_sub_group_local_id();
+ uint p = partials[lane];
+ p = (lane < get_sub_group_id()) ? p : 0;
+
+ global_count = sub_group_reduce_add(p) + sub_group_scan_exclusive_add(global_count);
+
+ store_uint_L1WB_L3WB(&global_histogram[numTasks * RADIX_BINS + localID], 0, global_count);
+ }
+}
+
+
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(WG_SIZE_WIDE, 1, 1)))
+__attribute__((intel_reqd_sub_group_size(SG_SIZE_SCAN)))
+void kernel
+sort_morton_codes_reduce_bins_wide_partial_sum(
+ global struct Globals* globals,
+ uint numTasks,
+ uint numGroups,
+ global uint* global_histogram,
+ global uint* global_histogram_partials,
+ global uint* wg_flags,
+ uint iteration)
+{
+ local uint exclusive_scan_tmp[WG_SIZE_WIDE / SG_SIZE_SCAN];
+
+ const uint numItems = globals->numPrimitives;
+ if (numItems < MORTON_LSB_SORT_NO_SHIFT_THRESHOLD)
+ sort_morton_codes_reduce_bins_wide_partial_sum_func(globals, global_histogram, global_histogram_partials, wg_flags, exclusive_scan_tmp, numTasks, numGroups, iteration, false);
+ else
+ sort_morton_codes_reduce_bins_wide_partial_sum_func(globals, global_histogram, global_histogram_partials, wg_flags, exclusive_scan_tmp, numTasks, numGroups, iteration, true);
+}
+
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(WG_SIZE_WIDE, 1, 1)))
+__attribute__((intel_reqd_sub_group_size(SG_SIZE_SCAN)))
+void kernel
+sort_morton_codes_reduce_bins_wide_add_reduce(
+ global struct Globals* globals,
+ uint numTasks,
+ uint numGroups,
+ global uint* global_histogram,
+ global uint* global_histogram_partials,
+ uint iteration)
+{
+ local uint partials[RADIX_BINS];
+
+ const uint numItems = globals->numPrimitives;
+ if (numItems < MORTON_LSB_SORT_NO_SHIFT_THRESHOLD)
+ sort_morton_codes_reduce_bins_wide_add_reduce_func(globals, global_histogram, global_histogram_partials, partials, numTasks, numGroups, iteration, false);
+ else
+ sort_morton_codes_reduce_bins_wide_add_reduce_func(globals, global_histogram, global_histogram_partials, partials, numTasks, numGroups, iteration, true);
+}
diff --git a/src/intel/vulkan/grl/gpu/msb_radix_bitonic_sort.grl b/src/intel/vulkan/grl/gpu/msb_radix_bitonic_sort.grl
new file mode 100644
index 00000000000..dee315adcda
--- /dev/null
+++ b/src/intel/vulkan/grl/gpu/msb_radix_bitonic_sort.grl
@@ -0,0 +1,297 @@
+//
+// Copyright (C) 2009-2021 Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+//
+//
+
+module msb_radix_bitonic_sort;
+
+kernel_module msb_radix_sort ("morton_msb_radix_bitonic_sort.cl")
+{
+ links lsc_intrinsics;
+
+ kernel opencl_debug_print < kernelFunction="debug_print_kernel">;
+ kernel opencl_check_bls < kernelFunction="check_bls_sort">;
+
+ kernel opencl_bottom_level_sort_single_wg < kernelFunction="sort_morton_codes_bottom_level_single_wg">;
+
+ kernel opencl_build_morton_kernel_sort_msb_init < kernelFunction="sort_morton_codes_msb_begin">;
+
+ kernel opencl_build_morton_kernel_sort_msb_scheduler < kernelFunction="scheduler">;
+
+ kernel opencl_build_morton_kernel_sort_bottom_level < kernelFunction="sort_morton_codes_bottom_level">;
+
+ kernel opencl_build_morton_kernel_sort_msb_count_items < kernelFunction="sort_morton_codes_msb_count_items">;
+ kernel opencl_build_morton_kernel_sort_msb_bin_items < kernelFunction="sort_morton_codes_msb_bin_items">;
+
+ kernel opencl_build_morton_kernel_sort_batched_bls_dispatch < kernelFunction="sort_morton_codes_batched_BLS_dispatch">;
+}
+
+
+const MSB_RADIX_NUM_VCONTEXTS = 8;
+const BOTTOM_LEVEL_SORT_THRESHOLD = 512;
+
+struct MSBRadixScheduler
+{
+ dword num_wgs_msb;
+ dword num_wgs_bls;
+
+ dword scheduler_postsync;
+ dword _pad1;
+};
+
+struct MSBRadixArgs
+{
+ qword p_scheduler;
+ qword p_num_primitives;
+};
+
+
+
+
+struct BatchedBLSDispatchEntry
+{
+ qword p_data_buffer;
+ qword num_elements; // number of elements in p_data_buffer
+};
+
+
+
+
+metakernel add_bls_dispatch_init(qword p_storage)
+{
+ define REG_numWgs REG14;
+ define REG_p_storage REG15;
+
+ REG_numWgs = 0;
+ REG_p_storage = p_storage;
+}
+
+
+
+
+// Basically this code does:
+//   bls_args_for_dispatches[dispatchID] = { bls_new_pointer, numPrimitives };
+//   dispatchID++;
+//
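+// Note: this assumes REG14/REG15 persist across metakernel invocations; add_bls_dispatch_init
+// zeroes the WG counter (REG14) and points REG15 at the BatchedBLSDispatchEntry array, and each
+// call below appends one 16-byte entry, advances REG15 and increments REG14.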
+metakernel add_bls_dispatch(
+ qword p_data,
+ qword p_num_primitives
+)
+{
+ define C_1 REG0;
+ define C_8 REG1;
+
+ define C_MIN_PRIMREFS REG2;
+
+ define REG_p_data REG3;
+ define REG_num_prims REG4;
+ define REG_no_dispatch REG5;
+
+ define REG_numWgs REG14;
+ define REG_p_storage REG15;
+
+ C_MIN_PRIMREFS = 2;
+
+ REG_num_prims = 0;
+ REG_num_prims.lo = load_dword(p_num_primitives);
+
+ REG_no_dispatch = REG_num_prims < C_MIN_PRIMREFS;
+
+ goto l_finish if(REG_no_dispatch.lo);
+
+ C_1 = 1;
+ C_8 = 8;
+
+ // pseudocode: BatchedBLSDispatchEntry.p_data_buffer = p_data
+ REG_p_data = p_data;
+ store_qword( REG_p_storage, REG_p_data ); // store the data pointer
+
+ REG_p_storage = REG_p_storage + C_8; // point to next member in BatchedBLSDispatchEntry struct
+
+ // pseudocode: BatchedBLSDispatchEntry.num_elements = *p_num_primitives
+ store_qword( REG_p_storage, REG_num_prims );
+
+ REG_p_storage = REG_p_storage + C_8; // point to next BatchedBLSDispatchEntry instance
+
+ REG_numWgs = REG_numWgs + C_1;
+
+l_finish:
+
+}
+
+
+
+
+metakernel batched_bls_dispatch(
+ qword private_mem
+)
+{
+ define REG_numWgs REG14;
+
+ DISPATCHDIM_X = REG_numWgs;
+ DISPATCHDIM_Y = 1;
+ DISPATCHDIM_Z = 1;
+
+ dispatch_indirect opencl_build_morton_kernel_sort_batched_bls_dispatch args(private_mem);
+}
+
+
+
+
+metakernel sort_bottom_level(
+ qword build_globals,
+ qword input,
+ qword p_num_primitives)
+{
+ define REG_num_prims REG0;
+ define C_MIN_PRIMREFS REG1;
+ define REG_no_dispatch REG2;
+
+ REG_num_prims = load_dword( p_num_primitives );
+
+ C_MIN_PRIMREFS = 2;
+
+ REG_no_dispatch = REG_num_prims < C_MIN_PRIMREFS;
+
+ goto l_finish if(REG_no_dispatch.lo);
+
+ dispatch opencl_bottom_level_sort_single_wg (1, 1, 1) args(build_globals, input, input);
+
+l_finish:
+
+}
+
+
+
+
+metakernel sort(
+ qword build_globals,
+ qword input,
+ qword tmp,
+ MSBRadixArgs sort_args)
+{
+ define REG_num_prims REG0;
+ {
+ define C_MIN_PRIMREFS REG1;
+ define C_MAX_PRIMREFS REG2;
+ define REG_no_dispatch REG3;
+ define REG_dispatch_single_wg REG4;
+
+ REG_num_prims = load_dword( sort_args.p_num_primitives );
+ C_MIN_PRIMREFS = 2;
+ C_MAX_PRIMREFS = BOTTOM_LEVEL_SORT_THRESHOLD;
+
+ REG_no_dispatch = REG_num_prims < C_MIN_PRIMREFS;
+ REG_dispatch_single_wg = REG_num_prims < C_MAX_PRIMREFS;
+
+ goto l_sort_finish if(REG_no_dispatch.lo);
+ goto l_dispatch_single_wg if(REG_dispatch_single_wg.lo);
+ goto l_full_sort;
+ }
+
+l_dispatch_single_wg:
+
+ {
+ dispatch opencl_bottom_level_sort_single_wg (1, 1, 1) args(build_globals, input, input);
+ goto l_sort_finish;
+ }
+
+l_full_sort:
+
+ define p_scheduler sort_args.p_scheduler;
+ define p_scheduler_postsync (sort_args.p_scheduler + offsetof(MSBRadixScheduler.scheduler_postsync) );
+ define p_num_wgs_bls (sort_args.p_scheduler + offsetof(MSBRadixScheduler.num_wgs_bls) );
+
+ define REG_scheduler_postsync REG3;
+ REG_scheduler_postsync = p_scheduler_postsync;
+
+ define C_0 REG4;
+ define C_8 REG5;
+ define C_255 REG6;
+ C_0 = 0;
+ C_8 = 8;
+ C_255 = 255;
+
+ store_qword( REG_scheduler_postsync, C_0 ); // initialize scheduler semaphore
+
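+ // number of work groups for the indirect dispatches below: ceil(num_prims / 256),
+ // computed as (num_prims + 255) >> 8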
+ REG_num_prims = REG_num_prims + C_255;
+ REG_num_prims = REG_num_prims >> C_8;
+
+ DISPATCHDIM_X = REG_num_prims.lo;
+ DISPATCHDIM_Y = 1;
+ DISPATCHDIM_Z = 1;
+
+ control( cs_store_fence ); // commit the semaphore write
+
+ // initialize the whole execution
+ dispatch opencl_build_morton_kernel_sort_msb_init (MSB_RADIX_NUM_VCONTEXTS, 1, 1) args(build_globals, sort_args.p_scheduler, input, tmp)
+ postsync store_dword( p_scheduler_postsync, 1 );
+
+ // wait on init kernel
+ semaphore_wait while( *p_scheduler_postsync != 1 );
+
+ dispatch_indirect opencl_build_morton_kernel_sort_msb_count_items args(sort_args.p_scheduler)
+ postsync store_dword( p_scheduler_postsync, 2 );
+
+ // wait on count_items kernel
+ semaphore_wait while( *p_scheduler_postsync != 2 );
+
+ dispatch_indirect opencl_build_morton_kernel_sort_msb_bin_items args(sort_args.p_scheduler, input)
+ postsync store_dword( p_scheduler_postsync, 0 );
+
+ define C_MASK_HI REG4;
+ C_MASK_HI = 0x00000000ffffffff;
+
+ l_build_loop:
+ {
+ semaphore_wait while( *p_scheduler_postsync != 0 );
+ {
+ dispatch opencl_build_morton_kernel_sort_msb_scheduler(1,1,1) args( sort_args.p_scheduler, input, tmp )
+ postsync store_dword( p_scheduler_postsync, 1 );
+
+ // wait on scheduler kernel
+ semaphore_wait while( *p_scheduler_postsync != 1 );
+ }
+
+ // load and process the scheduler results
+ define REG_wg_counts REG0;
+ define REG_num_msb_wgs REG0.lo;
+ define REG_num_bls_wgs REG0.hi;
+ define REG_p_scheduler REG1;
+ define REG_no_msb_wgs REG2;
+ {
+ REG_p_scheduler = p_scheduler;
+ REG_wg_counts = load_qword( REG_p_scheduler );
+
+ REG_no_msb_wgs = REG_wg_counts & C_MASK_HI;
+ REG_no_msb_wgs = REG_no_msb_wgs == 0;
+ }
+
+ // dispatch new bls WGs
+ DISPATCHDIM_X = REG_num_bls_wgs;
+ dispatch_indirect opencl_build_morton_kernel_sort_bottom_level args( p_scheduler, input );
+
+ // jump out if there are no msb WGs
+ goto l_sort_finish if (REG_no_msb_wgs);
+
+ DISPATCHDIM_X = REG_num_msb_wgs;
+ dispatch_indirect opencl_build_morton_kernel_sort_msb_count_items args(sort_args.p_scheduler)
+ postsync store_dword( p_scheduler_postsync, 2 );
+
+ // wait on count_items kernel
+ semaphore_wait while( *p_scheduler_postsync != 2 );
+
+ dispatch_indirect opencl_build_morton_kernel_sort_msb_bin_items args(sort_args.p_scheduler, input)
+ postsync store_dword( p_scheduler_postsync, 0 );
+
+ // wait until all BLS work groups have finished launching
+ semaphore_wait while( *p_num_wgs_bls != 0 );
+
+ goto l_build_loop;
+ }
+
+l_sort_finish:
+
+}
diff --git a/src/intel/vulkan/grl/gpu/new_sah_builder.grl b/src/intel/vulkan/grl/gpu/new_sah_builder.grl
new file mode 100644
index 00000000000..d0a9694acc2
--- /dev/null
+++ b/src/intel/vulkan/grl/gpu/new_sah_builder.grl
@@ -0,0 +1,665 @@
+//
+// Copyright (C) 2009-2021 Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+//
+//
+
+module new_sah_builder;
+
+kernel_module bfs_kernels ("bvh_build_BFS.cl")
+{
+ links lsc_intrinsics;
+
+ kernel opencl_build_kernel_BinnedSAH_BFS_pass1_initial < kernelFunction="BFS_pass1_initial" > ;
+ kernel opencl_build_kernel_BinnedSAH_BFS_pass1_indexed < kernelFunction="BFS_pass1_indexed" > ;
+ kernel opencl_build_kernel_BinnedSAH_BFS_pass2_initial < kernelFunction="BFS_pass2_initial" > ;
+ kernel opencl_build_kernel_BinnedSAH_BFS_pass2_indexed < kernelFunction="BFS_pass2_indexed" > ;
+
+ kernel opencl_build_kernel_BinnedSAH_DFS < kernelFunction="DFS" >;
+ // kernel opencl_build_kernel_BinnedSAH_BuildQNodes < kernelFunction="build_qnodes" >;
+ kernel opencl_build_kernel_BinnedSAH_BuildQNodes_Kickoff < kernelFunction="build_qnodes_pc_kickoff" >;
+ kernel opencl_build_kernel_BinnedSAH_BuildQNodes_Amplify < kernelFunction="build_qnodes_pc_amplify" >;
+ kernel opencl_build_kernel_BinnedSAH_begin < kernelFunction = "begin" >;
+ kernel opencl_build_kernel_BinnedSAH_scheduler < kernelFunction = "scheduler" >;
+
+ kernel opencl_build_kernel_BinnedSAH_BFS_pass1_initial_batch < kernelFunction="BFS_pass1_initial_batchable" >;
+ kernel opencl_build_kernel_BinnedSAH_BFS_pass1_indexed_batch < kernelFunction="BFS_pass1_indexed_batchable" >;
+ kernel opencl_build_kernel_BinnedSAH_BFS_pass2_initial_batch < kernelFunction="BFS_pass2_initial_batchable" >;
+ kernel opencl_build_kernel_BinnedSAH_BFS_pass2_indexed_batch < kernelFunction="BFS_pass2_indexed_batchable" >;
+
+ kernel opencl_build_kernel_BinnedSAH_categorize_builds_and_init_scheduler < kernelFunction="categorize_builds_and_init_scheduler" >;
+ kernel opencl_build_kernel_BinnedSAH_begin_batched < kernelFunction="begin_batchable" >;
+
+ kernel opencl_build_kernel_BinnedSAH_qnode_init_scheduler_batched < kernelFunction="build_qnodes_init_scheduler_batched" >;
+ kernel opencl_build_kernel_BinnedSAH_qnode_begin_batched < kernelFunction="build_qnodes_begin_batchable" >;
+ kernel opencl_build_kernel_BinnedSAH_qnode_scheduler < kernelFunction="build_qnodes_scheduler" >;
+ kernel opencl_build_kernel_BinnedSAH_BuildQNodes_Amplify_batch < kernelFunction="build_qnodes_pc_amplify_batched" >;
+
+ kernel opencl_build_kernel_BinnedSAH_BuildQnodes_TryToFillGRB_batched < kernelFunction="build_qnodes_try_to_fill_grb_batched" >;
+
+}
+
+kernel opencl_build_kernel_DFS_single_wg < source="bvh_build_DFS.cl", kernelFunction="DFS_single_wg" >
+kernel opencl_build_kernel_DFS_trivial < source="bvh_build_DFS.cl", kernelFunction="DFS_trivial" >
+kernel opencl_build_kernel_DFS_single_wg_batch < source="bvh_build_DFS.cl", kernelFunction="DFS_single_wg_batchable" >
+kernel opencl_build_kernel_DFS_trivial_batch < source="bvh_build_DFS.cl", kernelFunction="DFS_trivial_batchable" >
+
+kernel single_pass_binsah < source="bvh_build_DFS.cl", kernelFunction="DFS" >
+
+
+const DFS_MIN_PRIMREFS = 6;
+const DFS_MAX_PRIMREFS = 256;
+const BFS_WG_SIZE_SHIFT = 9;
+
+
+
+struct Scheduler
+{
+ dword num_bfs_wgs;
+ dword num_dfs_wgs;
+
+ dword scheduler_postsync;
+ dword _pad1;
+
+ dword num_trivial_builds;
+ dword num_single_builds;
+
+ dword batched_build_wg_count;
+ dword batched_build_loop_mask;
+
+};
+
+
+struct SAHBuildArgs
+{
+ qword p_num_primitives;
+ qword p_qnode_child_buffer;
+ qword p_scheduler;
+ qword p_sah_globals;
+ qword p_globals;
+ qword p_primref_buffer;
+ qword p_primref_index_buffers;
+ qword p_bvh_base;
+ qword p_bvh2;
+ qword p_root_buffer_counters;
+ dword sah_build_flags;
+ dword leaf_size;
+ dword leaf_type;
+ dword max_internal_nodes;
+};
+
+
+metakernel single_pass_binsah(
+ qword build_globals,
+ qword bvh_buffer,
+ qword build_primref_buffer,
+ qword build_primref_index_buffers,
+ dword alloc_backpointers )
+{
+
+ dispatch single_pass_binsah(1, 1, 1) args(
+ build_globals,
+ bvh_buffer,
+ build_primref_buffer,
+ build_primref_index_buffers,
+ alloc_backpointers
+ );
+
+}
+
+
+
+metakernel new_sah_build( SAHBuildArgs build_args )
+{
+ define REG_num_prims REG0;
+
+ {
+ define C_MIN_PRIMREFS REG1;
+ define C_MAX_PRIMREFS REG2;
+ define REG_dispatch_trivial REG3;
+ define REG_dispatch_single_wg REG4;
+
+ REG_num_prims = load_dword( build_args.p_num_primitives );
+ C_MIN_PRIMREFS = DFS_MIN_PRIMREFS;
+ C_MAX_PRIMREFS = DFS_MAX_PRIMREFS;
+
+ REG_dispatch_trivial = REG_num_prims <= C_MIN_PRIMREFS;
+ REG_dispatch_single_wg = REG_num_prims <= C_MAX_PRIMREFS;
+
+ goto l_dispatch_trivial if(REG_dispatch_trivial.lo);
+ goto l_dispatch_single_wg if(REG_dispatch_single_wg.lo);
+ goto l_full_build;
+ }
+
+l_dispatch_trivial:
+ {
+ dispatch opencl_build_kernel_DFS_trivial (1,1,1)
+ args( build_args.p_globals,
+ build_args.p_bvh_base,
+ build_args.p_primref_buffer,
+ build_args.p_primref_index_buffers,
+ build_args.sah_build_flags
+ );
+
+ control( wait_idle );
+ goto l_done;
+ }
+
+l_dispatch_single_wg:
+ {
+ dispatch opencl_build_kernel_DFS_single_wg (1,1,1)
+ args( build_args.p_globals,
+ build_args.p_bvh_base,
+ build_args.p_primref_buffer,
+ build_args.p_primref_index_buffers,
+ build_args.sah_build_flags
+ );
+
+ control( wait_idle );
+ goto l_done;
+ }
+
+
+l_full_build:
+
+
+ {
+ define p_scheduler build_args.p_scheduler;
+ define p_num_dfs_wgs build_args.p_scheduler + offsetof(Scheduler.num_dfs_wgs);
+ define p_scheduler_postsync (build_args.p_scheduler + offsetof(Scheduler.scheduler_postsync) );
+ define C_0 REG1;
+ define C_8 REG2;
+ C_8 = 8;
+ C_0 = 0;
+
+
+ //
+ // Init pass
+ //
+ store_dword( p_scheduler_postsync, C_0.lo );
+
+ // compute number of BFS WGs from prim-count
+ // NOTE: This code uses a hardcoded WG size of 512 for BFS
+ // If the BFS WG size ever changes, the shift amounts below must be updated as well.
+ // The shift is done in two steps because the DG2 shifter only supports power-of-two shift amounts.
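+ // i.e. num_bfs_wgs = ceil(num_prims / 512) = (num_prims + 511) >> 9, realized below as two
+ // power-of-two shifts: >> 8 followed by >> 1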
+ {
+ define REG_scheduler_postsync REG3;
+ define C_511 REG4;
+ define C_1 REG5;
+
+ REG_scheduler_postsync = p_scheduler_postsync;
+ C_511 = 511;
+ C_1 = 1;
+
+ store_qword( REG_scheduler_postsync, C_0 ); // initialize scheduler semaphore
+
+ REG_num_prims = REG_num_prims + C_511;
+ REG_num_prims = REG_num_prims >> C_8;
+ REG_num_prims = REG_num_prims >> C_1;
+
+ DISPATCHDIM_X = REG_num_prims.lo;
+ DISPATCHDIM_Y = 1;
+ DISPATCHDIM_Z = 1;
+
+ control( cs_store_fence ); // commit the semaphore write
+
+ // launch scheduler init kernel
+ dispatch opencl_build_kernel_BinnedSAH_begin (1,1,1)
+ args(
+ build_args.p_scheduler,
+ build_args.leaf_size,
+ build_args.leaf_type,
+ build_args.p_primref_index_buffers,
+ build_args.p_primref_buffer,
+ build_args.p_bvh2,
+ build_args.p_bvh_base,
+ build_args.p_globals,
+ build_args.p_sah_globals,
+ build_args.p_qnode_child_buffer,
+ build_args.sah_build_flags
+ )
+ postsync store_dword( p_scheduler_postsync, 1 );
+
+ // wait on init kernel
+ semaphore_wait while( *p_scheduler_postsync != 1 );
+
+ // launch BFS1 pass1
+ dispatch_indirect opencl_build_kernel_BinnedSAH_BFS_pass1_initial
+ args( build_args.p_scheduler,
+ build_args.p_sah_globals)
+ postsync store_dword( p_scheduler_postsync, 0 );
+
+ // wait on BFS pass1
+ semaphore_wait while( *p_scheduler_postsync != 0 );
+
+ // launch BFS pass2
+ dispatch_indirect opencl_build_kernel_BinnedSAH_BFS_pass2_initial
+ args( build_args.p_scheduler,
+ build_args.p_sah_globals )
+ postsync store_dword( p_scheduler_postsync, 1 );
+ }
+
+ // after BFS pass 2 we drop into a scheduling loop
+
+ l_build_loop:
+ {
+ semaphore_wait while( *p_scheduler_postsync != 1 );
+
+ {
+ dispatch opencl_build_kernel_BinnedSAH_scheduler(1,1,1)
+ args( build_args.p_scheduler, build_args.p_sah_globals )
+ postsync store_dword( p_scheduler_postsync, 0 );
+
+ // wait on the scheduler
+ semaphore_wait while( *p_scheduler_postsync != 0 );
+ }
+
+ // load and process the scheduler results
+ define REG_wg_counts REG0;
+ define REG_num_bfs_wgs REG0.lo;
+ define REG_num_dfs_wgs REG0.hi;
+ define REG_loop_break REG1;
+ define REG_p_scheduler REG2;
+ {
+ REG_p_scheduler = p_scheduler;
+ REG_wg_counts = load_qword( REG_p_scheduler );
+
+ define C_MASK_LO REG3 ;
+ C_MASK_LO = 0xffffffff;
+
+ REG_loop_break = REG_wg_counts & C_MASK_LO;
+ REG_loop_break = REG_loop_break == 0;
+ }
+
+ // dispatch new DFS WGs
+ DISPATCHDIM_X = REG_num_dfs_wgs;
+ dispatch_indirect opencl_build_kernel_BinnedSAH_DFS
+ args( p_scheduler,
+ build_args.p_sah_globals );
+
+ // jump out if there are no bfs WGs
+ goto l_build_qnodes if (REG_loop_break);
+
+ // dispatch new BFS1 WGs
+ DISPATCHDIM_X = REG_num_bfs_wgs;
+ dispatch_indirect opencl_build_kernel_BinnedSAH_BFS_pass1_indexed
+ args( p_scheduler,
+ build_args.p_sah_globals )
+ postsync store_dword( p_scheduler_postsync, 2 );
+
+ semaphore_wait while( *p_scheduler_postsync != 2 );
+
+ // dispatch new BFS2 WGs
+ dispatch_indirect opencl_build_kernel_BinnedSAH_BFS_pass2_indexed
+ args( p_scheduler,
+ build_args.p_sah_globals )
+ postsync store_dword( p_scheduler_postsync, 1 );
+
+ //goto l_build_loop if not(REG_num_dfs_wgs); //TODO: maybe add some logic to do "bool have_dfs_work" which will add some cycles but may be faster than checking semaphore
+
+ // wait until all upcoming DFS WGs have finished launching
+ // so that the scheduler can refill the launch array
+ // TODO_OPT: Look at replacing this with a ring buffer so that scheduler stalls instead (and only rarely)
+ semaphore_wait while( *p_num_dfs_wgs != 0 );
+
+
+ goto l_build_loop;
+ }
+ }
+
+l_build_qnodes:
+
+ control( wait_idle );
+
+ // Producer/consumer (P/C) qnode build
+
+ dispatch opencl_build_kernel_BinnedSAH_BuildQNodes_Kickoff (1,1,1)
+ args( build_args.p_sah_globals,
+ build_args.p_qnode_child_buffer,
+ build_args.sah_build_flags );
+
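+ // Producer/consumer loop over p_root_buffer_counters: each pass loads the 'produced' counter,
+ // hands the newly produced range [consumed, produced) to ceil(count / 2) Amplify work groups,
+ // and repeats until no new entries were produced.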
+ {
+ define p_pc_counters ( build_args.p_root_buffer_counters );
+
+ define REG_addr REG0;
+ define REG_produced REG1;
+ define REG_consumed REG2;
+ define REG_have_work REG3;
+ define REG_wg_count REG4;
+ define C_8 REG5;
+ define C_16 REG6;
+ define C_1 REG7;
+ C_1 = 1;
+ C_8 = 8;
+ C_16 = 16;
+ REG_addr = build_args.p_root_buffer_counters; // HINT: should we use REG_addr or just pass separate arguments to metakernel to avoid add/sub from address
+
+ REG_consumed = 0;
+
+ l_qnode_loop:
+
+ control( wait_idle ); // wait for previous pass
+
+ // load counters and compute number of wgs to respawn
+ REG_produced = load_qword( REG_addr ); REG_addr = REG_addr + C_8;
+ REG_wg_count = REG_produced - REG_consumed;
+ REG_have_work = REG_wg_count > 0;
+
+ goto l_done if not(REG_have_work.lo);
+
+ // save REG_consumed as a starting position in p_qnode_child_buffer
+ store_qword(REG_addr, REG_consumed); REG_addr = REG_addr + C_8;
+
+ // save REG_produced as ending position in p_qnode_child_buffer
+ store_qword(REG_addr, REG_produced); REG_addr = REG_addr - C_16;
+
+ REG_consumed = REG_consumed + REG_wg_count; // update consumed for next iteration
+
+ // calculate amount of workgroups to schedule
+ REG_wg_count = REG_wg_count + C_1;
+ REG_wg_count = REG_wg_count >> C_1;
+
+ DISPATCHDIM_X = REG_wg_count.lo;
+
+ control( cs_store_fence ); // commit the stores
+
+ dispatch_indirect opencl_build_kernel_BinnedSAH_BuildQNodes_Amplify
+ args( build_args.p_sah_globals,
+ build_args.p_qnode_child_buffer,
+ build_args.sah_build_flags);
+
+ goto l_qnode_loop;
+ }
+
+l_done:
+}
+
+
+
+
+
+
+
+
+
+struct SAHBuildArgsBatchable
+{
+ qword p_globals_ptrs;
+ qword p_scheduler;
+ qword p_buffers_info;
+ qword p_sah_globals;
+
+ dword num_max_qnode_global_root_buffer_entries;
+ dword num_builds;
+
+};
+
+
+metakernel new_sah_build_batchable( SAHBuildArgsBatchable build_args )
+{
+ define p_scheduler build_args.p_scheduler;
+ define p_scheduler_postsync (build_args.p_scheduler + offsetof(Scheduler.scheduler_postsync) );
+ define p_num_dfs_wgs (build_args.p_scheduler + offsetof(Scheduler.num_dfs_wgs));
+
+ // initialize scheduler semaphore
+ REG0.lo = 0;
+ store_dword( p_scheduler_postsync, REG0.lo );
+
+
+ // dispatch categorization pass
+ dispatch opencl_build_kernel_BinnedSAH_categorize_builds_and_init_scheduler(2,1,1)
+ args(
+ build_args.p_scheduler,
+ build_args.p_globals_ptrs,
+ build_args.p_buffers_info,
+ build_args.p_sah_globals,
+ build_args.num_builds
+ )
+ postsync store_dword( p_scheduler_postsync, 1 );
+
+ // wait on the categorization pass
+ semaphore_wait while( *p_scheduler_postsync != 1 );
+
+
+ // dispatch the trivial and single-WG passes
+ {
+ REG0 = load_qword( build_args.p_scheduler + offsetof(Scheduler.num_trivial_builds) );
+ DISPATCHDIM_X = REG0.lo;
+ DISPATCHDIM_Y = 1;
+ DISPATCHDIM_Z = 1;
+
+ // dispatch trivial builds
+
+ dispatch_indirect opencl_build_kernel_DFS_trivial_batch
+ args( build_args.p_sah_globals );
+
+ control( wait_idle );
+
+ // dispatch single-wg builds
+
+ DISPATCHDIM_X = REG0.hi;
+ dispatch_indirect opencl_build_kernel_DFS_single_wg_batch
+ args( build_args.p_sah_globals, build_args.p_scheduler );
+ }
+
+ // compute the number of builds not covered by the trivial and single-WG passes
+ // skip the builder loop if all builds were satisfied by those passes
+ {
+ REG1 = REG0.lo;
+ REG2 = REG0.hi;
+ REG3 = build_args.num_builds;
+ REG5 = REG2 + REG1;
+ REG5 = REG3 - REG5;
+ REG4 = REG5 == 0 ;
+
+ goto l_done if (REG4.lo);
+ }
+
+ // REG5 (number of non-trivial builds) will be used to launch build_qnodes kernel after the build loop
+ define REG_num_nontrivial REG5;
+
+l_build_outer_loop:
+ {
+
+ // configure the scheduler to initiate a new block of builds
+
+ dispatch opencl_build_kernel_BinnedSAH_begin_batched (1,1,1)
+ args( build_args.p_scheduler, build_args.p_sah_globals )
+ postsync store_dword( p_scheduler_postsync, 0 );
+
+ // wait on init kernel
+ semaphore_wait while( *p_scheduler_postsync != 0 );
+
+
+ // read results produced by scheduler init kernel
+ // lo == BFS wg count. hi == all ones if we need to loop again
+ //
+ REG0 = build_args.p_scheduler + offsetof(Scheduler.batched_build_wg_count);
+ REG4 = load_qword( REG0 );
+
+ // launch BFS1 pass1
+ DISPATCHDIM_X = REG4.lo;
+ dispatch_indirect opencl_build_kernel_BinnedSAH_BFS_pass1_initial_batch
+ args( build_args.p_scheduler,
+ build_args.p_sah_globals)
+ postsync store_dword( p_scheduler_postsync, 1 );
+
+ // wait on BFS pass1
+ semaphore_wait while( *p_scheduler_postsync != 1 );
+
+ // launch BFS pass2
+ dispatch_indirect opencl_build_kernel_BinnedSAH_BFS_pass2_initial_batch
+ args( build_args.p_scheduler,
+ build_args.p_sah_globals )
+ postsync store_dword( p_scheduler_postsync, 0 );
+
+ l_build_loop:
+ {
+ semaphore_wait while( *p_scheduler_postsync != 0 );
+
+ {
+ dispatch opencl_build_kernel_BinnedSAH_scheduler(1,1,1)
+ args( build_args.p_scheduler, build_args.p_sah_globals )
+ postsync store_dword( p_scheduler_postsync, 1 );
+
+ // wait on the scheduler
+ semaphore_wait while( *p_scheduler_postsync != 1 );
+ }
+
+ // load and process the scheduler results
+ define REG_wg_counts REG0;
+ define REG_num_bfs_wgs REG0.lo;
+ define REG_num_dfs_wgs REG0.hi;
+ define REG_loop_break REG1;
+ define REG_p_scheduler REG2;
+ {
+ REG_p_scheduler = p_scheduler;
+ REG_wg_counts = load_qword( REG_p_scheduler );
+
+ define C_MASK_LO REG3 ;
+ C_MASK_LO = 0xffffffff;
+
+ REG_loop_break = REG_wg_counts & C_MASK_LO;
+ REG_loop_break = REG_loop_break == 0;
+ }
+
+ // dispatch new DFS WGs
+ DISPATCHDIM_X = REG_num_dfs_wgs;
+ dispatch_indirect opencl_build_kernel_BinnedSAH_DFS
+ args( p_scheduler,
+ build_args.p_sah_globals );
+
+ // jump out if there are no bfs WGs
+ goto l_continue_outer_loop if (REG_loop_break);
+
+ // dispatch new BFS1 WGs
+ DISPATCHDIM_X = REG_num_bfs_wgs;
+ dispatch_indirect opencl_build_kernel_BinnedSAH_BFS_pass1_indexed_batch
+ args( p_scheduler,
+ build_args.p_sah_globals )
+ postsync store_dword( p_scheduler_postsync, 2 );
+
+ semaphore_wait while( *p_scheduler_postsync != 2 );
+
+ // dispatch new BFS2 WGs
+ dispatch_indirect opencl_build_kernel_BinnedSAH_BFS_pass2_indexed_batch
+ args( p_scheduler,
+ build_args.p_sah_globals )
+ postsync store_dword( p_scheduler_postsync, 0 );
+
+ //goto l_build_loop if not(REG_num_dfs_wgs); //TODO: maybe add some logic to do "bool have_dfs_work" which will add some cycles but may be faster than checking semaphore
+
+ // wait until all upcoming DFS WGs have finished launching
+ // so that the scheduler can refill the launch array
+ // TODO_OPT: Look at replacing this with a ring buffer so that scheduler stalls instead (and only rarely)
+ semaphore_wait while( *p_num_dfs_wgs != 0 );
+
+ goto l_build_loop;
+ }
+
+
+ l_continue_outer_loop:
+
+
+ goto l_build_outer_loop if(REG4.hi);
+
+ }
+
+////////
+//
+// Qnode build phase
+//
+////////
+
+ // Wait for all outstanding DFS dispatches to complete, then build the QNodes
+ control( wait_idle );
+
+ define REG_wg_counts REG1;
+ define REG_p_scheduler REG2;
+ define REG_have_work REG3;
+ define REG_GRB_NUM_MAX_ENTRIES REG4;
+
+ // init scheduler for qnode phase
+ dispatch opencl_build_kernel_BinnedSAH_qnode_init_scheduler_batched(1,1,1)
+ args( build_args.p_scheduler,
+ build_args.num_builds,
+ build_args.num_max_qnode_global_root_buffer_entries);
+
+ REG_p_scheduler = p_scheduler;
+
+ control( wait_idle );
+
+ REG_wg_counts = load_qword( REG_p_scheduler );
+
+ DISPATCHDIM_X = REG_wg_counts.lo;
+
+ // configure the scheduler to initiate a new block of builds
+ dispatch_indirect opencl_build_kernel_BinnedSAH_qnode_begin_batched
+ args( build_args.p_scheduler,
+ build_args.p_sah_globals);
+
+ // read results produced by init scheduler kernel
+ // lo == num of builds processed. hi == num of maximum global root buffer entries
+ //
+ REG0 = build_args.p_scheduler + offsetof(Scheduler.batched_build_wg_count);
+ REG5 = load_qword( REG0 );
+
+ REG_GRB_NUM_MAX_ENTRIES.lo = REG5.hi;
+ REG_GRB_NUM_MAX_ENTRIES.hi = 0;
+
+l_qnode_loop:
+ {
+ control( wait_idle ); // wait for previous pass
+
+ dispatch opencl_build_kernel_BinnedSAH_qnode_scheduler(1,1,1) args( build_args.p_scheduler );
+
+ control( wait_idle );
+
+ REG_wg_counts = load_qword( REG_p_scheduler );
+ REG_have_work = REG_wg_counts > 0;
+
+ goto l_done if not(REG_have_work.lo);
+
+ DISPATCHDIM_X = REG_wg_counts.lo;
+
+ dispatch_indirect opencl_build_kernel_BinnedSAH_BuildQNodes_Amplify_batch
+ args( build_args.p_sah_globals,
+ build_args.p_scheduler );
+
+ control( wait_idle );
+
+ REG_wg_counts = load_qword( REG_p_scheduler ); // reload values
+ REG_wg_counts.lo = REG_wg_counts.hi;
+ REG_wg_counts.hi = 0;
+
+ REG_have_work = REG_wg_counts < REG_GRB_NUM_MAX_ENTRIES;
+
+ goto l_qnode_loop if not(REG_have_work.lo);
+
+ DISPATCHDIM_X = REG5.lo; // dispatch single workgroup for each build scheduled
+
+ dispatch_indirect opencl_build_kernel_BinnedSAH_BuildQnodes_TryToFillGRB_batched
+ args( build_args.p_sah_globals,
+ build_args.p_scheduler );
+
+ goto l_qnode_loop;
+ }
+
+////////
+//
+// Old implementation - TODO: maybe add switch between two implementations?
+//
+////////
+ // Wait for all outstanding DFS dispatches to complete, then build the QNodes
+ //DISPATCHDIM_X = REG5.lo;
+
+ //dispatch_indirect opencl_build_kernel_BinnedSAH_BuildQNodes
+ // args( build_args.p_sah_globals, build_args.p_scheduler );
+
+
+l_done:
+
+ control( wait_idle );
+
+}
diff --git a/src/intel/vulkan/grl/gpu/postbuild_info.grl b/src/intel/vulkan/grl/gpu/postbuild_info.grl
new file mode 100644
index 00000000000..3039e533a9b
--- /dev/null
+++ b/src/intel/vulkan/grl/gpu/postbuild_info.grl
@@ -0,0 +1,49 @@
+//
+// Copyright (C) 2009-2021 Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+//
+//
+
+module postbuild_info; // In postbuild we assume output data structure to be DXR compatible
+
+kernel compacted_size < source="bvh_postbuild_info.cl", kernelFunction="compacted_size" >
+kernel current_size < source="bvh_postbuild_info.cl", kernelFunction="current_size" >
+kernel serialized_size < source="bvh_postbuild_info.cl", kernelFunction="serialized_size" >
+kernel decoded_size < source="bvh_postbuild_info.cl", kernelFunction="decoded_size" >
+
+metakernel compacted_size(
+ qword bvh,
+ qword postbuildInfo)
+{
+ dispatch compacted_size(1,1,1) args(
+ bvh,
+ postbuildInfo);
+}
+
+metakernel current_size(
+ qword bvh,
+ qword postbuildInfo)
+{
+ dispatch current_size(1,1,1) args(
+ bvh,
+ postbuildInfo);
+}
+
+metakernel serialized_size(
+ qword bvh,
+ qword postbuildInfo)
+{
+ dispatch serialized_size(1,1,1) args(
+ bvh,
+ postbuildInfo);
+}
+
+metakernel decoded_size(
+ qword bvh,
+ qword postbuildInfo)
+{
+ dispatch decoded_size(1,1,1) args(
+ bvh,
+ postbuildInfo);
+}
diff --git a/src/intel/vulkan/grl/gpu/presplit.grl b/src/intel/vulkan/grl/gpu/presplit.grl
new file mode 100644
index 00000000000..d0f6e53fbb1
--- /dev/null
+++ b/src/intel/vulkan/grl/gpu/presplit.grl
@@ -0,0 +1,62 @@
+//
+// Copyright (C) 2009-2021 Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+//
+//
+
+module presplit;
+
+kernel_module presplit_kernels ("bvh_build_presplit.cl")
+{
+ links lsc_intrinsics;
+
+ kernel opencl_kernel_compute_num_presplits < kernelFunction="compute_num_presplits" >;
+ kernel opencl_kernel_priority_sum < kernelFunction="priority_sum" >;
+ kernel opencl_kernel_perform_presplits < kernelFunction="perform_presplits" >;
+}
+
+import struct MKBuilderState "structs.grl";
+import struct MKSizeEstimate "structs.grl";
+
+
+metakernel compute_num_presplits(
+ MKBuilderState state,
+ qword presplit_buffer,
+ dword numHwThreads )
+{
+ dispatch opencl_kernel_compute_num_presplits ( numHwThreads, 1, 1 ) args(
+ state.build_globals,
+ state.bvh_buffer,
+ state.build_primref_buffer,
+ presplit_buffer,
+ state.geomDesc_buffer );
+}
+
+
+metakernel priority_sum(
+ MKBuilderState state,
+ MKSizeEstimate estimate,
+ qword presplit_buffer )
+{
+ dispatch opencl_kernel_priority_sum ( 1, 1, 1 ) args(
+ state.build_globals,
+ presplit_buffer,
+ estimate.numPrimitivesToSplit / 2 );
+}
+
+metakernel perform_presplits(
+ MKBuilderState state,
+ MKSizeEstimate estimate,
+ qword presplit_buffer,
+ dword numHwThreads )
+{
+ dispatch opencl_kernel_perform_presplits ( numHwThreads, 1, 1 ) args(
+ state.build_globals,
+ state.bvh_buffer,
+ state.build_primref_buffer,
+ presplit_buffer,
+ state.bvh_buffer,
+ state.geomDesc_buffer,
+ estimate.numPrimitivesToSplit / 2 );
+}
diff --git a/src/intel/vulkan/grl/gpu/qbvh6.h b/src/intel/vulkan/grl/gpu/qbvh6.h
new file mode 100644
index 00000000000..22260d07f41
--- /dev/null
+++ b/src/intel/vulkan/grl/gpu/qbvh6.h
@@ -0,0 +1,933 @@
+//
+// Copyright (C) 2009-2021 Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+//
+//
+
+#pragma once
+
+#include "GRLGen12.h"
+
+#include "shared.h"
+#include "quad.h"
+
+/* ====== GENERAL BVH config ====== */
+
+#define BVH_NODE_N6 6
+#define BVH_NODE_N 8
+#define BVH_NODE_N_LOG 3
+
+#define SAH_LOG_BLOCK_SHIFT 2
+#define BVH_LEAF_N_MIN BVH_NODE_N6
+#define BVH_LEAF_N_MAX BVH_NODE_N6
+
+#define BVH_NODE_DEFAULT_MASK 0xff
+#define BVH_NODE_DEGENERATED_MASK 0x00
+
+/* ====== QUANTIZATION config ====== */
+
+#define QUANT_BITS 8
+#define QUANT_MIN 0
+#define QUANT_MAX 255
+#define QUANT_MAX_MANT (255.0f / 256.0f)
+
+#define NO_NODE_OFFSET 0
+
+/* ======================================================================= */
+/* ============================== BVH BASE =============================== */
+/* ======================================================================= */
+
+GRL_INLINE void setBVHBaseBounds(struct BVHBase *base, struct AABB *aabb)
+{
+ base->Meta.bounds.lower[0] = aabb->lower.x;
+ base->Meta.bounds.lower[1] = aabb->lower.y;
+ base->Meta.bounds.lower[2] = aabb->lower.z;
+
+ base->Meta.bounds.upper[0] = aabb->upper.x;
+ base->Meta.bounds.upper[1] = aabb->upper.y;
+ base->Meta.bounds.upper[2] = aabb->upper.z;
+}
+
+GRL_INLINE global struct QBVHNodeN *BVHBase_nodeData(struct BVHBase *bvh)
+{
+ return (global struct QBVHNodeN *)((void *)bvh + BVH_ROOT_NODE_OFFSET);
+}
+
+GRL_INLINE global struct QBVHNodeN *BVHBase_rootNode(struct BVHBase *bvh)
+{
+ return (global struct QBVHNodeN *)((void *)bvh + BVH_ROOT_NODE_OFFSET);
+}
+
+GRL_INLINE global struct Quad *BVHBase_quadLeaves(struct BVHBase *bvh)
+{
+ return (global struct Quad *)((void *)bvh + 64 * (ulong)bvh->quadLeafStart);
+}
+
+GRL_INLINE uint64_t BVHBase_numNodes(struct BVHBase *bvh)
+{
+ return bvh->nodeDataCur - BVH_ROOT_NODE_OFFSET / 64;
+}
+
+GRL_INLINE uint64_t BVHBase_numQuads(struct BVHBase *bvh)
+{
+ return bvh->quadLeafCur - bvh->quadLeafStart;
+}
+
+GRL_INLINE uint64_t BVHBase_numProcedurals(struct BVHBase *bvh)
+{
+ return bvh->proceduralDataCur - bvh->proceduralDataStart;
+}
+
+GRL_INLINE uint64_t BVHBase_numInstances(struct BVHBase *bvh)
+{
+ return bvh->instanceLeafEnd - bvh->instanceLeafStart;
+}
+
+/* =================================================================== */
+/* ============================== QBVH =============================== */
+/* =================================================================== */
+
+__constant const float ulp = FLT_EPSILON;
+
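+// Inflate an AABB on each side by FLT_EPSILON times its largest absolute coordinate,
+// yielding a slightly enlarged, conservative box.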
+GRL_INLINE struct AABB conservativeAABB(struct AABB *aabb)
+{
+ struct AABB box;
+ const float4 v4 = max(fabs(aabb->lower), fabs(aabb->upper));
+ const float v = ulp * max(v4.x, max(v4.y, v4.z));
+ box.lower = aabb->lower - (float4)v;
+ box.upper = aabb->upper + (float4)v;
+ return box;
+}
+
+GRL_INLINE struct AABB3f conservativeAABB3f(struct AABB3f* aabb3d)
+{
+ struct AABB aabb4d = AABBfromAABB3f(*aabb3d);
+ struct AABB box = conservativeAABB(&aabb4d);
+ return AABB3fFromAABB(box);
+}
+
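+// Quantized per-child bounds: each child AABB is stored as 8-bit lower/upper coordinates
+// relative to the node origin (QBVHNodeN::lower), scaled per axis by 2^(exp[axis] - 8);
+// see extractAABB_QBVHNodeN for the decoding.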
+struct QBVH_AABB
+{
+ uchar lower_x[BVH_NODE_N6];
+ uchar upper_x[BVH_NODE_N6];
+ uchar lower_y[BVH_NODE_N6];
+ uchar upper_y[BVH_NODE_N6];
+ uchar lower_z[BVH_NODE_N6];
+ uchar upper_z[BVH_NODE_N6];
+};
+
+struct QBVHNodeN
+{
+ float lower[3];
+ int offset;
+ // 16 bytes
+ uchar type;
+ uchar pad;
+ // 18 bytes
+ char exp[3];
+ uchar instMask;
+ // 22 bytes
+ uchar childData[6];
+ // 28 bytes
+ struct QBVH_AABB qbounds; // + 36 bytes
+ // 64 bytes
+};
+
+GRL_INLINE uint QBVHNodeN_blockIncr(struct QBVHNodeN *This, uint childID)
+{
+ return This->childData[childID] & 0x3;
+}
+
+GRL_INLINE uint QBVHNodeN_startPrim(struct QBVHNodeN *This, uint childID)
+{
+ return (This->childData[childID] >> 2) & 0xF;
+}
+
+GRL_INLINE void initQBVHNodeN(struct QBVHNodeN *qnode)
+{
+ uint *ptr = (uint *)qnode;
+ for (uint i = 0; i < 16; i++)
+ ptr[i] = 0;
+}
+
+GRL_INLINE struct AABB extractAABB_QBVHNodeN(struct QBVHNodeN *qnode, uint i)
+{
+ struct AABB aabb;
+ const float4 base = (float4)(qnode->lower[0], qnode->lower[1], qnode->lower[2], 0.0f);
+ const int4 lower_i = (int4)(qnode->qbounds.lower_x[i], qnode->qbounds.lower_y[i], qnode->qbounds.lower_z[i], 0);
+ const int4 upper_i = (int4)(qnode->qbounds.upper_x[i], qnode->qbounds.upper_y[i], qnode->qbounds.upper_z[i], 0);
+ const int4 exp_i = (int4)(qnode->exp[0], qnode->exp[1], qnode->exp[2], 0.0f);
+ aabb.lower = base + bitShiftLdexp4(convert_float4_rtn(lower_i), exp_i - 8);
+ aabb.upper = base + bitShiftLdexp4(convert_float4_rtp(upper_i), exp_i - 8);
+ return aabb;
+}
+
+GRL_INLINE struct AABB getAABB_QBVHNodeN(struct QBVHNodeN *qnode)
+{
+ struct AABB aabb;
+#if 0
+ AABB_init(&aabb);
+ for (uint i = 0; i < BVH_NODE_N6; i++)
+ {
+ struct AABB v = extractAABB_QBVHNodeN(qnode, i);
+ AABB_extend(&aabb, &v);
+ }
+#else
+ uint lower_x = qnode->qbounds.lower_x[0];
+ uint lower_y = qnode->qbounds.lower_y[0];
+ uint lower_z = qnode->qbounds.lower_z[0];
+
+ uint upper_x = qnode->qbounds.upper_x[0];
+ uint upper_y = qnode->qbounds.upper_y[0];
+ uint upper_z = qnode->qbounds.upper_z[0];
+
+ for (uint i = 1; i < BVH_NODE_N6; i++)
+ {
+ uint lx = qnode->qbounds.lower_x[i];
+ uint ly = qnode->qbounds.lower_y[i];
+ uint lz = qnode->qbounds.lower_z[i];
+
+ uint ux = qnode->qbounds.upper_x[i];
+ uint uy = qnode->qbounds.upper_y[i];
+ uint uz = qnode->qbounds.upper_z[i];
+
+ bool valid = lx <= ux;
+ if (valid)
+ {
+ lower_x = min(lower_x, lx);
+ lower_y = min(lower_y, ly);
+ lower_z = min(lower_z, lz);
+
+ upper_x = max(upper_x, ux);
+ upper_y = max(upper_y, uy);
+ upper_z = max(upper_z, uz);
+ }
+ }
+
+ const float4 base = (float4)(qnode->lower[0], qnode->lower[1], qnode->lower[2], 0.0f);
+ const int4 lower_i = (int4)(lower_x, lower_y, lower_z, 0);
+ const int4 upper_i = (int4)(upper_x, upper_y, upper_z, 0);
+ const int4 exp_i = (int4)(qnode->exp[0], qnode->exp[1], qnode->exp[2], 0.0f);
+ aabb.lower = base + bitShiftLdexp4(convert_float4_rtn(lower_i), exp_i - 8);
+ aabb.upper = base + bitShiftLdexp4(convert_float4_rtp(upper_i), exp_i - 8);
+#endif
+ return aabb;
+}
+
+GRL_INLINE struct AABB3f InternalNode_getAABB3f(struct InternalNode* node)
+{
+ return AABB3fFromAABB(getAABB_QBVHNodeN((struct QBVHNodeN*)node));
+}
+
+GRL_INLINE uint getNumChildren_QBVHNodeN(struct QBVHNodeN *qnode)
+{
+ uint children = 0;
+ for (uint i = 0; i < BVH_NODE_N6; i++)
+ {
+ uint lx = qnode->qbounds.lower_x[i];
+ uint ux = qnode->qbounds.upper_x[i];
+ bool valid = lx <= ux;
+ if (valid)
+ children++;
+ }
+ return children;
+}
+
+GRL_INLINE long extractQBVHNodeN_offset(struct QBVHNodeN *qnode)
+{
+ return ((long)qnode->offset) << 6;
+}
+
+GRL_INLINE void *QBVHNodeN_childrenPointer(struct QBVHNodeN *qnode)
+{
+ const int offset = qnode->offset;
+ return (void *)(qnode + offset);
+}
+
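+// Fill a QBVHNodeN from per-lane child AABBs: the node origin and per-axis exponents are derived
+// from the pre-reduced bounds of all children, then lanes 0..BVH_NODE_N6-1 each quantize and
+// write one child's bounds (degenerated children are collapsed to the node origin).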
+GRL_INLINE void subgroup_setQBVHNodeN_setFields_reduced_bounds(const int offset, const uint type, struct AABB* input_aabb, const uint numChildren, const uchar mask, struct QBVHNodeN* qbvh_node, const bool degenerated, struct AABB reduced_aabb)
+{
+ const uint subgroupLocalID = get_sub_group_local_id();
+ const uint k = subgroupLocalID;
+ const float up = 1.0f + ulp;
+ const float down = 1.0f - ulp;
+
+ struct AABB aabb = reduced_aabb; // needs to execute with full subgroup width
+ aabb = AABB_sub_group_broadcast(&aabb, 0);
+
+ if (subgroupLocalID < BVH_NODE_N6)
+ {
+ struct AABB conservative_aabb = conservativeAABB(&aabb);
+ const float3 len = AABB_size(&conservative_aabb).xyz * up;
+ int3 exp;
+ const float3 mant = frexp_vec3(len, &exp);
+ const float3 org = conservative_aabb.lower.xyz;
+
+ exp += (mant > (float3)QUANT_MAX_MANT ? (int3)1 : (int3)0);
+
+ qbvh_node->offset = offset;
+ qbvh_node->type = type;
+
+ qbvh_node->lower[0] = org.x;
+ qbvh_node->lower[1] = org.y;
+ qbvh_node->lower[2] = org.z;
+
+ qbvh_node->exp[0] = exp.x;
+ qbvh_node->exp[1] = exp.y;
+ qbvh_node->exp[2] = exp.z;
+
+ qbvh_node->instMask = mask;
+
+ uchar3 lower_uchar = (uchar3)(0x80);
+ uchar3 upper_uchar = (uchar3)(0);
+
+ if (subgroupLocalID < numChildren)
+ {
+ struct AABB child_aabb = conservativeAABB(input_aabb);
+
+ float3 lower = floor(bitShiftLdexp3((child_aabb.lower.xyz - org) * down, -exp + 8));
+ lower = clamp(lower, (float)(QUANT_MIN), (float)(QUANT_MAX));
+ float3 upper = ceil(bitShiftLdexp3((child_aabb.upper.xyz - org) * up, -exp + 8));
+ upper = clamp(upper, (float)(QUANT_MIN), (float)(QUANT_MAX));
+
+ lower_uchar = convert_uchar3_rtn(lower);
+ upper_uchar = convert_uchar3_rtp(upper);
+
+ if (degenerated)
+ {
+ lower_uchar = upper_uchar = 0;
+ }
+ }
+
+ qbvh_node->qbounds.lower_x[k] = lower_uchar.x;
+ qbvh_node->qbounds.lower_y[k] = lower_uchar.y;
+ qbvh_node->qbounds.lower_z[k] = lower_uchar.z;
+ qbvh_node->qbounds.upper_x[k] = upper_uchar.x;
+ qbvh_node->qbounds.upper_y[k] = upper_uchar.y;
+ qbvh_node->qbounds.upper_z[k] = upper_uchar.z;
+
+ qbvh_node->childData[k] = (type == NODE_TYPE_INSTANCE) ? 2 : 1;
+
+#if ENABLE_CONVERSION_CHECKS == 1
+
+ if (!(exp.x >= -128 && exp.x <= 127))
+ printf("exp_x error \n");
+ if (!(exp.y >= -128 && exp.y <= 127))
+ printf("exp_y error \n");
+ if (!(exp.z >= -128 && exp.z <= 127))
+ printf("exp_z error \n");
+
+ struct AABB child_qaabb = extractAABB_QBVHNodeN(qbvh_node, k);
+ if (!AABB_subset(&child_aabb, &child_qaabb))
+ {
+ uint3 lower_i = convert_uint3(lower_uchar);
+ uint3 upper_i = convert_uint3(upper_uchar);
+
+ printf("\n ERROR %d\n", k);
+ printf("lower %f upper %f \n lower_i %d upper_i %d \n", lower, upper, lower_i, upper_i);
+ printf("%i uncompressed \n", k);
+ AABB_print(&child_aabb);
+ printf("%i compressed \n", k);
+ AABB_print(&child_qaabb);
+
+ printf("%i uncompressed (as int) \n", k);
+ AABB_printasInt(&child_aabb);
+ printf("%i compressed (as int) \n", k);
+ AABB_printasInt(&child_qaabb);
+
+ int4 e0 = child_aabb.lower < child_qaabb.lower;
+ int4 e1 = child_aabb.upper > child_qaabb.upper;
+ printf("e0 %d e1 %d \n", e0, e1);
+ }
+#endif
+ }
+}
+
+GRL_INLINE void subgroup_setQBVHNodeN_setFields(const int offset, const uint type, struct AABB* input_aabb, const uint numChildren, const uchar mask, struct QBVHNodeN* qbvh_node, const bool degenerated)
+{
+ struct AABB aabb = AABB_sub_group_reduce_N6(input_aabb);
+ subgroup_setQBVHNodeN_setFields_reduced_bounds(offset, type, input_aabb, numChildren, mask, qbvh_node, degenerated, aabb);
+}
+
+GRL_INLINE void subgroup_setQBVHNodeN_setFields_2xSIMD8_in_SIMD16(const int offset, const uint type, struct AABB* input_aabb, const uint numChildren, const uchar mask, struct QBVHNodeN* qbvh_node, const bool degenerated, bool active_lane)
+{
+ const uint lane = get_sub_group_local_id() % 8;
+ const uint node_in_sg = get_sub_group_local_id() / 8;
+ const uint k = lane;
+ const float up = 1.0f + ulp;
+ const float down = 1.0f - ulp;
+
+ struct AABB aabb = AABB_sub_group_reduce_N6(input_aabb); // needs to execute with full subgroup width
+ aabb = AABB_sub_group_shuffle(&aabb, node_in_sg * 8);
+
+ if (lane < BVH_NODE_N6 && active_lane)
+ {
+ struct AABB conservative_aabb = conservativeAABB(&aabb);
+ const float3 len = AABB_size(&conservative_aabb).xyz * up;
+ int3 exp;
+ const float3 mant = frexp_vec3(len, &exp);
+ const float3 org = conservative_aabb.lower.xyz;
+
+ exp += (mant > (float3)QUANT_MAX_MANT ? (int3)1 : (int3)0);
+
+ qbvh_node->offset = offset;
+ qbvh_node->type = type;
+
+ qbvh_node->lower[0] = org.x;
+ qbvh_node->lower[1] = org.y;
+ qbvh_node->lower[2] = org.z;
+
+ qbvh_node->exp[0] = exp.x;
+ qbvh_node->exp[1] = exp.y;
+ qbvh_node->exp[2] = exp.z;
+
+ qbvh_node->instMask = mask;
+
+ uchar3 lower_uchar = (uchar3)(0x80);
+ uchar3 upper_uchar = (uchar3)(0);
+
+ if (lane < numChildren)
+ {
+ struct AABB child_aabb = conservativeAABB(input_aabb);
+
+ float3 lower = floor(bitShiftLdexp3((child_aabb.lower.xyz - org) * down, -exp + 8));
+ lower = clamp(lower, (float)(QUANT_MIN), (float)(QUANT_MAX));
+ float3 upper = ceil(bitShiftLdexp3((child_aabb.upper.xyz - org) * up, -exp + 8));
+ upper = clamp(upper, (float)(QUANT_MIN), (float)(QUANT_MAX));
+
+ lower_uchar = convert_uchar3_rtn(lower);
+ upper_uchar = convert_uchar3_rtp(upper);
+
+ if (degenerated)
+ {
+ lower_uchar = upper_uchar = 0;
+ }
+ }
+
+ qbvh_node->qbounds.lower_x[k] = lower_uchar.x;
+ qbvh_node->qbounds.lower_y[k] = lower_uchar.y;
+ qbvh_node->qbounds.lower_z[k] = lower_uchar.z;
+ qbvh_node->qbounds.upper_x[k] = upper_uchar.x;
+ qbvh_node->qbounds.upper_y[k] = upper_uchar.y;
+ qbvh_node->qbounds.upper_z[k] = upper_uchar.z;
+
+ qbvh_node->childData[k] = (type == NODE_TYPE_INSTANCE) ? 2 : 1;
+
+#if ENABLE_CONVERSION_CHECKS == 1
+
+ if (!(exp.x >= -128 && exp.x <= 127))
+ printf("exp_x error \n");
+ if (!(exp.y >= -128 && exp.y <= 127))
+ printf("exp_y error \n");
+ if (!(exp.z >= -128 && exp.z <= 127))
+ printf("exp_z error \n");
+
+ struct AABB child_qaabb = extractAABB_QBVHNodeN(qbvh_node, k);
+ if (!AABB_subset(&child_aabb, &child_qaabb))
+ {
+ uint3 lower_i = convert_uint3(lower_uchar);
+ uint3 upper_i = convert_uint3(upper_uchar);
+
+ printf("\n ERROR %d\n", k);
+ printf("lower %f upper %f \n lower_i %d upper_i %d \n", lower, upper, lower_i, upper_i);
+ printf("%i uncompressed \n", k);
+ AABB_print(&child_aabb);
+ printf("%i compressed \n", k);
+ AABB_print(&child_qaabb);
+
+ printf("%i uncompressed (as int) \n", k);
+ AABB_printasInt(&child_aabb);
+ printf("%i compressed (as int) \n", k);
+ AABB_printasInt(&child_qaabb);
+
+ int4 e0 = child_aabb.lower < child_qaabb.lower;
+ int4 e1 = child_aabb.upper > child_qaabb.upper;
+ printf("e0 %d e1 %d \n", e0, e1);
+ }
+#endif
+ }
+}
+
+GRL_INLINE void subgroup_setInstanceQBVHNodeN(const int offset, struct AABB *input_aabb, const uint numChildren, struct QBVHNodeN *qbvh_node, const uint instMask)
+{
+ const uint subgroupLocalID = get_sub_group_local_id();
+
+ // For a degenerated (or inactive) instance, ignore its box in the exp/origin calculation and make its box a point at the node origin.
+ // If it becomes non-degenerated on update, the tree topology will be equivalent to what it would have been had this degenerated node been accounted for here.
+ bool degenerated = (instMask == BVH_NODE_DEGENERATED_MASK);
+
+ struct AABB aabb;
+ AABB_init(&aabb);
+
+ // if every child is a degenerated (or inactive) instance, we need to init the aabb with the origin point
+ uchar commonMask = sub_group_reduce_or_N6(instMask);
+ if (subgroupLocalID < numChildren && (!degenerated || commonMask == BVH_NODE_DEGENERATED_MASK))
+ aabb = *input_aabb;
+
+ subgroup_setQBVHNodeN_setFields(offset, NODE_TYPE_INSTANCE, &aabb, numChildren, commonMask, qbvh_node, degenerated);
+}
+
+
+// return true if is degenerated
+GRL_INLINE bool subgroup_setInstanceBox_2xSIMD8_in_SIMD16(struct AABB* input_aabb, const uint numChildren, uchar* mask, const uint instMask, bool active_lane)
+{
+ const uint lane = get_sub_group_local_id() % 8;
+
+ // For a degenerated (or inactive) instance, ignore its box in the exp/origin calculation and make its box a point at the node origin.
+ // If it becomes non-degenerated on update, the tree topology will be equivalent to what it would have been had this degenerated node been accounted for here.
+ bool degenerated = (instMask == BVH_NODE_DEGENERATED_MASK);
+
+ // if every child is a degenerated (or inactive) instance, we need to init the aabb with the origin point
+ uchar commonMask = sub_group_reduce_or_N6_2xSIMD8_in_SIMD16(instMask);
+ if (active_lane)
+ *mask = commonMask;
+
+ if (active_lane && (degenerated && commonMask != BVH_NODE_DEGENERATED_MASK))
+ AABB_init(input_aabb);
+
+ return active_lane ? degenerated : false;
+}
+
+GRL_INLINE void subgroup_setInstanceQBVHNodeN_x2(const int offset, struct AABB* input_aabb, const uint numChildren, struct QBVHNodeN* qbvh_node, const uint instMask, bool active_lane)
+{
+ const uint lane = get_sub_group_local_id() % 8;
+
+ // For a degenerated (or inactive) instance, ignore this box in the exp/origin calculation and make its box a point at the node origin.
+ // If it becomes non-degenerated on update, the tree topology will be equivalent to what it would have been had this node been accounted for here.
+ bool degenerated = (instMask == BVH_NODE_DEGENERATED_MASK);
+
+ struct AABB aabb;
+ AABB_init(&aabb);
+
+ // If every child is a degenerated (or inactive) instance, we need to init the aabb with the origin point.
+ uchar commonMask = sub_group_reduce_or_N6_2xSIMD8_in_SIMD16(instMask);
+ if (lane < numChildren && (!degenerated || commonMask == BVH_NODE_DEGENERATED_MASK))
+ aabb = *input_aabb;
+
+ subgroup_setQBVHNodeN_setFields_2xSIMD8_in_SIMD16(offset, NODE_TYPE_INSTANCE, &aabb, numChildren, commonMask, qbvh_node, degenerated, active_lane);
+}
+
+
+GRL_INLINE void subgroup_setQBVHNodeN(const int offset, const uint type, struct AABB* input_aabb, const uint numChildren, struct QBVHNodeN* qbvh_node, uint mask)
+{
+ const uint subgroupLocalID = get_sub_group_local_id();
+
+ struct AABB aabb;
+ AABB_init(&aabb);
+
+ if (subgroupLocalID < numChildren)
+ aabb = *input_aabb;
+
+ subgroup_setQBVHNodeN_setFields(offset, type, &aabb, numChildren, mask, qbvh_node, false);
+}
+
+
+GRL_INLINE void subgroup_setQBVHNodeN_x2(const int offset, const uint type, struct AABB* input_aabb, const uint numChildren, struct QBVHNodeN* qbvh_node, bool active_lane)
+{
+ const uint lane = get_sub_group_local_id() % 8;
+
+ struct AABB aabb;
+ AABB_init(&aabb);
+
+ if (lane < numChildren)
+ aabb = *input_aabb;
+
+ subgroup_setQBVHNodeN_setFields_2xSIMD8_in_SIMD16(offset, type, &aabb, numChildren, BVH_NODE_DEFAULT_MASK, qbvh_node, false, active_lane);
+}
+
+
+GRL_INLINE void subgroup_QBVHNodeN_setBounds( uniform struct QBVHNodeN* qbvh_node,
+ uniform struct AABB reduced_bounds,
+ varying struct AABB input_aabb,
+ uniform uint numChildren,
+ varying ushort lane )
+{
+ const float up = 1.0f + ulp;
+ const float down = 1.0f - ulp;
+
+ int3 exp;
+
+ struct AABB conservative_aabb = conservativeAABB( &reduced_bounds);
+ const float3 len = AABB_size( &conservative_aabb ).xyz * up;
+ const float3 mant = frexp_vec3( len, &exp );
+ const float3 org = conservative_aabb.lower.xyz;
+
+ exp += (mant > ( float3 )QUANT_MAX_MANT ? (int3)1 : (int3)0);
+
+ qbvh_node->lower[0] = org.x;
+ qbvh_node->lower[1] = org.y;
+ qbvh_node->lower[2] = org.z;
+
+ qbvh_node->exp[0] = exp.x;
+ qbvh_node->exp[1] = exp.y;
+ qbvh_node->exp[2] = exp.z;
+
+ qbvh_node->instMask = 0xff;
+
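+ // Default to an empty (inverted) quantized box so lanes without a valid child store lower (0x80) > upper (0).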
+ uchar3 lower_uchar = 0x80;
+ uchar3 upper_uchar = 0;
+
+ if ( lane < BVH_NODE_N6 )
+ {
+ ushort k = lane;
+ if( lane < numChildren )
+ {
+ struct AABB child_aabb = conservativeAABB( &input_aabb ); // conservative ???
+
+ float3 lower = floor( bitShiftLdexp3( (child_aabb.lower.xyz - org) * down, -exp + 8 ) );
+ lower = clamp( lower, (float)(QUANT_MIN), (float)(QUANT_MAX) );
+ float3 upper = ceil( bitShiftLdexp3( (child_aabb.upper.xyz - org) * up, -exp + 8 ) );
+ upper = clamp( upper, (float)(QUANT_MIN), (float)(QUANT_MAX) );
+
+ lower_uchar = convert_uchar3_rtn( lower );
+ upper_uchar = convert_uchar3_rtp( upper );
+ }
+
+ qbvh_node->qbounds.lower_x[k] = lower_uchar.x;
+ qbvh_node->qbounds.lower_y[k] = lower_uchar.y;
+ qbvh_node->qbounds.lower_z[k] = lower_uchar.z;
+ qbvh_node->qbounds.upper_x[k] = upper_uchar.x;
+ qbvh_node->qbounds.upper_y[k] = upper_uchar.y;
+ qbvh_node->qbounds.upper_z[k] = upper_uchar.z;
+ }
+
+}
+
+GRL_INLINE void QBVHNodeN_setBounds(struct QBVHNodeN *qbvh_node, struct AABB *input_aabb, const uint numChildren)
+{
+ const float up = 1.0f + ulp;
+ const float down = 1.0f - ulp;
+
+ int3 exp;
+ struct AABB aabb;
+ AABB_init(&aabb);
+ for (uint i = 0; i < numChildren; i++)
+ AABB_extend(&aabb, &input_aabb[i]);
+
+ struct AABB conservative_aabb = conservativeAABB(&aabb);
+ const float3 len = AABB_size(&conservative_aabb).xyz * up;
+ const float3 mant = frexp_vec3(len, &exp);
+ const float3 org = conservative_aabb.lower.xyz;
+
+ exp += (mant > (float3)QUANT_MAX_MANT ? (int3)1 : (int3)0);
+
+ qbvh_node->lower[0] = org.x;
+ qbvh_node->lower[1] = org.y;
+ qbvh_node->lower[2] = org.z;
+
+ qbvh_node->exp[0] = exp.x;
+ qbvh_node->exp[1] = exp.y;
+ qbvh_node->exp[2] = exp.z;
+
+ qbvh_node->instMask = 0xff;
+
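+ // Quantize each child's bounds to 8 bits per axis: coordinates are stored relative to 'org' and
+ // scaled by 2^(8 - exp); lower bounds round down and upper bounds round up so the quantized box
+ // conservatively contains the original child box.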
+ for (uint k = 0; k < numChildren; k++)
+ {
+ struct AABB child_aabb = conservativeAABB(&input_aabb[k]); // conservative ???
+
+ float3 lower = floor(bitShiftLdexp3((child_aabb.lower.xyz - org) * down, -exp + 8));
+ lower = clamp(lower, (float)(QUANT_MIN), (float)(QUANT_MAX));
+ float3 upper = ceil(bitShiftLdexp3((child_aabb.upper.xyz - org) * up, -exp + 8));
+ upper = clamp(upper, (float)(QUANT_MIN), (float)(QUANT_MAX));
+
+ uchar3 lower_uchar = convert_uchar3_rtn(lower);
+ uchar3 upper_uchar = convert_uchar3_rtp(upper);
+
+ qbvh_node->qbounds.lower_x[k] = lower_uchar.x;
+ qbvh_node->qbounds.lower_y[k] = lower_uchar.y;
+ qbvh_node->qbounds.lower_z[k] = lower_uchar.z;
+ qbvh_node->qbounds.upper_x[k] = upper_uchar.x;
+ qbvh_node->qbounds.upper_y[k] = upper_uchar.y;
+ qbvh_node->qbounds.upper_z[k] = upper_uchar.z;
+
+#if ENABLE_CONVERSION_CHECKS == 1
+ if (!(exp.x >= -128 && exp.x <= 127))
+ printf("exp_x error \n");
+ if (!(exp.y >= -128 && exp.y <= 127))
+ printf("exp_y error \n");
+ if (!(exp.z >= -128 && exp.z <= 127))
+ printf("exp_z error \n");
+
+ struct AABB child_qaabb = extractAABB_QBVHNodeN(qbvh_node, k);
+ if (!AABB_subset(&child_aabb, &child_qaabb))
+ {
+ uint3 lower_i = convert_uint3(lower_uchar);
+ uint3 upper_i = convert_uint3(upper_uchar);
+
+ printf("\n ERROR %d\n", k);
+ printf("lower %f upper %f \n lower_i %d upper_i %d \n", lower, upper, lower_i, upper_i);
+ printf("%i uncompressed \n", k);
+ AABB_print(&child_aabb);
+ printf("%i compressed \n", k);
+ AABB_print(&child_qaabb);
+
+ printf("%i uncompressed (as int) \n", k);
+ AABB_printasInt(&child_aabb);
+ printf("%i compressed (as int) \n", k);
+ AABB_printasInt(&child_qaabb);
+
+ int4 e0 = child_aabb.lower < child_qaabb.lower;
+ int4 e1 = child_aabb.upper > child_qaabb.upper;
+ printf("e0 %d e1 %d \n", e0, e1);
+ }
+#endif
+ }
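+ // Pad the unused child slots with empty (inverted) boxes: quantized lower (0x80) > upper (0).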
+ for (uint k = numChildren; k < BVH_NODE_N6; k++)
+ {
+ qbvh_node->qbounds.lower_x[k] = 0x80;
+ qbvh_node->qbounds.lower_y[k] = 0x80;
+ qbvh_node->qbounds.lower_z[k] = 0x80;
+ qbvh_node->qbounds.upper_x[k] = 0;
+ qbvh_node->qbounds.upper_y[k] = 0;
+ qbvh_node->qbounds.upper_z[k] = 0;
+ }
+}
+
+GRL_INLINE void QBVHNodeN_setChildren(struct QBVHNodeN *qbvh_node, const int offset, const uint numChildren)
+{
+ qbvh_node->offset = offset;
+ for (uint k = 0; k < BVH_NODE_N6; k++)
+ qbvh_node->childData[k] = 1;
+}
+
+GRL_INLINE void QBVHNodeN_setChildIncr1(struct QBVHNodeN *qbvh_node)
+{
+ for (uint k = 0; k < BVH_NODE_N6; k++)
+ qbvh_node->childData[k] = 1;
+}
+
+GRL_INLINE void SUBGROUP_QBVHNodeN_setChildIncr1(struct QBVHNodeN *qbvh_node)
+{
+ if( get_sub_group_local_id() < BVH_NODE_N6 )
+ qbvh_node->childData[get_sub_group_local_id()] = 1;
+}
+
+
+GRL_INLINE void QBVHNodeN_setChildIncr2(struct QBVHNodeN *qbvh_node)
+{
+ for (uint k = 0; k < BVH_NODE_N6; k++)
+ qbvh_node->childData[k] = 2;
+}
+
+GRL_INLINE void QBVHNodeN_setType(struct QBVHNodeN *qbvh_node, const uint type)
+{
+ qbvh_node->type = type;
+}
+
+GRL_INLINE void setQBVHNodeN(const int offset, const uint type, struct AABB *input_aabb, const uint numChildren, struct QBVHNodeN *qbvh_node)
+{
+ QBVHNodeN_setType(qbvh_node, type);
+ QBVHNodeN_setChildren(qbvh_node, offset, numChildren);
+ QBVHNodeN_setBounds(qbvh_node, input_aabb, numChildren);
+}
+
+GRL_INLINE void printQBVHNodeN(struct QBVHNodeN *qnode)
+{
+ printf(" offset %d type %d \n", qnode->offset, (int)qnode->type);
+ printf(" lower %f %f %f \n", qnode->lower[0], qnode->lower[1], qnode->lower[2]);
+ printf(" exp %d %d %d \n", (int)qnode->exp[0], (int)qnode->exp[1], (int)qnode->exp[2]);
+ printf(" instMask %d \n", qnode->instMask);
+
+ struct AABB aabb0 = extractAABB_QBVHNodeN(qnode, 0);
+ struct AABB aabb1 = extractAABB_QBVHNodeN(qnode, 1);
+ struct AABB aabb2 = extractAABB_QBVHNodeN(qnode, 2);
+ struct AABB aabb3 = extractAABB_QBVHNodeN(qnode, 3);
+ struct AABB aabb4 = extractAABB_QBVHNodeN(qnode, 4);
+ struct AABB aabb5 = extractAABB_QBVHNodeN(qnode, 5);
+
+ printf(" lower_x %d %d %d %d %d %d %f %f %f %f %f %f\n", qnode->qbounds.lower_x[0], qnode->qbounds.lower_x[1], qnode->qbounds.lower_x[2], qnode->qbounds.lower_x[3], qnode->qbounds.lower_x[4], qnode->qbounds.lower_x[5], aabb0.lower.x, aabb1.lower.x, aabb2.lower.x, aabb3.lower.x, aabb4.lower.x, aabb5.lower.x);
+ printf(" upper_x %d %d %d %d %d %d %f %f %f %f %f %f\n", qnode->qbounds.upper_x[0], qnode->qbounds.upper_x[1], qnode->qbounds.upper_x[2], qnode->qbounds.upper_x[3], qnode->qbounds.upper_x[4], qnode->qbounds.upper_x[5], aabb0.upper.x, aabb1.upper.x, aabb2.upper.x, aabb3.upper.x, aabb4.upper.x, aabb5.upper.x);
+
+ printf(" lower_y %d %d %d %d %d %d %f %f %f %f %f %f\n", qnode->qbounds.lower_y[0], qnode->qbounds.lower_y[1], qnode->qbounds.lower_y[2], qnode->qbounds.lower_y[3], qnode->qbounds.lower_y[4], qnode->qbounds.lower_y[5], aabb0.lower.y, aabb1.lower.y, aabb2.lower.y, aabb3.lower.y, aabb4.lower.y, aabb5.lower.y);
+ printf(" upper_y %d %d %d %d %d %d %f %f %f %f %f %f\n", qnode->qbounds.upper_y[0], qnode->qbounds.upper_y[1], qnode->qbounds.upper_y[2], qnode->qbounds.upper_y[3], qnode->qbounds.upper_y[4], qnode->qbounds.upper_y[5], aabb0.upper.y, aabb1.upper.y, aabb2.upper.y, aabb3.upper.y, aabb4.upper.y, aabb5.upper.y);
+
+ printf(" lower_z %d %d %d %d %d %d %f %f %f %f %f %f\n", qnode->qbounds.lower_z[0], qnode->qbounds.lower_z[1], qnode->qbounds.lower_z[2], qnode->qbounds.lower_z[3], qnode->qbounds.lower_z[4], qnode->qbounds.lower_z[5], aabb0.lower.z, aabb1.lower.z, aabb2.lower.z, aabb3.lower.z, aabb4.lower.z, aabb5.lower.z);
+ printf(" upper_z %d %d %d %d %d %d %f %f %f %f %f %f\n", qnode->qbounds.upper_z[0], qnode->qbounds.upper_z[1], qnode->qbounds.upper_z[2], qnode->qbounds.upper_z[3], qnode->qbounds.upper_z[4], qnode->qbounds.upper_z[5], aabb0.upper.z, aabb1.upper.z, aabb2.upper.z, aabb3.upper.z, aabb4.upper.z, aabb5.upper.z);
+}
+
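+// Computes a child's offset relative to its parent's 64-byte-aligned position within bvh_mem.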
+GRL_INLINE int encodeOffset(global char *bvh_mem, global void *parent, int global_child_offset)
+{
+ long global_parent_offset = (long)parent - (long)bvh_mem;
+ global_parent_offset = global_parent_offset & (~(64 - 1)); // FIXME: (sw) this should not be necessary?
+ int relative_offset = global_child_offset - global_parent_offset; // FIXME: this limits BVH size to 4GB
+ //if ((int)relative_offset <= 0) printf("relative offset <= 0 %d global_child_offset %d global_parent_offset %d \n", relative_offset,global_child_offset,global_parent_offset);
+ return relative_offset;
+}
+
+GRL_INLINE void QBVH6Node_set_offset(struct QBVHNodeN *qnode, void *children)
+{
+ int ofs = (struct QBVHNodeN *)children - qnode;
+ qnode->offset = ofs;
+}
+
+GRL_INLINE void QBVH6Node_set_type(struct QBVHNodeN *qnode, uint type)
+{
+ qnode->type = type;
+}
+
+GRL_INLINE uint sortBVHChildrenIDs(uint input)
+{
+#if BVH_NODE_N == 8
+ return sort8_descending(input);
+#else
+ return sort4_descending(input);
+#endif
+}
+
+enum XFM_BOX_OPTION {
+ XFM_BOX_NO_CLIP = 0,
+ XFM_BOX_NOT_REFINED_CLIPPED = 1, //<< use clipbox; for non-refined nodes, compute the bbox from the children and transform after extending them into one box
+ XFM_BOX_NOT_REFINED_TAKE_CLIPBOX = 2 //<< use clipbox; for non-refined nodes, just transform the clipbox, don't take the children's boxes into account
+};
+
+#define DEB_PRINTFS 0
+#ifndef FINE_TRANSFORM_NODE_BOX
+#define FINE_TRANSFORM_NODE_BOX 0
+#endif
+
+GRL_INLINE struct AABB3f GRL_OVERLOADABLE compute_xfm_bbox(const float* xfm, InternalNode* pnode, enum XFM_BOX_OPTION clipOpt, const AABB3f* clipBox, float matrixTransformOverhead)
+{
+ AABB3f childrenbox;
+#if FINE_TRANSFORM_NODE_BOX
+ struct AffineSpace3f axfm = AffineSpace3f_load_row_major(xfm);
+ bool computeFine = matrixTransformOverhead < 0.6f;
+ computeFine = sub_group_any(computeFine);
+ if (computeFine)
+ {
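+ // Fine path: transform each child AABB individually and union the results; this is tighter
+ // than transforming the union of the children (the fallback path below).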
+ bool clip = clipOpt != XFM_BOX_NO_CLIP;
+ InternalNode node = *pnode;
+
+#if DEB_PRINTFS
+ if (InternalNode_IsChildValid(&node, 5) && !InternalNode_IsChildValid(&node, 4))
+ printf("child 5 valid && child 4 invalid\n");
+ if (InternalNode_IsChildValid(&node, 4) && !InternalNode_IsChildValid(&node, 3))
+ printf("child 4 valid && child 3 invalid\n");
+ if (InternalNode_IsChildValid(&node, 3) && !InternalNode_IsChildValid(&node, 2))
+ printf("child 3 valid && child 2 invalid\n");
+ if (InternalNode_IsChildValid(&node, 2) && !InternalNode_IsChildValid(&node, 1))
+ printf("child 2 valid && child 1 invalid\n");
+ if (InternalNode_IsChildValid(&node, 1) && !InternalNode_IsChildValid(&node, 0))
+ printf("child 1 valid && child 0 invalid\n");
+#endif
+
+#if DEB_PRINTFS
+ printf("F");
+#endif
+ AABB3f child_bounds0 = InternalNode_GetChildAABB(&node, 0);
+ AABB3f child_bounds1 = InternalNode_GetChildAABB(&node, 1);
+ AABB3f child_bounds2 = InternalNode_GetChildAABB(&node, 2);
+ AABB3f child_bounds3 = InternalNode_GetChildAABB(&node, 3);
+ AABB3f child_bounds4 = InternalNode_GetChildAABB(&node, 4);
+ AABB3f child_bounds5 = InternalNode_GetChildAABB(&node, 5);
+
+ // we bravely assume we will have at least 2 children here.
+ if(!InternalNode_IsChildValid(&node, 2)) child_bounds2 = child_bounds0;
+ if(!InternalNode_IsChildValid(&node, 3)) child_bounds3 = child_bounds0;
+ if(!InternalNode_IsChildValid(&node, 4)) child_bounds4 = child_bounds0;
+ if(!InternalNode_IsChildValid(&node, 5)) child_bounds5 = child_bounds0;
+
+ if (clip)
+ {
+ AABB3f_trim_upper(&child_bounds0, clipBox->upper);
+ AABB3f_trim_upper(&child_bounds1, clipBox->upper);
+ AABB3f_trim_upper(&child_bounds2, clipBox->upper);
+ AABB3f_trim_upper(&child_bounds3, clipBox->upper);
+ AABB3f_trim_upper(&child_bounds4, clipBox->upper);
+ AABB3f_trim_upper(&child_bounds5, clipBox->upper);
+ }
+
+ child_bounds0 = transform_aabb(child_bounds0, xfm);
+ child_bounds1 = transform_aabb(child_bounds1, xfm);
+ child_bounds2 = transform_aabb(child_bounds2, xfm);
+ child_bounds3 = transform_aabb(child_bounds3, xfm);
+ child_bounds4 = transform_aabb(child_bounds4, xfm);
+ child_bounds5 = transform_aabb(child_bounds5, xfm);
+
+ AABB3f_extend(&child_bounds0, &child_bounds1);
+ AABB3f_extend(&child_bounds2, &child_bounds3);
+ AABB3f_extend(&child_bounds4, &child_bounds5);
+ AABB3f_extend(&child_bounds0, &child_bounds2);
+ AABB3f_extend(&child_bounds0, &child_bounds4);
+
+ return child_bounds0;
+ }
+#endif
+
+#if DEB_PRINTFS
+ printf("0");
+#endif
+
+ struct AABB3f child_bounds;
+
+ if (clipOpt != XFM_BOX_NOT_REFINED_TAKE_CLIPBOX)
+ {
+ // XFM_BOX_NOT_REFINED_CLIPPED || XFM_BOX_NO_CLIP
+ child_bounds = InternalNode_getAABB3f(pnode);
+ if (clipOpt != XFM_BOX_NO_CLIP)
+ {
+ AABB3f_intersect(&child_bounds, *clipBox);
+ }
+ }
+ else
+ {
+ //XFM_BOX_NOT_REFINED_TAKE_CLIPBOX
+ child_bounds = *clipBox;
+ }
+
+ child_bounds = transform_aabb(child_bounds, xfm);
+ //child_bounds = conservativeAABB3f(&child_bounds);
+ return child_bounds;
+}
+
+GRL_INLINE AABB3f GRL_OVERLOADABLE compute_xfm_bbox(struct AffineSpace3f xfm, InternalNode* pnode, bool clip, AABB3f* clipBox, float matOverhead)
+{
+ float transform[12];
+ load_row_major_from_AffineSpace3f(xfm, transform);
+ return compute_xfm_bbox(transform, pnode, clip, clipBox, matOverhead);
+}
+
+GRL_INLINE uint64_t compute_refit_structs_compacted_size(BVHBase* base)
+{
+ uint dataSize = 0;
+
+ if (BVHBase_HasBackPointers(base))
+ {
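+ // Sub-allocation sizes below are rounded up to 64-byte multiples ((x + 63) & ~63); quad counts are first rounded up to multiples of 256.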
+ const uint fatleafEntrySize = (base->fatLeafCount * sizeof(LeafTableEntry) + 63) & ~63;
+ const uint innerEntrySize = (base->innerCount * sizeof(InnerNodeTableEntry) + 63) & ~63;
+
+ // New atomic update
+ if(base->quadIndicesDataStart > base->backPointerDataStart)
+ {
+ uint numQuads = BVHBase_GetNumQuads(base);
+
+ const uint quadTableMainBufferSize = (numQuads + 255) & ~255;
+ const uint quadLeftoversSize = (base->quadLeftoversCountNewAtomicUpdate + 255) & ~255;
+ const uint quadTableEntriesSize = (((quadTableMainBufferSize + quadLeftoversSize) * sizeof(LeafTableEntry) + 63) & ~63);
+
+ const uint quadIndicesDataSize = (numQuads * sizeof(QuadDataIndices) + 63) & ~63;
+
+ dataSize += quadTableEntriesSize + quadIndicesDataSize;
+ }
+
+ dataSize +=
+ ((BVHBase_GetNumInternalNodes(base) * sizeof(uint) + 63) & ~63)
+ + fatleafEntrySize + innerEntrySize;
+ }
+
+ return (uint64_t)dataSize;
+}
+
+GRL_INLINE uint64_t compute_compacted_size(BVHBase* base)
+{
+ uint64_t size = sizeof(BVHBase);
+ size += BVHBase_GetNumHWInstanceLeaves(base) * sizeof(HwInstanceLeaf);
+ size += BVHBase_GetNumProcedurals(base) * sizeof(ProceduralLeaf);
+ size += BVHBase_GetNumQuads(base) * sizeof(QuadLeaf);
+ size += compute_refit_structs_compacted_size(base);
+ size += BVHBase_GetNumInternalNodes(base) * sizeof(InternalNode);
+ size += sizeof(InstanceDesc) * base->Meta.instanceCount;
+ size += (sizeof(GeoMetaData) * base->Meta.geoCount + 63) & ~63; // align to 64
+ size = (size + 63) & ~63;
+
+ return size;
+}
diff --git a/src/intel/vulkan/grl/gpu/quad.h b/src/intel/vulkan/grl/gpu/quad.h
new file mode 100644
index 00000000000..cc1b7d470f8
--- /dev/null
+++ b/src/intel/vulkan/grl/gpu/quad.h
@@ -0,0 +1,127 @@
+//
+// Copyright (C) 2009-2021 Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+//
+//
+
+#pragma once
+
+#include "shared.h"
+#include "intrinsics.h"
+#include "AABB.h"
+#include "AABB3f.h"
+
+// JDB TODO: Use corresponding GRL structures!!!
+
+struct Quad
+{
+ unsigned int shaderIndex; // note: also mask
+ unsigned int geomIndex; // note: also geom flags in upper 2 bits
+ unsigned int primIndex0;
+ unsigned int primIndex1Delta;
+ float v[4][3];
+};
+
+GRL_INLINE unsigned int Quad_getGeomIndex(global struct Quad *quad)
+{
+ return quad->geomIndex;
+}
+
+GRL_INLINE unsigned int Quad_getPrimIndex0(global struct Quad *quad)
+{
+ return quad->primIndex0;
+}
+
+GRL_INLINE unsigned int Quad_getPrimIndex1(global struct Quad *quad)
+{
+ return quad->primIndex0 + (quad->primIndex1Delta & 0xFFFF);
+}
+
+GRL_INLINE float3 load_float3(float *p)
+{
+ return (float3)(p[0], p[1], p[2]);
+}
+
+GRL_INLINE float3 load_perm_float3(float *p, const uint3 perm)
+{
+ return (float3)(p[perm.x], p[perm.y], p[perm.z]);
+}
+
+GRL_INLINE float2 load_perm_float2(float *p, const uint2 perm)
+{
+ return (float2)(p[perm.x], p[perm.y]);
+}
+
+GRL_INLINE float load_perm_float(float *p, const uint perm)
+{
+ return p[perm];
+}
+
+GRL_INLINE struct AABB getAABB_Quad(struct Quad *q)
+{
+ struct AABB aabb;
+ const float3 lower = min(min(load_float3(q->v[0]), load_float3(q->v[1])), min(load_float3(q->v[2]), load_float3(q->v[3])));
+ const float3 upper = max(max(load_float3(q->v[0]), load_float3(q->v[1])), max(load_float3(q->v[2]), load_float3(q->v[3])));
+ aabb.lower = (float4)(lower, 0.0f);
+ aabb.upper = (float4)(upper, 0.0f);
+ return aabb;
+}
+
+GRL_INLINE void Quad_ExtendAABB(struct Quad* q, struct AABB* box)
+{
+ struct AABB aabb;
+ const float3 lower = min(min(load_float3(q->v[0]), load_float3(q->v[1])), min(load_float3(q->v[2]), load_float3(q->v[3])));
+ const float3 upper = max(max(load_float3(q->v[0]), load_float3(q->v[1])), max(load_float3(q->v[2]), load_float3(q->v[3])));
+ aabb.lower = (float4)(lower, 0.0f);
+ aabb.upper = (float4)(upper, 0.0f);
+ AABB_extend(box, &aabb);
+}
+
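+// Returns lower + upper of the quad's AABB, i.e. twice the centroid (hence the '2' in the name).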
+GRL_INLINE float4 getCentroid2_Quad(struct Quad *q)
+{
+ struct AABB aabb = getAABB_Quad(q);
+ return aabb.lower + aabb.upper;
+}
+
+GRL_INLINE void setQuad(struct Quad *quad, const float4 v0, const float4 v1, const float4 v2, const float4 v3,
+ const uchar j0, const uchar j1, const uchar j2,
+ const uint geomID, const uint primID0, const uint primID1, const uint geomMask, const uint geomFlags )
+{
+ quad->v[0][0] = v0.x;
+ quad->v[0][1] = v0.y;
+ quad->v[0][2] = v0.z;
+ quad->v[1][0] = v1.x;
+ quad->v[1][1] = v1.y;
+ quad->v[1][2] = v1.z;
+ quad->v[2][0] = v2.x;
+ quad->v[2][1] = v2.y;
+ quad->v[2][2] = v2.z;
+ quad->v[3][0] = v3.x;
+ quad->v[3][1] = v3.y;
+ quad->v[3][2] = v3.z;
+
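+ // Bit packing: shaderIndex = geomMask(31:24) | geomID, geomIndex = geomFlags(31:30) | geomID,
+ // primIndex1Delta = singlePrimInLeaf(22) | j2:j1:j0(21:16) | (primID1 - primID0)(15:0).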
+ quad->shaderIndex = (geomMask << 24) | geomID;
+ quad->geomIndex = geomID | (geomFlags << 30);
+ quad->primIndex0 = primID0;
+ const uint delta = primID1 - primID0;
+ const uint j = (((j0) << 0) | ((j1) << 2) | ((j2) << 4));
+ quad->primIndex1Delta = delta | (j << 16) | (1 << 22); // single prim in leaf
+
+}
+
+GRL_INLINE void setQuadVertices(struct Quad *quad, const float3 v0, const float3 v1, const float3 v2, const float3 v3)
+{
+ quad->v[0][0] = v0.x;
+ quad->v[0][1] = v0.y;
+ quad->v[0][2] = v0.z;
+ quad->v[1][0] = v1.x;
+ quad->v[1][1] = v1.y;
+ quad->v[1][2] = v1.z;
+ quad->v[2][0] = v2.x;
+ quad->v[2][1] = v2.y;
+ quad->v[2][2] = v2.z;
+ quad->v[3][0] = v3.x;
+ quad->v[3][1] = v3.y;
+ quad->v[3][2] = v3.z;
+}
diff --git a/src/intel/vulkan/grl/gpu/radix_sort.grl b/src/intel/vulkan/grl/gpu/radix_sort.grl
new file mode 100644
index 00000000000..df932057a10
--- /dev/null
+++ b/src/intel/vulkan/grl/gpu/radix_sort.grl
@@ -0,0 +1,163 @@
+//
+// Copyright (C) 2009-2021 Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+//
+//
+
+module radix_sort;
+
+kernel_module radix_kernels ("morton_radix_sort.cl")
+{
+ links lsc_intrinsics;
+ kernel opencl_build_morton_kernel_sort_bin_items < kernelFunction="sort_morton_codes_bin_items">;
+ kernel opencl_build_morton_kernel_sort_reduce_bins < kernelFunction="sort_morton_codes_reduce_bins">;
+ kernel opencl_build_morton_kernel_sort_scatter_items < kernelFunction="sort_morton_codes_scatter_items">;
+
+ kernel opencl_build_morton_codes_sort_merged < kernelFunction="sort_morton_codes_merged">;
+
+ kernel opencl_build_morton_kernel_sort_reduce_bins_wide_partial_sum < kernelFunction="sort_morton_codes_reduce_bins_wide_partial_sum">;
+ kernel opencl_build_morton_kernel_sort_reduce_bins_wide_add_reduce < kernelFunction="sort_morton_codes_reduce_bins_wide_add_reduce">;
+}
+
+metakernel sort(
+ qword build_globals,
+ dword shift,
+ qword global_histogram,
+ qword input0,
+ qword input1,
+ dword input0_offset,
+ dword input1_offset,
+ dword iteration,
+ dword threads)
+{
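+ // One radix-sort pass over the morton codes: bin_items histograms the current digit,
+ // reduce_bins combines the per-thread bins, and scatter_items writes the codes to their
+ // sorted positions, with a full flush (wait_idle) between stages.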
+ dispatch opencl_build_morton_kernel_sort_bin_items (threads, 1, 1) args(
+ build_globals,
+ shift,
+ global_histogram,
+ input0,
+ input1,
+ input0_offset,
+ input1_offset,
+ iteration);
+
+ control(wait_idle);
+
+ dispatch opencl_build_morton_kernel_sort_reduce_bins (1, 1, 1) args(
+ threads,
+ global_histogram);
+
+ control(wait_idle);
+
+ dispatch opencl_build_morton_kernel_sort_scatter_items (threads, 1, 1) args(
+ build_globals,
+ shift,
+ global_histogram,
+ input0,
+ input1,
+ input0_offset,
+ input1_offset,
+ iteration);
+
+ control(wait_idle);
+
+}
+
+metakernel sort_bin_items(
+ qword build_globals,
+ qword global_histogram,
+ qword wg_flags,
+ qword input0,
+ dword iteration,
+ dword threads,
+ dword update_wg_flags
+ )
+{
+ dispatch opencl_build_morton_kernel_sort_bin_items (threads, 1, 1) args(
+ build_globals,
+ global_histogram,
+ wg_flags,
+ input0,
+ iteration,
+ threads,
+ update_wg_flags
+ );
+}
+
+metakernel sort_reduce_bins(
+ qword build_globals,
+ qword global_histogram,
+ dword threads,
+ dword iteration)
+{
+ dispatch opencl_build_morton_kernel_sort_reduce_bins (1, 1, 1) args(
+ build_globals,
+ threads,
+ global_histogram,
+ iteration);
+}
+
+metakernel sort_scatter_items(
+ qword build_globals,
+ qword global_histogram,
+ qword input0,
+ qword input1,
+ dword iteration,
+ dword threads,
+ dword update_morton_sort_in_flight )
+{
+ dispatch opencl_build_morton_kernel_sort_scatter_items( threads, 1, 1 ) args(
+ build_globals,
+ global_histogram,
+ input0,
+ input1,
+ iteration,
+ threads,
+ update_morton_sort_in_flight
+ );
+}
+
+metakernel sort_bin_items_merged(
+ qword build_globals,
+ qword global_histogram,
+ qword input0,
+ dword iteration,
+ dword threads)
+{
+ dispatch opencl_build_morton_codes_sort_merged (threads, 1, 1) args(
+ build_globals,
+ global_histogram,
+ input0,
+ iteration,
+ threads
+ );
+}
+
+metakernel sort_reduce_bins_wide(
+ qword build_globals,
+ qword global_histogram,
+ qword global_histogram_tmp,
+ qword wg_flags,
+ dword threads,
+ dword threads_groups,
+ dword iteration)
+{
+ dispatch opencl_build_morton_kernel_sort_reduce_bins_wide_partial_sum(threads_groups, 1, 1) args(
+ build_globals,
+ threads,
+ threads_groups,
+ global_histogram,
+ global_histogram_tmp,
+ wg_flags,
+ iteration);
+
+ control(wait_idle);
+
+ dispatch opencl_build_morton_kernel_sort_reduce_bins_wide_add_reduce(threads_groups, 1, 1) args(
+ build_globals,
+ threads,
+ threads_groups,
+ global_histogram,
+ global_histogram_tmp,
+ iteration);
+}
diff --git a/src/intel/vulkan/grl/gpu/rebraid.grl b/src/intel/vulkan/grl/gpu/rebraid.grl
new file mode 100644
index 00000000000..5aa809637a3
--- /dev/null
+++ b/src/intel/vulkan/grl/gpu/rebraid.grl
@@ -0,0 +1,167 @@
+//
+// Copyright (C) 2009-2021 Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+//
+//
+
+module rebraid;
+
+kernel init_scratch < source="bvh_rebraid.cl", kernelFunction="rebraid_init_scratch" >
+kernel chase_instance_ptrs < source="bvh_rebraid.cl", kernelFunction="rebraid_chase_instance_pointers" >
+kernel calc_aabb < source="bvh_rebraid.cl", kernelFunction="rebraid_computeAABB_DXR_instances" >
+kernel calc_aabb_indirect < source="bvh_rebraid.cl", kernelFunction="rebraid_computeAABB_DXR_instances_indirect" >
+kernel calc_aabb_ptr < source="bvh_rebraid.cl", kernelFunction="rebraid_computeAABB_DXR_instances_pointers" >
+kernel calc_aabb_ptr_indirect < source="bvh_rebraid.cl", kernelFunction="rebraid_computeAABB_DXR_instances_pointers_indirect" >
+kernel count_splits < source="bvh_rebraid.cl", kernelFunction="rebraid_count_splits" >
+kernel count_splits_SG < source="bvh_rebraid.cl", kernelFunction="rebraid_count_splits_SG" >
+kernel count_splits_SG_indirect < source="bvh_rebraid.cl", kernelFunction="rebraid_count_splits_SG_indirect" >
+kernel build_primrefs < source="bvh_rebraid.cl", kernelFunction="rebraid_build_primrefs" >
+kernel build_primrefs_indirect < source="bvh_rebraid.cl", kernelFunction="rebraid_build_primrefs_indirect" >
+
+//kernel ISA_TEST < source="bvh_rebraid.cl", kernelFunction="ISA_TEST" >
+//kernel DEBUG_PRINT < source="bvh_rebraid.cl", kernelFunction="DEBUG_PRINT" >
+
+
+const PRIMREF_GROUP_SIZE = 256;
+
+const COUNT_SPLITS_GROUP_SIZE = 16;
+
+struct MKRebraidArgs
+{
+ qword bvh_buffer;
+ qword primref_buffer;
+ qword global_buffer;
+ qword instances_buffer;
+ qword rebraid_scratch;
+ qword flat_instances_buffer;
+ dword num_instances;
+ dword num_extra_primrefs;
+};
+
+metakernel rebraid(
+ MKRebraidArgs Args
+ )
+{
+ dispatch init_scratch(1,1,1) args( Args.rebraid_scratch );
+ dispatch calc_aabb(Args.num_instances,1,1) args( Args.bvh_buffer, Args.instances_buffer );
+ control( wait_idle );
+
+ //define num_count_groups ((Args.num_instances + (COUNT_SPLITS_GROUP_SIZE-1)) / COUNT_SPLITS_GROUP_SIZE);
+ //dispatch count_splits(num_count_groups,1,1) args( Args.bvh_buffer, Args.instances_buffer, Args.rebraid_scratch, Args.num_instances );
+
+ dispatch count_splits_SG(Args.num_instances,1,1) args( Args.bvh_buffer, Args.instances_buffer, Args.rebraid_scratch );
+ control( wait_idle );
+
+ define num_primref_groups ((Args.num_instances + (PRIMREF_GROUP_SIZE-1)) / PRIMREF_GROUP_SIZE);
+
+ dispatch build_primrefs(num_primref_groups,1,1) args( Args.global_buffer, Args.bvh_buffer, Args.instances_buffer, Args.rebraid_scratch, Args.primref_buffer, Args.num_extra_primrefs, Args.num_instances );
+ control( wait_idle );
+
+ //dispatch DEBUG_PRINT(1,1,1) args( Args.global_buffer, Args.instances_buffer, Args.rebraid_scratch, Args.primref_buffer, Args.num_extra_primrefs, Args.num_instances );
+}
+
+metakernel rebraid_indirect(MKRebraidArgs Args, qword indirectBuildRangeInfo)
+{
+
+ dispatch init_scratch(1, 1, 1) args(Args.rebraid_scratch);
+
+ define num_groups REG0;
+ num_groups = load_dword(indirectBuildRangeInfo);
+ DISPATCHDIM_X = num_groups.lo;
+ DISPATCHDIM_Y = 1;
+ DISPATCHDIM_Z = 1;
+
+ dispatch_indirect calc_aabb_indirect args(Args.bvh_buffer, Args.instances_buffer, indirectBuildRangeInfo);
+ control(wait_idle);
+
+ dispatch_indirect count_splits_SG_indirect
+ args(Args.bvh_buffer, Args.instances_buffer, Args.rebraid_scratch, indirectBuildRangeInfo);
+
+ define groupsize_1 REG1; // groupsize - 1
+ define C_8 REG2;
+
+ groupsize_1 = 255; // PRIMREF_GROUP_SIZE - 1
+ C_8 = 8; // log_2(PRIMREF_GROUP_SIZE)
+
+ num_groups = num_groups + groupsize_1;
+ num_groups = num_groups >> C_8; // num_groups / PRIMREF_GROUP_SIZE;
+ DISPATCHDIM_X = num_groups.lo;
+
+ control(wait_idle);
+
+ dispatch_indirect build_primrefs_indirect args(
+ Args.global_buffer,
+ Args.bvh_buffer,
+ Args.instances_buffer,
+ Args.rebraid_scratch,
+ Args.primref_buffer,
+ indirectBuildRangeInfo,
+ Args.num_extra_primrefs);
+ control(wait_idle);
+}
+
+metakernel rebraid_ptrs(
+ MKRebraidArgs Args
+ )
+{
+ dispatch init_scratch(1,1,1) args( Args.rebraid_scratch );
+ dispatch chase_instance_ptrs( Args.num_instances, 1, 1) args( Args.instances_buffer, Args.flat_instances_buffer );
+ dispatch calc_aabb_ptr(Args.num_instances,1,1) args( Args.bvh_buffer, Args.instances_buffer );
+ control( wait_idle );
+
+ //define num_count_groups ((Args.num_instances + (COUNT_SPLITS_GROUP_SIZE-1)) / COUNT_SPLITS_GROUP_SIZE);
+ //dispatch count_splits(num_count_groups,1,1) args( Args.bvh_buffer, Args.instances_buffer, Args.rebraid_scratch );
+
+ dispatch count_splits_SG(Args.num_instances,1,1) args( Args.bvh_buffer, Args.flat_instances_buffer, Args.rebraid_scratch );
+ control( wait_idle );
+
+ define num_primref_groups ((Args.num_instances + (PRIMREF_GROUP_SIZE-1)) / PRIMREF_GROUP_SIZE);
+
+
+ dispatch build_primrefs(num_primref_groups,1,1) args( Args.global_buffer, Args.bvh_buffer, Args.flat_instances_buffer, Args.rebraid_scratch, Args.primref_buffer, Args.num_extra_primrefs, Args.num_instances );
+ control( wait_idle );
+
+}
+
+metakernel rebraid_ptrs_indirect(MKRebraidArgs Args, qword indirectBuildRangeInfo)
+{
+ dispatch init_scratch(1, 1, 1) args(Args.rebraid_scratch);
+
+ define num_groups REG0;
+ num_groups = load_dword(indirectBuildRangeInfo);
+ DISPATCHDIM_X = num_groups.lo;
+ DISPATCHDIM_Y = 1;
+ DISPATCHDIM_Z = 1;
+
+ dispatch_indirect chase_instance_ptrs
+ args(Args.instances_buffer, Args.flat_instances_buffer, indirectBuildRangeInfo);
+ dispatch_indirect calc_aabb_ptr_indirect args(Args.bvh_buffer, Args.instances_buffer, indirectBuildRangeInfo);
+ control(wait_idle);
+
+ dispatch_indirect count_splits_SG_indirect
+ args(Args.bvh_buffer, Args.flat_instances_buffer, Args.rebraid_scratch, indirectBuildRangeInfo);
+
+ define groupsize_1 REG1; // groupsize - 1
+ define C_8 REG2;
+
+ groupsize_1 = 255; // PRIMREF_GROUP_SIZE - 1
+ C_8 = 8; // log_2(PRIMREF_GROUP_SIZE)
+
+ num_groups = num_groups + groupsize_1;
+ num_groups = num_groups >> C_8; // num_groups / PRIMREF_GROUP_SIZE;
+ DISPATCHDIM_X = num_groups.lo;
+
+ control(wait_idle);
+
+ dispatch_indirect build_primrefs_indirect args(
+ Args.global_buffer,
+ Args.bvh_buffer,
+ Args.flat_instances_buffer,
+ Args.rebraid_scratch,
+ Args.primref_buffer,
+ Args.num_extra_primrefs,
+ indirectBuildRangeInfo,
+ Args.num_instances);
+ control(wait_idle);
+}
diff --git a/src/intel/vulkan/grl/gpu/shared.h b/src/intel/vulkan/grl/gpu/shared.h
new file mode 100644
index 00000000000..0d42d98a1d4
--- /dev/null
+++ b/src/intel/vulkan/grl/gpu/shared.h
@@ -0,0 +1,182 @@
+//
+// Copyright (C) 2009-2021 Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+//
+//
+
+#include "GRLGen12.h"
+#pragma once
+
+#define sizeof_Quad 64
+#define sizeof_Procedural 64
+#define sizeof_PrimRef 32
+#define sizeof_PresplitItem 8
+#define sizeof_HwInstanceLeaf 128
+#define MORTON_BUILDER_SUBTREE_THRESHOLD 256
+#define MORTON_BUILDER_P2_ELEMENTS_IN_SLM 16 * 1024 / 32
+// Temporarily disable localized phase2 due to issues in ELG pre-silicon.
+// This implementation will be replaced with a bottom-up + bounding-box approach that removes the need for the phase2 refit.
+#define MORTON_BUILDER_P2_SINGLE_WG_THRESHOLD /*100000*/ 0
+
+#define BVH_QUAD_NODE 4
+#define BVH_INSTANCE_NODE 1
+#define BVH_INTERNAL_NODE 0
+#define BVH_PROCEDURAL_NODE 3
+#define BUILDRECORD_STACK_SIZE 48
+#define BINS 16
+
+GRL_NAMESPACE_BEGIN(GRL)
+GRL_NAMESPACE_BEGIN(RTAS)
+GRL_NAMESPACE_BEGIN(GPUBVHBuilder)
+
+struct AABB
+{
+ float4 lower;
+ float4 upper;
+};
+
+typedef struct BlockAllocator
+{
+ unsigned int start;
+ unsigned int cur;
+} BlockAllocator;
+
+struct Globals
+{
+ struct AABB centroidBounds;
+
+ unsigned int build_record_start;
+ unsigned int numPrimitives;
+ unsigned int leafPrimType;
+ unsigned int leafSize;
+
+ unsigned int numSplittedPrimitives;
+ unsigned int numBuildRecords;
+
+ // spatial split state
+ unsigned int numOriginalPrimitives;
+ float presplitPrioritySum;
+ float probThreshold;
+
+ // binned-sah bfs state
+ unsigned int counter;
+ unsigned int numBuildRecords_extended;
+
+ // sync variable used for global-sync on work groups
+ unsigned int sync;
+
+
+ /* morton code builder state */
+ unsigned int shift; // used by adaptive mc-builder
+ unsigned int shift_mask; // used by adaptive mc-builder
+ unsigned int binary_hierarchy_root;
+ unsigned int p0_allocated_num;
+ unsigned int p0_created_num;
+ unsigned int morton_sort_in_flight;
+ unsigned int sort_iterations;
+
+ gpuva_t binary_hierarchy_buffer; // pointer to the binary morton code hierarchy. Stashed here as a debug aid
+};
+
+struct Range
+{
+ unsigned int start, end;
+};
+
+struct Triangle
+{
+ unsigned int vtx[3];
+ //unsigned int primID;
+ //unsigned int geomID;
+};
+
+struct MortonCodePrimitive
+{
+ uint64_t index_code; // 64bit code + index combo
+};
+
+struct BuildRecord
+{
+ struct AABB centroidBounds;
+ unsigned int start, end;
+ __global void *current;
+};
+
+struct BinaryMortonCodeHierarchy
+{
+ struct Range range;
+ unsigned int leftChild;
+ unsigned int rightChild;
+ // unsigned int flag;
+};
+
+typedef struct MortonFlattenedBoxlessNode {
+ uint binary_hierarchy_index; // only needed when type != BVH_INTERNAL_NODE
+ uint childOffset_type; // childOffset : 26, type : 6
+ uint backPointer; // same usage as in bvh
+} MortonFlattenedBoxlessNode;
+
+struct StatStackEntry
+{
+ struct AABB aabb;
+ unsigned int node;
+ unsigned int type;
+ unsigned int depth;
+ float area;
+};
+
+struct BuildRecordMorton
+{
+ unsigned int nodeID;
+ unsigned int items;
+ unsigned int current_index;
+ unsigned int parent_index;
+};
+
+struct Split
+{
+ float sah;
+ int dim;
+ int pos;
+};
+
+struct BinMapping
+{
+ float4 ofs, scale;
+};
+
+struct BinInfo
+{
+ struct AABB3f boundsX[BINS];
+ struct AABB3f boundsY[BINS];
+ struct AABB3f boundsZ[BINS];
+ uint3 counts[BINS];
+};
+
+struct BinInfo2
+{
+ struct AABB3f boundsX[BINS * 2];
+ struct AABB3f boundsY[BINS * 2];
+ struct AABB3f boundsZ[BINS * 2];
+ uint3 counts[BINS * 2];
+};
+
+struct GlobalBuildRecord
+{
+ struct BinInfo2 binInfo;
+ struct BinMapping binMapping;
+ struct Split split;
+ struct Range range;
+ struct AABB leftCentroid;
+ struct AABB rightCentroid;
+ struct AABB leftGeometry;
+ struct AABB rightGeometry;
+ unsigned int atomicCountLeft;
+ unsigned int atomicCountRight;
+ unsigned int buildRecordID;
+};
+
+GRL_NAMESPACE_END(GPUBVHBuilder)
+GRL_NAMESPACE_END(RTAS)
+GRL_NAMESPACE_END(GRL)
diff --git a/src/intel/vulkan/grl/gpu/structs.grl b/src/intel/vulkan/grl/gpu/structs.grl
new file mode 100644
index 00000000000..f15b1d2346b
--- /dev/null
+++ b/src/intel/vulkan/grl/gpu/structs.grl
@@ -0,0 +1,38 @@
+//
+// Copyright (C) 2009-2021 Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+//
+//
+
+module structs;
+
+struct MKBuilderState {
+ qword geomDesc_buffer;
+ qword build_primref_buffer;
+ qword build_globals;
+ qword bvh_buffer;
+ dword leaf_type;
+ dword leaf_size;
+};
+
+struct MKSizeEstimate {
+ dword numTriangles;
+ dword numProcedurals;
+ dword numPrimitives;
+ dword numMeshes;
+ dword numBuildPrimitives;
+ dword numPrimitivesToSplit;
+ dword instance_descs_start;
+ dword geo_meta_data_start;
+ dword node_data_start;
+ dword leaf_data_start;
+ dword procedural_data_start;
+ dword back_pointer_start;
+ dword sizeTotal;
+ dword updateScratchSizeTotal;
+ dword fatleaf_table_start;
+ dword innernode_table_start;
+ dword max_fatleaves;
+ dword quad_indices_data_start;
+};
diff --git a/src/intel/vulkan/grl/gpu/traversal_shader.cl b/src/intel/vulkan/grl/gpu/traversal_shader.cl
new file mode 100644
index 00000000000..ee5d2afcc75
--- /dev/null
+++ b/src/intel/vulkan/grl/gpu/traversal_shader.cl
@@ -0,0 +1,277 @@
+//
+// Copyright (C) 2009-2021 Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+//
+//
+
+#include "instance.h"
+#include "api_interface.h"
+
+#include "bvh_build_primref.h"
+#include "bvh_build_refit.h"
+
+/*
+ Create primrefs from array of instance descriptors.
+ */
+ GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
+__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH))) void kernel
+TS_primrefs_from_instances(
+ global struct Globals* globals,
+ global struct BVHBase* bvh,
+ global __const struct GRL_RAYTRACING_INSTANCE_DESC* instances,
+ uint numInstances,
+ global struct AABB* primrefs,
+ global uchar* pAABBs,
+ global uchar* pIsProcedural,
+ dword aabb_stride,
+ uint allowUpdate
+ )
+{
+ const uint instanceIndex = get_sub_group_local_id() + get_group_id(0) * MAX_HW_SIMD_WIDTH;
+ if (instanceIndex < numInstances)
+ {
+ global __const struct GRL_RAYTRACING_INSTANCE_DESC* instance = instances + instanceIndex;
+
+ global struct GRL_RAYTRACING_AABB* procedural_bb = 0;
+ if ( pIsProcedural[instanceIndex] )
+ {
+ procedural_bb = (global struct GRL_RAYTRACING_AABB*)(pAABBs + aabb_stride * instanceIndex);
+ }
+
+ primrefs_from_instances(
+ globals,
+ bvh,
+ instance,
+ instanceIndex,
+ primrefs,
+ procedural_bb,
+ allowUpdate);
+ }
+}
+
+/*
+ Create primrefs from array of instance descriptors.
+ */
+ GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
+void kernel
+TS_primrefs_from_instances_indirect(
+ global struct Globals* globals,
+ global struct BVHBase* bvh,
+ global __const struct GRL_RAYTRACING_INSTANCE_DESC* instances,
+ uint numInstances,
+ global struct AABB* primrefs,
+ global uchar* pAABBs,
+ global uchar* pIsProcedural,
+ dword aabb_stride,
+ uint allowUpdate,
+ global struct IndirectBuildRangeInfo* indirect_data
+ )
+{
+ const uint instanceIndex = get_local_id(0) + get_group_id(0) * MAX_HW_SIMD_WIDTH;
+ if (instanceIndex < indirect_data->primitiveCount)
+ {
+ instances = (global __const struct GRL_RAYTRACING_INSTANCE_DESC*)
+ (((global char*)instances) + indirect_data->primitiveOffset);
+ global __const struct GRL_RAYTRACING_INSTANCE_DESC* instance = instances + instanceIndex;
+
+ global struct GRL_RAYTRACING_AABB* procedural_bb = 0;
+ if ( pIsProcedural[instanceIndex] )
+ {
+ procedural_bb = (global struct GRL_RAYTRACING_AABB*)(pAABBs + aabb_stride * instanceIndex);
+ }
+
+ primrefs_from_instances(
+ globals,
+ bvh,
+ instance,
+ instanceIndex,
+ primrefs,
+ procedural_bb,
+ allowUpdate);
+ }
+}
+
+/*
+ Create primrefs from array of pointers to instance descriptors.
+ */
+ GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
+__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH))) void kernel
+TS_primrefs_from_instances_pointers(global struct Globals* globals,
+ global struct BVHBase* bvh,
+ global void* instances_in,
+ uint numInstances,
+ global struct AABB* primrefs,
+ global uchar* pAABBs,
+ global uchar* pIsProcedural,
+ dword aabb_stride,
+ uint allowUpdate
+ )
+{
+ global const struct GRL_RAYTRACING_INSTANCE_DESC** instances =
+ (global const struct GRL_RAYTRACING_INSTANCE_DESC**)instances_in;
+
+ const uint instanceIndex = get_sub_group_local_id() + get_group_id(0) * MAX_HW_SIMD_WIDTH;
+ if (instanceIndex < numInstances)
+ {
+ global __const struct GRL_RAYTRACING_INSTANCE_DESC* instance = instances[instanceIndex];
+
+ global struct GRL_RAYTRACING_AABB* procedural_bb = 0;
+ if (pIsProcedural[instanceIndex])
+ {
+ procedural_bb = (global struct GRL_RAYTRACING_AABB*)(pAABBs + aabb_stride * instanceIndex);
+ }
+
+ primrefs_from_instances(
+ globals,
+ bvh,
+ instance,
+ instanceIndex,
+ primrefs,
+ procedural_bb,
+ allowUpdate);
+ }
+}
+
+/*
+ Create primrefs from array of pointers to instance descriptors.
+ */
+ GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
+void kernel
+TS_primrefs_from_instances_pointers_indirect(global struct Globals* globals,
+ global struct BVHBase* bvh,
+ global void* instances_in,
+ global struct AABB* primrefs,
+ global uchar* pAABBs,
+ global uchar* pIsProcedural,
+ dword aabb_stride,
+ uint allowUpdate,
+ global struct IndirectBuildRangeInfo* indirect_data
+ )
+{
+ const uint instanceIndex = get_local_id(0) + get_group_id(0) * MAX_HW_SIMD_WIDTH;
+ if (instanceIndex < indirect_data->primitiveCount)
+ {
+ instances_in = ((global char*)instances_in) + indirect_data->primitiveOffset;
+ global const struct GRL_RAYTRACING_INSTANCE_DESC** instances =
+ (global const struct GRL_RAYTRACING_INSTANCE_DESC**)instances_in;
+ global __const struct GRL_RAYTRACING_INSTANCE_DESC* instance = instances[instanceIndex];
+
+ global struct GRL_RAYTRACING_AABB* procedural_bb = 0;
+ if (pIsProcedural[instanceIndex])
+ {
+ procedural_bb = (global struct GRL_RAYTRACING_AABB*)(pAABBs + aabb_stride * instanceIndex);
+ }
+
+ primrefs_from_instances(
+ globals,
+ bvh,
+ instance,
+ instanceIndex,
+ primrefs,
+ procedural_bb,
+ allowUpdate);
+ }
+}
+
+
+
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(16, 1, 1)))
+void kernel
+TS_update_instance_leaves(global struct BVHBase* bvh,
+ uint64_t dxrInstancesArray,
+ uint64_t dxrInstancesPtr,
+ global struct AABB3f* instance_aabb_scratch,
+ global uchar* aabbs,
+ global uchar* is_procedural,
+ dword aabb_stride
+)
+{
+ uint num_leaves = BVHBase_GetNumHWInstanceLeaves(bvh);
+ uint id = get_local_id(0) + get_local_size(0) * get_group_id(0);
+ if (id >= num_leaves)
+ return;
+
+ struct HwInstanceLeaf* leaves = BVHBase_GetHWInstanceLeaves(bvh);
+ uint idx = HwInstanceLeaf_GetInstanceIndex(&leaves[id]);
+
+ global GRL_RAYTRACING_AABB* procedural_box = 0;
+ if (is_procedural[idx])
+ {
+ procedural_box = (global GRL_RAYTRACING_AABB*)(aabbs + (aabb_stride * idx));
+ }
+
+ DO_update_instance_leaves(
+ bvh,
+ dxrInstancesArray,
+ dxrInstancesPtr,
+ instance_aabb_scratch,
+ id,
+ procedural_box);
+}
+
+
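+// Post-build fixup for traversal-shader instance nodes: each valid child of an instance node is
+// re-typed as either a HW instance or a procedural leaf based on its primref, and the node itself
+// is re-typed as a regular internal node.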
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(16, 1, 1)))
+void kernel
+TS_fixup_leaves( global struct BVHBase* bvh,
+ global uchar* primref_index,
+ global PrimRef* primrefs,
+ uint stride )
+
+{
+ uint num_inners = BVHBase_GetNumInternalNodes(bvh);
+ uint id = get_local_id(0) + get_local_size(0) * get_group_id(0);
+
+ // assign 8 lanes to each inner node, 6 of which will do useful work
+ uint node_id = id / 8;
+ uint child_id = id % 8;
+
+ bool node_valid = (node_id < num_inners);
+
+ if (node_valid )
+ {
+ global InternalNode* nodes = (global InternalNode*) BVHBase_GetInternalNodes(bvh);
+ global InternalNode* my_node = nodes + node_id;
+
+ if (my_node->nodeType == BVH_INSTANCE_NODE)
+ {
+ bool child_valid = (child_id < 6) && InternalNode_IsChildValid(my_node, child_id);
+ if (child_valid)
+ {
+ global HwInstanceLeaf* leaves = (global HwInstanceLeaf*)InternalNode_GetChildren(my_node);
+ uint leafIndex = (leaves - BVHBase_GetHWInstanceLeaves(bvh)) + child_id;
+
+ const uint primrefID = *(uint*)(primref_index + leafIndex * stride);
+
+ uint type = PRIMREF_isProceduralInstance(&primrefs[primrefID]) ?
+ BVH_PROCEDURAL_NODE : BVH_INSTANCE_NODE;
+
+ InternalNode_SetChildType(my_node, child_id, type);
+ }
+
+ if (child_id == 0)
+ my_node->nodeType = BVH_INTERNAL_NODE;
+ }
+ }
+}
+
+
+
+
+
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(SG_REFIT_WG_SIZE, 1, 1))) void kernel
+TS_Refit_per_one_startpoint_sg(
+ global struct BVHBase* bvh,
+ global struct AABB3f* instance_leaf_aabbs,
+ global uchar* procedural_instance_enable_buffer )
+{
+ DO_Refit_per_one_startpoint_sg(bvh, (global GRL_RAYTRACING_GEOMETRY_DESC*) bvh, instance_leaf_aabbs, procedural_instance_enable_buffer );
+
+}
diff --git a/src/intel/vulkan/grl/gpu/traversal_shader.grl b/src/intel/vulkan/grl/gpu/traversal_shader.grl
new file mode 100644
index 00000000000..3820996c348
--- /dev/null
+++ b/src/intel/vulkan/grl/gpu/traversal_shader.grl
@@ -0,0 +1,244 @@
+//
+// Copyright (C) 2009-2021 Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+//
+//
+
+module traversal_shader;
+
+kernel_module morton_kernels ("traversal_shader.cl")
+{
+ links lsc_intrinsics;
+
+ kernel TS_primrefs_from_instances < kernelFunction = "TS_primrefs_from_instances" >;
+ kernel TS_primrefs_from_instances_indirect < kernelFunction = "TS_primrefs_from_instances_indirect" >;
+ kernel TS_primrefs_from_instances_ptrs < kernelFunction = "TS_primrefs_from_instances_pointers" >;
+ kernel TS_primrefs_from_instances_ptrs_indirect < kernelFunction = "TS_primrefs_from_instances_pointers_indirect" >;
+ kernel TS_update_instance_leaves < kernelFunction = "TS_update_instance_leaves" >;
+ kernel TS_Refit_per_one_startpoint_sg < kernelFunction = "TS_Refit_per_one_startpoint_sg" >;
+ kernel TS_fixup_leaves < kernelFunction = "TS_fixup_leaves" >;
+}
+
+struct MKTSBuildArgs
+{
+ qword build_globals;
+ qword bvh_buffer;
+ qword instance_descs;
+ qword build_primref_buffer;
+ qword aabb_buffer;
+ qword is_procedural_buffer;
+ qword leaf_creation_index_buffer;
+ dword aabb_stride;
+ dword num_instances;
+ dword leaf_creation_index_stride;
+};
+
+const BUILD_PRIMREFS_GROUPSIZE = 16;
+
+
+metakernel TS_build_primrefs( MKTSBuildArgs build_state, dword allowUpdate )
+{
+ define num_groups((build_state.num_instances + BUILD_PRIMREFS_GROUPSIZE - 1) / BUILD_PRIMREFS_GROUPSIZE);
+ dispatch TS_primrefs_from_instances(num_groups, 1, 1) args(
+ build_state.build_globals,
+ build_state.bvh_buffer,
+ build_state.instance_descs,
+ build_state.num_instances,
+ build_state.build_primref_buffer,
+ build_state.aabb_buffer,
+ build_state.is_procedural_buffer,
+ build_state.aabb_stride,
+ allowUpdate
+ );
+
+}
+
+metakernel TS_build_primrefs_indirect(MKTSBuildArgs build_state, qword indirectBuildRangeInfo, dword allowUpdate)
+{
+ define num_groups REG0;
+ define groupsize_1 REG1; // groupsize - 1
+ define C_4 REG2;
+
+ // init with primitiveCount
+ num_groups = load_dword(indirectBuildRangeInfo);
+ groupsize_1 = 15; // BUILD_PRIMREFS_GROUPSIZE - 1
+ C_4 = 4; // log_2(BUILD_PRIMREFS_GROUPSIZE)
+
+ num_groups = num_groups + groupsize_1;
+ num_groups = num_groups >> C_4; // num_groups / BUILD_PRIMREFS_GROUPSIZE;
+
+ DISPATCHDIM_X = num_groups.lo;
+ DISPATCHDIM_Y = 1;
+ DISPATCHDIM_Z = 1;
+
+ dispatch_indirect TS_primrefs_from_instances_indirect args(
+ build_state.build_globals,
+ build_state.bvh_buffer,
+ build_state.instance_descs,
+ build_state.build_primref_buffer,
+ build_state.aabb_buffer,
+ build_state.is_procedural_buffer,
+ build_state.aabb_stride,
+ allowUpdate,
+ indirectBuildRangeInfo
+ );
+
+}
+
+metakernel TS_build_primrefs_array_of_pointers( MKTSBuildArgs build_state, dword allowUpdate )
+{
+ define num_groups((build_state.num_instances + BUILD_PRIMREFS_GROUPSIZE - 1) / BUILD_PRIMREFS_GROUPSIZE);
+ dispatch TS_primrefs_from_instances_ptrs(num_groups, 1, 1) args(
+ build_state.build_globals,
+ build_state.bvh_buffer,
+ build_state.instance_descs,
+ build_state.num_instances,
+ build_state.build_primref_buffer,
+ build_state.aabb_buffer,
+ build_state.is_procedural_buffer,
+ build_state.aabb_stride,
+ allowUpdate
+ );
+}
+
+metakernel
+TS_build_primrefs_array_of_pointers_indirect(MKTSBuildArgs build_state, qword indirectBuildRangeInfo, dword allowUpdate)
+{
+ define num_groups REG0;
+ define groupsize_1 REG1; // groupsize - 1
+ define C_4 REG2;
+
+ // init with primitiveCount
+ num_groups = load_dword(indirectBuildRangeInfo);
+ groupsize_1 = 15; // BUILD_PRIMREFS_GROUPSIZE - 1
+ C_4 = 4; // log_2(BUILD_PRIMREFS_GROUPSIZE)
+
+ num_groups = num_groups + groupsize_1;
+ num_groups = num_groups >> C_4; // num_groups / BUILD_PRIMREFS_GROUPSIZE;
+
+ DISPATCHDIM_X = num_groups.lo;
+ DISPATCHDIM_Y = 1;
+ DISPATCHDIM_Z = 1;
+
+ dispatch_indirect TS_primrefs_from_instances_ptrs_indirect args(
+ build_state.build_globals,
+ build_state.bvh_buffer,
+ build_state.instance_descs,
+ build_state.build_primref_buffer,
+ build_state.aabb_buffer,
+ build_state.is_procedural_buffer,
+ build_state.aabb_stride,
+ allowUpdate,
+ indirectBuildRangeInfo
+ );
+}
+
+
+
+
+const UPDATE_INSTANCE_LEAVES_GROUPSIZE = 16;
+
+struct MKTSUpdateArgs
+{
+ qword bvh_buffer;
+ qword instance_descs;
+ qword instance_descs_ptrs;
+ qword aabb_buffer;
+ qword is_procedural_buffer;
+ qword refit_scratch;
+ dword aabb_stride;
+ dword num_instances;
+};
+
+metakernel TS_update_instance_leaves( MKTSUpdateArgs update_state )
+{
+ define num_groups((update_state.num_instances + UPDATE_INSTANCE_LEAVES_GROUPSIZE - 1) / UPDATE_INSTANCE_LEAVES_GROUPSIZE);
+ dispatch TS_update_instance_leaves(num_groups, 1, 1) args(
+ update_state.bvh_buffer,
+ update_state.instance_descs,
+ update_state.instance_descs_ptrs,
+ update_state.refit_scratch,
+ update_state.aabb_buffer,
+ update_state.is_procedural_buffer,
+ update_state.aabb_stride
+ );
+}
+
+metakernel TS_update_instance_leaves_indirect( MKTSUpdateArgs update_state, qword indirectBuildRangeInfo )
+{
+ define num_groups REG0;
+ define groupsize_1 REG1; // groupsize - 1
+ define C_4 REG2;
+
+ // init with primitiveCount
+ num_groups = load_dword(indirectBuildRangeInfo);
+ groupsize_1 = 15; // UPDATE_INSTANCE_LEAVES_GROUPSIZE - 1
+ C_4 = 4; // log_2(UPDATE_INSTANCE_LEAVES_GROUPSIZE)
+
+ num_groups = num_groups + groupsize_1;
+ num_groups = num_groups >> C_4; // num_groups / UPDATE_INSTANCE_LEAVES_GROUPSIZE;
+
+ DISPATCHDIM_X = num_groups.lo;
+ DISPATCHDIM_Y = 1;
+ DISPATCHDIM_Z = 1;
+
+ // need to add indirect offset?
+ dispatch_indirect TS_update_instance_leaves args(
+ update_state.bvh_buffer,
+ update_state.instance_descs,
+ update_state.instance_descs_ptrs,
+ update_state.refit_scratch,
+ update_state.aabb_buffer,
+ update_state.is_procedural_buffer,
+ update_state.aabb_stride
+ );
+}
+
+metakernel TS_refit(MKTSUpdateArgs update_state, qword bvh_inner_nodes_start_value, qword bvh_inner_nodes_end )
+{
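+    // DISPATCHDIM_X is computed on the command streamer as
+    // (*bvh_inner_nodes_end - bvh_inner_nodes_start_value), so the host does not need to know the
+    // inner-node count up front.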
+ REG0 = bvh_inner_nodes_start_value;
+ REG1.lo = load_dword(bvh_inner_nodes_end);
+ REG1.hi = 0;
+ REG2 = REG1 - REG0;
+
+ DISPATCHDIM_X = REG2.lo;
+ DISPATCHDIM_Y = 1;
+ DISPATCHDIM_Z = 1;
+
+ dispatch_indirect TS_Refit_per_one_startpoint_sg
+ args(
+ update_state.bvh_buffer,
+ update_state.refit_scratch,
+ update_state.is_procedural_buffer
+ );
+}
+
+
+const FIXUP_LEAVES_NODES_PER_GROUP = 2;
+
+metakernel TS_fixup_leaves(MKTSBuildArgs build_state, qword bvh_inner_nodes_start_value, qword bvh_inner_nodes_end )
+{
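+    // Each group fixes up FIXUP_LEAVES_NODES_PER_GROUP (2) inner nodes, so DISPATCHDIM_X is
+    // computed as ceil((*bvh_inner_nodes_end - bvh_inner_nodes_start_value) / 2) using
+    // command streamer registers.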
+ define ONE REG3;
+
+ ONE = 1;
+ REG0 = bvh_inner_nodes_start_value;
+ REG1.lo = load_dword(bvh_inner_nodes_end);
+ REG1.hi = 0;
+ REG2 = REG1 - REG0;
+ REG2 = REG2 + ONE;
+ REG2 = REG2 >> ONE;
+
+ DISPATCHDIM_X = REG2.lo;
+ DISPATCHDIM_Y = 1;
+ DISPATCHDIM_Z = 1;
+
+ dispatch_indirect TS_fixup_leaves
+ args(
+ build_state.bvh_buffer,
+ build_state.leaf_creation_index_buffer,
+ build_state.build_primref_buffer,
+ build_state.leaf_creation_index_stride
+ );
+
+}
diff --git a/src/intel/vulkan/grl/grl_cl_kernel_gen.py b/src/intel/vulkan/grl/grl_cl_kernel_gen.py
new file mode 100644
index 00000000000..148438e9fa6
--- /dev/null
+++ b/src/intel/vulkan/grl/grl_cl_kernel_gen.py
@@ -0,0 +1,226 @@
+COPYRIGHT = """\
+/*
+ * Copyright 2021 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+"""
+
+import argparse
+import os
+
+from grl_parser import parse_grl_file
+from mako.template import Template
+
+TEMPLATE_H = Template(COPYRIGHT + """
+/* This file generated from ${filename}, don't edit directly. */
+
+#ifndef GRL_CL_KERNEL_H
+#define GRL_CL_KERNEL_H
+
+#include "genxml/gen_macros.h"
+#include "compiler/brw_kernel.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+enum grl_cl_kernel {
+% for k in kernels:
+ GRL_CL_KERNEL_${k.upper()},
+% endfor
+ GRL_CL_KERNEL_MAX,
+};
+
+const char *genX(grl_cl_kernel_name)(enum grl_cl_kernel kernel);
+
+const char *genX(grl_get_cl_kernel_sha1)(enum grl_cl_kernel id);
+
+void genX(grl_get_cl_kernel)(struct brw_kernel *kernel, enum grl_cl_kernel id);
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+#endif /* GRL_CL_KERNEL_H */
+""")
+
+TEMPLATE_C = Template(COPYRIGHT + """
+/* This file generated from ${filename}, don't edit directly. */
+
+#include "grl_cl_kernel.h"
+
+% for k in kernels:
+#include "${prefix}_${k}.h"
+% endfor
+
+const char *
+genX(grl_cl_kernel_name)(enum grl_cl_kernel kernel)
+{
+ switch (kernel) {
+% for k in kernels:
+ case GRL_CL_KERNEL_${k.upper()}: return "${k}";
+% endfor
+ default: return "unknown";
+ }
+}
+
+const char *
+genX(grl_get_cl_kernel_sha1)(enum grl_cl_kernel id)
+{
+ switch (id) {
+% for k in kernels:
+ case GRL_CL_KERNEL_${k.upper()}: return ${prefix}_${k}_sha1;
+% endfor
+ default:
+ unreachable("Invalid GRL kernel enum");
+ }
+};
+
+void
+${prefix}_grl_get_cl_kernel(struct brw_kernel *kernel, enum grl_cl_kernel id)
+{
+ switch (id) {
+% for k in kernels:
+ case GRL_CL_KERNEL_${k.upper()}:
+ *kernel = ${prefix}_${k};
+ break;
+% endfor
+ default:
+ unreachable("Invalid GRL kernel enum");
+ }
+}
+""")
+
+def get_libraries_files(kernel_module):
+ lib_files = []
+ for item in kernel_module[3]:
+ if item[0] != 'library':
+ continue
+ default_file = None
+ fallback_file = None
+ path_directory = None
+ for props in item[2]:
+ if props[0] == 'fallback':
+ fallback_file = props[1]
+ elif props[0] == 'default':
+ default_file = props[1]
+ elif props[0] == 'path':
+ path_directory = props[1]
+ assert path_directory
+ assert default_file or fallback_file
+ if fallback_file:
+ lib_files.append(os.path.join(path_directory, fallback_file))
+ else:
+ lib_files.append(os.path.join(path_directory, default_file))
+ return lib_files
+
+def add_kernels(kernels, cl_file, entrypoint, libs):
+ assert cl_file.endswith('.cl')
+ for lib_file in libs:
+ assert lib_file.endswith('.cl')
+ kernels.append((cl_file, entrypoint, ','.join(libs)))
+
+def get_kernels(grl_nodes):
+ kernels = []
+ for item in grl_nodes:
+ assert isinstance(item, tuple)
+ if item[0] == 'kernel':
+ ann = item[2]
+ add_kernels(kernels, ann['source'], ann['kernelFunction'], [])
+ elif item[0] == 'kernel-module':
+ cl_file = item[2]
+ libfiles = get_libraries_files(item)
+ for kernel_def in item[3]:
+ if kernel_def[0] == 'kernel':
+ ann = kernel_def[2]
+ add_kernels(kernels, cl_file, ann['kernelFunction'], libfiles)
+ return kernels
+
+def parse_libraries(filenames):
+ libraries = {}
+ for fname in filenames:
+ lib_package = parse_grl_file(fname, [])
+ for lib in lib_package:
+ assert lib[0] == 'library'
+ # Add the directory of the library so that CL files can be found.
+ lib[2].append(('path', os.path.dirname(fname)))
+ libraries[lib[1]] = lib
+ return libraries
+
+def main():
+ parser = argparse.ArgumentParser()
+ parser.add_argument('--out-c', help='Output C file')
+ parser.add_argument('--out-h', help='Output H file')
+ parser.add_argument('--ls-kernels', action='store_const', const=True,
+ help='List all openCL kernels')
+ parser.add_argument('--prefix', help='Prefix')
+ parser.add_argument('--library', dest='libraries', action='append',
+ default=[], help='Libraries to include')
+ parser.add_argument('files', type=str, nargs='*', help='GRL files')
+ args = parser.parse_args()
+
+ libraries = parse_libraries(args.libraries)
+
+ kernels = []
+ for fname in args.files:
+ kernels += get_kernels(parse_grl_file(fname, libraries))
+
+ # Make the list of kernels unique and sorted
+ kernels = sorted(list(set(kernels)))
+
+ if args.ls_kernels:
+ for cl_file, entrypoint, libs in kernels:
+ if not os.path.isabs(cl_file):
+ cl_file = os.path.join(os.path.dirname(fname), cl_file)
+ print('{}:{}:{}'.format(cl_file, entrypoint, libs))
+
+ kernel_c_names = []
+ for cl_file, entrypoint, libs in kernels:
+ cl_file = os.path.splitext(cl_file)[0]
+ cl_file_name = cl_file.replace('/', '_')
+ kernel_c_names.append('_'.join([cl_file_name, entrypoint]))
+
+ try:
+ if args.out_h:
+ with open(args.out_h, 'w', encoding='utf-8') as f:
+ f.write(TEMPLATE_H.render(kernels=kernel_c_names,
+ filename=os.path.basename(__file__)))
+
+ if args.out_c:
+ with open(args.out_c, 'w', encoding='utf-8') as f:
+ f.write(TEMPLATE_C.render(kernels=kernel_c_names,
+ prefix=args.prefix,
+ filename=os.path.basename(__file__)))
+ except Exception:
+ # If an error occurs and Python is run in debug mode (__debug__), import
+ # some helpers from mako to print a useful stack trace to stderr and exit
+ # with status 1; otherwise just re-raise the exception.
+ if __debug__:
+ import sys
+ from mako import exceptions
+ sys.stderr.write(exceptions.text_error_template().render() + '\n')
+ sys.exit(1)
+ raise
+
+if __name__ == '__main__':
+ main()
diff --git a/src/intel/vulkan/grl/grl_metakernel_gen.py b/src/intel/vulkan/grl/grl_metakernel_gen.py
new file mode 100644
index 00000000000..6c416bd3d5d
--- /dev/null
+++ b/src/intel/vulkan/grl/grl_metakernel_gen.py
@@ -0,0 +1,933 @@
+#!/bin/env python
+COPYRIGHT = """\
+/*
+ * Copyright 2021 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+"""
+
+import argparse
+import os.path
+import re
+import sys
+
+from grl_parser import parse_grl_file
+
+class Writer(object):
+ def __init__(self, file):
+ self._file = file
+ self._indent = 0
+ self._new_line = True
+
+ def push_indent(self, levels=4):
+ self._indent += levels
+
+ def pop_indent(self, levels=4):
+ self._indent -= levels
+
+ def write(self, s, *fmt):
+ if self._new_line:
+ s = '\n' + s
+ self._new_line = False
+ if s.endswith('\n'):
+ self._new_line = True
+ s = s[:-1]
+ if fmt:
+ s = s.format(*fmt)
+ self._file.write(s.replace('\n', '\n' + ' ' * self._indent))
+
+# Internal Representation
+
+class Value(object):
+ def __init__(self, name=None, zone=None):
+ self.name = name
+ self._zone = zone
+ self.live = False
+
+ @property
+ def zone(self):
+ assert self._zone is not None
+ return self._zone
+
+ def is_reg(self):
+ return False
+
+ def c_val(self):
+ if not self.name:
+ print(self)
+ assert self.name
+ return self.name
+
+ def c_cpu_val(self):
+ assert self.zone == 'cpu'
+ return self.c_val()
+
+ def c_gpu_val(self):
+ if self.zone == 'gpu':
+ return self.c_val()
+ else:
+ return 'mi_imm({})'.format(self.c_cpu_val())
+
+class Constant(Value):
+ def __init__(self, value):
+ super().__init__(zone='cpu')
+ self.value = value
+
+ def c_val(self):
+ if self.value < 100:
+ return str(self.value)
+ elif self.value < (1 << 32):
+ return '0x{:x}u'.format(self.value)
+ else:
+ return '0x{:x}ull'.format(self.value)
+
+class Register(Value):
+ def __init__(self, name):
+ super().__init__(name=name, zone='gpu')
+
+ def is_reg(self):
+ return True
+
+class FixedGPR(Register):
+ def __init__(self, num):
+ super().__init__('REG{}'.format(num))
+ self.num = num
+
+ def write_c(self, w):
+ w.write('UNUSED struct mi_value {} = mi_reserve_gpr(&b, {});\n',
+ self.name, self.num)
+
+class GroupSizeRegister(Register):
+ def __init__(self, comp):
+ super().__init__('DISPATCHDIM_' + 'XYZ'[comp])
+ self.comp = comp
+
+class Member(Value):
+ def __init__(self, value, member):
+ super().__init__(zone=value.zone)
+ self.value = value
+ self.member = member
+
+ def is_reg(self):
+ return self.value.is_reg()
+
+ def c_val(self):
+ c_val = self.value.c_val()
+ if self.zone == 'gpu':
+ assert isinstance(self.value, Register)
+ if self.member == 'hi':
+ return 'mi_value_half({}, true)'.format(c_val)
+ elif self.member == 'lo':
+ return 'mi_value_half({}, false)'.format(c_val)
+ else:
+ assert False, 'Invalid member: {}'.format(self.member)
+ else:
+ return '.'.join([c_val, self.member])
+
+class OffsetOf(Value):
+ def __init__(self, mk, expr):
+ super().__init__(zone='cpu')
+ assert isinstance(expr, tuple) and expr[0] == 'member'
+ self.type = mk.m.get_type(expr[1])
+ self.field = expr[2]
+
+ def c_val(self):
+ return 'offsetof({}, {})'.format(self.type.c_name, self.field)
+
+class Scope(object):
+ def __init__(self, m, mk, parent):
+ self.m = m
+ self.mk = mk
+ self.parent = parent
+ self.defs = {}
+
+ def add_def(self, d, name=None):
+ if name is None:
+ name = d.name
+ assert name not in self.defs
+ self.defs[name] = d
+
+ def get_def(self, name):
+ if name in self.defs:
+ return self.defs[name]
+ assert self.parent, 'Unknown definition: "{}"'.format(name)
+ return self.parent.get_def(name)
+
+class Statement(object):
+ def __init__(self, srcs=[]):
+ assert isinstance(srcs, (list, tuple))
+ self.srcs = list(srcs)
+
+class SSAStatement(Statement, Value):
+ _count = 0
+
+ def __init__(self, zone, srcs):
+ Statement.__init__(self, srcs)
+ Value.__init__(self, None, zone)
+ self.c_name = '_tmp{}'.format(SSAStatement._count)
+ SSAStatement._count += 1
+
+ def c_val(self):
+ return self.c_name
+
+ def write_c_refs(self, w):
+ assert self.zone == 'gpu'
+ assert self.uses > 0
+ if self.uses > 1:
+ w.write('mi_value_add_refs(&b, {}, {});\n',
+ self.c_name, self.uses - 1)
+
+class Half(SSAStatement):
+ def __init__(self, value, half):
+ assert half in ('hi', 'lo')
+ super().__init__(None, [value])
+ self.half = half
+
+ @property
+ def zone(self):
+ return self.srcs[0].zone
+
+ def write_c(self, w):
+ assert self.half in ('hi', 'lo')
+ if self.zone == 'cpu':
+ if self.half == 'hi':
+ w.write('uint32_t {} = (uint64_t)({}) >> 32;\n',
+ self.c_name, self.srcs[0].c_cpu_val())
+ else:
+ w.write('uint32_t {} = {};\n',
+ self.c_name, self.srcs[0].c_cpu_val())
+ else:
+ if self.half == 'hi':
+ w.write('struct mi_value {} = mi_value_half({}, true);\n',
+ self.c_name, self.srcs[0].c_gpu_val())
+ else:
+ w.write('struct mi_value {} = mi_value_half({}, false);\n',
+ self.c_name, self.srcs[0].c_gpu_val())
+ self.write_c_refs(w)
+
+class Expression(SSAStatement):
+ def __init__(self, mk, op, *srcs):
+ super().__init__(None, srcs)
+ self.op = op
+
+ @property
+ def zone(self):
+ zone = 'cpu'
+ for s in self.srcs:
+ if s.zone == 'gpu':
+ zone = 'gpu'
+ return zone
+
+ def write_c(self, w):
+ if self.zone == 'cpu':
+ w.write('uint64_t {} = ', self.c_name)
+ c_cpu_vals = [s.c_cpu_val() for s in self.srcs]
+ if len(self.srcs) == 1:
+ w.write('({} {})', self.op, c_cpu_vals[0])
+ elif len(self.srcs) == 2:
+ w.write('({} {} {})', c_cpu_vals[0], self.op, c_cpu_vals[1])
+ else:
+ assert len(self.srcs) == 3 and self.op == '?'
+ w.write('({} ? {} : {})', *c_cpu_vals)
+ w.write(';\n')
+ return
+
+ w.write('struct mi_value {} = ', self.c_name)
+ if self.op == '~':
+ w.write('mi_inot(&b, {});\n', self.srcs[0].c_gpu_val())
+ elif self.op == '+':
+ w.write('mi_iadd(&b, {}, {});\n',
+ self.srcs[0].c_gpu_val(), self.srcs[1].c_gpu_val())
+ elif self.op == '-':
+ w.write('mi_isub(&b, {}, {});\n',
+ self.srcs[0].c_gpu_val(), self.srcs[1].c_gpu_val())
+ elif self.op == '&':
+ w.write('mi_iand(&b, {}, {});\n',
+ self.srcs[0].c_gpu_val(), self.srcs[1].c_gpu_val())
+ elif self.op == '|':
+ w.write('mi_ior(&b, {}, {});\n',
+ self.srcs[0].c_gpu_val(), self.srcs[1].c_gpu_val())
+ elif self.op == '<<':
+ if self.srcs[1].zone == 'cpu':
+ w.write('mi_ishl_imm(&b, {}, {});\n',
+ self.srcs[0].c_gpu_val(), self.srcs[1].c_cpu_val())
+ else:
+ w.write('mi_ishl(&b, {}, {});\n',
+ self.srcs[0].c_gpu_val(), self.srcs[1].c_gpu_val())
+ elif self.op == '>>':
+ if self.srcs[1].zone == 'cpu':
+ w.write('mi_ushr_imm(&b, {}, {});\n',
+ self.srcs[0].c_gpu_val(), self.srcs[1].c_cpu_val())
+ else:
+ w.write('mi_ushr(&b, {}, {});\n',
+ self.srcs[0].c_gpu_val(), self.srcs[1].c_gpu_val())
+ elif self.op == '==':
+ w.write('mi_ieq(&b, {}, {});\n',
+ self.srcs[0].c_gpu_val(), self.srcs[1].c_gpu_val())
+ elif self.op == '<':
+ w.write('mi_ult(&b, {}, {});\n',
+ self.srcs[0].c_gpu_val(), self.srcs[1].c_gpu_val())
+ elif self.op == '>':
+ w.write('mi_ult(&b, {}, {});\n',
+ self.srcs[1].c_gpu_val(), self.srcs[0].c_gpu_val())
+ elif self.op == '<=':
+ w.write('mi_uge(&b, {}, {});\n',
+ self.srcs[1].c_gpu_val(), self.srcs[0].c_gpu_val())
+ else:
+ assert False, 'Unknown expression opcode: {}'.format(self.op)
+ self.write_c_refs(w)
+
+class StoreReg(Statement):
+ def __init__(self, mk, reg, value):
+ super().__init__([mk.load_value(value)])
+ self.reg = mk.parse_value(reg)
+ assert self.reg.is_reg()
+
+ def write_c(self, w):
+ value = self.srcs[0]
+ w.write('mi_store(&b, {}, {});\n',
+ self.reg.c_gpu_val(), value.c_gpu_val())
+
+class LoadMem(SSAStatement):
+ def __init__(self, mk, bit_size, addr):
+ super().__init__('gpu', [mk.load_value(addr)])
+ self.bit_size = bit_size
+
+ def write_c(self, w):
+ addr = self.srcs[0]
+ w.write('struct mi_value {} = ', self.c_name)
+ if addr.zone == 'cpu':
+ w.write('mi_mem{}(anv_address_from_u64({}));\n',
+ self.bit_size, addr.c_cpu_val())
+ else:
+ assert self.bit_size == 64
+ w.write('mi_load_mem64_offset(&b, anv_address_from_u64(0), {});\n',
+ addr.c_gpu_val())
+ self.write_c_refs(w)
+
+class StoreMem(Statement):
+ def __init__(self, mk, bit_size, addr, src):
+ super().__init__([mk.load_value(addr), mk.load_value(src)])
+ self.bit_size = bit_size
+
+ def write_c(self, w):
+ addr, data = tuple(self.srcs)
+ if addr.zone == 'cpu':
+ w.write('mi_store(&b, mi_mem{}(anv_address_from_u64({})), {});\n',
+ self.bit_size, addr.c_cpu_val(), data.c_gpu_val())
+ else:
+ assert self.bit_size == 64
+ w.write('mi_store_mem64_offset(&b, anv_address_from_u64(0), {}, {});\n',
+ addr.c_gpu_val(), data.c_gpu_val())
+
+class GoTo(Statement):
+ def __init__(self, mk, target_id, cond=None, invert=False):
+ cond = [mk.load_value(cond)] if cond is not None else []
+ super().__init__(cond)
+ self.target_id = target_id
+ self.invert = invert
+ self.mk = mk
+
+ def write_c(self, w):
+ # Now that we've parsed the entire metakernel, we can look up the
+ # actual target from the id
+ target = self.mk.get_goto_target(self.target_id)
+
+ if self.srcs:
+ cond = self.srcs[0]
+ if self.invert:
+ w.write('mi_goto_if(&b, mi_inot(&b, {}), &{});\n', cond.c_gpu_val(), target.c_name)
+ else:
+ w.write('mi_goto_if(&b, {}, &{});\n', cond.c_gpu_val(), target.c_name)
+ else:
+ w.write('mi_goto(&b, &{});\n', target.c_name)
+
+class GoToTarget(Statement):
+ def __init__(self, mk, name):
+ super().__init__()
+ self.name = name
+ self.c_name = '_goto_target_' + name
+ self.goto_tokens = []
+
+ mk.add_goto_target(self)
+
+ def write_decl(self, w):
+ w.write('struct mi_goto_target {} = MI_GOTO_TARGET_INIT;\n',
+ self.c_name)
+
+ def write_c(self, w):
+ w.write('mi_goto_target(&b, &{});\n', self.c_name)
+
+class Dispatch(Statement):
+ def __init__(self, mk, kernel, group_size, args, postsync):
+ if group_size is None:
+ srcs = [mk.scope.get_def('DISPATCHDIM_{}'.format(d)) for d in 'XYZ']
+ else:
+ srcs = [mk.load_value(s) for s in group_size]
+ srcs += [mk.load_value(a) for a in args]
+ super().__init__(srcs)
+ self.kernel = mk.m.kernels[kernel]
+ self.indirect = group_size is None
+ self.postsync = postsync
+
+ def write_c(self, w):
+ w.write('{\n')
+ w.push_indent()
+
+ group_size = self.srcs[:3]
+ args = self.srcs[3:]
+ if not self.indirect:
+ w.write('const uint32_t _group_size[3] = {{ {}, {}, {} }};\n',
+ *[s.c_cpu_val() for s in group_size])
+ gs = '_group_size'
+ else:
+ gs = 'NULL'
+
+ w.write('const struct anv_kernel_arg _args[] = {\n')
+ w.push_indent()
+ for arg in args:
+ w.write('{{ .u64 = {} }},\n', arg.c_cpu_val())
+ w.pop_indent()
+ w.write('};\n')
+
+ w.write('genX(grl_dispatch)(cmd_buffer, {},\n', self.kernel.c_name)
+ w.write(' {}, ARRAY_SIZE(_args), _args);\n', gs)
+ w.pop_indent()
+ w.write('}\n')
+
+class SemWait(Statement):
+ def __init__(self, scope, wait):
+ super().__init__()
+ self.wait = wait
+
+class Control(Statement):
+ def __init__(self, scope, wait):
+ super().__init__()
+ self.wait = wait
+
+ def write_c(self, w):
+ w.write('cmd_buffer->state.pending_pipe_bits |=\n')
+ w.write(' ANV_PIPE_CS_STALL_BIT |\n')
+ w.write(' ANV_PIPE_DATA_CACHE_FLUSH_BIT |\n')
+ w.write(' ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT;\n')
+ w.write('genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);\n')
+
+TYPE_REMAPS = {
+ 'dword' : 'uint32_t',
+ 'qword' : 'uint64_t',
+}
+
+class Module(object):
+ def __init__(self, grl_dir, elems):
+ assert isinstance(elems[0], tuple)
+ assert elems[0][0] == 'module-name'
+ self.grl_dir = grl_dir
+ self.name = elems[0][1]
+ self.kernels = {}
+ self.structs = {}
+ self.constants = []
+ self.metakernels = []
+ self.regs = {}
+
+ scope = Scope(self, None, None)
+ for e in elems[1:]:
+ if e[0] == 'kernel':
+ k = Kernel(self, *e[1:])
+ assert k.name not in self.kernels
+ self.kernels[k.name] = k
+ elif e[0] == 'kernel-module':
+ m = KernelModule(self, *e[1:])
+ for k in m.kernels:
+ assert k.name not in self.kernels
+ self.kernels[k.name] = k
+ elif e[0] == 'struct':
+ s = Struct(self, *e[1:])
+ assert s.name not in self.structs
+ self.structs[s.name] = s
+ elif e[0] == 'named-constant':
+ c = NamedConstant(*e[1:])
+ scope.add_def(c)
+ self.constants.append(c)
+ elif e[0] == 'meta-kernel':
+ mk = MetaKernel(self, scope, *e[1:])
+ self.metakernels.append(mk)
+ elif e[0] == 'import':
+ assert e[2] == 'struct'
+ self.import_struct(e[1], e[3])
+ else:
+ assert False, 'Invalid module-level token: {}'.format(e[0])
+
+ def import_struct(self, filename, struct_name):
+ elems = parse_grl_file(os.path.join(self.grl_dir, filename), [])
+ assert elems
+ for e in elems[1:]:
+ if e[0] == 'struct' and e[1] == struct_name:
+ s = Struct(self, *e[1:])
+ assert s.name not in self.structs
+ self.structs[s.name] = s
+ return
+ assert False, "Struct {0} not found in {1}".format(struct_name, filename)
+
+ def get_type(self, name):
+ if name in self.structs:
+ return self.structs[name]
+ return BasicType(TYPE_REMAPS.get(name, name))
+
+ def get_fixed_gpr(self, num):
+ assert isinstance(num, int)
+ if num in self.regs:
+ return self.regs[num]
+
+ reg = FixedGPR(num)
+ self.regs[num] = reg
+ return reg
+
+ def optimize(self):
+ progress = True
+ while progress:
+ progress = False
+
+ # Copy Propagation
+ for mk in self.metakernels:
+ if mk.opt_copy_prop():
+ progress = True
+
+ # Dead Code Elimination
+ for r in self.regs.values():
+ r.live = False
+ for c in self.constants:
+ c.live = False
+ for mk in self.metakernels:
+ mk.opt_dead_code1()
+ for mk in self.metakernels:
+ if mk.opt_dead_code2():
+ progress = True
+ for n in list(self.regs.keys()):
+ if not self.regs[n].live:
+ del self.regs[n]
+ progress = True
+ self.constants = [c for c in self.constants if c.live]
+
+ def compact_regs(self):
+ old_regs = self.regs
+ self.regs = {}
+ for i, reg in enumerate(old_regs.values()):
+ reg.num = i
+ self.regs[i] = reg
+
+ def write_h(self, w):
+ for s in self.structs.values():
+ s.write_h(w)
+ for mk in self.metakernels:
+ mk.write_h(w)
+
+ def write_c(self, w):
+ for c in self.constants:
+ c.write_c(w)
+ for mk in self.metakernels:
+ mk.write_c(w)
+
+class Kernel(object):
+ def __init__(self, m, name, ann):
+ self.name = name
+ self.source_file = ann['source']
+ self.kernel_name = self.source_file.replace('/', '_')[:-3].upper()
+ self.entrypoint = ann['kernelFunction']
+
+ assert self.source_file.endswith('.cl')
+ self.c_name = '_'.join([
+ 'GRL_CL_KERNEL',
+ self.kernel_name,
+ self.entrypoint.upper(),
+ ])
+
+class KernelModule(object):
+ def __init__(self, m, name, source, kernels):
+ self.name = name
+ self.kernels = []
+ self.libraries = []
+
+ for k in kernels:
+ if k[0] == 'kernel':
+ k[2]['source'] = source
+ self.kernels.append(Kernel(m, *k[1:]))
+ elif k[0] == 'library':
+ # Skip this for now.
+ pass
+
+class BasicType(object):
+ def __init__(self, name):
+ self.name = name
+ self.c_name = name
+
+class Struct(object):
+ def __init__(self, m, name, fields, align):
+ assert align == 0
+ self.name = name
+ self.c_name = 'struct ' + '_'.join(['grl', m.name, self.name])
+ self.fields = [(m.get_type(t), n) for t, n in fields]
+
+ def write_h(self, w):
+ w.write('{} {{\n', self.c_name)
+ w.push_indent()
+ for f in self.fields:
+ w.write('{} {};\n', f[0].c_name, f[1])
+ w.pop_indent()
+ w.write('};\n')
+
+class NamedConstant(Value):
+ def __init__(self, name, value):
+ super().__init__(name, 'cpu')
+ self.name = name
+ self.value = Constant(value)
+ self.written = False
+
+ def set_module(self, m):
+ pass
+
+ def write_c(self, w):
+ if self.written:
+ return
+ w.write('static const uint64_t {} = {};\n',
+ self.name, self.value.c_val())
+ self.written = True
+
+class MetaKernelParameter(Value):
+ def __init__(self, mk, type, name):
+ super().__init__(name, 'cpu')
+ self.type = mk.m.get_type(type)
+
+class MetaKernel(object):
+ def __init__(self, m, m_scope, name, params, ann, statements):
+ self.m = m
+ self.name = name
+ self.c_name = '_'.join(['grl', m.name, self.name])
+ self.goto_targets = {}
+ self.num_tmps = 0
+
+ mk_scope = Scope(m, self, m_scope)
+
+ self.params = [MetaKernelParameter(self, *p) for p in params]
+ for p in self.params:
+ mk_scope.add_def(p)
+
+ mk_scope.add_def(GroupSizeRegister(0), name='DISPATCHDIM_X')
+ mk_scope.add_def(GroupSizeRegister(1), name='DISPATCHDIM_Y')
+ mk_scope.add_def(GroupSizeRegister(2), name='DISPATCHDIM_Z')
+
+ self.statements = []
+ self.parse_stmt(mk_scope, statements)
+ self.scope = None
+
+ def get_tmp(self):
+ tmpN = '_tmp{}'.format(self.num_tmps)
+ self.num_tmps += 1
+ return tmpN
+
+ def add_stmt(self, stmt):
+ self.statements.append(stmt)
+ return stmt
+
+ def parse_value(self, v):
+ if isinstance(v, Value):
+ return v
+ elif isinstance(v, str):
+ if re.match(r'REG\d+', v):
+ return self.m.get_fixed_gpr(int(v[3:]))
+ else:
+ return self.scope.get_def(v)
+ elif isinstance(v, int):
+ return Constant(v)
+ elif isinstance(v, tuple):
+ if v[0] == 'member':
+ return Member(self.parse_value(v[1]), v[2])
+ elif v[0] == 'offsetof':
+ return OffsetOf(self, v[1])
+ else:
+ op = v[0]
+ srcs = [self.parse_value(s) for s in v[1:]]
+ return self.add_stmt(Expression(self, op, *srcs))
+ else:
+ assert False, 'Invalid value: {}'.format(v)
+
+ def load_value(self, v):
+ v = self.parse_value(v)
+ if isinstance(v, Member) and v.zone == 'gpu':
+ v = self.add_stmt(Half(v.value, v.member))
+ return v
+
+ def parse_stmt(self, scope, s):
+ self.scope = scope
+ if isinstance(s, list):
+ subscope = Scope(self.m, self, scope)
+ for stmt in s:
+ self.parse_stmt(subscope, stmt)
+ elif s[0] == 'define':
+ scope.add_def(self.parse_value(s[2]), name=s[1])
+ elif s[0] == 'assign':
+ self.add_stmt(StoreReg(self, *s[1:]))
+ elif s[0] == 'dispatch':
+ self.add_stmt(Dispatch(self, *s[1:]))
+ elif s[0] == 'load-dword':
+ v = self.add_stmt(LoadMem(self, 32, s[2]))
+ self.add_stmt(StoreReg(self, s[1], v))
+ elif s[0] == 'load-qword':
+ v = self.add_stmt(LoadMem(self, 64, s[2]))
+ self.add_stmt(StoreReg(self, s[1], v))
+ elif s[0] == 'store-dword':
+ self.add_stmt(StoreMem(self, 32, *s[1:]))
+ elif s[0] == 'store-qword':
+ self.add_stmt(StoreMem(self, 64, *s[1:]))
+ elif s[0] == 'goto':
+ self.add_stmt(GoTo(self, s[1]))
+ elif s[0] == 'goto-if':
+ self.add_stmt(GoTo(self, s[1], s[2]))
+ elif s[0] == 'goto-if-not':
+ self.add_stmt(GoTo(self, s[1], s[2], invert=True))
+ elif s[0] == 'label':
+ self.add_stmt(GoToTarget(self, s[1]))
+ elif s[0] == 'control':
+ self.add_stmt(Control(self, s[1]))
+ elif s[0] == 'sem-wait-while':
+ self.add_stmt(Control(self, s[1]))
+ else:
+ assert False, 'Invalid statement: {}'.format(s[0])
+
+ def add_goto_target(self, t):
+ assert t.name not in self.goto_targets
+ self.goto_targets[t.name] = t
+
+ def get_goto_target(self, name):
+ return self.goto_targets[name]
+
+ def opt_copy_prop(self):
+ progress = False
+ copies = {}
+ for stmt in self.statements:
+ for i in range(len(stmt.srcs)):
+ src = stmt.srcs[i]
+ if isinstance(src, FixedGPR) and src.num in copies:
+ stmt.srcs[i] = copies[src.num]
+ progress = True
+
+ if isinstance(stmt, StoreReg):
+ reg = stmt.reg
+ if isinstance(reg, Member):
+ reg = reg.value
+
+ if isinstance(reg, FixedGPR):
+ copies.pop(reg.num, None)
+ if not stmt.srcs[0].is_reg():
+ copies[reg.num] = stmt.srcs[0]
+ elif isinstance(stmt, (GoTo, GoToTarget)):
+ copies = {}
+
+ return progress
+
+ def opt_dead_code1(self):
+ for stmt in self.statements:
+ # Mark every register which is read as live
+ for src in stmt.srcs:
+ if isinstance(src, Register):
+ src.live = True
+
+ # Initialize every SSA statement to dead
+ if isinstance(stmt, SSAStatement):
+ stmt.live = False
+
+ def opt_dead_code2(self):
+ def yield_live(statements):
+ gprs_read = set(self.m.regs.keys())
+ for stmt in statements:
+ if isinstance(stmt, SSAStatement):
+ if not stmt.live:
+ continue
+ elif isinstance(stmt, StoreReg):
+ reg = stmt.reg
+ if isinstance(reg, Member):
+ reg = reg.value
+
+ if not stmt.reg.live:
+ continue
+
+ if isinstance(reg, FixedGPR):
+ if reg.num in gprs_read:
+ gprs_read.remove(reg.num)
+ else:
+ continue
+ elif isinstance(stmt, (GoTo, GoToTarget)):
+ gprs_read = set(self.m.regs.keys())
+
+ for src in stmt.srcs:
+ src.live = True
+ if isinstance(src, FixedGPR):
+ gprs_read.add(src.num)
+ yield stmt
+
+ old_stmt_list = self.statements
+ old_stmt_list.reverse()
+ self.statements = list(yield_live(old_stmt_list))
+ self.statements.reverse()
+ return len(self.statements) != len(old_stmt_list)
+
+ def count_ssa_value_uses(self):
+ for stmt in self.statements:
+ if isinstance(stmt, SSAStatement):
+ stmt.uses = 0
+
+ for src in stmt.srcs:
+ if isinstance(src, SSAStatement):
+ src.uses += 1
+
+ def write_h(self, w):
+ w.write('void\n')
+ w.write('genX({})(\n', self.c_name)
+ w.push_indent()
+ w.write('struct anv_cmd_buffer *cmd_buffer')
+ for p in self.params:
+ w.write(',\n{} {}', p.type.c_name, p.name)
+ w.write(');\n')
+ w.pop_indent()
+
+ def write_c(self, w):
+ w.write('void\n')
+ w.write('genX({})(\n', self.c_name)
+ w.push_indent()
+ w.write('struct anv_cmd_buffer *cmd_buffer')
+ for p in self.params:
+ w.write(',\n{} {}', p.type.c_name, p.name)
+ w.write(')\n')
+ w.pop_indent()
+ w.write('{\n')
+ w.push_indent()
+
+ w.write('struct mi_builder b;\n')
+ w.write('mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);\n')
+ w.write('/* TODO: use anv_mocs? */\n')
+ w.write('const uint32_t mocs = isl_mocs(&cmd_buffer->device->isl_dev, 0, false);\n')
+ w.write('mi_builder_set_mocs(&b, mocs);\n')
+ w.write('\n')
+
+ for r in self.m.regs.values():
+ r.write_c(w)
+ w.write('\n')
+
+ for t in self.goto_targets.values():
+ t.write_decl(w)
+ w.write('\n')
+
+ self.count_ssa_value_uses()
+ for s in self.statements:
+ s.write_c(w)
+
+ w.pop_indent()
+
+ w.write('}\n')
+
+HEADER_PROLOGUE = COPYRIGHT + '''
+#include "anv_private.h"
+#include "grl/genX_grl.h"
+
+#ifndef {0}
+#define {0}
+
+#ifdef __cplusplus
+extern "C" {{
+#endif
+
+'''
+
+HEADER_EPILOGUE = '''
+#ifdef __cplusplus
+}}
+#endif
+
+#endif /* {0} */
+'''
+
+C_PROLOGUE = COPYRIGHT + '''
+#include "{0}"
+
+#include "genxml/gen_macros.h"
+#include "genxml/genX_pack.h"
+#include "genxml/genX_rt_pack.h"
+
+/* We reserve :
+ * - GPR 14 for secondary command buffer returns
+ * - GPR 15 for conditional rendering
+ */
+#define MI_BUILDER_NUM_ALLOC_GPRS 14
+#define __gen_get_batch_dwords anv_batch_emit_dwords
+#define __gen_address_offset anv_address_add
+#define __gen_get_batch_address(b, a) anv_batch_address(b, a)
+#include "common/mi_builder.h"
+
+#define MI_PREDICATE_RESULT mi_reg32(0x2418)
+#define DISPATCHDIM_X mi_reg32(0x2500)
+#define DISPATCHDIM_Y mi_reg32(0x2504)
+#define DISPATCHDIM_Z mi_reg32(0x2508)
+'''
+
+def parse_libraries(filenames):
+ libraries = {}
+ for fname in filenames:
+ lib_package = parse_grl_file(fname, [])
+ for lib in lib_package:
+ assert lib[0] == 'library'
+ # Add the directory of the library so that CL files can be found.
+ lib[2].append(('path', os.path.dirname(fname)))
+ libraries[lib[1]] = lib
+ return libraries
+
+def main():
+ argparser = argparse.ArgumentParser()
+ argparser.add_argument('--out-c', help='Output C file')
+ argparser.add_argument('--out-h', help='Output H file')
+ argparser.add_argument('--library', dest='libraries', action='append',
+ default=[], help='Libraries to include')
+ argparser.add_argument('grl', help="Input file")
+ args = argparser.parse_args()
+
+ grl_dir = os.path.dirname(args.grl)
+
+ libraries = parse_libraries(args.libraries)
+
+ ir = parse_grl_file(args.grl, libraries)
+
+ m = Module(grl_dir, ir)
+ m.optimize()
+ m.compact_regs()
+
+ with open(args.out_h, 'w') as f:
+ guard = os.path.splitext(os.path.basename(args.out_h))[0].upper()
+ w = Writer(f)
+ w.write(HEADER_PROLOGUE, guard)
+ m.write_h(w)
+ w.write(HEADER_EPILOGUE, guard)
+
+ with open(args.out_c, 'w') as f:
+ w = Writer(f)
+ w.write(C_PROLOGUE, os.path.basename(args.out_h))
+ m.write_c(w)
+
+if __name__ == '__main__':
+ main()
diff --git a/src/intel/vulkan/grl/grl_parser.py b/src/intel/vulkan/grl/grl_parser.py
new file mode 100644
index 00000000000..2d62b25a169
--- /dev/null
+++ b/src/intel/vulkan/grl/grl_parser.py
@@ -0,0 +1,586 @@
+#!/bin/env python
+COPYRIGHT = """\
+/*
+ * Copyright 2021 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+"""
+
+import os
+import re
+import ply.lex as lex
+import ply.yacc as yacc
+
+# Libraries
+
+libraries = {}
+
+# LEXER
+
+keywords = {
+ '__debugbreak': 'KW_DEBUGBREAK',
+ 'alignas': 'KW_ALIGNAS',
+ 'args': 'KW_ARGS',
+ 'atomic': 'KW_ATOMIC',
+ 'atomic_return': 'KW_ATOMIC_RETURN',
+ 'const': 'KW_CONST',
+ 'control': 'KW_CONTROL',
+ 'define': 'KW_DEFINE',
+ 'dispatch': 'KW_DISPATCH',
+ 'dispatch_indirect': 'KW_DISPATCH_INDIRECT',
+ 'goto': 'KW_GOTO',
+ 'if': 'KW_IF',
+ 'kernel': 'KW_KERNEL',
+ 'kernel_module': 'KW_KERNEL_MODULE',
+ 'import': 'KW_IMPORT',
+ 'library': 'KW_LIBRARY',
+ 'links': 'KW_LINKS',
+ 'load_dword': 'KW_LOAD_DWORD',
+ 'load_qword': 'KW_LOAD_QWORD',
+ 'metakernel': 'KW_METAKERNEL',
+ 'module': 'KW_MODULE',
+ 'not': 'KW_NOT',
+ 'offsetof': 'KW_OFFSETOF',
+ 'postsync': 'KW_POSTSYNC',
+ 'print': 'KW_PRINT',
+ 'semaphore_wait': 'KW_SEMAPHORE_WAIT',
+ 'shiftof': 'KW_SHIFTOF',
+ 'sizeof': 'KW_SIZEOF',
+ 'store_dword': 'KW_STORE_DWORD',
+ 'store_qword': 'KW_STORE_QWORD',
+ 'store_timestamp': 'KW_STORE_TIMESTAMP',
+ 'struct': 'KW_STRUCT',
+ 'unsigned': 'KW_UNSIGNED',
+ 'while': 'KW_WHILE'
+}
+
+ops = {
+ '&&': 'OP_LOGICAL_AND',
+ '||': 'OP_LOGICAL_OR',
+ '==': 'OP_EQUALEQUAL',
+ '!=': 'OP_NOTEQUAL',
+ '<=': 'OP_LESSEQUAL',
+ '>=': 'OP_GREATEREQUAL',
+ '<<': 'OP_LSHIFT',
+ '>>': 'OP_RSHIFT'
+}
+
+tokens = [
+ 'INT_LITERAL',
+ 'STRING_LITERAL',
+ 'OP',
+ 'IDENTIFIER'
+] + list(keywords.values()) + list(ops.values())
+
+def t_INT_LITERAL(t):
+ r'(0x[a-fA-F0-9]+|\d+)'
+ if t.value.startswith('0x'):
+ t.value = int(t.value[2:], 16)
+ else:
+ t.value = int(t.value)
+ return t
+
+def t_OP(t):
+ r'(&&|\|\||==|!=|<=|>=|<<|>>)'
+ t.type = ops.get(t.value)
+ return t
+
+def t_IDENTIFIER(t):
+ r'[a-zA-Z_][a-zA-Z_0-9]*'
+ t.type = keywords.get(t.value, 'IDENTIFIER')
+ return t
+
+def t_STRING_LITERAL(t):
+ r'"(\\.|[^"\\])*"'
+ t.value = t.value[1:-1]
+ return t
+
+literals = "+*/(){};:,=&|!~^.%?-<>[]"
+
+t_ignore = ' \t'
+
+def t_newline(t):
+ r'\n+'
+ t.lexer.lineno += len(t.value)
+
+def t_error(t):
+ print("WUT: {}".format(t.value))
+ t.lexer.skip(1)
+
+LEXER = lex.lex()
+
+# PARSER
+
+precedence = (
+ ('right', '?', ':'),
+ ('left', 'OP_LOGICAL_OR', 'OP_LOGICAL_AND'),
+ ('left', '|'),
+ ('left', '^'),
+ ('left', '&'),
+ ('left', 'OP_EQUALEQUAL', 'OP_NOTEQUAL'),
+ ('left', '<', '>', 'OP_LESSEQUAL', 'OP_GREATEREQUAL'),
+ ('left', 'OP_LSHIFT', 'OP_RSHIFT'),
+ ('left', '+', '-'),
+ ('left', '*', '/', '%'),
+ ('right', '!', '~'),
+ ('left', '[', ']', '.')
+)
+
+def p_module(p):
+ 'module : element_list'
+ p[0] = p[1]
+
+def p_element_list(p):
+ '''element_list : element_list element
+ | element'''
+ if len(p) == 2:
+ p[0] = [p[1]]
+ else:
+ p[0] = p[1] + [p[2]]
+
+def p_element(p):
+ '''element : kernel_definition
+ | kernel_module_definition
+ | library_definition
+ | metakernel_definition
+ | module_name
+ | struct_definition
+ | const_definition
+ | import_definition'''
+ p[0] = p[1]
+
+def p_module_name(p):
+ 'module_name : KW_MODULE IDENTIFIER ";"'
+ p[0] = ('module-name', p[2])
+
+def p_kernel_module_definition(p):
+ 'kernel_module_definition : KW_KERNEL_MODULE IDENTIFIER "(" STRING_LITERAL ")" "{" kernel_definition_list "}"'
+ p[0] = ('kernel-module', p[2], p[4], p[7])
+
+def p_kernel_definition(p):
+ 'kernel_definition : KW_KERNEL IDENTIFIER optional_annotation_list'
+ p[0] = ('kernel', p[2], p[3])
+
+def p_library_definition(p):
+ 'library_definition : KW_LIBRARY IDENTIFIER "{" library_definition_list "}"'
+ p[0] = ('library', p[2], p[4])
+
+def p_library_definition_list(p):
+ '''library_definition_list :
+ | library_definition_list IDENTIFIER STRING_LITERAL ";"'''
+ if len(p) < 3:
+ p[0] = []
+ else:
+ p[0] = p[1]
+ p[0].append((p[2], p[3]))
+
+def p_import_definition(p):
+ 'import_definition : KW_IMPORT KW_STRUCT IDENTIFIER STRING_LITERAL ";"'
+ p[0] = ('import', p[4], 'struct', p[3])
+
+def p_links_definition(p):
+ 'links_definition : KW_LINKS IDENTIFIER'
+
+ # Process a library include like a preprocessor
+ global libraries
+
+ if p[2] not in libraries:
+ raise Exception("Unable to find library {0}".format(p[2]))
+ p[0] = libraries[p[2]]
+
+def p_metakernel_definition(p):
+ 'metakernel_definition : KW_METAKERNEL IDENTIFIER "(" optional_parameter_list ")" optional_annotation_list scope'
+ p[0] = ('meta-kernel', p[2], p[4], p[6], p[7])
+
+def p_kernel_definition_list(p):
+ '''kernel_definition_list :
+ | kernel_definition_list kernel_definition ";"
+ | kernel_definition_list links_definition ";"'''
+ if len(p) < 3:
+ p[0] = []
+ else:
+ p[0] = p[1]
+ p[0].append(p[2])
+
+def p_optional_annotation_list(p):
+ '''optional_annotation_list :
+ | "<" ">"
+ | "<" annotation_list ">"'''
+ if len(p) < 4:
+ p[0] = {}
+ else:
+ p[0] = p[2]
+
+def p_optional_parameter_list(p):
+ '''optional_parameter_list :
+ | parameter_list'''
+ p[0] = p[1] if len(p) > 1 else []
+
+def p_annotation_list(p):
+ '''annotation_list : annotation'''
+ p[0] = p[1]
+
+def p_annotation_list_append(p):
+ '''annotation_list : annotation_list "," annotation'''
+ p[0] = {**p[1], **p[3]}
+
+def p_annotation(p):
+ '''annotation : IDENTIFIER "=" INT_LITERAL
+ | IDENTIFIER "=" IDENTIFIER
+ | IDENTIFIER "=" STRING_LITERAL'''
+ p[0] = {p[1]: p[3]}
+
+def p_parameter_list(p):
+ '''parameter_list : parameter_definition'''
+ p[0] = [p[1]]
+
+def p_parameter_list_append(p):
+ '''parameter_list : parameter_list "," parameter_definition'''
+ p[0] = p[1]
+ p[0].append(p[3])
+
+def p_parameter_definition(p):
+ 'parameter_definition : IDENTIFIER IDENTIFIER'
+ p[0] = (p[1], p[2])
+
+def p_scope(p):
+ '''scope : "{" optional_statement_list "}"'''
+ p[0] = p[2]
+
+def p_optional_statement_list(p):
+ '''optional_statement_list :
+ | statement_list'''
+ p[0] = p[1] if len(p) > 1 else []
+
+def p_statement_list(p):
+ '''statement_list : statement'''
+ p[0] = [p[1]]
+
+def p_statement_list_append(p):
+ '''statement_list : statement_list statement'''
+ p[0] = p[1]
+ p[0].append(p[2])
+
+def p_statement(p):
+ '''statement : definition_statement ";"
+ | assignment_statement ";"
+ | load_store_statement ";"
+ | dispatch_statement ";"
+ | semaphore_statement ";"
+ | label
+ | goto_statement ";"
+ | scope_statement
+ | atomic_op_statement ";"
+ | control_statement ";"
+ | print_statement ";"
+ | debug_break_statement ";"'''
+ p[0] = p[1]
+
+def p_definition_statement(p):
+ 'definition_statement : KW_DEFINE IDENTIFIER value'
+ p[0] = ('define', p[2], p[3])
+
+def p_assignemt_statement(p):
+ 'assignment_statement : value "=" value'
+ p[0] = ('assign', p[1], p[3])
+
+def p_load_store_statement_load_dword(p):
+ '''load_store_statement : value "=" KW_LOAD_DWORD "(" value ")"'''
+ p[0] = ('load-dword', p[1], p[5])
+
+def p_load_store_statement_load_qword(p):
+ '''load_store_statement : value "=" KW_LOAD_QWORD "(" value ")"'''
+ p[0] = ('load-qword', p[1], p[5])
+
+def p_load_store_statement_store_dword(p):
+ '''load_store_statement : KW_STORE_DWORD "(" value "," value ")"'''
+ p[0] = ('store-dword', p[3], p[5])
+
+def p_load_store_statement_store_qword(p):
+ '''load_store_statement : KW_STORE_QWORD "(" value "," value ")"'''
+ p[0] = ('store-qword', p[3], p[5])
+
+def p_dispatch_statement(p):
+ '''dispatch_statement : direct_dispatch_statement
+ | indirect_dispatch_statement'''
+ p[0] = p[1]
+
+def p_direct_dispatch_statement(p):
+ '''direct_dispatch_statement : KW_DISPATCH IDENTIFIER "(" value "," value "," value ")" optional_kernel_arg_list optional_postsync'''
+ p[0] = ('dispatch', p[2], (p[4], p[6], p[8]), p[10], p[11])
+
+def p_indirect_dispatch_statement(p):
+ '''indirect_dispatch_statement : KW_DISPATCH_INDIRECT IDENTIFIER optional_kernel_arg_list optional_postsync'''
+ p[0] = ('dispatch', p[2], None, p[3], p[4])
+
+def p_optional_kernel_arg_list(p):
+ '''optional_kernel_arg_list :
+ | KW_ARGS "(" value_list ")"'''
+ p[0] = p[3] if len(p) > 3 else []
+
+def p_value_list(p):
+ '''value_list : value'''
+ p[0] = [p[1]]
+
+def p_value_list_append(p):
+ '''value_list : value_list "," value'''
+ p[0] = p[1]
+ p[0].append(p[3])
+
+def p_optional_postsync(p):
+ '''optional_postsync :
+ | postsync_operation'''
+ if len(p) > 1:
+ p[0] = p[1]
+
+def p_postsync_operation(p):
+ '''postsync_operation : postsync_write_dword
+ | postsync_write_timestamp'''
+ p[0] = p[1]
+
+def p_postsync_write_dword(p):
+ '''postsync_write_dword : KW_POSTSYNC KW_STORE_DWORD "(" value "," value ")"'''
+ p[0] = ('postsync', 'store-dword', p[4], p[6])
+
+def p_postsync_write_timestamp(p):
+ '''postsync_write_timestamp : KW_POSTSYNC KW_STORE_TIMESTAMP "(" value ")"'''
+ p[0] = ('postsync', 'timestamp', p[4])
+
+def p_semaphore_statement(p):
+ '''semaphore_statement : KW_SEMAPHORE_WAIT KW_WHILE "(" "*" value "<" value ")"
+ | KW_SEMAPHORE_WAIT KW_WHILE "(" "*" value ">" value ")"
+ | KW_SEMAPHORE_WAIT KW_WHILE "(" "*" value OP_LESSEQUAL value ")"
+ | KW_SEMAPHORE_WAIT KW_WHILE "(" "*" value OP_GREATEREQUAL value ")"
+ | KW_SEMAPHORE_WAIT KW_WHILE "(" "*" value OP_EQUALEQUAL value ")"
+ | KW_SEMAPHORE_WAIT KW_WHILE "(" "*" value OP_NOTEQUAL value ")"'''
+ p[0] = ('sem-wait-while', p[5], p[6], p[7])
+
+def p_atomic_op_statement(p):
+ '''atomic_op_statement : KW_ATOMIC IDENTIFIER IDENTIFIER "(" value_list ")"'''
+ p[0] = ('atomic', p[2], p[3], p[5])
+
+def p_atomic_op_statement_return(p):
+ '''atomic_op_statement : KW_ATOMIC_RETURN IDENTIFIER IDENTIFIER "(" value_list ")"'''
+ p[0] = ('atomic-return', p[2], p[3], p[5])
+
+def p_label(p):
+ '''label : IDENTIFIER ":"'''
+ p[0] = ('label', p[1])
+
+def p_goto_statement(p):
+ '''goto_statement : KW_GOTO IDENTIFIER'''
+ p[0] = ('goto', p[2])
+
+def p_goto_statement_if(p):
+ '''goto_statement : KW_GOTO IDENTIFIER KW_IF "(" value ")"'''
+ p[0] = ('goto-if', p[2], p[5])
+
+def p_goto_statement_if_not(p):
+ '''goto_statement : KW_GOTO IDENTIFIER KW_IF KW_NOT "(" value ")"'''
+ p[0] = ('goto-if-not', p[2], p[6])
+
+def p_scope_statement(p):
+ '''scope_statement : scope'''
+ p[0] = (p[1])
+
+def p_control_statement(p):
+ '''control_statement : KW_CONTROL "(" id_list ")"'''
+ p[0] = ('control', p[3])
+
+def p_print_statement(p):
+ '''print_statement : KW_PRINT "(" printable_list ")"'''
+ p[0] = ('print', p[3])
+
+def p_printable_list(p):
+ '''printable_list : printable'''
+ p[0] = [p[1]]
+
+def p_printable_list_append(p):
+ '''printable_list : printable_list "," printable'''
+ p[0] = p[1]
+ p[0].append(p[3])
+
+def p_printable_str_lit(p):
+ '''printable : STRING_LITERAL'''
+ p[0] = '"{}"'.format(p[1])
+
+def p_printable_value(p):
+ '''printable : value'''
+ p[0] = p[1]
+
+def p_printable_str_lit_value(p):
+ '''printable : STRING_LITERAL value'''
+ p[0] = ('"{}"'.format(p[1]), p[2])
+
+def p_debug_break_statement(p):
+ '''debug_break_statement : KW_DEBUGBREAK'''
+ p[0] = ('debug-break',)
+
+def p_id_list(p):
+ '''id_list : IDENTIFIER'''
+ p[0] = [p[1]]
+
+def p_id_list_append(p):
+ '''id_list : id_list "," IDENTIFIER'''
+ p[0] = p[1]
+ p[0].append(p[3])
+
+def p_value(p):
+ '''value : IDENTIFIER
+ | INT_LITERAL'''
+ p[0] = p[1]
+
+def p_value_braces(p):
+ '''value : "(" value ")"'''
+ p[0] = (p[2])
+
+def p_value_member(p):
+ '''value : value "." IDENTIFIER'''
+ p[0] = ('member', p[1], p[3])
+
+def p_value_idx(p):
+ '''value : value "[" value "]"'''
+ p[0] = ('index', p[1], p[3])
+
+def p_value_binop(p):
+ '''value : value "+" value
+ | value "-" value
+ | value "*" value
+ | value "/" value
+ | value "%" value
+ | value "&" value
+ | value "|" value
+ | value "<" value
+ | value ">" value
+ | value "^" value
+ | value OP_LESSEQUAL value
+ | value OP_GREATEREQUAL value
+ | value OP_EQUALEQUAL value
+ | value OP_NOTEQUAL value
+ | value OP_LOGICAL_AND value
+ | value OP_LOGICAL_OR value
+ | value OP_LSHIFT value
+ | value OP_RSHIFT value'''
+ p[0] = (p[2], p[1], p[3])
+
+def p_value_uniop(p):
+ '''value : "!" value
+ | "~" value'''
+ p[0] = (p[1], p[2])
+
+def p_value_cond(p):
+ '''value : value "?" value ":" value'''
+ p[0] = ('?', p[1], p[3], p[5])
+
+def p_value_funcop(p):
+ '''value : KW_OFFSETOF "(" offset_expression ")"
+ | KW_SHIFTOF "(" IDENTIFIER ")"
+ | KW_SIZEOF "(" IDENTIFIER ")"'''
+ p[0] = (p[1], p[3])
+
+def p_offset_expression(p):
+ '''offset_expression : IDENTIFIER'''
+ p[0] = p[1]
+
+def p_offset_expression_member(p):
+ '''offset_expression : offset_expression "." IDENTIFIER'''
+ p[0] = ('member', p[1], p[3])
+
+def p_offset_expression_idx(p):
+ '''offset_expression : offset_expression "[" INT_LITERAL "]"'''
+ p[0] = ('index', p[1], p[3])
+
+def p_struct_definition(p):
+ '''struct_definition : KW_STRUCT optional_alignment_specifier IDENTIFIER "{" optional_struct_member_list "}" ";"'''
+ p[0] = ('struct', p[3], p[5], p[2])
+
+def p_optional_alignment_specifier(p):
+ '''optional_alignment_specifier :
+ | KW_ALIGNAS "(" INT_LITERAL ")"'''
+ if len(p) == 1:
+ p[0] = 0
+ else:
+ p[0] = p[3]
+
+def p_optional_struct_member_list(p):
+ '''optional_struct_member_list :
+ | struct_member_list'''
+ if len(p) == 1:
+ p[0] = {}
+ else:
+ p[0] = p[1]
+
+def p_struct_member_list(p):
+ '''struct_member_list : struct_member'''
+ p[0] = [p[1]]
+
+def p_struct_member_list_append(p):
+ '''struct_member_list : struct_member_list struct_member'''
+ p[0] = p[1] + [p[2]]
+
+def p_struct_member(p):
+ '''struct_member : struct_member_typename IDENTIFIER ";"'''
+ p[0] = (p[1], p[2])
+
+def p_struct_member_array(p):
+ '''struct_member : struct_member_typename IDENTIFIER "[" INT_LITERAL "]" ";"'''
+ '''struct_member : struct_member_typename IDENTIFIER "[" IDENTIFIER "]" ";"'''
+ p[0] = {p[1]: p[2], 'count': p[4]}
+
+def p_struct_member_typename(p):
+ '''struct_member_typename : IDENTIFIER'''
+ p[0] = p[1]
+
+def p_struct_member_typename_unsigned(p):
+ '''struct_member_typename : KW_UNSIGNED IDENTIFIER'''
+ p[0] = ('unsigned', p[2])
+
+def p_struct_member_typename_struct(p):
+ '''struct_member_typename : KW_STRUCT IDENTIFIER'''
+ p[0] = ('struct', p[2])
+
+def p_const_definition(p):
+ '''const_definition : KW_CONST IDENTIFIER "=" INT_LITERAL ";"'''
+ p[0] = ('named-constant', p[2], p[4])
+
+PARSER = yacc.yacc()
+
+# Shamelessly stolen from some StackOverflow answer
+def _remove_comments(text):
+ def replacer(match):
+ s = match.group(0)
+ if s.startswith('/'):
+ return " " # note: a space and not an empty string
+ else:
+ return s
+ pattern = re.compile(
+ r'//.*?$|/\*.*?\*/|\'(?:\\.|[^\\\'])*\'|"(?:\\.|[^\\"])*"',
+ re.DOTALL | re.MULTILINE
+ )
+ return re.sub(pattern, replacer, text)
+
+def parse_grl_file(grl_fname, libs):
+ global libraries
+
+ libraries = libs
+ with open(grl_fname, 'r') as f:
+ return PARSER.parse(_remove_comments(f.read()))
diff --git a/src/intel/vulkan/grl/grl_structs.h b/src/intel/vulkan/grl/grl_structs.h
new file mode 100644
index 00000000000..ed721afa6a2
--- /dev/null
+++ b/src/intel/vulkan/grl/grl_structs.h
@@ -0,0 +1,479 @@
+/*
+ * Copyright © 2022 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+/**
+ * This file contains a redefinition of structures defined in the GRL library.
+ * We need to have those structures defined to allocate & prepare data for
+ * the OpenCL kernels building acceleration structures. Unfortunately, because
+ * of C++ & OpenCL assumptions in GRL, it's not possible to just include the
+ * GRL header files directly, so we have to redefine them here.
+ */
+
+#ifndef GRL_STRUCTS_H
+#define GRL_STRUCTS_H
+
+#include "GRLStructs.h"
+#include "GRLRTASCommon.h"
+
+struct MKBuilderState {
+ qword geomDesc_buffer;
+ qword build_primref_buffer;
+ qword build_globals;
+ qword bvh_buffer;
+ dword leaf_type;
+ dword leaf_size;
+};
+
+#define PREFIX_MK_STATE(prefix, obj) \
+ (struct prefix##_MKBuilderState) { \
+ .geomDesc_buffer = (obj).geomDesc_buffer, \
+ .build_primref_buffer = (obj).build_primref_buffer, \
+ .build_globals = (obj).build_globals, \
+ .bvh_buffer = (obj).bvh_buffer, \
+ .leaf_type = (obj).leaf_type, \
+ .leaf_size = (obj).leaf_size, \
+ }
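
PREFIX_MK_STATE builds a compound literal of the per-prefix mirror of MKBuilderState by copying each field. A minimal usage sketch, assuming a generated struct gfx125_MKBuilderState with the same members; the gfx125 prefix and the placeholder addresses are only for illustration, not taken from this patch:

    struct MKBuilderState state = {
       .geomDesc_buffer      = 0x1000,   /* placeholder GPU VAs */
       .build_primref_buffer = 0x2000,
       .build_globals        = 0x3000,
       .bvh_buffer           = 0x4000,
       .leaf_type            = 0,
       .leaf_size            = 64,
    };
    struct gfx125_MKBuilderState mirrored = PREFIX_MK_STATE(gfx125, state);
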
+
+struct MKSizeEstimate {
+ dword numTriangles;
+ dword numProcedurals;
+ dword numPrimitives;
+ dword numMeshes;
+ dword numBuildPrimitives;
+ dword numPrimitivesToSplit;
+ dword instance_descs_start;
+ dword geo_meta_data_start;
+ dword node_data_start;
+ dword leaf_data_start;
+ dword procedural_data_start;
+ dword back_pointer_start;
+ dword sizeTotal;
+ dword updateScratchSizeTotal;
+ dword fatleaf_table_start;
+ dword innernode_table_start;
+ dword max_fatleaves;
+
+ size_t max_instance_leafs;
+ size_t max_inner_nodes;
+ size_t leaf_data_size;
+ size_t min_primitives;
+ size_t max_primitives;
+};
+
+#define PREFIX_MK_SIZE(prefix, obj) \
+ (struct prefix##_MKSizeEstimate) { \
+ .numTriangles = (obj).numTriangles, \
+ .numProcedurals = (obj).numProcedurals, \
+ .numPrimitives = (obj).numPrimitives, \
+ .numMeshes = (obj).numMeshes, \
+ .numBuildPrimitives = (obj).numBuildPrimitives, \
+ .numPrimitivesToSplit = (obj).numPrimitivesToSplit, \
+ .instance_descs_start = (obj).instance_descs_start, \
+ .geo_meta_data_start = (obj).geo_meta_data_start, \
+ .node_data_start = (obj).node_data_start, \
+ .leaf_data_start = (obj).leaf_data_start, \
+ .procedural_data_start = (obj).procedural_data_start, \
+ .back_pointer_start = (obj).back_pointer_start, \
+ .sizeTotal = (obj).sizeTotal, \
+ .updateScratchSizeTotal = (obj).updateScratchSizeTotal, \
+ .fatleaf_table_start = (obj).fatleaf_table_start, \
+ .innernode_table_start = (obj).innernode_table_start, \
+ .max_fatleaves = (obj).max_fatleaves, \
+ }
+
+typedef struct AABB {
+ float lower[4];
+ float upper[4];
+} AABB;
+
+struct Globals
+{
+ struct AABB centroidBounds;
+
+ unsigned int build_record_start;
+ unsigned int numPrimitives;
+ unsigned int leafPrimType;
+ unsigned int leafSize;
+
+ unsigned int numSplittedPrimitives;
+ unsigned int numBuildRecords;
+
+ // spatial split state
+ unsigned int numOriginalPrimitives;
+ float presplitPrioritySum;
+ float probThreshold;
+
+ // binned-sah bfs state
+ unsigned int counter;
+ unsigned int numBuildRecords_extended;
+
+ // sync variable used for global-sync on work groups
+ unsigned int sync;
+
+
+ /* morton code builder state */
+ unsigned int shift; // used by adaptive mc-builder
+ unsigned int shift_mask; // used by adaptive mc-builder
+ unsigned int binary_hierarchy_root;
+ unsigned int p0_allocated_num;
+ unsigned int p0_created_num;
+ unsigned int morton_sort_in_flight;
+ unsigned int sort_iterations;
+
+ gpuva_t binary_hierarchy_buffer; // pointer to the binary morton code hierarchy. Stashed here as a debug aid
+};
+
+typedef struct BVHBase
+{
+ // TODO: Implement the "copy-first-node" trick... duplicate root node here
+
+ uint64_t rootNodeOffset;
+
+ uint32_t reserved;
+
+ uint32_t nodeDataCur; // nodeDataStart is sizeof(BVHBase) / 64 = BVH_ROOT_NODE_OFFSET / 64
+ uint32_t quadLeafStart;
+ uint32_t quadLeafCur;
+ uint32_t proceduralDataStart;
+ uint32_t proceduralDataCur;
+ uint32_t instanceLeafStart;
+ uint32_t instanceLeafEnd;
+ uint32_t backPointerDataStart; //
+ uint32_t refitTreeletsDataStart; // refit structs
+ uint32_t refitStartPointDataStart; //
+ uint32_t BVHDataEnd;
+
+ // number of bottom treelets
+ // if 1, then the bottom treelet is also tip treelet
+ uint32_t refitTreeletCnt;
+ uint32_t refitTreeletCnt2; // always 0, used for atomic updates
+ // data layout:
+ // @backPointerDataStart
+ // 'backpointer' - a dword per inner node.
+ // The bits are used as follows:
+ // 2:0 --> Used as a refit counter during BVH refitting. MBZ
+ // 5:3 --> Number of children
+ // 31:6 --> Index of the parent node in the internal node array
+ // The root node has a parent index of all ones
+ // @refitTreeletsDataStart
+ // RefitTreelet[], the last treelet is for top treelet all previous are for bottom
+ // @refitStartPointDataStart
+ // for each treelet T there is [T.startpoint_offset, T.numStartpoints) interval of startpoints here in that space
+ // @backPointerDataEnd
+
+ uint32_t fatLeafCount; // number of internal nodes which are "fat-leaves"
+ uint32_t innerCount; // number of internal nodes which are true inner nodes (all internalNode children)
+ uint32_t fatLeafTableStart;
+ uint32_t innerTableStart;
+
+ uint32_t _pad[12];
+
+ struct RTASMetaData Meta;
+} BVHBase;
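
A minimal decoding sketch for the backpointer dwords described in the layout comment above (bits 2:0 refit counter, bits 5:3 child count, bits 31:6 parent index); the helper names are made up for illustration and are not part of the driver:

    static inline uint32_t bvh_backpointer_refit_count(uint32_t bp)  { return bp & 0x7; }
    static inline uint32_t bvh_backpointer_num_children(uint32_t bp) { return (bp >> 3) & 0x7; }
    static inline uint32_t bvh_backpointer_parent(uint32_t bp)       { return bp >> 6; }
    /* The root node stores a parent index of all ones. */
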
+
+
+struct BatchedInitGlobalsData
+{
+ qword p_build_globals;
+ qword p_bvh_buffer;
+ dword numPrimitives;
+ dword numGeometries;
+ dword numInstances;
+ dword instance_descs_start;
+ dword geo_meta_data_start;
+ dword node_data_start;
+ dword leaf_data_start;
+ dword procedural_data_start;
+ dword back_pointer_start;
+ dword sizeTotal;
+ dword leafType;
+ dword leafSize;
+ dword fatleaf_table_start;
+ dword innernode_table_start;
+};
+
+
+#define BFS_NUM_BINS 16
+#define BFS_NUM_VCONTEXTS 256
+#define BFS_MAX_DEPTH 32
+
+#define QNODE_GLOBAL_ROOT_BUFFER_MIN_ENTRIES_NUM 16384
+
+struct BFS_Split
+{
+ float sah;
+ int dim;
+ int pos;
+};
+
+struct BFS_BinInfo
+{
+ float min_max[18 * BFS_NUM_BINS]; // layout: bins[axis][num_bins][6]
+ // The 6 are lower(xyz) and -upper(xyz)
+ // bins use negated-max so that we can use vectorized mins instead of min/max pairs
+ uint counts[3 * BFS_NUM_BINS];
+};
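
As the layout comment notes, each bin stores lower(xyz) and negated upper(xyz) so that extending a bin only needs min operations. A sketch of that update for one primitive, assuming counts[] is indexed [axis][bin] like min_max[]; the function name is made up:

    static void bfs_bin_extend(struct BFS_BinInfo *info, uint axis, uint bin,
                               const struct AABB3f *box)
    {
       float *slot = &info->min_max[(axis * BFS_NUM_BINS + bin) * 6];
       for (uint i = 0; i < 3; i++) {
          slot[i]     = fmin(slot[i], box->lower[i]);      /* running min of lower */
          slot[i + 3] = fmin(slot[i + 3], -box->upper[i]); /* negated max: min of -upper */
       }
       info->counts[axis * BFS_NUM_BINS + bin]++;
    }
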
+
+struct SAHBuildGlobals
+{
+ qword p_primref_index_buffers;
+ qword p_primrefs_buffer;
+ qword p_bvh2;
+ qword p_globals; // TODO: deprecate this
+ qword p_bvh_base;
+ gpuva_t p_qnode_root_buffer;
+
+ dword flags; // bit 1 is 'alloc_backpointers'. bit 2 is 'need_masks'
+ dword num_primrefs;
+ dword leaf_size;
+ dword leaf_type;
+
+ dword root_buffer_num_produced;
+ dword root_buffer_num_produced_hi;
+ dword root_buffer_num_consumed;
+ dword root_buffer_num_consumed_hi;
+ dword root_buffer_num_to_consume;
+ dword root_buffer_num_to_consume_hi;
+};
+
+typedef union LRBounds
+{
+ struct
+ {
+ struct AABB3f left_centroid_bounds;
+ struct AABB3f left_geom_bounds;
+ struct AABB3f right_centroid_bounds;
+ struct AABB3f right_geom_bounds;
+ } boxes;
+ struct
+ {
+ float Array[24];
+ } scalars;
+} LRBounds;
+
+
+struct VContext
+{
+ uint dispatch_primref_begin; // range of primrefs for this task
+ uint dispatch_primref_end;
+ uint bvh2_root; // BVH2 root node for this task
+ uint tree_depth; // depth of this node in the tree
+ uint num_left; // primref counts
+ uint num_right;
+ uint lr_mask; // lower 8b : left mask. upper 8b : right mask
+ uint batch_index;
+
+ // pass1 global working state and output
+ struct BFS_Split split;
+ struct BFS_BinInfo global_bin_info;
+
+ // pass2 global working state and output
+ LRBounds lr_bounds;
+};
+
+
+
+struct BFSDispatchRecord
+{
+ ushort batch_index;
+ ushort context_id;
+};
+
+
+struct BFSDispatchQueue
+{
+ uint num_dispatches;
+ uint wg_count[BFS_NUM_VCONTEXTS];
+ struct BFSDispatchRecord records[BFS_NUM_VCONTEXTS];
+};
+
+struct BFS1SpillStackEntry
+{
+ uint primref_begin;
+ uint primref_end;
+ uint bvh2_root;
+ ushort tree_depth;
+ ushort batch_index;
+};
+
+struct BFS1SpillStack
+{
+ uint size;
+ struct BFS1SpillStackEntry entries[BFS_NUM_VCONTEXTS * BFS_MAX_DEPTH];
+};
+
+struct QNodeGlobalRootBufferEntry
+{
+ uint bvh2_node;
+ uint qnode;
+ uint build_idx;
+ uint _pad;
+};
+
+struct QNodeGlobalRootBuffer
+{
+ uint curr_entries_offset; // we use "entries" as two buffers, so offset is either 0 or QNODE_GLOBAL_ROOT_BUFFER_MIN_ENTRIES_NUM
+ struct QNodeGlobalRootBufferEntry entries[QNODE_GLOBAL_ROOT_BUFFER_MIN_ENTRIES_NUM * 2];
+};
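
The comment above implies entries is used as two ping-pong halves selected by curr_entries_offset (either 0 or QNODE_GLOBAL_ROOT_BUFFER_MIN_ENTRIES_NUM). A sketch of switching to the other half; the helper name is made up and not part of the driver:

    static inline struct QNodeGlobalRootBufferEntry *
    qnode_grb_flip(struct QNodeGlobalRootBuffer *grb)
    {
       grb->curr_entries_offset ^= QNODE_GLOBAL_ROOT_BUFFER_MIN_ENTRIES_NUM; /* toggle 0 <-> N */
       return &grb->entries[grb->curr_entries_offset];
    }
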
+
+struct DFSDispatchRecord
+{
+ uint primref_base;
+ uint bvh2_base;
+ uint batch_index;
+ ushort num_primrefs;
+ ushort tree_depth;
+};
+
+
+struct DFSDispatchQueue
+{
+ struct DFSDispatchRecord records[BFS_NUM_VCONTEXTS * 2];
+};
+
+#define VCONTEXT_STATE_EXECUTING 0
+#define VCONTEXT_STATE_UNALLOCATED 1
+
+union SchedulerUnion
+{
+ struct VContextScheduler
+ {
+ /////////////////////////////////////////////////////////////
+ // State data used for communication with command streamer
+ // NOTE: This part must match definition in 'new_sah_builder.grl'
+ /////////////////////////////////////////////////////////////
+
+ dword num_bfs_wgs;
+ dword num_dfs_wgs;
+
+ dword scheduler_postsync;
+ dword _pad1;
+
+ dword num_trivial_builds; // number of trivial builds (#primrefs < leaf_size).
+ dword num_single_builds; // number of single-wg builds (#primrefs < threshold)
+
+ dword batched_build_wg_count; // number of wgs to dispatch for initial BFS pass
+ dword batched_build_loop_mask; // 0 if #builds <= #contexts, else 1. The command streamer uses this as a loop condition
+
+ /////////////////////////////////////////////////////////////
+
+ dword batched_build_count; // number of batched builds in the SAHBuildGlobals buffer
+ dword batched_build_offset; // location of the first batched-build in the SAHBuildGlobals buffer
+
+ dword vcontext_state[BFS_NUM_VCONTEXTS];
+
+ struct BFSDispatchQueue bfs_queue;
+ struct DFSDispatchQueue dfs_queue;
+
+ struct VContext contexts[BFS_NUM_VCONTEXTS];
+
+ struct BFS1SpillStack bfs2_spill_stack;
+ } vContextScheduler;
+
+ struct QnodeScheduler
+ {
+ dword num_qnode_grb_curr_entries;
+ dword num_qnode_grb_new_entries;
+
+ dword scheduler_postsync;
+ dword _pad1;
+
+ dword num_trivial_builds; // number of trivial builds (#primrefs < leaf_size).
+ dword num_single_builds; // number of single-wg builds (#primrefs < threshold)
+
+ dword batched_builds_to_process;
+ dword num_max_qnode_global_root_buffer_entries; // number of maximum entries to global root buffer
+
+ /////////////////////////////////////////////////////////////
+
+ dword batched_build_count; // number of batched builds in the SAHBuildGlobals buffer
+ dword batched_build_offset; // location of the first batched-build in the SAHBuildGlobals buffer
+
+ struct QNodeGlobalRootBuffer qnode_global_root_buffer;
+ } qnodeScheduler;
+};
+
+
+struct BVH2Node
+{
+ struct AABB3f box;
+ uint meta_u; // leaf: primref start. inner: offset from node to its first child
+ uint meta_ss;
+ //ushort meta_s; // leaf: primref count. inner: offset from first to second child, in nodes
+ //uchar is_inner; // 1 if inner, 0 if leaf
+ //uchar mask;
+};
+
+struct BVH2
+{
+ uint num_nodes;
+ uint _pad[7]; // align to 32B
+};
+
+struct BatchedBLSDispatchEntry
+{
+ /////////////////////////////////////////////////////////////
+ // State data used for communication with command streamer
+ // NOTE: This part must match definition in 'msb_radix_bitonic_sort.grl'
+ /////////////////////////////////////////////////////////////
+ qword p_data_buffer;
+ qword num_elements; // number of elements in p_data_buffer
+};
+
+struct SAHBuildArgsBatchable
+{
+ qword p_globals_ptrs;
+ qword p_scheduler;
+ qword p_buffers_info;
+ qword p_sah_globals;
+
+ dword num_max_qnode_global_root_buffer_entries;
+ dword num_builds;
+};
+
+#define PREFIX_MK_SAH_BUILD_ARGS_BATCHABLE(prefix, obj) \
+ (struct prefix##_SAHBuildArgsBatchable) { \
+ .p_globals_ptrs = (obj).p_globals_ptrs, \
+ .p_scheduler = (obj).p_scheduler, \
+ .p_buffers_info = (obj).p_buffers_info, \
+ .p_sah_globals = (obj).p_sah_globals, \
+ .num_max_qnode_global_root_buffer_entries = \
+ (obj).num_max_qnode_global_root_buffer_entries, \
+ .num_builds = (obj).num_builds, \
+ }
+
+
+struct SAHBuildBuffersInfo
+{
+ gpuva_t p_globals;
+ gpuva_t p_primref_index_buffers;
+ gpuva_t p_primrefs_buffer;
+ gpuva_t p_bvh2;
+ gpuva_t p_bvh_base;
+ gpuva_t p_qnode_root_buffer;
+ dword sah_globals_flags;
+ dword _pad;
+ gpuva_t _pad2;
+};
+
+#endif /* GRL_STRUCTS_H */
diff --git a/src/intel/vulkan/grl/include/AABB3f.h b/src/intel/vulkan/grl/include/AABB3f.h
new file mode 100644
index 00000000000..a3412332c77
--- /dev/null
+++ b/src/intel/vulkan/grl/include/AABB3f.h
@@ -0,0 +1,459 @@
+//
+// Copyright (C) 2009-2021 Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+//
+//
+
+#pragma once
+
+#include "GRLRTASCommon.h"
+
+#include "affinespace.h"
+
+#ifndef __OPENCL_VERSION__
+# include "stdio.h" //for printf
+#endif
+
+GRL_NAMESPACE_BEGIN(GRL)
+GRL_NAMESPACE_BEGIN(RTAS)
+
+GRL_INLINE void AABB3f_init(struct AABB3f *aabb)
+{
+ aabb->lower[0] = (float)(INFINITY);
+ aabb->lower[1] = (float)(INFINITY);
+ aabb->lower[2] = (float)(INFINITY);
+
+ aabb->upper[0] = -(float)(INFINITY);
+ aabb->upper[1] = -(float)(INFINITY);
+ aabb->upper[2] = -(float)(INFINITY);
+}
+
+GRL_INLINE float3 AABB3f_load_lower( const struct AABB3f* aabb )
+{
+ float3 v = { aabb->lower[0], aabb->lower[1], aabb->lower[2] };
+ return v;
+}
+GRL_INLINE float3 AABB3f_load_upper( const struct AABB3f* aabb )
+{
+ float3 v = { aabb->upper[0], aabb->upper[1], aabb->upper[2] };
+ return v;
+}
+
+GRL_INLINE void AABB3f_extend(struct AABB3f *aabb, const struct AABB3f *v)
+{
+ aabb->lower[0] = fmin(aabb->lower[0], v->lower[0]);
+ aabb->lower[1] = fmin(aabb->lower[1], v->lower[1]);
+ aabb->lower[2] = fmin(aabb->lower[2], v->lower[2]);
+ aabb->upper[0] = fmax(aabb->upper[0], v->upper[0]);
+ aabb->upper[1] = fmax(aabb->upper[1], v->upper[1]);
+ aabb->upper[2] = fmax(aabb->upper[2], v->upper[2]);
+}
+
+GRL_INLINE void AABB3f_intersect(struct AABB3f* aabb, struct AABB3f inters)
+{
+ aabb->upper[0] = fmin(inters.upper[0],aabb->upper[0]);
+ aabb->upper[1] = fmin(inters.upper[1],aabb->upper[1]);
+ aabb->upper[2] = fmin(inters.upper[2],aabb->upper[2]);
+ aabb->lower[0] = fmax(inters.lower[0],aabb->lower[0]);
+ aabb->lower[1] = fmax(inters.lower[1],aabb->lower[1]);
+ aabb->lower[2] = fmax(inters.lower[2],aabb->lower[2]);
+}
+
+GRL_INLINE void AABB3f_trim_upper(struct AABB3f* aabb, const float* upper)
+{
+ aabb->upper[0] = fmin(upper[0], aabb->upper[0]);
+ aabb->upper[1] = fmin(upper[1], aabb->upper[1]);
+ aabb->upper[2] = fmin(upper[2], aabb->upper[2]);
+}
+
+GRL_INLINE void AABB3f_set( struct AABB3f* aabb, float3 lower, float3 upper )
+{
+ aabb->lower[0] = lower.x ;
+ aabb->lower[1] = lower.y ;
+ aabb->lower[2] = lower.z ;
+ aabb->upper[0] = upper.x ;
+ aabb->upper[1] = upper.y ;
+ aabb->upper[2] = upper.z ;
+}
+
+inline void AABB3f_extend_point(struct AABB3f *aabb, const float3 p)
+{
+ aabb->lower[0] = fmin(aabb->lower[0], p.x);
+ aabb->lower[1] = fmin(aabb->lower[1], p.y);
+ aabb->lower[2] = fmin(aabb->lower[2], p.z);
+ aabb->upper[0] = fmax(aabb->upper[0], p.x);
+ aabb->upper[1] = fmax(aabb->upper[1], p.y);
+ aabb->upper[2] = fmax(aabb->upper[2], p.z);
+}
+
+GRL_INLINE void AABB3f_extendlu(struct AABB3f *aabb, const float3 lower, const float3 upper)
+{
+ aabb->lower[0] = fmin(aabb->lower[0], lower.x);
+ aabb->lower[1] = fmin(aabb->lower[1], lower.y);
+ aabb->lower[2] = fmin(aabb->lower[2], lower.z);
+ aabb->upper[0] = fmax(aabb->upper[0], upper.x);
+ aabb->upper[1] = fmax(aabb->upper[1], upper.y);
+ aabb->upper[2] = fmax(aabb->upper[2], upper.z);
+}
+
+GRL_INLINE float3 AABB3f_size(struct AABB3f* aabb)
+{
+ return AABB3f_load_upper(aabb) - AABB3f_load_lower(aabb);
+}
+
+GRL_INLINE float AABB3f_halfArea(struct AABB3f *aabb)
+{
+ const float3 d = AABB3f_load_upper( aabb ) - AABB3f_load_lower( aabb );
+ return d.x * (d.y + d.z) + d.y * d.z;
+}
+
+GRL_INLINE float halfArea_AABB3f(struct AABB3f *aabb) // TODO: Remove me
+{
+ const float3 d = { aabb->upper[0] - aabb->lower[0], aabb->upper[1] - aabb->lower[1], aabb->upper[2] - aabb->lower[2] };
+ return fma(d.x, (d.y + d.z), d.y * d.z);
+}
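+
+// Editor's note: in SAH-style builders the half-area is the surface-area term of the cost
+// estimate, e.g. cost(split) ~ halfArea(left) * numPrims(left) + halfArea(right) * numPrims(right),
+// so only relative values matter and the omitted factor of 2 is irrelevant.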
+
+GRL_INLINE void AABB3f_set_lower(struct AABB3f* aabb, float3 lower)
+{
+ aabb->lower[0] = lower.x;
+ aabb->lower[1] = lower.y;
+ aabb->lower[2] = lower.z;
+}
+
+GRL_INLINE void AABB3f_set_upper(struct AABB3f* aabb, float3 upper)
+{
+ aabb->upper[0] = upper.x;
+ aabb->upper[1] = upper.y;
+ aabb->upper[2] = upper.z;
+}
+
+GRL_INLINE float3 conservativeExtent(float3 extent)
+{
+ const float v = FLT_EPSILON * fmax(extent.x, fmax(extent.y, extent.z));
+ float3 v3 = { v,v,v };
+ extent = extent + v3;
+ return extent;
+}
+
+inline struct AABB3f GRL_OVERLOADABLE transform_aabb(float3 lower, float3 upper, const float* Transform)
+{
+#if 1
+ // We use an abs-matrix to transform the AABB extent vector, which is enough to compute the area
+ // New AABB is center +- Extent.
+ //
+ // For derivation see:
+ // https://zeux.io/2010/10/17/aabb-from-obb-with-component-wise-abs/
+ //
+
+ float3 Center = (upper + lower) * 0.5f;
+ float3 Extent = (conservativeExtent(upper) - lower) * 0.5f;
+
+ float cx = Center.x * Transform[0] + Center.y * Transform[1] + Center.z * Transform[2] + Transform[3];
+ float cy = Center.x * Transform[4] + Center.y * Transform[5] + Center.z * Transform[6] + Transform[7];
+ float cz = Center.x * Transform[8] + Center.y * Transform[9] + Center.z * Transform[10] + Transform[11];
+ float ex = Extent.x * fabs(Transform[0]) + Extent.y * fabs(Transform[1]) + Extent.z * fabs(Transform[2]);
+ float ey = Extent.x * fabs(Transform[4]) + Extent.y * fabs(Transform[5]) + Extent.z * fabs(Transform[6]);
+ float ez = Extent.x * fabs(Transform[8]) + Extent.y * fabs(Transform[9]) + Extent.z * fabs(Transform[10]);
+
+ Center.x = cx; Center.y = cy; Center.z = cz;
+ Extent.x = ex; Extent.y = ey; Extent.z = ez;
+
+ struct AABB3f box;
+ AABB3f_set_lower(&box, Center - Extent);
+ AABB3f_set_upper(&box, Center + Extent);
+ return box;
+#else
+ struct AffineSpace3f xfm = AffineSpace3f_load_row_major(Transform);
+
+ float3 plll = { lower.x, lower.y, lower.z };
+ float3 pllu = { lower.x, lower.y, upper.z };
+ float3 plul = { lower.x, upper.y, lower.z };
+ float3 pluu = { lower.x, upper.y, upper.z };
+ float3 pull = { upper.x, lower.y, lower.z };
+ float3 pulu = { upper.x, lower.y, upper.z };
+ float3 puul = { upper.x, upper.y, lower.z };
+ float3 puuu = { upper.x, upper.y, upper.z };
+ plll = xfmPoint(xfm, plll) ;
+ pllu = xfmPoint(xfm, pllu) ;
+ plul = xfmPoint(xfm, plul) ;
+ pluu = xfmPoint(xfm, pluu) ;
+ pull = xfmPoint(xfm, pull) ;
+ pulu = xfmPoint(xfm, pulu) ;
+ puul = xfmPoint(xfm, puul) ;
+ puuu = xfmPoint(xfm, puuu) ;
+
+ float3 p1_min = fmin(plll, pull);
+ float3 p2_min = fmin(pllu, pulu);
+ float3 p3_min = fmin(plul, puul);
+ float3 p4_min = fmin(pluu, puuu);
+ float3 p1_max = fmax(plll, pull);
+ float3 p2_max = fmax(pllu, pulu);
+ float3 p3_max = fmax(plul, puul);
+ float3 p4_max = fmax(pluu, puuu);
+ p1_min = fmin(p1_min, p3_min);
+ p2_min = fmin(p2_min, p4_min);
+ p1_max = fmax(p1_max, p3_max);
+ p2_max = fmax(p2_max, p4_max);
+ p1_min = fmin(p1_min, p2_min);
+ p1_max = fmax(p1_max, p2_max);
+
+ AABB3f out = {
+ {p1_min.x,p1_min.y,p1_min.z},
+ {p1_max.x,p1_max.y,p1_max.z}
+ };
+ return out;
+#endif
+}
+
+GRL_INLINE struct AABB3f GRL_OVERLOADABLE transform_aabb(struct AABB3f box, const float* Transform)
+{
+ float3 lower = { box.lower[0], box.lower[1], box.lower[2] };
+ float3 upper = { box.upper[0], box.upper[1], box.upper[2] };
+ return transform_aabb(lower, upper, Transform);
+}
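+
+// Editor's note: 'Transform' is a row-major 3x4 matrix laid out as
+// { m00, m01, m02, tx,   m10, m11, m12, ty,   m20, m21, m22, tz },
+// i.e. the same layout consumed by AffineSpace3f_load_row_major(). For an identity transform
+// the box is returned unchanged, up to the conservativeExtent() epsilon padding.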
+
+GRL_INLINE struct AABB3f AABB3f_transform(struct AffineSpace3f xfm, struct AABB3f in)
+{
+ struct AABB3f out;
+ float rmTransform[12];
+ load_row_major_from_AffineSpace3f(xfm, rmTransform);
+ out = transform_aabb(in, rmTransform);
+
+ return out;
+}
+
+GRL_INLINE bool AABB3f_isIn(struct AABB3f bigger, float3 contained)
+{
+ bool iscontained =
+ contained.x >= bigger.lower[0] &&
+ contained.y >= bigger.lower[1] &&
+ contained.z >= bigger.lower[2] &&
+ contained.x <= bigger.upper[0] &&
+ contained.y <= bigger.upper[1] &&
+ contained.z <= bigger.upper[2];
+
+ return iscontained;
+}
+
+GRL_INLINE bool AABB3f_isSubset(struct AABB3f bigger, struct AABB3f contained)
+{
+ bool iscontained =
+ contained.lower[0] >= bigger.lower[0] &&
+ contained.lower[1] >= bigger.lower[1] &&
+ contained.lower[2] >= bigger.lower[2] &&
+ contained.upper[0] <= bigger.upper[0] &&
+ contained.upper[1] <= bigger.upper[1] &&
+ contained.upper[2] <= bigger.upper[2];
+
+ return iscontained;
+}
+
+GRL_INLINE bool AABB3f_is_degenerate(struct AABB3f* box )
+{
+ return box->lower[0] > box->upper[0] ||
+ box->lower[1] > box->upper[1] ||
+ box->lower[2] > box->upper[2];
+}
+
+GRL_INLINE void AABB3f_print(struct AABB3f *aabb)
+{
+ printf("AABB {\n");
+ printf(" lower = %f, %f, %f\n", aabb->lower[0], aabb->lower[1], aabb->lower[2]);
+ printf(" upper = %f, %f, %f\n", aabb->upper[0], aabb->upper[1], aabb->upper[2]);
+ printf("}\n");
+}
+
+
+
+#ifdef __OPENCL_VERSION__
+GRL_INLINE struct AABB3f AABB3f_sub_group_shuffle(struct AABB3f *aabb, const uint slotID)
+{
+ struct AABB3f bounds;
+ bounds.lower[0] = intel_sub_group_shuffle(aabb->lower[0], slotID);
+ bounds.lower[1] = intel_sub_group_shuffle(aabb->lower[1], slotID);
+ bounds.lower[2] = intel_sub_group_shuffle(aabb->lower[2], slotID);
+ bounds.upper[0] = intel_sub_group_shuffle(aabb->upper[0], slotID);
+ bounds.upper[1] = intel_sub_group_shuffle(aabb->upper[1], slotID);
+ bounds.upper[2] = intel_sub_group_shuffle(aabb->upper[2], slotID);
+ return bounds;
+}
+
+GRL_INLINE struct AABB3f AABB3f_sub_group_reduce(struct AABB3f *aabb)
+{
+ struct AABB3f bounds;
+ bounds.lower[0] = sub_group_reduce_min(aabb->lower[0]);
+ bounds.lower[1] = sub_group_reduce_min(aabb->lower[1]);
+ bounds.lower[2] = sub_group_reduce_min(aabb->lower[2]);
+ bounds.upper[0] = sub_group_reduce_max(aabb->upper[0]);
+ bounds.upper[1] = sub_group_reduce_max(aabb->upper[1]);
+ bounds.upper[2] = sub_group_reduce_max(aabb->upper[2]);
+ return bounds;
+}
+
+GRL_INLINE struct AABB3f AABB3f_sub_group_scan_exclusive_min_max(struct AABB3f *aabb)
+{
+ struct AABB3f bounds;
+ bounds.lower[0] = sub_group_scan_exclusive_min(aabb->lower[0]);
+ bounds.lower[1] = sub_group_scan_exclusive_min(aabb->lower[1]);
+ bounds.lower[2] = sub_group_scan_exclusive_min(aabb->lower[2]);
+ bounds.upper[0] = sub_group_scan_exclusive_max(aabb->upper[0]);
+ bounds.upper[1] = sub_group_scan_exclusive_max(aabb->upper[1]);
+ bounds.upper[2] = sub_group_scan_exclusive_max(aabb->upper[2]);
+ return bounds;
+}
+
+GRL_INLINE struct AABB3f AABB3f_sub_group_scan_inclusive_min_max(struct AABB3f *aabb)
+{
+ struct AABB3f bounds;
+ bounds.lower[0] = sub_group_scan_inclusive_min(aabb->lower[0]);
+ bounds.lower[1] = sub_group_scan_inclusive_min(aabb->lower[1]);
+ bounds.lower[2] = sub_group_scan_inclusive_min(aabb->lower[2]);
+ bounds.upper[0] = sub_group_scan_inclusive_max(aabb->upper[0]);
+ bounds.upper[1] = sub_group_scan_inclusive_max(aabb->upper[1]);
+ bounds.upper[2] = sub_group_scan_inclusive_max(aabb->upper[2]);
+ return bounds;
+}
+
+GRL_INLINE void AABB3f_atomic_merge_local_nocheck(local struct AABB3f *aabb, const float4 lower, const float4 upper)
+{
+ atomic_min((local float *)&aabb->lower + 0, lower.x);
+ atomic_min((local float *)&aabb->lower + 1, lower.y);
+ atomic_min((local float *)&aabb->lower + 2, lower.z);
+ atomic_max((local float *)&aabb->upper + 0, upper.x);
+ atomic_max((local float *)&aabb->upper + 1, upper.y);
+ atomic_max((local float *)&aabb->upper + 2, upper.z);
+}
+
+
+GRL_INLINE void AABB3f_atomic_merge_global_lu( global struct AABB3f* aabb, const float3 lower, const float3 upper )
+{
+ atomic_min( (global float*) & aabb->lower + 0, lower.x );
+ atomic_min( (global float*) & aabb->lower + 1, lower.y );
+ atomic_min( (global float*) & aabb->lower + 2, lower.z );
+ atomic_max( (global float*) & aabb->upper + 0, upper.x );
+ atomic_max( (global float*) & aabb->upper + 1, upper.y );
+ atomic_max( (global float*) & aabb->upper + 2, upper.z );
+}
+
+GRL_INLINE void AABB3f_atomic_merge_local_lu( local struct AABB3f* aabb, const float3 lower, const float3 upper )
+{
+ atomic_min( (local float*) & aabb->lower + 0, lower.x );
+ atomic_min( (local float*) & aabb->lower + 1, lower.y );
+ atomic_min( (local float*) & aabb->lower + 2, lower.z );
+ atomic_max( (local float*) & aabb->upper + 0, upper.x );
+ atomic_max( (local float*) & aabb->upper + 1, upper.y );
+ atomic_max( (local float*) & aabb->upper + 2, upper.z );
+}
+
+GRL_INLINE void Uniform_AABB3f_atomic_merge_local_sub_group_lu(uniform local struct AABB3f* aabb, const float3 lower, const float3 upper)
+{
+ float lx = sub_group_reduce_min(lower.x);
+ float ly = sub_group_reduce_min(lower.y);
+ float lz = sub_group_reduce_min(lower.z);
+
+ float ux = sub_group_reduce_max(upper.x);
+ float uy = sub_group_reduce_max(upper.y);
+ float uz = sub_group_reduce_max(upper.z);
+
+ if (get_sub_group_local_id() == 0)
+ {
+ atomic_min((local float*) & aabb->lower + 0, lx);
+ atomic_min((local float*) & aabb->lower + 1, ly);
+ atomic_min((local float*) & aabb->lower + 2, lz);
+ atomic_max((local float*) & aabb->upper + 0, ux);
+ atomic_max((local float*) & aabb->upper + 1, uy);
+ atomic_max((local float*) & aabb->upper + 2, uz);
+ }
+}
+
+GRL_INLINE void AABB3f_atomic_merge_global_sub_group_lu(uniform global struct AABB3f* aabb, const float3 lower, const float3 upper)
+{
+ uint lane = get_sub_group_local_id();
+ float l[3];
+ l[0] = sub_group_reduce_min(lower.x);
+ l[1] = sub_group_reduce_min(lower.y);
+ l[2] = sub_group_reduce_min(lower.z);
+ float u[3];
+ u[0] = sub_group_reduce_max(upper.x);
+ u[1] = sub_group_reduce_max(upper.y);
+ u[2] = sub_group_reduce_max(upper.z);
+
+ if (lane < 3)
+ {
+ atomic_min((global float*)&aabb->lower + lane, l[lane]);
+ atomic_max((global float*)&aabb->upper + lane, u[lane]);
+ }
+}
+
+GRL_INLINE void AABB3f_atomic_merge_global( global struct AABB3f* aabb, struct AABB3f* other )
+{
+ float3 lower = AABB3f_load_lower( other );
+ float3 upper = AABB3f_load_upper( other );
+ atomic_min( (global float*) & aabb->lower + 0, lower.x );
+ atomic_min( (global float*) & aabb->lower + 1, lower.y );
+ atomic_min( (global float*) & aabb->lower + 2, lower.z );
+ atomic_max( (global float*) & aabb->upper + 0, upper.x );
+ atomic_max( (global float*) & aabb->upper + 1, upper.y );
+ atomic_max( (global float*) & aabb->upper + 2, upper.z );
+}
+
+GRL_INLINE void AABB3f_atomic_merge_localBB_nocheck( local struct AABB3f* aabb, struct AABB3f* bb )
+{
+ atomic_min( (local float*) & aabb->lower + 0, bb->lower[0] );
+ atomic_min( (local float*) & aabb->lower + 1, bb->lower[1] );
+ atomic_min( (local float*) & aabb->lower + 2, bb->lower[2] );
+ atomic_max( (local float*) & aabb->upper + 0, bb->upper[0] );
+ atomic_max( (local float*) & aabb->upper + 1, bb->upper[1] );
+ atomic_max( (local float*) & aabb->upper + 2, bb->upper[2] );
+}
+
+GRL_INLINE void AABB3f_atomic_merge_local(local struct AABB3f *aabb, const float4 lower, const float4 upper)
+{
+ if (lower.x < aabb->lower[0])
+ atomic_min((local float *)&aabb->lower + 0, lower.x);
+ if (lower.y < aabb->lower[1])
+ atomic_min((local float *)&aabb->lower + 1, lower.y);
+ if (lower.z < aabb->lower[2])
+ atomic_min((local float *)&aabb->lower + 2, lower.z);
+ if (upper.x > aabb->upper[0])
+ atomic_max((local float *)&aabb->upper + 0, upper.x);
+ if (upper.y > aabb->upper[1])
+ atomic_max((local float *)&aabb->upper + 1, upper.y);
+ if (upper.z > aabb->upper[2])
+ atomic_max((local float *)&aabb->upper + 2, upper.z);
+}
+
+GRL_INLINE void AABB3f_atomic_merge_global_local(global struct AABB3f *dest, local struct AABB3f *source)
+{
+ float3 l = AABB3f_load_lower(source);
+ float3 u = AABB3f_load_upper(source);
+ atomic_min((global float *)&dest->lower + 0, l.x );
+ atomic_min((global float *)&dest->lower + 1, l.y );
+ atomic_min((global float *)&dest->lower + 2, l.z );
+ atomic_max((global float *)&dest->upper + 0, u.x );
+ atomic_max((global float *)&dest->upper + 1, u.y );
+ atomic_max((global float *)&dest->upper + 2, u.z );
+}
+
+
+struct AABB3f AABB3f_construct( float3 min, float3 max )
+{
+ struct AABB3f bb;
+ bb.lower[0] = min.x; bb.lower[1] = min.y; bb.lower[2] = min.z;
+ bb.upper[0] = max.x; bb.upper[1] = max.y; bb.upper[2] = max.z;
+ return bb;
+}
+
+struct AABB3f AABB3f_select( struct AABB3f left, struct AABB3f right, int3 cond )
+{
+ float3 l = select( AABB3f_load_lower(&left), AABB3f_load_lower(&right), cond );
+ float3 u = select( AABB3f_load_upper(&left), AABB3f_load_upper(&right), cond );
+ return AABB3f_construct( l, u );
+}
+
+#endif
+
+GRL_NAMESPACE_END(RTAS)
+GRL_NAMESPACE_END(GRL)
+
diff --git a/src/intel/vulkan/grl/include/GRLGen12.h b/src/intel/vulkan/grl/include/GRLGen12.h
new file mode 100644
index 00000000000..20849599e91
--- /dev/null
+++ b/src/intel/vulkan/grl/include/GRLGen12.h
@@ -0,0 +1,691 @@
+//
+// Copyright (C) 2009-2021 Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+//
+//
+
+//
+// This file is to contain structure definitions related to the Gen12 QBVH6 acceleration structures
+//
+//
+
+//********************************************************************************************
+// WARNING!!!!!
+// This file is shared by OpenCL and C++ source code and must be compatible.
+// There should only be C structure definitions and trivial GRL_INLINE functions here
+//
+//********************************************************************************************
+
+#pragma once
+
+#include "GRLRTASCommon.h"
+#include "GRLUtilities.h"
+
+GRL_NAMESPACE_BEGIN(GRL)
+GRL_NAMESPACE_BEGIN(RTAS)
+GRL_NAMESPACE_BEGIN(GEN12)
+
+ enum_uint8(NodeType)
+ {
+ NODE_TYPE_MIXED = 0x0, // identifies a mixed internal node where each child can have a different type
+ NODE_TYPE_INTERNAL = 0x0, // internal BVH node with 6 children
+ NODE_TYPE_INSTANCE = 0x1, // instance leaf
+ NODE_TYPE_PROCEDURAL = 0x3, // procedural leaf
+ NODE_TYPE_QUAD = 0x4, // quad leaf
+ NODE_TYPE_INVALID = 0x7 // indicates invalid node
+ };
+
+
+ typedef enum PrimLeafType
+ {
+ TYPE_NONE = 0,
+
+ TYPE_QUAD = 0,
+
+ /* For a node type of NODE_TYPE_PROCEDURAL we support enabling
+ * and disabling the opaque/non_opaque culling. */
+
+ TYPE_OPACITY_CULLING_ENABLED = 0,
+ TYPE_OPACITY_CULLING_DISABLED = 1
+ } PrimLeafType;
+
+ #define BVH_MAGIC_MACRO "GEN12_RTAS_005" // If serialization-breaking or algorithm-breaking changes are made, increment the digits at the end
+ static const char BVH_MAGIC[16] = BVH_MAGIC_MACRO;
+
+ typedef struct BVHBase
+ {
+ // TODO: Implement the "copy-first-node" trick... duplicate root node here
+
+ uint64_t rootNodeOffset;
+
+ uint32_t reserved;
+
+ uint32_t nodeDataCur; // nodeDataStart is sizeof(BVHBase) / 64 = BVH_ROOT_NODE_OFFSET / 64
+ uint32_t quadLeafStart;
+ uint32_t quadLeafCur;
+ uint32_t proceduralDataStart;
+ uint32_t proceduralDataCur;
+ uint32_t instanceLeafStart;
+ uint32_t instanceLeafEnd;
+ uint32_t backPointerDataStart; //
+ uint32_t refitTreeletsDataStart; // refit structs
+ uint32_t refitStartPointDataStart; //
+ uint32_t BVHDataEnd;
+
+ // number of bottom treelets
+ // if 1, then the bottom treelet is also tip treelet
+ uint32_t refitTreeletCnt;
+ uint32_t refitTreeletCnt2; // always 0, used for atomic updates
+ // data layout:
+ // @backPointerDataStart
+ // 'backpointer' - a dword per inner node.
+ // The bits are used as follows:
+ // 2:0 --> Used as a refit counter during BVH refitting. MBZ
+ // 5:3 --> Number of children
+ // 31:6 --> Index of the parent node in the internal node array
+ // The root node has a parent index of all ones
+ // @refitTreeletsDataStart
+ // RefitTreelet[], the last treelet is for top treelet all previous are for bottom
+ // @refitStartPointDataStart
+ // for each treelet T, its startpoints occupy the interval [T.startpoint_offset, T.startpoint_offset + T.numStartpoints) in this space
+ // @backPointerDataEnd
+
+ uint32_t fatLeafCount; // number of internal nodes which are "fat-leaves"
+ uint32_t innerCount; // number of internal nodes which are true inner nodes (all internalNode children)
+ uint32_t fatLeafTableStart;
+ uint32_t innerTableStart;
+
+ uint32_t quadLeftoversCountNewAtomicUpdate; // number of quad leftovers for new atomic update
+ uint32_t quadTableSizeNewAtomicUpdate; // size of quad Table including leftovers, padded to 256
+ uint32_t quadIndicesDataStart;
+
+ uint32_t _pad[9];
+
+ struct RTASMetaData Meta;
+
+ } BVHBase;
+
+ GRL_INLINE struct GeoMetaData* BVHBase_GetGeoMetaData(BVHBase* base)
+ {
+ return (struct GeoMetaData*)(((char*)base) + base->Meta.geoDescsStart);
+ }
+
+#ifdef __OPENCL_VERSION__
+#define BVH_ROOT_NODE_OFFSET sizeof(BVHBase)
+#else
+#define BVH_ROOT_NODE_OFFSET sizeof(GRL::RTAS::GEN12::BVHBase)
+#endif
+
+GRL_STATIC_ASSERT( sizeof(BVHBase) == BVH_ROOT_NODE_OFFSET, "Wrong size!");
+GRL_STATIC_ASSERT( (sizeof(BVHBase) % 64) == 0 , "Misaligned size!");
+
+ typedef struct BackPointers {
+ } BackPointers;
+
+ // threshold for the size of bottom treelets: no bottom treelet has more startpoints (paths) than this number
+ // note: in practice treelets are usually 2-3x smaller than this
+ #define TREELET_NUM_STARTPOINTS 1536
+
+ // threshold under which only one treelet will be created
+ #define SINGLE_TREELET_THRESHOLD 3072
+
+ typedef struct LeafTableEntry {
+
+ uint backpointer;
+ uint inner_node_index;
+ uint leaf_index;
+ } LeafTableEntry;
+
+ typedef struct InnerNodeTableEntry {
+
+ uint node_index_and_numchildren; // numchildren in 3 lsbs
+ uint first_child;
+
+ } InnerNodeTableEntry;
+
+ typedef struct QuadDataIndices
+ {
+ uint header_data[4];
+ uint vert_idx[4];
+ } QuadDataIndices;
+
+ typedef struct RefitTreelet {
+ uint32_t startpoint_offset;
+ uint32_t numStartpoints;
+ uint32_t numNonTrivialStartpoints;
+ uint8_t maxDepth;
+ uint8_t depthLess64; // depth from the bottom at which there are fewer than 64 paths
+ uint8_t depthLess128; // depth from the bottom at which there are fewer than 128 paths
+ uint8_t depthLess256; // depth from the bottom at which there are fewer than 256 paths
+ } RefitTreelet;
+
+ // if RefitTreelet has number of startpoints == 1
+ // it should be reinterpreted as:
+ typedef struct RefitTreeletTrivial {
+ uint32_t theOnlyNodeIndex;
+ uint32_t numStartpoints; // must be 1 or 0
+ int32_t childrenOffsetOfTheNode; // offset of the node's children, relative to node 0
+ uint8_t maxDepth;
+ uint8_t numChildrenOfTheNode;
+ } RefitTreeletTrivial;
+
+ // 5:0 - depth after you die
+ // 31:6 - Index of the inner node
+ typedef uint32_t StartPoint;
+
+ struct HwInstanceLeaf;
+ struct QuadLeaf;
+ struct ProceduralLeaf;
+ struct InternalNode;
+
+ typedef struct HwInstanceLeaf HwInstanceLeaf;
+ typedef struct InternalNode InternalNode;
+ typedef struct QuadLeaf QuadLeaf;
+ typedef struct ProceduralLeaf ProceduralLeaf;
+
+ GRL_INLINE uint32_t BackPointer_GetParentIndex( uint32_t bp )
+ {
+ return bp >> 6;
+ }
+ GRL_INLINE uint32_t BackPointer_GetNumChildren( uint32_t bp )
+ {
+ return (bp >> 3) & (7);
+ }
+ GRL_INLINE uint32_t BackPointer_GetRefitCount( uint32_t bp )
+ {
+ return bp & 7;
+ }
+ GRL_INLINE bool BackPointer_IsRoot( uint32_t bp )
+ {
+ return (bp >> 6) == 0x03FFFFFF;
+ }
+
+ GRL_INLINE InternalNode* BVHBase_GetRootNode( const BVHBase* p )
+ {
+ return (InternalNode*)( ((char*)p) + BVH_ROOT_NODE_OFFSET);
+ }
+
+ GRL_INLINE AABB3f BVHBase_GetRootAABB(const BVHBase* p)
+ {
+ return p->Meta.bounds;
+ }
+
+ GRL_INLINE InternalNode* BVHBase_GetInternalNodes(const BVHBase* p)
+ {
+ return (InternalNode*)(((char*)p) + BVH_ROOT_NODE_OFFSET);
+ }
+ GRL_INLINE InternalNode* BVHBase_GetInternalNodesEnd(const BVHBase* p)
+ {
+ return (InternalNode*)(((char*)p) + (size_t)(64u * p->nodeDataCur));
+ }
+ GRL_INLINE uint32_t BVHBase_GetNumInternalNodes(const BVHBase* p)
+ {
+ return p->nodeDataCur - BVH_ROOT_NODE_OFFSET / 64;
+ }
+
+
+ GRL_INLINE QuadLeaf* BVHBase_GetQuadLeaves(const BVHBase* p)
+ {
+ return (QuadLeaf*)(((char*)p) + (size_t)(64u * p->quadLeafStart));
+ }
+ GRL_INLINE const QuadLeaf* BVHBase_GetQuadLeaves_End(const BVHBase* p)
+ {
+ return (QuadLeaf*)(((char*)p) + (size_t)(64u * p->quadLeafCur));
+ }
+
+ GRL_INLINE const ProceduralLeaf* BVHBase_GetProceduralLeaves_End(const BVHBase* p)
+ {
+ return (ProceduralLeaf*)(((char*)p) + (size_t)(64u * p->proceduralDataCur));
+ }
+
+ GRL_INLINE ProceduralLeaf* BVHBase_GetProceduralLeaves(const BVHBase* p)
+ {
+ return (ProceduralLeaf*)(((char*)p) + (size_t)(64u * p->proceduralDataStart));
+ }
+
+ GRL_INLINE HwInstanceLeaf* BVHBase_GetHWInstanceLeaves(const BVHBase* p )
+ {
+ char* pRTASBits = (char*)p;
+ return (HwInstanceLeaf*)(pRTASBits + (size_t)(64u * p->instanceLeafStart));
+ }
+
+ GRL_INLINE HwInstanceLeaf* BVHBase_GetHWInstanceLeaves_End(const BVHBase* p )
+ {
+ char* pRTASBits = (char*) p;
+ return (HwInstanceLeaf*)(pRTASBits + (size_t)(64u * p->instanceLeafEnd));
+ }
+
+ GRL_INLINE uint BVHBase_GetNumHWInstanceLeaves( const BVHBase* p )
+ {
+ return (p->instanceLeafEnd - p->instanceLeafStart) / 2;
+ }
+
+ GRL_INLINE uint* BVHBase_GetRefitStartPoints(const BVHBase* p)
+ {
+ return (uint32_t*)(((char*)p) + (size_t)(64u * p->refitStartPointDataStart));
+ }
+
+ GRL_INLINE uint BVHBase_GetRefitStartPointsSize(const BVHBase* p)
+ {
+ return 64u * (p->fatLeafTableStart - p->refitStartPointDataStart);
+ }
+
+ GRL_INLINE uint StartPoint_GetDepth(StartPoint s)
+ {
+ return s & ((1 << 6) - 1);
+ }
+
+ GRL_INLINE uint StartPoint_GetNodeIdx(StartPoint s)
+ {
+ return s >> 6;
+ }
+
+ GRL_INLINE RefitTreelet* BVHBase_GetRefitTreeletDescs(const BVHBase* p)
+ {
+ return (RefitTreelet*)(((char*)p) + (size_t)(64u * p->refitTreeletsDataStart));
+ }
+
+ // This is the treelet count as it should be executed, i.e. the number of bottom treelets when both top and bottom treelets exist.
+ // To get the real number of all treelets, including the tip treelet, the formula is:
+ // actualNumTreelets = refitTreeletCnt > 1 ? refitTreeletCnt + 1 : 1;
+ GRL_INLINE uint32_t* BVHBase_GetRefitTreeletCntPtr(BVHBase* p)
+ {
+ return &p->refitTreeletCnt;
+ }
+
+ GRL_INLINE uint32_t BVHBase_GetRefitTreeletCnt(const BVHBase* p)
+ {
+ return p->refitTreeletCnt;
+ }
+
+ GRL_INLINE uint32_t BVHBase_IsSingleTreelet(const BVHBase* p)
+ {
+ return p->refitTreeletCnt == 1;
+ }
+
+ GRL_INLINE BackPointers* BVHBase_GetBackPointers(const BVHBase* p)
+ {
+ return (BackPointers*)(((char*)p) + (size_t)(64u * p->backPointerDataStart));
+ }
+
+
+ GRL_INLINE LeafTableEntry* BVHBase_GetFatLeafTable(const BVHBase* p)
+ {
+ return (LeafTableEntry*)(((char*)p) + (size_t)(64u * p->fatLeafTableStart));
+ }
+ GRL_INLINE InnerNodeTableEntry* BVHBase_GetInnerNodeTable(const BVHBase* p)
+ {
+ return (InnerNodeTableEntry*)(((char*)p) + (size_t)(64u * p->innerTableStart));
+ }
+ GRL_INLINE QuadDataIndices* BVHBase_GetQuadDataIndicesTable(const BVHBase* p)
+ {
+ return (QuadDataIndices*)(((char*)p) + (size_t)(64u * p->quadIndicesDataStart));
+ }
+
+ GRL_INLINE unsigned* InnerNode_GetBackPointer(
+ BackPointers* backpointersStruct,
+ uint32_t inodeOffset /*in 64B units, from the earliest Inner node*/)
+ {
+ uint* backpointersArray = (uint*)backpointersStruct;
+ // BACKPOINTER_LAYOUT
+ uint new_index = inodeOffset; //<-layout canonical
+ //uint new_index = inodeOffset*16; //<-layout scattered
+ // uint new_index = (inodeOffset & (~0xFFFF)) | (((inodeOffset & 0xFF) << 8) | ((inodeOffset & 0xFF00) >> 8)); //<-layout hashed
+
+ return backpointersArray + new_index;
+ }
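+
+ // Editor's illustrative sketch (not part of the original header): walking from an inner node
+ // up to the root through the backpointer table. 'nodeIndex' is an index into the internal
+ // node array (64B units from the first inner node), matching BackPointer_GetParentIndex().
+ GRL_INLINE uint32_t BVHBase_CountAncestors_sketch(BVHBase* bvh, uint32_t nodeIndex)
+ {
+     BackPointers* bp = BVHBase_GetBackPointers(bvh);
+     uint32_t depth = 0;
+     uint32_t current = *InnerNode_GetBackPointer(bp, nodeIndex);
+     while (!BackPointer_IsRoot(current))
+     {
+         nodeIndex = BackPointer_GetParentIndex(current);
+         current = *InnerNode_GetBackPointer(bp, nodeIndex);
+         depth++;
+     }
+     return depth;
+ }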
+
+ GRL_INLINE uint32_t BVHBase_GetRefitStructsDataSize(const BVHBase* p)
+ {
+ return 64u * (p->BVHDataEnd - p->backPointerDataStart);
+ }
+
+ GRL_INLINE uint32_t BVHBase_GetBackpointersDataSize(const BVHBase* p)
+ {
+ return 64u * (p->refitTreeletsDataStart - p->backPointerDataStart);
+ }
+
+ GRL_INLINE uint32_t* BVHBase_GetBVHDataEnd( const BVHBase* p )
+ {
+ return (uint32_t*)(((char*)p) + (size_t)(64u * p->BVHDataEnd));
+ }
+
+ GRL_INLINE bool BVHBase_HasBackPointers( const BVHBase* p )
+ {
+ return p->refitTreeletsDataStart > p->backPointerDataStart;
+ }
+
+ GRL_INLINE const size_t BVHBase_GetNumQuads(const BVHBase* p)
+ {
+ return p->quadLeafCur - p->quadLeafStart;
+ }
+
+ GRL_INLINE const size_t BVHBase_GetNumProcedurals(const BVHBase* p)
+ {
+ return p->proceduralDataCur - p->proceduralDataStart;
+ }
+
+ GRL_INLINE const size_t BVHBase_GetNumInstances(const BVHBase* p)
+ {
+ return (p->instanceLeafEnd - p->instanceLeafStart) / 2;
+ }
+
+ GRL_INLINE const size_t BVHBase_totalBytes(const BVHBase* p)
+ {
+ return p->BVHDataEnd * 64u;
+ }
+
+
+
+ struct HwInstanceLeaf
+ {
+ /* first 64 bytes accessed during traversal */
+ struct Part0
+ {
+ //uint32_t shaderIndex : 24;
+ //uint32_t geomMask : 8;
+ uint32_t DW0;
+
+ // uint32_t instanceContributionToHitGroupIndex : 24;
+ // uint32_t pad0 : 8
+ //
+ // NOTE: Traversal shaders are implemented by aliasing instance leaves as procedural and sending them through the procedural path
+ // For a procedural instance, bit 29 should be set to 1, to disable "opaque culling"
+ // and bits 30 and 31 must be zero. See also the definition of the 'PrimLeafDesc' structure
+ uint32_t DW1;
+
+ // uint64_t rootNodePtr : 48;
+ // uint64_t instFlags : 8;
+ // uint64_t pad1 : 8;
+ uint64_t DW2_DW3;
+
+ // Vec3f world2obj_vx; // 1st row of World2Obj transform
+ float world2obj_vx_x;
+ float world2obj_vx_y;
+ float world2obj_vx_z;
+
+ // Vec3f world2obj_vy; // 2nd row of World2Obj transform
+ float world2obj_vy_x;
+ float world2obj_vy_y;
+ float world2obj_vy_z;
+
+ // Vec3f world2obj_vz; // 3rd row of World2Obj transform
+ float world2obj_vz_x;
+ float world2obj_vz_y;
+ float world2obj_vz_z;
+
+ // Vec3f obj2world_p; // translation of Obj2World transform (intentionally kept in the first 64 bytes)
+ float obj2world_p_x;
+ float obj2world_p_y;
+ float obj2world_p_z;
+ } part0;
+
+ /* second 64 bytes accessed during shading */
+ // NOTE: Everything in this block is under SW control
+ struct Part1
+ {
+ // uint64_t bvhPtr : 48;
+ // uint64_t pad : 16;
+ uint64_t DW0_DW1;
+
+ uint32_t instanceID;
+ uint32_t instanceIndex;
+
+ // Vec3f obj2world_vx; // 1st row of Obj2World transform
+ float obj2world_vx_x;
+ float obj2world_vx_y;
+ float obj2world_vx_z;
+
+ // Vec3f obj2world_vy; // 2nd row of Obj2World transform
+ float obj2world_vy_x;
+ float obj2world_vy_y;
+ float obj2world_vy_z;
+
+ // Vec3f obj2world_vz; // 3rd row of Obj2World transform
+ float obj2world_vz_x;
+ float obj2world_vz_y;
+ float obj2world_vz_z;
+
+ // Vec3f world2obj_p; // translation of World2Obj transform
+ float world2obj_p_x;
+ float world2obj_p_y;
+ float world2obj_p_z;
+ } part1;
+ };
+
+ __constant const uint64_t c_one = 1ul;
+
+ GRL_INLINE uint32_t HwInstanceLeaf_GetInstanceMask( const HwInstanceLeaf* p )
+ {
+ return p->part0.DW0 >> 24;
+ }
+
+ GRL_INLINE uint32_t HwInstanceLeaf_GetInstanceContributionToHitGroupIndex( const HwInstanceLeaf* p )
+ {
+ return p->part0.DW1 & 0x00ffffff;
+ }
+
+ GRL_INLINE uint32_t HwInstanceLeaf_GetInstanceFlags( const HwInstanceLeaf* p )
+ {
+ return (p->part0.DW2_DW3 >> 48) & 0xff;
+ }
+ GRL_INLINE uint32_t HwInstanceLeaf_GetInstanceID( const HwInstanceLeaf* p )
+ {
+ return p->part1.instanceID;
+ }
+
+ GRL_INLINE gpuva_t HwInstanceLeaf_GetBVH( const HwInstanceLeaf* p ) { return p->part1.DW0_DW1 & ((c_one << 48) - 1); }
+ GRL_INLINE gpuva_t HwInstanceLeaf_GetStartNode( const HwInstanceLeaf* p ) { return p->part0.DW2_DW3 & ((c_one << 48) - 1); }
+ GRL_INLINE uint32_t HwInstanceLeaf_GetInstanceIndex( const HwInstanceLeaf* p ) { return p->part1.instanceIndex; }
+
+ GRL_INLINE void HwInstanceLeaf_GetTransform(struct HwInstanceLeaf* p, float* transform)
+ {
+ transform[0] = p->part1.obj2world_vx_x;
+ transform[1] = p->part1.obj2world_vy_x;
+ transform[2] = p->part1.obj2world_vz_x;
+ transform[3] = p->part0.obj2world_p_x;
+ transform[4] = p->part1.obj2world_vx_y;
+ transform[5] = p->part1.obj2world_vy_y;
+ transform[6] = p->part1.obj2world_vz_y;
+ transform[7] = p->part0.obj2world_p_y;
+ transform[8] = p->part1.obj2world_vx_z;
+ transform[9] = p->part1.obj2world_vy_z;
+ transform[10] = p->part1.obj2world_vz_z;
+ transform[11] = p->part0.obj2world_p_z;
+ }
+
+ GRL_INLINE void HwInstanceLeaf_SetBVH( HwInstanceLeaf* p, gpuva_t b ) {
+ uint64_t mask = ((c_one << 48) - 1);
+ uint64_t v = p->part1.DW0_DW1;
+ v = (b & mask) | (v & ~mask);
+ p->part1.DW0_DW1 = v;
+ }
+ GRL_INLINE void HwInstanceLeaf_SetStartNode( HwInstanceLeaf* p, gpuva_t b ) {
+ uint64_t mask = ((c_one << 48) - 1);
+ uint64_t v = p->part0.DW2_DW3;
+ v = (b & mask) | (v & ~mask);
+ p->part0.DW2_DW3 = v;
+ }
+ GRL_INLINE void HwInstanceLeaf_SetStartNodeAndInstanceFlags( HwInstanceLeaf* p,
+ gpuva_t root,
+ uint8_t flags ) {
+ uint64_t mask = ((1ull << 48) - 1);
+ uint64_t v = (root & mask) | ((uint64_t)(flags)<<48);
+ p->part1.DW0_DW1 = v;
+ }
+
+ struct InternalNode
+ {
+ float lower[3]; // world space origin of quantization grid
+ int32_t childOffset; // offset to all children in 64B multiples
+
+ uint8_t nodeType; // the type of the node
+ uint8_t pad; // unused byte
+
+ int8_t exp_x; // 2^exp_x is the size of the grid in x dimension
+ int8_t exp_y; // 2^exp_y is the size of the grid in y dimension
+ int8_t exp_z; // 2^exp_z is the size of the grid in z dimension
+ uint8_t nodeMask; // mask used for ray filtering
+
+ struct ChildData
+ {
+ //uint8_t blockIncr : 2; // size of child in 64 byte blocks. Must be ==2 for instance leaves, <=2 for quad leaves.
+ //uint8_t startPrim : 4; // start primitive in fat leaf mode or child type in mixed mode
+ //uint8_t pad : 2; // unused bits
+ uint8_t bits;
+ } childData[6];
+
+ uint8_t lower_x[6]; // the quantized lower bounds in x-dimension
+ uint8_t upper_x[6]; // the quantized upper bounds in x-dimension
+ uint8_t lower_y[6]; // the quantized lower bounds in y-dimension
+ uint8_t upper_y[6]; // the quantized upper bounds in y-dimension
+ uint8_t lower_z[6]; // the quantized lower bounds in z-dimension
+ uint8_t upper_z[6]; // the quantized upper bounds in z-dimension
+ };
+
+ GRL_INLINE uint InternalNode_GetChildBlockIncr( const InternalNode* p, uint idx )
+ {
+ return p->childData[idx].bits & 3;
+ }
+ GRL_INLINE uint InternalNode_GetChildStartPrim( const InternalNode* p, uint idx )
+ {
+ return (p->childData[idx].bits>>2) & 0xf;
+ }
+
+ GRL_INLINE uint8_t InternalNode_GetChildType( const InternalNode* p, uint idx )
+ {
+ return (p->childData[idx].bits >> 2) & 0xF;
+ }
+
+ GRL_INLINE void InternalNode_SetChildType( InternalNode* p, uint idx, uint type )
+ {
+ uint bits = p->childData[idx].bits;
+ const uint mask = (0xF << 2);
+ bits = ((type << 2) & mask) | (bits & ~mask);
+ p->childData[idx].bits = (uint8_t)bits;
+ }
+
+ GRL_INLINE bool InternalNode_IsChildValid( const InternalNode* p, size_t child )
+ {
+ bool lower = p->lower_x[child] & 0x80; // invalid nodes are indicated by setting lower_msb = 1 and upper_msb=0
+ bool upper = p->upper_x[child] & 0x80;
+ return !lower || upper;
+ }
+
+ GRL_INLINE AABB3f InternalNode_GetChildAABB(const InternalNode* node, size_t i)
+ {
+ float4 lower, upper;
+ const float4 base = { node->lower[0], node->lower[1], node->lower[2], 0.0f };
+ const int4 lower_i = { node->lower_x[i], node->lower_y[i], node->lower_z[i], 0 };
+ const int4 upper_i = { node->upper_x[i], node->upper_y[i], node->upper_z[i], 0 };
+ const int4 exp_i = { node->exp_x, node->exp_y, node->exp_z, 0 };
+ lower = base + bitShiftLdexp4(convert_float4_rtn(lower_i), exp_i - 8);
+ upper = base + bitShiftLdexp4(convert_float4_rtp(upper_i), exp_i - 8);
+ AABB3f aabb3f = {
+ { lower.x, lower.y, lower.z },
+ { upper.x, upper.y, upper.z } };
+ return aabb3f;
+ }
+
+ GRL_INLINE void* InternalNode_GetChildren( InternalNode* node)
+ {
+ return (void*)(((char*)node) + node->childOffset * 64);
+ }
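+
+ // Editor's illustrative sketch (not part of the original header): counting the valid children
+ // of an internal node via InternalNode_IsChildValid(); a node has at most 6 children.
+ GRL_INLINE uint32_t InternalNode_NumValidChildren_sketch(const InternalNode* node)
+ {
+     uint32_t n = 0;
+     for (uint32_t i = 0; i < 6; ++i)
+         if (InternalNode_IsChildValid(node, i))
+             n++;
+     return n;
+ }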
+
+ typedef struct PrimLeafDesc
+ {
+ //uint32_t shaderIndex : 24; // shader index used for shader record calculations
+ //uint32_t geomMask : 8; // geometry mask used for ray masking
+ uint32_t shaderIndex_geomMask;
+
+ //uint32_t geomIndex : 29; // the geometry index specifies the n'th geometry of the scene
+ //PrimLeafType type : 1; // see above
+ //GeometryFlags geomFlags : 2; // geometry flags of this geometry
+ uint32_t geomIndex_flags;
+ } PrimLeafDesc;
+
+ GRL_INLINE uint32_t PrimLeaf_GetShaderIndex( const PrimLeafDesc* p )
+ {
+ return p->shaderIndex_geomMask & ((1 << 24) - 1);
+ }
+ GRL_INLINE uint32_t PrimLeaf_GetGeoIndex( const PrimLeafDesc* p )
+ {
+ return p->geomIndex_flags & ((1<<29)-1);
+ }
+ GRL_INLINE uint32_t PrimLeaf_GetGeomFlags( const PrimLeafDesc* p )
+ {
+ return (p->geomIndex_flags >> 30);
+ }
+ GRL_INLINE uint32_t PrimLeaf_GetType(const PrimLeafDesc* p)
+ {
+ return (p->geomIndex_flags >> 29) & 1;
+ }
+
+ struct QuadLeaf
+ {
+ PrimLeafDesc leafDesc;
+
+ uint32_t primIndex0;
+
+ //uint32_t primIndex1Delta : 16;
+ //uint32_t j0 : 2;
+ //uint32_t j1 : 2;
+ //uint32_t j2 : 2;
+ //uint32_t last : 1; // last quad in list
+ //uint32_t pad : 9;
+ uint32_t DW1;
+
+ float v[4][3];
+ };
+
+ GRL_INLINE uint32_t QuadLeaf_GetPrimIndexDelta( const QuadLeaf* p )
+ {
+ return p->DW1 & 0x0000ffff;
+ }
+ GRL_INLINE uint32_t QuadLeaf_GetPrimIndex0( const QuadLeaf* p )
+ {
+ return p->primIndex0;
+ }
+ GRL_INLINE uint32_t QuadLeaf_GetPrimIndex1( const QuadLeaf* p )
+ {
+ return p->primIndex0 + QuadLeaf_GetPrimIndexDelta(p);
+ }
+ GRL_INLINE bool QuadLeaf_IsSingleTriangle( const QuadLeaf* p )
+ {
+ return QuadLeaf_GetPrimIndexDelta(p) == 0;
+ }
+ GRL_INLINE uint32_t QuadLeaf_GetSecondTriangleIndices( const QuadLeaf* p )
+ {
+ return (p->DW1>>16) & 0x3f;
+ }
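+
+ // Editor's illustrative sketch (not part of the original header): counting triangles across
+ // all quad leaves of a BVH; each 64-byte QuadLeaf holds either one or two triangles.
+ GRL_INLINE uint32_t BVHBase_CountTriangles_sketch(const BVHBase* bvh)
+ {
+     const QuadLeaf* quads = BVHBase_GetQuadLeaves(bvh);
+     const uint32_t numQuads = (uint32_t)BVHBase_GetNumQuads(bvh);
+     uint32_t triangles = 0;
+     for (uint32_t i = 0; i < numQuads; ++i)
+         triangles += QuadLeaf_IsSingleTriangle(&quads[i]) ? 1 : 2;
+     return triangles;
+ }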
+
+ GRL_INLINE void QuadLeaf_SetVertices( QuadLeaf* quad, float3 v0, float3 v1, float3 v2, float3 v3 )
+ {
+ quad->v[0][0] = v0.x;
+ quad->v[0][1] = v0.y;
+ quad->v[0][2] = v0.z;
+ quad->v[1][0] = v1.x;
+ quad->v[1][1] = v1.y;
+ quad->v[1][2] = v1.z;
+ quad->v[2][0] = v2.x;
+ quad->v[2][1] = v2.y;
+ quad->v[2][2] = v2.z;
+ quad->v[3][0] = v3.x;
+ quad->v[3][1] = v3.y;
+ quad->v[3][2] = v3.z;
+ }
+
+
+ struct ProceduralLeaf {
+ PrimLeafDesc leafDesc;
+
+ // Number of primitives + "last" bits.
+ // The meaning of this section is SW-defined and flexible
+ uint32_t DW1;
+ uint32_t _primIndex[13];
+ };
+
+GRL_NAMESPACE_END(GEN12)
+GRL_NAMESPACE_END(RTAS)
+GRL_NAMESPACE_END(GRL)
diff --git a/src/intel/vulkan/grl/include/GRLIntTypes.h b/src/intel/vulkan/grl/include/GRLIntTypes.h
new file mode 100644
index 00000000000..573dbbc7481
--- /dev/null
+++ b/src/intel/vulkan/grl/include/GRLIntTypes.h
@@ -0,0 +1,152 @@
+//
+// Copyright (C) 2009-2021 Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+//
+//
+
+//********************************************************************************************
+// WARNING!!!!!
+//
+// This file is shared by OpenCL and C++ source code and must be a pure C header
+// There should only be C structure definitions and trivial inline functions here
+//
+//********************************************************************************************
+
+#pragma once
+
+#include "GRLOCLCompatibility.h"
+
+GRL_NAMESPACE_BEGIN(GRL)
+
+ typedef uint32_t dword;
+ typedef uint64_t qword;
+ typedef qword gpuva_t;
+
+
+ enum_uint8( InstanceFlags )
+ {
+ INSTANCE_FLAG_TRIANGLE_CULL_DISABLE = 0x1,
+ INSTANCE_FLAG_TRIANGLE_FRONT_COUNTERCLOCKWISE = 0x2,
+ INSTANCE_FLAG_FORCE_OPAQUE = 0x4,
+ INSTANCE_FLAG_FORCE_NON_OPAQUE = 0x8,
+ };
+
+ enum_uint8( GeometryFlags )
+ {
+ GEOMETRY_FLAG_NONE = 0x0,
+ GEOMETRY_FLAG_OPAQUE = 0x1,
+ GEOMETRY_FLAG_NO_DUPLICATE_ANYHIT_INVOCATION = 0x2,
+ };
+
+ enum_uint8( GeometryType )
+ {
+ GEOMETRY_TYPE_TRIANGLES = 0,
+ GEOMETRY_TYPE_PROCEDURAL = 1,
+ NUM_GEOMETRY_TYPES = 2
+ };
+
+ // NOTE: Does NOT match DXR
+ enum_uint8( IndexFormat )
+ {
+ INDEX_FORMAT_NONE = 0, // indicates non-indexed geometry
+ INDEX_FORMAT_R16_UINT = 2,
+ INDEX_FORMAT_R32_UINT = 4,
+ INDEX_FORMAT_END = INDEX_FORMAT_R32_UINT + 1
+ };
+
+ // NOTE: Does NOT match DXR
+ enum_uint8( VertexFormat )
+ {
+ VERTEX_FORMAT_R32G32_FLOAT = 0,
+ VERTEX_FORMAT_R32G32B32_FLOAT = 1,
+ VERTEX_FORMAT_R16G16_FLOAT = 2,
+ VERTEX_FORMAT_R16G16B16A16_FLOAT = 3,
+ VERTEX_FORMAT_R16G16_SNORM = 4,
+ VERTEX_FORMAT_R16G16B16A16_SNORM = 5,
+ VERTEX_FORMAT_R16G16B16A16_UNORM = 6,
+ VERTEX_FORMAT_R16G16_UNORM = 7,
+ VERTEX_FORMAT_R10G10B10A2_UNORM = 8,
+ VERTEX_FORMAT_R8G8B8A8_UNORM = 9,
+ VERTEX_FORMAT_R8G8_UNORM = 10,
+ VERTEX_FORMAT_R8G8B8A8_SNORM = 11,
+ VERTEX_FORMAT_R8G8_SNORM = 12,
+ VERTEX_FORMAT_END = VERTEX_FORMAT_R8G8_SNORM + 1
+ };
+
+
+
+ enum_uint32(RTASFlags)
+ {
+ // These flags match DXR
+ BUILD_FLAG_ALLOW_UPDATE = 1<<0,
+ BUILD_FLAG_ALLOW_COMPACTION = 1<<1,
+ BUILD_FLAG_PREFER_FAST_TRACE = 1<<2,
+ BUILD_FLAG_PREFER_FAST_BUILD = 1<<3,
+ BUILD_FLAG_MINIMIZE_MEMORY = 1<<4,
+ BUILD_FLAG_PERFORM_UPDATE = 1<<5,
+
+ // internal flags start here
+ BUILD_FLAG_DISALLOW_REBRAID = 1<<16,
+
+ BUILD_FLAG_ALL = 0x0001003f
+ };
+
+ enum_uint8(BVHType)
+ {
+ BVH_TYPE_NONE, // This is a sentinel for drivers to use when compiling out GRL on non-RT devices
+ BVH_TYPE_GEN12,
+ };
+
+ enum_uint8(PostBuildInfoType)
+ {
+ PBI_CURRENT_SIZE,
+ PBI_COMPACTED_SIZE,
+ PBI_DXR_TOOLS_VISUALIZATION_DESC,
+ PBI_DXR_SERIALIZATION_DESC,
+ };
+
+ enum_uint32(HazardTypes)
+ {
+ HAZARD_RTAS_READ = 1 << 0,
+ HAZARD_RTAS_WRITE = 1 << 1,
+ HAZARD_READ = 1 << 2,
+ HAZARD_WRITE = 1 << 3,
+ HAZARD_ALL = 0xf
+ };
+
+ enum_uint32(RaytracingAccelerationStructureType)
+ {
+ TOP_LEVEL = 0x0,
+ BOTTOM_LEVEL = 0x1,
+ };
+
+ typedef struct PostbuildInfoCurrentSize
+ {
+ uint64_t CurrentSizeInBytes;
+ } PostbuildInfoCurrentSize;
+
+ typedef struct PostbuildInfoCompactedSize
+ {
+ uint64_t CompactedSizeInBytes;
+ } PostbuildInfoCompactedSize;
+
+ typedef struct PostbuildInfoToolsVisualizationDesc
+ {
+ uint64_t DecodedSizeInBytes;
+ } PostbuildInfoToolsVisualizationDesc;
+
+ typedef struct PostbuildInfoSerializationDesc
+ {
+ uint64_t SerializedSizeInBytes;
+ uint64_t NumBottomLevelAccelerationStructurePointers;
+ } PostbuildInfoSerializationDesc;
+
+ typedef struct DecodeHeader
+ {
+ RaytracingAccelerationStructureType Type;
+ uint32_t NumDesc;
+ } DecodeHeader;
+
+
+GRL_NAMESPACE_END(GRL) \ No newline at end of file
diff --git a/src/intel/vulkan/grl/include/GRLOCLCompatibility.h b/src/intel/vulkan/grl/include/GRLOCLCompatibility.h
new file mode 100644
index 00000000000..119104f1532
--- /dev/null
+++ b/src/intel/vulkan/grl/include/GRLOCLCompatibility.h
@@ -0,0 +1,210 @@
+//
+// Copyright (C) 2009-2021 Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+//
+//
+
+#pragma once
+
+#ifdef __OPENCL_VERSION__
+
+typedef uchar uint8_t;
+typedef ushort uint16_t;
+typedef uint uint32_t;
+typedef ulong uint64_t;
+typedef char int8_t;
+typedef short int16_t;
+typedef int int32_t;
+typedef long int64_t;
+
+#else
+
+#include <stdint.h>
+
+typedef uint8_t uchar;
+typedef uint16_t ushort;
+typedef uint32_t uint;
+typedef uint64_t ulong;
+
+#define __constant
+#define __global
+
+typedef struct uint2
+{
+#ifdef __cplusplus
+ uint2() {};
+ uint2( uint ix, uint iy ) : x( ix ), y( iy ) {};
+#endif
+ uint x;
+ uint y;
+} uint2;
+
+typedef struct uint3
+{
+#ifdef __cplusplus
+ uint3() {};
+ uint3( uint ix, uint iy, uint iz ) : x( ix ), y( iy ), z( iz ) {};
+#endif
+ uint x;
+ uint y;
+ uint z;
+} uint3;
+
+typedef struct int3
+{
+ int32_t x;
+ int32_t y;
+ int32_t z;
+
+#ifdef __cplusplus
+ int3() {};
+ int3(int32_t ix, int32_t iy, int32_t iz) : x(ix), y(iy), z(iz) {};
+
+ int3 operator+(const int32_t i) const { return int3(this->x + i, this->y + i, this->z + i); }
+ int3 operator<<(const int32_t i) const { return int3(this->x << i, this->y << i, this->z << i); }
+#endif
+} int3;
+
+typedef struct int4
+{
+ int32_t x;
+ int32_t y;
+ int32_t z;
+ int32_t w;
+
+#ifdef __cplusplus
+ int4() {};
+ int4(int32_t ix, int32_t iy, int32_t iz, int32_t iw) : x(ix), y(iy), z(iz), w(iw) {};
+
+ int4 operator+(const int32_t i) const { return int4(this->x + i, this->y + i, this->z + i, this->w + i); }
+ int4 operator-(const int32_t i) const { return int4(this->x - i, this->y - i, this->z - i, this->w - i); }
+ int4 operator<<(const int32_t i) const { return int4(this->x << i, this->y << i, this->z << i, this->w << i); }
+#endif
+} int4;
+
+typedef struct float3
+{
+ float x;
+ float y;
+ float z;
+
+#ifdef __cplusplus
+ float3(){};
+ float3( float ix, float iy, float iz ) : x(ix), y(iy), z(iz){};
+
+ float3 operator+( const float3& f3 ) { return float3( this->x + f3.x, this->y + f3.y, this->z + f3.z ); }
+ float3 operator*( const float& f ) { return float3( this->x * f, this->y * f, this->z * f ); }
+ float3 operator*( const float3& f3 ) const { return float3(this->x * f3.x, this->y * f3.y, this->z * f3.z); }
+ float3 operator-() { return float3(-this->x, -this->y, -this->z); }
+ float3 operator-( const float3& f3) { return float3(this->x - f3.x, this->y - f3.y, this->z - f3.z); }
+#endif
+} float3;
+
+typedef struct float4
+{
+ float x;
+ float y;
+ float z;
+ float w;
+
+#ifdef __cplusplus
+ float4() {};
+ float4( float ix, float iy, float iz, float iw ) : x( ix ), y( iy ), z( iz ), w( iw ) {};
+
+ float4 operator+(const float4& f4) const { return float4(this->x + f4.x, this->y + f4.y, this->z + f4.z, this->w + f4.w); }
+ float4 operator*(const float4& f4) const { return float4(this->x * f4.x, this->y * f4.y, this->z * f4.z, this->w * f4.w); }
+#endif
+} float4;
+
+#endif /* ! __OPENCL_VERSION__ */
+
+
+#ifndef __cplusplus
+
+#define GRL_NAMESPACE_BEGIN(x)
+#define GRL_NAMESPACE_END(x)
+#define GRL_OVERLOADABLE __attribute((overloadable))
+#define GRL_INLINE __attribute__((always_inline)) inline static
+
+# define enum_uint8(name) \
+ typedef uint8_t name; \
+ enum name##_uint32
+# define enum_uint16(name) \
+ typedef uint16_t name; \
+ enum name##_uint32
+# define enum_uint32(name) \
+ typedef uint32_t name; \
+ enum name##_uint32
+
+#define OCL_BYTE_ALIGN(n) __attribute__ ((aligned (n)))
+#define GRL_STATIC_ASSERT(condition,desc)
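+
+// For example, 'enum_uint8(NodeType) { ... };' declares the enumerator constants as a regular
+// C enum while the NodeType typedef itself stays a 1-byte integer, so structure layouts stay
+// identical between the OpenCL C and C++ sides of these headers.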
+
+#else /* C++ */
+#ifdef __OPENCL_VERSION__
+#error "OpenCL C++ not supported by this header"
+#endif
+
+#define GRL_NAMESPACE_BEGIN(x) namespace x {
+#define GRL_NAMESPACE_END(x) }
+#define GRL_OVERLOADABLE
+#define GRL_INLINE inline
+
+#define enum_uint8(N) enum N : uint8_t
+#define enum_uint16(N) enum N : uint16_t
+#define enum_uint32(N) enum N : uint32_t
+
+#define OCL_BYTE_ALIGN(n)
+#define GRL_STATIC_ASSERT(condition,desc) static_assert( condition, desc )
+
+#include <cmath>
+
+inline float3 fmin(float3 a, float3 b)
+{
+ float3 o = { std::fmin(a.x, b.x), std::fmin(a.y, b.y), std::fmin(a.z, b.z) };
+ return o;
+}
+
+inline float3 fmax(float3 a, float3 b)
+{
+ float3 o = { std::fmax(a.x, b.x), std::fmax(a.y, b.y), std::fmax(a.z, b.z) };
+ return o;
+}
+
+inline float3 operator/(const float3& f3, const float& f) { return float3(f3.x / f, f3.y / f, f3.z / f); }
+
+inline float dot(const float3& a, const float3& b) {
+ return a.x * b.x + a.y * b.y + a.z * b.z;
+}
+
+inline float as_float(uint32_t i)
+{
+ union { float f; uint32_t i; } fi;
+
+ fi.i = i;
+ return fi.f;
+}
+
+inline float3 as_float3(int3 i3)
+{
+ float3 o = { as_float(i3.x), as_float(i3.y), as_float(i3.z) };
+ return o;
+}
+
+inline float4 as_float4(int4 i4)
+{
+ float4 o = { as_float(i4.x), as_float(i4.y), as_float(i4.z), as_float(i4.w) };
+ return o;
+}
+
+inline float4 convert_float4_rtn(int4 i4)
+{
+ return float4(static_cast<float>(i4.x), static_cast<float>(i4.y), static_cast<float>(i4.z), static_cast<float>(i4.w));
+}
+
+inline float4 convert_float4_rtp(int4 i4)
+{
+ return convert_float4_rtn(i4);
+}
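+
+// Editor's note: unlike the OpenCL convert_*_rtn/_rtp built-ins, these host-side reference
+// implementations ignore the requested rounding mode and use the default cast; host code
+// should therefore not rely on directed rounding from these helpers.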
+
+#endif
diff --git a/src/intel/vulkan/grl/include/GRLRTASCommon.h b/src/intel/vulkan/grl/include/GRLRTASCommon.h
new file mode 100644
index 00000000000..1f2cda2ea0b
--- /dev/null
+++ b/src/intel/vulkan/grl/include/GRLRTASCommon.h
@@ -0,0 +1,142 @@
+//
+// Copyright (C) 2009-2021 Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+//
+//
+
+//
+// This file is to contain structure definitions for RTAS-related meta-data.
+// The structures here should be generic enough to apply to any acceleration structure.
+// If we ever move to KD-Trees or Octrees, this file should not need to change.
+//
+
+//********************************************************************************************
+// WARNING!!!!!
+//
+// This file is shared by OpenCL and C++ source code and must be a pure C header
+// There should only be C structure definitions and trivial inline functions here
+//
+//********************************************************************************************
+
+
+#pragma once
+#include "GRLIntTypes.h"
+
+GRL_NAMESPACE_BEGIN(GRL)
+GRL_NAMESPACE_BEGIN(RTAS)
+
+ typedef struct SerializationIdentifier
+ {
+ uint8_t Bytes[16];
+ } SerializationIdentifier;
+
+ GRL_STATIC_ASSERT(sizeof(SerializationIdentifier) == 16, "Wrong size!");
+
+
+ // Header structure for RTAS serialization.
+ // This structure is binary-compatible with the DXR and Vulkan API definitions
+ typedef struct SerializationHeader
+ {
+ SerializationIdentifier DriverID; // DXR 'DriverOpaqueGUID'. Vulkan: 'driverUUID'
+ SerializationIdentifier GRLID; // DXR 'DriverOpaqueVersioningData'. Vulkan: 'accelerationStructureUUID'
+
+ uint64_t SerializedSizeInBytesIncludingHeader;
+ uint64_t DeserializedSizeInBytes;
+ uint64_t InstanceHandleCount;
+ } SerializationHeader;
+
+ GRL_STATIC_ASSERT(sizeof(SerializationHeader) == 56, "Wrong size!");
+
+ // This structure is binary-compatible with DXR and Vulkan 'InstanceDesc' structures
+ typedef struct InstanceDesc {
+ float Transform[3][4];
+ uint32_t InstanceIDAndMask; // mask in 8 msbs
+ uint32_t InstanceContributionToHitGroupIndexAndFlags; // flags in 8 msbs
+ gpuva_t AccelerationStructureGPUVA; // NOTE: In GRL this is always a VA. Vulkan CPU builds use handles here, and these may need to be translated
+ } InstanceDesc;
+ GRL_STATIC_ASSERT(sizeof(InstanceDesc) == 64, "Wrong size!");
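+
+ // Editor's illustrative sketch (not part of the original header): unpacking the packed
+ // InstanceDesc fields, following the comments above (mask and flags sit in the 8 MSBs).
+ GRL_INLINE uint32_t InstanceDesc_GetInstanceID_sketch(const InstanceDesc* d) { return d->InstanceIDAndMask & 0x00ffffffu; }
+ GRL_INLINE uint32_t InstanceDesc_GetInstanceMask_sketch(const InstanceDesc* d) { return d->InstanceIDAndMask >> 24; }
+ GRL_INLINE uint32_t InstanceDesc_GetHitGroupIndexContribution_sketch(const InstanceDesc* d) { return d->InstanceContributionToHitGroupIndexAndFlags & 0x00ffffffu; }
+ GRL_INLINE uint32_t InstanceDesc_GetInstanceFlags_sketch(const InstanceDesc* d) { return d->InstanceContributionToHitGroupIndexAndFlags >> 24; }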
+
+ typedef struct GeoMetaData{
+ uint32_t PrimitiveCount;
+ uint16_t Type;
+ uint16_t Flags;
+ } GeoMetaData;
+ GRL_STATIC_ASSERT(sizeof(GeoMetaData) == 8, "Wrong size!");
+
+ typedef struct AABB3f {
+ float lower[3];
+ float upper[3];
+ } AABB3f;
+ GRL_STATIC_ASSERT(sizeof(AABB3f) == 24, "Wrong size!");
+
+ enum_uint32(error_t_) {
+ error_t_no_error = 0x0,
+ error_t_internal_node_child_OOB = 0x1,
+ error_t_leaf_node_child_OOB = 0x2,
+ error_t_unrecognised_node_t = 0x4,
+ error_t_mixed_node_unsupported = 0x8,
+ error_t_instance_pointers_inconsistent = 0x10,
+ error_t_instance_pointed_root_not_internal = 0x20,
+ error_t_leaf_node_instance_child_missed_by_64B = 0x40,
+ error_t_internal_node_child_cycle = 0x80,
+ error_t_input_geo_insane = 0x100,
+ error_t_quad_leaf_broken = 0x200,
+ error_t_backpointer_not_reset = 0x400,
+ error_t_backpointer_wrong_children_num = 0x500,
+ error_t_backpointer_inconsitent_parent_child = 0x600,
+ error_t_backpointer_root_not_root_error = 0x700,
+ error_t_backpointer_OOB = 0x800,
+ error_t_backpointers_buffer_too_small = 0x900,
+ error_t_atomic_update_struct_fatleaf_count_oob = 0x1000, // for this and the following errors,
+ error_t_atomic_update_struct_fatleaf_node_idx_oob = 0x2000, // offset_in_BVH is just an index into the fatleaf or inner node arrays
+ error_t_atomic_update_struct_fatleaf_backpointer_mismatch = 0x3000,
+ error_t_atomic_update_struct_fatleaf_num_children_error = 0x4000,
+ error_t_atomic_update_struct_fatleaf_children_non_leaf = 0x5000,
+ error_t_atomic_update_struct_inner_count_oob = 0x6000,
+ error_t_atomic_update_struct_inner_node_idx_oob = 0x7000,
+ error_t_atomic_update_struct_inner_node_child_idx_error = 0x8000,
+ error_t_atomic_update_struct_inner_num_children_error = 0x9000,
+ error_t_atomic_update_struct_inner_children_non_internal = 0xA000,
+ error_t_unknown = 1u << 31,
+ };
+
+ enum_uint32(error_phase_t) {
+ error_phase_t_unknown = 0,
+ error_phase_t_post_build_Morton = 1,
+ error_phase_t_post_build_Trivial = 2,
+ error_phase_t_post_build_NewSAH = 3,
+ error_phase_t_post_update = 4,
+ error_phase_t_pre_update = 5,
+ error_phase_t_post_copy_op = 6,
+ };
+
+ typedef struct ERROR_INFO {
+ error_t_ type;
+ uint offset_in_BVH; //in 64B units
+ error_phase_t when;
+ uint reserved;
+ } ERROR_INFO;
+
+ // Meta-data common to all acceleration structures, which is needed to implement required functionality
+ // All RTAS structures must contain a struct of this type named 'Meta'
+ typedef struct RTASMetaData {
+ struct AABB3f bounds;
+
+ uint32_t instanceDescsStart; // byte offset to array of original instance_descs used for build. Required for DXR visualization and serialization
+ uint32_t instanceCount;
+
+ uint32_t geoDescsStart; // byte offset to array of 'GeoMetaData' matching input geos. Required for DXR visualization
+ uint32_t geoCount;
+
+ uint64_t allocationSize; // Size of the memory allocation containing this RTAS
+ // This is the size given to the app in the prebuild info when the RTAS was first created
+ // If RTAS was compacted, this will be the compacted size
+
+ ERROR_INFO errors; // only used in debug mode
+ } RTASMetaData;
+
+ GRL_STATIC_ASSERT( sizeof(RTASMetaData) == 64, "Wrong size!");
+
+GRL_NAMESPACE_END(RTAS)
+GRL_NAMESPACE_END(GRL)
diff --git a/src/intel/vulkan/grl/include/GRLStructs.h b/src/intel/vulkan/grl/include/GRLStructs.h
new file mode 100644
index 00000000000..c8af8313ffc
--- /dev/null
+++ b/src/intel/vulkan/grl/include/GRLStructs.h
@@ -0,0 +1,60 @@
+//
+// Copyright (C) 2009-2021 Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+//
+//
+
+#pragma once
+
+#include "GRLIntTypes.h"
+
+GRL_NAMESPACE_BEGIN(GRL)
+GRL_NAMESPACE_BEGIN(_INTERNAL)
+
+ struct GeometryTriangles
+ {
+ gpuva_t pTransformBuffer;
+ gpuva_t pIndexBuffer;
+ gpuva_t pVertexBuffer;
+ qword VertexBufferByteStride;
+ dword IndexCount;
+ dword VertexCount;
+ IndexFormat IndexFormat;
+ VertexFormat VertexFormat;
+ };
+
+ struct GeometryProcedural
+ {
+ gpuva_t pAABBs_GPUVA; ///< elements of pAABBs_GPUVA are in gpuAABB format
+ qword AABBByteStride;
+ dword AABBCount;
+ };
+
+ // TODO: the 'unsigned int ShaderIndex_Mask; // extension' field is missing
+ struct Geo
+ {
+ union
+ {
+ struct GeometryTriangles Triangles;
+ struct GeometryProcedural Procedural;
+ } Desc;
+
+ GeometryType Type;
+ uint8_t Flags;
+ };
+
+ // Matches the Vulkan VkAccelerationStructureBuildRangeInfoKHR structure
+ // See Vulkan spec for data access rules:
+ // https://www.khronos.org/registry/vulkan/specs/1.2-extensions/man/html/VkAccelerationStructureBuildRangeInfoKHR.html
+ //
+ struct IndirectBuildRangeInfo
+ {
+ dword primitiveCount; // Number of primitives
+ dword primitiveOffset; // Byte offset to primitive data
+ dword firstVertex; // Index of first vertex
+ dword transformOffset; // Byte offset to transform data (for triangle Geo with non-null transform)
+ };
+
+GRL_NAMESPACE_END(_INTERNAL)
+GRL_NAMESPACE_END(GRL) \ No newline at end of file
diff --git a/src/intel/vulkan/grl/include/GRLUtilities.h b/src/intel/vulkan/grl/include/GRLUtilities.h
new file mode 100644
index 00000000000..22670bfad1b
--- /dev/null
+++ b/src/intel/vulkan/grl/include/GRLUtilities.h
@@ -0,0 +1,32 @@
+//
+// Copyright (C) 2009-2021 Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+//
+//
+
+#pragma once
+
+#include "GRLOCLCompatibility.h"
+
+GRL_NAMESPACE_BEGIN(GRL)
+
+ GRL_INLINE float4 bitShiftLdexp4(float4 x, int4 y)
+ {
+ y = (y + 127) << 23;
+ return x * as_float4(y);
+ }
+
+ GRL_INLINE float3 bitShiftLdexp3(float3 x, int3 y)
+ {
+ y = (y + 127) << 23;
+ return x * as_float3(y);
+ }
+
+ GRL_INLINE float bitShiftLdexp(float x, int y)
+ {
+ y = (y + 127) << 23;
+ return x * as_float(y);
+ }
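+
+    // Editor's note: (y + 127) << 23 assembles the IEEE-754 bit pattern of 2^y directly in the
+    // exponent field, so bitShiftLdexp(x, y) == x * 2^y as long as 2^y stays a normal float
+    // (roughly -126 <= y <= 127); e.g. bitShiftLdexp(3.0f, 2) == 12.0f.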
+
+GRL_NAMESPACE_END(GRL) \ No newline at end of file
diff --git a/src/intel/vulkan/grl/include/affinespace.h b/src/intel/vulkan/grl/include/affinespace.h
new file mode 100644
index 00000000000..36ebae0ede6
--- /dev/null
+++ b/src/intel/vulkan/grl/include/affinespace.h
@@ -0,0 +1,192 @@
+//
+// Copyright (C) 2009-2021 Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+//
+//
+
+#pragma once
+
+#include "GRLRTASCommon.h"
+
+GRL_NAMESPACE_BEGIN(GRL)
+GRL_NAMESPACE_BEGIN(RTAS)
+inline float3 GRL_OVERLOADABLE cross(const float3 a, const float3 b)
+{
+ float3 res = { a.y * b.z - a.z * b.y,
+ a.z * b.x - a.x * b.z,
+ a.x * b.y - a.y * b.x };
+ return res;
+}
+
+struct LinearSpace3f
+{
+ float3 vx;
+ float3 vy;
+ float3 vz;
+};
+
+/* compute the determinant of the matrix */
+GRL_INLINE struct LinearSpace3f LinearSpace3f_Constructor(const float3 vx, const float3 vy, const float3 vz)
+{
+ struct LinearSpace3f xfm;
+ xfm.vx = vx;
+ xfm.vy = vy;
+ xfm.vz = vz;
+ return xfm;
+}
+
+/* compute the determinant of the matrix */
+GRL_INLINE float LinearSpace3f_det(struct LinearSpace3f xfm)
+{
+ return dot(xfm.vx, cross(xfm.vy, xfm.vz));
+}
+
+/* compute transposed matrix */
+GRL_INLINE struct LinearSpace3f LinearSpace3f_transpose(struct LinearSpace3f in)
+{
+ float3 x = { in.vx.x, in.vy.x, in.vz.x };
+ float3 y = { in.vx.y, in.vy.y, in.vz.y };
+ float3 z = { in.vx.z, in.vy.z, in.vz.z };
+
+ return LinearSpace3f_Constructor(x,
+ y,
+ z);
+}
+
+/* compute adjoint matrix */
+GRL_INLINE const struct LinearSpace3f LinearSpace3f_adjoint(struct LinearSpace3f in)
+{
+ return LinearSpace3f_transpose(LinearSpace3f_Constructor(cross(in.vy, in.vz),
+ cross(in.vz, in.vx),
+ cross(in.vx, in.vy)));
+}
+
+/* compute inverse matrix */
+GRL_INLINE struct LinearSpace3f LinearSpace3f_invert(struct LinearSpace3f in)
+{
+ const float det = LinearSpace3f_det(in);
+ const struct LinearSpace3f adj = LinearSpace3f_adjoint(in);
+ return LinearSpace3f_Constructor(adj.vx / det, adj.vy / det, adj.vz / det);
+}
+
+GRL_INLINE float3 GRL_OVERLOADABLE xfmPoint(struct LinearSpace3f xfm, float3 p)
+{
+ return xfm.vx * p.x + xfm.vy * p.y + xfm.vz * p.z;
+}
+
+struct AffineSpace3f
+{
+ struct LinearSpace3f l;
+ float3 p;
+};
+
+GRL_INLINE struct AffineSpace3f AffineSpace3f_Constructor(struct LinearSpace3f l, float3 p)
+{
+ struct AffineSpace3f out;
+ out.l = l;
+ out.p = p;
+ return out;
+}
+
+GRL_INLINE struct AffineSpace3f AffineSpace3f_load_row_major(const float *in)
+{
+ struct AffineSpace3f out;
+ out.l.vx.x = in[0];
+ out.l.vx.y = in[4];
+ out.l.vx.z = in[8];
+ out.l.vy.x = in[1];
+ out.l.vy.y = in[5];
+ out.l.vy.z = in[9];
+ out.l.vz.x = in[2];
+ out.l.vz.y = in[6];
+ out.l.vz.z = in[10];
+ out.p.x = in[3];
+ out.p.y = in[7];
+ out.p.z = in[11];
+ return out;
+}
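+
+// Editor's note: the expected input is the same row-major 3x4 layout used by transform_aabb()
+// and HwInstanceLeaf_GetTransform(): { m00, m01, m02, tx,  m10, m11, m12, ty,  m20, m21, m22, tz },
+// with the translation stored in elements 3, 7 and 11.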
+
+// Squared ratio of the (half) surface of the oriented, transformed cube to that of the
+// axis-aligned box that would contain it.
+// The smaller this value, the more overhead the transformation produces.
+GRL_INLINE
+float transformation_bbox_surf_overhead(const float* Transform)
+{
+ // We use an abs-matrix to transform the AABB extent vector, which is enough to compute the area
+ // New AABB is center +- Extent.
+ //
+ // For derivation see:
+ // https://zeux.io/2010/10/17/aabb-from-obb-with-component-wise-abs/
+ //
+
+
+    // Take the unit cube and compare the surface of the AABB containing its
+    // transformed image against the surface of the oriented (transformed) box itself.
+ float ex = fabs(Transform[0]) + fabs(Transform[1]) + fabs(Transform[2]);
+ float ey = fabs(Transform[4]) + fabs(Transform[5]) + fabs(Transform[6]);
+ float ez = fabs(Transform[8]) + fabs(Transform[9]) + fabs(Transform[10]);
+
+ // we will compare squared sizes
+ ex = ex * ex;
+ ey = ey * ey;
+ ez = ez * ez;
+
+ // surface of aabb containing oriented box;
+ float aabb_sq_half_surf = ex * ey + ey * ez + ez * ex;
+
+ // ^2 lengths of transformed <1,0,0>, <0,1,0>, <0,0,1>
+ float obx = Transform[0] * Transform[0] + Transform[4] * Transform[4] + Transform[8] * Transform[8];
+ float oby = Transform[1] * Transform[1] + Transform[5] * Transform[5] + Transform[9] * Transform[9];
+ float obz = Transform[2] * Transform[2] + Transform[6] * Transform[6] + Transform[10] * Transform[10];
+
+ float obb_sq_half_surf = obx * oby + oby * obz + obz * obx;
+
+ return obb_sq_half_surf / aabb_sq_half_surf;
+
+    // Example: uniform scale by 2 (Transform = diag(2, 2, 2)):
+    //   ex = ey = ez = 2.0, squared: 4.0
+    //   aabb_sq_half_surf = 4*4 + 4*4 + 4*4 = 48
+    //   obx = oby = obz = 4.0
+    //   obb_sq_half_surf  = 4*4 + 4*4 + 4*4 = 48
+    //   ratio = 48 / 48 = 1.0, i.e. an axis-aligned scale adds no overhead.
+}
+
+GRL_INLINE void load_row_major_from_AffineSpace3f(struct AffineSpace3f in, float* out)
+{
+ out[0] = in.l.vx.x;
+ out[4] = in.l.vx.y;
+ out[8] = in.l.vx.z;
+ out[1] = in.l.vy.x;
+ out[5] = in.l.vy.y;
+ out[9] = in.l.vy.z;
+ out[2] = in.l.vz.x;
+ out[6] = in.l.vz.y;
+ out[10] = in.l.vz.z;
+
+ out[3] = in.p.x;
+ out[7] = in.p.y;
+ out[11] = in.p.z;
+}
+
+GRL_INLINE float3 GRL_OVERLOADABLE xfmPoint(struct AffineSpace3f xfm, float3 p)
+{
+ return xfmPoint(xfm.l, p) + xfm.p;
+}
+
+/* compute inverse matrix */
+GRL_INLINE struct AffineSpace3f AffineSpace3f_invert(struct AffineSpace3f in)
+{
+ const struct LinearSpace3f il = LinearSpace3f_invert(in.l);
+ float3 ip = -xfmPoint(il, in.p);
+ return AffineSpace3f_Constructor(il, ip);
+}
+
+GRL_NAMESPACE_END(RTAS)
+GRL_NAMESPACE_END(GRL)
diff --git a/src/intel/vulkan/grl/meson.build b/src/intel/vulkan/grl/meson.build
new file mode 100644
index 00000000000..61cb7aa8ea3
--- /dev/null
+++ b/src/intel/vulkan/grl/meson.build
@@ -0,0 +1,203 @@
+# Copyright © 2021 Intel Corporation
+
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+fs = import('fs')
+
+grl_lib_files = [
+ 'gpu/libs/libraries.grl',
+]
+
+grl_grl_files = [
+ 'gpu/build_leaf.grl',
+ 'gpu/build_primref.grl',
+# 'gpu/build_refit.grl',
+ 'gpu/copy.grl',
+# 'gpu/grl_api_interface_verify.grl',
+ 'gpu/misc.grl',
+# 'gpu/morton_builder.grl',
+# 'gpu/msb_radix_bitonic_sort.grl',
+ 'gpu/new_sah_builder.grl',
+ 'gpu/postbuild_info.grl',
+# 'gpu/presplit.grl',
+# 'gpu/radix_sort.grl',
+# 'gpu/rebraid.grl',
+# 'gpu/traversal_shader.grl',
+]
+
+grl_lib_args = []
+foreach libfile : grl_lib_files
+ grl_lib_args += '--library'
+ grl_lib_args += files(libfile)
+endforeach
+
+grl_genX_files = [
+ 'genX_grl_dispatch.c',
+ 'genX_grl_uuid.cpp',
+]
+
+grl_cl_kernel_h = custom_target(
+ 'grl_cl_kernel.h',
+ input : ['grl_cl_kernel_gen.py', grl_grl_files, grl_lib_files],
+ output : 'grl_cl_kernel.h',
+ command : [
+ prog_python, '@INPUT0@', '--out-h', '@OUTPUT@',
+ grl_lib_args, files(grl_grl_files),
+ ],
+)
+
+has_ply = run_command(
+ prog_python, '-c',
+ '''
+import ply
+ ''', check : false)
+if has_ply.returncode() != 0
+ error('Python (3.x) ply module required to build GRL kernels.')
+endif
+
+r = run_command(prog_python, 'grl_cl_kernel_gen.py',
+ grl_lib_args, '--ls-kernels', grl_grl_files, check : false)
+assert(r.returncode() == 0, 'Failed to fetch GRL CL kernels')
+grl_kernels = r.stdout().strip().split()
+
+grl_metakernel_c = []
+grl_metakernel_h = []
+foreach grl_file : grl_grl_files
+ base_outfile = 'grl_metakernel_' + fs.replace_suffix(fs.name(grl_file), '')
+ outfiles = custom_target(
+ base_outfile,
+ input : ['grl_metakernel_gen.py', grl_file, grl_lib_files],
+ output : [base_outfile + '.h', base_outfile + '.c'],
+ command : [
+ prog_python, '@INPUT0@', '--out-h', '@OUTPUT0@',
+ '--out-c', '@OUTPUT1@', grl_lib_args, '@INPUT1@',
+ ],
+ )
+ grl_metakernel_h += outfiles[0]
+ grl_metakernel_c += outfiles[1]
+endforeach
+
+grl_genX_libs = []
+foreach t : [['125', 'gfx125', 'dg2']]
+ verX10 = t[0]
+ genX_prefix = t[1]
+ platform = t[2]
+
+ grl_compiled_cl_kernels = []
+ foreach k : grl_kernels
+ # get_cl_files dumps out filename:entrypoint:libfile1,libfile2,libfile3
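+    # e.g. 'gpu/copy.cl:copy_kernel:gpu/libs/libraries.grl' (names illustrative)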
+ cl_file = k.split(':')[0]
+ entrypoint = k.split(':')[1]
+ library_files = k.split(':')[2]
+ kernel_prefix = '_'.join([
+ genX_prefix,
+ fs.replace_suffix(cl_file, '').replace('gpu/', '').replace('/', '_'),
+ entrypoint
+ ])
+ input_args = [ files(cl_file), ]
+ if library_files != ''
+ foreach lib_file : library_files.split(',')
+ input_args += [ lib_file ]
+ endforeach
+ endif
+ prepended_input_args = []
+ foreach input_arg : input_args
+ prepended_input_args += ['--in', input_arg]
+ endforeach
+ outfile = kernel_prefix + '.h'
+ grl_compiled_cl_kernels += custom_target(
+ outfile,
+ input : cl_file,
+ output : outfile,
+ command : [
+ prog_intel_clc, '-p', platform, '--prefix', kernel_prefix,
+ '-e', entrypoint, prepended_input_args, '-o', '@OUTPUT@', '--',
+ '-cl-std=cl2.0', '-D__OPENCL_VERSION__=200',
+ '-DMAX_HW_SIMD_WIDTH=16', '-DMAX_WORKGROUP_SIZE=16',
+ '-I' + join_paths(meson.current_source_dir(), 'gpu'),
+ '-I' + join_paths(meson.current_source_dir(), 'include'),
+ ],
+ env: ['MESA_SHADER_CACHE_DISABLE=true',
+ 'MESA_SPIRV_LOG_LEVEL=error'],
+ depends : dep_prog_intel_clc
+ )
+ endforeach
+
+ grl_cl_kernel_c = custom_target(
+ 'grl_@0@_cl_kernel.c'.format(genX_prefix),
+ input : ['grl_cl_kernel_gen.py', grl_grl_files, grl_lib_files],
+ output : 'grl_@0@_cl_kernel.c'.format(genX_prefix),
+ command : [
+ prog_python, '@INPUT0@', '--out-c', '@OUTPUT@',
+ grl_lib_args, '--prefix', genX_prefix, files(grl_grl_files),
+ ],
+ )
+
+ grl_genX_libs += static_library(
+ 'grl_@0@'.format(genX_prefix),
+ [grl_cl_kernel_h, grl_compiled_cl_kernels, grl_cl_kernel_c,
+ grl_genX_files, grl_metakernel_c, grl_metakernel_h],
+ include_directories : [
+ inc_include, inc_src,
+ inc_intel,
+ ],
+ c_args : [
+ no_override_init_args, sse2_args,
+ '-DGFX_VERx10=@0@'.format(verX10),
+ ],
+ cpp_args : [
+ sse2_args,
+ '-DGFX_VERx10=@0@'.format(verX10),
+ ],
+ dependencies : [
+ dep_valgrind, idep_nir_headers, idep_vulkan_util_headers, idep_vulkan_wsi_headers,
+ idep_vulkan_runtime_headers, idep_anv_headers, idep_genxml,
+ ],
+ gnu_symbol_visibility : 'hidden',
+ )
+endforeach
+
+libgrl_deps = [
+ dep_valgrind,
+ idep_nir_headers,
+ idep_vulkan_util_headers,
+ idep_vulkan_wsi_headers,
+]
+
+libgrl = static_library(
+ 'grl',
+ [grl_cl_kernel_h],
+ include_directories : [
+ inc_include, inc_src, inc_intel,
+ ],
+ link_whole : [grl_genX_libs],
+ dependencies : [libgrl_deps, idep_anv_headers],
+)
+idep_grl = declare_dependency(
+ link_with : libgrl,
+ dependencies : libgrl_deps,
+ sources : [grl_metakernel_h, grl_cl_kernel_h],
+ include_directories : include_directories('include', 'gpu'),
+)
diff --git a/src/intel/vulkan/i915/anv_batch_chain.c b/src/intel/vulkan/i915/anv_batch_chain.c
new file mode 100644
index 00000000000..dd3d40bf13f
--- /dev/null
+++ b/src/intel/vulkan/i915/anv_batch_chain.c
@@ -0,0 +1,1107 @@
+/*
+ * Copyright © 2022 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "i915/anv_batch_chain.h"
+#include "anv_private.h"
+#include "anv_measure.h"
+
+#include "perf/intel_perf.h"
+#include "util/u_debug.h"
+
+#include "drm-uapi/i915_drm.h"
+
+struct anv_execbuf {
+ struct drm_i915_gem_execbuffer2 execbuf;
+
+ struct drm_i915_gem_execbuffer_ext_timeline_fences timeline_fences;
+
+ struct drm_i915_gem_exec_object2 * objects;
+ uint32_t bo_count;
+ uint32_t bo_array_length;
+ struct anv_bo ** bos;
+
+ uint32_t syncobj_count;
+ uint32_t syncobj_array_length;
+ struct drm_i915_gem_exec_fence * syncobjs;
+ uint64_t * syncobj_values;
+
+ uint32_t cmd_buffer_count;
+ struct anv_query_pool *perf_query_pool;
+
+ const VkAllocationCallbacks * alloc;
+ VkSystemAllocationScope alloc_scope;
+
+ int perf_query_pass;
+};
+
+static void
+anv_execbuf_finish(struct anv_execbuf *exec)
+{
+ vk_free(exec->alloc, exec->syncobjs);
+ vk_free(exec->alloc, exec->syncobj_values);
+ vk_free(exec->alloc, exec->objects);
+ vk_free(exec->alloc, exec->bos);
+}
+
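+/* Append an i915_user_extension to the execbuf's extension chain. With
+ * I915_EXEC_USE_EXTENSIONS set, the kernel reuses the legacy cliprects_ptr
+ * field as the head of that linked list.
+ */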
+static void
+anv_execbuf_add_ext(struct anv_execbuf *exec,
+ uint32_t ext_name,
+ struct i915_user_extension *ext)
+{
+ __u64 *iter = &exec->execbuf.cliprects_ptr;
+
+ exec->execbuf.flags |= I915_EXEC_USE_EXTENSIONS;
+
+ while (*iter != 0) {
+ iter = (__u64 *) &((struct i915_user_extension *)(uintptr_t)*iter)->next_extension;
+ }
+
+ ext->name = ext_name;
+
+ *iter = (uintptr_t) ext;
+}
+
+static VkResult
+anv_execbuf_add_bo_bitset(struct anv_device *device,
+ struct anv_execbuf *exec,
+ uint32_t dep_words,
+ BITSET_WORD *deps,
+ uint32_t extra_flags);
+
+static VkResult
+anv_execbuf_add_bo(struct anv_device *device,
+ struct anv_execbuf *exec,
+ struct anv_bo *bo,
+ struct anv_reloc_list *relocs,
+ uint32_t extra_flags)
+{
+ struct drm_i915_gem_exec_object2 *obj = NULL;
+
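+   /* bo->exec_obj_index may be stale (left over from a previous execbuf), so
+    * only reuse the slot if it actually points back at this BO.
+    */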
+ if (bo->exec_obj_index < exec->bo_count &&
+ exec->bos[bo->exec_obj_index] == bo)
+ obj = &exec->objects[bo->exec_obj_index];
+
+ if (obj == NULL) {
+ /* We've never seen this one before. Add it to the list and assign
+ * an id that we can use later.
+ */
+ if (exec->bo_count >= exec->bo_array_length) {
+ uint32_t new_len = exec->objects ? exec->bo_array_length * 2 : 64;
+
+ struct drm_i915_gem_exec_object2 *new_objects =
+ vk_realloc(exec->alloc, exec->objects,
+ new_len * sizeof(*new_objects), 8, exec->alloc_scope);
+ if (new_objects == NULL)
+ return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
+
+ exec->objects = new_objects;
+
+ struct anv_bo **new_bos =
+ vk_realloc(exec->alloc, exec->bos, new_len * sizeof(*new_bos), 8,
+ exec->alloc_scope);
+ if (new_bos == NULL)
+ return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
+
+ exec->bos = new_bos;
+ exec->bo_array_length = new_len;
+ }
+
+ assert(exec->bo_count < exec->bo_array_length);
+
+ bo->exec_obj_index = exec->bo_count++;
+ obj = &exec->objects[bo->exec_obj_index];
+ exec->bos[bo->exec_obj_index] = bo;
+
+ obj->handle = bo->gem_handle;
+ obj->relocation_count = 0;
+ obj->relocs_ptr = 0;
+ obj->alignment = 0;
+ obj->offset = bo->offset;
+ obj->flags = bo->flags | extra_flags;
+ obj->rsvd1 = 0;
+ obj->rsvd2 = 0;
+ }
+
+ if (extra_flags & EXEC_OBJECT_WRITE) {
+ obj->flags |= EXEC_OBJECT_WRITE;
+ obj->flags &= ~EXEC_OBJECT_ASYNC;
+ }
+
+ if (relocs != NULL) {
+ return anv_execbuf_add_bo_bitset(device, exec, relocs->dep_words,
+ relocs->deps, extra_flags);
+ }
+
+ return VK_SUCCESS;
+}
+
+/* Add BO dependencies to execbuf */
+static VkResult
+anv_execbuf_add_bo_bitset(struct anv_device *device,
+ struct anv_execbuf *exec,
+ uint32_t dep_words,
+ BITSET_WORD *deps,
+ uint32_t extra_flags)
+{
+ for (uint32_t w = 0; w < dep_words; w++) {
+ BITSET_WORD mask = deps[w];
+ while (mask) {
+ int i = u_bit_scan(&mask);
+ uint32_t gem_handle = w * BITSET_WORDBITS + i;
+ struct anv_bo *bo = anv_device_lookup_bo(device, gem_handle);
+ assert(bo->refcount > 0);
+ VkResult result =
+ anv_execbuf_add_bo(device, exec, bo, NULL, extra_flags);
+ if (result != VK_SUCCESS)
+ return result;
+ }
+ }
+
+ return VK_SUCCESS;
+}
+
+static VkResult
+anv_execbuf_add_syncobj(struct anv_device *device,
+ struct anv_execbuf *exec,
+ uint32_t syncobj,
+ uint32_t flags,
+ uint64_t timeline_value)
+{
+ if (exec->syncobj_count >= exec->syncobj_array_length) {
+ uint32_t new_len = MAX2(exec->syncobj_array_length * 2, 16);
+
+ struct drm_i915_gem_exec_fence *new_syncobjs =
+ vk_realloc(exec->alloc, exec->syncobjs,
+ new_len * sizeof(*new_syncobjs), 8, exec->alloc_scope);
+ if (new_syncobjs == NULL)
+ return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
+
+ exec->syncobjs = new_syncobjs;
+
+ if (exec->syncobj_values) {
+ uint64_t *new_syncobj_values =
+ vk_realloc(exec->alloc, exec->syncobj_values,
+ new_len * sizeof(*new_syncobj_values), 8,
+ exec->alloc_scope);
+ if (new_syncobj_values == NULL)
+ return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
+
+ exec->syncobj_values = new_syncobj_values;
+ }
+
+ exec->syncobj_array_length = new_len;
+ }
+
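+   /* Lazily allocate the values array the first time a timeline point shows
+    * up; submissions with only binary syncobjs never need it.
+    */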
+ if (timeline_value && !exec->syncobj_values) {
+ exec->syncobj_values =
+ vk_zalloc(exec->alloc, exec->syncobj_array_length *
+ sizeof(*exec->syncobj_values),
+ 8, exec->alloc_scope);
+ if (!exec->syncobj_values)
+ return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
+ }
+
+ exec->syncobjs[exec->syncobj_count] = (struct drm_i915_gem_exec_fence) {
+ .handle = syncobj,
+ .flags = flags,
+ };
+ if (exec->syncobj_values)
+ exec->syncobj_values[exec->syncobj_count] = timeline_value;
+
+ exec->syncobj_count++;
+
+ return VK_SUCCESS;
+}
+
+static VkResult
+anv_execbuf_add_sync(struct anv_device *device,
+ struct anv_execbuf *execbuf,
+ struct vk_sync *sync,
+ bool is_signal,
+ uint64_t value)
+{
+ /* It's illegal to signal a timeline with value 0 because that's never
+ * higher than the current value. A timeline wait on value 0 is always
+ * trivial because 0 <= uint64_t always.
+ */
+ if ((sync->flags & VK_SYNC_IS_TIMELINE) && value == 0)
+ return VK_SUCCESS;
+
+ if (vk_sync_is_anv_bo_sync(sync)) {
+ struct anv_bo_sync *bo_sync =
+ container_of(sync, struct anv_bo_sync, sync);
+
+ assert(is_signal == (bo_sync->state == ANV_BO_SYNC_STATE_RESET));
+
+ return anv_execbuf_add_bo(device, execbuf, bo_sync->bo, NULL,
+ is_signal ? EXEC_OBJECT_WRITE : 0);
+ } else if (vk_sync_type_is_drm_syncobj(sync->type)) {
+ struct vk_drm_syncobj *syncobj = vk_sync_as_drm_syncobj(sync);
+
+ if (!(sync->flags & VK_SYNC_IS_TIMELINE))
+ value = 0;
+
+ return anv_execbuf_add_syncobj(device, execbuf, syncobj->syncobj,
+ is_signal ? I915_EXEC_FENCE_SIGNAL :
+ I915_EXEC_FENCE_WAIT,
+ value);
+ }
+
+ unreachable("Invalid sync type");
+}
+
+static VkResult
+setup_execbuf_for_cmd_buffer(struct anv_execbuf *execbuf,
+ struct anv_cmd_buffer *cmd_buffer)
+{
+ VkResult result;
+ /* Add surface dependencies (BOs) to the execbuf */
+ result = anv_execbuf_add_bo_bitset(cmd_buffer->device, execbuf,
+ cmd_buffer->surface_relocs.dep_words,
+ cmd_buffer->surface_relocs.deps, 0);
+ if (result != VK_SUCCESS)
+ return result;
+
+ /* First, we walk over all of the bos we've seen and add them and their
+ * relocations to the validate list.
+ */
+ struct anv_batch_bo **bbo;
+ u_vector_foreach(bbo, &cmd_buffer->seen_bbos) {
+ result = anv_execbuf_add_bo(cmd_buffer->device, execbuf,
+ (*bbo)->bo, &(*bbo)->relocs, 0);
+ if (result != VK_SUCCESS)
+ return result;
+ }
+
+ struct anv_bo **bo_entry;
+ u_vector_foreach(bo_entry, &cmd_buffer->dynamic_bos) {
+ result = anv_execbuf_add_bo(cmd_buffer->device, execbuf,
+ *bo_entry, NULL, 0);
+ if (result != VK_SUCCESS)
+ return result;
+ }
+
+ return VK_SUCCESS;
+}
+
+static VkResult
+pin_state_pool(struct anv_device *device,
+ struct anv_execbuf *execbuf,
+ struct anv_state_pool *pool)
+{
+ anv_block_pool_foreach_bo(bo, &pool->block_pool) {
+ VkResult result = anv_execbuf_add_bo(device, execbuf, bo, NULL, 0);
+ if (result != VK_SUCCESS)
+ return result;
+ }
+
+ return VK_SUCCESS;
+}
+
+static void
+get_context_and_exec_flags(struct anv_queue *queue,
+ bool is_companion_rcs_batch,
+ uint64_t *exec_flags,
+ uint32_t *context_id)
+{
+ assert(queue != NULL);
+
+ struct anv_device *device = queue->device;
+
+   /* Submit batches to index 0, which is the main virtual engine */
+ *exec_flags = device->physical->has_vm_control ? 0 : queue->exec_flags;
+
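+   /* With VM control each queue owns its own kernel context (and optionally a
+    * companion RCS context); without it, every queue shares the single
+    * device-wide context.
+    */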
+ *context_id = device->physical->has_vm_control ?
+ is_companion_rcs_batch ?
+ queue->companion_rcs_id :
+ queue->context_id :
+ device->context_id;
+}
+
+static VkResult
+anv_execbuf_add_trtt_bos(struct anv_device *device,
+ struct anv_execbuf *execbuf)
+{
+ struct anv_trtt *trtt = &device->trtt;
+ VkResult result = VK_SUCCESS;
+
+ /* If l3_addr is zero we're not using TR-TT, there's no bo to add. */
+ if (!trtt->l3_addr)
+ return VK_SUCCESS;
+
+ pthread_mutex_lock(&trtt->mutex);
+
+ for (int i = 0; i < trtt->num_page_table_bos; i++) {
+ result = anv_execbuf_add_bo(device, execbuf, trtt->page_table_bos[i],
+ NULL, 0);
+ if (result != VK_SUCCESS)
+ goto out;
+ }
+
+out:
+ pthread_mutex_unlock(&trtt->mutex);
+ return result;
+}
+
+static VkResult
+setup_execbuf_for_cmd_buffers(struct anv_execbuf *execbuf,
+ struct anv_queue *queue,
+ struct anv_cmd_buffer **cmd_buffers,
+ uint32_t num_cmd_buffers)
+{
+ struct anv_device *device = queue->device;
+ VkResult result;
+
+ if (unlikely(device->physical->measure_device.config)) {
+ for (uint32_t i = 0; i < num_cmd_buffers; i++)
+ anv_measure_submit(cmd_buffers[i]);
+ }
+
+ /* Edit the tail of the command buffers to chain them all together if they
+ * can be.
+ */
+ anv_cmd_buffer_chain_command_buffers(cmd_buffers, num_cmd_buffers);
+
+ for (uint32_t i = 0; i < num_cmd_buffers; i++) {
+ result = setup_execbuf_for_cmd_buffer(execbuf, cmd_buffers[i]);
+ if (result != VK_SUCCESS)
+ return result;
+ }
+
+ /* Add all the global BOs to the object list for softpin case. */
+ result = pin_state_pool(device, execbuf, &device->scratch_surface_state_pool);
+ if (result != VK_SUCCESS)
+ return result;
+
+ if (device->physical->va.bindless_surface_state_pool.size > 0) {
+ result = pin_state_pool(device, execbuf, &device->bindless_surface_state_pool);
+ if (result != VK_SUCCESS)
+ return result;
+ }
+
+ if (device->physical->va.indirect_push_descriptor_pool.size > 0) {
+ result = pin_state_pool(device, execbuf, &device->indirect_push_descriptor_pool);
+ if (result != VK_SUCCESS)
+ return result;
+ }
+
+ result = pin_state_pool(device, execbuf, &device->internal_surface_state_pool);
+ if (result != VK_SUCCESS)
+ return result;
+
+ result = pin_state_pool(device, execbuf, &device->dynamic_state_pool);
+ if (result != VK_SUCCESS)
+ return result;
+
+ if (device->vk.enabled_extensions.EXT_descriptor_buffer) {
+ result = pin_state_pool(device, execbuf, &device->dynamic_state_db_pool);
+ if (result != VK_SUCCESS)
+ return result;
+ }
+
+ result = pin_state_pool(device, execbuf, &device->general_state_pool);
+ if (result != VK_SUCCESS)
+ return result;
+
+ result = pin_state_pool(device, execbuf, &device->instruction_state_pool);
+ if (result != VK_SUCCESS)
+ return result;
+
+ result = pin_state_pool(device, execbuf, &device->binding_table_pool);
+ if (result != VK_SUCCESS)
+ return result;
+
+ if (device->physical->va.aux_tt_pool.size > 0) {
+ result = pin_state_pool(device, execbuf, &device->aux_tt_pool);
+ if (result != VK_SUCCESS)
+ return result;
+ }
+
+ if (device->physical->va.push_descriptor_buffer_pool.size > 0) {
+ result = pin_state_pool(device, execbuf, &device->push_descriptor_buffer_pool);
+ if (result != VK_SUCCESS)
+ return result;
+ }
+
+   /* Add the BOs for all user-allocated memory objects because we can't
+    * track them after the binding updates of VK_EXT_descriptor_indexing and
+    * due to how sparse resources work.
+ */
+ list_for_each_entry(struct anv_device_memory, mem,
+ &device->memory_objects, link) {
+ result = anv_execbuf_add_bo(device, execbuf, mem->bo, NULL, 0);
+ if (result != VK_SUCCESS)
+ return result;
+ }
+
+ result = anv_execbuf_add_trtt_bos(device, execbuf);
+ if (result != VK_SUCCESS)
+ return result;
+
+   /* Add all the private BOs from images because we can't track them after
+    * the binding updates of VK_EXT_descriptor_indexing.
+ */
+ list_for_each_entry(struct anv_image, image,
+ &device->image_private_objects, link) {
+ struct anv_bo *private_bo =
+ image->bindings[ANV_IMAGE_MEMORY_BINDING_PRIVATE].address.bo;
+ result = anv_execbuf_add_bo(device, execbuf, private_bo, NULL, 0);
+ if (result != VK_SUCCESS)
+ return result;
+ }
+
+ struct list_head *batch_bo = &cmd_buffers[0]->batch_bos;
+ struct anv_batch_bo *first_batch_bo =
+ list_first_entry(batch_bo, struct anv_batch_bo, link);
+
+ /* The kernel requires that the last entry in the validation list be the
+ * batch buffer to execute. We can simply swap the element
+ * corresponding to the first batch_bo in the chain with the last
+ * element in the list.
+ */
+ if (first_batch_bo->bo->exec_obj_index != execbuf->bo_count - 1) {
+ uint32_t idx = first_batch_bo->bo->exec_obj_index;
+ uint32_t last_idx = execbuf->bo_count - 1;
+
+ struct drm_i915_gem_exec_object2 tmp_obj = execbuf->objects[idx];
+ assert(execbuf->bos[idx] == first_batch_bo->bo);
+
+ execbuf->objects[idx] = execbuf->objects[last_idx];
+ execbuf->bos[idx] = execbuf->bos[last_idx];
+ execbuf->bos[idx]->exec_obj_index = idx;
+
+ execbuf->objects[last_idx] = tmp_obj;
+ execbuf->bos[last_idx] = first_batch_bo->bo;
+ first_batch_bo->bo->exec_obj_index = last_idx;
+ }
+
+#ifdef SUPPORT_INTEL_INTEGRATED_GPUS
+ if (device->physical->memory.need_flush &&
+ anv_bo_needs_host_cache_flush(device->batch_bo_pool.bo_alloc_flags))
+ anv_cmd_buffer_clflush(cmd_buffers, num_cmd_buffers);
+#endif
+
+ assert(!cmd_buffers[0]->is_companion_rcs_cmd_buffer || device->physical->has_vm_control);
+ uint64_t exec_flags = 0;
+ uint32_t context_id;
+ get_context_and_exec_flags(queue, cmd_buffers[0]->is_companion_rcs_cmd_buffer,
+ &exec_flags, &context_id);
+
+ execbuf->execbuf = (struct drm_i915_gem_execbuffer2) {
+ .buffers_ptr = (uintptr_t) execbuf->objects,
+ .buffer_count = execbuf->bo_count,
+ .batch_start_offset = 0,
+ .batch_len = 0,
+ .cliprects_ptr = 0,
+ .num_cliprects = 0,
+ .DR1 = 0,
+ .DR4 = 0,
+ .flags = I915_EXEC_NO_RELOC |
+ I915_EXEC_HANDLE_LUT |
+ exec_flags,
+ .rsvd1 = context_id,
+ .rsvd2 = 0,
+ };
+
+ return VK_SUCCESS;
+}
+
+static VkResult
+setup_empty_execbuf(struct anv_execbuf *execbuf, struct anv_queue *queue)
+{
+ struct anv_device *device = queue->device;
+ VkResult result = anv_execbuf_add_bo(device, execbuf,
+ device->trivial_batch_bo,
+ NULL, 0);
+ if (result != VK_SUCCESS)
+ return result;
+
+ uint64_t exec_flags = 0;
+ uint32_t context_id;
+ get_context_and_exec_flags(queue, false, &exec_flags, &context_id);
+
+ execbuf->execbuf = (struct drm_i915_gem_execbuffer2) {
+ .buffers_ptr = (uintptr_t) execbuf->objects,
+ .buffer_count = execbuf->bo_count,
+ .batch_start_offset = 0,
+ .batch_len = 8, /* GFX7_MI_BATCH_BUFFER_END and NOOP */
+ .flags = I915_EXEC_HANDLE_LUT | exec_flags | I915_EXEC_NO_RELOC,
+ .rsvd1 = context_id,
+ .rsvd2 = 0,
+ };
+
+ return VK_SUCCESS;
+}
+
+static VkResult
+setup_utrace_execbuf(struct anv_execbuf *execbuf, struct anv_queue *queue,
+ struct anv_utrace_submit *submit)
+{
+ struct anv_device *device = queue->device;
+
+ /* Always add the workaround BO as it includes a driver identifier for the
+ * error_state.
+ */
+ VkResult result = anv_execbuf_add_bo(device, execbuf,
+ device->workaround_bo,
+ NULL, 0);
+ if (result != VK_SUCCESS)
+ return result;
+
+ util_dynarray_foreach(&submit->batch_bos, struct anv_bo *, _bo) {
+ struct anv_bo *bo = *_bo;
+
+ result = anv_execbuf_add_bo(device, execbuf, bo,
+ &submit->relocs, 0);
+ if (result != VK_SUCCESS)
+ return result;
+
+#ifdef SUPPORT_INTEL_INTEGRATED_GPUS
+ if (device->physical->memory.need_flush &&
+ anv_bo_needs_host_cache_flush(bo->alloc_flags))
+ intel_flush_range(bo->map, bo->size);
+#endif
+ }
+
+ result = anv_execbuf_add_sync(device, execbuf, submit->sync,
+ true /* is_signal */, 0 /* value */);
+ if (result != VK_SUCCESS)
+ return result;
+
+ struct anv_bo *batch_bo =
+ *util_dynarray_element(&submit->batch_bos, struct anv_bo *, 0);
+ if (batch_bo->exec_obj_index != execbuf->bo_count - 1) {
+ uint32_t idx = batch_bo->exec_obj_index;
+ uint32_t last_idx = execbuf->bo_count - 1;
+
+ struct drm_i915_gem_exec_object2 tmp_obj = execbuf->objects[idx];
+ assert(execbuf->bos[idx] == batch_bo);
+
+ execbuf->objects[idx] = execbuf->objects[last_idx];
+ execbuf->bos[idx] = execbuf->bos[last_idx];
+ execbuf->bos[idx]->exec_obj_index = idx;
+
+ execbuf->objects[last_idx] = tmp_obj;
+ execbuf->bos[last_idx] = batch_bo;
+ batch_bo->exec_obj_index = last_idx;
+ }
+
+ uint64_t exec_flags = 0;
+ uint32_t context_id;
+ get_context_and_exec_flags(queue, false, &exec_flags, &context_id);
+
+ execbuf->execbuf = (struct drm_i915_gem_execbuffer2) {
+ .buffers_ptr = (uintptr_t) execbuf->objects,
+ .buffer_count = execbuf->bo_count,
+ .batch_start_offset = 0,
+ .batch_len = submit->batch.next - submit->batch.start,
+ .flags = I915_EXEC_NO_RELOC |
+ I915_EXEC_HANDLE_LUT |
+ I915_EXEC_FENCE_ARRAY |
+ exec_flags,
+ .rsvd1 = context_id,
+ .rsvd2 = 0,
+ .num_cliprects = execbuf->syncobj_count,
+ .cliprects_ptr = (uintptr_t)execbuf->syncobjs,
+ };
+
+ return VK_SUCCESS;
+}
+
+static int
+anv_gem_execbuffer(struct anv_device *device,
+ struct drm_i915_gem_execbuffer2 *execbuf)
+{
+ int ret;
+ const unsigned long request = (execbuf->flags & I915_EXEC_FENCE_OUT) ?
+ DRM_IOCTL_I915_GEM_EXECBUFFER2_WR :
+ DRM_IOCTL_I915_GEM_EXECBUFFER2;
+
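+   /* The kernel can transiently fail the submission with ENOMEM; retry until
+    * it either succeeds or fails with a different errno.
+    */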
+ do {
+ ret = intel_ioctl(device->fd, request, execbuf);
+ } while (ret && errno == ENOMEM);
+
+ return ret;
+}
+
+static VkResult
+anv_queue_exec_utrace_locked(struct anv_queue *queue,
+ struct anv_utrace_submit *submit)
+{
+ assert(util_dynarray_num_elements(&submit->batch_bos,
+ struct anv_bo *) > 0);
+
+ struct anv_device *device = queue->device;
+ struct anv_execbuf execbuf = {
+ .alloc = &device->vk.alloc,
+ .alloc_scope = VK_SYSTEM_ALLOCATION_SCOPE_DEVICE,
+ };
+
+ VkResult result = setup_utrace_execbuf(&execbuf, queue, submit);
+ if (result != VK_SUCCESS)
+ goto error;
+
+ ANV_RMV(bos_gtt_map, device, execbuf.bos, execbuf.bo_count);
+
+ int ret = queue->device->info->no_hw ? 0 :
+ anv_gem_execbuffer(queue->device, &execbuf.execbuf);
+ if (ret)
+ result = vk_queue_set_lost(&queue->vk, "execbuf2 failed: %m");
+
+ error:
+ anv_execbuf_finish(&execbuf);
+
+ return result;
+}
+
+static void
+anv_i915_debug_submit(const struct anv_execbuf *execbuf)
+{
+ uint32_t total_size_kb = 0, total_vram_only_size_kb = 0;
+ for (uint32_t i = 0; i < execbuf->bo_count; i++) {
+ const struct anv_bo *bo = execbuf->bos[i];
+ total_size_kb += bo->size / 1024;
+ if (anv_bo_is_vram_only(bo))
+ total_vram_only_size_kb += bo->size / 1024;
+ }
+
+ fprintf(stderr, "Batch offset=0x%x len=0x%x on queue 0 (aperture: %.1fMb, %.1fMb VRAM only)\n",
+ execbuf->execbuf.batch_start_offset, execbuf->execbuf.batch_len,
+ (float)total_size_kb / 1024.0f,
+ (float)total_vram_only_size_kb / 1024.0f);
+ for (uint32_t i = 0; i < execbuf->bo_count; i++) {
+ const struct anv_bo *bo = execbuf->bos[i];
+
+ fprintf(stderr, " BO: addr=0x%016"PRIx64"-0x%016"PRIx64" size=%7"PRIu64
+ "KB handle=%05u capture=%u vram_only=%u name=%s\n",
+ bo->offset, bo->offset + bo->size - 1, bo->size / 1024,
+ bo->gem_handle, (bo->flags & EXEC_OBJECT_CAPTURE) != 0,
+ anv_bo_is_vram_only(bo), bo->name);
+ }
+}
+
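+/* Attach the collected syncobjs to the execbuf: timeline points go through
+ * the DRM_I915_GEM_EXECBUFFER_EXT_TIMELINE_FENCES extension, while purely
+ * binary syncobjs use the legacy I915_EXEC_FENCE_ARRAY path (which reuses the
+ * cliprects fields).
+ */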
+static void
+setup_execbuf_fence_params(struct anv_execbuf *execbuf)
+{
+ if (execbuf->syncobj_values) {
+ execbuf->timeline_fences.fence_count = execbuf->syncobj_count;
+ execbuf->timeline_fences.handles_ptr = (uintptr_t)execbuf->syncobjs;
+ execbuf->timeline_fences.values_ptr = (uintptr_t)execbuf->syncobj_values;
+ anv_execbuf_add_ext(execbuf,
+ DRM_I915_GEM_EXECBUFFER_EXT_TIMELINE_FENCES,
+ &execbuf->timeline_fences.base);
+ } else if (execbuf->syncobjs) {
+ execbuf->execbuf.flags |= I915_EXEC_FENCE_ARRAY;
+ execbuf->execbuf.num_cliprects = execbuf->syncobj_count;
+ execbuf->execbuf.cliprects_ptr = (uintptr_t)execbuf->syncobjs;
+ }
+}
+
+static VkResult
+i915_companion_rcs_queue_exec_locked(struct anv_queue *queue,
+ struct anv_cmd_buffer *companion_rcs_cmd_buffer,
+ uint32_t wait_count,
+ const struct vk_sync_wait *waits)
+{
+ struct anv_device *device = queue->device;
+ struct anv_execbuf execbuf = {
+ .alloc = &queue->device->vk.alloc,
+ .alloc_scope = VK_SYSTEM_ALLOCATION_SCOPE_DEVICE,
+ };
+
+ /* Always add the workaround BO as it includes a driver identifier for the
+ * error_state.
+ */
+ VkResult result =
+ anv_execbuf_add_bo(device, &execbuf, device->workaround_bo, NULL, 0);
+ if (result != VK_SUCCESS)
+ goto error;
+
+ for (uint32_t i = 0; i < wait_count; i++) {
+ result = anv_execbuf_add_sync(device, &execbuf,
+ waits[i].sync,
+ false /* is_signal */,
+ waits[i].wait_value);
+ if (result != VK_SUCCESS)
+ goto error;
+ }
+
+ if (queue->companion_sync) {
+ result = anv_execbuf_add_sync(device, &execbuf,
+ queue->companion_sync,
+ true /* is_signal */, 0);
+ if (result != VK_SUCCESS)
+ goto error;
+ }
+
+ result = setup_execbuf_for_cmd_buffers(&execbuf, queue,
+ &companion_rcs_cmd_buffer, 1);
+ if (result != VK_SUCCESS)
+ goto error;
+
+ if (INTEL_DEBUG(DEBUG_SUBMIT))
+ anv_i915_debug_submit(&execbuf);
+
+ anv_cmd_buffer_exec_batch_debug(queue, 1, &companion_rcs_cmd_buffer, NULL, 0);
+
+ setup_execbuf_fence_params(&execbuf);
+
+ ANV_RMV(bos_gtt_map, device, execbuf.bos, execbuf.bo_count);
+
+ int ret = queue->device->info->no_hw ? 0 :
+ anv_gem_execbuffer(queue->device, &execbuf.execbuf);
+ if (ret) {
+ anv_i915_debug_submit(&execbuf);
+ result = vk_queue_set_lost(&queue->vk, "execbuf2 failed: %m");
+ }
+
+ error:
+ anv_execbuf_finish(&execbuf);
+ return result;
+}
+
+VkResult
+i915_queue_exec_locked(struct anv_queue *queue,
+ uint32_t wait_count,
+ const struct vk_sync_wait *waits,
+ uint32_t cmd_buffer_count,
+ struct anv_cmd_buffer **cmd_buffers,
+ uint32_t signal_count,
+ const struct vk_sync_signal *signals,
+ struct anv_query_pool *perf_query_pool,
+ uint32_t perf_query_pass,
+ struct anv_utrace_submit *utrace_submit)
+{
+ struct anv_device *device = queue->device;
+ struct anv_execbuf execbuf = {
+ .alloc = &queue->device->vk.alloc,
+ .alloc_scope = VK_SYSTEM_ALLOCATION_SCOPE_DEVICE,
+ .perf_query_pass = perf_query_pass,
+ };
+ VkResult result;
+
+ if (utrace_submit &&
+ util_dynarray_num_elements(&utrace_submit->batch_bos,
+ struct anv_bo *) == 0) {
+ result = anv_execbuf_add_sync(device, &execbuf,
+ utrace_submit->sync,
+ true /* is_signal */,
+ 0);
+ if (result != VK_SUCCESS)
+ goto error;
+
+      /* The utrace submission doesn't have its own batch buffer; its sync is
+       * signaled by this execbuf, so don't submit it separately below.
+       */
+ utrace_submit = NULL;
+ }
+
+ /* Always add the workaround BO as it includes a driver identifier for the
+ * error_state.
+ */
+ result =
+ anv_execbuf_add_bo(device, &execbuf, device->workaround_bo, NULL, 0);
+ if (result != VK_SUCCESS)
+ goto error;
+
+ for (uint32_t i = 0; i < wait_count; i++) {
+ result = anv_execbuf_add_sync(device, &execbuf,
+ waits[i].sync,
+ false /* is_signal */,
+ waits[i].wait_value);
+ if (result != VK_SUCCESS)
+ goto error;
+ }
+
+ for (uint32_t i = 0; i < signal_count; i++) {
+ result = anv_execbuf_add_sync(device, &execbuf,
+ signals[i].sync,
+ true /* is_signal */,
+ signals[i].signal_value);
+ if (result != VK_SUCCESS)
+ goto error;
+ }
+
+ if (queue->sync) {
+ result = anv_execbuf_add_sync(device, &execbuf,
+ queue->sync,
+ true /* is_signal */,
+ 0 /* signal_value */);
+ if (result != VK_SUCCESS)
+ goto error;
+ }
+
+ if (cmd_buffer_count) {
+ result = setup_execbuf_for_cmd_buffers(&execbuf, queue, cmd_buffers,
+ cmd_buffer_count);
+ } else {
+ result = setup_empty_execbuf(&execbuf, queue);
+ }
+
+ if (result != VK_SUCCESS)
+ goto error;
+
+ const bool has_perf_query =
+ perf_query_pool && perf_query_pass >= 0 && cmd_buffer_count;
+
+ if (INTEL_DEBUG(DEBUG_SUBMIT))
+ anv_i915_debug_submit(&execbuf);
+
+ anv_cmd_buffer_exec_batch_debug(queue, cmd_buffer_count, cmd_buffers,
+ perf_query_pool, perf_query_pass);
+
+ setup_execbuf_fence_params(&execbuf);
+
+ if (has_perf_query) {
+ assert(perf_query_pass < perf_query_pool->n_passes);
+ struct intel_perf_query_info *query_info =
+ perf_query_pool->pass_query[perf_query_pass];
+
+      /* Some performance queries only use the pipeline statistics HW; no OA
+       * is needed in that case, so there is no need to reconfigure.
+ */
+ if (!INTEL_DEBUG(DEBUG_NO_OACONFIG) &&
+ (query_info->kind == INTEL_PERF_QUERY_TYPE_OA ||
+ query_info->kind == INTEL_PERF_QUERY_TYPE_RAW)) {
+ int ret = intel_ioctl(device->perf_fd, I915_PERF_IOCTL_CONFIG,
+ (void *)(uintptr_t) query_info->oa_metrics_set_id);
+ if (ret < 0) {
+ result = vk_device_set_lost(&device->vk,
+ "i915-perf config failed: %s",
+ strerror(errno));
+ }
+ }
+
+ struct anv_bo *pass_batch_bo = perf_query_pool->bo;
+
+ struct drm_i915_gem_exec_object2 query_pass_object = {
+ .handle = pass_batch_bo->gem_handle,
+ .offset = pass_batch_bo->offset,
+ .flags = pass_batch_bo->flags,
+ };
+
+ uint64_t exec_flags = 0;
+ uint32_t context_id;
+ get_context_and_exec_flags(queue, false, &exec_flags, &context_id);
+
+ struct drm_i915_gem_execbuffer2 query_pass_execbuf = {
+ .buffers_ptr = (uintptr_t) &query_pass_object,
+ .buffer_count = 1,
+ .batch_start_offset = khr_perf_query_preamble_offset(perf_query_pool,
+ perf_query_pass),
+ .flags = I915_EXEC_HANDLE_LUT | exec_flags,
+ .rsvd1 = context_id,
+ };
+
+ int ret = queue->device->info->no_hw ? 0 :
+ anv_gem_execbuffer(queue->device, &query_pass_execbuf);
+ if (ret)
+ result = vk_queue_set_lost(&queue->vk, "execbuf2 failed: %m");
+ }
+
+ ANV_RMV(bos_gtt_map, device, execbuf.bos, execbuf.bo_count);
+
+ int ret = queue->device->info->no_hw ? 0 :
+ anv_gem_execbuffer(queue->device, &execbuf.execbuf);
+ if (ret) {
+ anv_i915_debug_submit(&execbuf);
+ result = vk_queue_set_lost(&queue->vk, "execbuf2 failed: %m");
+ }
+
+ if (cmd_buffer_count != 0 && cmd_buffers[0]->companion_rcs_cmd_buffer) {
+ struct anv_cmd_buffer *companion_rcs_cmd_buffer =
+ cmd_buffers[0]->companion_rcs_cmd_buffer;
+ assert(companion_rcs_cmd_buffer->is_companion_rcs_cmd_buffer);
+ assert(cmd_buffer_count == 1);
+ result = i915_companion_rcs_queue_exec_locked(queue,
+ cmd_buffers[0]->companion_rcs_cmd_buffer, wait_count,
+ waits);
+ }
+
+ result = anv_queue_post_submit(queue, result);
+
+ error:
+ anv_execbuf_finish(&execbuf);
+
+ if (result == VK_SUCCESS && utrace_submit)
+ result = anv_queue_exec_utrace_locked(queue, utrace_submit);
+
+ return result;
+}
+
+VkResult
+i915_execute_simple_batch(struct anv_queue *queue, struct anv_bo *batch_bo,
+ uint32_t batch_bo_size, bool is_companion_rcs_batch)
+{
+ struct anv_device *device = queue->device;
+ struct anv_execbuf execbuf = {
+ .alloc = &queue->device->vk.alloc,
+ .alloc_scope = VK_SYSTEM_ALLOCATION_SCOPE_DEVICE,
+ };
+
+ VkResult result = anv_execbuf_add_bo(device, &execbuf, batch_bo, NULL, 0);
+ if (result != VK_SUCCESS)
+ goto fail;
+
+ assert(!is_companion_rcs_batch || device->physical->has_vm_control);
+ uint64_t exec_flags = 0;
+ uint32_t context_id;
+ get_context_and_exec_flags(queue, is_companion_rcs_batch, &exec_flags,
+ &context_id);
+
+ execbuf.execbuf = (struct drm_i915_gem_execbuffer2) {
+ .buffers_ptr = (uintptr_t) execbuf.objects,
+ .buffer_count = execbuf.bo_count,
+ .batch_start_offset = 0,
+ .batch_len = batch_bo_size,
+ .flags = I915_EXEC_HANDLE_LUT | exec_flags | I915_EXEC_NO_RELOC,
+ .rsvd1 = context_id,
+ .rsvd2 = 0,
+ };
+
+ ANV_RMV(bos_gtt_map, device, execbuf.bos, execbuf.bo_count);
+
+ if (anv_gem_execbuffer(device, &execbuf.execbuf)) {
+ result = vk_device_set_lost(&device->vk, "anv_gem_execbuffer failed: %m");
+ goto fail;
+ }
+
+ result = anv_device_wait(device, batch_bo, INT64_MAX);
+ if (result != VK_SUCCESS)
+ result = vk_device_set_lost(&device->vk,
+ "anv_device_wait failed: %m");
+
+fail:
+ anv_execbuf_finish(&execbuf);
+ return result;
+}
+
+VkResult
+i915_execute_trtt_batch(struct anv_sparse_submission *submit,
+ struct anv_trtt_batch_bo *trtt_bbo)
+{
+ struct anv_queue *queue = submit->queue;
+ struct anv_device *device = queue->device;
+ struct anv_trtt *trtt = &device->trtt;
+ struct anv_execbuf execbuf = {
+ .alloc = &device->vk.alloc,
+ .alloc_scope = VK_SYSTEM_ALLOCATION_SCOPE_DEVICE,
+ };
+ VkResult result;
+
+ for (uint32_t i = 0; i < submit->wait_count; i++) {
+ result = anv_execbuf_add_sync(device, &execbuf, submit->waits[i].sync,
+ false /* is_signal */,
+ submit->waits[i].wait_value);
+ if (result != VK_SUCCESS)
+ goto out;
+ }
+
+ for (uint32_t i = 0; i < submit->signal_count; i++) {
+ result = anv_execbuf_add_sync(device, &execbuf, submit->signals[i].sync,
+ true /* is_signal */,
+ submit->signals[i].signal_value);
+ if (result != VK_SUCCESS)
+ goto out;
+ }
+
+ result = anv_execbuf_add_syncobj(device, &execbuf, trtt->timeline_handle,
+ I915_EXEC_FENCE_SIGNAL,
+ trtt_bbo->timeline_val);
+ if (result != VK_SUCCESS)
+ goto out;
+
+
+ result = anv_execbuf_add_bo(device, &execbuf, device->workaround_bo, NULL,
+ 0);
+ if (result != VK_SUCCESS)
+ goto out;
+
+ for (int i = 0; i < trtt->num_page_table_bos; i++) {
+ result = anv_execbuf_add_bo(device, &execbuf, trtt->page_table_bos[i],
+ NULL, EXEC_OBJECT_WRITE);
+ if (result != VK_SUCCESS)
+ goto out;
+ }
+
+ if (queue->sync) {
+ result = anv_execbuf_add_sync(device, &execbuf, queue->sync,
+ true /* is_signal */,
+ 0 /* signal_value */);
+ if (result != VK_SUCCESS)
+ goto out;
+ }
+
+ result = anv_execbuf_add_bo(device, &execbuf, trtt_bbo->bo, NULL, 0);
+ if (result != VK_SUCCESS)
+ goto out;
+
+ if (INTEL_DEBUG(DEBUG_SUBMIT))
+ anv_i915_debug_submit(&execbuf);
+
+ uint64_t exec_flags = 0;
+ uint32_t context_id;
+ get_context_and_exec_flags(queue, false, &exec_flags, &context_id);
+
+ execbuf.execbuf = (struct drm_i915_gem_execbuffer2) {
+ .buffers_ptr = (uintptr_t) execbuf.objects,
+ .buffer_count = execbuf.bo_count,
+ .batch_start_offset = 0,
+ .batch_len = trtt_bbo->size,
+ .flags = I915_EXEC_HANDLE_LUT | I915_EXEC_NO_RELOC | exec_flags,
+ .rsvd1 = context_id,
+ .rsvd2 = 0,
+ };
+ setup_execbuf_fence_params(&execbuf);
+
+ ANV_RMV(bos_gtt_map, device, execbuf.bos, execbuf.bo_count);
+
+ int ret = queue->device->info->no_hw ? 0 :
+ anv_gem_execbuffer(device, &execbuf.execbuf);
+ if (ret) {
+ result = vk_device_set_lost(&device->vk,
+ "trtt anv_gem_execbuffer failed: %m");
+ goto out;
+ }
+
+ if (queue->sync) {
+ result = vk_sync_wait(&device->vk, queue->sync, 0,
+ VK_SYNC_WAIT_COMPLETE, UINT64_MAX);
+ if (result != VK_SUCCESS) {
+ result = vk_queue_set_lost(&queue->vk, "trtt sync wait failed");
+ goto out;
+ }
+ }
+
+out:
+ anv_execbuf_finish(&execbuf);
+ return result;
+}
+
+VkResult
+i915_queue_exec_trace(struct anv_queue *queue,
+ struct anv_utrace_submit *submit)
+{
+ assert(util_dynarray_num_elements(&submit->batch_bos,
+ struct anv_bo *) > 0);
+
+ return anv_queue_exec_utrace_locked(queue, submit);
+}
diff --git a/src/intel/vulkan/i915/anv_batch_chain.h b/src/intel/vulkan/i915/anv_batch_chain.h
new file mode 100644
index 00000000000..fc799582828
--- /dev/null
+++ b/src/intel/vulkan/i915/anv_batch_chain.h
@@ -0,0 +1,62 @@
+/*
+ * Copyright © 2022 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#pragma once
+
+#include <stdint.h>
+
+#include "vulkan/vulkan_core.h"
+
+#include "vk_sync.h"
+
+struct anv_device;
+struct anv_queue;
+struct anv_bo;
+struct anv_cmd_buffer;
+struct anv_query_pool;
+struct anv_utrace_submit;
+struct anv_sparse_submission;
+struct anv_trtt_batch_bo;
+
+VkResult
+i915_queue_exec_trace(struct anv_queue *queue,
+ struct anv_utrace_submit *submit);
+VkResult
+i915_execute_simple_batch(struct anv_queue *queue, struct anv_bo *batch_bo,
+ uint32_t batch_bo_size, bool is_companion_rcs_batch);
+
+VkResult
+i915_execute_trtt_batch(struct anv_sparse_submission *submit,
+ struct anv_trtt_batch_bo *trtt_bbo);
+
+VkResult
+i915_queue_exec_locked(struct anv_queue *queue,
+ uint32_t wait_count,
+ const struct vk_sync_wait *waits,
+ uint32_t cmd_buffer_count,
+ struct anv_cmd_buffer **cmd_buffers,
+ uint32_t signal_count,
+ const struct vk_sync_signal *signals,
+ struct anv_query_pool *perf_query_pool,
+ uint32_t perf_query_pass,
+ struct anv_utrace_submit *utrace_submit);
diff --git a/src/intel/vulkan/i915/anv_device.c b/src/intel/vulkan/i915/anv_device.c
new file mode 100644
index 00000000000..818b514ca1c
--- /dev/null
+++ b/src/intel/vulkan/i915/anv_device.c
@@ -0,0 +1,400 @@
+/*
+ * Copyright © 2022 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#include "i915/anv_device.h"
+#include "anv_private.h"
+
+#include "common/i915/intel_defines.h"
+#include "common/i915/intel_gem.h"
+
+#include "drm-uapi/i915_drm.h"
+
+static int
+vk_priority_to_i915(VkQueueGlobalPriorityKHR priority)
+{
+ switch (priority) {
+ case VK_QUEUE_GLOBAL_PRIORITY_LOW_KHR:
+ return INTEL_CONTEXT_LOW_PRIORITY;
+ case VK_QUEUE_GLOBAL_PRIORITY_MEDIUM_KHR:
+ return INTEL_CONTEXT_MEDIUM_PRIORITY;
+ case VK_QUEUE_GLOBAL_PRIORITY_HIGH_KHR:
+ return INTEL_CONTEXT_HIGH_PRIORITY;
+ case VK_QUEUE_GLOBAL_PRIORITY_REALTIME_KHR:
+ return INTEL_CONTEXT_REALTIME_PRIORITY;
+ default:
+ unreachable("Invalid priority");
+ }
+}
+
+int
+anv_gem_set_context_param(int fd, uint32_t context, uint32_t param, uint64_t value)
+{
+ if (param == I915_CONTEXT_PARAM_PRIORITY)
+ value = vk_priority_to_i915(value);
+
+ int err = 0;
+ if (!intel_gem_set_context_param(fd, context, param, value))
+ err = -errno;
+ return err;
+}
+
+static bool
+anv_gem_has_context_priority(int fd, VkQueueGlobalPriorityKHR priority)
+{
+    * higher than the current value. A timeline wait on value 0 is always
+    * trivially satisfied because every uint64_t value is >= 0.
+}
+
+VkResult
+anv_i915_physical_device_get_parameters(struct anv_physical_device *device)
+{
+ VkResult result = VK_SUCCESS;
+ int val, fd = device->local_fd;
+ uint64_t value;
+
+ if (!intel_gem_get_param(fd, I915_PARAM_HAS_WAIT_TIMEOUT, &val) || !val) {
+ result = vk_errorf(device, VK_ERROR_INITIALIZATION_FAILED,
+ "kernel missing gem wait");
+ return result;
+ }
+
+ if (!intel_gem_get_param(fd, I915_PARAM_HAS_EXECBUF2, &val) || !val) {
+ result = vk_errorf(device, VK_ERROR_INITIALIZATION_FAILED,
+ "kernel missing execbuf2");
+ return result;
+ }
+
+ if (!device->info.has_llc &&
+ (!intel_gem_get_param(fd, I915_PARAM_MMAP_VERSION, &val) || val < 1)) {
+ result = vk_errorf(device, VK_ERROR_INITIALIZATION_FAILED,
+ "kernel missing wc mmap");
+ return result;
+ }
+
+ if (!intel_gem_get_param(fd, I915_PARAM_HAS_EXEC_SOFTPIN, &val) || !val) {
+ result = vk_errorf(device, VK_ERROR_INITIALIZATION_FAILED,
+ "kernel missing softpin");
+ return result;
+ }
+
+ if (!intel_gem_get_param(fd, I915_PARAM_HAS_EXEC_FENCE_ARRAY, &val) || !val) {
+ result = vk_errorf(device, VK_ERROR_INITIALIZATION_FAILED,
+ "kernel missing syncobj support");
+ return result;
+ }
+
+ if (intel_gem_get_param(fd, I915_PARAM_HAS_EXEC_ASYNC, &val))
+ device->has_exec_async = val;
+ if (intel_gem_get_param(fd, I915_PARAM_HAS_EXEC_CAPTURE, &val))
+ device->has_exec_capture = val;
+
+   /* Probe from the lowest priority upward; the array is sorted low to high. */
+ const VkQueueGlobalPriorityKHR priorities[] = {
+ VK_QUEUE_GLOBAL_PRIORITY_LOW_KHR,
+ VK_QUEUE_GLOBAL_PRIORITY_MEDIUM_KHR,
+ VK_QUEUE_GLOBAL_PRIORITY_HIGH_KHR,
+ VK_QUEUE_GLOBAL_PRIORITY_REALTIME_KHR,
+ };
+ device->max_context_priority = VK_QUEUE_GLOBAL_PRIORITY_LOW_KHR;
+ for (unsigned i = 0; i < ARRAY_SIZE(priorities); i++) {
+ if (!anv_gem_has_context_priority(fd, priorities[i]))
+ break;
+ device->max_context_priority = priorities[i];
+ }
+
+ if (intel_gem_get_param(fd, I915_PARAM_HAS_EXEC_TIMELINE_FENCES, &val))
+ device->has_exec_timeline = val;
+
+ if (intel_gem_get_context_param(fd, 0, I915_CONTEXT_PARAM_VM, &value))
+ device->has_vm_control = value;
+
+ return result;
+}
+
+VkResult
+anv_i915_physical_device_init_memory_types(struct anv_physical_device *device)
+{
+ if (anv_physical_device_has_vram(device)) {
+ device->memory.type_count = 3;
+ device->memory.types[0] = (struct anv_memory_type) {
+ .propertyFlags = VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT,
+ .heapIndex = 0,
+ };
+ device->memory.types[1] = (struct anv_memory_type) {
+ .propertyFlags = VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT |
+ VK_MEMORY_PROPERTY_HOST_COHERENT_BIT |
+ VK_MEMORY_PROPERTY_HOST_CACHED_BIT,
+ .heapIndex = 1,
+ };
+ device->memory.types[2] = (struct anv_memory_type) {
+ .propertyFlags = VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT |
+ VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT |
+ VK_MEMORY_PROPERTY_HOST_COHERENT_BIT,
+         /* This memory type either comes from heaps[0] if there is only a
+          * mappable vram region, or from heaps[2] if there are both mappable
+          * and non-mappable vram regions.
+ */
+ .heapIndex = device->vram_non_mappable.size > 0 ? 2 : 0,
+ };
+ } else if (device->info.has_llc) {
+ /* Big core GPUs share LLC with the CPU and thus one memory type can be
+ * both cached and coherent at the same time.
+ *
+       * But some game engines can't handle a single memory type well:
+       * https://gitlab.freedesktop.org/mesa/mesa/-/issues/7360#note_1719438
+       *
+       * The second memory type, without HOST_CACHED_BIT, will get
+       * write-combining. See anv_AllocateMemory().
+ *
+ * The Intel Vulkan driver for Windows also advertises these memory types.
+ */
+ device->memory.type_count = 3;
+ device->memory.types[0] = (struct anv_memory_type) {
+ .propertyFlags = VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT,
+ .heapIndex = 0,
+ };
+ device->memory.types[1] = (struct anv_memory_type) {
+ .propertyFlags = VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT |
+ VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT |
+ VK_MEMORY_PROPERTY_HOST_COHERENT_BIT,
+ .heapIndex = 0,
+ };
+ device->memory.types[2] = (struct anv_memory_type) {
+ .propertyFlags = VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT |
+ VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT |
+ VK_MEMORY_PROPERTY_HOST_COHERENT_BIT |
+ VK_MEMORY_PROPERTY_HOST_CACHED_BIT,
+ .heapIndex = 0,
+ };
+ } else {
+ /* The spec requires that we expose a host-visible, coherent memory
+       * type, but Atom GPUs don't share LLC. Thus we offer two memory types:
+       * one that is cached but not coherent, and one that is coherent but
+       * uncached (write-combined).
+ */
+ device->memory.type_count = 2;
+ device->memory.types[0] = (struct anv_memory_type) {
+ .propertyFlags = VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT |
+ VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT |
+ VK_MEMORY_PROPERTY_HOST_COHERENT_BIT,
+ .heapIndex = 0,
+ };
+ device->memory.types[1] = (struct anv_memory_type) {
+ .propertyFlags = VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT |
+ VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT |
+ VK_MEMORY_PROPERTY_HOST_CACHED_BIT,
+ .heapIndex = 0,
+ };
+ }
+
+ if (device->has_protected_contexts) {
+ /* Add a memory type for protected buffers, local and not host
+ * visible.
+ */
+ device->memory.types[device->memory.type_count++] =
+ (struct anv_memory_type) {
+ .propertyFlags = VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT |
+ VK_MEMORY_PROPERTY_PROTECTED_BIT,
+ .heapIndex = 0,
+ };
+ }
+
+ return VK_SUCCESS;
+}
+
+VkResult
+anv_i915_set_queue_parameters(
+ struct anv_device *device,
+ uint32_t context_id,
+ const VkDeviceQueueGlobalPriorityCreateInfoKHR *queue_priority)
+{
+ struct anv_physical_device *physical_device = device->physical;
+
+ /* Here we tell the kernel not to attempt to recover our context but
+ * immediately (on the next batchbuffer submission) report that the
+ * context is lost, and we will do the recovery ourselves. In the case
+ * of Vulkan, recovery means throwing VK_ERROR_DEVICE_LOST and letting
+ * the client clean up the pieces.
+ */
+ anv_gem_set_context_param(device->fd, context_id,
+ I915_CONTEXT_PARAM_RECOVERABLE, false);
+
+ VkQueueGlobalPriorityKHR priority =
+ queue_priority ? queue_priority->globalPriority :
+ VK_QUEUE_GLOBAL_PRIORITY_MEDIUM_KHR;
+
+ /* As per spec, the driver implementation may deny requests to acquire
+ * a priority above the default priority (MEDIUM) if the caller does not
+ * have sufficient privileges. In this scenario VK_ERROR_NOT_PERMITTED_KHR
+ * is returned.
+ */
+ if (physical_device->max_context_priority >= VK_QUEUE_GLOBAL_PRIORITY_MEDIUM_KHR) {
+ int err = anv_gem_set_context_param(device->fd, context_id,
+ I915_CONTEXT_PARAM_PRIORITY,
+ priority);
+ if (err != 0 && priority > VK_QUEUE_GLOBAL_PRIORITY_MEDIUM_KHR) {
+ return vk_error(device, VK_ERROR_NOT_PERMITTED_KHR);
+ }
+ }
+
+ return VK_SUCCESS;
+}
+
+VkResult
+anv_i915_device_setup_context(struct anv_device *device,
+ const VkDeviceCreateInfo *pCreateInfo,
+ const uint32_t num_queues)
+{
+ device->protected_session_id = I915_PROTECTED_CONTENT_DEFAULT_SESSION;
+
+ if (device->physical->has_vm_control)
+ return anv_i915_device_setup_vm(device);
+
+ struct anv_physical_device *physical_device = device->physical;
+ VkResult result = VK_SUCCESS;
+
+ if (device->physical->engine_info) {
+ /* The kernel API supports at most 64 engines */
+ assert(num_queues <= 64);
+ enum intel_engine_class engine_classes[64];
+ int engine_count = 0;
+ enum intel_gem_create_context_flags flags = 0;
+ for (uint32_t i = 0; i < pCreateInfo->queueCreateInfoCount; i++) {
+ const VkDeviceQueueCreateInfo *queueCreateInfo =
+ &pCreateInfo->pQueueCreateInfos[i];
+
+ assert(queueCreateInfo->queueFamilyIndex <
+ physical_device->queue.family_count);
+ struct anv_queue_family *queue_family =
+ &physical_device->queue.families[queueCreateInfo->queueFamilyIndex];
+
+ for (uint32_t j = 0; j < queueCreateInfo->queueCount; j++)
+ engine_classes[engine_count++] = queue_family->engine_class;
+
+ if (pCreateInfo->pQueueCreateInfos[i].flags &
+ VK_DEVICE_QUEUE_CREATE_PROTECTED_BIT)
+ flags |= INTEL_GEM_CREATE_CONTEXT_EXT_PROTECTED_FLAG;
+ }
+ if (!intel_gem_create_context_engines(device->fd, flags,
+ physical_device->engine_info,
+ engine_count, engine_classes,
+ device->vm_id,
+ (uint32_t *)&device->context_id))
+ result = vk_errorf(device, VK_ERROR_INITIALIZATION_FAILED,
+ "kernel context creation failed");
+ } else {
+ assert(num_queues == 1);
+ if (!intel_gem_create_context(device->fd, &device->context_id))
+ result = vk_error(device, VK_ERROR_INITIALIZATION_FAILED);
+ }
+
+ if (result != VK_SUCCESS)
+ return result;
+
+ /* Check if client specified queue priority. */
+ const VkDeviceQueueGlobalPriorityCreateInfoKHR *queue_priority =
+ vk_find_struct_const(pCreateInfo->pQueueCreateInfos[0].pNext,
+ DEVICE_QUEUE_GLOBAL_PRIORITY_CREATE_INFO_KHR);
+
+ result = anv_i915_set_queue_parameters(device, device->context_id,
+ queue_priority);
+ if (result != VK_SUCCESS)
+ goto fail_context;
+
+ return result;
+
+fail_context:
+ intel_gem_destroy_context(device->fd, device->context_id);
+ return result;
+}
+
+static VkResult
+anv_gem_context_get_reset_stats(struct anv_device *device, int context)
+{
+ struct drm_i915_reset_stats stats = {
+ .ctx_id = context,
+ };
+
+ int ret = intel_ioctl(device->fd, DRM_IOCTL_I915_GET_RESET_STATS, &stats);
+ if (ret == -1) {
+ /* We don't know the real error. */
+ return vk_device_set_lost(&device->vk, "get_reset_stats failed: %m");
+ }
+
+ if (stats.batch_active) {
+ return vk_device_set_lost(&device->vk, "GPU hung on one of our command buffers");
+ } else if (stats.batch_pending) {
+ return vk_device_set_lost(&device->vk, "GPU hung with commands in-flight");
+ }
+
+ return VK_SUCCESS;
+}
+
+VkResult
+anv_i915_device_check_status(struct vk_device *vk_device)
+{
+ struct anv_device *device = container_of(vk_device, struct anv_device, vk);
+ VkResult result;
+
+ if (device->physical->has_vm_control) {
+ for (uint32_t i = 0; i < device->queue_count; i++) {
+ result = anv_gem_context_get_reset_stats(device,
+ device->queues[i].context_id);
+ if (result != VK_SUCCESS)
+ return result;
+
+ if (device->queues[i].companion_rcs_id != 0) {
+ uint32_t context_id = device->queues[i].companion_rcs_id;
+ result = anv_gem_context_get_reset_stats(device, context_id);
+ if (result != VK_SUCCESS) {
+ return result;
+ }
+ }
+ }
+ } else {
+ result = anv_gem_context_get_reset_stats(device, device->context_id);
+ }
+
+ return result;
+}
+
+bool
+anv_i915_device_destroy_vm(struct anv_device *device)
+{
+ struct drm_i915_gem_vm_control destroy = {
+ .vm_id = device->vm_id,
+ };
+
+ return intel_ioctl(device->fd, DRM_IOCTL_I915_GEM_VM_DESTROY, &destroy) == 0;
+}
+
+VkResult
+anv_i915_device_setup_vm(struct anv_device *device)
+{
+ struct drm_i915_gem_vm_control create = {};
+ if (intel_ioctl(device->fd, DRM_IOCTL_I915_GEM_VM_CREATE, &create))
+ return vk_errorf(device, VK_ERROR_INITIALIZATION_FAILED,
+ "vm creation failed");
+
+ device->vm_id = create.vm_id;
+ return VK_SUCCESS;
+}
diff --git a/src/intel/vulkan/i915/anv_device.h b/src/intel/vulkan/i915/anv_device.h
new file mode 100644
index 00000000000..0d871a41199
--- /dev/null
+++ b/src/intel/vulkan/i915/anv_device.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright © 2022 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#pragma once
+
+#include "vulkan/vulkan_core.h"
+#include "vk_device.h"
+
+struct anv_device;
+struct anv_physical_device;
+
+VkResult
+anv_i915_physical_device_get_parameters(struct anv_physical_device *device);
+VkResult
+anv_i915_physical_device_init_memory_types(struct anv_physical_device *device);
+
+VkResult
+anv_i915_device_setup_context(struct anv_device *device,
+ const VkDeviceCreateInfo *pCreateInfo,
+ const uint32_t num_queues);
+
+VkResult anv_i915_device_check_status(struct vk_device *vk_device);
+bool anv_i915_device_destroy_vm(struct anv_device *device);
+VkResult anv_i915_device_setup_vm(struct anv_device *device);
+VkResult anv_i915_set_queue_parameters(
+ struct anv_device *device,
+ uint32_t context_id,
+ const VkDeviceQueueGlobalPriorityCreateInfoKHR *queue_priority);
diff --git a/src/intel/vulkan/i915/anv_gem.c b/src/intel/vulkan/i915/anv_gem.c
new file mode 100644
index 00000000000..a159844aa31
--- /dev/null
+++ b/src/intel/vulkan/i915/anv_gem.c
@@ -0,0 +1,139 @@
+/*
+ * Copyright © 2023 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#include "i915/anv_gem.h"
+#include "anv_private.h"
+
+#include "drm-uapi/i915_drm.h"
+
+int
+anv_i915_gem_get_tiling(struct anv_device *device, uint32_t gem_handle)
+{
+ if (!device->info->has_tiling_uapi)
+ return -1;
+
+ struct drm_i915_gem_get_tiling get_tiling = {
+ .handle = gem_handle,
+ };
+
+ /* FIXME: On discrete platforms we don't have DRM_IOCTL_I915_GEM_GET_TILING
+ * anymore, so we will need another way to get the tiling. Apparently this
+ * is only used in Android code, so we may need some other way to
+ * communicate the tiling mode.
+ */
+ if (intel_ioctl(device->fd, DRM_IOCTL_I915_GEM_GET_TILING, &get_tiling)) {
+ assert(!"Failed to get BO tiling");
+ return -1;
+ }
+
+ return get_tiling.tiling_mode;
+}
+
+int
+anv_i915_gem_set_tiling(struct anv_device *device, uint32_t gem_handle,
+ uint32_t stride, uint32_t tiling)
+{
+ /* On discrete platforms we don't have DRM_IOCTL_I915_GEM_SET_TILING. So
+ * nothing needs to be done.
+ */
+ if (!device->info->has_tiling_uapi)
+ return 0;
+
+ /* set_tiling overwrites the input on the error path, so the struct is
+ * initialized fresh right before the call.
+ */
+ struct drm_i915_gem_set_tiling set_tiling = {
+ .handle = gem_handle,
+ .tiling_mode = tiling,
+ .stride = stride,
+ };
+
+ return intel_ioctl(device->fd, DRM_IOCTL_I915_GEM_SET_TILING, &set_tiling);
+}
+
+int
+anv_i915_gem_wait(struct anv_device *device, uint32_t gem_handle,
+ int64_t *timeout_ns)
+{
+ struct drm_i915_gem_wait wait = {
+ .bo_handle = gem_handle,
+ .timeout_ns = *timeout_ns,
+ .flags = 0,
+ };
+
+ int ret = intel_ioctl(device->fd, DRM_IOCTL_I915_GEM_WAIT, &wait);
+ *timeout_ns = wait.timeout_ns;
+
+ return ret;
+}
+
+VkResult
+anv_i915_gem_import_bo_alloc_flags_to_bo_flags(struct anv_device *device,
+ struct anv_bo *bo,
+ enum anv_bo_alloc_flags alloc_flags,
+ uint32_t *out_bo_flags)
+{
+ const uint32_t bo_flags =
+ device->kmd_backend->bo_alloc_flags_to_bo_flags(device, alloc_flags);
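+ /* refcount == 0 means the BO is a fresh import rather than one already in
+ * the cache, so the newly computed flags can be used as-is.
+ */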
+ if (bo->refcount == 0) {
+ *out_bo_flags = bo_flags;
+ return VK_SUCCESS;
+ }
+
+ /* We have to be careful how we combine flags so that it makes sense.
+ * Really, though, if we get to this case and it actually matters, the
+ * client has imported a BO twice in different ways and they get what
+ * they have coming.
+ */
+ uint32_t new_flags = 0;
+ new_flags |= (bo->flags | bo_flags) & EXEC_OBJECT_WRITE;
+ new_flags |= (bo->flags & bo_flags) & EXEC_OBJECT_ASYNC;
+ new_flags |= (bo->flags & bo_flags) & EXEC_OBJECT_SUPPORTS_48B_ADDRESS;
+ new_flags |= (bo->flags | bo_flags) & EXEC_OBJECT_PINNED;
+ new_flags |= (bo->flags | bo_flags) & EXEC_OBJECT_CAPTURE;
+
+ /* It's theoretically possible for a BO to get imported such that it's
+ * both pinned and not pinned. The only way this can happen is if it
+ * gets imported as both a semaphore and a memory object and that would
+ * be an application error. Just fail out in that case.
+ */
+ if ((bo->flags & EXEC_OBJECT_PINNED) !=
+ (bo_flags & EXEC_OBJECT_PINNED))
+ return vk_errorf(device, VK_ERROR_INVALID_EXTERNAL_HANDLE,
+ "The same BO was imported two different ways");
+
+ /* It's also theoretically possible that someone could export a BO from
+ * one heap and import it into another or to import the same BO into two
+ * different heaps. If this happens, we could potentially end up both
+ * allowing and disallowing 48-bit addresses. There's not much we can
+ * do about it if we're pinning so we just throw an error and hope no
+ * app is actually that stupid.
+ */
+ if ((new_flags & EXEC_OBJECT_PINNED) &&
+ (bo->flags & EXEC_OBJECT_SUPPORTS_48B_ADDRESS) !=
+ (bo_flags & EXEC_OBJECT_SUPPORTS_48B_ADDRESS))
+ return vk_errorf(device, VK_ERROR_INVALID_EXTERNAL_HANDLE,
+ "The same BO was imported on two different heaps");
+
+ *out_bo_flags = new_flags;
+ return VK_SUCCESS;
+}
diff --git a/src/intel/vulkan/i915/anv_gem.h b/src/intel/vulkan/i915/anv_gem.h
new file mode 100644
index 00000000000..bf3713f86f3
--- /dev/null
+++ b/src/intel/vulkan/i915/anv_gem.h
@@ -0,0 +1,43 @@
+/*
+ * Copyright © 2023 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#pragma once
+
+#include <stdint.h>
+
+#include "vulkan/vulkan_core.h"
+
+struct anv_bo;
+struct anv_device;
+enum anv_bo_alloc_flags;
+
+int anv_i915_gem_get_tiling(struct anv_device *device, uint32_t gem_handle);
+int anv_i915_gem_set_tiling(struct anv_device *device, uint32_t gem_handle,
+ uint32_t stride, uint32_t tiling);
+
+int anv_i915_gem_wait(struct anv_device *device, uint32_t gem_handle,
+ int64_t *timeout_ns);
+
+VkResult anv_i915_gem_import_bo_alloc_flags_to_bo_flags(struct anv_device *device,
+ struct anv_bo *bo,
+ enum anv_bo_alloc_flags alloc_flags,
+ uint32_t *out_bo_flags);
diff --git a/src/intel/vulkan/i915/anv_kmd_backend.c b/src/intel/vulkan/i915/anv_kmd_backend.c
new file mode 100644
index 00000000000..253abfd959e
--- /dev/null
+++ b/src/intel/vulkan/i915/anv_kmd_backend.c
@@ -0,0 +1,306 @@
+/*
+ * Copyright © 2023 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <sys/mman.h>
+
+#include "anv_private.h"
+
+#include "i915/anv_batch_chain.h"
+
+#include "drm-uapi/i915_drm.h"
+#include "intel/common/i915/intel_gem.h"
+
+static int
+i915_gem_set_caching(struct anv_device *device,
+ uint32_t gem_handle, uint32_t caching)
+{
+ struct drm_i915_gem_caching gem_caching = {
+ .handle = gem_handle,
+ .caching = caching,
+ };
+
+ return intel_ioctl(device->fd, DRM_IOCTL_I915_GEM_SET_CACHING, &gem_caching);
+}
+
+static uint32_t
+i915_gem_create(struct anv_device *device,
+ const struct intel_memory_class_instance **regions,
+ uint16_t num_regions, uint64_t size,
+ enum anv_bo_alloc_flags alloc_flags,
+ uint64_t *actual_size)
+{
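+ /* Without memory class/instance support in the kernel, fall back to the
+ * legacy DRM_IOCTL_I915_GEM_CREATE path.
+ */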
+ if (unlikely(!device->info->mem.use_class_instance)) {
+ assert(num_regions == 1 &&
+ device->physical->sys.region == regions[0]);
+
+ struct drm_i915_gem_create gem_create = {
+ .size = size,
+ };
+ if (intel_ioctl(device->fd, DRM_IOCTL_I915_GEM_CREATE, &gem_create))
+ return 0;
+
+ if ((alloc_flags & ANV_BO_ALLOC_HOST_CACHED_COHERENT) == ANV_BO_ALLOC_HOST_CACHED_COHERENT) {
+ /* We don't want to change these defaults if it's going to be shared
+ * with another process.
+ */
+ assert(!(alloc_flags & ANV_BO_ALLOC_EXTERNAL));
+
+ /* Regular objects are created I915_CACHING_CACHED on LLC platforms and
+ * I915_CACHING_NONE on non-LLC platforms. For many internal state
+ * objects, we'd rather take the snooping overhead than risk forgetting
+ * a CLFLUSH somewhere. Userptr objects are always created as
+ * I915_CACHING_CACHED, which on non-LLC means snooped so there's no
+ * need to do this there.
+ */
+ if (device->info->has_caching_uapi && !device->info->has_llc)
+ i915_gem_set_caching(device, gem_create.handle, I915_CACHING_CACHED);
+ }
+
+ *actual_size = gem_create.size;
+ return gem_create.handle;
+ }
+
+ struct drm_i915_gem_memory_class_instance i915_regions[2];
+ assert(num_regions <= ARRAY_SIZE(i915_regions));
+
+ for (uint16_t i = 0; i < num_regions; i++) {
+ i915_regions[i].memory_class = regions[i]->klass;
+ i915_regions[i].memory_instance = regions[i]->instance;
+ }
+
+ uint32_t flags = 0;
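+ /* If the BO needs to be CPU-mappable and the device has non-mappable
+ * VRAM, ask the kernel for a placement with CPU access.
+ */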
+ if (alloc_flags & (ANV_BO_ALLOC_MAPPED | ANV_BO_ALLOC_LOCAL_MEM_CPU_VISIBLE) &&
+ !(alloc_flags & ANV_BO_ALLOC_NO_LOCAL_MEM))
+ if (device->physical->vram_non_mappable.size > 0)
+ flags |= I915_GEM_CREATE_EXT_FLAG_NEEDS_CPU_ACCESS;
+
+ struct drm_i915_gem_create_ext_memory_regions ext_regions = {
+ .num_regions = num_regions,
+ .regions = (uintptr_t)i915_regions,
+ };
+ struct drm_i915_gem_create_ext gem_create = {
+ .size = size,
+ .flags = flags,
+ };
+
+ intel_i915_gem_add_ext(&gem_create.extensions,
+ I915_GEM_CREATE_EXT_MEMORY_REGIONS,
+ &ext_regions.base);
+
+ struct drm_i915_gem_create_ext_set_pat set_pat_param = { 0 };
+ if (device->info->has_set_pat_uapi) {
+ /* Set PAT param */
+ set_pat_param.pat_index = anv_device_get_pat_entry(device, alloc_flags)->index;
+ intel_i915_gem_add_ext(&gem_create.extensions,
+ I915_GEM_CREATE_EXT_SET_PAT,
+ &set_pat_param.base);
+ }
+
+ struct drm_i915_gem_create_ext_protected_content protected_param = { 0 };
+ if (alloc_flags & ANV_BO_ALLOC_PROTECTED) {
+ intel_i915_gem_add_ext(&gem_create.extensions,
+ I915_GEM_CREATE_EXT_PROTECTED_CONTENT,
+ &protected_param.base);
+ }
+
+ if (intel_ioctl(device->fd, DRM_IOCTL_I915_GEM_CREATE_EXT, &gem_create))
+ return 0;
+
+ *actual_size = gem_create.size;
+
+ if ((alloc_flags & ANV_BO_ALLOC_HOST_CACHED_COHERENT) == ANV_BO_ALLOC_HOST_CACHED_COHERENT) {
+ /* We don't want to change these defaults if it's going to be shared
+ * with another process.
+ */
+ assert(!(alloc_flags & ANV_BO_ALLOC_EXTERNAL));
+
+ /* Regular objects are created I915_CACHING_CACHED on LLC platforms and
+ * I915_CACHING_NONE on non-LLC platforms. For many internal state
+ * objects, we'd rather take the snooping overhead than risk forgetting
+ * a CLFLUSH somewhere. Userptr objects are always created as
+ * I915_CACHING_CACHED, which on non-LLC means snooped so there's no
+ * need to do this there.
+ */
+ if (device->info->has_caching_uapi && !device->info->has_llc)
+ i915_gem_set_caching(device, gem_create.handle, I915_CACHING_CACHED);
+ }
+
+ return gem_create.handle;
+}
+
+static void
+i915_gem_close(struct anv_device *device, struct anv_bo *bo)
+{
+ struct drm_gem_close close = {
+ .handle = bo->gem_handle,
+ };
+
+ intel_ioctl(device->fd, DRM_IOCTL_GEM_CLOSE, &close);
+}
+
+static void *
+i915_gem_mmap_offset(struct anv_device *device, struct anv_bo *bo,
+ uint64_t size, uint32_t flags,
+ void *placed_addr)
+{
+ struct drm_i915_gem_mmap_offset gem_mmap = {
+ .handle = bo->gem_handle,
+ .flags = flags,
+ };
+ if (intel_ioctl(device->fd, DRM_IOCTL_I915_GEM_MMAP_OFFSET, &gem_mmap))
+ return MAP_FAILED;
+
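+ /* A non-NULL placed_addr means the caller wants the mapping at that exact
+ * address, hence MAP_FIXED.
+ */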
+ return mmap(placed_addr, size, PROT_READ | PROT_WRITE,
+ (placed_addr != NULL ? MAP_FIXED : 0) | MAP_SHARED,
+ device->fd, gem_mmap.offset);
+}
+
+static void *
+i915_gem_mmap_legacy(struct anv_device *device, struct anv_bo *bo, uint64_t offset,
+ uint64_t size, uint32_t flags)
+{
+ struct drm_i915_gem_mmap gem_mmap = {
+ .handle = bo->gem_handle,
+ .offset = offset,
+ .size = size,
+ .flags = flags,
+ };
+ if (intel_ioctl(device->fd, DRM_IOCTL_I915_GEM_MMAP, &gem_mmap))
+ return MAP_FAILED;
+
+ return (void *)(uintptr_t) gem_mmap.addr_ptr;
+}
+
+static uint32_t
+mmap_calc_flags(struct anv_device *device, struct anv_bo *bo)
+{
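+ /* Local-memory (discrete) platforms only support the fixed mmap offset
+ * mode.
+ */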
+ if (device->info->has_local_mem)
+ return I915_MMAP_OFFSET_FIXED;
+
+ uint32_t flags;
+ switch (anv_bo_get_mmap_mode(device, bo)) {
+ case INTEL_DEVICE_INFO_MMAP_MODE_WC:
+ flags = I915_MMAP_WC;
+ break;
+ case INTEL_DEVICE_INFO_MMAP_MODE_UC:
+ unreachable("Missing");
+ default:
+ /* no flags == WB */
+ flags = 0;
+ }
+
+ if (likely(device->physical->info.has_mmap_offset))
+ flags = (flags & I915_MMAP_WC) ? I915_MMAP_OFFSET_WC : I915_MMAP_OFFSET_WB;
+ return flags;
+}
+
+static void *
+i915_gem_mmap(struct anv_device *device, struct anv_bo *bo, uint64_t offset,
+ uint64_t size, void *placed_addr)
+{
+ const uint32_t flags = mmap_calc_flags(device, bo);
+
+ if (likely(device->physical->info.has_mmap_offset))
+ return i915_gem_mmap_offset(device, bo, size, flags, placed_addr);
+ assert(placed_addr == NULL);
+ return i915_gem_mmap_legacy(device, bo, offset, size, flags);
+}
+
+static VkResult
+i915_vm_bind(struct anv_device *device, struct anv_sparse_submission *submit,
+ enum anv_vm_bind_flags flags)
+{
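+ /* i915 has no explicit VM_BIND uAPI; BOs are bound at execbuf time, so
+ * there is nothing to do here.
+ */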
+ return VK_SUCCESS;
+}
+
+static VkResult
+i915_vm_bind_bo(struct anv_device *device, struct anv_bo *bo)
+{
+ return VK_SUCCESS;
+}
+
+static uint32_t
+i915_gem_create_userptr(struct anv_device *device, void *mem, uint64_t size)
+{
+ struct drm_i915_gem_userptr userptr = {
+ .user_ptr = (__u64)((unsigned long) mem),
+ .user_size = size,
+ .flags = 0,
+ };
+
+ if (device->physical->info.has_userptr_probe)
+ userptr.flags |= I915_USERPTR_PROBE;
+
+ int ret = intel_ioctl(device->fd, DRM_IOCTL_I915_GEM_USERPTR, &userptr);
+ if (ret == -1)
+ return 0;
+
+ return userptr.handle;
+}
+
+static uint32_t
+i915_bo_alloc_flags_to_bo_flags(struct anv_device *device,
+ enum anv_bo_alloc_flags alloc_flags)
+{
+ struct anv_physical_device *pdevice = device->physical;
+
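+ /* ANV always softpins its BOs, so every exec object is pinned. */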
+ uint64_t bo_flags = EXEC_OBJECT_PINNED;
+
+ if (!(alloc_flags & ANV_BO_ALLOC_32BIT_ADDRESS))
+ bo_flags |= EXEC_OBJECT_SUPPORTS_48B_ADDRESS;
+
+ if (((alloc_flags & ANV_BO_ALLOC_CAPTURE) ||
+ INTEL_DEBUG(DEBUG_CAPTURE_ALL)) &&
+ pdevice->has_exec_capture)
+ bo_flags |= EXEC_OBJECT_CAPTURE;
+
+ if (alloc_flags & ANV_BO_ALLOC_IMPLICIT_WRITE) {
+ assert(alloc_flags & ANV_BO_ALLOC_IMPLICIT_SYNC);
+ bo_flags |= EXEC_OBJECT_WRITE;
+ }
+
+ if (!(alloc_flags & ANV_BO_ALLOC_IMPLICIT_SYNC) && pdevice->has_exec_async)
+ bo_flags |= EXEC_OBJECT_ASYNC;
+
+ return bo_flags;
+}
+
+const struct anv_kmd_backend *
+anv_i915_kmd_backend_get(void)
+{
+ static const struct anv_kmd_backend i915_backend = {
+ .gem_create = i915_gem_create,
+ .gem_create_userptr = i915_gem_create_userptr,
+ .gem_close = i915_gem_close,
+ .gem_mmap = i915_gem_mmap,
+ .vm_bind = i915_vm_bind,
+ .vm_bind_bo = i915_vm_bind_bo,
+ .vm_unbind_bo = i915_vm_bind_bo,
+ .execute_simple_batch = i915_execute_simple_batch,
+ .execute_trtt_batch = i915_execute_trtt_batch,
+ .queue_exec_locked = i915_queue_exec_locked,
+ .queue_exec_trace = i915_queue_exec_trace,
+ .bo_alloc_flags_to_bo_flags = i915_bo_alloc_flags_to_bo_flags,
+ };
+ return &i915_backend;
+}
diff --git a/src/intel/vulkan/i915/anv_queue.c b/src/intel/vulkan/i915/anv_queue.c
new file mode 100644
index 00000000000..173cf7b2a3a
--- /dev/null
+++ b/src/intel/vulkan/i915/anv_queue.c
@@ -0,0 +1,126 @@
+/*
+ * Copyright © 2023 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#include "i915/anv_queue.h"
+
+#include "anv_private.h"
+
+#include "common/i915/intel_engine.h"
+#include "common/intel_gem.h"
+
+#include "i915/anv_device.h"
+
+#include "drm-uapi/i915_drm.h"
+
+VkResult
+anv_i915_create_engine(struct anv_device *device,
+ struct anv_queue *queue,
+ const VkDeviceQueueCreateInfo *pCreateInfo)
+{
+ struct anv_physical_device *physical = device->physical;
+ struct anv_queue_family *queue_family =
+ &physical->queue.families[pCreateInfo->queueFamilyIndex];
+
+ if (device->physical->engine_info == NULL) {
+ switch (queue_family->engine_class) {
+ case INTEL_ENGINE_CLASS_COPY:
+ queue->exec_flags = I915_EXEC_BLT;
+ break;
+ case INTEL_ENGINE_CLASS_RENDER:
+ queue->exec_flags = I915_EXEC_RENDER;
+ break;
+ case INTEL_ENGINE_CLASS_VIDEO:
+ /* We want VCS0 (with ring1) for HW lacking HEVC on VCS1. */
+ queue->exec_flags = I915_EXEC_BSD | I915_EXEC_BSD_RING1;
+ break;
+ default:
+ unreachable("Unsupported legacy engine");
+ }
+ } else if (device->physical->has_vm_control) {
+ assert(pCreateInfo->queueFamilyIndex < physical->queue.family_count);
+ enum intel_engine_class engine_classes[1];
+ enum intel_gem_create_context_flags flags = 0;
+
+ engine_classes[0] = queue_family->engine_class;
+ if (pCreateInfo->flags & VK_DEVICE_QUEUE_CREATE_PROTECTED_BIT)
+ flags |= INTEL_GEM_CREATE_CONTEXT_EXT_PROTECTED_FLAG;
+
+ if (!intel_gem_create_context_engines(device->fd, flags,
+ physical->engine_info,
+ 1, engine_classes,
+ device->vm_id,
+ (uint32_t *)&queue->context_id))
+ return vk_errorf(device, VK_ERROR_INITIALIZATION_FAILED,
+ "engine creation failed");
+
+ /* Create a companion RCS logical engine to support MSAA copy/clear
+ * operations on the compute/copy engines.
+ */
+ if (queue_family->engine_class == INTEL_ENGINE_CLASS_COPY ||
+ queue_family->engine_class == INTEL_ENGINE_CLASS_COMPUTE) {
+ uint32_t *context_id = (uint32_t *)&queue->companion_rcs_id;
+ engine_classes[0] = INTEL_ENGINE_CLASS_RENDER;
+ if (!intel_gem_create_context_engines(device->fd, flags,
+ physical->engine_info,
+ 1, engine_classes,
+ device->vm_id,
+ context_id))
+ return vk_errorf(device, VK_ERROR_INITIALIZATION_FAILED,
+ "companion RCS engine creation failed");
+ }
+
+ /* Check if client specified queue priority. */
+ const VkDeviceQueueGlobalPriorityCreateInfoKHR *queue_priority =
+ vk_find_struct_const(pCreateInfo->pNext,
+ DEVICE_QUEUE_GLOBAL_PRIORITY_CREATE_INFO_KHR);
+
+ VkResult result = anv_i915_set_queue_parameters(device,
+ queue->context_id,
+ queue_priority);
+ if (result != VK_SUCCESS) {
+ intel_gem_destroy_context(device->fd, queue->context_id);
+ if (queue->companion_rcs_id != 0) {
+ intel_gem_destroy_context(device->fd, queue->companion_rcs_id);
+ }
+ return result;
+ }
+ } else {
+ /* When using the new engine creation uAPI, the exec_flags value is the
+ * index of the engine in the group specified at GEM context creation.
+ */
+ queue->exec_flags = device->queue_count;
+ }
+
+ return VK_SUCCESS;
+}
+
+void
+anv_i915_destroy_engine(struct anv_device *device, struct anv_queue *queue)
+{
+ if (device->physical->has_vm_control) {
+ intel_gem_destroy_context(device->fd, queue->context_id);
+
+ if (queue->companion_rcs_id != 0) {
+ intel_gem_destroy_context(device->fd, queue->companion_rcs_id);
+ }
+ }
+}
diff --git a/src/intel/vulkan/i915/anv_queue.h b/src/intel/vulkan/i915/anv_queue.h
new file mode 100644
index 00000000000..ab75cd5b2cb
--- /dev/null
+++ b/src/intel/vulkan/i915/anv_queue.h
@@ -0,0 +1,35 @@
+/*
+ * Copyright © 2023 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#pragma once
+
+#include "vulkan/vulkan_core.h"
+
+struct anv_device;
+struct anv_queue;
+
+VkResult
+anv_i915_create_engine(struct anv_device *device,
+ struct anv_queue *queue,
+ const VkDeviceQueueCreateInfo *pCreateInfo);
+void
+anv_i915_destroy_engine(struct anv_device *device, struct anv_queue *queue);
diff --git a/src/intel/vulkan/anv_wsi_wayland.c b/src/intel/vulkan/layers/anv_android_layer.c
index 13c59604ffe..b9ccc60649c 100644
--- a/src/intel/vulkan/anv_wsi_wayland.c
+++ b/src/intel/vulkan/layers/anv_android_layer.c
@@ -1,5 +1,5 @@
/*
- * Copyright © 2015 Intel Corporation
+ * Copyright © 2023 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
@@ -21,33 +21,26 @@
* IN THE SOFTWARE.
*/
-#include "wsi_common_wayland.h"
#include "anv_private.h"
-VkBool32 anv_GetPhysicalDeviceWaylandPresentationSupportKHR(
- VkPhysicalDevice physicalDevice,
- uint32_t queueFamilyIndex,
- struct wl_display* display)
-{
- ANV_FROM_HANDLE(anv_physical_device, physical_device, physicalDevice);
-
- return wsi_wl_get_presentation_support(&physical_device->wsi_device, display);
-}
-
-VkResult anv_CreateWaylandSurfaceKHR(
- VkInstance _instance,
- const VkWaylandSurfaceCreateInfoKHR* pCreateInfo,
+VkResult anv_android_CreateImageView(
+ VkDevice _device,
+ const VkImageViewCreateInfo* pCreateInfo,
const VkAllocationCallbacks* pAllocator,
- VkSurfaceKHR* pSurface)
+ VkImageView* pView)
{
- ANV_FROM_HANDLE(anv_instance, instance, _instance);
- const VkAllocationCallbacks *alloc;
- assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_WAYLAND_SURFACE_CREATE_INFO_KHR);
-
- if (pAllocator)
- alloc = pAllocator;
- else
- alloc = &instance->vk.alloc;
+ ANV_FROM_HANDLE(anv_device, device, _device);
+ const struct util_format_description *fmt =
+ vk_format_description(pCreateInfo->format);
- return wsi_create_wl_surface(alloc, pCreateInfo, pSurface);
+ /* Return an error if the application tries to create an ASTC view on
+ * gfx125. This avoids the GPU hang that can result from using the
+ * unsupported format.
+ */
+ if (fmt && fmt->layout == UTIL_FORMAT_LAYOUT_ASTC &&
+ device->info->verx10 >= 125) {
+ return vk_errorf(device, VK_ERROR_OUT_OF_HOST_MEMORY,
+ "ASTC format not supported (%s).", __func__);
+ }
+ return anv_CreateImageView(_device, pCreateInfo, pAllocator, pView);
}
diff --git a/src/intel/vulkan/layers/anv_doom64.c b/src/intel/vulkan/layers/anv_doom64.c
new file mode 100644
index 00000000000..8fe0287c417
--- /dev/null
+++ b/src/intel/vulkan/layers/anv_doom64.c
@@ -0,0 +1,137 @@
+/*
+ * Copyright © 2022 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "util/set.h"
+#include "anv_private.h"
+#include "vk_common_entrypoints.h"
+
+/**
+ * The DOOM 64 rendering corruption is happening because the game always uses
+ * ```
+ * vkCmdPipelineBarrier(VK_IMAGE_LAYOUT_UNDEFINED ->
+ * VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL)
+ * vkCmdCopyBufferToImage(...)
+ * vkCmdPipelineBarrier(VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL ->
+ * VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL)
+ * ```
+ * when it wants to update its texture atlas image.
+ *
+ * According to spec, transitioning from VK_IMAGE_LAYOUT_UNDEFINED means
+ * that the current image content might be discarded, but the game relies
+ * on it being fully preserved.
+ *
+ * This work-around layer implements super-barebone layout tracking: it
+ * allows the first transition from VK_IMAGE_LAYOUT_UNDEFINED, but replaces
+ * oldLayout with VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL for each
+ * subsequent transition of that image.
+ *
+ * Gen12+ does not ambiguate CCS data on the transition from
+ * VK_IMAGE_LAYOUT_UNDEFINED, so it preserves all compressed information and
+ * this WA is not needed.
+ */
+
+void anv_doom64_CmdPipelineBarrier(
+ VkCommandBuffer commandBuffer,
+ VkPipelineStageFlags srcStageMask,
+ VkPipelineStageFlags dstStageMask,
+ VkDependencyFlags dependencyFlags,
+ uint32_t memoryBarrierCount,
+ const VkMemoryBarrier* pMemoryBarriers,
+ uint32_t bufferMemoryBarrierCount,
+ const VkBufferMemoryBarrier* pBufferMemoryBarriers,
+ uint32_t imageMemoryBarrierCount,
+ const VkImageMemoryBarrier* pImageMemoryBarriers)
+{
+ ANV_FROM_HANDLE(anv_cmd_buffer, command_buffer, commandBuffer);
+ assert(command_buffer && command_buffer->device);
+
+ VkImageMemoryBarrier fixed_barrier;
+ struct set * defined_images =
+ command_buffer->device->workarounds.doom64_images;
+
+ if (defined_images &&
+ imageMemoryBarrierCount == 1 && pImageMemoryBarriers &&
+ pImageMemoryBarriers[0].oldLayout == VK_IMAGE_LAYOUT_UNDEFINED &&
+ pImageMemoryBarriers[0].newLayout == VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL) {
+ ANV_FROM_HANDLE(anv_image, image, pImageMemoryBarriers[0].image);
+
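+ /* First UNDEFINED transition of this image: just record it. Subsequent
+ * transitions are rewritten below so the existing contents are preserved.
+ */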
+ if (!_mesa_set_search(defined_images, image)) {
+ _mesa_set_add(defined_images, image);
+ } else {
+ memcpy(&fixed_barrier, pImageMemoryBarriers, sizeof(VkImageMemoryBarrier));
+
+ fixed_barrier.oldLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL;
+
+ pImageMemoryBarriers = (const VkImageMemoryBarrier*) &fixed_barrier;
+ }
+ }
+
+ vk_common_CmdPipelineBarrier(commandBuffer, srcStageMask, dstStageMask,
+ dependencyFlags, memoryBarrierCount,
+ pMemoryBarriers, bufferMemoryBarrierCount,
+ pBufferMemoryBarriers,
+ imageMemoryBarrierCount,
+ pImageMemoryBarriers);
+}
+
+VkResult anv_doom64_CreateImage(
+ VkDevice _device,
+ const VkImageCreateInfo* pCreateInfo,
+ const VkAllocationCallbacks* pAllocator,
+ VkImage* pImage)
+{
+ ANV_FROM_HANDLE(anv_device, device, _device);
+ assert(device);
+
+ if (!device->workarounds.doom64_images) {
+ device->workarounds.doom64_images = _mesa_pointer_set_create(NULL);
+
+ if (!device->workarounds.doom64_images) {
+ return VK_ERROR_OUT_OF_HOST_MEMORY;
+ }
+ }
+
+ return anv_CreateImage(_device, pCreateInfo, pAllocator, pImage);
+}
+
+void anv_doom64_DestroyImage(
+ VkDevice _device,
+ VkImage _image,
+ const VkAllocationCallbacks* pAllocator)
+{
+ ANV_FROM_HANDLE(anv_device, device, _device);
+ ANV_FROM_HANDLE(anv_image, image, _image);
+ assert(device);
+
+ struct set * defined_images = device->workarounds.doom64_images;
+
+ if (image && defined_images) {
+ _mesa_set_remove_key(defined_images, image);
+
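+ /* Tear down the tracking set once the last tracked image is destroyed. */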
+ if (!defined_images->entries) {
+ _mesa_set_destroy(defined_images, NULL);
+ device->workarounds.doom64_images = NULL;
+ }
+ }
+
+ anv_DestroyImage(_device, _image, pAllocator);
+}
diff --git a/src/intel/vulkan/layers/anv_hitman3.c b/src/intel/vulkan/layers/anv_hitman3.c
new file mode 100644
index 00000000000..a6add16d0c3
--- /dev/null
+++ b/src/intel/vulkan/layers/anv_hitman3.c
@@ -0,0 +1,41 @@
+/*
+ * Copyright © 2022 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "anv_private.h"
+
+VkResult anv_hitman3_CreateBufferView(
+ VkDevice _device,
+ const VkBufferViewCreateInfo* pCreateInfo,
+ const VkAllocationCallbacks* pAllocator,
+ VkBufferView* pView)
+{
+ ANV_FROM_HANDLE(anv_buffer, buffer, pCreateInfo->buffer);
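+ /* Hitman 3 creates R32G32B32_SFLOAT storage texel buffer views, a
+ * combination ANV does not support; reject the view creation up front.
+ */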
+ if (pCreateInfo->format == VK_FORMAT_R32G32B32_SFLOAT &&
+ (buffer->vk.usage & VK_BUFFER_USAGE_STORAGE_TEXEL_BUFFER_BIT)) {
+ ANV_FROM_HANDLE(anv_device, device, _device);
+ return vk_errorf(device, VK_ERROR_UNKNOWN,
+ "invalid image format requested for storage");
+ }
+
+ return anv_CreateBufferView(_device, pCreateInfo, pAllocator, pView);
+}
diff --git a/src/intel/vulkan/layers/anv_rmv_layer.c b/src/intel/vulkan/layers/anv_rmv_layer.c
new file mode 100644
index 00000000000..2e36e5d4012
--- /dev/null
+++ b/src/intel/vulkan/layers/anv_rmv_layer.c
@@ -0,0 +1,122 @@
+/*
+ * Copyright © 2023 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "rmv/vk_rmv_common.h"
+#include "rmv/vk_rmv_tokens.h"
+#include "anv_private.h"
+#include "vk_common_entrypoints.h"
+
+VkResult anv_rmv_QueuePresentKHR(
+ VkQueue _queue,
+ const VkPresentInfoKHR* pPresentInfo)
+{
+ ANV_FROM_HANDLE(anv_queue, queue, _queue);
+ struct anv_device *device = queue->device;
+
+ VkResult res = anv_QueuePresentKHR(_queue, pPresentInfo);
+ if ((res != VK_SUCCESS && res != VK_SUBOPTIMAL_KHR) ||
+ !device->vk.memory_trace_data.is_enabled)
+ return res;
+
+ vk_rmv_log_misc_token(&device->vk, VK_RMV_MISC_EVENT_TYPE_PRESENT);
+
+ return VK_SUCCESS;
+}
+
+VkResult anv_rmv_FlushMappedMemoryRanges(
+ VkDevice _device,
+ uint32_t memoryRangeCount,
+ const VkMappedMemoryRange* pMemoryRanges)
+{
+ ANV_FROM_HANDLE(anv_device, device, _device);
+
+ VkResult res = anv_FlushMappedMemoryRanges(_device, memoryRangeCount, pMemoryRanges);
+ if (res != VK_SUCCESS || !device->vk.memory_trace_data.is_enabled)
+ return res;
+
+ vk_rmv_log_misc_token(&device->vk, VK_RMV_MISC_EVENT_TYPE_FLUSH_MAPPED_RANGE);
+
+ return VK_SUCCESS;
+}
+
+VkResult anv_rmv_InvalidateMappedMemoryRanges(
+ VkDevice _device,
+ uint32_t memoryRangeCount,
+ const VkMappedMemoryRange* pMemoryRanges)
+{
+ ANV_FROM_HANDLE(anv_device, device, _device);
+
+ VkResult res = anv_InvalidateMappedMemoryRanges(_device, memoryRangeCount, pMemoryRanges);
+ if (res != VK_SUCCESS || !device->vk.memory_trace_data.is_enabled)
+ return res;
+
+ vk_rmv_log_misc_token(&device->vk, VK_RMV_MISC_EVENT_TYPE_INVALIDATE_RANGES);
+
+ return VK_SUCCESS;
+}
+
+VkResult anv_rmv_SetDebugUtilsObjectNameEXT(
+ VkDevice _device,
+ const VkDebugUtilsObjectNameInfoEXT* pNameInfo)
+{
+ assert(pNameInfo->sType == VK_STRUCTURE_TYPE_DEBUG_UTILS_OBJECT_NAME_INFO_EXT);
+ ANV_FROM_HANDLE(anv_device, device, _device);
+
+ VkResult result = vk_common_SetDebugUtilsObjectNameEXT(_device, pNameInfo);
+ if (result != VK_SUCCESS || !device->vk.memory_trace_data.is_enabled)
+ return result;
+
+ switch (pNameInfo->objectType) {
+ /* only name object types we care about */
+ case VK_OBJECT_TYPE_BUFFER:
+ case VK_OBJECT_TYPE_DEVICE_MEMORY:
+ case VK_OBJECT_TYPE_IMAGE:
+ case VK_OBJECT_TYPE_EVENT:
+ case VK_OBJECT_TYPE_QUERY_POOL:
+ case VK_OBJECT_TYPE_DESCRIPTOR_POOL:
+ case VK_OBJECT_TYPE_PIPELINE:
+ break;
+ default:
+ return VK_SUCCESS;
+ }
+
+ size_t name_len = strlen(pNameInfo->pObjectName);
+ char *name_buf = malloc(name_len + 1);
+ if (!name_buf) {
+ /*
+ * Silently fail, so that applications may still continue if possible.
+ */
+ return VK_SUCCESS;
+ }
+ strcpy(name_buf, pNameInfo->pObjectName);
+
+ simple_mtx_lock(&device->vk.memory_trace_data.token_mtx);
+ struct vk_rmv_userdata_token token;
+ token.name = name_buf;
+ token.resource_id = vk_rmv_get_resource_id_locked(&device->vk, pNameInfo->objectHandle);
+
+ vk_rmv_emit_token(&device->vk.memory_trace_data, VK_RMV_TOKEN_TYPE_USERDATA, &token);
+ simple_mtx_unlock(&device->vk.memory_trace_data.token_mtx);
+
+ return VK_SUCCESS;
+}
diff --git a/src/intel/vulkan/meson.build b/src/intel/vulkan/meson.build
index 97423f0b025..8eecda92547 100644
--- a/src/intel/vulkan/meson.build
+++ b/src/intel/vulkan/meson.build
@@ -18,6 +18,15 @@
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
+inc_anv = include_directories('.')
+
+anv_flags = [
+ no_override_init_args,
+ sse2_args,
+]
+
+anv_cpp_flags = []
+
anv_entrypoints = custom_target(
'anv_entrypoints',
input : [vk_entrypoints_gen, vk_api_xml],
@@ -25,89 +34,169 @@ anv_entrypoints = custom_target(
command : [
prog_python, '@INPUT0@', '--xml', '@INPUT1@', '--proto', '--weak',
'--out-h', '@OUTPUT0@', '--out-c', '@OUTPUT1@', '--prefix', 'anv',
- '--device-prefix', 'gfx7', '--device-prefix', 'gfx75',
- '--device-prefix', 'gfx8', '--device-prefix', 'gfx9',
- '--device-prefix', 'gfx11', '--device-prefix', 'gfx12',
+ '--beta', with_vulkan_beta.to_string(),
+ '--device-prefix', 'gfx9',
+ '--device-prefix', 'gfx11',
+ '--device-prefix', 'gfx12',
'--device-prefix', 'gfx125',
+ '--device-prefix', 'gfx20',
+ '--device-prefix', 'anv_doom64',
+ '--device-prefix', 'anv_hitman3',
+ '--device-prefix', 'anv_android',
+ '--device-prefix', 'anv_rmv',
],
depend_files : vk_entrypoints_gen_depend_files,
)
+idep_anv_headers = declare_dependency(
+ sources : [anv_entrypoints[0]],
+ include_directories : inc_anv,
+)
+
+if with_intel_vk_rt
+ subdir('grl')
+ optional_libgrl = [libgrl]
+ anv_flags += '-DANV_SUPPORT_RT=1'
+else
+ idep_grl = null_dep
+ optional_libgrl = []
+ anv_flags += '-DANV_SUPPORT_RT=0'
+endif
+
intel_icd = custom_target(
'intel_icd',
input : [vk_icd_gen, vk_api_xml],
output : 'intel_icd.@0@.json'.format(host_machine.cpu()),
command : [
prog_python, '@INPUT0@',
- '--api-version', '1.2', '--xml', '@INPUT1@',
+ '--api-version', '1.3', '--xml', '@INPUT1@',
'--lib-path', join_paths(get_option('prefix'), get_option('libdir'),
'libvulkan_intel.so'),
'--out', '@OUTPUT@',
],
build_by_default : true,
install_dir : with_vulkan_icd_dir,
+ install_tag : 'runtime',
install : true,
)
+_dev_icdname = 'intel_devenv_icd.@0@.json'.format(host_machine.cpu())
+_dev_icd = custom_target(
+ 'intel_devenv_icd',
+ input : [vk_icd_gen, vk_api_xml],
+ output : _dev_icdname,
+ command : [
+ prog_python, '@INPUT0@',
+ '--api-version', '1.3', '--xml', '@INPUT1@',
+ '--lib-path', meson.current_build_dir() / 'libvulkan_intel.so',
+ '--out', '@OUTPUT@',
+ ],
+ build_by_default : true,
+)
+
+devenv.append('VK_DRIVER_FILES', _dev_icd.full_path())
+# Deprecated: replaced by VK_DRIVER_FILES above
+devenv.append('VK_ICD_FILENAMES', _dev_icd.full_path())
+
libanv_per_hw_ver_libs = []
anv_per_hw_ver_files = files(
'genX_blorp_exec.c',
'genX_cmd_buffer.c',
+ 'genX_cmd_compute.c',
+ 'genX_cmd_draw.c',
+ 'genX_cmd_draw_generated_flush.h',
+ 'genX_cmd_draw_generated_indirect.h',
+ 'genX_cmd_video.c',
+ 'genX_gfx_state.c',
'genX_gpu_memcpy.c',
+ 'genX_init_state.c',
+ 'genX_internal_kernels.c',
'genX_pipeline.c',
'genX_query.c',
- 'genX_state.c',
+ 'genX_simple_shader.c',
)
-foreach g : [['70', ['gfx7_cmd_buffer.c']], ['75', ['gfx7_cmd_buffer.c']],
- ['80', ['gfx8_cmd_buffer.c']], ['90', ['gfx8_cmd_buffer.c']],
- ['110', ['gfx8_cmd_buffer.c']], ['120', ['gfx8_cmd_buffer.c']],
- ['125', ['gfx8_cmd_buffer.c']]]
- _gfx_ver = g[0]
+if with_intel_vk_rt
+ anv_per_hw_ver_files += files('genX_acceleration_structure.c',)
+endif
+
+foreach _gfx_ver : ['90', '110', '120', '125', '200']
libanv_per_hw_ver_libs += static_library(
'anv_per_hw_ver@0@'.format(_gfx_ver),
- [anv_per_hw_ver_files, g[1], anv_entrypoints[0]],
+ [anv_per_hw_ver_files, anv_entrypoints[0]],
include_directories : [
- inc_include, inc_src, inc_mapi, inc_mesa, inc_gallium, inc_compiler, inc_intel, inc_vulkan_wsi,
- ],
- c_args : [
- no_override_init_args, c_sse2_args,
- '-DGFX_VERx10=@0@'.format(_gfx_ver),
+ inc_include, inc_src, inc_intel,
],
+ c_args : anv_flags + ['-DGFX_VERx10=@0@'.format(_gfx_ver)],
gnu_symbol_visibility : 'hidden',
dependencies : [
- dep_libdrm, dep_valgrind, idep_nir_headers, idep_genxml, idep_vulkan_util_headers,
+ dep_libdrm, dep_valgrind, idep_nir_headers, idep_genxml,
+ idep_vulkan_util_headers, idep_vulkan_wsi_headers,
+ idep_vulkan_runtime_headers, idep_intel_driver_ds_headers,
+ idep_grl, idep_intel_shaders, idep_intel_blorp,
],
)
endforeach
libanv_files = files(
- 'anv_acceleration_structure.c',
+ 'i915/anv_batch_chain.c',
+ 'i915/anv_batch_chain.h',
+ 'i915/anv_device.c',
+ 'i915/anv_device.h',
+ 'i915/anv_gem.c',
+ 'i915/anv_gem.h',
+ 'i915/anv_kmd_backend.c',
+ 'i915/anv_queue.c',
+ 'i915/anv_queue.h',
+ 'layers/anv_android_layer.c',
+ 'layers/anv_doom64.c',
+ 'layers/anv_hitman3.c',
+ 'layers/anv_rmv_layer.c',
+ 'xe/anv_batch_chain.c',
+ 'xe/anv_batch_chain.h',
+ 'xe/anv_kmd_backend.c',
+ 'xe/anv_device.c',
+ 'xe/anv_device.h',
+ 'xe/anv_queue.c',
+ 'xe/anv_queue.h',
'anv_allocator.c',
'anv_android.h',
+ 'anv_astc_emu.c',
'anv_batch_chain.c',
'anv_blorp.c',
+ 'anv_bo_sync.c',
'anv_cmd_buffer.c',
'anv_descriptor_set.c',
'anv_device.c',
'anv_formats.c',
'anv_genX.h',
'anv_image.c',
+ 'anv_internal_kernels.c',
+ 'anv_internal_kernels.h',
+ 'anv_kmd_backend.c',
+ 'anv_kmd_backend.h',
'anv_measure.c',
'anv_measure.h',
+ 'anv_mesh_perprim_wa.c',
'anv_nir.h',
- 'anv_nir_add_base_work_group_id.c',
'anv_nir_apply_pipeline_layout.c',
'anv_nir_compute_push_layout.c',
'anv_nir_lower_multiview.c',
+ 'anv_nir_lower_load_patch_vertices_in.c',
'anv_nir_lower_ubo_loads.c',
- 'anv_nir_lower_ycbcr_textures.c',
- 'anv_pass.c',
+ 'anv_nir_lower_resource_intel.c',
+ 'anv_nir_push_descriptor_analysis.c',
'anv_perf.c',
'anv_pipeline.c',
'anv_pipeline_cache.c',
'anv_private.h',
'anv_queue.c',
+ 'anv_rmv.c',
+ 'anv_rmv.h',
+ 'anv_sparse.c',
'anv_util.c',
+ 'anv_utrace.c',
+ 'anv_va.c',
+ 'anv_video.c',
'anv_wsi.c',
)
@@ -117,77 +206,70 @@ anv_deps = [
idep_genxml,
idep_nir_headers,
idep_vulkan_util_headers,
-]
-anv_flags = [
- no_override_init_args,
- c_sse2_args,
+ idep_vulkan_runtime_headers,
+ idep_vulkan_wsi_headers,
+ idep_intel_shaders,
+ idep_intel_blorp,
]
if with_platform_x11
anv_deps += dep_xcb_dri3
- anv_flags += [
- '-DVK_USE_PLATFORM_XCB_KHR',
- '-DVK_USE_PLATFORM_XLIB_KHR',
- ]
- libanv_files += files('anv_wsi_x11.c')
endif
if with_platform_wayland
anv_deps += dep_wayland_client
- anv_flags += '-DVK_USE_PLATFORM_WAYLAND_KHR'
- libanv_files += files('anv_wsi_wayland.c')
-endif
-
-if system_has_kms_drm and not with_platform_android
- anv_flags += '-DVK_USE_PLATFORM_DISPLAY_KHR'
- libanv_files += files('anv_wsi_display.c')
endif
if with_xlib_lease
anv_deps += [dep_xlib_xrandr]
- anv_flags += '-DVK_USE_PLATFORM_XLIB_XRANDR_EXT'
endif
if with_platform_android
- anv_flags += '-DVK_USE_PLATFORM_ANDROID_KHR'
+ anv_deps += idep_u_gralloc
libanv_files += files('anv_android.c')
else
libanv_files += files('anv_android_stubs.c')
endif
+anv_deps += idep_intel_driver_ds_headers
+
libanv_common = static_library(
'anv_common',
[
libanv_files, anv_entrypoints, sha1_h,
- gen_xml_pack,
+ gen_xml_pack, intel_float64_spv_h,
],
include_directories : [
- inc_include, inc_src, inc_mapi, inc_mesa, inc_gallium, inc_intel, inc_compiler,
- inc_vulkan_wsi, inc_util,
+ inc_include, inc_src, inc_intel,
+ inc_util,
],
c_args : anv_flags,
+ cpp_args : anv_cpp_flags,
gnu_symbol_visibility : 'hidden',
- dependencies : anv_deps,
+ dependencies : anv_deps
)
libvulkan_intel = shared_library(
'vulkan_intel',
[files('anv_gem.c'), anv_entrypoints[0]],
include_directories : [
- inc_include, inc_src, inc_mapi, inc_mesa, inc_gallium, inc_intel, inc_compiler, inc_vulkan_wsi,
+ inc_include, inc_src, inc_intel,
],
- link_whole : [libanv_common, libanv_per_hw_ver_libs],
+ link_whole : [libanv_common, libanv_per_hw_ver_libs] + optional_libgrl,
link_with : [
- libintel_compiler, libintel_dev, libisl, libblorp, libvulkan_wsi,
- libintel_perf,
+ libisl, libintel_perf,
],
dependencies : [
dep_thread, dep_dl, dep_m, anv_deps, idep_libintel_common,
- idep_nir, idep_genxml, idep_vulkan_util, idep_mesautil, idep_xmlconfig,
+ idep_nir, idep_genxml, idep_vulkan_util, idep_vulkan_wsi,
+ idep_vulkan_runtime, idep_mesautil, idep_xmlconfig,
+ idep_intel_driver_ds, idep_intel_dev, idep_intel_blorp,
+ idep_intel_compiler_brw, idep_intel_decoder_brw,
],
c_args : anv_flags,
gnu_symbol_visibility : 'hidden',
- link_args : [ld_args_build_id, ld_args_bsymbolic, ld_args_gc_sections],
+ link_args : [vulkan_icd_link_args, ld_args_build_id, ld_args_bsymbolic, ld_args_gc_sections],
+ link_depends : vulkan_icd_link_depends,
install : true,
)
@@ -209,37 +291,54 @@ if with_tests
'vulkan_intel_test',
[files('anv_gem_stubs.c'), anv_entrypoints[0]],
include_directories : [
- inc_include, inc_src, inc_mapi, inc_mesa, inc_gallium, inc_intel, inc_compiler, inc_vulkan_wsi,
+ inc_include, inc_src, inc_intel,
],
link_whole : libanv_common,
link_with : [
- libanv_per_hw_ver_libs, libintel_compiler, libintel_common, libintel_dev,
- libisl, libblorp, libvulkan_wsi, libintel_perf,
- ],
+ libanv_per_hw_ver_libs, libintel_common,
+ libisl, libintel_perf,
+ ] + optional_libgrl,
dependencies : [
dep_thread, dep_dl, dep_m, anv_deps,
- idep_nir, idep_vulkan_util, idep_mesautil,
+ idep_nir, idep_vulkan_util, idep_vulkan_wsi, idep_vulkan_runtime,
+ idep_mesautil, idep_intel_dev, idep_intel_shaders, idep_intel_blorp,
+ idep_intel_compiler_brw, idep_intel_decoder_brw,
],
c_args : anv_flags,
gnu_symbol_visibility : 'hidden',
)
- foreach t : ['block_pool_no_free', 'block_pool_grow_first',
- 'state_pool_no_free', 'state_pool_free_list_only',
- 'state_pool', 'state_pool_padding']
- test(
- 'anv_@0@'.format(t),
- executable(
- t,
- ['tests/@0@.c'.format(t), anv_entrypoints[0]],
- c_args : [ c_sse2_args ],
- link_with : libvulkan_intel_test,
- dependencies : [dep_libdrm, dep_thread, dep_m, dep_valgrind, idep_vulkan_util, ],
- include_directories : [
- inc_include, inc_src, inc_mapi, inc_mesa, inc_gallium, inc_intel, inc_compiler, inc_vulkan_wsi,
- ],
- ),
- suite : ['intel'],
- )
- endforeach
+ files_anv_tests = files(
+ 'tests/anv_tests.cpp',
+
+ 'tests/state_pool.c',
+ 'tests/state_pool_free_list_only.c',
+ 'tests/state_pool_max_size.c',
+ 'tests/state_pool_no_free.c',
+ 'tests/state_pool_padding.c',
+ 'tests/block_pool_no_free.c',
+ 'tests/block_pool_grow_first.c',
+ 'tests/block_pool_max_size.c',
+ )
+
+ test(
+ 'anv_tests',
+ executable(
+ 'anv_tests',
+ [files_anv_tests, anv_entrypoints[0]],
+ c_args : [ sse2_args ],
+ link_with : libvulkan_intel_test,
+ dependencies : [
+ idep_gtest, dep_libdrm, dep_thread, dep_m, dep_valgrind,
+ idep_vulkan_util, idep_vulkan_wsi_headers,
+ idep_vulkan_runtime, idep_intel_driver_ds, idep_intel_dev,
+ idep_intel_shaders,
+ ],
+ include_directories : [
+ inc_include, inc_src, inc_intel,
+ ],
+ ),
+ suite : ['intel'],
+ protocol : 'gtest',
+ )
endif
diff --git a/src/intel/vulkan/tests/anv_tests.cpp b/src/intel/vulkan/tests/anv_tests.cpp
new file mode 100644
index 00000000000..09be512f81e
--- /dev/null
+++ b/src/intel/vulkan/tests/anv_tests.cpp
@@ -0,0 +1,25 @@
+/*
+ * Copyright © 2023 Intel Corporation
+ * SPDX-License-Identifier: MIT
+ */
+
+#include <gtest/gtest.h>
+
+#include "test_common.h"
+
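+// Wraps each C test entry point in a gtest TEST() case.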
+#define ANV_C_TEST(S, N, C) extern "C" void C(void); TEST(S, N) { C(); }
+
+ANV_C_TEST(StatePool, Regular, state_pool_test);
+ANV_C_TEST(StatePool, FreeListOnly, state_pool_free_list_only_test);
+ANV_C_TEST(StatePool, MaxSizeOverLimit, state_pool_max_size_over_limit);
+ANV_C_TEST(StatePool, MaxSizeWithinLimit, state_pool_max_size_within_limit);
+ANV_C_TEST(StatePool, NoFree, state_pool_no_free_test);
+ANV_C_TEST(StatePool, Padding, state_pool_padding_test);
+
+ANV_C_TEST(BlockPool, NoFree, block_pool_no_free_test);
+ANV_C_TEST(BlockPool, GrowFirst, block_pool_grow_first_test);
+ANV_C_TEST(BlockPool, MaxSize, block_pool_max_size);
+
+extern "C" void FAIL_IN_GTEST(const char *file_path, unsigned line_number, const char *msg) {
+ GTEST_FAIL_AT(file_path, line_number) << msg;
+}
diff --git a/src/intel/vulkan/tests/block_pool_grow_first.c b/src/intel/vulkan/tests/block_pool_grow_first.c
index e50f65c8d68..1c745360ea8 100644
--- a/src/intel/vulkan/tests/block_pool_grow_first.c
+++ b/src/intel/vulkan/tests/block_pool_grow_first.c
@@ -24,14 +24,12 @@
#include "anv_private.h"
#include "test_common.h"
-int main(void)
+void block_pool_grow_first_test(void);
+
+void block_pool_grow_first_test(void)
{
- struct anv_physical_device physical_device = {
- .use_softpin = true,
- };
- struct anv_device device = {
- .physical = &physical_device,
- };
+ struct anv_physical_device physical_device = {};
+ struct anv_device device = {};
struct anv_block_pool pool;
/* Create a pool with initial size smaller than the block allocated, so
@@ -39,14 +37,20 @@ int main(void)
*/
const uint32_t block_size = 16 * 1024;
const uint32_t initial_size = block_size / 2;
+ const uint32_t _1Gb = 1024 * 1024 * 1024;
+ test_device_info_init(&physical_device.info);
+ anv_device_set_physical(&device, &physical_device);
+ device.kmd_backend = anv_kmd_backend_get(INTEL_KMD_TYPE_STUB);
pthread_mutex_init(&device.mutex, NULL);
- anv_bo_cache_init(&device.bo_cache);
- anv_block_pool_init(&pool, &device, "test", 4096, initial_size);
+ anv_bo_cache_init(&device.bo_cache, &device);
+ anv_block_pool_init(&pool, &device, "test", 4096, initial_size, _1Gb);
ASSERT(pool.size == initial_size);
uint32_t padding;
- int32_t offset = anv_block_pool_alloc(&pool, block_size, &padding);
+ int64_t offset;
+ VkResult result = anv_block_pool_alloc(&pool, block_size, &offset, &padding);
+ ASSERT(result == VK_SUCCESS);
/* Pool will have grown at least space to fit the new allocation. */
ASSERT(pool.size > initial_size);
@@ -63,4 +67,6 @@ int main(void)
memset(map, 22, block_size);
anv_block_pool_finish(&pool);
+ anv_bo_cache_finish(&device.bo_cache);
+ pthread_mutex_destroy(&device.mutex);
}
diff --git a/src/intel/vulkan/tests/block_pool_max_size.c b/src/intel/vulkan/tests/block_pool_max_size.c
new file mode 100644
index 00000000000..b9f6620cbaf
--- /dev/null
+++ b/src/intel/vulkan/tests/block_pool_max_size.c
@@ -0,0 +1,73 @@
+/*
+ * Copyright © 2023 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "anv_private.h"
+#include "test_common.h"
+
+void block_pool_max_size(void);
+
+void block_pool_max_size(void)
+{
+ struct anv_physical_device physical_device = {};
+ struct anv_device device = {};
+ struct anv_block_pool pool;
+
+ const uint32_t block_size = 16 * 1024;
+ const uint32_t initial_size = block_size;
+ const uint32_t _1Mb = 1024 * 1024;
+
+ test_device_info_init(&physical_device.info);
+ anv_device_set_physical(&device, &physical_device);
+ device.kmd_backend = anv_kmd_backend_get(INTEL_KMD_TYPE_STUB);
+ pthread_mutex_init(&device.mutex, NULL);
+ anv_bo_cache_init(&device.bo_cache, &device);
+ anv_block_pool_init(&pool, &device, "test", 4096, initial_size, _1Mb);
+ ASSERT(pool.size == initial_size);
+
+ for (uint32_t i = 0; i < _1Mb / block_size; i++) {
+ uint32_t padding;
+ int64_t offset;
+
+ VkResult result = anv_block_pool_alloc(&pool, block_size, &offset, &padding);
+ ASSERT(result == VK_SUCCESS);
+
+ /* The pool must not grow past its configured maximum size. */
+ ASSERT(pool.size <= _1Mb);
+
+ /* Use the memory to ensure it is valid. */
+ void *map = anv_block_pool_map(&pool, offset, block_size);
+ memset(map, 22, block_size);
+ }
+
+ {
+ uint32_t padding;
+ int64_t offset;
+
+ VkResult result = anv_block_pool_alloc(&pool, block_size, &offset, &padding);
+ ASSERT(result == VK_ERROR_OUT_OF_DEVICE_MEMORY);
+ }
+
+ anv_block_pool_finish(&pool);
+ anv_bo_cache_finish(&device.bo_cache);
+ pthread_mutex_destroy(&device.mutex);
+}
diff --git a/src/intel/vulkan/tests/block_pool_no_free.c b/src/intel/vulkan/tests/block_pool_no_free.c
index 37030bdd7a3..7c9c8951361 100644
--- a/src/intel/vulkan/tests/block_pool_no_free.c
+++ b/src/intel/vulkan/tests/block_pool_no_free.c
@@ -30,12 +30,11 @@
#define BLOCKS_PER_THREAD 1024
#define NUM_RUNS 64
-struct job {
+static struct job {
pthread_t thread;
unsigned id;
struct anv_block_pool *pool;
int32_t blocks[BLOCKS_PER_THREAD];
- int32_t back_blocks[BLOCKS_PER_THREAD];
} jobs[NUM_THREADS];
@@ -44,30 +43,24 @@ static void *alloc_blocks(void *_job)
struct job *job = _job;
uint32_t job_id = job - jobs;
uint32_t block_size = 16 * ((job_id % 4) + 1);
- int32_t block, *data;
+ int64_t block;
+ int32_t *data;
for (unsigned i = 0; i < BLOCKS_PER_THREAD; i++) {
- block = anv_block_pool_alloc(job->pool, block_size, NULL);
+ UNUSED uint32_t padding;
+ VkResult result = anv_block_pool_alloc(job->pool, block_size,
+ &block, &padding);
+ ASSERT(result == VK_SUCCESS);
data = anv_block_pool_map(job->pool, block, block_size);
*data = block;
ASSERT(block >= 0);
job->blocks[i] = block;
-
- block = anv_block_pool_alloc_back(job->pool, block_size);
- data = anv_block_pool_map(job->pool, block, block_size);
- *data = block;
- ASSERT(block < 0);
- job->back_blocks[i] = -block;
}
for (unsigned i = 0; i < BLOCKS_PER_THREAD; i++) {
block = job->blocks[i];
data = anv_block_pool_map(job->pool, block, block_size);
ASSERT(*data == block);
-
- block = -job->back_blocks[i];
- data = anv_block_pool_map(job->pool, block, block_size);
- ASSERT(*data == block);
}
return NULL;
@@ -110,15 +103,17 @@ static void validate_monotonic(int32_t **blocks)
static void run_test()
{
- struct anv_physical_device physical_device = { };
- struct anv_device device = {
- .physical = &physical_device,
- };
+ struct anv_physical_device physical_device = {};
+ struct anv_device device = {};
struct anv_block_pool pool;
+ const uint32_t _1Gb = 1024 * 1024 * 1024;
+ test_device_info_init(&physical_device.info);
+ anv_device_set_physical(&device, &physical_device);
+ device.kmd_backend = anv_kmd_backend_get(INTEL_KMD_TYPE_STUB);
pthread_mutex_init(&device.mutex, NULL);
- anv_bo_cache_init(&device.bo_cache);
- anv_block_pool_init(&pool, &device, "test", 4096, 4096);
+ anv_bo_cache_init(&device.bo_cache, &device);
+ anv_block_pool_init(&pool, &device, "test", 4096, 4096, _1Gb);
for (unsigned i = 0; i < NUM_THREADS; i++) {
jobs[i].pool = &pool;
@@ -135,16 +130,14 @@ static void run_test()
block_ptrs[i] = jobs[i].blocks;
validate_monotonic(block_ptrs);
- /* Validate that the back block allocations were monotonic */
- for (unsigned i = 0; i < NUM_THREADS; i++)
- block_ptrs[i] = jobs[i].back_blocks;
- validate_monotonic(block_ptrs);
-
anv_block_pool_finish(&pool);
+ anv_bo_cache_finish(&device.bo_cache);
pthread_mutex_destroy(&device.mutex);
}
-int main(void)
+void block_pool_no_free_test(void);
+
+void block_pool_no_free_test(void)
{
for (unsigned i = 0; i < NUM_RUNS; i++)
run_test();
diff --git a/src/intel/vulkan/tests/state_pool.c b/src/intel/vulkan/tests/state_pool.c
index 2f54efe783c..20eb2a34750 100644
--- a/src/intel/vulkan/tests/state_pool.c
+++ b/src/intel/vulkan/tests/state_pool.c
@@ -26,34 +26,45 @@
#include "anv_private.h"
#include "test_common.h"
-#define NUM_THREADS 8
-#define STATES_PER_THREAD_LOG2 10
-#define STATES_PER_THREAD (1 << STATES_PER_THREAD_LOG2)
-#define NUM_RUNS 64
-
#include "state_pool_test_helper.h"
-int main(void)
+void state_pool_test(void);
+
+void state_pool_test(void)
{
+ const unsigned num_threads = 8;
+ const unsigned states_per_thread = 1 << 10;
+
struct anv_physical_device physical_device = { };
- struct anv_device device = {
- .physical = &physical_device,
- };
+ struct anv_device device = {};
struct anv_state_pool state_pool;
+ test_device_info_init(&physical_device.info);
+ anv_device_set_physical(&device, &physical_device);
+ device.kmd_backend = anv_kmd_backend_get(INTEL_KMD_TYPE_STUB);
pthread_mutex_init(&device.mutex, NULL);
- anv_bo_cache_init(&device.bo_cache);
+ anv_bo_cache_init(&device.bo_cache, &device);
- for (unsigned i = 0; i < NUM_RUNS; i++) {
- anv_state_pool_init(&state_pool, &device, "test", 4096, 0, 256);
+ const unsigned num_runs = 64;
+ const uint32_t _1Gb = 1024 * 1024 * 1024;
+ for (unsigned i = 0; i < num_runs; i++) {
+ anv_state_pool_init(&state_pool, &device,
+ &(struct anv_state_pool_params) {
+ .name = "test",
+ .base_address = 4096,
+ .start_offset = 0,
+ .block_size = 256,
+ .max_size = _1Gb,
+ });
/* Grab one so a zero offset is impossible */
anv_state_pool_alloc(&state_pool, 16, 16);
- run_state_pool_test(&state_pool);
+ run_state_pool_test(&state_pool, num_threads, states_per_thread);
anv_state_pool_finish(&state_pool);
}
+ anv_bo_cache_finish(&device.bo_cache);
pthread_mutex_destroy(&device.mutex);
}
diff --git a/src/intel/vulkan/tests/state_pool_free_list_only.c b/src/intel/vulkan/tests/state_pool_free_list_only.c
index 193169867c1..d64a8b8f827 100644
--- a/src/intel/vulkan/tests/state_pool_free_list_only.c
+++ b/src/intel/vulkan/tests/state_pool_free_list_only.c
@@ -26,23 +26,33 @@
#include "anv_private.h"
#include "test_common.h"
-#define NUM_THREADS 8
-#define STATES_PER_THREAD_LOG2 12
-#define STATES_PER_THREAD (1 << STATES_PER_THREAD_LOG2)
-
#include "state_pool_test_helper.h"
-int main(void)
+void state_pool_free_list_only_test(void);
+
+void state_pool_free_list_only_test(void)
{
+ const unsigned num_threads = 8;
+ const unsigned states_per_thread = 1 << 12;
+ const uint32_t _1Gb = 1024 * 1024 * 1024;
+
struct anv_physical_device physical_device = { };
- struct anv_device device = {
- .physical = &physical_device,
- };
+ struct anv_device device = {};
struct anv_state_pool state_pool;
+ test_device_info_init(&physical_device.info);
+ anv_device_set_physical(&device, &physical_device);
+ device.kmd_backend = anv_kmd_backend_get(INTEL_KMD_TYPE_STUB);
pthread_mutex_init(&device.mutex, NULL);
- anv_bo_cache_init(&device.bo_cache);
- anv_state_pool_init(&state_pool, &device, "test", 4096, 0, 4096);
+ anv_bo_cache_init(&device.bo_cache, &device);
+ anv_state_pool_init(&state_pool, &device,
+ &(struct anv_state_pool_params) {
+ .name = "test",
+ .base_address = 4096,
+ .start_offset = 0,
+ .block_size = 4096,
+ .max_size = _1Gb,
+ });
/* Grab one so a zero offset is impossible */
anv_state_pool_alloc(&state_pool, 16, 16);
@@ -51,18 +61,19 @@ int main(void)
* actually ever resize anything.
*/
{
- struct anv_state states[NUM_THREADS * STATES_PER_THREAD];
- for (unsigned i = 0; i < NUM_THREADS * STATES_PER_THREAD; i++) {
+ struct anv_state states[num_threads * states_per_thread];
+ for (unsigned i = 0; i < ARRAY_SIZE(states); i++) {
states[i] = anv_state_pool_alloc(&state_pool, 16, 16);
ASSERT(states[i].offset != 0);
}
- for (unsigned i = 0; i < NUM_THREADS * STATES_PER_THREAD; i++)
+ for (unsigned i = 0; i < ARRAY_SIZE(states); i++)
anv_state_pool_free(&state_pool, states[i]);
}
- run_state_pool_test(&state_pool);
+ run_state_pool_test(&state_pool, num_threads, states_per_thread);
anv_state_pool_finish(&state_pool);
+ anv_bo_cache_finish(&device.bo_cache);
pthread_mutex_destroy(&device.mutex);
}
diff --git a/src/intel/vulkan/tests/state_pool_max_size.c b/src/intel/vulkan/tests/state_pool_max_size.c
new file mode 100644
index 00000000000..4b7cb962b4e
--- /dev/null
+++ b/src/intel/vulkan/tests/state_pool_max_size.c
@@ -0,0 +1,131 @@
+/*
+ * Copyright © 2023 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <pthread.h>
+
+#include "anv_private.h"
+#include "test_common.h"
+
+#define NUM_THREADS 16
+#define STATES_PER_THREAD 1024
+#define NUM_RUNS 1
+
+static struct job {
+ pthread_t thread;
+ uint32_t state_size;
+ uint32_t state_alignment;
+ struct anv_state_pool *pool;
+ struct anv_state states[STATES_PER_THREAD];
+} jobs[NUM_THREADS];
+
+static pthread_barrier_t barrier;
+
+static void *alloc_states(void *_job)
+{
+ struct job *job = _job;
+
+ pthread_barrier_wait(&barrier);
+
+ for (unsigned i = 0; i < STATES_PER_THREAD; i++) {
+ struct anv_state state = anv_state_pool_alloc(job->pool,
+ job->state_size,
+ job->state_alignment);
+ job->states[i] = state;
+ }
+
+ return NULL;
+}
+
+static void run_test(uint32_t state_size,
+ uint32_t state_alignment,
+ uint32_t block_size,
+ uint32_t pool_max_size)
+{
+ struct anv_physical_device physical_device = { };
+ struct anv_device device = {};
+ struct anv_state_pool state_pool;
+
+ test_device_info_init(&physical_device.info);
+ anv_device_set_physical(&device, &physical_device);
+ device.kmd_backend = anv_kmd_backend_get(INTEL_KMD_TYPE_STUB);
+ pthread_mutex_init(&device.mutex, NULL);
+ anv_bo_cache_init(&device.bo_cache, &device);
+ anv_state_pool_init(&state_pool, &device,
+ &(struct anv_state_pool_params) {
+ .name = "test",
+ .base_address = 4096,
+ .start_offset = 0,
+ .block_size = block_size,
+ .max_size = pool_max_size,
+ });
+
+ pthread_barrier_init(&barrier, NULL, NUM_THREADS);
+
+ for (unsigned i = 0; i < ARRAY_SIZE(jobs); i++) {
+ jobs[i].state_size = state_size;
+ jobs[i].state_alignment = state_alignment;
+ jobs[i].pool = &state_pool;
+ pthread_create(&jobs[i].thread, NULL, alloc_states, &jobs[i]);
+ }
+
+ for (unsigned i = 0; i < ARRAY_SIZE(jobs); i++)
+ pthread_join(jobs[i].thread, NULL);
+
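+ /* The check below treats each state as consuming one block: the pool holds
+ * at most pool_max_size / block_size blocks and any requests beyond that
+ * must fail. In the over-limit run this is (16 * 16 * 1024) / 64 = 4096
+ * blocks for 16 * 1024 = 16384 requests, i.e. 12288 expected failures.
+ */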
+ const uint32_t expected_allocation_fails =
+ (NUM_THREADS * STATES_PER_THREAD * block_size) > pool_max_size ?
+ ((NUM_THREADS * STATES_PER_THREAD) - (pool_max_size / block_size)) : 0;
+ uint32_t allocation_fails = 0;
+ for (unsigned j = 0; j < ARRAY_SIZE(jobs); j++) {
+ int64_t last_state_offset = -1;
+ for (unsigned s = 0; s < ARRAY_SIZE(jobs[j].states); s++) {
+ if (jobs[j].states[s].alloc_size) {
+ ASSERT(last_state_offset < jobs[j].states[s].offset);
+ last_state_offset = jobs[j].states[s].offset;
+ } else {
+ allocation_fails++;
+ }
+ }
+ }
+
+ ASSERT(allocation_fails == expected_allocation_fails);
+
+ anv_state_pool_finish(&state_pool);
+ anv_bo_cache_finish(&device.bo_cache);
+ pthread_mutex_destroy(&device.mutex);
+}
+
+void state_pool_max_size_within_limit(void);
+
+void state_pool_max_size_within_limit(void)
+{
+ for (unsigned i = 0; i < NUM_RUNS; i++)
+ run_test(16, 16, 64, 64 * NUM_THREADS * STATES_PER_THREAD);
+}
+
+void state_pool_max_size_over_limit(void);
+
+void state_pool_max_size_over_limit(void)
+{
+ for (unsigned i = 0; i < NUM_RUNS; i++)
+ run_test(16, 16, 64, 16 * NUM_THREADS * STATES_PER_THREAD);
+}
diff --git a/src/intel/vulkan/tests/state_pool_no_free.c b/src/intel/vulkan/tests/state_pool_no_free.c
index 4288e1a1b87..07df9b1847c 100644
--- a/src/intel/vulkan/tests/state_pool_no_free.c
+++ b/src/intel/vulkan/tests/state_pool_no_free.c
@@ -30,14 +30,14 @@
#define STATES_PER_THREAD 1024
#define NUM_RUNS 64
-struct job {
+static struct job {
pthread_t thread;
unsigned id;
struct anv_state_pool *pool;
uint32_t offsets[STATES_PER_THREAD];
} jobs[NUM_THREADS];
-pthread_barrier_t barrier;
+static pthread_barrier_t barrier;
static void *alloc_states(void *_job)
{
@@ -56,14 +56,23 @@ static void *alloc_states(void *_job)
static void run_test()
{
struct anv_physical_device physical_device = { };
- struct anv_device device = {
- .physical = &physical_device,
- };
+ struct anv_device device = {};
struct anv_state_pool state_pool;
+ const uint32_t _1Gb = 1024 * 1024 * 1024;
+ test_device_info_init(&physical_device.info);
+ anv_device_set_physical(&device, &physical_device);
+ device.kmd_backend = anv_kmd_backend_get(INTEL_KMD_TYPE_STUB);
pthread_mutex_init(&device.mutex, NULL);
- anv_bo_cache_init(&device.bo_cache);
- anv_state_pool_init(&state_pool, &device, "test", 4096, 0, 64);
+ anv_bo_cache_init(&device.bo_cache, &device);
+ anv_state_pool_init(&state_pool, &device,
+ &(struct anv_state_pool_params) {
+ .name = "test",
+ .base_address = 4096,
+ .start_offset = 0,
+ .block_size = 64,
+ .max_size = _1Gb,
+ });
pthread_barrier_init(&barrier, NULL, NUM_THREADS);
@@ -109,10 +118,13 @@ static void run_test()
}
anv_state_pool_finish(&state_pool);
+ anv_bo_cache_finish(&device.bo_cache);
pthread_mutex_destroy(&device.mutex);
}
-int main(void)
+void state_pool_no_free_test(void);
+
+void state_pool_no_free_test(void)
{
for (unsigned i = 0; i < NUM_RUNS; i++)
run_test();
diff --git a/src/intel/vulkan/tests/state_pool_padding.c b/src/intel/vulkan/tests/state_pool_padding.c
index 70fb773b5b1..b9fa15f11a3 100644
--- a/src/intel/vulkan/tests/state_pool_padding.c
+++ b/src/intel/vulkan/tests/state_pool_padding.c
@@ -24,19 +24,28 @@
#include "anv_private.h"
#include "test_common.h"
-int main(void)
+void state_pool_padding_test(void);
+
+void state_pool_padding_test(void)
{
- struct anv_physical_device physical_device = {
- .use_softpin = true,
- };
- struct anv_device device = {
- .physical = &physical_device,
- };
+ struct anv_physical_device physical_device = {};
+ struct anv_device device = {};
struct anv_state_pool state_pool;
+ const uint32_t _1Gb = 1024 * 1024 * 1024;
+ test_device_info_init(&physical_device.info);
+ anv_device_set_physical(&device, &physical_device);
+ device.kmd_backend = anv_kmd_backend_get(INTEL_KMD_TYPE_STUB);
pthread_mutex_init(&device.mutex, NULL);
- anv_bo_cache_init(&device.bo_cache);
- anv_state_pool_init(&state_pool, &device, "test", 4096, 0, 4096);
+ anv_bo_cache_init(&device.bo_cache, &device);
+ anv_state_pool_init(&state_pool, &device,
+ &(struct anv_state_pool_params) {
+ .name = "test",
+ .base_address = 4096,
+ .start_offset = 0,
+ .block_size = 4096,
+ .max_size = _1Gb,
+ });
/* Get the size of the underlying block_pool */
struct anv_block_pool *bp = &state_pool.block_pool;
@@ -75,4 +84,6 @@ int main(void)
ASSERT(state.offset == pool_size);
anv_state_pool_finish(&state_pool);
+ anv_bo_cache_finish(&device.bo_cache);
+ pthread_mutex_destroy(&device.mutex);
}
diff --git a/src/intel/vulkan/tests/state_pool_test_helper.h b/src/intel/vulkan/tests/state_pool_test_helper.h
index f22a28ecc6f..de6a363efe1 100644
--- a/src/intel/vulkan/tests/state_pool_test_helper.h
+++ b/src/intel/vulkan/tests/state_pool_test_helper.h
@@ -23,49 +23,70 @@
#include <pthread.h>
+#include "util/u_math.h"
+
struct job {
- struct anv_state_pool *pool;
+ struct state_pool_test_context *ctx;
unsigned id;
pthread_t thread;
-} jobs[NUM_THREADS];
+};
+
+struct state_pool_test_context {
+ struct anv_state_pool *pool;
+ unsigned states_per_thread;
+ pthread_barrier_t barrier;
-pthread_barrier_t barrier;
+ struct job *jobs;
+};
static void *alloc_states(void *void_job)
{
struct job *job = void_job;
+ struct state_pool_test_context *ctx = job->ctx;
- const unsigned chunk_size = 1 << (job->id % STATES_PER_THREAD_LOG2);
- const unsigned num_chunks = STATES_PER_THREAD / chunk_size;
+ const unsigned states_per_thread_log2 = util_logbase2(ctx->states_per_thread);
+ const unsigned chunk_size = 1 << (job->id % states_per_thread_log2);
+ const unsigned num_chunks = ctx->states_per_thread / chunk_size;
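+ /* Each thread allocates and frees its states in power-of-two sized chunks
+ * derived from its id, e.g. with states_per_thread = 1024 (log2 = 10) and
+ * id = 3, chunk_size = 8 and num_chunks = 128.
+ */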
struct anv_state states[chunk_size];
- pthread_barrier_wait(&barrier);
+ pthread_barrier_wait(&ctx->barrier);
for (unsigned c = 0; c < num_chunks; c++) {
for (unsigned i = 0; i < chunk_size; i++) {
- states[i] = anv_state_pool_alloc(job->pool, 16, 16);
+ states[i] = anv_state_pool_alloc(ctx->pool, 16, 16);
memset(states[i].map, 139, 16);
ASSERT(states[i].offset != 0);
}
for (unsigned i = 0; i < chunk_size; i++)
- anv_state_pool_free(job->pool, states[i]);
+ anv_state_pool_free(ctx->pool, states[i]);
}
return NULL;
}
-static void run_state_pool_test(struct anv_state_pool *state_pool)
+static void run_state_pool_test(struct anv_state_pool *state_pool, unsigned num_threads,
+ unsigned states_per_thread)
{
- pthread_barrier_init(&barrier, NULL, NUM_THREADS);
+ struct state_pool_test_context ctx = {
+ .pool = state_pool,
+ .states_per_thread = states_per_thread,
+ .jobs = calloc(num_threads, sizeof(struct job)),
+ };
+ pthread_barrier_init(&ctx.barrier, NULL, num_threads);
+
+ for (unsigned i = 0; i < num_threads; i++) {
+ struct job *job = &ctx.jobs[i];
+ job->ctx = &ctx;
+ job->id = i;
+ pthread_create(&job->thread, NULL, alloc_states, job);
+ }
- for (unsigned i = 0; i < NUM_THREADS; i++) {
- jobs[i].pool = state_pool;
- jobs[i].id = i;
- pthread_create(&jobs[i].thread, NULL, alloc_states, &jobs[i]);
+ for (unsigned i = 0; i < num_threads; i++) {
+ struct job *job = &ctx.jobs[i];
+ pthread_join(job->thread, NULL);
}
- for (unsigned i = 0; i < NUM_THREADS; i++)
- pthread_join(jobs[i].thread, NULL);
+ free(ctx.jobs);
}
diff --git a/src/intel/vulkan/tests/test_common.h b/src/intel/vulkan/tests/test_common.h
index 3f883e3bdcd..eea5b5ac82f 100644
--- a/src/intel/vulkan/tests/test_common.h
+++ b/src/intel/vulkan/tests/test_common.h
@@ -21,14 +21,27 @@
* IN THE SOFTWARE.
*/
-#include <stdio.h>
-#include <stdlib.h>
+#include "dev/intel_device_info.h"
-#define ASSERT(cond) \
- do { \
- if (!(cond)) { \
- fprintf(stderr, "%s:%d: Test assertion `%s` failed.\n", \
- __FILE__, __LINE__, # cond); \
- abort(); \
- } \
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define ASSERT(cond) \
+ do { \
+ if (!(cond)) { \
+ FAIL_IN_GTEST(__FILE__, __LINE__, "Test assertion `" # cond \
+ "` failed."); \
+ } \
} while (false)
+
+static inline void test_device_info_init(struct intel_device_info *info)
+{
+ info->mem_alignment = 4096;
+}
+
+void FAIL_IN_GTEST(const char *file_path, unsigned line_number, const char *msg);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
diff --git a/src/intel/vulkan/xe/anv_batch_chain.c b/src/intel/vulkan/xe/anv_batch_chain.c
new file mode 100644
index 00000000000..69a5ed69949
--- /dev/null
+++ b/src/intel/vulkan/xe/anv_batch_chain.c
@@ -0,0 +1,409 @@
+/*
+ * Copyright © 2023 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "xe/anv_batch_chain.h"
+
+#include "anv_private.h"
+#include "anv_measure.h"
+#include "common/intel_bind_timeline.h"
+
+#include "drm-uapi/xe_drm.h"
+
+VkResult
+xe_execute_simple_batch(struct anv_queue *queue,
+ struct anv_bo *batch_bo,
+ uint32_t batch_bo_size,
+ bool is_companion_rcs_batch)
+{
+ struct anv_device *device = queue->device;
+ uint32_t exec_queue_id = is_companion_rcs_batch ?
+ queue->companion_rcs_id :
+ queue->exec_queue_id;
+ struct drm_syncobj_create syncobj_create = {};
+ struct drm_syncobj_destroy syncobj_destroy = {};
+ struct drm_xe_sync syncs[2] = {};
+ VkResult result = VK_SUCCESS;
+
+ if (intel_ioctl(device->fd, DRM_IOCTL_SYNCOBJ_CREATE, &syncobj_create))
+ return vk_errorf(device, VK_ERROR_UNKNOWN, "Unable to create sync obj");
+
+ syncs[0].type = DRM_XE_SYNC_TYPE_SYNCOBJ;
+ syncs[0].flags = DRM_XE_SYNC_FLAG_SIGNAL;
+ syncs[0].handle = syncobj_create.handle;
+
+ /* vm bind sync */
+ syncs[1].type = DRM_XE_SYNC_TYPE_TIMELINE_SYNCOBJ;
+ syncs[1].handle = intel_bind_timeline_get_syncobj(&device->bind_timeline);
+ syncs[1].timeline_value = intel_bind_timeline_get_last_point(&device->bind_timeline);
+
+ struct drm_xe_exec exec = {
+ .exec_queue_id = exec_queue_id,
+ .num_batch_buffer = 1,
+ .address = batch_bo->offset,
+ .num_syncs = ARRAY_SIZE(syncs),
+ .syncs = (uintptr_t)syncs,
+ };
+
+ if (intel_ioctl(device->fd, DRM_IOCTL_XE_EXEC, &exec)) {
+ result = vk_device_set_lost(&device->vk, "XE_EXEC failed: %m");
+ goto exec_error;
+ }
+
+ struct drm_syncobj_wait wait = {
+ .handles = (uintptr_t)&syncobj_create.handle,
+ .timeout_nsec = INT64_MAX,
+ .count_handles = 1,
+ };
+ if (intel_ioctl(device->fd, DRM_IOCTL_SYNCOBJ_WAIT, &wait))
+ result = vk_device_set_lost(&device->vk, "DRM_IOCTL_SYNCOBJ_WAIT failed: %m");
+
+exec_error:
+ syncobj_destroy.handle = syncobj_create.handle;
+ intel_ioctl(device->fd, DRM_IOCTL_SYNCOBJ_DESTROY, &syncobj_destroy);
+
+ return result;
+}
+
+#define TYPE_SIGNAL true
+#define TYPE_WAIT false
+
+struct drm_xe_sync
+vk_sync_to_drm_xe_sync(struct vk_sync *vk_sync, uint64_t value, bool signal)
+{
+ const struct vk_drm_syncobj *syncobj = vk_sync_as_drm_syncobj(vk_sync);
+ assert(syncobj);
+
+ struct drm_xe_sync drm_sync = {
+ .type = value ? DRM_XE_SYNC_TYPE_TIMELINE_SYNCOBJ :
+ DRM_XE_SYNC_TYPE_SYNCOBJ,
+ .flags = signal ? DRM_XE_SYNC_FLAG_SIGNAL : 0,
+ .handle = syncobj->syncobj,
+ .timeline_value = value,
+ };
+
+ return drm_sync;
+}
+
+static VkResult
+xe_exec_process_syncs(struct anv_queue *queue,
+ uint32_t wait_count, const struct vk_sync_wait *waits,
+ uint32_t signal_count, const struct vk_sync_signal *signals,
+ uint32_t extra_sync_count, const struct drm_xe_sync *extra_syncs,
+ struct anv_utrace_submit *utrace_submit,
+ bool is_companion_rcs_queue,
+ struct drm_xe_sync **ret, uint32_t *ret_count)
+{
+ struct anv_device *device = queue->device;
+ /* Signal the utrace sync only if it doesn't have a batch. Otherwise it's
+ * the utrace batch itself that should signal its own sync.
+ */
+ const bool has_utrace_sync = utrace_submit &&
+ util_dynarray_num_elements(&utrace_submit->batch_bos, struct anv_bo *) == 0;
+ const uint32_t num_syncs = wait_count + signal_count + extra_sync_count +
+ (has_utrace_sync ? 1 : 0) +
+ ((queue->sync && !is_companion_rcs_queue) ? 1 : 0) +
+ 1 /* vm bind sync */;
+ struct drm_xe_sync *xe_syncs = vk_zalloc(&device->vk.alloc,
+ sizeof(*xe_syncs) * num_syncs, 8,
+ VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
+ if (!xe_syncs)
+ return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
+
+ uint32_t count = 0;
+
+ if (has_utrace_sync) {
+ xe_syncs[count++] = vk_sync_to_drm_xe_sync(utrace_submit->sync, 0,
+ TYPE_SIGNAL);
+ }
+
+ for (uint32_t i = 0; i < wait_count; i++) {
+ xe_syncs[count++] = vk_sync_to_drm_xe_sync(waits[i].sync,
+ waits[i].wait_value,
+ TYPE_WAIT);
+ }
+
+ for (uint32_t i = 0; i < signal_count; i++) {
+ xe_syncs[count++] = vk_sync_to_drm_xe_sync(signals[i].sync,
+ signals[i].signal_value,
+ TYPE_SIGNAL);
+ }
+
+ for (uint32_t i = 0; i < extra_sync_count; i++)
+ xe_syncs[count++] = extra_syncs[i];
+
+ if (queue->sync && !is_companion_rcs_queue)
+ xe_syncs[count++] = vk_sync_to_drm_xe_sync(queue->sync, 0, TYPE_SIGNAL);
+
+ /* vm bind sync */
+ xe_syncs[count++] = (struct drm_xe_sync) {
+ .type = DRM_XE_SYNC_TYPE_TIMELINE_SYNCOBJ,
+ .flags = 0 /* TYPE_WAIT */,
+ .handle = intel_bind_timeline_get_syncobj(&device->bind_timeline),
+ .timeline_value = intel_bind_timeline_get_last_point(&device->bind_timeline),
+ };
+
+ assert(count == num_syncs);
+ *ret = xe_syncs;
+ *ret_count = num_syncs;
+ return VK_SUCCESS;
+}
+
+static void
+xe_exec_print_debug(struct anv_queue *queue, uint32_t cmd_buffer_count,
+ struct anv_cmd_buffer **cmd_buffers, struct anv_query_pool *perf_query_pool,
+ uint32_t perf_query_pass, struct drm_xe_exec *exec)
+{
+ if (INTEL_DEBUG(DEBUG_SUBMIT))
+ fprintf(stderr, "Batch offset=0x%016"PRIx64" on queue %u\n",
+ (uint64_t)exec->address, queue->vk.index_in_family);
+
+ anv_cmd_buffer_exec_batch_debug(queue, cmd_buffer_count, cmd_buffers,
+ perf_query_pool, perf_query_pass);
+}
+
+VkResult
+xe_execute_trtt_batch(struct anv_sparse_submission *submit,
+ struct anv_trtt_batch_bo *trtt_bbo)
+{
+ struct anv_queue *queue = submit->queue;
+ struct anv_device *device = queue->device;
+ struct anv_trtt *trtt = &device->trtt;
+ VkResult result = VK_SUCCESS;
+
+ struct drm_xe_sync extra_sync = {
+ .type = DRM_XE_SYNC_TYPE_TIMELINE_SYNCOBJ,
+ .flags = DRM_XE_SYNC_FLAG_SIGNAL,
+ .handle = trtt->timeline_handle,
+ .timeline_value = trtt_bbo->timeline_val,
+ };
+
+ struct drm_xe_sync *xe_syncs = NULL;
+ uint32_t xe_syncs_count = 0;
+ result = xe_exec_process_syncs(queue, submit->wait_count, submit->waits,
+ submit->signal_count, submit->signals,
+ 1, &extra_sync,
+ NULL, /* utrace_submit */
+ false, /* is_companion_rcs_queue */
+ &xe_syncs, &xe_syncs_count);
+ if (result != VK_SUCCESS)
+ return result;
+
+ struct drm_xe_exec exec = {
+ .exec_queue_id = queue->exec_queue_id,
+ .num_syncs = xe_syncs_count,
+ .syncs = (uintptr_t)xe_syncs,
+ .address = trtt_bbo->bo->offset,
+ .num_batch_buffer = 1,
+ };
+
+ if (!device->info->no_hw) {
+ if (intel_ioctl(device->fd, DRM_IOCTL_XE_EXEC, &exec)) {
+ result = vk_device_set_lost(&device->vk, "XE_EXEC failed: %m");
+ goto out;
+ }
+ }
+
+ if (queue->sync) {
+ result = vk_sync_wait(&device->vk, queue->sync, 0,
+ VK_SYNC_WAIT_COMPLETE, UINT64_MAX);
+ }
+
+out:
+ vk_free(&device->vk.alloc, xe_syncs);
+ return result;
+}
+
+VkResult
+xe_queue_exec_utrace_locked(struct anv_queue *queue,
+ struct anv_utrace_submit *utrace_submit)
+{
+ struct anv_device *device = queue->device;
+ struct drm_xe_sync xe_syncs[2] = {};
+
+ xe_syncs[0] = vk_sync_to_drm_xe_sync(utrace_submit->sync, 0, TYPE_SIGNAL);
+
+ xe_syncs[1].type = DRM_XE_SYNC_TYPE_TIMELINE_SYNCOBJ;
+ xe_syncs[1].handle = intel_bind_timeline_get_syncobj(&device->bind_timeline);
+ xe_syncs[1].timeline_value = intel_bind_timeline_get_last_point(&device->bind_timeline);
+
+#ifdef SUPPORT_INTEL_INTEGRATED_GPUS
+ if (device->physical->memory.need_flush &&
+ anv_bo_needs_host_cache_flush(device->utrace_bo_pool.bo_alloc_flags)) {
+ util_dynarray_foreach(&utrace_submit->batch_bos, struct anv_bo *, bo)
+ intel_flush_range((*bo)->map, (*bo)->size);
+ }
+#endif
+
+ struct anv_bo *batch_bo =
+ *util_dynarray_element(&utrace_submit->batch_bos, struct anv_bo *, 0);
+ struct drm_xe_exec exec = {
+ .exec_queue_id = queue->exec_queue_id,
+ .num_batch_buffer = 1,
+ .syncs = (uintptr_t)xe_syncs,
+ .num_syncs = ARRAY_SIZE(xe_syncs),
+ .address = batch_bo->offset,
+ };
+ if (likely(!device->info->no_hw)) {
+ if (intel_ioctl(device->fd, DRM_IOCTL_XE_EXEC, &exec))
+ return vk_device_set_lost(&device->vk, "anv_xe_queue_exec_locked failed: %m");
+ }
+
+ return VK_SUCCESS;
+}
+
+static VkResult
+xe_companion_rcs_queue_exec_locked(struct anv_queue *queue,
+ struct anv_cmd_buffer *companion_rcs_cmd_buffer,
+ uint32_t wait_count,
+ const struct vk_sync_wait *waits)
+{
+ struct anv_device *device = queue->device;
+ VkResult result;
+
+ struct vk_sync_signal companion_sync = {
+ .sync = queue->companion_sync,
+ };
+ struct drm_xe_sync *xe_syncs = NULL;
+ uint32_t xe_syncs_count = 0;
+ result = xe_exec_process_syncs(queue,
+ wait_count, waits,
+ 1, &companion_sync,
+ 0, NULL, /* extra_syncs */
+ NULL /* utrace_submit */,
+ true /* is_companion_rcs_queue */,
+ &xe_syncs,
+ &xe_syncs_count);
+ if (result != VK_SUCCESS)
+ return result;
+
+ struct drm_xe_exec exec = {
+ .exec_queue_id = queue->companion_rcs_id,
+ .num_batch_buffer = 1,
+ .syncs = (uintptr_t)xe_syncs,
+ .num_syncs = xe_syncs_count,
+ };
+
+ struct anv_batch_bo *batch_bo =
+ list_first_entry(&companion_rcs_cmd_buffer->batch_bos,
+ struct anv_batch_bo, link);
+ exec.address = batch_bo->bo->offset;
+
+ anv_measure_submit(companion_rcs_cmd_buffer);
+ xe_exec_print_debug(queue, 1, &companion_rcs_cmd_buffer, NULL, 0, &exec);
+
+ if (!device->info->no_hw) {
+ if (intel_ioctl(device->fd, DRM_IOCTL_XE_EXEC, &exec))
+ result = vk_device_set_lost(&device->vk, "anv_xe_queue_exec_locked failed: %m");
+ }
+ vk_free(&device->vk.alloc, xe_syncs);
+
+ return result;
+}
+
+VkResult
+xe_queue_exec_locked(struct anv_queue *queue,
+ uint32_t wait_count,
+ const struct vk_sync_wait *waits,
+ uint32_t cmd_buffer_count,
+ struct anv_cmd_buffer **cmd_buffers,
+ uint32_t signal_count,
+ const struct vk_sync_signal *signals,
+ struct anv_query_pool *perf_query_pool,
+ uint32_t perf_query_pass,
+ struct anv_utrace_submit *utrace_submit)
+{
+ struct anv_device *device = queue->device;
+ VkResult result;
+
+ struct drm_xe_sync *xe_syncs = NULL;
+ uint32_t xe_syncs_count = 0;
+ result = xe_exec_process_syncs(queue, wait_count, waits,
+ signal_count, signals,
+ 0, NULL, /* extra_syncs */
+ utrace_submit,
+ false, /* is_companion_rcs_queue */
+ &xe_syncs, &xe_syncs_count);
+ if (result != VK_SUCCESS)
+ return result;
+
+ /* If we have no batch for utrace, just forget about it now. */
+ if (utrace_submit &&
+ util_dynarray_num_elements(&utrace_submit->batch_bos,
+ struct anv_bo *) == 0)
+ utrace_submit = NULL;
+
+ struct drm_xe_exec exec = {
+ .exec_queue_id = queue->exec_queue_id,
+ .num_batch_buffer = 1,
+ .syncs = (uintptr_t)xe_syncs,
+ .num_syncs = xe_syncs_count,
+ };
+
+ if (cmd_buffer_count) {
+ if (unlikely(device->physical->measure_device.config)) {
+ for (uint32_t i = 0; i < cmd_buffer_count; i++)
+ anv_measure_submit(cmd_buffers[i]);
+ }
+
+ anv_cmd_buffer_chain_command_buffers(cmd_buffers, cmd_buffer_count);
+
+#ifdef SUPPORT_INTEL_INTEGRATED_GPUS
+ if (device->physical->memory.need_flush &&
+ anv_bo_needs_host_cache_flush(device->batch_bo_pool.bo_alloc_flags))
+ anv_cmd_buffer_clflush(cmd_buffers, cmd_buffer_count);
+#endif
+
+ struct anv_cmd_buffer *first_cmd_buffer = cmd_buffers[0];
+ struct anv_batch_bo *first_batch_bo = list_first_entry(&first_cmd_buffer->batch_bos,
+ struct anv_batch_bo, link);
+ exec.address = first_batch_bo->bo->offset;
+ } else {
+ exec.address = device->trivial_batch_bo->offset;
+ }
+
+ xe_exec_print_debug(queue, cmd_buffer_count, cmd_buffers, perf_query_pool,
+ perf_query_pass, &exec);
+
+ /* TODO: add perfetto stuff when Xe supports it */
+
+ if (!device->info->no_hw) {
+ if (intel_ioctl(device->fd, DRM_IOCTL_XE_EXEC, &exec))
+ result = vk_device_set_lost(&device->vk, "anv_xe_queue_exec_locked failed: %m");
+ }
+ vk_free(&device->vk.alloc, xe_syncs);
+
+ if (cmd_buffer_count != 0 && cmd_buffers[0]->companion_rcs_cmd_buffer) {
+ /* not allowed to chain cmd_buffers with companion_rcs_cmd_buffer */
+ assert(cmd_buffer_count == 1);
+ result = xe_companion_rcs_queue_exec_locked(queue,
+ cmd_buffers[0]->companion_rcs_cmd_buffer,
+ wait_count, waits);
+ }
+
+ result = anv_queue_post_submit(queue, result);
+
+ if (result == VK_SUCCESS && utrace_submit)
+ result = xe_queue_exec_utrace_locked(queue, utrace_submit);
+
+ return result;
+}
diff --git a/src/intel/vulkan/xe/anv_batch_chain.h b/src/intel/vulkan/xe/anv_batch_chain.h
new file mode 100644
index 00000000000..9afd8f06b6a
--- /dev/null
+++ b/src/intel/vulkan/xe/anv_batch_chain.h
@@ -0,0 +1,65 @@
+/*
+ * Copyright © 2023 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#pragma once
+
+#include <stdint.h>
+
+#include "drm-uapi/xe_drm.h"
+#include "vulkan/vulkan_core.h"
+#include "vk_sync.h"
+
+struct anv_device;
+struct anv_queue;
+struct anv_bo;
+struct anv_cmd_buffer;
+struct anv_query_pool;
+struct anv_utrace_submit;
+struct anv_sparse_submission;
+struct anv_trtt_batch_bo;
+
+VkResult
+xe_execute_simple_batch(struct anv_queue *queue, struct anv_bo *batch_bo,
+ uint32_t batch_bo_size, bool is_companion_rcs_batch);
+VkResult
+xe_execute_trtt_batch(struct anv_sparse_submission *submit,
+ struct anv_trtt_batch_bo *trtt_bbo);
+
+VkResult
+xe_queue_exec_locked(struct anv_queue *queue,
+ uint32_t wait_count,
+ const struct vk_sync_wait *waits,
+ uint32_t cmd_buffer_count,
+ struct anv_cmd_buffer **cmd_buffers,
+ uint32_t signal_count,
+ const struct vk_sync_signal *signals,
+ struct anv_query_pool *perf_query_pool,
+ uint32_t perf_query_pass,
+ struct anv_utrace_submit *utrace_submit);
+
+VkResult
+xe_queue_exec_utrace_locked(struct anv_queue *queue,
+ struct anv_utrace_submit *utrace_submit);
+
+struct drm_xe_sync
+vk_sync_to_drm_xe_sync(struct vk_sync *vk_sync, uint64_t value, bool signal);
diff --git a/src/intel/vulkan/xe/anv_device.c b/src/intel/vulkan/xe/anv_device.c
new file mode 100644
index 00000000000..9eabea31f52
--- /dev/null
+++ b/src/intel/vulkan/xe/anv_device.c
@@ -0,0 +1,199 @@
+/*
+ * Copyright © 2023 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#include "xe/anv_device.h"
+#include "anv_private.h"
+
+#include "drm-uapi/gpu_scheduler.h"
+#include "drm-uapi/xe_drm.h"
+
+#include "common/xe/intel_device_query.h"
+
+bool anv_xe_device_destroy_vm(struct anv_device *device)
+{
+ struct drm_xe_vm_destroy destroy = {
+ .vm_id = device->vm_id,
+ };
+
+ intel_bind_timeline_finish(&device->bind_timeline, device->fd);
+
+ return intel_ioctl(device->fd, DRM_IOCTL_XE_VM_DESTROY, &destroy) == 0;
+}
+
+VkResult anv_xe_device_setup_vm(struct anv_device *device)
+{
+ struct drm_xe_vm_create create = {
+ .flags = DRM_XE_VM_CREATE_FLAG_SCRATCH_PAGE,
+ };
+ if (intel_ioctl(device->fd, DRM_IOCTL_XE_VM_CREATE, &create) != 0)
+ return vk_errorf(device, VK_ERROR_INITIALIZATION_FAILED,
+ "vm creation failed");
+
+ device->vm_id = create.vm_id;
+
+ if (!intel_bind_timeline_init(&device->bind_timeline, device->fd)) {
+ anv_xe_device_destroy_vm(device);
+ return vk_errorf(device, VK_ERROR_INITIALIZATION_FAILED,
+ "intel_bind_timeline_init failed");
+ }
+
+ return VK_SUCCESS;
+}
+
+static VkQueueGlobalPriorityKHR
+drm_sched_priority_to_vk_priority(enum drm_sched_priority drm_sched_priority)
+{
+ switch (drm_sched_priority) {
+ case DRM_SCHED_PRIORITY_MIN:
+ return VK_QUEUE_GLOBAL_PRIORITY_LOW_KHR;
+ case DRM_SCHED_PRIORITY_NORMAL:
+ return VK_QUEUE_GLOBAL_PRIORITY_MEDIUM_KHR;
+ case DRM_SCHED_PRIORITY_HIGH:
+ return VK_QUEUE_GLOBAL_PRIORITY_HIGH_KHR;
+ default:
+ unreachable("Invalid drm_sched_priority");
+ return VK_QUEUE_GLOBAL_PRIORITY_LOW_KHR;
+ }
+}
+
+VkResult
+anv_xe_physical_device_get_parameters(struct anv_physical_device *device)
+{
+ struct drm_xe_query_config *config;
+
+ config = xe_device_query_alloc_fetch(device->local_fd, DRM_XE_DEVICE_QUERY_CONFIG, NULL);
+ if (!config)
+ return vk_errorf(device, VK_ERROR_INITIALIZATION_FAILED,
+ "unable to query device config");
+
+ device->has_exec_timeline = true;
+ device->has_vm_control = true;
+ device->max_context_priority =
+ drm_sched_priority_to_vk_priority(config->info[DRM_XE_QUERY_CONFIG_MAX_EXEC_QUEUE_PRIORITY]);
+
+ free(config);
+ return VK_SUCCESS;
+}
+
+VkResult
+anv_xe_physical_device_init_memory_types(struct anv_physical_device *device)
+{
+ if (anv_physical_device_has_vram(device)) {
+ device->memory.type_count = 3;
+ device->memory.types[0] = (struct anv_memory_type) {
+ .propertyFlags = VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT,
+ .heapIndex = 0,
+ };
+ device->memory.types[1] = (struct anv_memory_type) {
+ .propertyFlags = VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT |
+ VK_MEMORY_PROPERTY_HOST_COHERENT_BIT |
+ VK_MEMORY_PROPERTY_HOST_CACHED_BIT,
+ .heapIndex = 1,
+ };
+ device->memory.types[2] = (struct anv_memory_type) {
+ .propertyFlags = VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT |
+ VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT |
+ VK_MEMORY_PROPERTY_HOST_COHERENT_BIT,
+ /* This memory type either comes from heaps[0] if there is only a
+ * mappable vram region, or from heaps[2] if there are both mappable &
+ * non-mappable vram regions.
+ */
+ .heapIndex = device->vram_non_mappable.size > 0 ? 2 : 0,
+ };
+ } else if (device->info.has_llc) {
+ /* Big core GPUs share LLC with the CPU and thus one memory type can be
+ * both cached and coherent at the same time.
+ *
+ * But some game engines can't handle a single memory type well
+ * https://gitlab.freedesktop.org/mesa/mesa/-/issues/7360#note_1719438
+ *
+ * TODO: With the current UAPI we can't change the mmap mode in Xe, so
+ * only two memory types are supported here.
+ */
+ device->memory.type_count = 2;
+ device->memory.types[0] = (struct anv_memory_type) {
+ .propertyFlags = VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT,
+ .heapIndex = 0,
+ };
+ device->memory.types[1] = (struct anv_memory_type) {
+ .propertyFlags = VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT |
+ VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT |
+ VK_MEMORY_PROPERTY_HOST_COHERENT_BIT |
+ VK_MEMORY_PROPERTY_HOST_CACHED_BIT,
+ .heapIndex = 0,
+ };
+ } else {
+ device->memory.types[device->memory.type_count++] = (struct anv_memory_type) {
+ .propertyFlags = VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT |
+ VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT |
+ VK_MEMORY_PROPERTY_HOST_COHERENT_BIT,
+ .heapIndex = 0,
+ };
+ device->memory.types[device->memory.type_count++] = (struct anv_memory_type) {
+ .propertyFlags = VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT |
+ VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT |
+ VK_MEMORY_PROPERTY_HOST_COHERENT_BIT |
+ VK_MEMORY_PROPERTY_HOST_CACHED_BIT,
+ .heapIndex = 0,
+ };
+ }
+ return VK_SUCCESS;
+}
+
+static VkResult
+anv_xe_get_device_status(struct anv_device *device, uint32_t exec_queue_id)
+{
+ VkResult result = VK_SUCCESS;
+ struct drm_xe_exec_queue_get_property exec_queue_get_property = {
+ .exec_queue_id = exec_queue_id,
+ .property = DRM_XE_EXEC_QUEUE_GET_PROPERTY_BAN,
+ };
+ int ret = intel_ioctl(device->fd, DRM_IOCTL_XE_EXEC_QUEUE_GET_PROPERTY,
+ &exec_queue_get_property);
+
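+ /* A non-zero BAN property means the KMD has banned this exec queue
+ * (typically after a GPU hang), so report the device as lost.
+ */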
+ if (ret || exec_queue_get_property.value)
+ result = vk_device_set_lost(&device->vk, "One or more queues banned");
+
+ return result;
+}
+
+VkResult
+anv_xe_device_check_status(struct vk_device *vk_device)
+{
+ struct anv_device *device = container_of(vk_device, struct anv_device, vk);
+ VkResult result = VK_SUCCESS;
+
+ for (uint32_t i = 0; i < device->queue_count; i++) {
+ result = anv_xe_get_device_status(device, device->queues[i].exec_queue_id);
+ if (result != VK_SUCCESS)
+ return result;
+
+ if (device->queues[i].companion_rcs_id != 0) {
+ uint32_t exec_queue_id = device->queues[i].companion_rcs_id;
+ result = anv_xe_get_device_status(device, exec_queue_id);
+ if (result != VK_SUCCESS)
+ return result;
+ }
+ }
+
+ return result;
+}
diff --git a/src/intel/vulkan/xe/anv_device.h b/src/intel/vulkan/xe/anv_device.h
new file mode 100644
index 00000000000..5ed069d727d
--- /dev/null
+++ b/src/intel/vulkan/xe/anv_device.h
@@ -0,0 +1,40 @@
+/*
+ * Copyright © 2023 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#pragma once
+
+#include <stdbool.h>
+
+#include "vulkan/vulkan_core.h"
+#include "vk_device.h"
+
+struct anv_device;
+struct anv_physical_device;
+
+bool anv_xe_device_destroy_vm(struct anv_device *device);
+VkResult anv_xe_device_setup_vm(struct anv_device *device);
+VkResult anv_xe_device_check_status(struct vk_device *vk_device);
+
+VkResult
+anv_xe_physical_device_get_parameters(struct anv_physical_device *device);
+VkResult
+anv_xe_physical_device_init_memory_types(struct anv_physical_device *device);
diff --git a/src/intel/vulkan/xe/anv_kmd_backend.c b/src/intel/vulkan/xe/anv_kmd_backend.c
new file mode 100644
index 00000000000..19cb1caecf4
--- /dev/null
+++ b/src/intel/vulkan/xe/anv_kmd_backend.c
@@ -0,0 +1,355 @@
+/*
+ * Copyright © 2023 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <sys/mman.h>
+
+#include "common/xe/intel_engine.h"
+
+#include "anv_private.h"
+
+#include "xe/anv_batch_chain.h"
+
+#include "drm-uapi/gpu_scheduler.h"
+#include "drm-uapi/xe_drm.h"
+
+static uint32_t
+xe_gem_create(struct anv_device *device,
+ const struct intel_memory_class_instance **regions,
+ uint16_t regions_count, uint64_t size,
+ enum anv_bo_alloc_flags alloc_flags,
+ uint64_t *actual_size)
+{
+ /* TODO: protected content */
+ assert((alloc_flags & ANV_BO_ALLOC_PROTECTED) == 0);
+ /* WB+0 way coherent not supported by Xe KMD */
+ assert(alloc_flags & ANV_BO_ALLOC_HOST_COHERENT);
+
+ uint32_t flags = 0;
+ if (alloc_flags & ANV_BO_ALLOC_SCANOUT)
+ flags |= DRM_XE_GEM_CREATE_FLAG_SCANOUT;
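+ /* BOs the CPU needs to access must land in the CPU-visible portion of
+ * VRAM when the device also has a non-mappable VRAM region.
+ */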
+ if ((alloc_flags & (ANV_BO_ALLOC_MAPPED | ANV_BO_ALLOC_LOCAL_MEM_CPU_VISIBLE)) &&
+ !(alloc_flags & ANV_BO_ALLOC_NO_LOCAL_MEM) &&
+ device->physical->vram_non_mappable.size > 0)
+ flags |= DRM_XE_GEM_CREATE_FLAG_NEEDS_VISIBLE_VRAM;
+
+ struct drm_xe_gem_create gem_create = {
+ /* From xe_drm.h: If a VM is specified, this BO must:
+ * 1. Only ever be bound to that VM.
+ * 2. Cannot be exported as a PRIME fd.
+ */
+ .vm_id = alloc_flags & ANV_BO_ALLOC_EXTERNAL ? 0 : device->vm_id,
+ .size = align64(size, device->info->mem_alignment),
+ .flags = flags,
+ };
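+ /* placement is a bitmask of the memory regions the BO may live in; the
+ * KMD picks among them.
+ */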
+ for (uint16_t i = 0; i < regions_count; i++)
+ gem_create.placement |= BITFIELD_BIT(regions[i]->instance);
+
+ const struct intel_device_info_pat_entry *pat_entry =
+ anv_device_get_pat_entry(device, alloc_flags);
+ switch (pat_entry->mmap) {
+ case INTEL_DEVICE_INFO_MMAP_MODE_WC:
+ gem_create.cpu_caching = DRM_XE_GEM_CPU_CACHING_WC;
+ break;
+ case INTEL_DEVICE_INFO_MMAP_MODE_WB:
+ gem_create.cpu_caching = DRM_XE_GEM_CPU_CACHING_WB;
+ break;
+ default:
+ unreachable("missing");
+ gem_create.cpu_caching = DRM_XE_GEM_CPU_CACHING_WC;
+ }
+
+ if (intel_ioctl(device->fd, DRM_IOCTL_XE_GEM_CREATE, &gem_create))
+ return 0;
+
+ *actual_size = gem_create.size;
+ return gem_create.handle;
+}
+
+static void
+xe_gem_close(struct anv_device *device, struct anv_bo *bo)
+{
+ if (bo->from_host_ptr)
+ return;
+
+ struct drm_gem_close close = {
+ .handle = bo->gem_handle,
+ };
+ intel_ioctl(device->fd, DRM_IOCTL_GEM_CLOSE, &close);
+}
+
+static void *
+xe_gem_mmap(struct anv_device *device, struct anv_bo *bo, uint64_t offset,
+ uint64_t size, void *placed_addr)
+{
+ struct drm_xe_gem_mmap_offset args = {
+ .handle = bo->gem_handle,
+ };
+ if (intel_ioctl(device->fd, DRM_IOCTL_XE_GEM_MMAP_OFFSET, &args))
+ return MAP_FAILED;
+
+ return mmap(placed_addr, size, PROT_READ | PROT_WRITE,
+ (placed_addr != NULL ? MAP_FIXED : 0) | MAP_SHARED,
+ device->fd, args.offset);
+}
+
+static inline uint32_t
+capture_vm_in_error_dump(struct anv_device *device, struct anv_bo *bo)
+{
+ enum anv_bo_alloc_flags alloc_flags = bo ? bo->alloc_flags : 0;
+ bool capture = INTEL_DEBUG(DEBUG_CAPTURE_ALL) ||
+ (alloc_flags & ANV_BO_ALLOC_CAPTURE);
+
+ return capture ? DRM_XE_VM_BIND_FLAG_DUMPABLE : 0;
+}
+
+static struct drm_xe_vm_bind_op
+anv_vm_bind_to_drm_xe_vm_bind(struct anv_device *device,
+ struct anv_vm_bind *anv_bind)
+{
+ struct anv_bo *bo = anv_bind->bo;
+ uint16_t pat_index = bo ?
+ anv_device_get_pat_entry(device, bo->alloc_flags)->index : 0;
+
+ struct drm_xe_vm_bind_op xe_bind = {
+ .obj = 0,
+ .obj_offset = anv_bind->bo_offset,
+ .range = anv_bind->size,
+ .addr = intel_48b_address(anv_bind->address),
+ .op = DRM_XE_VM_BIND_OP_UNMAP,
+ .flags = capture_vm_in_error_dump(device, bo),
+ .prefetch_mem_region_instance = 0,
+ .pat_index = pat_index,
+ };
+
+ if (anv_bind->op == ANV_VM_BIND) {
+ if (!bo) {
+ xe_bind.op = DRM_XE_VM_BIND_OP_MAP;
+ xe_bind.flags |= DRM_XE_VM_BIND_FLAG_NULL;
+ assert(xe_bind.obj_offset == 0);
+ } else if (bo->from_host_ptr) {
+ xe_bind.op = DRM_XE_VM_BIND_OP_MAP_USERPTR;
+ } else {
+ xe_bind.op = DRM_XE_VM_BIND_OP_MAP;
+ xe_bind.obj = bo->gem_handle;
+ }
+ } else if (anv_bind->op == ANV_VM_UNBIND_ALL) {
+ xe_bind.op = DRM_XE_VM_BIND_OP_UNMAP_ALL;
+ xe_bind.obj = bo->gem_handle;
+ assert(anv_bind->address == 0);
+ assert(anv_bind->size == 0);
+ } else {
+ assert(anv_bind->op == ANV_VM_UNBIND);
+ }
+
+ /* userptr and bo_offset are a union! */
+ if (bo && bo->from_host_ptr)
+ xe_bind.userptr = (uintptr_t)bo->map;
+
+ return xe_bind;
+}
+
+static inline VkResult
+xe_vm_bind_op(struct anv_device *device,
+ struct anv_sparse_submission *submit,
+ enum anv_vm_bind_flags flags)
+{
+ VkResult result = VK_SUCCESS;
+ const bool signal_bind_timeline =
+ flags & ANV_VM_BIND_FLAG_SIGNAL_BIND_TIMELINE;
+
+ int num_syncs = submit->wait_count + submit->signal_count +
+ signal_bind_timeline;
+ STACK_ARRAY(struct drm_xe_sync, xe_syncs, num_syncs);
+ if (!xe_syncs)
+ return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
+
+ int sync_idx = 0;
+ for (int s = 0; s < submit->wait_count; s++) {
+ xe_syncs[sync_idx++] =
+ vk_sync_to_drm_xe_sync(submit->waits[s].sync,
+ submit->waits[s].wait_value,
+ false);
+ }
+ for (int s = 0; s < submit->signal_count; s++) {
+ xe_syncs[sync_idx++] =
+ vk_sync_to_drm_xe_sync(submit->signals[s].sync,
+ submit->signals[s].signal_value,
+ true);
+ }
+ if (signal_bind_timeline) {
+ xe_syncs[sync_idx++] = (struct drm_xe_sync) {
+ .type = DRM_XE_SYNC_TYPE_TIMELINE_SYNCOBJ,
+ .flags = DRM_XE_SYNC_FLAG_SIGNAL,
+ .handle = intel_bind_timeline_get_syncobj(&device->bind_timeline),
+ /* .timeline_value will be set later. */
+ };
+ }
+ assert(sync_idx == num_syncs);
+
+ struct drm_xe_vm_bind args = {
+ .vm_id = device->vm_id,
+ .num_binds = submit->binds_len,
+ .bind = {},
+ .num_syncs = num_syncs,
+ .syncs = (uintptr_t)xe_syncs,
+ };
+
+ STACK_ARRAY(struct drm_xe_vm_bind_op, xe_binds_stackarray,
+ submit->binds_len);
+ struct drm_xe_vm_bind_op *xe_binds;
+ if (submit->binds_len > 1) {
+ if (!xe_binds_stackarray) {
+ result = vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
+ goto out_syncs;
+ }
+
+ xe_binds = xe_binds_stackarray;
+ args.vector_of_binds = (uintptr_t)xe_binds;
+ } else {
+ xe_binds = &args.bind;
+ }
+
+ for (int i = 0; i < submit->binds_len; i++)
+ xe_binds[i] = anv_vm_bind_to_drm_xe_vm_bind(device, &submit->binds[i]);
+
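+ /* Grab the next bind timeline point right before the ioctl and release it
+ * right after, so timeline values stay ordered with the vm_bind calls.
+ */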
+ if (signal_bind_timeline) {
+ xe_syncs[num_syncs - 1].timeline_value =
+ intel_bind_timeline_bind_begin(&device->bind_timeline);
+ }
+ int ret = intel_ioctl(device->fd, DRM_IOCTL_XE_VM_BIND, &args);
+ int errno_ = errno;
+ if (signal_bind_timeline)
+ intel_bind_timeline_bind_end(&device->bind_timeline);
+
+ if (ret) {
+ assert(errno_ != EINVAL);
+ if (errno_ == ENOMEM)
+ result = vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
+ else
+ result = vk_device_set_lost(&device->vk,
+ "vm_bind failed with errno %d", errno_);
+ goto out_stackarray;
+ }
+
+ ANV_RMV(vm_binds, device, submit->binds, submit->binds_len);
+
+out_stackarray:
+ STACK_ARRAY_FINISH(xe_binds_stackarray);
+out_syncs:
+ STACK_ARRAY_FINISH(xe_syncs);
+
+ return result;
+}
+
+static VkResult
+xe_vm_bind(struct anv_device *device, struct anv_sparse_submission *submit,
+ enum anv_vm_bind_flags flags)
+{
+ return xe_vm_bind_op(device, submit, flags);
+}
+
+static VkResult
+xe_vm_bind_bo(struct anv_device *device, struct anv_bo *bo)
+{
+ struct anv_vm_bind bind = {
+ .bo = bo,
+ .address = bo->offset,
+ .bo_offset = 0,
+ .size = bo->actual_size,
+ .op = ANV_VM_BIND,
+ };
+ struct anv_sparse_submission submit = {
+ .queue = NULL,
+ .binds = &bind,
+ .binds_len = 1,
+ .binds_capacity = 1,
+ .wait_count = 0,
+ .signal_count = 0,
+ };
+ return xe_vm_bind_op(device, &submit,
+ ANV_VM_BIND_FLAG_SIGNAL_BIND_TIMELINE);
+}
+
+static VkResult
+xe_vm_unbind_bo(struct anv_device *device, struct anv_bo *bo)
+{
+ struct anv_vm_bind bind = {
+ .bo = bo,
+ .address = 0,
+ .bo_offset = 0,
+ .size = 0,
+ .op = ANV_VM_UNBIND_ALL,
+ };
+ struct anv_sparse_submission submit = {
+ .queue = NULL,
+ .binds = &bind,
+ .binds_len = 1,
+ .binds_capacity = 1,
+ .wait_count = 0,
+ .signal_count = 0,
+ };
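+ /* Host-pointer BOs have no real GEM handle to unmap-all against, so
+ * unbind the explicit range instead.
+ */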
+ if (bo->from_host_ptr) {
+ bind.address = bo->offset;
+ bind.size = bo->actual_size;
+ bind.op = ANV_VM_UNBIND;
+ }
+ return xe_vm_bind_op(device, &submit,
+ ANV_VM_BIND_FLAG_SIGNAL_BIND_TIMELINE);
+}
+
+static uint32_t
+xe_gem_create_userptr(struct anv_device *device, void *mem, uint64_t size)
+{
+ /* We return the workaround BO gem_handle here, because Xe doesn't
+ * create handles for userptrs. But we still need to make it look
+ * to the rest of Anv that the operation succeeded.
+ */
+ return device->workaround_bo->gem_handle;
+}
+
+static uint32_t
+xe_bo_alloc_flags_to_bo_flags(struct anv_device *device,
+ enum anv_bo_alloc_flags alloc_flags)
+{
+ return 0;
+}
+
+const struct anv_kmd_backend *
+anv_xe_kmd_backend_get(void)
+{
+ static const struct anv_kmd_backend xe_backend = {
+ .gem_create = xe_gem_create,
+ .gem_create_userptr = xe_gem_create_userptr,
+ .gem_close = xe_gem_close,
+ .gem_mmap = xe_gem_mmap,
+ .vm_bind = xe_vm_bind,
+ .vm_bind_bo = xe_vm_bind_bo,
+ .vm_unbind_bo = xe_vm_unbind_bo,
+ .execute_simple_batch = xe_execute_simple_batch,
+ .execute_trtt_batch = xe_execute_trtt_batch,
+ .queue_exec_locked = xe_queue_exec_locked,
+ .queue_exec_trace = xe_queue_exec_utrace_locked,
+ .bo_alloc_flags_to_bo_flags = xe_bo_alloc_flags_to_bo_flags,
+ };
+ return &xe_backend;
+}
diff --git a/src/intel/vulkan/xe/anv_queue.c b/src/intel/vulkan/xe/anv_queue.c
new file mode 100644
index 00000000000..ac043a40758
--- /dev/null
+++ b/src/intel/vulkan/xe/anv_queue.c
@@ -0,0 +1,164 @@
+/*
+ * Copyright © 2023 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#include "xe/anv_queue.h"
+
+#include "anv_private.h"
+
+#include "common/xe/intel_engine.h"
+#include "common/intel_gem.h"
+
+#include "xe/anv_device.h"
+
+#include "drm-uapi/xe_drm.h"
+#include "drm-uapi/gpu_scheduler.h"
+
+static enum drm_sched_priority
+anv_vk_priority_to_drm_sched_priority(VkQueueGlobalPriorityKHR vk_priority)
+{
+ switch (vk_priority) {
+ case VK_QUEUE_GLOBAL_PRIORITY_LOW_KHR:
+ return DRM_SCHED_PRIORITY_MIN;
+ case VK_QUEUE_GLOBAL_PRIORITY_MEDIUM_KHR:
+ return DRM_SCHED_PRIORITY_NORMAL;
+ case VK_QUEUE_GLOBAL_PRIORITY_HIGH_KHR:
+ return DRM_SCHED_PRIORITY_HIGH;
+ default:
+ unreachable("Invalid priority");
+ return DRM_SCHED_PRIORITY_MIN;
+ }
+}
+
+static VkResult
+create_engine(struct anv_device *device,
+ struct anv_queue *queue,
+ const VkDeviceQueueCreateInfo *pCreateInfo,
+ bool create_companion_rcs_engine)
+{
+ struct anv_physical_device *physical = device->physical;
+ uint32_t queue_family_index =
+ create_companion_rcs_engine ?
+ anv_get_first_render_queue_index(physical) :
+ pCreateInfo->queueFamilyIndex;
+ struct anv_queue_family *queue_family =
+ &physical->queue.families[queue_family_index];
+ const struct intel_query_engine_info *engines = physical->engine_info;
+ struct drm_xe_engine_class_instance *instances;
+ const VkDeviceQueueGlobalPriorityCreateInfoKHR *queue_priority =
+ vk_find_struct_const(pCreateInfo->pNext,
+ DEVICE_QUEUE_GLOBAL_PRIORITY_CREATE_INFO_KHR);
+ const VkQueueGlobalPriorityKHR priority = queue_priority ?
+ queue_priority->globalPriority :
+ VK_QUEUE_GLOBAL_PRIORITY_MEDIUM_KHR;
+
+ /* As per spec, the driver implementation may deny requests to acquire
+ * a priority above the default priority (MEDIUM) if the caller does not
+ * have sufficient privileges. In this scenario VK_ERROR_NOT_PERMITTED_KHR
+ * is returned.
+ */
+ if (physical->max_context_priority >= VK_QUEUE_GLOBAL_PRIORITY_MEDIUM_KHR) {
+ if (priority > physical->max_context_priority)
+ return vk_error(device, VK_ERROR_NOT_PERMITTED_KHR);
+ }
+
+ instances = vk_alloc(&device->vk.alloc,
+ sizeof(*instances) * queue_family->queueCount, 8,
+ VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
+ if (!instances)
+ return VK_ERROR_OUT_OF_HOST_MEMORY;
+
+ /* Build a list of all compatible HW engines */
+ uint32_t count = 0;
+ for (uint32_t i = 0; i < engines->num_engines; i++) {
+ const struct intel_engine_class_instance engine = engines->engines[i];
+ if (engine.engine_class != queue_family->engine_class)
+ continue;
+
+ instances[count].engine_class = intel_engine_class_to_xe(engine.engine_class);
+ instances[count].engine_instance = engine.engine_instance;
+ instances[count++].gt_id = engine.gt_id;
+ }
+
+ assert(device->vm_id != 0);
+ struct drm_xe_ext_set_property ext = {
+ .base.name = DRM_XE_EXEC_QUEUE_EXTENSION_SET_PROPERTY,
+ .property = DRM_XE_EXEC_QUEUE_SET_PROPERTY_PRIORITY,
+ .value = anv_vk_priority_to_drm_sched_priority(priority),
+ };
+ struct drm_xe_exec_queue_create create = {
+ /* Allows KMD to pick one of those engines for the submission queue */
+ .instances = (uintptr_t)instances,
+ .vm_id = device->vm_id,
+ .width = 1,
+ .num_placements = count,
+ .extensions = (uintptr_t)&ext,
+ };
+ int ret = intel_ioctl(device->fd, DRM_IOCTL_XE_EXEC_QUEUE_CREATE, &create);
+ vk_free(&device->vk.alloc, instances);
+ if (ret)
+ return vk_errorf(device, VK_ERROR_UNKNOWN, "Unable to create exec queue");
+
+ if (create_companion_rcs_engine)
+ queue->companion_rcs_id = create.exec_queue_id;
+ else
+ queue->exec_queue_id = create.exec_queue_id;
+
+ return VK_SUCCESS;
+}
+
+VkResult
+anv_xe_create_engine(struct anv_device *device,
+ struct anv_queue *queue,
+ const VkDeviceQueueCreateInfo *pCreateInfo)
+{
+ VkResult result = create_engine(device, queue, pCreateInfo,
+ false /* create_companion_rcs_engine */);
+
+ if (result != VK_SUCCESS)
+ return result;
+
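+ /* Copy and compute queues also get a companion render (RCS) exec queue
+ * for work that still requires the render engine.
+ */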
+ if (queue->family->engine_class == INTEL_ENGINE_CLASS_COPY ||
+ queue->family->engine_class == INTEL_ENGINE_CLASS_COMPUTE) {
+ result = create_engine(device, queue, pCreateInfo,
+ true /* create_companion_rcs_engine */);
+ }
+
+ return result;
+}
+
+static void
+destroy_engine(struct anv_device *device, uint32_t exec_queue_id)
+{
+ struct drm_xe_exec_queue_destroy destroy = {
+ .exec_queue_id = exec_queue_id,
+ };
+ intel_ioctl(device->fd, DRM_IOCTL_XE_EXEC_QUEUE_DESTROY, &destroy);
+}
+
+void
+anv_xe_destroy_engine(struct anv_device *device, struct anv_queue *queue)
+{
+ destroy_engine(device, queue->exec_queue_id);
+
+ if (queue->companion_rcs_id != 0)
+ destroy_engine(device, queue->companion_rcs_id);
+}
diff --git a/src/intel/vulkan/xe/anv_queue.h b/src/intel/vulkan/xe/anv_queue.h
new file mode 100644
index 00000000000..646f0ef2f16
--- /dev/null
+++ b/src/intel/vulkan/xe/anv_queue.h
@@ -0,0 +1,35 @@
+/*
+ * Copyright © 2023 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#pragma once
+
+#include "vulkan/vulkan_core.h"
+
+struct anv_device;
+struct anv_queue;
+
+VkResult
+anv_xe_create_engine(struct anv_device *device,
+ struct anv_queue *queue,
+ const VkDeviceQueueCreateInfo *pCreateInfo);
+void
+anv_xe_destroy_engine(struct anv_device *device, struct anv_queue *queue);