Diffstat (limited to 'src/intel/vulkan/genX_query.c')
-rw-r--r-- | src/intel/vulkan/genX_query.c | 959
1 file changed, 714 insertions(+), 245 deletions(-)
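The per-slot sizing this patch introduces for the new query types reduces to a small host-side computation. The sketch below is illustrative only: DIV_ROUND_UP and ALIGN stand in for Mesa's utility macros, and the constants match the hunks that follow (availability word plus begin/end values for VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT, and the aligned, per-pass-replicated layout for VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR).

    /* Illustrative sketch of the slot sizing in genX(CreateQueryPool); not
     * the driver's actual code. */
    #include <stdint.h>

    #define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))
    #define ALIGN(v, a)        (((v) + (a) - 1) / (a) * (a))

    /* VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT: availability + begin + end. */
    static uint32_t primitives_generated_stride(void)
    {
       uint32_t uint64s_per_slot = 1 + 2;
       return uint64s_per_slot * sizeof(uint64_t);
    }

    /* VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: one availability word, padded to
     * the perf layout alignment, then begin/end snapshots, replicated for
     * each pass. 'alignment' and 'size' stand in for the
     * intel_perf_query_field_layout fields. */
    static uint32_t khr_perf_stride(uint32_t alignment, uint32_t size,
                                    uint32_t n_passes)
    {
       uint32_t uint64s_per_slot = 1; /* availability */
       uint64s_per_slot = ALIGN(uint64s_per_slot,
                                DIV_ROUND_UP(alignment, sizeof(uint64_t)));
       uint64s_per_slot += 2 * DIV_ROUND_UP(size, sizeof(uint64_t));
       return uint64s_per_slot * sizeof(uint64_t) * n_passes;
    }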
diff --git a/src/intel/vulkan/genX_query.c b/src/intel/vulkan/genX_query.c index 8978f5843a9..2cb492afcf9 100644 --- a/src/intel/vulkan/genX_query.c +++ b/src/intel/vulkan/genX_query.c @@ -29,15 +29,21 @@ #include "anv_private.h" +#include "util/os_time.h" + #include "genxml/gen_macros.h" #include "genxml/genX_pack.h" +#include "ds/intel_tracepoints.h" + +#include "anv_internal_kernels.h" + /* We reserve : * - GPR 14 for perf queries * - GPR 15 for conditional rendering */ #define MI_BUILDER_NUM_ALLOC_GPRS 14 -#define MI_BUILDER_CAN_WRITE_BATCH GFX_VER >= 8 +#define MI_BUILDER_CAN_WRITE_BATCH true #define __gen_get_batch_dwords anv_batch_emit_dwords #define __gen_address_offset anv_address_add #define __gen_get_batch_address(b, a) anv_batch_address(b, a) @@ -57,6 +63,18 @@ anv_query_address(struct anv_query_pool *pool, uint32_t query) }; } +static void +emit_query_mi_flush_availability(struct anv_cmd_buffer *cmd_buffer, + struct anv_address addr, + bool available) +{ + anv_batch_emit(&cmd_buffer->batch, GENX(MI_FLUSH_DW), flush) { + flush.PostSyncOperation = WriteImmediateData; + flush.Address = addr; + flush.ImmediateData = available; + } +} + VkResult genX(CreateQueryPool)( VkDevice _device, const VkQueryPoolCreateInfo* pCreateInfo, @@ -65,12 +83,10 @@ VkResult genX(CreateQueryPool)( { ANV_FROM_HANDLE(anv_device, device, _device); const struct anv_physical_device *pdevice = device->physical; -#if GFX_VER >= 8 const VkQueryPoolPerformanceCreateInfoKHR *perf_query_info = NULL; struct intel_perf_counter_pass *counter_pass; struct intel_perf_query_info **pass_query; uint32_t n_passes = 0; -#endif uint32_t data_offset = 0; VK_MULTIALLOC(ma); VkResult result; @@ -123,14 +139,13 @@ VkResult genX(CreateQueryPool)( uint64s_per_slot = 2; /* availability + marker */ /* Align to the requirement of the layout */ - uint64s_per_slot = align_u32(uint64s_per_slot, - DIV_ROUND_UP(layout->alignment, sizeof(uint64_t))); + uint64s_per_slot = align(uint64s_per_slot, + DIV_ROUND_UP(layout->alignment, sizeof(uint64_t))); data_offset = uint64s_per_slot * sizeof(uint64_t); /* Add the query data for begin & end commands */ uint64s_per_slot += 2 * DIV_ROUND_UP(layout->size, sizeof(uint64_t)); break; } -#if GFX_VER >= 8 case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: { const struct intel_perf_query_field_layout *layout = &pdevice->perf->query_layout; @@ -145,10 +160,10 @@ VkResult genX(CreateQueryPool)( perf_query_info->counterIndexCount); vk_multialloc_add(&ma, &pass_query, struct intel_perf_query_info *, n_passes); - uint64s_per_slot = 4 /* availability + small batch */; + uint64s_per_slot = 1 /* availability */; /* Align to the requirement of the layout */ - uint64s_per_slot = align_u32(uint64s_per_slot, - DIV_ROUND_UP(layout->alignment, sizeof(uint64_t))); + uint64s_per_slot = align(uint64s_per_slot, + DIV_ROUND_UP(layout->alignment, sizeof(uint64_t))); data_offset = uint64s_per_slot * sizeof(uint64_t); /* Add the query data for begin & end commands */ uint64s_per_slot += 2 * DIV_ROUND_UP(layout->size, sizeof(uint64_t)); @@ -156,26 +171,41 @@ VkResult genX(CreateQueryPool)( uint64s_per_slot *= n_passes; break; } + case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT: + /* Query has two values: begin and end. 
*/ + uint64s_per_slot = 1 + 2; + break; +#if GFX_VERx10 >= 125 + case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_COMPACTED_SIZE_KHR: + case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SIZE_KHR: + uint64s_per_slot = 1 + 1 /* availability + size (PostbuildInfoCurrentSize, PostbuildInfoCompactedSize) */; + break; + + case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_SIZE_KHR: + case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_BOTTOM_LEVEL_POINTERS_KHR: + uint64s_per_slot = 1 + 2 /* availability + size (PostbuildInfoSerializationDesc) */; + break; + #endif + case VK_QUERY_TYPE_RESULT_STATUS_ONLY_KHR: + uint64s_per_slot = 1; + break; default: assert(!"Invalid query type"); } - if (!vk_object_multialloc(&device->vk, &ma, pAllocator, - VK_OBJECT_TYPE_QUERY_POOL)) - return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY); + if (!vk_multialloc_zalloc2(&ma, &device->vk.alloc, pAllocator, + VK_SYSTEM_ALLOCATION_SCOPE_OBJECT)) + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); - pool->type = pCreateInfo->queryType; - pool->pipeline_statistics = pipeline_statistics; + vk_query_pool_init(&device->vk, &pool->vk, pCreateInfo); pool->stride = uint64s_per_slot * sizeof(uint64_t); - pool->slots = pCreateInfo->queryCount; - if (pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL) { + if (pool->vk.query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL) { pool->data_offset = data_offset; pool->snapshot_size = (pool->stride - data_offset) / 2; } -#if GFX_VER >= 8 - else if (pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) { + else if (pool->vk.query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) { pool->pass_size = pool->stride / n_passes; pool->data_offset = data_offset; pool->snapshot_size = (pool->pass_size - data_offset) / 2; @@ -192,19 +222,27 @@ VkResult genX(CreateQueryPool)( perf_query_info->counterIndexCount, pool->pass_query); } -#endif - uint64_t size = pool->slots * (uint64_t)pool->stride; + uint64_t size = pool->vk.query_count * (uint64_t)pool->stride; + + /* For KHR_performance_query we need some space in the buffer for a small + * batch updating ANV_PERF_QUERY_OFFSET_REG. 
+ */ + if (pool->vk.query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) { + pool->khr_perf_preamble_stride = 32; + pool->khr_perf_preambles_offset = size; + size += (uint64_t)pool->n_passes * pool->khr_perf_preamble_stride; + } + result = anv_device_alloc_bo(device, "query-pool", size, ANV_BO_ALLOC_MAPPED | - ANV_BO_ALLOC_SNOOPED, + ANV_BO_ALLOC_HOST_CACHED_COHERENT, 0 /* explicit_address */, &pool->bo); if (result != VK_SUCCESS) goto fail; -#if GFX_VER >= 8 - if (pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) { + if (pool->vk.query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) { for (uint32_t p = 0; p < pool->n_passes; p++) { struct mi_builder b; struct anv_batch batch = { @@ -213,13 +251,14 @@ VkResult genX(CreateQueryPool)( }; batch.next = batch.start; - mi_builder_init(&b, &device->info, &batch); + mi_builder_init(&b, device->info, &batch); mi_store(&b, mi_reg64(ANV_PERF_QUERY_OFFSET_REG), mi_imm(p * (uint64_t)pool->pass_size)); anv_batch_emit(&batch, GENX(MI_BATCH_BUFFER_END), bbe); } } -#endif + + ANV_RMV(query_pool_create, device, pool, false); *pQueryPool = anv_query_pool_to_handle(pool); @@ -242,47 +281,36 @@ void genX(DestroyQueryPool)( if (!pool) return; + ANV_RMV(resource_destroy, device, pool); + anv_device_release_bo(device, pool->bo); vk_object_free(&device->vk, pAllocator, pool); } -#if GFX_VER >= 8 /** * VK_KHR_performance_query layout : * * -------------------------------------------- * | availability (8b) | | | * |-------------------------------| | | - * | Small batch loading | | | - * | ANV_PERF_QUERY_OFFSET_REG | | | - * | (24b) | | Pass 0 | - * |-------------------------------| | | * | some padding (see | | | - * | query_field_layout:alignment) | | | + * | query_field_layout:alignment) | | Pass 0 | * |-------------------------------| | | * | query data | | | * | (2 * query_field_layout:size) | | | * |-------------------------------|-- | Query 0 * | availability (8b) | | | * |-------------------------------| | | - * | Small batch loading | | | - * | ANV_PERF_QUERY_OFFSET_REG | | | - * | (24b) | | Pass 1 | - * |-------------------------------| | | * | some padding (see | | | - * | query_field_layout:alignment) | | | + * | query_field_layout:alignment) | | Pass 1 | * |-------------------------------| | | * | query data | | | * | (2 * query_field_layout:size) | | | * |-------------------------------|----------- * | availability (8b) | | | * |-------------------------------| | | - * | Small batch loading | | | - * | ANV_PERF_QUERY_OFFSET_REG | | | - * | (24b) | | Pass 0 | - * |-------------------------------| | | * | some padding (see | | | - * | query_field_layout:alignment) | | | + * | query_field_layout:alignment) | | Pass 0 | * |-------------------------------| | | * | query data | | | * | (2 * query_field_layout:size) | | | @@ -333,7 +361,7 @@ khr_perf_query_ensure_relocs(struct anv_cmd_buffer *cmd_buffer) const struct anv_physical_device *pdevice = device->physical; cmd_buffer->self_mod_locations = - vk_alloc(&cmd_buffer->pool->alloc, + vk_alloc(&cmd_buffer->vk.pool->alloc, pdevice->n_perf_query_commands * sizeof(*cmd_buffer->self_mod_locations), 8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); @@ -344,7 +372,6 @@ khr_perf_query_ensure_relocs(struct anv_cmd_buffer *cmd_buffer) return true; } -#endif /** * VK_INTEL_performance_query layout : @@ -396,8 +423,7 @@ query_slot(struct anv_query_pool *pool, uint32_t query) static bool query_is_available(struct anv_query_pool *pool, uint32_t query) { -#if GFX_VER >= 8 - if (pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) { + if 
(pool->vk.query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) { for (uint32_t p = 0; p < pool->n_passes; p++) { volatile uint64_t *slot = pool->bo->map + khr_perf_query_availability_offset(pool, query, p); @@ -406,7 +432,6 @@ query_is_available(struct anv_query_pool *pool, uint32_t query) } return true; } -#endif return *(volatile uint64_t *)query_slot(pool, query); } @@ -415,17 +440,29 @@ static VkResult wait_for_available(struct anv_device *device, struct anv_query_pool *pool, uint32_t query) { - uint64_t abs_timeout = anv_get_absolute_timeout(2 * NSEC_PER_SEC); + /* By default we leave a 2s timeout before declaring the device lost. */ + uint64_t rel_timeout = 2 * NSEC_PER_SEC; + if (pool->vk.query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) { + /* With performance queries, there is an additional 500us reconfiguration + * time in i915. + */ + rel_timeout += 500 * 1000; + /* Additionally a command buffer can be replayed N times to gather data + * for each of the metric sets to capture all the counters requested. + */ + rel_timeout *= pool->n_passes; + } + uint64_t abs_timeout_ns = os_time_get_absolute_timeout(rel_timeout); - while (anv_gettime_ns() < abs_timeout) { + while (os_time_get_nano() < abs_timeout_ns) { if (query_is_available(pool, query)) return VK_SUCCESS; - VkResult status = anv_device_query_status(device); + VkResult status = vk_device_check_status(&device->vk); if (status != VK_SUCCESS) return status; } - return anv_device_set_lost(device, "query timeout"); + return vk_device_set_lost(&device->vk, "query timeout"); } VkResult genX(GetQueryPoolResults)( @@ -441,14 +478,23 @@ VkResult genX(GetQueryPoolResults)( ANV_FROM_HANDLE(anv_device, device, _device); ANV_FROM_HANDLE(anv_query_pool, pool, queryPool); - assert(pool->type == VK_QUERY_TYPE_OCCLUSION || - pool->type == VK_QUERY_TYPE_PIPELINE_STATISTICS || - pool->type == VK_QUERY_TYPE_TIMESTAMP || - pool->type == VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT || - pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR || - pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL); - - if (anv_device_is_lost(device)) + assert( +#if GFX_VERx10 >= 125 + pool->vk.query_type == VK_QUERY_TYPE_ACCELERATION_STRUCTURE_COMPACTED_SIZE_KHR || + pool->vk.query_type == VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_SIZE_KHR || + pool->vk.query_type == VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SIZE_KHR || + pool->vk.query_type == VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_BOTTOM_LEVEL_POINTERS_KHR || +#endif + pool->vk.query_type == VK_QUERY_TYPE_OCCLUSION || + pool->vk.query_type == VK_QUERY_TYPE_PIPELINE_STATISTICS || + pool->vk.query_type == VK_QUERY_TYPE_TIMESTAMP || + pool->vk.query_type == VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT || + pool->vk.query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR || + pool->vk.query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL || + pool->vk.query_type == VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT || + pool->vk.query_type == VK_QUERY_TYPE_RESULT_STATUS_ONLY_KHR); + + if (vk_device_is_lost(&device->vk)) return VK_ERROR_DEVICE_LOST; if (pData == NULL) @@ -487,8 +533,9 @@ VkResult genX(GetQueryPoolResults)( bool write_results = available || (flags & VK_QUERY_RESULT_PARTIAL_BIT); uint32_t idx = 0; - switch (pool->type) { - case VK_QUERY_TYPE_OCCLUSION: { + switch (pool->vk.query_type) { + case VK_QUERY_TYPE_OCCLUSION: + case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT: { uint64_t *slot = query_slot(pool, firstQuery + i); if (write_results) { /* From the Vulkan 1.2.132 spec: @@ -507,22 +554,16 @@ VkResult 
genX(GetQueryPoolResults)( case VK_QUERY_TYPE_PIPELINE_STATISTICS: { uint64_t *slot = query_slot(pool, firstQuery + i); - uint32_t statistics = pool->pipeline_statistics; + uint32_t statistics = pool->vk.pipeline_statistics; while (statistics) { - uint32_t stat = u_bit_scan(&statistics); + UNUSED uint32_t stat = u_bit_scan(&statistics); if (write_results) { uint64_t result = slot[idx * 2 + 2] - slot[idx * 2 + 1]; - - /* WaDividePSInvocationCountBy4:HSW,BDW */ - if ((device->info.ver == 8 || device->info.is_haswell) && - (1 << stat) == VK_QUERY_PIPELINE_STATISTIC_FRAGMENT_SHADER_INVOCATIONS_BIT) - result >>= 2; - cpu_write_query_result(pData, flags, idx, result); } idx++; } - assert(idx == util_bitcount(pool->pipeline_statistics)); + assert(idx == util_bitcount(pool->vk.pipeline_statistics)); break; } @@ -537,6 +578,26 @@ VkResult genX(GetQueryPoolResults)( break; } +#if GFX_VERx10 >= 125 + case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SIZE_KHR: + case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_COMPACTED_SIZE_KHR: + case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_SIZE_KHR: { + uint64_t *slot = query_slot(pool, firstQuery + i); + if (write_results) + cpu_write_query_result(pData, flags, idx, slot[1]); + idx++; + break; + } + + case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_BOTTOM_LEVEL_POINTERS_KHR: { + uint64_t *slot = query_slot(pool, firstQuery + i); + if (write_results) + cpu_write_query_result(pData, flags, idx, slot[2]); + idx++; + break; + } +#endif + case VK_QUERY_TYPE_TIMESTAMP: { uint64_t *slot = query_slot(pool, firstQuery + i); if (write_results) @@ -545,7 +606,6 @@ VkResult genX(GetQueryPoolResults)( break; } -#if GFX_VER >= 8 case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: { const struct anv_physical_device *pdevice = device->physical; assert((flags & (VK_QUERY_RESULT_WITH_AVAILABILITY_BIT | @@ -554,7 +614,7 @@ VkResult genX(GetQueryPoolResults)( const struct intel_perf_query_info *query = pool->pass_query[p]; struct intel_perf_query_result result; intel_perf_query_result_clear(&result); - intel_perf_query_result_accumulate_fields(&result, query, &device->info, + intel_perf_query_result_accumulate_fields(&result, query, pool->bo->map + khr_perf_query_data_offset(pool, firstQuery + i, p, false), pool->bo->map + khr_perf_query_data_offset(pool, firstQuery + i, p, true), false /* no_oa_accumulate */); @@ -562,7 +622,6 @@ VkResult genX(GetQueryPoolResults)( } break; } -#endif case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: { if (!write_results) @@ -571,18 +630,26 @@ VkResult genX(GetQueryPoolResults)( const struct intel_perf_query_info *query = &device->physical->perf->queries[0]; struct intel_perf_query_result result; intel_perf_query_result_clear(&result); - intel_perf_query_result_accumulate_fields(&result, query, &device->info, + intel_perf_query_result_accumulate_fields(&result, query, query_data + intel_perf_query_data_offset(pool, false), query_data + intel_perf_query_data_offset(pool, true), false /* no_oa_accumulate */); intel_perf_query_result_write_mdapi(pData, stride, - &device->info, + device->info, query, &result); const uint64_t *marker = query_data + intel_perf_marker_offset(); - intel_perf_query_mdapi_write_marker(pData, stride, &device->info, *marker); + intel_perf_query_mdapi_write_marker(pData, stride, device->info, *marker); break; } + case VK_QUERY_TYPE_RESULT_STATUS_ONLY_KHR: + if (!write_results) + break; + const uint32_t *query_data = query_slot(pool, firstQuery + i); + uint32_t result = available ? 
*query_data : 0; + cpu_write_query_result(pData, flags, idx, result); + break; + default: unreachable("invalid pool type"); } @@ -608,15 +675,11 @@ emit_ps_depth_count(struct anv_cmd_buffer *cmd_buffer, cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_POST_SYNC_BIT; genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); - anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) { - pc.DestinationAddressType = DAT_PPGTT; - pc.PostSyncOperation = WritePSDepthCount; - pc.DepthStallEnable = true; - pc.Address = addr; - - if (GFX_VER == 9 && cmd_buffer->device->info.gt == 4) - pc.CommandStreamerStallEnable = true; - } + bool cs_stall_needed = (GFX_VER == 9 && cmd_buffer->device->info->gt == 4); + genx_batch_emit_pipe_control_write + (&cmd_buffer->batch, cmd_buffer->device->info, + cmd_buffer->state.current_pipeline, WritePSDepthCount, addr, 0, + ANV_PIPE_DEPTH_STALL_BIT | (cs_stall_needed ? ANV_PIPE_CS_STALL_BIT : 0)); } static void @@ -635,12 +698,10 @@ emit_query_pc_availability(struct anv_cmd_buffer *cmd_buffer, cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_POST_SYNC_BIT; genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); - anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) { - pc.DestinationAddressType = DAT_PPGTT; - pc.PostSyncOperation = WriteImmediateData; - pc.Address = addr; - pc.ImmediateData = available; - } + genx_batch_emit_pipe_control_write + (&cmd_buffer->batch, cmd_buffer->device->info, + cmd_buffer->state.current_pipeline, WriteImmediateData, addr, + available, 0); } /** @@ -652,7 +713,7 @@ emit_zero_queries(struct anv_cmd_buffer *cmd_buffer, struct mi_builder *b, struct anv_query_pool *pool, uint32_t first_index, uint32_t num_queries) { - switch (pool->type) { + switch (pool->vk.query_type) { case VK_QUERY_TYPE_OCCLUSION: case VK_QUERY_TYPE_TIMESTAMP: /* These queries are written with a PIPE_CONTROL so clear them using the @@ -673,6 +734,7 @@ emit_zero_queries(struct anv_cmd_buffer *cmd_buffer, } break; + case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT: case VK_QUERY_TYPE_PIPELINE_STATISTICS: case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT: for (uint32_t i = 0; i < num_queries; i++) { @@ -683,7 +745,6 @@ emit_zero_queries(struct anv_cmd_buffer *cmd_buffer, } break; -#if GFX_VER >= 8 case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: { for (uint32_t i = 0; i < num_queries; i++) { for (uint32_t p = 0; p < pool->n_passes; p++) { @@ -696,7 +757,6 @@ emit_zero_queries(struct anv_cmd_buffer *cmd_buffer, } break; } -#endif case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: for (uint32_t i = 0; i < num_queries; i++) { @@ -720,10 +780,44 @@ void genX(CmdResetQueryPool)( { ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); ANV_FROM_HANDLE(anv_query_pool, pool, queryPool); + struct anv_physical_device *pdevice = cmd_buffer->device->physical; + + /* Shader clearing is only possible on render/compute */ + if (anv_cmd_buffer_is_render_or_compute_queue(cmd_buffer) && + queryCount >= pdevice->instance->query_clear_with_blorp_threshold) { + trace_intel_begin_query_clear_blorp(&cmd_buffer->trace); + + anv_cmd_buffer_fill_area(cmd_buffer, + anv_query_address(pool, firstQuery), + queryCount * pool->stride, + 0); + + /* The pending clearing writes are in compute if we're in gpgpu mode on + * the render engine or on the compute engine. 
+ */ + if (anv_cmd_buffer_is_compute_queue(cmd_buffer) || + cmd_buffer->state.current_pipeline == pdevice->gpgpu_pipeline_value) { + cmd_buffer->state.queries.clear_bits = + ANV_QUERY_COMPUTE_WRITES_PENDING_BITS; + } else { + cmd_buffer->state.queries.clear_bits = + ANV_QUERY_RENDER_TARGET_WRITES_PENDING_BITS(&pdevice->info); + } + + trace_intel_end_query_clear_blorp(&cmd_buffer->trace, queryCount); + return; + } - switch (pool->type) { + trace_intel_begin_query_clear_cs(&cmd_buffer->trace); + + switch (pool->vk.query_type) { case VK_QUERY_TYPE_OCCLUSION: - case VK_QUERY_TYPE_TIMESTAMP: +#if GFX_VERx10 >= 125 + case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_COMPACTED_SIZE_KHR: + case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_SIZE_KHR: + case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SIZE_KHR: + case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_BOTTOM_LEVEL_POINTERS_KHR: +#endif for (uint32_t i = 0; i < queryCount; i++) { emit_query_pc_availability(cmd_buffer, anv_query_address(pool, firstQuery + i), @@ -731,20 +825,37 @@ void genX(CmdResetQueryPool)( } break; + case VK_QUERY_TYPE_TIMESTAMP: { + for (uint32_t i = 0; i < queryCount; i++) { + emit_query_pc_availability(cmd_buffer, + anv_query_address(pool, firstQuery + i), + false); + } + + /* Add a CS stall here to make sure the PIPE_CONTROL above has + * completed. Otherwise some timestamps written later with MI_STORE_* + * commands might race with the PIPE_CONTROL in the loop above. + */ + anv_add_pending_pipe_bits(cmd_buffer, ANV_PIPE_CS_STALL_BIT, + "vkCmdResetQueryPool of timestamps"); + genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); + break; + } + case VK_QUERY_TYPE_PIPELINE_STATISTICS: - case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT: { + case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT: + case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT: { struct mi_builder b; - mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch); + mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch); for (uint32_t i = 0; i < queryCount; i++) emit_query_mi_availability(&b, anv_query_address(pool, firstQuery + i), false); break; } -#if GFX_VER >= 8 case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: { struct mi_builder b; - mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch); + mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch); for (uint32_t i = 0; i < queryCount; i++) { for (uint32_t p = 0; p < pool->n_passes; p++) { @@ -756,20 +867,24 @@ void genX(CmdResetQueryPool)( } break; } -#endif case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: { struct mi_builder b; - mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch); + mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch); for (uint32_t i = 0; i < queryCount; i++) emit_query_mi_availability(&b, anv_query_address(pool, firstQuery + i), false); break; } - + case VK_QUERY_TYPE_RESULT_STATUS_ONLY_KHR: + for (uint32_t i = 0; i < queryCount; i++) + emit_query_mi_flush_availability(cmd_buffer, anv_query_address(pool, firstQuery + i), false); + break; default: unreachable("Unsupported query type"); } + + trace_intel_end_query_clear_cs(&cmd_buffer->trace, queryCount); } void genX(ResetQueryPool)( @@ -781,14 +896,12 @@ void genX(ResetQueryPool)( ANV_FROM_HANDLE(anv_query_pool, pool, queryPool); for (uint32_t i = 0; i < queryCount; i++) { - if (pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) { -#if GFX_VER >= 8 + if (pool->vk.query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) { for (uint32_t p = 0; p < pool->n_passes; p++) { uint64_t *pass_slot = 
pool->bo->map + khr_perf_query_availability_offset(pool, firstQuery + i, p); *pass_slot = 0; } -#endif } else { uint64_t *slot = query_slot(pool, firstQuery + i); *slot = 0; @@ -858,6 +971,7 @@ emit_perf_intel_query(struct anv_cmd_buffer *cmd_buffer, case INTEL_PERF_QUERY_FIELD_TYPE_SRM_PERFCNT: case INTEL_PERF_QUERY_FIELD_TYPE_SRM_RPSTAT: + case INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_A: case INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_B: case INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_C: { struct anv_address addr = anv_address_add(data_addr, field->location); @@ -877,15 +991,22 @@ emit_perf_intel_query(struct anv_cmd_buffer *cmd_buffer, } } -void genX(CmdBeginQuery)( - VkCommandBuffer commandBuffer, - VkQueryPool queryPool, - uint32_t query, - VkQueryControlFlags flags) +static void +emit_query_clear_flush(struct anv_cmd_buffer *cmd_buffer, + struct anv_query_pool *pool, + const char *reason) { - genX(CmdBeginQueryIndexedEXT)(commandBuffer, queryPool, query, flags, 0); + if (cmd_buffer->state.queries.clear_bits == 0) + return; + + anv_add_pending_pipe_bits(cmd_buffer, + ANV_PIPE_QUERY_BITS( + cmd_buffer->state.queries.clear_bits), + reason); + genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); } + void genX(CmdBeginQueryIndexedEXT)( VkCommandBuffer commandBuffer, VkQueryPool queryPool, @@ -897,22 +1018,39 @@ void genX(CmdBeginQueryIndexedEXT)( ANV_FROM_HANDLE(anv_query_pool, pool, queryPool); struct anv_address query_addr = anv_query_address(pool, query); + emit_query_clear_flush(cmd_buffer, pool, "CmdBeginQuery* flush query clears"); + struct mi_builder b; - mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch); + mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch); + const uint32_t mocs = anv_mocs_for_address(cmd_buffer->device, &query_addr); + mi_builder_set_mocs(&b, mocs); - switch (pool->type) { + switch (pool->vk.query_type) { case VK_QUERY_TYPE_OCCLUSION: + cmd_buffer->state.gfx.n_occlusion_queries++; + cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_OCCLUSION_QUERY_ACTIVE; emit_ps_depth_count(cmd_buffer, anv_address_add(query_addr, 8)); break; + case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT: + genx_batch_emit_pipe_control(&cmd_buffer->batch, + cmd_buffer->device->info, + cmd_buffer->state.current_pipeline, + ANV_PIPE_CS_STALL_BIT | + ANV_PIPE_STALL_AT_SCOREBOARD_BIT); + mi_store(&b, mi_mem64(anv_address_add(query_addr, 8)), + mi_reg64(GENX(CL_INVOCATION_COUNT_num))); + break; + case VK_QUERY_TYPE_PIPELINE_STATISTICS: { /* TODO: This might only be necessary for certain stats */ - anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) { - pc.CommandStreamerStallEnable = true; - pc.StallAtPixelScoreboard = true; - } + genx_batch_emit_pipe_control(&cmd_buffer->batch, + cmd_buffer->device->info, + cmd_buffer->state.current_pipeline, + ANV_PIPE_CS_STALL_BIT | + ANV_PIPE_STALL_AT_SCOREBOARD_BIT); - uint32_t statistics = pool->pipeline_statistics; + uint32_t statistics = pool->vk.pipeline_statistics; uint32_t offset = 8; while (statistics) { uint32_t stat = u_bit_scan(&statistics); @@ -923,14 +1061,14 @@ void genX(CmdBeginQueryIndexedEXT)( } case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT: - anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) { - pc.CommandStreamerStallEnable = true; - pc.StallAtPixelScoreboard = true; - } + genx_batch_emit_pipe_control(&cmd_buffer->batch, + cmd_buffer->device->info, + cmd_buffer->state.current_pipeline, + ANV_PIPE_CS_STALL_BIT | + ANV_PIPE_STALL_AT_SCOREBOARD_BIT); emit_xfb_query(&b, index, anv_address_add(query_addr, 8)); break; -#if GFX_VER 
>= 8 case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: { if (!khr_perf_query_ensure_relocs(cmd_buffer)) return; @@ -979,12 +1117,15 @@ void genX(CmdBeginQueryIndexedEXT)( assert(reloc_idx == pdevice->n_perf_query_commands); - mi_self_mod_barrier(&b); + const struct intel_device_info *devinfo = cmd_buffer->device->info; + const enum intel_engine_class engine_class = cmd_buffer->queue_family->engine_class; + mi_self_mod_barrier(&b, devinfo->engine_class_prefetch[engine_class]); - anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) { - pc.CommandStreamerStallEnable = true; - pc.StallAtPixelScoreboard = true; - } + genx_batch_emit_pipe_control(&cmd_buffer->batch, + cmd_buffer->device->info, + cmd_buffer->state.current_pipeline, + ANV_PIPE_CS_STALL_BIT | + ANV_PIPE_STALL_AT_SCOREBOARD_BIT); cmd_buffer->perf_query_pool = pool; cmd_buffer->perf_reloc_idx = 0; @@ -1007,6 +1148,7 @@ void genX(CmdBeginQueryIndexedEXT)( case INTEL_PERF_QUERY_FIELD_TYPE_SRM_PERFCNT: case INTEL_PERF_QUERY_FIELD_TYPE_SRM_RPSTAT: + case INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_A: case INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_B: case INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_C: dws = @@ -1040,30 +1182,24 @@ void genX(CmdBeginQueryIndexedEXT)( } break; } -#endif case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: { - anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) { - pc.CommandStreamerStallEnable = true; - pc.StallAtPixelScoreboard = true; - } + genx_batch_emit_pipe_control(&cmd_buffer->batch, + cmd_buffer->device->info, + cmd_buffer->state.current_pipeline, + ANV_PIPE_CS_STALL_BIT | + ANV_PIPE_STALL_AT_SCOREBOARD_BIT); emit_perf_intel_query(cmd_buffer, pool, &b, query_addr, false); break; } - + case VK_QUERY_TYPE_RESULT_STATUS_ONLY_KHR: + emit_query_mi_flush_availability(cmd_buffer, query_addr, false); + break; default: unreachable(""); } } -void genX(CmdEndQuery)( - VkCommandBuffer commandBuffer, - VkQueryPool queryPool, - uint32_t query) -{ - genX(CmdEndQueryIndexedEXT)(commandBuffer, queryPool, query, 0); -} - void genX(CmdEndQueryIndexedEXT)( VkCommandBuffer commandBuffer, VkQueryPool queryPool, @@ -1075,22 +1211,40 @@ void genX(CmdEndQueryIndexedEXT)( struct anv_address query_addr = anv_query_address(pool, query); struct mi_builder b; - mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch); + mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch); - switch (pool->type) { + switch (pool->vk.query_type) { case VK_QUERY_TYPE_OCCLUSION: emit_ps_depth_count(cmd_buffer, anv_address_add(query_addr, 16)); emit_query_pc_availability(cmd_buffer, query_addr, true); + cmd_buffer->state.gfx.n_occlusion_queries--; + cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_OCCLUSION_QUERY_ACTIVE; + break; + + case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT: + /* Ensure previous commands have completed before capturing the register + * value. 
+ */ + genx_batch_emit_pipe_control(&cmd_buffer->batch, + cmd_buffer->device->info, + cmd_buffer->state.current_pipeline, + ANV_PIPE_CS_STALL_BIT | + ANV_PIPE_STALL_AT_SCOREBOARD_BIT); + + mi_store(&b, mi_mem64(anv_address_add(query_addr, 16)), + mi_reg64(GENX(CL_INVOCATION_COUNT_num))); + emit_query_mi_availability(&b, query_addr, true); break; case VK_QUERY_TYPE_PIPELINE_STATISTICS: { /* TODO: This might only be necessary for certain stats */ - anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) { - pc.CommandStreamerStallEnable = true; - pc.StallAtPixelScoreboard = true; - } + genx_batch_emit_pipe_control(&cmd_buffer->batch, + cmd_buffer->device->info, + cmd_buffer->state.current_pipeline, + ANV_PIPE_CS_STALL_BIT | + ANV_PIPE_STALL_AT_SCOREBOARD_BIT); - uint32_t statistics = pool->pipeline_statistics; + uint32_t statistics = pool->vk.pipeline_statistics; uint32_t offset = 16; while (statistics) { uint32_t stat = u_bit_scan(&statistics); @@ -1103,21 +1257,21 @@ void genX(CmdEndQueryIndexedEXT)( } case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT: - anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) { - pc.CommandStreamerStallEnable = true; - pc.StallAtPixelScoreboard = true; - } - + genx_batch_emit_pipe_control(&cmd_buffer->batch, + cmd_buffer->device->info, + cmd_buffer->state.current_pipeline, + ANV_PIPE_CS_STALL_BIT | + ANV_PIPE_STALL_AT_SCOREBOARD_BIT); emit_xfb_query(&b, index, anv_address_add(query_addr, 16)); emit_query_mi_availability(&b, query_addr, true); break; -#if GFX_VER >= 8 case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: { - anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) { - pc.CommandStreamerStallEnable = true; - pc.StallAtPixelScoreboard = true; - } + genx_batch_emit_pipe_control(&cmd_buffer->batch, + cmd_buffer->device->info, + cmd_buffer->state.current_pipeline, + ANV_PIPE_CS_STALL_BIT | + ANV_PIPE_STALL_AT_SCOREBOARD_BIT); cmd_buffer->perf_query_pool = pool; if (!khr_perf_query_ensure_relocs(cmd_buffer)) @@ -1144,6 +1298,7 @@ void genX(CmdEndQueryIndexedEXT)( case INTEL_PERF_QUERY_FIELD_TYPE_SRM_PERFCNT: case INTEL_PERF_QUERY_FIELD_TYPE_SRM_RPSTAT: + case INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_A: case INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_B: case INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_C: dws = @@ -1189,13 +1344,13 @@ void genX(CmdEndQueryIndexedEXT)( assert(cmd_buffer->perf_reloc_idx == pdevice->n_perf_query_commands); break; } -#endif case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: { - anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) { - pc.CommandStreamerStallEnable = true; - pc.StallAtPixelScoreboard = true; - } + genx_batch_emit_pipe_control(&cmd_buffer->batch, + cmd_buffer->device->info, + cmd_buffer->state.current_pipeline, + ANV_PIPE_CS_STALL_BIT | + ANV_PIPE_STALL_AT_SCOREBOARD_BIT); uint32_t marker_offset = intel_perf_marker_offset(); mi_store(&b, mi_mem64(anv_address_add(query_addr, marker_offset)), mi_imm(cmd_buffer->intel_perf_marker)); @@ -1203,6 +1358,9 @@ void genX(CmdEndQueryIndexedEXT)( emit_query_mi_availability(&b, query_addr, true); break; } + case VK_QUERY_TYPE_RESULT_STATUS_ONLY_KHR: + emit_query_mi_flush_availability(cmd_buffer, query_addr, true); + break; default: unreachable(""); @@ -1216,9 +1374,9 @@ void genX(CmdEndQueryIndexedEXT)( * first index, mark the other query indices as being already available * with result 0. 
*/ - if (cmd_buffer->state.subpass && cmd_buffer->state.subpass->view_mask) { + if (cmd_buffer->state.gfx.view_mask) { const uint32_t num_queries = - util_bitcount(cmd_buffer->state.subpass->view_mask); + util_bitcount(cmd_buffer->state.gfx.view_mask); if (num_queries > 1) emit_zero_queries(cmd_buffer, &b, pool, query + 1, num_queries - 1); } @@ -1226,9 +1384,9 @@ void genX(CmdEndQueryIndexedEXT)( #define TIMESTAMP 0x2358 -void genX(CmdWriteTimestamp)( +void genX(CmdWriteTimestamp2)( VkCommandBuffer commandBuffer, - VkPipelineStageFlagBits pipelineStage, + VkPipelineStageFlags2 stage, VkQueryPool queryPool, uint32_t query) { @@ -1236,34 +1394,49 @@ void genX(CmdWriteTimestamp)( ANV_FROM_HANDLE(anv_query_pool, pool, queryPool); struct anv_address query_addr = anv_query_address(pool, query); - assert(pool->type == VK_QUERY_TYPE_TIMESTAMP); + assert(pool->vk.query_type == VK_QUERY_TYPE_TIMESTAMP); + + emit_query_clear_flush(cmd_buffer, pool, + "CmdWriteTimestamp flush query clears"); struct mi_builder b; - mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch); + mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch); - switch (pipelineStage) { - case VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT: + if (stage == VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT) { mi_store(&b, mi_mem64(anv_address_add(query_addr, 8)), mi_reg64(TIMESTAMP)); - break; - - default: + emit_query_mi_availability(&b, query_addr, true); + } else { /* Everything else is bottom-of-pipe */ cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_POST_SYNC_BIT; genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); - anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) { - pc.DestinationAddressType = DAT_PPGTT; - pc.PostSyncOperation = WriteTimestamp; - pc.Address = anv_address_add(query_addr, 8); + bool cs_stall_needed = + (GFX_VER == 9 && cmd_buffer->device->info->gt == 4); - if (GFX_VER == 9 && cmd_buffer->device->info.gt == 4) - pc.CommandStreamerStallEnable = true; + if (anv_cmd_buffer_is_blitter_queue(cmd_buffer) || + anv_cmd_buffer_is_video_queue(cmd_buffer)) { + /* Wa_16018063123 - emit fast color dummy blit before MI_FLUSH_DW. */ + if (intel_needs_workaround(cmd_buffer->device->info, 16018063123)) { + genX(batch_emit_fast_color_dummy_blit)(&cmd_buffer->batch, + cmd_buffer->device); + } + anv_batch_emit(&cmd_buffer->batch, GENX(MI_FLUSH_DW), dw) { + dw.Address = anv_address_add(query_addr, 8); + dw.PostSyncOperation = WriteTimestamp; + } + emit_query_mi_flush_availability(cmd_buffer, query_addr, true); + } else { + genx_batch_emit_pipe_control_write + (&cmd_buffer->batch, cmd_buffer->device->info, + cmd_buffer->state.current_pipeline, WriteTimestamp, + anv_address_add(query_addr, 8), 0, + cs_stall_needed ? ANV_PIPE_CS_STALL_BIT : 0); + emit_query_pc_availability(cmd_buffer, query_addr, true); } - break; + } - emit_query_pc_availability(cmd_buffer, query_addr, true); /* When multiview is active the spec requires that N consecutive query * indices are used, where N is the number of active views in the subpass. @@ -1273,16 +1446,14 @@ void genX(CmdWriteTimestamp)( * first index, mark the other query indices as being already available * with result 0. 
*/ - if (cmd_buffer->state.subpass && cmd_buffer->state.subpass->view_mask) { + if (cmd_buffer->state.gfx.view_mask) { const uint32_t num_queries = - util_bitcount(cmd_buffer->state.subpass->view_mask); + util_bitcount(cmd_buffer->state.gfx.view_mask); if (num_queries > 1) emit_zero_queries(cmd_buffer, &b, pool, query + 1, num_queries - 1); } } -#if GFX_VERx10 >= 75 - #define MI_PREDICATE_SRC0 0x2400 #define MI_PREDICATE_SRC1 0x2408 #define MI_PREDICATE_RESULT 0x2418 @@ -1341,61 +1512,92 @@ compute_query_result(struct mi_builder *b, struct anv_address addr) mi_mem64(anv_address_add(addr, 0))); } -void genX(CmdCopyQueryPoolResults)( - VkCommandBuffer commandBuffer, - VkQueryPool queryPool, - uint32_t firstQuery, - uint32_t queryCount, - VkBuffer destBuffer, - VkDeviceSize destOffset, - VkDeviceSize destStride, - VkQueryResultFlags flags) +static void +copy_query_results_with_cs(struct anv_cmd_buffer *cmd_buffer, + struct anv_query_pool *pool, + struct anv_address dest_addr, + uint64_t dest_stride, + uint32_t first_query, + uint32_t query_count, + VkQueryResultFlags flags) { - ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); - ANV_FROM_HANDLE(anv_query_pool, pool, queryPool); - ANV_FROM_HANDLE(anv_buffer, buffer, destBuffer); + enum anv_pipe_bits needed_flushes = 0; - struct mi_builder b; - mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch); - struct mi_value result; + trace_intel_begin_query_copy_cs(&cmd_buffer->trace); /* If render target writes are ongoing, request a render target cache flush * to ensure proper ordering of the commands from the 3d pipe and the * command streamer. */ - if (cmd_buffer->state.pending_pipe_bits & ANV_PIPE_RENDER_TARGET_BUFFER_WRITES) { - anv_add_pending_pipe_bits(cmd_buffer, - ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT, - "CopyQueryPoolResults"); + if ((cmd_buffer->state.queries.buffer_write_bits | + cmd_buffer->state.queries.clear_bits) & + ANV_QUERY_WRITES_RT_FLUSH) + needed_flushes |= ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT; + + if ((cmd_buffer->state.queries.buffer_write_bits | + cmd_buffer->state.queries.clear_bits) & + ANV_QUERY_WRITES_TILE_FLUSH) + needed_flushes |= ANV_PIPE_TILE_CACHE_FLUSH_BIT; + + if ((cmd_buffer->state.queries.buffer_write_bits | + cmd_buffer->state.queries.clear_bits) & + ANV_QUERY_WRITES_DATA_FLUSH) { + needed_flushes |= (ANV_PIPE_DATA_CACHE_FLUSH_BIT | + ANV_PIPE_HDC_PIPELINE_FLUSH_BIT | + ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT); } - if ((flags & VK_QUERY_RESULT_WAIT_BIT) || - (cmd_buffer->state.pending_pipe_bits & ANV_PIPE_FLUSH_BITS) || - /* Occlusion & timestamp queries are written using a PIPE_CONTROL and - * because we're about to copy values from MI commands, we need to - * stall the command streamer to make sure the PIPE_CONTROL values have - * landed, otherwise we could see inconsistent values & availability. - * - * From the vulkan spec: - * - * "vkCmdCopyQueryPoolResults is guaranteed to see the effect of - * previous uses of vkCmdResetQueryPool in the same queue, without - * any additional synchronization." 
- */ - pool->type == VK_QUERY_TYPE_OCCLUSION || - pool->type == VK_QUERY_TYPE_TIMESTAMP) { + if ((cmd_buffer->state.queries.buffer_write_bits | + cmd_buffer->state.queries.clear_bits) & + ANV_QUERY_WRITES_CS_STALL) + needed_flushes |= ANV_PIPE_CS_STALL_BIT; + + /* Occlusion & timestamp queries are written using a PIPE_CONTROL and + * because we're about to copy values from MI commands, we need to stall + * the command streamer to make sure the PIPE_CONTROL values have + * landed, otherwise we could see inconsistent values & availability. + * + * From the vulkan spec: + * + * "vkCmdCopyQueryPoolResults is guaranteed to see the effect of + * previous uses of vkCmdResetQueryPool in the same queue, without any + * additional synchronization." + */ + if (pool->vk.query_type == VK_QUERY_TYPE_OCCLUSION || + pool->vk.query_type == VK_QUERY_TYPE_TIMESTAMP) + needed_flushes |= ANV_PIPE_CS_STALL_BIT; + + if (needed_flushes) { anv_add_pending_pipe_bits(cmd_buffer, - ANV_PIPE_CS_STALL_BIT, + needed_flushes, "CopyQueryPoolResults"); genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); } - struct anv_address dest_addr = anv_address_add(buffer->address, destOffset); - for (uint32_t i = 0; i < queryCount; i++) { - struct anv_address query_addr = anv_query_address(pool, firstQuery + i); + struct mi_builder b; + mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch); + struct mi_value result; + + for (uint32_t i = 0; i < query_count; i++) { + struct anv_address query_addr = anv_query_address(pool, first_query + i); + const uint32_t mocs = anv_mocs_for_address(cmd_buffer->device, &query_addr); + + mi_builder_set_mocs(&b, mocs); + + /* Wait for the availability write to land before we go read the data */ + if (flags & VK_QUERY_RESULT_WAIT_BIT) { + anv_batch_emit(&cmd_buffer->batch, GENX(MI_SEMAPHORE_WAIT), sem) { + sem.WaitMode = PollingMode; + sem.CompareOperation = COMPARE_SAD_EQUAL_SDD; + sem.SemaphoreDataDword = true; + sem.SemaphoreAddress = query_addr; + } + } + uint32_t idx = 0; - switch (pool->type) { + switch (pool->vk.query_type) { case VK_QUERY_TYPE_OCCLUSION: + case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT: result = compute_query_result(&b, anv_address_add(query_addr, 8)); /* Like in the case of vkGetQueryPoolResults, if the query is * unavailable and the VK_QUERY_RESULT_PARTIAL_BIT flag is set, @@ -1403,32 +1605,23 @@ void genX(CmdCopyQueryPoolResults)( * VK_QUERY_RESULT_PARTIAL_BIT isn't set, don't write any value. 
*/ gpu_write_query_result_cond(cmd_buffer, &b, query_addr, dest_addr, - 1 /* available */, flags, idx, result); + 1 /* available */, flags, idx, result); if (flags & VK_QUERY_RESULT_PARTIAL_BIT) { gpu_write_query_result_cond(cmd_buffer, &b, query_addr, dest_addr, - 0 /* unavailable */, flags, idx, mi_imm(0)); + 0 /* unavailable */, flags, idx, mi_imm(0)); } idx++; break; case VK_QUERY_TYPE_PIPELINE_STATISTICS: { - uint32_t statistics = pool->pipeline_statistics; + uint32_t statistics = pool->vk.pipeline_statistics; while (statistics) { - uint32_t stat = u_bit_scan(&statistics); - + UNUSED uint32_t stat = u_bit_scan(&statistics); result = compute_query_result(&b, anv_address_add(query_addr, idx * 16 + 8)); - - /* WaDividePSInvocationCountBy4:HSW,BDW */ - if ((cmd_buffer->device->info.ver == 8 || - cmd_buffer->device->info.is_haswell) && - (1 << stat) == VK_QUERY_PIPELINE_STATISTIC_FRAGMENT_SHADER_INVOCATIONS_BIT) { - result = mi_ushr32_imm(&b, result, 2); - } - gpu_write_query_result(&b, dest_addr, flags, idx++, result); } - assert(idx == util_bitcount(pool->pipeline_statistics)); + assert(idx == util_bitcount(pool->vk.pipeline_statistics)); break; } @@ -1444,11 +1637,23 @@ void genX(CmdCopyQueryPoolResults)( gpu_write_query_result(&b, dest_addr, flags, idx++, result); break; -#if GFX_VER >= 8 +#if GFX_VERx10 >= 125 + case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_COMPACTED_SIZE_KHR: + case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_SIZE_KHR: + case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SIZE_KHR: + result = mi_mem64(anv_address_add(query_addr, 8)); + gpu_write_query_result(&b, dest_addr, flags, idx++, result); + break; + + case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_BOTTOM_LEVEL_POINTERS_KHR: + result = mi_mem64(anv_address_add(query_addr, 16)); + gpu_write_query_result(&b, dest_addr, flags, idx++, result); + break; +#endif + case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: unreachable("Copy KHR performance query results not implemented"); break; -#endif default: unreachable("unhandled query type"); @@ -1459,11 +1664,182 @@ void genX(CmdCopyQueryPoolResults)( mi_mem64(query_addr)); } - dest_addr = anv_address_add(dest_addr, destStride); + dest_addr = anv_address_add(dest_addr, dest_stride); } + + trace_intel_end_query_copy_cs(&cmd_buffer->trace, query_count); +} + +static void +copy_query_results_with_shader(struct anv_cmd_buffer *cmd_buffer, + struct anv_query_pool *pool, + struct anv_address dest_addr, + uint64_t dest_stride, + uint32_t first_query, + uint32_t query_count, + VkQueryResultFlags flags) +{ + struct anv_device *device = cmd_buffer->device; + enum anv_pipe_bits needed_flushes = 0; + + trace_intel_begin_query_copy_shader(&cmd_buffer->trace); + + /* If this is the first command in the batch buffer, make sure we have + * consistent pipeline mode. + */ + if (cmd_buffer->state.current_pipeline == UINT32_MAX) + genX(flush_pipeline_select_3d)(cmd_buffer); + + if ((cmd_buffer->state.queries.buffer_write_bits | + cmd_buffer->state.queries.clear_bits) & ANV_QUERY_WRITES_RT_FLUSH) + needed_flushes |= ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT; + + if ((cmd_buffer->state.queries.buffer_write_bits | + cmd_buffer->state.queries.clear_bits) & ANV_QUERY_WRITES_DATA_FLUSH) { + needed_flushes |= (ANV_PIPE_HDC_PIPELINE_FLUSH_BIT | + ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT); + } + + /* Flushes for the queries to complete */ + if (flags & VK_QUERY_RESULT_WAIT_BIT) { + /* Some queries are done with shaders, so we need to have them flush + * high level caches writes. 
The L3 should be shared across the GPU. + */ + if (pool->vk.query_type == VK_QUERY_TYPE_ACCELERATION_STRUCTURE_COMPACTED_SIZE_KHR || + pool->vk.query_type == VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SIZE_KHR || + pool->vk.query_type == VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_SIZE_KHR || + pool->vk.query_type == VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_BOTTOM_LEVEL_POINTERS_KHR) { + needed_flushes |= ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT; + } + /* And we need to stall for previous CS writes to land or the flushes to + * complete. + */ + needed_flushes |= ANV_PIPE_CS_STALL_BIT; + } + + /* Occlusion & timestamp queries are written using a PIPE_CONTROL and + * because we're about to copy values from MI commands, we need to stall + * the command streamer to make sure the PIPE_CONTROL values have + * landed, otherwise we could see inconsistent values & availability. + * + * From the vulkan spec: + * + * "vkCmdCopyQueryPoolResults is guaranteed to see the effect of + * previous uses of vkCmdResetQueryPool in the same queue, without any + * additional synchronization." + */ + if (pool->vk.query_type == VK_QUERY_TYPE_OCCLUSION || + pool->vk.query_type == VK_QUERY_TYPE_TIMESTAMP) + needed_flushes |= ANV_PIPE_CS_STALL_BIT; + + if (needed_flushes) { + anv_add_pending_pipe_bits(cmd_buffer, + needed_flushes | ANV_PIPE_END_OF_PIPE_SYNC_BIT, + "CopyQueryPoolResults"); + genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); + } + + struct anv_shader_bin *copy_kernel; + VkResult ret = + anv_device_get_internal_shader( + cmd_buffer->device, + cmd_buffer->state.current_pipeline == GPGPU ? + ANV_INTERNAL_KERNEL_COPY_QUERY_RESULTS_COMPUTE : + ANV_INTERNAL_KERNEL_COPY_QUERY_RESULTS_FRAGMENT, + ©_kernel); + if (ret != VK_SUCCESS) { + anv_batch_set_error(&cmd_buffer->batch, ret); + return; + } + + struct anv_simple_shader state = { + .device = cmd_buffer->device, + .cmd_buffer = cmd_buffer, + .dynamic_state_stream = &cmd_buffer->dynamic_state_stream, + .general_state_stream = &cmd_buffer->general_state_stream, + .batch = &cmd_buffer->batch, + .kernel = copy_kernel, + .l3_config = device->internal_kernels_l3_config, + .urb_cfg = &cmd_buffer->state.gfx.urb_cfg, + }; + genX(emit_simple_shader_init)(&state); + + struct anv_state push_data_state = + genX(simple_shader_alloc_push)(&state, + sizeof(struct anv_query_copy_params)); + if (push_data_state.map == NULL) + return; + + struct anv_query_copy_params *params = push_data_state.map; + + uint32_t copy_flags = + ((flags & VK_QUERY_RESULT_64_BIT) ? ANV_COPY_QUERY_FLAG_RESULT64 : 0) | + ((flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) ? ANV_COPY_QUERY_FLAG_AVAILABLE : 0); + + uint32_t num_items = 1; + uint32_t data_offset = 8 /* behind availability */; + switch (pool->vk.query_type) { + case VK_QUERY_TYPE_OCCLUSION: + case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT: + copy_flags |= ANV_COPY_QUERY_FLAG_DELTA; + /* These 2 queries are the only ones where we would have partial data + * because they are capture with a PIPE_CONTROL post sync operation. The + * other ones are captured with MI_STORE_REGISTER_DATA so we're always + * available by the time we reach the copy command. + */ + copy_flags |= (flags & VK_QUERY_RESULT_PARTIAL_BIT) ? 
ANV_COPY_QUERY_FLAG_PARTIAL : 0; + break; + + case VK_QUERY_TYPE_PIPELINE_STATISTICS: + num_items = util_bitcount(pool->vk.pipeline_statistics); + copy_flags |= ANV_COPY_QUERY_FLAG_DELTA; + break; + + case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT: + num_items = 2; + copy_flags |= ANV_COPY_QUERY_FLAG_DELTA; + break; + + case VK_QUERY_TYPE_TIMESTAMP: + case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_COMPACTED_SIZE_KHR: + case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_SIZE_KHR: + case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SIZE_KHR: + break; + + case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_BOTTOM_LEVEL_POINTERS_KHR: + data_offset += 8; + break; + + default: + unreachable("unhandled query type"); + } + + *params = (struct anv_query_copy_params) { + .flags = copy_flags, + .num_queries = query_count, + .num_items = num_items, + .query_base = first_query, + .query_stride = pool->stride, + .query_data_offset = data_offset, + .destination_stride = dest_stride, + .query_data_addr = anv_address_physical( + (struct anv_address) { + .bo = pool->bo, + }), + .destination_addr = anv_address_physical(dest_addr), + }; + + genX(emit_simple_shader_dispatch)(&state, query_count, push_data_state); + + /* The query copy result shader is writing using the dataport, flush + * HDC/Data cache depending on the generation. Also stall at pixel + * scoreboard in case we're doing the copy with a fragment shader. + */ + cmd_buffer->state.queries.buffer_write_bits |= ANV_QUERY_WRITES_DATA_FLUSH; + + trace_intel_end_query_copy_shader(&cmd_buffer->trace, query_count); } -#else void genX(CmdCopyQueryPoolResults)( VkCommandBuffer commandBuffer, VkQueryPool queryPool, @@ -1474,6 +1850,99 @@ void genX(CmdCopyQueryPoolResults)( VkDeviceSize destStride, VkQueryResultFlags flags) { - anv_finishme("Queries not yet supported on Ivy Bridge"); + ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); + ANV_FROM_HANDLE(anv_query_pool, pool, queryPool); + ANV_FROM_HANDLE(anv_buffer, buffer, destBuffer); + struct anv_device *device = cmd_buffer->device; + struct anv_physical_device *pdevice = device->physical; + + if (queryCount > pdevice->instance->query_copy_with_shader_threshold) { + copy_query_results_with_shader(cmd_buffer, pool, + anv_address_add(buffer->address, + destOffset), + destStride, + firstQuery, + queryCount, + flags); + } else { + copy_query_results_with_cs(cmd_buffer, pool, + anv_address_add(buffer->address, + destOffset), + destStride, + firstQuery, + queryCount, + flags); + } +} + +#if GFX_VERx10 == 125 && ANV_SUPPORT_RT + +#include "grl/include/GRLRTASCommon.h" +#include "grl/grl_metakernel_postbuild_info.h" + +void +genX(CmdWriteAccelerationStructuresPropertiesKHR)( + VkCommandBuffer commandBuffer, + uint32_t accelerationStructureCount, + const VkAccelerationStructureKHR* pAccelerationStructures, + VkQueryType queryType, + VkQueryPool queryPool, + uint32_t firstQuery) +{ + ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); + ANV_FROM_HANDLE(anv_query_pool, pool, queryPool); + + assert(queryType == VK_QUERY_TYPE_ACCELERATION_STRUCTURE_COMPACTED_SIZE_KHR || + queryType == VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_SIZE_KHR || + queryType == VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SIZE_KHR || + queryType == VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_BOTTOM_LEVEL_POINTERS_KHR); + + emit_query_clear_flush(cmd_buffer, pool, + "CmdWriteAccelerationStructuresPropertiesKHR flush query clears"); + + struct mi_builder b; + mi_builder_init(&b, cmd_buffer->device->info, 
&cmd_buffer->batch); + + for (uint32_t i = 0; i < accelerationStructureCount; i++) { + ANV_FROM_HANDLE(vk_acceleration_structure, accel, pAccelerationStructures[i]); + struct anv_address query_addr = + anv_address_add(anv_query_address(pool, firstQuery + i), 8); + + switch (queryType) { + case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_COMPACTED_SIZE_KHR: + genX(grl_postbuild_info_compacted_size)(cmd_buffer, + vk_acceleration_structure_get_va(accel), + anv_address_physical(query_addr)); + break; + + case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SIZE_KHR: + genX(grl_postbuild_info_current_size)(cmd_buffer, + vk_acceleration_structure_get_va(accel), + anv_address_physical(query_addr)); + break; + + case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_SIZE_KHR: + case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_BOTTOM_LEVEL_POINTERS_KHR: + genX(grl_postbuild_info_serialized_size)(cmd_buffer, + vk_acceleration_structure_get_va(accel), + anv_address_physical(query_addr)); + break; + + default: + unreachable("unhandled query type"); + } + } + + /* TODO: Figure out why MTL needs ANV_PIPE_DATA_CACHE_FLUSH_BIT in order + * to not lose the availability bit. + */ + anv_add_pending_pipe_bits(cmd_buffer, + ANV_PIPE_END_OF_PIPE_SYNC_BIT | + ANV_PIPE_DATA_CACHE_FLUSH_BIT, + "after write acceleration struct props"); + genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); + + for (uint32_t i = 0; i < accelerationStructureCount; i++) + emit_query_mi_availability(&b, anv_query_address(pool, firstQuery + i), true); } #endif |
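For VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR, the patch moves the ANV_PERF_QUERY_OFFSET_REG preamble batches out of the query slots and appends them past the slots in the BO, 32 bytes per pass. A sketch of the resulting sizing and of the per-pass availability addressing implied by the layout comment in the diff (field names paraphrased):

    /* Sketch of the BO sizing and per-pass availability addressing for
     * VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR. The 32-byte preamble regions
     * appended past the query slots hold the small MI batches that load
     * ANV_PERF_QUERY_OFFSET_REG for each pass. */
    #include <stdint.h>

    struct khr_perf_sizes {
       uint64_t bo_size;
       uint64_t preambles_offset;
    };

    static struct khr_perf_sizes
    khr_perf_pool_sizes(uint32_t query_count, uint32_t stride,
                        uint32_t n_passes)
    {
       const uint32_t preamble_stride = 32; /* khr_perf_preamble_stride */
       uint64_t slots_size = (uint64_t)query_count * stride;
       return (struct khr_perf_sizes) {
          .bo_size = slots_size + (uint64_t)n_passes * preamble_stride,
          .preambles_offset = slots_size,
       };
    }

    /* Availability word of (query, pass): each slot is 'stride' bytes,
     * subdivided into n_passes regions of 'pass_size' bytes, each of which
     * begins with its own 8-byte availability word (see the layout comment
     * in the diff). */
    static uint64_t
    khr_perf_availability_offset(uint32_t query, uint32_t stride,
                                 uint32_t pass, uint32_t pass_size)
    {
       return (uint64_t)query * stride + (uint64_t)pass * pass_size;
    }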
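wait_for_available() now scales its timeout for KHR performance queries. A minimal standalone model of that arithmetic, with now_ns() as a stand-in for os_time_get_nano() and the availability check abstracted into a callback:

    /* Model of the timeout arithmetic in wait_for_available(). */
    #include <stdbool.h>
    #include <stdint.h>
    #include <time.h>

    static uint64_t now_ns(void)
    {
       struct timespec ts;
       clock_gettime(CLOCK_MONOTONIC, &ts);
       return (uint64_t)ts.tv_sec * 1000000000ull + (uint64_t)ts.tv_nsec;
    }

    static bool wait_for_available_model(bool is_khr_perf, uint32_t n_passes,
                                         bool (*available)(void))
    {
       uint64_t rel_timeout = 2ull * 1000000000ull; /* 2s base timeout */
       if (is_khr_perf) {
          rel_timeout += 500 * 1000; /* ~500us i915 reconfiguration time */
          rel_timeout *= n_passes;   /* one replay per metric set */
       }
       const uint64_t abs_timeout = now_ns() + rel_timeout;
       while (now_ns() < abs_timeout) {
          if (available())
             return true;
          /* the driver also checks vk_device_check_status() each iteration */
       }
       return false; /* caller reports "query timeout" and marks device lost */
    }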
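The cpu_write_query_result() helper used throughout genX(GetQueryPoolResults) is not part of this diff; the model below captures its expected behavior, writing each value as 32 or 64 bits depending on VK_QUERY_RESULT_64_BIT (the flag constant here is a stand-in):

    /* Hypothetical model of cpu_write_query_result(); the real helper lives
     * elsewhere in anv. RESULT_64_BIT stands in for VK_QUERY_RESULT_64_BIT. */
    #include <stdint.h>

    #define RESULT_64_BIT 0x1u /* stand-in flag */

    static void cpu_write_query_result_model(void *dst, uint32_t flags,
                                             uint32_t value_index,
                                             uint64_t result)
    {
       if (flags & RESULT_64_BIT) {
          uint64_t *dst64 = dst;
          dst64[value_index] = result;
       } else {
          uint32_t *dst32 = dst;
          dst32[value_index] = (uint32_t)result;
       }
    }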
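genX(CmdResetQueryPool) now picks between two clear strategies. A sketch of the decision, with the two emission paths abstracted as callbacks; query_clear_with_blorp_threshold is the instance knob named in the hunk above:

    /* Sketch of the reset strategy selection in genX(CmdResetQueryPool). */
    #include <stdbool.h>
    #include <stdint.h>

    static void reset_query_pool_model(bool render_or_compute_queue,
                                       uint32_t query_count,
                                       uint32_t blorp_threshold,
                                       void (*fill_with_shader)(void),
                                       void (*reset_each_slot)(void))
    {
       if (render_or_compute_queue && query_count >= blorp_threshold) {
          /* One shader fill of query_count * pool->stride bytes; the kind of
           * pending write (compute vs. render target) is recorded so later
           * query commands know which caches to flush. */
          fill_with_shader();
       } else {
          /* Per-slot availability writes: PIPE_CONTROL, MI_STORE, or
           * MI_FLUSH_DW depending on the query type and queue. */
          reset_each_slot();
       }
    }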
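genX(CmdWriteTimestamp2) ends up with three emission paths. A plain-C model of which one a given stage and queue select (the actual emission uses mi_store, MI_FLUSH_DW, or PIPE_CONTROL as shown above):

    /* Model of the path selection in genX(CmdWriteTimestamp2). */
    #include <stdbool.h>

    enum ts_path {
       TS_MI_STORE,     /* top of pipe: MI store of the TIMESTAMP register */
       TS_MI_FLUSH_DW,  /* blitter/video queues: MI_FLUSH_DW WriteTimestamp */
       TS_PIPE_CONTROL, /* everything else: PIPE_CONTROL WriteTimestamp */
    };

    static enum ts_path timestamp_path(bool top_of_pipe,
                                       bool blitter_or_video_queue)
    {
       if (top_of_pipe)
          return TS_MI_STORE;
       if (blitter_or_video_queue)
          return TS_MI_FLUSH_DW; /* availability also via MI_FLUSH_DW */
       return TS_PIPE_CONTROL;   /* availability via PIPE_CONTROL post-sync */
    }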
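Both CmdEndQueryIndexedEXT and CmdWriteTimestamp2 apply the multiview rule: with N views active in the render pass, the N-1 query indices following the written one are zeroed and marked available. A sketch, with the zeroing abstracted as a callback for emit_zero_queries():

    /* Sketch of the multiview fix-up after ending a query or writing a
     * timestamp; zero_queries() abstracts emit_zero_queries(). */
    #include <stdint.h>

    static void zero_trailing_view_queries(uint32_t view_mask, uint32_t query,
                                           void (*zero_queries)(uint32_t first,
                                                                uint32_t count))
    {
       const uint32_t num_queries = (uint32_t)__builtin_popcount(view_mask);
       if (num_queries > 1)
          zero_queries(query + 1, num_queries - 1);
    }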
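copy_query_results_with_shader(), chosen when queryCount exceeds query_copy_with_shader_threshold, pushes per-type parameters to the internal copy kernel. The switch above reduces to the following sketch; the enum here is hypothetical, the real code switches on pool->vk.query_type:

    /* Sketch of the per-type parameter derivation for the copy kernel. */
    #include <stdbool.h>
    #include <stdint.h>

    enum sketch_query_type {
       Q_OCCLUSION_OR_PRIMGEN,  /* occlusion, primitives generated */
       Q_PIPELINE_STATISTICS,
       Q_XFB,                   /* transform feedback stream */
       Q_TIMESTAMP_OR_AS_SIZE,  /* timestamp, AS size/compacted/serial. size */
       Q_AS_SERIALIZATION_BLP,  /* AS serialization bottom-level pointers */
    };

    struct copy_derivation {
       uint32_t num_items;   /* 64-bit values copied per query */
       uint32_t data_offset; /* bytes past the 8-byte availability word */
       bool delta;           /* kernel computes end - begin */
    };

    static struct copy_derivation
    derive_copy_params(enum sketch_query_type type,
                       uint32_t pipeline_statistics)
    {
       struct copy_derivation d = {
          .num_items = 1, .data_offset = 8, .delta = false,
       };
       switch (type) {
       case Q_OCCLUSION_OR_PRIMGEN:
          d.delta = true; /* may also be a PARTIAL copy, see flags above */
          break;
       case Q_PIPELINE_STATISTICS:
          d.num_items = (uint32_t)__builtin_popcount(pipeline_statistics);
          d.delta = true;
          break;
       case Q_XFB:
          d.num_items = 2;
          d.delta = true;
          break;
       case Q_TIMESTAMP_OR_AS_SIZE:
          break; /* single absolute value right after availability */
       case Q_AS_SERIALIZATION_BLP:
          d.data_offset += 8; /* pointer count is the second data word */
          break;
       }
       return d;
    }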