Diffstat (limited to 'src/intel/vulkan/genX_query.c')
-rw-r--r--  src/intel/vulkan/genX_query.c  959
1 file changed, 714 insertions(+), 245 deletions(-)
diff --git a/src/intel/vulkan/genX_query.c b/src/intel/vulkan/genX_query.c
index 8978f5843a9..2cb492afcf9 100644
--- a/src/intel/vulkan/genX_query.c
+++ b/src/intel/vulkan/genX_query.c
@@ -29,15 +29,21 @@
#include "anv_private.h"
+#include "util/os_time.h"
+
#include "genxml/gen_macros.h"
#include "genxml/genX_pack.h"
+#include "ds/intel_tracepoints.h"
+
+#include "anv_internal_kernels.h"
+
/* We reserve :
* - GPR 14 for perf queries
* - GPR 15 for conditional rendering
*/
#define MI_BUILDER_NUM_ALLOC_GPRS 14
-#define MI_BUILDER_CAN_WRITE_BATCH GFX_VER >= 8
+#define MI_BUILDER_CAN_WRITE_BATCH true
#define __gen_get_batch_dwords anv_batch_emit_dwords
#define __gen_address_offset anv_address_add
#define __gen_get_batch_address(b, a) anv_batch_address(b, a)
@@ -57,6 +63,18 @@ anv_query_address(struct anv_query_pool *pool, uint32_t query)
};
}
+static void
+emit_query_mi_flush_availability(struct anv_cmd_buffer *cmd_buffer,
+ struct anv_address addr,
+ bool available)
+{
+ anv_batch_emit(&cmd_buffer->batch, GENX(MI_FLUSH_DW), flush) {
+ flush.PostSyncOperation = WriteImmediateData;
+ flush.Address = addr;
+ flush.ImmediateData = available;
+ }
+}
+
VkResult genX(CreateQueryPool)(
VkDevice _device,
const VkQueryPoolCreateInfo* pCreateInfo,
@@ -65,12 +83,10 @@ VkResult genX(CreateQueryPool)(
{
ANV_FROM_HANDLE(anv_device, device, _device);
const struct anv_physical_device *pdevice = device->physical;
-#if GFX_VER >= 8
const VkQueryPoolPerformanceCreateInfoKHR *perf_query_info = NULL;
struct intel_perf_counter_pass *counter_pass;
struct intel_perf_query_info **pass_query;
uint32_t n_passes = 0;
-#endif
uint32_t data_offset = 0;
VK_MULTIALLOC(ma);
VkResult result;
@@ -123,14 +139,13 @@ VkResult genX(CreateQueryPool)(
uint64s_per_slot = 2; /* availability + marker */
/* Align to the requirement of the layout */
- uint64s_per_slot = align_u32(uint64s_per_slot,
- DIV_ROUND_UP(layout->alignment, sizeof(uint64_t)));
+ uint64s_per_slot = align(uint64s_per_slot,
+ DIV_ROUND_UP(layout->alignment, sizeof(uint64_t)));
data_offset = uint64s_per_slot * sizeof(uint64_t);
/* Add the query data for begin & end commands */
uint64s_per_slot += 2 * DIV_ROUND_UP(layout->size, sizeof(uint64_t));
break;
}
-#if GFX_VER >= 8
case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: {
const struct intel_perf_query_field_layout *layout =
&pdevice->perf->query_layout;
@@ -145,10 +160,10 @@ VkResult genX(CreateQueryPool)(
perf_query_info->counterIndexCount);
vk_multialloc_add(&ma, &pass_query, struct intel_perf_query_info *,
n_passes);
- uint64s_per_slot = 4 /* availability + small batch */;
+ uint64s_per_slot = 1 /* availability */;
/* Align to the requirement of the layout */
- uint64s_per_slot = align_u32(uint64s_per_slot,
- DIV_ROUND_UP(layout->alignment, sizeof(uint64_t)));
+ uint64s_per_slot = align(uint64s_per_slot,
+ DIV_ROUND_UP(layout->alignment, sizeof(uint64_t)));
data_offset = uint64s_per_slot * sizeof(uint64_t);
/* Add the query data for begin & end commands */
uint64s_per_slot += 2 * DIV_ROUND_UP(layout->size, sizeof(uint64_t));
@@ -156,26 +171,41 @@ VkResult genX(CreateQueryPool)(
uint64s_per_slot *= n_passes;
break;
}
+ case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT:
+ /* Query has two values: begin and end. */
+ uint64s_per_slot = 1 + 2;
+ break;
+#if GFX_VERx10 >= 125
+ case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_COMPACTED_SIZE_KHR:
+ case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SIZE_KHR:
+ uint64s_per_slot = 1 + 1 /* availability + size (PostbuildInfoCurrentSize, PostbuildInfoCompactedSize) */;
+ break;
+
+ case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_SIZE_KHR:
+ case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_BOTTOM_LEVEL_POINTERS_KHR:
+ uint64s_per_slot = 1 + 2 /* availability + size (PostbuildInfoSerializationDesc) */;
+ break;
+
#endif
+ case VK_QUERY_TYPE_RESULT_STATUS_ONLY_KHR:
+ uint64s_per_slot = 1;
+ break;
default:
assert(!"Invalid query type");
}
- if (!vk_object_multialloc(&device->vk, &ma, pAllocator,
- VK_OBJECT_TYPE_QUERY_POOL))
- return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+ if (!vk_multialloc_zalloc2(&ma, &device->vk.alloc, pAllocator,
+ VK_SYSTEM_ALLOCATION_SCOPE_OBJECT))
+ return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
- pool->type = pCreateInfo->queryType;
- pool->pipeline_statistics = pipeline_statistics;
+ vk_query_pool_init(&device->vk, &pool->vk, pCreateInfo);
pool->stride = uint64s_per_slot * sizeof(uint64_t);
- pool->slots = pCreateInfo->queryCount;
- if (pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL) {
+ if (pool->vk.query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL) {
pool->data_offset = data_offset;
pool->snapshot_size = (pool->stride - data_offset) / 2;
}
-#if GFX_VER >= 8
- else if (pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
+ else if (pool->vk.query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
pool->pass_size = pool->stride / n_passes;
pool->data_offset = data_offset;
pool->snapshot_size = (pool->pass_size - data_offset) / 2;
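
A rough worked example of the performance-query slot sizing above, assuming a query field layout with alignment = 64 bytes and size = 256 bytes and a pool created for 2 passes (all values hypothetical):

/* Hypothetical sizing walk-through for VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR,
 * mirroring the code above; layout values are made up for illustration. */
uint32_t layout_alignment = 64;   /* query_field_layout:alignment */
uint32_t layout_size      = 256;  /* query_field_layout:size      */
uint32_t n_passes         = 2;

uint32_t uint64s_per_slot = 1;    /* availability */
uint64s_per_slot = align(uint64s_per_slot,
                         DIV_ROUND_UP(layout_alignment, sizeof(uint64_t))); /* -> 8 */
uint32_t data_offset = uint64s_per_slot * sizeof(uint64_t);                 /* -> 64 bytes */
uint64s_per_slot += 2 * DIV_ROUND_UP(layout_size, sizeof(uint64_t));        /* 8 + 64 = 72 */
uint64s_per_slot *= n_passes;                                               /* -> 144 */

/* pool->stride        = 144 * 8  = 1152 bytes per query slot
 * pool->pass_size     = 1152 / 2 = 576 bytes per pass
 * pool->snapshot_size = (576 - 64) / 2 = 256 bytes (one begin or end snapshot) */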
@@ -192,19 +222,27 @@ VkResult genX(CreateQueryPool)(
perf_query_info->counterIndexCount,
pool->pass_query);
}
-#endif
- uint64_t size = pool->slots * (uint64_t)pool->stride;
+ uint64_t size = pool->vk.query_count * (uint64_t)pool->stride;
+
+ /* For KHR_performance_query we need some space in the buffer for a small
+ * batch updating ANV_PERF_QUERY_OFFSET_REG.
+ */
+ if (pool->vk.query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
+ pool->khr_perf_preamble_stride = 32;
+ pool->khr_perf_preambles_offset = size;
+ size += (uint64_t)pool->n_passes * pool->khr_perf_preamble_stride;
+ }
+
result = anv_device_alloc_bo(device, "query-pool", size,
ANV_BO_ALLOC_MAPPED |
- ANV_BO_ALLOC_SNOOPED,
+ ANV_BO_ALLOC_HOST_CACHED_COHERENT,
0 /* explicit_address */,
&pool->bo);
if (result != VK_SUCCESS)
goto fail;
-#if GFX_VER >= 8
- if (pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
+ if (pool->vk.query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
for (uint32_t p = 0; p < pool->n_passes; p++) {
struct mi_builder b;
struct anv_batch batch = {
@@ -213,13 +251,14 @@ VkResult genX(CreateQueryPool)(
};
batch.next = batch.start;
- mi_builder_init(&b, &device->info, &batch);
+ mi_builder_init(&b, device->info, &batch);
mi_store(&b, mi_reg64(ANV_PERF_QUERY_OFFSET_REG),
mi_imm(p * (uint64_t)pool->pass_size));
anv_batch_emit(&batch, GENX(MI_BATCH_BUFFER_END), bbe);
}
}
-#endif
+
+ ANV_RMV(query_pool_create, device, pool, false);
*pQueryPool = anv_query_pool_to_handle(pool);
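
The per-pass preambles written above are tiny self-contained batches stored at the end of the query pool BO; a minimal sketch of how a pass preamble is located and what executing it does (the helper name below is hypothetical):

/* Sketch: the preamble for pass p lives khr_perf_preamble_stride (32) bytes
 * apart, after all the query slots. Helper name assumed for illustration. */
static uint64_t
khr_perf_preamble_offset(const struct anv_query_pool *pool, uint32_t pass)
{
   return pool->khr_perf_preambles_offset +
          (uint64_t)pass * pool->khr_perf_preamble_stride;
}

/* Executing the pass p preamble loads p * pool->pass_size into
 * ANV_PERF_QUERY_OFFSET_REG and ends the batch, so the recorded query
 * commands can add that register to their base addresses and land in the
 * right pass section of each slot. */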
@@ -242,47 +281,36 @@ void genX(DestroyQueryPool)(
if (!pool)
return;
+ ANV_RMV(resource_destroy, device, pool);
+
anv_device_release_bo(device, pool->bo);
vk_object_free(&device->vk, pAllocator, pool);
}
-#if GFX_VER >= 8
/**
* VK_KHR_performance_query layout :
*
* --------------------------------------------
* | availability (8b) | | |
* |-------------------------------| | |
- * | Small batch loading | | |
- * | ANV_PERF_QUERY_OFFSET_REG | | |
- * | (24b) | | Pass 0 |
- * |-------------------------------| | |
* | some padding (see | | |
- * | query_field_layout:alignment) | | |
+ * | query_field_layout:alignment) | | Pass 0 |
* |-------------------------------| | |
* | query data | | |
* | (2 * query_field_layout:size) | | |
* |-------------------------------|-- | Query 0
* | availability (8b) | | |
* |-------------------------------| | |
- * | Small batch loading | | |
- * | ANV_PERF_QUERY_OFFSET_REG | | |
- * | (24b) | | Pass 1 |
- * |-------------------------------| | |
* | some padding (see | | |
- * | query_field_layout:alignment) | | |
+ * | query_field_layout:alignment) | | Pass 1 |
* |-------------------------------| | |
* | query data | | |
* | (2 * query_field_layout:size) | | |
* |-------------------------------|-----------
* | availability (8b) | | |
* |-------------------------------| | |
- * | Small batch loading | | |
- * | ANV_PERF_QUERY_OFFSET_REG | | |
- * | (24b) | | Pass 0 |
- * |-------------------------------| | |
* | some padding (see | | |
- * | query_field_layout:alignment) | | |
+ * | query_field_layout:alignment) | | Pass 0 |
* |-------------------------------| | |
* | query data | | |
* | (2 * query_field_layout:size) | | |
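
Given this layout, the offsets used throughout this file reduce to simple arithmetic; a sketch consistent with the diagram above (the actual khr_perf_query_availability_offset / khr_perf_query_data_offset helpers are defined later in this file):

/* Sketch of the offset math implied by the layout diagram. */
static uint64_t
khr_perf_query_availability_offset(struct anv_query_pool *pool,
                                   uint32_t query, uint32_t pass)
{
   return query * (uint64_t)pool->stride +    /* start of the query slot   */
          pass  * (uint64_t)pool->pass_size;  /* start of the pass section */
}

static uint64_t
khr_perf_query_data_offset(struct anv_query_pool *pool,
                           uint32_t query, uint32_t pass, bool end)
{
   return khr_perf_query_availability_offset(pool, query, pass) +
          pool->data_offset +                 /* skip availability + padding */
          (end ? pool->snapshot_size : 0);    /* begin vs. end snapshot      */
}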
@@ -333,7 +361,7 @@ khr_perf_query_ensure_relocs(struct anv_cmd_buffer *cmd_buffer)
const struct anv_physical_device *pdevice = device->physical;
cmd_buffer->self_mod_locations =
- vk_alloc(&cmd_buffer->pool->alloc,
+ vk_alloc(&cmd_buffer->vk.pool->alloc,
pdevice->n_perf_query_commands * sizeof(*cmd_buffer->self_mod_locations), 8,
VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
@@ -344,7 +372,6 @@ khr_perf_query_ensure_relocs(struct anv_cmd_buffer *cmd_buffer)
return true;
}
-#endif
/**
* VK_INTEL_performance_query layout :
@@ -396,8 +423,7 @@ query_slot(struct anv_query_pool *pool, uint32_t query)
static bool
query_is_available(struct anv_query_pool *pool, uint32_t query)
{
-#if GFX_VER >= 8
- if (pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
+ if (pool->vk.query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
for (uint32_t p = 0; p < pool->n_passes; p++) {
volatile uint64_t *slot =
pool->bo->map + khr_perf_query_availability_offset(pool, query, p);
@@ -406,7 +432,6 @@ query_is_available(struct anv_query_pool *pool, uint32_t query)
}
return true;
}
-#endif
return *(volatile uint64_t *)query_slot(pool, query);
}
@@ -415,17 +440,29 @@ static VkResult
wait_for_available(struct anv_device *device,
struct anv_query_pool *pool, uint32_t query)
{
- uint64_t abs_timeout = anv_get_absolute_timeout(2 * NSEC_PER_SEC);
+ /* By default we leave a 2s timeout before declaring the device lost. */
+ uint64_t rel_timeout = 2 * NSEC_PER_SEC;
+ if (pool->vk.query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
+ /* With performance queries, there is an additional 500us reconfiguration
+ * time in i915.
+ */
+ rel_timeout += 500 * 1000;
+ /* Additionally a command buffer can be replayed N times to gather data
+ * for each of the metric sets to capture all the counters requested.
+ */
+ rel_timeout *= pool->n_passes;
+ }
+ uint64_t abs_timeout_ns = os_time_get_absolute_timeout(rel_timeout);
- while (anv_gettime_ns() < abs_timeout) {
+ while (os_time_get_nano() < abs_timeout_ns) {
if (query_is_available(pool, query))
return VK_SUCCESS;
- VkResult status = anv_device_query_status(device);
+ VkResult status = vk_device_check_status(&device->vk);
if (status != VK_SUCCESS)
return status;
}
- return anv_device_set_lost(device, "query timeout");
+ return vk_device_set_lost(&device->vk, "query timeout");
}
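
A small numeric illustration of the timeout above, assuming a KHR performance query pool replayed over 3 passes (pass count hypothetical):

/* Sketch: n_passes = 3 is made up for illustration. */
uint64_t rel_timeout = 2 * NSEC_PER_SEC;   /* 2.0 s base                    */
rel_timeout += 500 * 1000;                 /* + 500 us i915 reconfiguration */
rel_timeout *= 3;                          /* x n_passes, ~6.0015 s total   */
uint64_t abs_timeout_ns = os_time_get_absolute_timeout(rel_timeout);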
VkResult genX(GetQueryPoolResults)(
@@ -441,14 +478,23 @@ VkResult genX(GetQueryPoolResults)(
ANV_FROM_HANDLE(anv_device, device, _device);
ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
- assert(pool->type == VK_QUERY_TYPE_OCCLUSION ||
- pool->type == VK_QUERY_TYPE_PIPELINE_STATISTICS ||
- pool->type == VK_QUERY_TYPE_TIMESTAMP ||
- pool->type == VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT ||
- pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR ||
- pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL);
-
- if (anv_device_is_lost(device))
+ assert(
+#if GFX_VERx10 >= 125
+ pool->vk.query_type == VK_QUERY_TYPE_ACCELERATION_STRUCTURE_COMPACTED_SIZE_KHR ||
+ pool->vk.query_type == VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_SIZE_KHR ||
+ pool->vk.query_type == VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SIZE_KHR ||
+ pool->vk.query_type == VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_BOTTOM_LEVEL_POINTERS_KHR ||
+#endif
+ pool->vk.query_type == VK_QUERY_TYPE_OCCLUSION ||
+ pool->vk.query_type == VK_QUERY_TYPE_PIPELINE_STATISTICS ||
+ pool->vk.query_type == VK_QUERY_TYPE_TIMESTAMP ||
+ pool->vk.query_type == VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT ||
+ pool->vk.query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR ||
+ pool->vk.query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL ||
+ pool->vk.query_type == VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT ||
+ pool->vk.query_type == VK_QUERY_TYPE_RESULT_STATUS_ONLY_KHR);
+
+ if (vk_device_is_lost(&device->vk))
return VK_ERROR_DEVICE_LOST;
if (pData == NULL)
@@ -487,8 +533,9 @@ VkResult genX(GetQueryPoolResults)(
bool write_results = available || (flags & VK_QUERY_RESULT_PARTIAL_BIT);
uint32_t idx = 0;
- switch (pool->type) {
- case VK_QUERY_TYPE_OCCLUSION: {
+ switch (pool->vk.query_type) {
+ case VK_QUERY_TYPE_OCCLUSION:
+ case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT: {
uint64_t *slot = query_slot(pool, firstQuery + i);
if (write_results) {
/* From the Vulkan 1.2.132 spec:
@@ -507,22 +554,16 @@ VkResult genX(GetQueryPoolResults)(
case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
uint64_t *slot = query_slot(pool, firstQuery + i);
- uint32_t statistics = pool->pipeline_statistics;
+ uint32_t statistics = pool->vk.pipeline_statistics;
while (statistics) {
- uint32_t stat = u_bit_scan(&statistics);
+ UNUSED uint32_t stat = u_bit_scan(&statistics);
if (write_results) {
uint64_t result = slot[idx * 2 + 2] - slot[idx * 2 + 1];
-
- /* WaDividePSInvocationCountBy4:HSW,BDW */
- if ((device->info.ver == 8 || device->info.is_haswell) &&
- (1 << stat) == VK_QUERY_PIPELINE_STATISTIC_FRAGMENT_SHADER_INVOCATIONS_BIT)
- result >>= 2;
-
cpu_write_query_result(pData, flags, idx, result);
}
idx++;
}
- assert(idx == util_bitcount(pool->pipeline_statistics));
+ assert(idx == util_bitcount(pool->vk.pipeline_statistics));
break;
}
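
The readback above implies the following per-slot layout for pipeline statistics; a short sketch assuming a pool created with two statistics enabled (which two is immaterial):

/* Sketch: pipeline-statistics slot layout as consumed above.
 *
 *   slot[0]          : availability
 *   slot[1], slot[2] : begin/end of the first enabled statistic  (idx 0)
 *   slot[3], slot[4] : begin/end of the second enabled statistic (idx 1)
 *
 * For bit index idx, the reported value is slot[idx*2+2] - slot[idx*2+1]. */
uint64_t *slot = query_slot(pool, firstQuery + i);
uint64_t first_stat = slot[2] - slot[1];   /* idx == 0 */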
@@ -537,6 +578,26 @@ VkResult genX(GetQueryPoolResults)(
break;
}
+#if GFX_VERx10 >= 125
+ case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SIZE_KHR:
+ case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_COMPACTED_SIZE_KHR:
+ case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_SIZE_KHR: {
+ uint64_t *slot = query_slot(pool, firstQuery + i);
+ if (write_results)
+ cpu_write_query_result(pData, flags, idx, slot[1]);
+ idx++;
+ break;
+ }
+
+ case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_BOTTOM_LEVEL_POINTERS_KHR: {
+ uint64_t *slot = query_slot(pool, firstQuery + i);
+ if (write_results)
+ cpu_write_query_result(pData, flags, idx, slot[2]);
+ idx++;
+ break;
+ }
+#endif
+
case VK_QUERY_TYPE_TIMESTAMP: {
uint64_t *slot = query_slot(pool, firstQuery + i);
if (write_results)
@@ -545,7 +606,6 @@ VkResult genX(GetQueryPoolResults)(
break;
}
-#if GFX_VER >= 8
case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: {
const struct anv_physical_device *pdevice = device->physical;
assert((flags & (VK_QUERY_RESULT_WITH_AVAILABILITY_BIT |
@@ -554,7 +614,7 @@ VkResult genX(GetQueryPoolResults)(
const struct intel_perf_query_info *query = pool->pass_query[p];
struct intel_perf_query_result result;
intel_perf_query_result_clear(&result);
- intel_perf_query_result_accumulate_fields(&result, query, &device->info,
+ intel_perf_query_result_accumulate_fields(&result, query,
pool->bo->map + khr_perf_query_data_offset(pool, firstQuery + i, p, false),
pool->bo->map + khr_perf_query_data_offset(pool, firstQuery + i, p, true),
false /* no_oa_accumulate */);
@@ -562,7 +622,6 @@ VkResult genX(GetQueryPoolResults)(
}
break;
}
-#endif
case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: {
if (!write_results)
@@ -571,18 +630,26 @@ VkResult genX(GetQueryPoolResults)(
const struct intel_perf_query_info *query = &device->physical->perf->queries[0];
struct intel_perf_query_result result;
intel_perf_query_result_clear(&result);
- intel_perf_query_result_accumulate_fields(&result, query, &device->info,
+ intel_perf_query_result_accumulate_fields(&result, query,
query_data + intel_perf_query_data_offset(pool, false),
query_data + intel_perf_query_data_offset(pool, true),
false /* no_oa_accumulate */);
intel_perf_query_result_write_mdapi(pData, stride,
- &device->info,
+ device->info,
query, &result);
const uint64_t *marker = query_data + intel_perf_marker_offset();
- intel_perf_query_mdapi_write_marker(pData, stride, &device->info, *marker);
+ intel_perf_query_mdapi_write_marker(pData, stride, device->info, *marker);
break;
}
+ case VK_QUERY_TYPE_RESULT_STATUS_ONLY_KHR:
+ if (!write_results)
+ break;
+ const uint32_t *query_data = query_slot(pool, firstQuery + i);
+ uint32_t result = available ? *query_data : 0;
+ cpu_write_query_result(pData, flags, idx, result);
+ break;
+
default:
unreachable("invalid pool type");
}
@@ -608,15 +675,11 @@ emit_ps_depth_count(struct anv_cmd_buffer *cmd_buffer,
cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_POST_SYNC_BIT;
genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
- anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
- pc.DestinationAddressType = DAT_PPGTT;
- pc.PostSyncOperation = WritePSDepthCount;
- pc.DepthStallEnable = true;
- pc.Address = addr;
-
- if (GFX_VER == 9 && cmd_buffer->device->info.gt == 4)
- pc.CommandStreamerStallEnable = true;
- }
+ bool cs_stall_needed = (GFX_VER == 9 && cmd_buffer->device->info->gt == 4);
+ genx_batch_emit_pipe_control_write
+ (&cmd_buffer->batch, cmd_buffer->device->info,
+ cmd_buffer->state.current_pipeline, WritePSDepthCount, addr, 0,
+ ANV_PIPE_DEPTH_STALL_BIT | (cs_stall_needed ? ANV_PIPE_CS_STALL_BIT : 0));
}
static void
@@ -635,12 +698,10 @@ emit_query_pc_availability(struct anv_cmd_buffer *cmd_buffer,
cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_POST_SYNC_BIT;
genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
- anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
- pc.DestinationAddressType = DAT_PPGTT;
- pc.PostSyncOperation = WriteImmediateData;
- pc.Address = addr;
- pc.ImmediateData = available;
- }
+ genx_batch_emit_pipe_control_write
+ (&cmd_buffer->batch, cmd_buffer->device->info,
+ cmd_buffer->state.current_pipeline, WriteImmediateData, addr,
+ available, 0);
}
/**
@@ -652,7 +713,7 @@ emit_zero_queries(struct anv_cmd_buffer *cmd_buffer,
struct mi_builder *b, struct anv_query_pool *pool,
uint32_t first_index, uint32_t num_queries)
{
- switch (pool->type) {
+ switch (pool->vk.query_type) {
case VK_QUERY_TYPE_OCCLUSION:
case VK_QUERY_TYPE_TIMESTAMP:
/* These queries are written with a PIPE_CONTROL so clear them using the
@@ -673,6 +734,7 @@ emit_zero_queries(struct anv_cmd_buffer *cmd_buffer,
}
break;
+ case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT:
case VK_QUERY_TYPE_PIPELINE_STATISTICS:
case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
for (uint32_t i = 0; i < num_queries; i++) {
@@ -683,7 +745,6 @@ emit_zero_queries(struct anv_cmd_buffer *cmd_buffer,
}
break;
-#if GFX_VER >= 8
case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: {
for (uint32_t i = 0; i < num_queries; i++) {
for (uint32_t p = 0; p < pool->n_passes; p++) {
@@ -696,7 +757,6 @@ emit_zero_queries(struct anv_cmd_buffer *cmd_buffer,
}
break;
}
-#endif
case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL:
for (uint32_t i = 0; i < num_queries; i++) {
@@ -720,10 +780,44 @@ void genX(CmdResetQueryPool)(
{
ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
+ struct anv_physical_device *pdevice = cmd_buffer->device->physical;
+
+ /* Shader clearing is only possible on render/compute */
+ if (anv_cmd_buffer_is_render_or_compute_queue(cmd_buffer) &&
+ queryCount >= pdevice->instance->query_clear_with_blorp_threshold) {
+ trace_intel_begin_query_clear_blorp(&cmd_buffer->trace);
+
+ anv_cmd_buffer_fill_area(cmd_buffer,
+ anv_query_address(pool, firstQuery),
+ queryCount * pool->stride,
+ 0);
+
+ /* The pending clearing writes are in compute if we're in gpgpu mode on
+ * the render engine or on the compute engine.
+ */
+ if (anv_cmd_buffer_is_compute_queue(cmd_buffer) ||
+ cmd_buffer->state.current_pipeline == pdevice->gpgpu_pipeline_value) {
+ cmd_buffer->state.queries.clear_bits =
+ ANV_QUERY_COMPUTE_WRITES_PENDING_BITS;
+ } else {
+ cmd_buffer->state.queries.clear_bits =
+ ANV_QUERY_RENDER_TARGET_WRITES_PENDING_BITS(&pdevice->info);
+ }
+
+ trace_intel_end_query_clear_blorp(&cmd_buffer->trace, queryCount);
+ return;
+ }
- switch (pool->type) {
+ trace_intel_begin_query_clear_cs(&cmd_buffer->trace);
+
+ switch (pool->vk.query_type) {
case VK_QUERY_TYPE_OCCLUSION:
- case VK_QUERY_TYPE_TIMESTAMP:
+#if GFX_VERx10 >= 125
+ case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_COMPACTED_SIZE_KHR:
+ case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_SIZE_KHR:
+ case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SIZE_KHR:
+ case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_BOTTOM_LEVEL_POINTERS_KHR:
+#endif
for (uint32_t i = 0; i < queryCount; i++) {
emit_query_pc_availability(cmd_buffer,
anv_query_address(pool, firstQuery + i),
@@ -731,20 +825,37 @@ void genX(CmdResetQueryPool)(
}
break;
+ case VK_QUERY_TYPE_TIMESTAMP: {
+ for (uint32_t i = 0; i < queryCount; i++) {
+ emit_query_pc_availability(cmd_buffer,
+ anv_query_address(pool, firstQuery + i),
+ false);
+ }
+
+ /* Add a CS stall here to make sure the PIPE_CONTROL above has
+ * completed. Otherwise some timestamps written later with MI_STORE_*
+ * commands might race with the PIPE_CONTROL in the loop above.
+ */
+ anv_add_pending_pipe_bits(cmd_buffer, ANV_PIPE_CS_STALL_BIT,
+ "vkCmdResetQueryPool of timestamps");
+ genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
+ break;
+ }
+
case VK_QUERY_TYPE_PIPELINE_STATISTICS:
- case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT: {
+ case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
+ case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT: {
struct mi_builder b;
- mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);
+ mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
for (uint32_t i = 0; i < queryCount; i++)
emit_query_mi_availability(&b, anv_query_address(pool, firstQuery + i), false);
break;
}
-#if GFX_VER >= 8
case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: {
struct mi_builder b;
- mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);
+ mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
for (uint32_t i = 0; i < queryCount; i++) {
for (uint32_t p = 0; p < pool->n_passes; p++) {
@@ -756,20 +867,24 @@ void genX(CmdResetQueryPool)(
}
break;
}
-#endif
case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: {
struct mi_builder b;
- mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);
+ mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
for (uint32_t i = 0; i < queryCount; i++)
emit_query_mi_availability(&b, anv_query_address(pool, firstQuery + i), false);
break;
}
-
+ case VK_QUERY_TYPE_RESULT_STATUS_ONLY_KHR:
+ for (uint32_t i = 0; i < queryCount; i++)
+ emit_query_mi_flush_availability(cmd_buffer, anv_query_address(pool, firstQuery + i), false);
+ break;
default:
unreachable("Unsupported query type");
}
+
+ trace_intel_end_query_clear_cs(&cmd_buffer->trace, queryCount);
}
void genX(ResetQueryPool)(
@@ -781,14 +896,12 @@ void genX(ResetQueryPool)(
ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
for (uint32_t i = 0; i < queryCount; i++) {
- if (pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
-#if GFX_VER >= 8
+ if (pool->vk.query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
for (uint32_t p = 0; p < pool->n_passes; p++) {
uint64_t *pass_slot = pool->bo->map +
khr_perf_query_availability_offset(pool, firstQuery + i, p);
*pass_slot = 0;
}
-#endif
} else {
uint64_t *slot = query_slot(pool, firstQuery + i);
*slot = 0;
@@ -858,6 +971,7 @@ emit_perf_intel_query(struct anv_cmd_buffer *cmd_buffer,
case INTEL_PERF_QUERY_FIELD_TYPE_SRM_PERFCNT:
case INTEL_PERF_QUERY_FIELD_TYPE_SRM_RPSTAT:
+ case INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_A:
case INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_B:
case INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_C: {
struct anv_address addr = anv_address_add(data_addr, field->location);
@@ -877,15 +991,22 @@ emit_perf_intel_query(struct anv_cmd_buffer *cmd_buffer,
}
}
-void genX(CmdBeginQuery)(
- VkCommandBuffer commandBuffer,
- VkQueryPool queryPool,
- uint32_t query,
- VkQueryControlFlags flags)
+static void
+emit_query_clear_flush(struct anv_cmd_buffer *cmd_buffer,
+ struct anv_query_pool *pool,
+ const char *reason)
{
- genX(CmdBeginQueryIndexedEXT)(commandBuffer, queryPool, query, flags, 0);
+ if (cmd_buffer->state.queries.clear_bits == 0)
+ return;
+
+ anv_add_pending_pipe_bits(cmd_buffer,
+ ANV_PIPE_QUERY_BITS(
+ cmd_buffer->state.queries.clear_bits),
+ reason);
+ genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
}
+
void genX(CmdBeginQueryIndexedEXT)(
VkCommandBuffer commandBuffer,
VkQueryPool queryPool,
@@ -897,22 +1018,39 @@ void genX(CmdBeginQueryIndexedEXT)(
ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
struct anv_address query_addr = anv_query_address(pool, query);
+ emit_query_clear_flush(cmd_buffer, pool, "CmdBeginQuery* flush query clears");
+
struct mi_builder b;
- mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);
+ mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
+ const uint32_t mocs = anv_mocs_for_address(cmd_buffer->device, &query_addr);
+ mi_builder_set_mocs(&b, mocs);
- switch (pool->type) {
+ switch (pool->vk.query_type) {
case VK_QUERY_TYPE_OCCLUSION:
+ cmd_buffer->state.gfx.n_occlusion_queries++;
+ cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_OCCLUSION_QUERY_ACTIVE;
emit_ps_depth_count(cmd_buffer, anv_address_add(query_addr, 8));
break;
+ case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT:
+ genx_batch_emit_pipe_control(&cmd_buffer->batch,
+ cmd_buffer->device->info,
+ cmd_buffer->state.current_pipeline,
+ ANV_PIPE_CS_STALL_BIT |
+ ANV_PIPE_STALL_AT_SCOREBOARD_BIT);
+ mi_store(&b, mi_mem64(anv_address_add(query_addr, 8)),
+ mi_reg64(GENX(CL_INVOCATION_COUNT_num)));
+ break;
+
case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
/* TODO: This might only be necessary for certain stats */
- anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
- pc.CommandStreamerStallEnable = true;
- pc.StallAtPixelScoreboard = true;
- }
+ genx_batch_emit_pipe_control(&cmd_buffer->batch,
+ cmd_buffer->device->info,
+ cmd_buffer->state.current_pipeline,
+ ANV_PIPE_CS_STALL_BIT |
+ ANV_PIPE_STALL_AT_SCOREBOARD_BIT);
- uint32_t statistics = pool->pipeline_statistics;
+ uint32_t statistics = pool->vk.pipeline_statistics;
uint32_t offset = 8;
while (statistics) {
uint32_t stat = u_bit_scan(&statistics);
@@ -923,14 +1061,14 @@ void genX(CmdBeginQueryIndexedEXT)(
}
case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
- anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
- pc.CommandStreamerStallEnable = true;
- pc.StallAtPixelScoreboard = true;
- }
+ genx_batch_emit_pipe_control(&cmd_buffer->batch,
+ cmd_buffer->device->info,
+ cmd_buffer->state.current_pipeline,
+ ANV_PIPE_CS_STALL_BIT |
+ ANV_PIPE_STALL_AT_SCOREBOARD_BIT);
emit_xfb_query(&b, index, anv_address_add(query_addr, 8));
break;
-#if GFX_VER >= 8
case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: {
if (!khr_perf_query_ensure_relocs(cmd_buffer))
return;
@@ -979,12 +1117,15 @@ void genX(CmdBeginQueryIndexedEXT)(
assert(reloc_idx == pdevice->n_perf_query_commands);
- mi_self_mod_barrier(&b);
+ const struct intel_device_info *devinfo = cmd_buffer->device->info;
+ const enum intel_engine_class engine_class = cmd_buffer->queue_family->engine_class;
+ mi_self_mod_barrier(&b, devinfo->engine_class_prefetch[engine_class]);
- anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
- pc.CommandStreamerStallEnable = true;
- pc.StallAtPixelScoreboard = true;
- }
+ genx_batch_emit_pipe_control(&cmd_buffer->batch,
+ cmd_buffer->device->info,
+ cmd_buffer->state.current_pipeline,
+ ANV_PIPE_CS_STALL_BIT |
+ ANV_PIPE_STALL_AT_SCOREBOARD_BIT);
cmd_buffer->perf_query_pool = pool;
cmd_buffer->perf_reloc_idx = 0;
@@ -1007,6 +1148,7 @@ void genX(CmdBeginQueryIndexedEXT)(
case INTEL_PERF_QUERY_FIELD_TYPE_SRM_PERFCNT:
case INTEL_PERF_QUERY_FIELD_TYPE_SRM_RPSTAT:
+ case INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_A:
case INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_B:
case INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_C:
dws =
@@ -1040,30 +1182,24 @@ void genX(CmdBeginQueryIndexedEXT)(
}
break;
}
-#endif
case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: {
- anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
- pc.CommandStreamerStallEnable = true;
- pc.StallAtPixelScoreboard = true;
- }
+ genx_batch_emit_pipe_control(&cmd_buffer->batch,
+ cmd_buffer->device->info,
+ cmd_buffer->state.current_pipeline,
+ ANV_PIPE_CS_STALL_BIT |
+ ANV_PIPE_STALL_AT_SCOREBOARD_BIT);
emit_perf_intel_query(cmd_buffer, pool, &b, query_addr, false);
break;
}
-
+ case VK_QUERY_TYPE_RESULT_STATUS_ONLY_KHR:
+ emit_query_mi_flush_availability(cmd_buffer, query_addr, false);
+ break;
default:
unreachable("");
}
}
-void genX(CmdEndQuery)(
- VkCommandBuffer commandBuffer,
- VkQueryPool queryPool,
- uint32_t query)
-{
- genX(CmdEndQueryIndexedEXT)(commandBuffer, queryPool, query, 0);
-}
-
void genX(CmdEndQueryIndexedEXT)(
VkCommandBuffer commandBuffer,
VkQueryPool queryPool,
@@ -1075,22 +1211,40 @@ void genX(CmdEndQueryIndexedEXT)(
struct anv_address query_addr = anv_query_address(pool, query);
struct mi_builder b;
- mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);
+ mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
- switch (pool->type) {
+ switch (pool->vk.query_type) {
case VK_QUERY_TYPE_OCCLUSION:
emit_ps_depth_count(cmd_buffer, anv_address_add(query_addr, 16));
emit_query_pc_availability(cmd_buffer, query_addr, true);
+ cmd_buffer->state.gfx.n_occlusion_queries--;
+ cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_OCCLUSION_QUERY_ACTIVE;
+ break;
+
+ case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT:
+ /* Ensure previous commands have completed before capturing the register
+ * value.
+ */
+ genx_batch_emit_pipe_control(&cmd_buffer->batch,
+ cmd_buffer->device->info,
+ cmd_buffer->state.current_pipeline,
+ ANV_PIPE_CS_STALL_BIT |
+ ANV_PIPE_STALL_AT_SCOREBOARD_BIT);
+
+ mi_store(&b, mi_mem64(anv_address_add(query_addr, 16)),
+ mi_reg64(GENX(CL_INVOCATION_COUNT_num)));
+ emit_query_mi_availability(&b, query_addr, true);
break;
case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
/* TODO: This might only be necessary for certain stats */
- anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
- pc.CommandStreamerStallEnable = true;
- pc.StallAtPixelScoreboard = true;
- }
+ genx_batch_emit_pipe_control(&cmd_buffer->batch,
+ cmd_buffer->device->info,
+ cmd_buffer->state.current_pipeline,
+ ANV_PIPE_CS_STALL_BIT |
+ ANV_PIPE_STALL_AT_SCOREBOARD_BIT);
- uint32_t statistics = pool->pipeline_statistics;
+ uint32_t statistics = pool->vk.pipeline_statistics;
uint32_t offset = 16;
while (statistics) {
uint32_t stat = u_bit_scan(&statistics);
@@ -1103,21 +1257,21 @@ void genX(CmdEndQueryIndexedEXT)(
}
case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
- anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
- pc.CommandStreamerStallEnable = true;
- pc.StallAtPixelScoreboard = true;
- }
-
+ genx_batch_emit_pipe_control(&cmd_buffer->batch,
+ cmd_buffer->device->info,
+ cmd_buffer->state.current_pipeline,
+ ANV_PIPE_CS_STALL_BIT |
+ ANV_PIPE_STALL_AT_SCOREBOARD_BIT);
emit_xfb_query(&b, index, anv_address_add(query_addr, 16));
emit_query_mi_availability(&b, query_addr, true);
break;
-#if GFX_VER >= 8
case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: {
- anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
- pc.CommandStreamerStallEnable = true;
- pc.StallAtPixelScoreboard = true;
- }
+ genx_batch_emit_pipe_control(&cmd_buffer->batch,
+ cmd_buffer->device->info,
+ cmd_buffer->state.current_pipeline,
+ ANV_PIPE_CS_STALL_BIT |
+ ANV_PIPE_STALL_AT_SCOREBOARD_BIT);
cmd_buffer->perf_query_pool = pool;
if (!khr_perf_query_ensure_relocs(cmd_buffer))
@@ -1144,6 +1298,7 @@ void genX(CmdEndQueryIndexedEXT)(
case INTEL_PERF_QUERY_FIELD_TYPE_SRM_PERFCNT:
case INTEL_PERF_QUERY_FIELD_TYPE_SRM_RPSTAT:
+ case INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_A:
case INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_B:
case INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_C:
dws =
@@ -1189,13 +1344,13 @@ void genX(CmdEndQueryIndexedEXT)(
assert(cmd_buffer->perf_reloc_idx == pdevice->n_perf_query_commands);
break;
}
-#endif
case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: {
- anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
- pc.CommandStreamerStallEnable = true;
- pc.StallAtPixelScoreboard = true;
- }
+ genx_batch_emit_pipe_control(&cmd_buffer->batch,
+ cmd_buffer->device->info,
+ cmd_buffer->state.current_pipeline,
+ ANV_PIPE_CS_STALL_BIT |
+ ANV_PIPE_STALL_AT_SCOREBOARD_BIT);
uint32_t marker_offset = intel_perf_marker_offset();
mi_store(&b, mi_mem64(anv_address_add(query_addr, marker_offset)),
mi_imm(cmd_buffer->intel_perf_marker));
@@ -1203,6 +1358,9 @@ void genX(CmdEndQueryIndexedEXT)(
emit_query_mi_availability(&b, query_addr, true);
break;
}
+ case VK_QUERY_TYPE_RESULT_STATUS_ONLY_KHR:
+ emit_query_mi_flush_availability(cmd_buffer, query_addr, true);
+ break;
default:
unreachable("");
@@ -1216,9 +1374,9 @@ void genX(CmdEndQueryIndexedEXT)(
* first index, mark the other query indices as being already available
* with result 0.
*/
- if (cmd_buffer->state.subpass && cmd_buffer->state.subpass->view_mask) {
+ if (cmd_buffer->state.gfx.view_mask) {
const uint32_t num_queries =
- util_bitcount(cmd_buffer->state.subpass->view_mask);
+ util_bitcount(cmd_buffer->state.gfx.view_mask);
if (num_queries > 1)
emit_zero_queries(cmd_buffer, &b, pool, query + 1, num_queries - 1);
}
@@ -1226,9 +1384,9 @@ void genX(CmdEndQueryIndexedEXT)(
#define TIMESTAMP 0x2358
-void genX(CmdWriteTimestamp)(
+void genX(CmdWriteTimestamp2)(
VkCommandBuffer commandBuffer,
- VkPipelineStageFlagBits pipelineStage,
+ VkPipelineStageFlags2 stage,
VkQueryPool queryPool,
uint32_t query)
{
@@ -1236,34 +1394,49 @@ void genX(CmdWriteTimestamp)(
ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
struct anv_address query_addr = anv_query_address(pool, query);
- assert(pool->type == VK_QUERY_TYPE_TIMESTAMP);
+ assert(pool->vk.query_type == VK_QUERY_TYPE_TIMESTAMP);
+
+ emit_query_clear_flush(cmd_buffer, pool,
+ "CmdWriteTimestamp flush query clears");
struct mi_builder b;
- mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);
+ mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
- switch (pipelineStage) {
- case VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT:
+ if (stage == VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT) {
mi_store(&b, mi_mem64(anv_address_add(query_addr, 8)),
mi_reg64(TIMESTAMP));
- break;
-
- default:
+ emit_query_mi_availability(&b, query_addr, true);
+ } else {
/* Everything else is bottom-of-pipe */
cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_POST_SYNC_BIT;
genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
- anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
- pc.DestinationAddressType = DAT_PPGTT;
- pc.PostSyncOperation = WriteTimestamp;
- pc.Address = anv_address_add(query_addr, 8);
+ bool cs_stall_needed =
+ (GFX_VER == 9 && cmd_buffer->device->info->gt == 4);
- if (GFX_VER == 9 && cmd_buffer->device->info.gt == 4)
- pc.CommandStreamerStallEnable = true;
+ if (anv_cmd_buffer_is_blitter_queue(cmd_buffer) ||
+ anv_cmd_buffer_is_video_queue(cmd_buffer)) {
+ /* Wa_16018063123 - emit fast color dummy blit before MI_FLUSH_DW. */
+ if (intel_needs_workaround(cmd_buffer->device->info, 16018063123)) {
+ genX(batch_emit_fast_color_dummy_blit)(&cmd_buffer->batch,
+ cmd_buffer->device);
+ }
+ anv_batch_emit(&cmd_buffer->batch, GENX(MI_FLUSH_DW), dw) {
+ dw.Address = anv_address_add(query_addr, 8);
+ dw.PostSyncOperation = WriteTimestamp;
+ }
+ emit_query_mi_flush_availability(cmd_buffer, query_addr, true);
+ } else {
+ genx_batch_emit_pipe_control_write
+ (&cmd_buffer->batch, cmd_buffer->device->info,
+ cmd_buffer->state.current_pipeline, WriteTimestamp,
+ anv_address_add(query_addr, 8), 0,
+ cs_stall_needed ? ANV_PIPE_CS_STALL_BIT : 0);
+ emit_query_pc_availability(cmd_buffer, query_addr, true);
}
- break;
+
}
- emit_query_pc_availability(cmd_buffer, query_addr, true);
/* When multiview is active the spec requires that N consecutive query
* indices are used, where N is the number of active views in the subpass.
@@ -1273,16 +1446,14 @@ void genX(CmdWriteTimestamp)(
* first index, mark the other query indices as being already available
* with result 0.
*/
- if (cmd_buffer->state.subpass && cmd_buffer->state.subpass->view_mask) {
+ if (cmd_buffer->state.gfx.view_mask) {
const uint32_t num_queries =
- util_bitcount(cmd_buffer->state.subpass->view_mask);
+ util_bitcount(cmd_buffer->state.gfx.view_mask);
if (num_queries > 1)
emit_zero_queries(cmd_buffer, &b, pool, query + 1, num_queries - 1);
}
}
-#if GFX_VERx10 >= 75
-
#define MI_PREDICATE_SRC0 0x2400
#define MI_PREDICATE_SRC1 0x2408
#define MI_PREDICATE_RESULT 0x2418
@@ -1341,61 +1512,92 @@ compute_query_result(struct mi_builder *b, struct anv_address addr)
mi_mem64(anv_address_add(addr, 0)));
}
-void genX(CmdCopyQueryPoolResults)(
- VkCommandBuffer commandBuffer,
- VkQueryPool queryPool,
- uint32_t firstQuery,
- uint32_t queryCount,
- VkBuffer destBuffer,
- VkDeviceSize destOffset,
- VkDeviceSize destStride,
- VkQueryResultFlags flags)
+static void
+copy_query_results_with_cs(struct anv_cmd_buffer *cmd_buffer,
+ struct anv_query_pool *pool,
+ struct anv_address dest_addr,
+ uint64_t dest_stride,
+ uint32_t first_query,
+ uint32_t query_count,
+ VkQueryResultFlags flags)
{
- ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
- ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
- ANV_FROM_HANDLE(anv_buffer, buffer, destBuffer);
+ enum anv_pipe_bits needed_flushes = 0;
- struct mi_builder b;
- mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);
- struct mi_value result;
+ trace_intel_begin_query_copy_cs(&cmd_buffer->trace);
/* If render target writes are ongoing, request a render target cache flush
* to ensure proper ordering of the commands from the 3d pipe and the
* command streamer.
*/
- if (cmd_buffer->state.pending_pipe_bits & ANV_PIPE_RENDER_TARGET_BUFFER_WRITES) {
- anv_add_pending_pipe_bits(cmd_buffer,
- ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT,
- "CopyQueryPoolResults");
+ if ((cmd_buffer->state.queries.buffer_write_bits |
+ cmd_buffer->state.queries.clear_bits) &
+ ANV_QUERY_WRITES_RT_FLUSH)
+ needed_flushes |= ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT;
+
+ if ((cmd_buffer->state.queries.buffer_write_bits |
+ cmd_buffer->state.queries.clear_bits) &
+ ANV_QUERY_WRITES_TILE_FLUSH)
+ needed_flushes |= ANV_PIPE_TILE_CACHE_FLUSH_BIT;
+
+ if ((cmd_buffer->state.queries.buffer_write_bits |
+ cmd_buffer->state.queries.clear_bits) &
+ ANV_QUERY_WRITES_DATA_FLUSH) {
+ needed_flushes |= (ANV_PIPE_DATA_CACHE_FLUSH_BIT |
+ ANV_PIPE_HDC_PIPELINE_FLUSH_BIT |
+ ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT);
}
- if ((flags & VK_QUERY_RESULT_WAIT_BIT) ||
- (cmd_buffer->state.pending_pipe_bits & ANV_PIPE_FLUSH_BITS) ||
- /* Occlusion & timestamp queries are written using a PIPE_CONTROL and
- * because we're about to copy values from MI commands, we need to
- * stall the command streamer to make sure the PIPE_CONTROL values have
- * landed, otherwise we could see inconsistent values & availability.
- *
- * From the vulkan spec:
- *
- * "vkCmdCopyQueryPoolResults is guaranteed to see the effect of
- * previous uses of vkCmdResetQueryPool in the same queue, without
- * any additional synchronization."
- */
- pool->type == VK_QUERY_TYPE_OCCLUSION ||
- pool->type == VK_QUERY_TYPE_TIMESTAMP) {
+ if ((cmd_buffer->state.queries.buffer_write_bits |
+ cmd_buffer->state.queries.clear_bits) &
+ ANV_QUERY_WRITES_CS_STALL)
+ needed_flushes |= ANV_PIPE_CS_STALL_BIT;
+
+ /* Occlusion & timestamp queries are written using a PIPE_CONTROL and
+ * because we're about to copy values from MI commands, we need to stall
+ * the command streamer to make sure the PIPE_CONTROL values have
+ * landed, otherwise we could see inconsistent values & availability.
+ *
+ * From the vulkan spec:
+ *
+ * "vkCmdCopyQueryPoolResults is guaranteed to see the effect of
+ * previous uses of vkCmdResetQueryPool in the same queue, without any
+ * additional synchronization."
+ */
+ if (pool->vk.query_type == VK_QUERY_TYPE_OCCLUSION ||
+ pool->vk.query_type == VK_QUERY_TYPE_TIMESTAMP)
+ needed_flushes |= ANV_PIPE_CS_STALL_BIT;
+
+ if (needed_flushes) {
anv_add_pending_pipe_bits(cmd_buffer,
- ANV_PIPE_CS_STALL_BIT,
+ needed_flushes,
"CopyQueryPoolResults");
genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
}
- struct anv_address dest_addr = anv_address_add(buffer->address, destOffset);
- for (uint32_t i = 0; i < queryCount; i++) {
- struct anv_address query_addr = anv_query_address(pool, firstQuery + i);
+ struct mi_builder b;
+ mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
+ struct mi_value result;
+
+ for (uint32_t i = 0; i < query_count; i++) {
+ struct anv_address query_addr = anv_query_address(pool, first_query + i);
+ const uint32_t mocs = anv_mocs_for_address(cmd_buffer->device, &query_addr);
+
+ mi_builder_set_mocs(&b, mocs);
+
+ /* Wait for the availability write to land before we go read the data */
+ if (flags & VK_QUERY_RESULT_WAIT_BIT) {
+ anv_batch_emit(&cmd_buffer->batch, GENX(MI_SEMAPHORE_WAIT), sem) {
+ sem.WaitMode = PollingMode;
+ sem.CompareOperation = COMPARE_SAD_EQUAL_SDD;
+ sem.SemaphoreDataDword = true;
+ sem.SemaphoreAddress = query_addr;
+ }
+ }
+
uint32_t idx = 0;
- switch (pool->type) {
+ switch (pool->vk.query_type) {
case VK_QUERY_TYPE_OCCLUSION:
+ case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT:
result = compute_query_result(&b, anv_address_add(query_addr, 8));
/* Like in the case of vkGetQueryPoolResults, if the query is
* unavailable and the VK_QUERY_RESULT_PARTIAL_BIT flag is set,
@@ -1403,32 +1605,23 @@ void genX(CmdCopyQueryPoolResults)(
* VK_QUERY_RESULT_PARTIAL_BIT isn't set, don't write any value.
*/
gpu_write_query_result_cond(cmd_buffer, &b, query_addr, dest_addr,
- 1 /* available */, flags, idx, result);
+ 1 /* available */, flags, idx, result);
if (flags & VK_QUERY_RESULT_PARTIAL_BIT) {
gpu_write_query_result_cond(cmd_buffer, &b, query_addr, dest_addr,
- 0 /* unavailable */, flags, idx, mi_imm(0));
+ 0 /* unavailable */, flags, idx, mi_imm(0));
}
idx++;
break;
case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
- uint32_t statistics = pool->pipeline_statistics;
+ uint32_t statistics = pool->vk.pipeline_statistics;
while (statistics) {
- uint32_t stat = u_bit_scan(&statistics);
-
+ UNUSED uint32_t stat = u_bit_scan(&statistics);
result = compute_query_result(&b, anv_address_add(query_addr,
idx * 16 + 8));
-
- /* WaDividePSInvocationCountBy4:HSW,BDW */
- if ((cmd_buffer->device->info.ver == 8 ||
- cmd_buffer->device->info.is_haswell) &&
- (1 << stat) == VK_QUERY_PIPELINE_STATISTIC_FRAGMENT_SHADER_INVOCATIONS_BIT) {
- result = mi_ushr32_imm(&b, result, 2);
- }
-
gpu_write_query_result(&b, dest_addr, flags, idx++, result);
}
- assert(idx == util_bitcount(pool->pipeline_statistics));
+ assert(idx == util_bitcount(pool->vk.pipeline_statistics));
break;
}
@@ -1444,11 +1637,23 @@ void genX(CmdCopyQueryPoolResults)(
gpu_write_query_result(&b, dest_addr, flags, idx++, result);
break;
-#if GFX_VER >= 8
+#if GFX_VERx10 >= 125
+ case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_COMPACTED_SIZE_KHR:
+ case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_SIZE_KHR:
+ case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SIZE_KHR:
+ result = mi_mem64(anv_address_add(query_addr, 8));
+ gpu_write_query_result(&b, dest_addr, flags, idx++, result);
+ break;
+
+ case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_BOTTOM_LEVEL_POINTERS_KHR:
+ result = mi_mem64(anv_address_add(query_addr, 16));
+ gpu_write_query_result(&b, dest_addr, flags, idx++, result);
+ break;
+#endif
+
case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR:
unreachable("Copy KHR performance query results not implemented");
break;
-#endif
default:
unreachable("unhandled query type");
@@ -1459,11 +1664,182 @@ void genX(CmdCopyQueryPoolResults)(
mi_mem64(query_addr));
}
- dest_addr = anv_address_add(dest_addr, destStride);
+ dest_addr = anv_address_add(dest_addr, dest_stride);
}
+
+ trace_intel_end_query_copy_cs(&cmd_buffer->trace, query_count);
+}
+
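
The MI_SEMAPHORE_WAIT emitted above for VK_QUERY_RESULT_WAIT_BIT is roughly the GPU-side equivalent of polling the availability dword at the start of each slot; a CPU-side sketch for illustration only:

/* Sketch: what the PollingMode + COMPARE_SAD_EQUAL_SDD wait amounts to,
 * with semaphore data == 1 (the value written by the availability emits). */
volatile uint32_t *availability =
   pool->bo->map + (uint64_t)(first_query + i) * pool->stride;
while (*availability != 1)
   ;   /* spin until the query's availability write has landed */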
+static void
+copy_query_results_with_shader(struct anv_cmd_buffer *cmd_buffer,
+ struct anv_query_pool *pool,
+ struct anv_address dest_addr,
+ uint64_t dest_stride,
+ uint32_t first_query,
+ uint32_t query_count,
+ VkQueryResultFlags flags)
+{
+ struct anv_device *device = cmd_buffer->device;
+ enum anv_pipe_bits needed_flushes = 0;
+
+ trace_intel_begin_query_copy_shader(&cmd_buffer->trace);
+
+ /* If this is the first command in the batch buffer, make sure we have
+ * consistent pipeline mode.
+ */
+ if (cmd_buffer->state.current_pipeline == UINT32_MAX)
+ genX(flush_pipeline_select_3d)(cmd_buffer);
+
+ if ((cmd_buffer->state.queries.buffer_write_bits |
+ cmd_buffer->state.queries.clear_bits) & ANV_QUERY_WRITES_RT_FLUSH)
+ needed_flushes |= ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT;
+
+ if ((cmd_buffer->state.queries.buffer_write_bits |
+ cmd_buffer->state.queries.clear_bits) & ANV_QUERY_WRITES_DATA_FLUSH) {
+ needed_flushes |= (ANV_PIPE_HDC_PIPELINE_FLUSH_BIT |
+ ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT);
+ }
+
+ /* Flushes for the queries to complete */
+ if (flags & VK_QUERY_RESULT_WAIT_BIT) {
+ /* Some queries are done with shaders, so we need to have them flush
+ * high-level cache writes. The L3 should be shared across the GPU.
+ */
+ if (pool->vk.query_type == VK_QUERY_TYPE_ACCELERATION_STRUCTURE_COMPACTED_SIZE_KHR ||
+ pool->vk.query_type == VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SIZE_KHR ||
+ pool->vk.query_type == VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_SIZE_KHR ||
+ pool->vk.query_type == VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_BOTTOM_LEVEL_POINTERS_KHR) {
+ needed_flushes |= ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT;
+ }
+ /* And we need to stall for previous CS writes to land or the flushes to
+ * complete.
+ */
+ needed_flushes |= ANV_PIPE_CS_STALL_BIT;
+ }
+
+ /* Occlusion & timestamp queries are written using a PIPE_CONTROL and
+ * because we're about to copy values from MI commands, we need to stall
+ * the command streamer to make sure the PIPE_CONTROL values have
+ * landed, otherwise we could see inconsistent values & availability.
+ *
+ * From the vulkan spec:
+ *
+ * "vkCmdCopyQueryPoolResults is guaranteed to see the effect of
+ * previous uses of vkCmdResetQueryPool in the same queue, without any
+ * additional synchronization."
+ */
+ if (pool->vk.query_type == VK_QUERY_TYPE_OCCLUSION ||
+ pool->vk.query_type == VK_QUERY_TYPE_TIMESTAMP)
+ needed_flushes |= ANV_PIPE_CS_STALL_BIT;
+
+ if (needed_flushes) {
+ anv_add_pending_pipe_bits(cmd_buffer,
+ needed_flushes | ANV_PIPE_END_OF_PIPE_SYNC_BIT,
+ "CopyQueryPoolResults");
+ genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
+ }
+
+ struct anv_shader_bin *copy_kernel;
+ VkResult ret =
+ anv_device_get_internal_shader(
+ cmd_buffer->device,
+ cmd_buffer->state.current_pipeline == GPGPU ?
+ ANV_INTERNAL_KERNEL_COPY_QUERY_RESULTS_COMPUTE :
+ ANV_INTERNAL_KERNEL_COPY_QUERY_RESULTS_FRAGMENT,
+ &copy_kernel);
+ if (ret != VK_SUCCESS) {
+ anv_batch_set_error(&cmd_buffer->batch, ret);
+ return;
+ }
+
+ struct anv_simple_shader state = {
+ .device = cmd_buffer->device,
+ .cmd_buffer = cmd_buffer,
+ .dynamic_state_stream = &cmd_buffer->dynamic_state_stream,
+ .general_state_stream = &cmd_buffer->general_state_stream,
+ .batch = &cmd_buffer->batch,
+ .kernel = copy_kernel,
+ .l3_config = device->internal_kernels_l3_config,
+ .urb_cfg = &cmd_buffer->state.gfx.urb_cfg,
+ };
+ genX(emit_simple_shader_init)(&state);
+
+ struct anv_state push_data_state =
+ genX(simple_shader_alloc_push)(&state,
+ sizeof(struct anv_query_copy_params));
+ if (push_data_state.map == NULL)
+ return;
+
+ struct anv_query_copy_params *params = push_data_state.map;
+
+ uint32_t copy_flags =
+ ((flags & VK_QUERY_RESULT_64_BIT) ? ANV_COPY_QUERY_FLAG_RESULT64 : 0) |
+ ((flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) ? ANV_COPY_QUERY_FLAG_AVAILABLE : 0);
+
+ uint32_t num_items = 1;
+ uint32_t data_offset = 8 /* behind availability */;
+ switch (pool->vk.query_type) {
+ case VK_QUERY_TYPE_OCCLUSION:
+ case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT:
+ copy_flags |= ANV_COPY_QUERY_FLAG_DELTA;
+ /* These 2 queries are the only ones where we would have partial data
+ * because they are captured with a PIPE_CONTROL post sync operation. The
+ * other ones are captured with MI_STORE_REGISTER_DATA so we're always
+ * available by the time we reach the copy command.
+ */
+ copy_flags |= (flags & VK_QUERY_RESULT_PARTIAL_BIT) ? ANV_COPY_QUERY_FLAG_PARTIAL : 0;
+ break;
+
+ case VK_QUERY_TYPE_PIPELINE_STATISTICS:
+ num_items = util_bitcount(pool->vk.pipeline_statistics);
+ copy_flags |= ANV_COPY_QUERY_FLAG_DELTA;
+ break;
+
+ case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
+ num_items = 2;
+ copy_flags |= ANV_COPY_QUERY_FLAG_DELTA;
+ break;
+
+ case VK_QUERY_TYPE_TIMESTAMP:
+ case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_COMPACTED_SIZE_KHR:
+ case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_SIZE_KHR:
+ case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SIZE_KHR:
+ break;
+
+ case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_BOTTOM_LEVEL_POINTERS_KHR:
+ data_offset += 8;
+ break;
+
+ default:
+ unreachable("unhandled query type");
+ }
+
+ *params = (struct anv_query_copy_params) {
+ .flags = copy_flags,
+ .num_queries = query_count,
+ .num_items = num_items,
+ .query_base = first_query,
+ .query_stride = pool->stride,
+ .query_data_offset = data_offset,
+ .destination_stride = dest_stride,
+ .query_data_addr = anv_address_physical(
+ (struct anv_address) {
+ .bo = pool->bo,
+ }),
+ .destination_addr = anv_address_physical(dest_addr),
+ };
+
+ genX(emit_simple_shader_dispatch)(&state, query_count, push_data_state);
+
+ /* The query copy result shader is writing using the dataport, flush
+ * HDC/Data cache depending on the generation. Also stall at pixel
+ * scoreboard in case we're doing the copy with a fragment shader.
+ */
+ cmd_buffer->state.queries.buffer_write_bits |= ANV_QUERY_WRITES_DATA_FLUSH;
+
+ trace_intel_end_query_copy_shader(&cmd_buffer->trace, query_count);
}
-#else
void genX(CmdCopyQueryPoolResults)(
VkCommandBuffer commandBuffer,
VkQueryPool queryPool,
@@ -1474,6 +1850,99 @@ void genX(CmdCopyQueryPoolResults)(
VkDeviceSize destStride,
VkQueryResultFlags flags)
{
- anv_finishme("Queries not yet supported on Ivy Bridge");
+ ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+ ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
+ ANV_FROM_HANDLE(anv_buffer, buffer, destBuffer);
+ struct anv_device *device = cmd_buffer->device;
+ struct anv_physical_device *pdevice = device->physical;
+
+ if (queryCount > pdevice->instance->query_copy_with_shader_threshold) {
+ copy_query_results_with_shader(cmd_buffer, pool,
+ anv_address_add(buffer->address,
+ destOffset),
+ destStride,
+ firstQuery,
+ queryCount,
+ flags);
+ } else {
+ copy_query_results_with_cs(cmd_buffer, pool,
+ anv_address_add(buffer->address,
+ destOffset),
+ destStride,
+ firstQuery,
+ queryCount,
+ flags);
+ }
+}
+
+#if GFX_VERx10 == 125 && ANV_SUPPORT_RT
+
+#include "grl/include/GRLRTASCommon.h"
+#include "grl/grl_metakernel_postbuild_info.h"
+
+void
+genX(CmdWriteAccelerationStructuresPropertiesKHR)(
+ VkCommandBuffer commandBuffer,
+ uint32_t accelerationStructureCount,
+ const VkAccelerationStructureKHR* pAccelerationStructures,
+ VkQueryType queryType,
+ VkQueryPool queryPool,
+ uint32_t firstQuery)
+{
+ ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+ ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
+
+ assert(queryType == VK_QUERY_TYPE_ACCELERATION_STRUCTURE_COMPACTED_SIZE_KHR ||
+ queryType == VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_SIZE_KHR ||
+ queryType == VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SIZE_KHR ||
+ queryType == VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_BOTTOM_LEVEL_POINTERS_KHR);
+
+ emit_query_clear_flush(cmd_buffer, pool,
+ "CmdWriteAccelerationStructuresPropertiesKHR flush query clears");
+
+ struct mi_builder b;
+ mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
+
+ for (uint32_t i = 0; i < accelerationStructureCount; i++) {
+ ANV_FROM_HANDLE(vk_acceleration_structure, accel, pAccelerationStructures[i]);
+ struct anv_address query_addr =
+ anv_address_add(anv_query_address(pool, firstQuery + i), 8);
+
+ switch (queryType) {
+ case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_COMPACTED_SIZE_KHR:
+ genX(grl_postbuild_info_compacted_size)(cmd_buffer,
+ vk_acceleration_structure_get_va(accel),
+ anv_address_physical(query_addr));
+ break;
+
+ case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SIZE_KHR:
+ genX(grl_postbuild_info_current_size)(cmd_buffer,
+ vk_acceleration_structure_get_va(accel),
+ anv_address_physical(query_addr));
+ break;
+
+ case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_SIZE_KHR:
+ case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_BOTTOM_LEVEL_POINTERS_KHR:
+ genX(grl_postbuild_info_serialized_size)(cmd_buffer,
+ vk_acceleration_structure_get_va(accel),
+ anv_address_physical(query_addr));
+ break;
+
+ default:
+ unreachable("unhandled query type");
+ }
+ }
+
+ /* TODO: Figure out why MTL needs ANV_PIPE_DATA_CACHE_FLUSH_BIT in order
+ * to not lose the availability bit.
+ */
+ anv_add_pending_pipe_bits(cmd_buffer,
+ ANV_PIPE_END_OF_PIPE_SYNC_BIT |
+ ANV_PIPE_DATA_CACHE_FLUSH_BIT,
+ "after write acceleration struct props");
+ genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
+
+ for (uint32_t i = 0; i < accelerationStructureCount; i++)
+ emit_query_mi_availability(&b, anv_query_address(pool, firstQuery + i), true);
}
#endif
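
Once the GRL postbuild kernels above have run and availability is signaled, the CPU readback path earlier in this file only has to pick the right qword; a short recap sketch of the slot contents (per the pool sizing at the top of this file):

/* Sketch: CPU view of an acceleration-structure query slot.
 *   slot[0] : availability (written by emit_query_mi_availability above)
 *   slot[1] : size (compacted / current / serialization size)
 *   slot[2] : bottom-level pointer count (serialization queries only) */
uint64_t *slot = query_slot(pool, firstQuery + i);
uint64_t compacted_size = slot[1];   /* e.g. ..._COMPACTED_SIZE_KHR */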