diff options
Diffstat (limited to 'src/intel/vulkan/genX_query.c')
-rw-r--r-- | src/intel/vulkan/genX_query.c | 176 |
1 file changed, 120 insertions, 56 deletions
diff --git a/src/intel/vulkan/genX_query.c b/src/intel/vulkan/genX_query.c index 2cb492afcf9..aaf3ca962b2 100644 --- a/src/intel/vulkan/genX_query.c +++ b/src/intel/vulkan/genX_query.c @@ -37,17 +37,14 @@ #include "ds/intel_tracepoints.h" #include "anv_internal_kernels.h" +#include "genX_mi_builder.h" + +#if GFX_VERx10 >= 125 +#define ANV_PIPELINE_STATISTICS_MASK 0x00001fff +#else +#define ANV_PIPELINE_STATISTICS_MASK 0x000007ff +#endif -/* We reserve : - * - GPR 14 for perf queries - * - GPR 15 for conditional rendering - */ -#define MI_BUILDER_NUM_ALLOC_GPRS 14 -#define MI_BUILDER_CAN_WRITE_BATCH true -#define __gen_get_batch_dwords anv_batch_emit_dwords -#define __gen_address_offset anv_address_add -#define __gen_get_batch_address(b, a) anv_batch_address(b, a) -#include "common/mi_builder.h" #include "perf/intel_perf.h" #include "perf/intel_perf_mdapi.h" #include "perf/intel_perf_regs.h" @@ -186,6 +183,11 @@ VkResult genX(CreateQueryPool)( uint64s_per_slot = 1 + 2 /* availability + size (PostbuildInfoSerializationDesc) */; break; + case VK_QUERY_TYPE_MESH_PRIMITIVES_GENERATED_EXT: + /* Query has two values: begin and end. 
*/ + uint64s_per_slot = 1 + 2; + break; + #endif case VK_QUERY_TYPE_RESULT_STATUS_ONLY_KHR: uint64s_per_slot = 1; @@ -484,6 +486,7 @@ VkResult genX(GetQueryPoolResults)( pool->vk.query_type == VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_SIZE_KHR || pool->vk.query_type == VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SIZE_KHR || pool->vk.query_type == VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_BOTTOM_LEVEL_POINTERS_KHR || + pool->vk.query_type == VK_QUERY_TYPE_MESH_PRIMITIVES_GENERATED_EXT || #endif pool->vk.query_type == VK_QUERY_TYPE_OCCLUSION || pool->vk.query_type == VK_QUERY_TYPE_PIPELINE_STATISTICS || @@ -535,7 +538,11 @@ VkResult genX(GetQueryPoolResults)( uint32_t idx = 0; switch (pool->vk.query_type) { case VK_QUERY_TYPE_OCCLUSION: - case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT: { + case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT: +#if GFX_VERx10 >= 125 + case VK_QUERY_TYPE_MESH_PRIMITIVES_GENERATED_EXT: +#endif + { uint64_t *slot = query_slot(pool, firstQuery + i); if (write_results) { /* From the Vulkan 1.2.132 spec: @@ -558,7 +565,8 @@ VkResult genX(GetQueryPoolResults)( while (statistics) { UNUSED uint32_t stat = u_bit_scan(&statistics); if (write_results) { - uint64_t result = slot[idx * 2 + 2] - slot[idx * 2 + 1]; + /* If a query is not available but VK_QUERY_RESULT_PARTIAL_BIT is set, write 0. */ + uint64_t result = available ? slot[idx * 2 + 2] - slot[idx * 2 + 1] : 0; cpu_write_query_result(pData, flags, idx, result); } idx++; @@ -569,11 +577,17 @@ VkResult genX(GetQueryPoolResults)( case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT: { uint64_t *slot = query_slot(pool, firstQuery + i); - if (write_results) - cpu_write_query_result(pData, flags, idx, slot[2] - slot[1]); + if (write_results) { + /* If a query is not available but VK_QUERY_RESULT_PARTIAL_BIT is set, write 0. */ + uint64_t result = available ? 
slot[2] - slot[1] : 0; + cpu_write_query_result(pData, flags, idx, result); + } idx++; - if (write_results) - cpu_write_query_result(pData, flags, idx, slot[4] - slot[3]); + if (write_results) { + /* If a query is not available but VK_QUERY_RESULT_PARTIAL_BIT is set, write 0. */ + uint64_t result = available ? slot[4] - slot[3] : 0; + cpu_write_query_result(pData, flags, idx, result); + } idx++; break; } @@ -737,6 +751,9 @@ emit_zero_queries(struct anv_cmd_buffer *cmd_buffer, case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT: case VK_QUERY_TYPE_PIPELINE_STATISTICS: case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT: +#if GFX_VERx10 >= 125 + case VK_QUERY_TYPE_MESH_PRIMITIVES_GENERATED_EXT: +#endif for (uint32_t i = 0; i < num_queries; i++) { struct anv_address slot_addr = anv_query_address(pool, first_index + i); @@ -844,7 +861,11 @@ void genX(CmdResetQueryPool)( case VK_QUERY_TYPE_PIPELINE_STATISTICS: case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT: - case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT: { + case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT: +#if GFX_VERx10 >= 125 + case VK_QUERY_TYPE_MESH_PRIMITIVES_GENERATED_EXT: +#endif + { struct mi_builder b; mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch); @@ -921,6 +942,10 @@ static const uint32_t vk_pipeline_stat_to_reg[] = { GENX(HS_INVOCATION_COUNT_num), GENX(DS_INVOCATION_COUNT_num), GENX(CS_INVOCATION_COUNT_num), +#if GFX_VERx10 >= 125 + GENX(TASK_INVOCATION_COUNT_num), + GENX(MESH_INVOCATION_COUNT_num) +#endif }; static void @@ -1042,6 +1067,18 @@ void genX(CmdBeginQueryIndexedEXT)( mi_reg64(GENX(CL_INVOCATION_COUNT_num))); break; +#if GFX_VERx10 >= 125 + case VK_QUERY_TYPE_MESH_PRIMITIVES_GENERATED_EXT: + genx_batch_emit_pipe_control(&cmd_buffer->batch, + cmd_buffer->device->info, + cmd_buffer->state.current_pipeline, + ANV_PIPE_CS_STALL_BIT | + ANV_PIPE_STALL_AT_SCOREBOARD_BIT); + mi_store(&b, mi_mem64(anv_address_add(query_addr, 8)), + mi_reg64(GENX(MESH_PRIMITIVE_COUNT_num))); + break; +#endif + case 
VK_QUERY_TYPE_PIPELINE_STATISTICS: { /* TODO: This might only be necessary for certain stats */ genx_batch_emit_pipe_control(&cmd_buffer->batch, @@ -1088,7 +1125,8 @@ void genX(CmdBeginQueryIndexedEXT)( khr_perf_query_data_offset(pool, query, 0, end) + field->location)), mi_reg64(ANV_PERF_QUERY_OFFSET_REG)); - cmd_buffer->self_mod_locations[reloc_idx++] = mi_store_address(&b, reg_addr); + cmd_buffer->self_mod_locations[reloc_idx++] = + mi_store_relocated_address_reg64(&b, reg_addr); if (field->type != INTEL_PERF_QUERY_FIELD_TYPE_MI_RPC && field->size == 8) { @@ -1099,7 +1137,8 @@ void genX(CmdBeginQueryIndexedEXT)( khr_perf_query_data_offset(pool, query, 0, end) + field->location + 4)), mi_reg64(ANV_PERF_QUERY_OFFSET_REG)); - cmd_buffer->self_mod_locations[reloc_idx++] = mi_store_address(&b, reg_addr); + cmd_buffer->self_mod_locations[reloc_idx++] = + mi_store_relocated_address_reg64(&b, reg_addr); } } } @@ -1113,7 +1152,7 @@ void genX(CmdBeginQueryIndexedEXT)( khr_perf_query_availability_offset(pool, query, 0 /* pass */))), mi_reg64(ANV_PERF_QUERY_OFFSET_REG)); cmd_buffer->self_mod_locations[reloc_idx++] = - mi_store_address(&b, availability_write_offset); + mi_store_relocated_address_reg64(&b, availability_write_offset); assert(reloc_idx == pdevice->n_perf_query_commands); @@ -1140,10 +1179,10 @@ void genX(CmdBeginQueryIndexedEXT)( GENX(MI_REPORT_PERF_COUNT_length), GENX(MI_REPORT_PERF_COUNT), .MemoryAddress = query_addr /* Will be overwritten */); - _mi_resolve_address_token(&b, - cmd_buffer->self_mod_locations[cmd_buffer->perf_reloc_idx++], - dws + - GENX(MI_REPORT_PERF_COUNT_MemoryAddress_start) / 8); + mi_resolve_relocated_address_token( + &b, + cmd_buffer->self_mod_locations[cmd_buffer->perf_reloc_idx++], + dws + GENX(MI_REPORT_PERF_COUNT_MemoryAddress_start) / 8); break; case INTEL_PERF_QUERY_FIELD_TYPE_SRM_PERFCNT: @@ -1157,10 +1196,10 @@ void genX(CmdBeginQueryIndexedEXT)( GENX(MI_STORE_REGISTER_MEM), .RegisterAddress = field->mmio_offset, .MemoryAddress 
= query_addr /* Will be overwritten */ ); - _mi_resolve_address_token(&b, - cmd_buffer->self_mod_locations[cmd_buffer->perf_reloc_idx++], - dws + - GENX(MI_STORE_REGISTER_MEM_MemoryAddress_start) / 8); + mi_resolve_relocated_address_token( + &b, + cmd_buffer->self_mod_locations[cmd_buffer->perf_reloc_idx++], + dws + GENX(MI_STORE_REGISTER_MEM_MemoryAddress_start) / 8); if (field->size == 8) { dws = anv_batch_emitn(&cmd_buffer->batch, @@ -1168,10 +1207,10 @@ void genX(CmdBeginQueryIndexedEXT)( GENX(MI_STORE_REGISTER_MEM), .RegisterAddress = field->mmio_offset + 4, .MemoryAddress = query_addr /* Will be overwritten */ ); - _mi_resolve_address_token(&b, - cmd_buffer->self_mod_locations[cmd_buffer->perf_reloc_idx++], - dws + - GENX(MI_STORE_REGISTER_MEM_MemoryAddress_start) / 8); + mi_resolve_relocated_address_token( + &b, + cmd_buffer->self_mod_locations[cmd_buffer->perf_reloc_idx++], + dws + GENX(MI_STORE_REGISTER_MEM_MemoryAddress_start) / 8); } break; @@ -1236,6 +1275,19 @@ void genX(CmdEndQueryIndexedEXT)( emit_query_mi_availability(&b, query_addr, true); break; +#if GFX_VERx10 >= 125 + case VK_QUERY_TYPE_MESH_PRIMITIVES_GENERATED_EXT: + genx_batch_emit_pipe_control(&cmd_buffer->batch, + cmd_buffer->device->info, + cmd_buffer->state.current_pipeline, + ANV_PIPE_CS_STALL_BIT | + ANV_PIPE_STALL_AT_SCOREBOARD_BIT); + mi_store(&b, mi_mem64(anv_address_add(query_addr, 16)), + mi_reg64(GENX(MESH_PRIMITIVE_COUNT_num))); + emit_query_mi_availability(&b, query_addr, true); + break; +#endif + case VK_QUERY_TYPE_PIPELINE_STATISTICS: { /* TODO: This might only be necessary for certain stats */ genx_batch_emit_pipe_control(&cmd_buffer->batch, @@ -1290,10 +1342,10 @@ void genX(CmdEndQueryIndexedEXT)( GENX(MI_REPORT_PERF_COUNT_length), GENX(MI_REPORT_PERF_COUNT), .MemoryAddress = query_addr /* Will be overwritten */); - _mi_resolve_address_token(&b, - cmd_buffer->self_mod_locations[cmd_buffer->perf_reloc_idx++], - dws + - GENX(MI_REPORT_PERF_COUNT_MemoryAddress_start) / 8); + 
mi_resolve_relocated_address_token( + &b, + cmd_buffer->self_mod_locations[cmd_buffer->perf_reloc_idx++], + dws + GENX(MI_REPORT_PERF_COUNT_MemoryAddress_start) / 8); break; case INTEL_PERF_QUERY_FIELD_TYPE_SRM_PERFCNT: @@ -1307,10 +1359,10 @@ void genX(CmdEndQueryIndexedEXT)( GENX(MI_STORE_REGISTER_MEM), .RegisterAddress = field->mmio_offset, .MemoryAddress = query_addr /* Will be overwritten */ ); - _mi_resolve_address_token(&b, - cmd_buffer->self_mod_locations[cmd_buffer->perf_reloc_idx++], - dws + - GENX(MI_STORE_REGISTER_MEM_MemoryAddress_start) / 8); + mi_resolve_relocated_address_token( + &b, + cmd_buffer->self_mod_locations[cmd_buffer->perf_reloc_idx++], + dws + GENX(MI_STORE_REGISTER_MEM_MemoryAddress_start) / 8); if (field->size == 8) { dws = anv_batch_emitn(&cmd_buffer->batch, @@ -1318,10 +1370,10 @@ void genX(CmdEndQueryIndexedEXT)( GENX(MI_STORE_REGISTER_MEM), .RegisterAddress = field->mmio_offset + 4, .MemoryAddress = query_addr /* Will be overwritten */ ); - _mi_resolve_address_token(&b, - cmd_buffer->self_mod_locations[cmd_buffer->perf_reloc_idx++], - dws + - GENX(MI_STORE_REGISTER_MEM_MemoryAddress_start) / 8); + mi_resolve_relocated_address_token( + &b, + cmd_buffer->self_mod_locations[cmd_buffer->perf_reloc_idx++], + dws + GENX(MI_STORE_REGISTER_MEM_MemoryAddress_start) / 8); } break; @@ -1336,10 +1388,10 @@ void genX(CmdEndQueryIndexedEXT)( GENX(MI_STORE_DATA_IMM_length), GENX(MI_STORE_DATA_IMM), .ImmediateData = true); - _mi_resolve_address_token(&b, - cmd_buffer->self_mod_locations[cmd_buffer->perf_reloc_idx++], - dws + - GENX(MI_STORE_DATA_IMM_Address_start) / 8); + mi_resolve_relocated_address_token( + &b, + cmd_buffer->self_mod_locations[cmd_buffer->perf_reloc_idx++], + dws + GENX(MI_STORE_DATA_IMM_Address_start) / 8); assert(cmd_buffer->perf_reloc_idx == pdevice->n_perf_query_commands); break; @@ -1576,13 +1628,13 @@ copy_query_results_with_cs(struct anv_cmd_buffer *cmd_buffer, struct mi_builder b; mi_builder_init(&b, 
cmd_buffer->device->info, &cmd_buffer->batch); - struct mi_value result; + mi_builder_set_mocs(&b, anv_mocs_for_address( + cmd_buffer->device, + &(struct anv_address) { .bo = pool->bo })); for (uint32_t i = 0; i < query_count; i++) { struct anv_address query_addr = anv_query_address(pool, first_query + i); - const uint32_t mocs = anv_mocs_for_address(cmd_buffer->device, &query_addr); - - mi_builder_set_mocs(&b, mocs); + struct mi_value result; /* Wait for the availability write to land before we go read the data */ if (flags & VK_QUERY_RESULT_WAIT_BIT) { @@ -1598,6 +1650,9 @@ copy_query_results_with_cs(struct anv_cmd_buffer *cmd_buffer, switch (pool->vk.query_type) { case VK_QUERY_TYPE_OCCLUSION: case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT: +#if GFX_VERx10 >= 125 + case VK_QUERY_TYPE_MESH_PRIMITIVES_GENERATED_EXT: +#endif result = compute_query_result(&b, anv_address_add(query_addr, 8)); /* Like in the case of vkGetQueryPoolResults, if the query is * unavailable and the VK_QUERY_RESULT_PARTIAL_BIT flag is set, @@ -1780,9 +1835,8 @@ copy_query_results_with_shader(struct anv_cmd_buffer *cmd_buffer, uint32_t data_offset = 8 /* behind availability */; switch (pool->vk.query_type) { case VK_QUERY_TYPE_OCCLUSION: - case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT: copy_flags |= ANV_COPY_QUERY_FLAG_DELTA; - /* These 2 queries are the only ones where we would have partial data + /* Occlusion and timestamps queries are the only ones where we would have partial data * because they are capture with a PIPE_CONTROL post sync operation. The * other ones are captured with MI_STORE_REGISTER_DATA so we're always * available by the time we reach the copy command. @@ -1790,6 +1844,17 @@ copy_query_results_with_shader(struct anv_cmd_buffer *cmd_buffer, copy_flags |= (flags & VK_QUERY_RESULT_PARTIAL_BIT) ? ANV_COPY_QUERY_FLAG_PARTIAL : 0; break; + case VK_QUERY_TYPE_TIMESTAMP: + copy_flags |= (flags & VK_QUERY_RESULT_PARTIAL_BIT) ? 
ANV_COPY_QUERY_FLAG_PARTIAL : 0; + break; + + case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT: +#if GFX_VERx10 >= 125 + case VK_QUERY_TYPE_MESH_PRIMITIVES_GENERATED_EXT: +#endif + copy_flags |= ANV_COPY_QUERY_FLAG_DELTA; + break; + case VK_QUERY_TYPE_PIPELINE_STATISTICS: num_items = util_bitcount(pool->vk.pipeline_statistics); copy_flags |= ANV_COPY_QUERY_FLAG_DELTA; @@ -1800,7 +1865,6 @@ copy_query_results_with_shader(struct anv_cmd_buffer *cmd_buffer, copy_flags |= ANV_COPY_QUERY_FLAG_DELTA; break; - case VK_QUERY_TYPE_TIMESTAMP: case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_COMPACTED_SIZE_KHR: case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_SIZE_KHR: case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SIZE_KHR: @@ -1875,7 +1939,7 @@ void genX(CmdCopyQueryPoolResults)( } } -#if GFX_VERx10 == 125 && ANV_SUPPORT_RT +#if GFX_VERx10 >= 125 && ANV_SUPPORT_RT #include "grl/include/GRLRTASCommon.h" #include "grl/grl_metakernel_postbuild_info.h" |