summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorElla Stanforth <estanforth@igalia.com>2021-11-23 22:29:48 +0000
committerMarge Bot <emma+marge@anholt.net>2022-06-27 07:34:16 +0000
commitf392b6c1ad4a360a02eb2a4024e3d7bb03a4b759 (patch)
treef19f2d960f4b3d0b12ad75300e22de8017d11982
parentf2a24fd4a239fcfb196f8cd89b196bcdcace4b2a (diff)
v3dv: Implement VK_KHR_performance_query
Reviewed-by: Iago Toral Quiroga <itoral@igalia.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/14061>
-rw-r--r--docs/features.txt2
-rw-r--r--src/broadcom/vulkan/v3dv_cmd_buffer.c90
-rw-r--r--src/broadcom/vulkan/v3dv_device.c22
-rw-r--r--src/broadcom/vulkan/v3dv_private.h62
-rw-r--r--src/broadcom/vulkan/v3dv_query.c468
-rw-r--r--src/broadcom/vulkan/v3dv_queue.c154
6 files changed, 713 insertions, 85 deletions
diff --git a/docs/features.txt b/docs/features.txt
index 047f6ee4d86..cd25d33f3ee 100644
--- a/docs/features.txt
+++ b/docs/features.txt
@@ -510,7 +510,7 @@ Khronos extensions that are not part of any Vulkan version:
VK_KHR_get_display_properties2 DONE (anv, lvp, radv, tu, v3dv)
VK_KHR_get_surface_capabilities2 DONE (anv, lvp, radv, tu, v3dv, vn)
VK_KHR_incremental_present DONE (anv, lvp, radv, tu, v3dv, vn)
- VK_KHR_performance_query DONE (anv/gen8+, tu)
+ VK_KHR_performance_query DONE (anv/gen8+, tu, v3dv)
VK_KHR_pipeline_executable_properties DONE (anv, radv, tu, v3dv)
VK_KHR_pipeline_library DONE (lvp, radv)
VK_KHR_push_descriptor DONE (anv, lvp, radv, tu)
diff --git a/src/broadcom/vulkan/v3dv_cmd_buffer.c b/src/broadcom/vulkan/v3dv_cmd_buffer.c
index 8fd5758ff29..f4e6a9956c7 100644
--- a/src/broadcom/vulkan/v3dv_cmd_buffer.c
+++ b/src/broadcom/vulkan/v3dv_cmd_buffer.c
@@ -777,6 +777,8 @@ v3dv_job_init(struct v3dv_job *job,
job->is_transfer = cmd_buffer->state.is_transfer;
cmd_buffer_serialize_job_if_needed(cmd_buffer, job);
+
+ job->perf = cmd_buffer->state.query.active_query.perf;
}
}
@@ -3223,24 +3225,44 @@ v3dv_cmd_buffer_begin_query(struct v3dv_cmd_buffer *cmd_buffer,
uint32_t query,
VkQueryControlFlags flags)
{
- /* FIXME: we only support one active query for now */
- assert(cmd_buffer->state.query.active_query.bo == NULL);
assert(query < pool->query_count);
+ switch (pool->query_type) {
+ case VK_QUERY_TYPE_OCCLUSION:
+ /* FIXME: we only support one active occlusion query for now */
+ assert(cmd_buffer->state.query.active_query.bo == NULL);
+
+ cmd_buffer->state.query.active_query.bo = pool->queries[query].bo;
+ cmd_buffer->state.query.active_query.offset = pool->queries[query].offset;
+ cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_OCCLUSION_QUERY;
+ break;
+ case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: {
+ assert(cmd_buffer->state.query.active_query.perf == NULL);
+ if (cmd_buffer->state.pass)
+ v3dv_cmd_buffer_subpass_finish(cmd_buffer);
- cmd_buffer->state.query.active_query.bo = pool->queries[query].bo;
- cmd_buffer->state.query.active_query.offset = pool->queries[query].offset;
- cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_OCCLUSION_QUERY;
+ cmd_buffer->state.query.active_query.perf =
+ &pool->queries[query].perf;
+
+ if (cmd_buffer->state.pass) {
+ v3dv_cmd_buffer_subpass_resume(cmd_buffer,
+ cmd_buffer->state.subpass_idx);
+ }
+ break;
+ }
+ default:
+ unreachable("Unsupported query type");
+ }
}
-void
-v3dv_cmd_buffer_end_query(struct v3dv_cmd_buffer *cmd_buffer,
- struct v3dv_query_pool *pool,
- uint32_t query)
+static void
+v3dv_cmd_buffer_schedule_end_query(struct v3dv_cmd_buffer *cmd_buffer,
+ struct v3dv_query_pool *pool,
+ uint32_t query)
{
assert(query < pool->query_count);
- assert(cmd_buffer->state.query.active_query.bo != NULL);
- if (cmd_buffer->state.pass) {
+ if (cmd_buffer->state.pass &&
+ pool->query_type != VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
/* Queue the EndQuery in the command buffer state, we will create a CPU
* job to flag all of these queries as possibly available right after the
* render pass job in which they have been recorded.
@@ -3295,11 +3317,57 @@ v3dv_cmd_buffer_end_query(struct v3dv_cmd_buffer *cmd_buffer,
list_addtail(&job->list_link, &cmd_buffer->jobs);
}
+}
+
+static void
+v3dv_cmd_buffer_end_occlusion_query(struct v3dv_cmd_buffer *cmd_buffer,
+ struct v3dv_query_pool *pool,
+ uint32_t query)
+{
+ assert(query < pool->query_count);
+ assert(cmd_buffer->state.query.active_query.bo != NULL);
+
+ v3dv_cmd_buffer_schedule_end_query(cmd_buffer, pool, query);
cmd_buffer->state.query.active_query.bo = NULL;
cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_OCCLUSION_QUERY;
}
+static void
+v3dv_cmd_buffer_end_performance_query(struct v3dv_cmd_buffer *cmd_buffer,
+ struct v3dv_query_pool *pool,
+ uint32_t query)
+{
+ assert(query < pool->query_count);
+ assert(cmd_buffer->state.query.active_query.perf != NULL);
+
+ if (cmd_buffer->state.pass)
+ v3dv_cmd_buffer_subpass_finish(cmd_buffer);
+
+ v3dv_cmd_buffer_schedule_end_query(cmd_buffer, pool, query);
+
+ cmd_buffer->state.query.active_query.perf = NULL;
+
+ if (cmd_buffer->state.pass)
+ v3dv_cmd_buffer_subpass_resume(cmd_buffer, cmd_buffer->state.subpass_idx);
+}
+
+void v3dv_cmd_buffer_end_query(struct v3dv_cmd_buffer *cmd_buffer,
+ struct v3dv_query_pool *pool,
+ uint32_t query)
+{
+ switch (pool->query_type) {
+ case VK_QUERY_TYPE_OCCLUSION:
+ v3dv_cmd_buffer_end_occlusion_query(cmd_buffer, pool, query);
+ break;
+ case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR:
+ v3dv_cmd_buffer_end_performance_query(cmd_buffer, pool, query);
+ break;
+ default:
+ unreachable("Unsupported query type");
+ }
+}
+
void
v3dv_cmd_buffer_copy_query_results(struct v3dv_cmd_buffer *cmd_buffer,
struct v3dv_query_pool *pool,
diff --git a/src/broadcom/vulkan/v3dv_device.c b/src/broadcom/vulkan/v3dv_device.c
index 21ffdbbc07b..6102b0b42cf 100644
--- a/src/broadcom/vulkan/v3dv_device.c
+++ b/src/broadcom/vulkan/v3dv_device.c
@@ -133,6 +133,7 @@ get_device_extensions(const struct v3dv_physical_device *device,
.KHR_get_memory_requirements2 = true,
.KHR_image_format_list = true,
.KHR_imageless_framebuffer = true,
+ .KHR_performance_query = device->caps.perfmon,
.KHR_relaxed_block_layout = true,
.KHR_maintenance1 = true,
.KHR_maintenance2 = true,
@@ -816,6 +817,9 @@ physical_device_init(struct v3dv_physical_device *device,
device->caps.multisync =
v3d_has_feature(device, DRM_V3D_PARAM_SUPPORTS_MULTISYNC_EXT);
+ device->caps.perfmon =
+ v3d_has_feature(device, DRM_V3D_PARAM_SUPPORTS_PERFMON);
+
result = init_uuids(device);
if (result != VK_SUCCESS)
goto fail;
@@ -1144,6 +1148,7 @@ VKAPI_ATTR void VKAPI_CALL
v3dv_GetPhysicalDeviceFeatures2(VkPhysicalDevice physicalDevice,
VkPhysicalDeviceFeatures2 *pFeatures)
{
+ V3DV_FROM_HANDLE(v3dv_physical_device, physical_device, physicalDevice);
v3dv_GetPhysicalDeviceFeatures(physicalDevice, &pFeatures->features);
VkPhysicalDeviceVulkan13Features vk13 = {
@@ -1289,6 +1294,16 @@ v3dv_GetPhysicalDeviceFeatures2(VkPhysicalDevice physicalDevice,
break;
}
+ case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PERFORMANCE_QUERY_FEATURES_KHR: {
+ VkPhysicalDevicePerformanceQueryFeaturesKHR *features =
+ (void *) ext;
+
+ features->performanceCounterQueryPools =
+ physical_device->caps.perfmon;
+ features->performanceCounterMultipleQueryPools = false;
+ break;
+ }
+
default:
v3dv_debug_ignored_stype(ext->sType);
break;
@@ -1637,6 +1652,13 @@ v3dv_GetPhysicalDeviceProperties2(VkPhysicalDevice physicalDevice,
props->maxVertexAttribDivisor = 0xffff;
break;
}
+ case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PERFORMANCE_QUERY_PROPERTIES_KHR : {
+ VkPhysicalDevicePerformanceQueryPropertiesKHR *props =
+ (VkPhysicalDevicePerformanceQueryPropertiesKHR *)ext;
+
+ props->allowCommandBufferQueryCopies = true;
+ break;
+ }
case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_DRM_PROPERTIES_EXT: {
VkPhysicalDeviceDrmPropertiesEXT *props =
(VkPhysicalDeviceDrmPropertiesEXT *)ext;
diff --git a/src/broadcom/vulkan/v3dv_private.h b/src/broadcom/vulkan/v3dv_private.h
index 6c1399b04d7..cfd32ec7ad6 100644
--- a/src/broadcom/vulkan/v3dv_private.h
+++ b/src/broadcom/vulkan/v3dv_private.h
@@ -188,6 +188,7 @@ struct v3dv_physical_device {
struct {
bool multisync;
+ bool perfmon;
} caps;
};
@@ -263,6 +264,11 @@ struct v3dv_queue {
struct v3dv_last_job_sync last_job_syncs;
struct v3dv_job *noop_job;
+
+ /* The last active perfmon ID to prevent mixing of counter results when a
+ * job is submitted with a different perfmon id.
+ */
+ uint32_t last_perfmon_id;
};
VkResult v3dv_queue_driver_submit(struct vk_queue *vk_queue,
@@ -1027,6 +1033,19 @@ struct v3dv_timestamp_query_cpu_job_info {
uint32_t count;
};
+/* Number of perfmons required to handle all supported performance counters */
+#define V3DV_MAX_PERFMONS DIV_ROUND_UP(V3D_PERFCNT_NUM, \
+ DRM_V3D_MAX_PERF_COUNTERS)
+
+struct v3dv_perf_query {
+ uint32_t kperfmon_ids[V3DV_MAX_PERFMONS];
+
+ /* A DRM syncobj to wait on the GPU jobs for which we are collecting
+ * performance data.
+ */
+ struct vk_sync *last_job_sync;
+};
+
struct v3dv_job {
struct list_head list_link;
@@ -1127,6 +1146,9 @@ struct v3dv_job {
uint32_t wg_base[3];
struct drm_v3d_submit_csd submit;
} csd;
+
+ /* Perfmons with last job sync for CSD and CL jobs */
+ struct v3dv_perf_query *perf;
};
void v3dv_job_init(struct v3dv_job *job,
@@ -1328,12 +1350,15 @@ struct v3dv_cmd_buffer_state {
struct v3dv_end_query_cpu_job_info *states;
} end;
- /* This BO is not NULL if we have an active query, that is, we have
- * called vkCmdBeginQuery but not vkCmdEndQuery.
- */
struct {
+ /* This BO is not NULL if we have an active occlusion query, that is,
+ * we have called vkCmdBeginQuery but not vkCmdEndQuery.
+ */
struct v3dv_bo *bo;
uint32_t offset;
+
+ /* This pointer is not NULL if we have an active performance query */
+ struct v3dv_perf_query *perf;
} active_query;
} query;
};
@@ -1375,6 +1400,9 @@ struct v3dv_query {
};
/* Used by CPU queries (timestamp) */
uint64_t value;
+
+ /* Used by performance queries */
+ struct v3dv_perf_query perf;
};
};
@@ -1383,18 +1411,32 @@ struct v3dv_query_pool {
struct v3dv_bo *bo; /* Only used with GPU queries (occlusion) */
+ /* Only used with performance queries */
+ struct {
+ uint32_t ncounters;
+ uint8_t counters[V3D_PERFCNT_NUM];
+
+ /* V3D has a limit on the number of counters we can track in a
+ * single performance monitor, so if too many counters are requested
+ * we need to create multiple monitors to record all of them. This
+ * field represents the number of monitors required for the number
+ * of counters requested.
+ */
+ uint8_t nperfmons;
+ } perfmon;
+
VkQueryType query_type;
uint32_t query_count;
struct v3dv_query *queries;
};
-VkResult v3dv_get_query_pool_results_cpu(struct v3dv_device *device,
- struct v3dv_query_pool *pool,
- uint32_t first,
- uint32_t count,
- void *data,
- VkDeviceSize stride,
- VkQueryResultFlags flags);
+VkResult v3dv_get_query_pool_results(struct v3dv_device *device,
+ struct v3dv_query_pool *pool,
+ uint32_t first,
+ uint32_t count,
+ void *data,
+ VkDeviceSize stride,
+ VkQueryResultFlags flags);
void v3dv_reset_query_pools(struct v3dv_device *device,
struct v3dv_query_pool *query_pool,
diff --git a/src/broadcom/vulkan/v3dv_query.c b/src/broadcom/vulkan/v3dv_query.c
index 60edfc52442..4e188fd5512 100644
--- a/src/broadcom/vulkan/v3dv_query.c
+++ b/src/broadcom/vulkan/v3dv_query.c
@@ -25,6 +25,148 @@
#include "util/timespec.h"
+static const char *v3dv_counters[][3] = {
+ {"FEP", "FEP-valid-primitives-no-rendered-pixels", "[FEP] Valid primitives that result in no rendered pixels, for all rendered tiles"},
+ {"FEP", "FEP-valid-primitives-rendered-pixels", "[FEP] Valid primitives for all rendered tiles (primitives may be counted in more than one tile)"},
+ {"FEP", "FEP-clipped-quads", "[FEP] Early-Z/Near/Far clipped quads"},
+ {"FEP", "FEP-valid-quads", "[FEP] Valid quads"},
+ {"TLB", "TLB-quads-not-passing-stencil-test", "[TLB] Quads with no pixels passing the stencil test"},
+ {"TLB", "TLB-quads-not-passing-z-and-stencil-test", "[TLB] Quads with no pixels passing the Z and stencil tests"},
+ {"TLB", "TLB-quads-passing-z-and-stencil-test", "[TLB] Quads with any pixels passing the Z and stencil tests"},
+ {"TLB", "TLB-quads-with-zero-coverage", "[TLB] Quads with all pixels having zero coverage"},
+ {"TLB", "TLB-quads-with-non-zero-coverage", "[TLB] Quads with any pixels having non-zero coverage"},
+ {"TLB", "TLB-quads-written-to-color-buffer", "[TLB] Quads with valid pixels written to colour buffer"},
+ {"PTB", "PTB-primitives-discarded-outside-viewport", "[PTB] Primitives discarded by being outside the viewport"},
+ {"PTB", "PTB-primitives-need-clipping", "[PTB] Primitives that need clipping"},
+ {"PTB", "PTB-primitives-discared-reversed", "[PTB] Primitives that are discarded because they are reversed"},
+ {"QPU", "QPU-total-idle-clk-cycles", "[QPU] Total idle clock cycles for all QPUs"},
+ {"QPU", "QPU-total-active-clk-cycles-vertex-coord-shading", "[QPU] Total active clock cycles for all QPUs doing vertex/coordinate/user shading (counts only when QPU is not stalled)"},
+ {"QPU", "QPU-total-active-clk-cycles-fragment-shading", "[QPU] Total active clock cycles for all QPUs doing fragment shading (counts only when QPU is not stalled)"},
+ {"QPU", "QPU-total-clk-cycles-executing-valid-instr", "[QPU] Total clock cycles for all QPUs executing valid instructions"},
+ {"QPU", "QPU-total-clk-cycles-waiting-TMU", "[QPU] Total clock cycles for all QPUs stalled waiting for TMUs only (counter won't increment if QPU also stalling for another reason)"},
+ {"QPU", "QPU-total-clk-cycles-waiting-scoreboard", "[QPU] Total clock cycles for all QPUs stalled waiting for Scoreboard only (counter won't increment if QPU also stalling for another reason)"},
+ {"QPU", "QPU-total-clk-cycles-waiting-varyings", "[QPU] Total clock cycles for all QPUs stalled waiting for Varyings only (counter won't increment if QPU also stalling for another reason)"},
+ {"QPU", "QPU-total-instr-cache-hit", "[QPU] Total instruction cache hits for all slices"},
+ {"QPU", "QPU-total-instr-cache-miss", "[QPU] Total instruction cache misses for all slices"},
+ {"QPU", "QPU-total-uniform-cache-hit", "[QPU] Total uniforms cache hits for all slices"},
+ {"QPU", "QPU-total-uniform-cache-miss", "[QPU] Total uniforms cache misses for all slices"},
+ {"TMU", "TMU-total-text-quads-access", "[TMU] Total texture cache accesses"},
+ {"TMU", "TMU-total-text-cache-miss", "[TMU] Total texture cache misses (number of fetches from memory/L2cache)"},
+ {"VPM", "VPM-total-clk-cycles-VDW-stalled", "[VPM] Total clock cycles VDW is stalled waiting for VPM access"},
+ {"VPM", "VPM-total-clk-cycles-VCD-stalled", "[VPM] Total clock cycles VCD is stalled waiting for VPM access"},
+ {"CLE", "CLE-bin-thread-active-cycles", "[CLE] Bin thread active cycles"},
+ {"CLE", "CLE-render-thread-active-cycles", "[CLE] Render thread active cycles"},
+ {"L2T", "L2T-total-cache-hit", "[L2T] Total Level 2 cache hits"},
+ {"L2T", "L2T-total-cache-miss", "[L2T] Total Level 2 cache misses"},
+ {"CORE", "cycle-count", "[CORE] Cycle counter"},
+ {"QPU", "QPU-total-clk-cycles-waiting-vertex-coord-shading", "[QPU] Total stalled clock cycles for all QPUs doing vertex/coordinate/user shading"},
+ {"QPU", "QPU-total-clk-cycles-waiting-fragment-shading", "[QPU] Total stalled clock cycles for all QPUs doing fragment shading"},
+ {"PTB", "PTB-primitives-binned", "[PTB] Total primitives binned"},
+ {"AXI", "AXI-writes-seen-watch-0", "[AXI] Writes seen by watch 0"},
+ {"AXI", "AXI-reads-seen-watch-0", "[AXI] Reads seen by watch 0"},
+ {"AXI", "AXI-writes-stalled-seen-watch-0", "[AXI] Write stalls seen by watch 0"},
+ {"AXI", "AXI-reads-stalled-seen-watch-0", "[AXI] Read stalls seen by watch 0"},
+ {"AXI", "AXI-write-bytes-seen-watch-0", "[AXI] Total bytes written seen by watch 0"},
+ {"AXI", "AXI-read-bytes-seen-watch-0", "[AXI] Total bytes read seen by watch 0"},
+ {"AXI", "AXI-writes-seen-watch-1", "[AXI] Writes seen by watch 1"},
+ {"AXI", "AXI-reads-seen-watch-1", "[AXI] Reads seen by watch 1"},
+ {"AXI", "AXI-writes-stalled-seen-watch-1", "[AXI] Write stalls seen by watch 1"},
+ {"AXI", "AXI-reads-stalled-seen-watch-1", "[AXI] Read stalls seen by watch 1"},
+ {"AXI", "AXI-write-bytes-seen-watch-1", "[AXI] Total bytes written seen by watch 1"},
+ {"AXI", "AXI-read-bytes-seen-watch-1", "[AXI] Total bytes read seen by watch 1"},
+ {"TLB", "TLB-partial-quads-written-to-color-buffer", "[TLB] Partial quads written to the colour buffer"},
+ {"TMU", "TMU-total-config-access", "[TMU] Total config accesses"},
+ {"L2T", "L2T-no-id-stalled", "[L2T] No ID stall"},
+ {"L2T", "L2T-command-queue-stalled", "[L2T] Command queue full stall"},
+ {"L2T", "L2T-TMU-writes", "[L2T] TMU write accesses"},
+ {"TMU", "TMU-active-cycles", "[TMU] Active cycles"},
+ {"TMU", "TMU-stalled-cycles", "[TMU] Stalled cycles"},
+ {"CLE", "CLE-thread-active-cycles", "[CLE] Bin or render thread active cycles"},
+ {"L2T", "L2T-TMU-reads", "[L2T] TMU read accesses"},
+ {"L2T", "L2T-CLE-reads", "[L2T] CLE read accesses"},
+ {"L2T", "L2T-VCD-reads", "[L2T] VCD read accesses"},
+ {"L2T", "L2T-TMU-config-reads", "[L2T] TMU CFG read accesses"},
+ {"L2T", "L2T-SLC0-reads", "[L2T] SLC0 read accesses"},
+ {"L2T", "L2T-SLC1-reads", "[L2T] SLC1 read accesses"},
+ {"L2T", "L2T-SLC2-reads", "[L2T] SLC2 read accesses"},
+ {"L2T", "L2T-TMU-write-miss", "[L2T] TMU write misses"},
+ {"L2T", "L2T-TMU-read-miss", "[L2T] TMU read misses"},
+ {"L2T", "L2T-CLE-read-miss", "[L2T] CLE read misses"},
+ {"L2T", "L2T-VCD-read-miss", "[L2T] VCD read misses"},
+ {"L2T", "L2T-TMU-config-read-miss", "[L2T] TMU CFG read misses"},
+ {"L2T", "L2T-SLC0-read-miss", "[L2T] SLC0 read misses"},
+ {"L2T", "L2T-SLC1-read-miss", "[L2T] SLC1 read misses"},
+ {"L2T", "L2T-SLC2-read-miss", "[L2T] SLC2 read misses"},
+ {"CORE", "core-memory-writes", "[CORE] Total memory writes"},
+ {"L2T", "L2T-memory-writes", "[L2T] Total memory writes"},
+ {"PTB", "PTB-memory-writes", "[PTB] Total memory writes"},
+ {"TLB", "TLB-memory-writes", "[TLB] Total memory writes"},
+ {"CORE", "core-memory-reads", "[CORE] Total memory reads"},
+ {"L2T", "L2T-memory-reads", "[L2T] Total memory reads"},
+ {"PTB", "PTB-memory-reads", "[PTB] Total memory reads"},
+ {"PSE", "PSE-memory-reads", "[PSE] Total memory reads"},
+ {"TLB", "TLB-memory-reads", "[TLB] Total memory reads"},
+ {"GMP", "GMP-memory-reads", "[GMP] Total memory reads"},
+ {"PTB", "PTB-memory-words-writes", "[PTB] Total memory words written"},
+ {"TLB", "TLB-memory-words-writes", "[TLB] Total memory words written"},
+ {"PSE", "PSE-memory-words-reads", "[PSE] Total memory words read"},
+ {"TLB", "TLB-memory-words-reads", "[TLB] Total memory words read"},
+ {"TMU", "TMU-MRU-hits", "[TMU] Total MRU hits"},
+ {"CORE", "compute-active-cycles", "[CORE] Compute active cycles"},
+};
+
+static void
+kperfmon_create(struct v3dv_device *device,
+ struct v3dv_query_pool *pool,
+ uint32_t query)
+{
+ for (uint32_t i = 0; i < pool->perfmon.nperfmons; i++) {
+ assert(i * DRM_V3D_MAX_PERF_COUNTERS < pool->perfmon.ncounters);
+
+ struct drm_v3d_perfmon_create req = {
+ .ncounters = MIN2(pool->perfmon.ncounters -
+ i * DRM_V3D_MAX_PERF_COUNTERS,
+ DRM_V3D_MAX_PERF_COUNTERS),
+ };
+ memcpy(req.counters,
+ &pool->perfmon.counters[i * DRM_V3D_MAX_PERF_COUNTERS],
+ req.ncounters);
+
+ int ret = v3dv_ioctl(device->pdevice->render_fd,
+ DRM_IOCTL_V3D_PERFMON_CREATE,
+ &req);
+ if (ret)
+ fprintf(stderr, "Failed to create perfmon: %s\n", strerror(ret));
+
+ pool->queries[query].perf.kperfmon_ids[i] = req.id;
+ }
+}
+
+static void
+kperfmon_destroy(struct v3dv_device *device,
+ struct v3dv_query_pool *pool,
+ uint32_t query)
+{
+ /* Skip destroying if never created */
+ if (!pool->queries[query].perf.kperfmon_ids[0])
+ return;
+
+ for (uint32_t i = 0; i < pool->perfmon.nperfmons; i++) {
+ struct drm_v3d_perfmon_destroy req = {
+ .id = pool->queries[query].perf.kperfmon_ids[i]
+ };
+
+ int ret = v3dv_ioctl(device->pdevice->render_fd,
+ DRM_IOCTL_V3D_PERFMON_DESTROY,
+ &req);
+
+ if (ret) {
+ fprintf(stderr, "Failed to destroy perfmon %u: %s\n",
+ req.id, strerror(ret));
+ }
+ }
+}
+
VKAPI_ATTR VkResult VKAPI_CALL
v3dv_CreateQueryPool(VkDevice _device,
const VkQueryPoolCreateInfo *pCreateInfo,
@@ -34,7 +176,8 @@ v3dv_CreateQueryPool(VkDevice _device,
V3DV_FROM_HANDLE(v3dv_device, device, _device);
assert(pCreateInfo->queryType == VK_QUERY_TYPE_OCCLUSION ||
- pCreateInfo->queryType == VK_QUERY_TYPE_TIMESTAMP);
+ pCreateInfo->queryType == VK_QUERY_TYPE_TIMESTAMP ||
+ pCreateInfo->queryType == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR);
assert(pCreateInfo->queryCount > 0);
struct v3dv_query_pool *pool =
@@ -46,6 +189,7 @@ v3dv_CreateQueryPool(VkDevice _device,
pool->query_type = pCreateInfo->queryType;
pool->query_count = pCreateInfo->queryCount;
+ uint32_t query_idx = 0;
VkResult result;
const uint32_t pool_bytes = sizeof(struct v3dv_query) * pool->query_count;
@@ -56,7 +200,8 @@ v3dv_CreateQueryPool(VkDevice _device,
goto fail;
}
- if (pool->query_type == VK_QUERY_TYPE_OCCLUSION) {
+ switch (pool->query_type) {
+ case VK_QUERY_TYPE_OCCLUSION: {
/* The hardware allows us to setup groups of 16 queries in consecutive
* 4-byte addresses, requiring only that each group of 16 queries is
* aligned to a 1024 byte boundary.
@@ -72,22 +217,56 @@ v3dv_CreateQueryPool(VkDevice _device,
result = vk_error(device, VK_ERROR_OUT_OF_DEVICE_MEMORY);
goto fail;
}
+ break;
}
+ case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: {
+ const VkQueryPoolPerformanceCreateInfoKHR *pq_info =
+ vk_find_struct_const(pCreateInfo->pNext,
+ QUERY_POOL_PERFORMANCE_CREATE_INFO_KHR);
+
+ assert(pq_info);
+ assert(pq_info->counterIndexCount <= V3D_PERFCNT_NUM);
+
+ pool->perfmon.ncounters = pq_info->counterIndexCount;
+ for (uint32_t i = 0; i < pq_info->counterIndexCount; i++)
+ pool->perfmon.counters[i] = pq_info->pCounterIndices[i];
+
+ pool->perfmon.nperfmons = DIV_ROUND_UP(pool->perfmon.ncounters,
+ DRM_V3D_MAX_PERF_COUNTERS);
- uint32_t i;
- for (i = 0; i < pool->query_count; i++) {
- pool->queries[i].maybe_available = false;
+ assert(pool->perfmon.nperfmons <= V3DV_MAX_PERFMONS);
+ break;
+ }
+ case VK_QUERY_TYPE_TIMESTAMP:
+ break;
+ default:
+ unreachable("Unsupported query type");
+ }
+
+ for (; query_idx < pool->query_count; query_idx++) {
+ pool->queries[query_idx].maybe_available = false;
switch (pool->query_type) {
case VK_QUERY_TYPE_OCCLUSION: {
- const uint32_t query_group = i / 16;
- const uint32_t query_offset = query_group * 1024 + (i % 16) * 4;
- pool->queries[i].bo = pool->bo;
- pool->queries[i].offset = query_offset;
+ const uint32_t query_group = query_idx / 16;
+ const uint32_t query_offset = query_group * 1024 + (query_idx % 16) * 4;
+ pool->queries[query_idx].bo = pool->bo;
+ pool->queries[query_idx].offset = query_offset;
break;
}
case VK_QUERY_TYPE_TIMESTAMP:
- pool->queries[i].value = 0;
+ pool->queries[query_idx].value = 0;
+ break;
+ case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: {
+ result = vk_sync_create(&device->vk,
+ &device->pdevice->drm_syncobj_type, 0, 0,
+ &pool->queries[query_idx].perf.last_job_sync);
+ if (result != VK_SUCCESS)
+ goto fail;
+
+ for (uint32_t j = 0; j < pool->perfmon.nperfmons; j++)
+ pool->queries[query_idx].perf.kperfmon_ids[j] = 0;
break;
+ }
default:
unreachable("Unsupported query type");
}
@@ -98,6 +277,11 @@ v3dv_CreateQueryPool(VkDevice _device,
return VK_SUCCESS;
fail:
+ if (pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
+ for (uint32_t j = 0; j < query_idx; j++)
+ vk_sync_destroy(&device->vk, pool->queries[j].perf.last_job_sync);
+ }
+
if (pool->bo)
v3dv_bo_free(device, pool->bo);
if (pool->queries)
@@ -121,6 +305,13 @@ v3dv_DestroyQueryPool(VkDevice _device,
if (pool->bo)
v3dv_bo_free(device, pool->bo);
+ if (pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
+ for (uint32_t i = 0; i < pool->query_count; i++) {
+ kperfmon_destroy(device, pool, i);
+ vk_sync_destroy(&device->vk, pool->queries[i].perf.last_job_sync);
+ }
+ }
+
if (pool->queries)
vk_free2(&device->vk.alloc, pAllocator, pool->queries);
@@ -128,7 +319,7 @@ v3dv_DestroyQueryPool(VkDevice _device,
}
static void
-write_query_result(void *dst, uint32_t idx, bool do_64bit, uint64_t value)
+write_to_buffer(void *dst, uint32_t idx, bool do_64bit, uint64_t value)
{
if (do_64bit) {
uint64_t *dst64 = (uint64_t *) dst;
@@ -177,13 +368,91 @@ query_wait_available(struct v3dv_device *device,
!v3dv_bo_wait(device, q->bo, 0xffffffffffffffffull))
return vk_device_set_lost(&device->vk, "Query BO wait failed: %m");
+ if (query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR &&
+ vk_sync_wait(&device->vk, q->perf.last_job_sync,
+ 0, VK_SYNC_WAIT_COMPLETE, UINT64_MAX) != VK_SUCCESS)
+ return vk_device_set_lost(&device->vk, "Query job wait failed");
+
return VK_SUCCESS;
}
static VkResult
-query_is_available(struct v3dv_device *device,
- struct v3dv_query *q,
- VkQueryType query_type)
+write_occlusion_query_result(struct v3dv_device *device,
+ struct v3dv_query_pool *pool,
+ uint32_t query,
+ bool do_64bit,
+ void *data,
+ uint32_t slot)
+{
+ assert(pool && pool->query_type == VK_QUERY_TYPE_OCCLUSION);
+
+ if (vk_device_is_lost(&device->vk))
+ return VK_ERROR_DEVICE_LOST;
+
+ struct v3dv_query *q = &pool->queries[query];
+ assert(q->bo && q->bo->map);
+
+ const uint8_t *query_addr = ((uint8_t *) q->bo->map) + q->offset;
+ write_to_buffer(data, slot, do_64bit, (uint64_t) *((uint32_t *)query_addr));
+ return VK_SUCCESS;
+}
+
+static VkResult
+write_timestamp_query_result(struct v3dv_device *device,
+ struct v3dv_query_pool *pool,
+ uint32_t query,
+ bool do_64bit,
+ void *data,
+ uint32_t slot)
+{
+ assert(pool && pool->query_type == VK_QUERY_TYPE_TIMESTAMP);
+
+ struct v3dv_query *q = &pool->queries[query];
+
+ write_to_buffer(data, slot, do_64bit, q->value);
+ return VK_SUCCESS;
+}
+
+static VkResult
+write_performance_query_result(struct v3dv_device *device,
+ struct v3dv_query_pool *pool,
+ uint32_t query,
+ bool do_64bit,
+ void *data,
+ uint32_t slot)
+{
+ assert(pool && pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR);
+
+ struct v3dv_query *q = &pool->queries[query];
+ uint64_t counter_values[V3D_PERFCNT_NUM];
+
+ for (uint32_t i = 0; i < pool->perfmon.nperfmons; i++) {
+ struct drm_v3d_perfmon_get_values req = {
+ .id = q->perf.kperfmon_ids[i],
+ .values_ptr = (uintptr_t)(&counter_values[i *
+ DRM_V3D_MAX_PERF_COUNTERS])
+ };
+
+ int ret = v3dv_ioctl(device->pdevice->render_fd,
+ DRM_IOCTL_V3D_PERFMON_GET_VALUES,
+ &req);
+
+ if (ret) {
+ fprintf(stderr, "failed to get perfmon values: %s\n", strerror(ret));
+ return vk_error(device, VK_ERROR_DEVICE_LOST);
+ }
+ }
+
+ for (uint32_t i = 0; i < pool->perfmon.ncounters; i++)
+ write_to_buffer(data, slot + i, do_64bit, counter_values[i]);
+
+ return VK_SUCCESS;
+}
+
+static VkResult
+query_check_available(struct v3dv_device *device,
+ struct v3dv_query *q,
+ VkQueryType query_type)
{
if (!q->maybe_available)
return VK_NOT_READY;
@@ -192,70 +461,105 @@ query_is_available(struct v3dv_device *device,
!v3dv_bo_wait(device, q->bo, 0))
return VK_NOT_READY;
+ if (query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR &&
+ vk_sync_wait(&device->vk, q->perf.last_job_sync,
+ 0, VK_SYNC_WAIT_COMPLETE, 0) != VK_SUCCESS)
+ return VK_NOT_READY;
+
return VK_SUCCESS;
}
static VkResult
-get_query_result(struct v3dv_device *device,
- struct v3dv_query_pool *pool,
- uint32_t query,
- bool do_wait,
- bool *available,
- uint64_t *value)
+write_query_result(struct v3dv_device *device,
+ struct v3dv_query_pool *pool,
+ uint32_t query,
+ bool do_64bit,
+ void *data,
+ uint32_t slot)
+{
+ switch (pool->query_type) {
+ case VK_QUERY_TYPE_OCCLUSION:
+ return write_occlusion_query_result(device, pool, query, do_64bit,
+ data, slot);
+ case VK_QUERY_TYPE_TIMESTAMP:
+ return write_timestamp_query_result(device, pool, query, do_64bit,
+ data, slot);
+ case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR:
+ return write_performance_query_result(device, pool, query, do_64bit,
+ data, slot);
+ default:
+ unreachable("Unsupported query type");
+ }
+}
+
+static VkResult
+query_is_available(struct v3dv_device *device,
+ struct v3dv_query_pool *pool,
+ uint32_t query,
+ bool do_wait,
+ bool *available)
{
struct v3dv_query *q = &pool->queries[query];
+ assert(pool->query_type != VK_QUERY_TYPE_OCCLUSION ||
+ (q->bo && q->bo->map));
+
if (do_wait) {
VkResult result = query_wait_available(device, q, pool->query_type);
- if (result != VK_SUCCESS)
+ if (result != VK_SUCCESS) {
+ *available = false;
return result;
+ }
*available = true;
} else {
- VkResult result = query_is_available(device, q, pool->query_type);
+ VkResult result = query_check_available(device, q, pool->query_type);
assert(result == VK_SUCCESS || result == VK_NOT_READY);
*available = (result == VK_SUCCESS);
}
- switch (pool->query_type) {
- case VK_QUERY_TYPE_OCCLUSION: {
- const uint8_t *query_addr = ((uint8_t *) q->bo->map) + q->offset;
- *value = (uint64_t) *((uint32_t *)query_addr);
- return VK_SUCCESS;
- }
+ return VK_SUCCESS;
+}
+static uint32_t
+get_query_result_count(struct v3dv_query_pool *pool)
+{
+ switch (pool->query_type) {
+ case VK_QUERY_TYPE_OCCLUSION:
case VK_QUERY_TYPE_TIMESTAMP:
- *value = q->value;
- return VK_SUCCESS;
-
+ return 1;
+ case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR:
+ return pool->perfmon.ncounters;
default:
unreachable("Unsupported query type");
}
}
VkResult
-v3dv_get_query_pool_results_cpu(struct v3dv_device *device,
- struct v3dv_query_pool *pool,
- uint32_t first,
- uint32_t count,
- void *data,
- VkDeviceSize stride,
- VkQueryResultFlags flags)
+v3dv_get_query_pool_results(struct v3dv_device *device,
+ struct v3dv_query_pool *pool,
+ uint32_t first,
+ uint32_t count,
+ void *data,
+ VkDeviceSize stride,
+ VkQueryResultFlags flags)
{
assert(first < pool->query_count);
assert(first + count <= pool->query_count);
assert(data);
- const bool do_64bit = flags & VK_QUERY_RESULT_64_BIT;
+ const bool do_64bit = flags & VK_QUERY_RESULT_64_BIT ||
+ pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR;
const bool do_wait = flags & VK_QUERY_RESULT_WAIT_BIT;
const bool do_partial = flags & VK_QUERY_RESULT_PARTIAL_BIT;
+ uint32_t result_count = get_query_result_count(pool);
+
VkResult result = VK_SUCCESS;
for (uint32_t i = first; i < first + count; i++) {
bool available = false;
- uint64_t value = 0;
VkResult query_result =
- get_query_result(device, pool, i, do_wait, &available, &value);
+ query_is_available(device, pool, i, do_wait, &available);
if (query_result == VK_ERROR_DEVICE_LOST)
result = VK_ERROR_DEVICE_LOST;
@@ -273,11 +577,11 @@ v3dv_get_query_pool_results_cpu(struct v3dv_device *device,
const bool write_result = available || do_partial;
if (write_result)
- write_query_result(data, slot, do_64bit, value);
- slot++;
+ write_query_result(device, pool, i, do_64bit, data, slot);
+ slot += result_count;
if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT)
- write_query_result(data, slot++, do_64bit, available ? 1u : 0u);
+ write_to_buffer(data, slot++, do_64bit, available ? 1u : 0u);
if (!write_result && result != VK_ERROR_DEVICE_LOST)
result = VK_NOT_READY;
@@ -301,8 +605,8 @@ v3dv_GetQueryPoolResults(VkDevice _device,
V3DV_FROM_HANDLE(v3dv_device, device, _device);
V3DV_FROM_HANDLE(v3dv_query_pool, pool, queryPool);
- return v3dv_get_query_pool_results_cpu(device, pool, firstQuery, queryCount,
- pData, stride, flags);
+ return v3dv_get_query_pool_results(device, pool, firstQuery, queryCount,
+ pData, stride, flags);
}
VKAPI_ATTR void VKAPI_CALL
@@ -381,6 +685,12 @@ v3dv_reset_query_pools(struct v3dv_device *device,
case VK_QUERY_TYPE_TIMESTAMP:
q->value = 0;
break;
+ case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR:
+ kperfmon_destroy(device, pool, i);
+ kperfmon_create(device, pool, i);
+ if (vk_sync_reset(&device->vk, q->perf.last_job_sync) != VK_SUCCESS)
+ fprintf(stderr, "Failed to reset sync");
+ break;
default:
unreachable("Unsupported query type");
}
@@ -400,3 +710,69 @@ v3dv_ResetQueryPool(VkDevice _device,
v3dv_reset_query_pools(device, pool, firstQuery, queryCount);
}
+
+/* Implements vkEnumeratePhysicalDeviceQueueFamilyPerformanceQueryCountersKHR:
+ * reports one VkPerformanceCounterKHR (plus a matching description) per entry
+ * of the static v3dv_counters table, whose entries are laid out as
+ * { category, name, description }.
+ */
+VKAPI_ATTR VkResult VKAPI_CALL
+v3dv_EnumeratePhysicalDeviceQueueFamilyPerformanceQueryCountersKHR(
+ VkPhysicalDevice physicalDevice,
+ uint32_t queueFamilyIndex,
+ uint32_t *pCounterCount,
+ VkPerformanceCounterKHR *pCounters,
+ VkPerformanceCounterDescriptionKHR *pCounterDescriptions)
+{
+ /* Keep a local copy of the in/out count for the description outarray so
+  * that *pCounterCount itself is only managed by the pCounters outarray.
+  */
+ uint32_t desc_count = *pCounterCount;
+
+ VK_OUTARRAY_MAKE_TYPED(VkPerformanceCounterKHR,
+ out, pCounters, pCounterCount);
+ VK_OUTARRAY_MAKE_TYPED(VkPerformanceCounterDescriptionKHR,
+ out_desc, pCounterDescriptions, &desc_count);
+
+ for (int i = 0; i < ARRAY_SIZE(v3dv_counters); i++) {
+ vk_outarray_append_typed(VkPerformanceCounterKHR, &out, counter) {
+ /* Every counter is exposed as a generic, command-scoped, 64-bit
+  * unsigned value.
+  */
+ counter->unit = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR;
+ counter->scope = VK_QUERY_SCOPE_COMMAND_KHR;
+ counter->storage = VK_PERFORMANCE_COUNTER_STORAGE_UINT64_KHR;
+
+ /* Derive a stable UUID from a SHA1 hash of the counter name so it
+  * does not change across driver builds.
+  */
+ unsigned char sha1_result[20];
+ _mesa_sha1_compute(v3dv_counters[i][1], strlen(v3dv_counters[i][1]),
+ sha1_result);
+
+ memcpy(counter->uuid, sha1_result, sizeof(counter->uuid));
+ }
+
+ vk_outarray_append_typed(VkPerformanceCounterDescriptionKHR,
+ &out_desc, desc) {
+ desc->flags = 0;
+ snprintf(desc->name, sizeof(desc->name), "%s",
+ v3dv_counters[i][1]);
+ snprintf(desc->category, sizeof(desc->category), "%s",
+ v3dv_counters[i][0]);
+ snprintf(desc->description, sizeof(desc->description), "%s",
+ v3dv_counters[i][2]);
+ }
+ }
+
+ /* NOTE(review): only the counters outarray status is returned; the
+  * descriptions array is the same size, so its status matches.
+  */
+ return vk_outarray_status(&out);
+}
+
+/* Implements vkGetPhysicalDeviceQueueFamilyPerformanceQueryPassesKHR: the
+ * kernel can program at most DRM_V3D_MAX_PERF_COUNTERS counters into a single
+ * perfmon, so a query needing more counters must be replayed over
+ * ceil(counterIndexCount / DRM_V3D_MAX_PERF_COUNTERS) passes.
+ */
+VKAPI_ATTR void VKAPI_CALL
+v3dv_GetPhysicalDeviceQueueFamilyPerformanceQueryPassesKHR(
+ VkPhysicalDevice physicalDevice,
+ const VkQueryPoolPerformanceCreateInfoKHR *pPerformanceQueryCreateInfo,
+ uint32_t *pNumPasses)
+{
+ *pNumPasses = DIV_ROUND_UP(pPerformanceQueryCreateInfo->counterIndexCount,
+ DRM_V3D_MAX_PERF_COUNTERS);
+}
+
+/* Implements vkAcquireProfilingLockKHR. This driver does not need exclusive
+ * access to the performance hardware to record profiling commands, so
+ * acquiring the lock is a no-op that always succeeds.
+ */
+VKAPI_ATTR VkResult VKAPI_CALL
+v3dv_AcquireProfilingLockKHR(
+ VkDevice _device,
+ const VkAcquireProfilingLockInfoKHR *pInfo)
+{
+ return VK_SUCCESS;
+}
+
+/* Implements vkReleaseProfilingLockKHR. Nothing was acquired in
+ * v3dv_AcquireProfilingLockKHR, so there is nothing to release.
+ */
+VKAPI_ATTR void VKAPI_CALL
+v3dv_ReleaseProfilingLockKHR(VkDevice device)
+{
+}
diff --git a/src/broadcom/vulkan/v3dv_queue.c b/src/broadcom/vulkan/v3dv_queue.c
index 799139b9174..a3d92466d88 100644
--- a/src/broadcom/vulkan/v3dv_queue.c
+++ b/src/broadcom/vulkan/v3dv_queue.c
@@ -137,27 +137,129 @@ handle_reset_query_cpu_job(struct v3dv_queue *queue, struct v3dv_job *job,
if (info->pool->query_type == VK_QUERY_TYPE_OCCLUSION)
v3dv_bo_wait(job->device, info->pool->bo, PIPE_TIMEOUT_INFINITE);
+ if (info->pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
+ struct vk_sync_wait waits[info->count];
+ unsigned wait_count = 0;
+ for (int i = 0; i < info->count; i++) {
+ struct v3dv_query *query = &info->pool->queries[i];
+ /* Only wait for a query if we've used it otherwise we will be
+ * waiting forever for the fence to become signaled.
+ */
+ if (query->maybe_available) {
+ waits[wait_count] = (struct vk_sync_wait){
+ .sync = info->pool->queries[i].perf.last_job_sync
+ };
+ wait_count++;
+ };
+ }
+
+ VkResult result = vk_sync_wait_many(&job->device->vk, wait_count, waits,
+ VK_SYNC_WAIT_COMPLETE, UINT64_MAX);
+
+ if (result != VK_SUCCESS)
+ return result;
+ }
+
v3dv_reset_query_pools(job->device, info->pool, info->first, info->count);
return VK_SUCCESS;
}
+/* Exports the queue's last-submitted-job syncobj(s) as a sync file and
+ * returns it through *fd, so the caller can import it into per-query
+ * last_job_sync objects. With multisync support the CL and CSD queue syncs
+ * are accumulated into a single sync file; otherwise the single
+ * V3DV_QUEUE_ANY sync is exported directly. On success the caller owns *fd
+ * and must close it.
+ */
static VkResult
-handle_end_query_cpu_job(struct v3dv_job *job)
+export_perfmon_last_job_sync(struct v3dv_queue *queue, struct v3dv_job *job, int *fd)
+{
+ int err;
+ if (job->device->pdevice->caps.multisync) {
+ static const enum v3dv_queue_type queues_to_sync[] = {
+ V3DV_QUEUE_CL,
+ V3DV_QUEUE_CSD,
+ };
+
+ for (uint32_t i = 0; i < ARRAY_SIZE(queues_to_sync); i++) {
+ enum v3dv_queue_type queue_type = queues_to_sync[i];
+ int tmp_fd = -1;
+
+ err = drmSyncobjExportSyncFile(job->device->pdevice->render_fd,
+ queue->last_job_syncs.syncs[queue_type],
+ &tmp_fd);
+
+ if (err) {
+ /* NOTE(review): on the first iteration *fd may still be -1 here;
+  * close(-1) just fails with EBADF, so this is harmless.
+  */
+ close(*fd);
+ return vk_errorf(&job->device->queue, VK_ERROR_UNKNOWN,
+ "sync file export failed: %m");
+ }
+
+ /* Merge tmp_fd into the accumulated sync file in *fd. */
+ err = sync_accumulate("v3dv", fd, tmp_fd);
+
+ if (err) {
+ close(tmp_fd);
+ close(*fd);
+ return vk_errorf(&job->device->queue, VK_ERROR_UNKNOWN,
+ "failed to accumulate sync files: %m");
+ }
+ /* NOTE(review): tmp_fd is only closed on the error path above —
+  * confirm sync_accumulate() does not take ownership of its fd2
+  * argument, otherwise tmp_fd leaks on every successful iteration.
+  */
+ }
+ } else {
+ err = drmSyncobjExportSyncFile(job->device->pdevice->render_fd,
+ queue->last_job_syncs.syncs[V3DV_QUEUE_ANY],
+ fd);
+
+ if (err) {
+ return vk_errorf(&job->device->queue, VK_ERROR_UNKNOWN,
+ "sync file export failed: %m");
+ }
+ }
+ return VK_SUCCESS;
+}
+
+/* CPU job run at submit time when a query range ends. For performance
+ * queries it snapshots the queue's last-job syncs as a sync file and imports
+ * it into each query's perf.last_job_sync syncobj, so result retrieval can
+ * wait for the GPU work covering the query. All queries in the range are
+ * then flagged maybe_available and threads blocked in result collection are
+ * woken via the query_ended condition variable.
+ *
+ * NOTE(review): counter_pass_idx is not used in this body — presumably kept
+ * so all job handlers share the same call shape; confirm.
+ */
+static VkResult
+handle_end_query_cpu_job(struct v3dv_job *job, uint32_t counter_pass_idx)
{
+ VkResult result = VK_SUCCESS;
+
mtx_lock(&job->device->query_mutex);
struct v3dv_end_query_cpu_job_info *info = &job->cpu.query_end;
+ struct v3dv_queue *queue = &job->device->queue;
+
+ int err = 0;
+ int fd = -1;
+
+ if (info->pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
+ result = export_perfmon_last_job_sync(queue, job, &fd);
+
+ if (result != VK_SUCCESS)
+ goto fail;
+
+ assert(fd >= 0);
+ }
+
for (uint32_t i = 0; i < info->count; i++) {
assert(info->query + i < info->pool->query_count);
struct v3dv_query *query = &info->pool->queries[info->query + i];
+
+ if (info->pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
+ /* Importing the same sync file into each query's syncobj makes the
+  * syncobj signal when the exported jobs complete.
+  */
+ uint32_t syncobj = vk_sync_as_drm_syncobj(query->perf.last_job_sync)->syncobj;
+ err = drmSyncobjImportSyncFile(job->device->pdevice->render_fd,
+ syncobj, fd);
+
+ if (err) {
+ result = vk_errorf(queue, VK_ERROR_UNKNOWN,
+ "sync file import failed: %m");
+ goto fail;
+ }
+ }
+
query->maybe_available = true;
}
+fail:
+ /* fd may be -1 if the export failed before producing one; close(-1) is a
+  * harmless EBADF. The broadcast below still runs on failure so waiters
+  * are not left blocked.
+  */
+ if (info->pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR)
+ close(fd);
+
cnd_broadcast(&job->device->query_ended);
mtx_unlock(&job->device->query_mutex);
- return VK_SUCCESS;
+ return result;
}
static VkResult
@@ -176,13 +278,13 @@ handle_copy_query_results_cpu_job(struct v3dv_job *job)
uint8_t *offset = ((uint8_t *) bo->map) +
info->offset + info->dst->mem_offset;
- v3dv_get_query_pool_results_cpu(job->device,
- info->pool,
- info->first,
- info->count,
- offset,
- info->stride,
- info->flags);
+ v3dv_get_query_pool_results(job->device,
+ info->pool,
+ info->first,
+ info->count,
+ offset,
+ info->stride,
+ info->flags);
return VK_SUCCESS;
}
@@ -635,6 +737,7 @@ fail:
static VkResult
handle_cl_job(struct v3dv_queue *queue,
struct v3dv_job *job,
+ uint32_t counter_pass_idx,
struct v3dv_submit_sync_info *sync_info,
bool signal_syncs)
{
@@ -678,9 +781,15 @@ handle_cl_job(struct v3dv_queue *queue,
assert(bo_idx == submit.bo_handle_count);
submit.bo_handles = (uintptr_t)(void *)bo_handles;
+ submit.perfmon_id = job->perf ?
+ job->perf->kperfmon_ids[counter_pass_idx] : 0;
+ const bool needs_perf_sync = queue->last_perfmon_id != submit.perfmon_id;
+ queue->last_perfmon_id = submit.perfmon_id;
+
/* We need a binning sync if we are waiting on a semaphore with a wait stage
* that involves the geometry pipeline, or if the job comes after a pipeline
- * barrier that involves geometry stages (needs_bcl_sync).
+ * barrier that involves geometry stages (needs_bcl_sync), or if
+ * performance queries are in use.
*
* We need a render sync if the job doesn't need a binning sync but has
* still been flagged for serialization. It should be noted that RCL jobs
@@ -705,6 +814,7 @@ handle_cl_job(struct v3dv_queue *queue,
VK_PIPELINE_STAGE_TESSELLATION_EVALUATION_SHADER_BIT |
VK_PIPELINE_STAGE_GEOMETRY_SHADER_BIT);
}
+ needs_bcl_sync |= needs_perf_sync;
bool needs_rcl_sync = job->serialize && !needs_bcl_sync;
/* Replace single semaphore settings whenever our kernel-driver supports
@@ -795,6 +905,7 @@ handle_tfu_job(struct v3dv_queue *queue,
static VkResult
handle_csd_job(struct v3dv_queue *queue,
struct v3dv_job *job,
+ uint32_t counter_pass_idx,
struct v3dv_submit_sync_info *sync_info,
bool signal_syncs)
{
@@ -835,6 +946,9 @@ handle_csd_job(struct v3dv_queue *queue,
submit->in_sync = needs_sync ? last_job_sync : 0;
submit->out_sync = last_job_sync;
}
+ submit->perfmon_id = job->perf ?
+ job->perf->kperfmon_ids[counter_pass_idx] : 0;
+ queue->last_perfmon_id = submit->perfmon_id;
int ret = v3dv_ioctl(device->pdevice->render_fd,
DRM_IOCTL_V3D_SUBMIT_CSD, submit);
@@ -858,20 +972,21 @@ handle_csd_job(struct v3dv_queue *queue,
static VkResult
queue_handle_job(struct v3dv_queue *queue,
struct v3dv_job *job,
+ uint32_t counter_pass_idx,
struct v3dv_submit_sync_info *sync_info,
bool signal_syncs)
{
switch (job->type) {
case V3DV_JOB_TYPE_GPU_CL:
- return handle_cl_job(queue, job, sync_info, signal_syncs);
+ return handle_cl_job(queue, job, counter_pass_idx, sync_info, signal_syncs);
case V3DV_JOB_TYPE_GPU_TFU:
return handle_tfu_job(queue, job, sync_info, signal_syncs);
case V3DV_JOB_TYPE_GPU_CSD:
- return handle_csd_job(queue, job, sync_info, signal_syncs);
+ return handle_csd_job(queue, job, counter_pass_idx, sync_info, signal_syncs);
case V3DV_JOB_TYPE_CPU_RESET_QUERIES:
return handle_reset_query_cpu_job(queue, job, sync_info);
case V3DV_JOB_TYPE_CPU_END_QUERY:
- return handle_end_query_cpu_job(job);
+ return handle_end_query_cpu_job(job, counter_pass_idx);
case V3DV_JOB_TYPE_CPU_COPY_QUERY_RESULTS:
return handle_copy_query_results_cpu_job(job);
case V3DV_JOB_TYPE_CPU_SET_EVENT:
@@ -913,6 +1028,7 @@ queue_create_noop_job(struct v3dv_queue *queue)
static VkResult
queue_submit_noop_job(struct v3dv_queue *queue,
+ uint32_t counter_pass_idx,
struct v3dv_submit_sync_info *sync_info,
bool signal_syncs)
{
@@ -923,7 +1039,8 @@ queue_submit_noop_job(struct v3dv_queue *queue,
}
assert(queue->noop_job);
- return queue_handle_job(queue, queue->noop_job, sync_info, signal_syncs);
+ return queue_handle_job(queue, queue->noop_job, counter_pass_idx,
+ sync_info, signal_syncs);
}
VkResult
@@ -953,7 +1070,8 @@ v3dv_queue_driver_submit(struct vk_queue *vk_queue,
list_for_each_entry_safe(struct v3dv_job, job,
&cmd_buffer->jobs, list_link) {
- result = queue_handle_job(queue, job, &sync_info, false);
+ result = queue_handle_job(queue, job, submit->perf_pass_index,
+ &sync_info, false);
if (result != VK_SUCCESS)
return result;
}
@@ -964,7 +1082,8 @@ v3dv_queue_driver_submit(struct vk_queue *vk_queue,
* barrier state to limit the queues we serialize against.
*/
if (cmd_buffer->state.barrier.dst_mask) {
- result = queue_submit_noop_job(queue, &sync_info, false);
+ result = queue_submit_noop_job(queue, submit->perf_pass_index,
+ &sync_info, false);
if (result != VK_SUCCESS)
return result;
}
@@ -976,7 +1095,8 @@ v3dv_queue_driver_submit(struct vk_queue *vk_queue,
* requirements.
*/
if (submit->signal_count > 0) {
- result = queue_submit_noop_job(queue, &sync_info, true);
+ result = queue_submit_noop_job(queue, submit->perf_pass_index,
+ &sync_info, true);
if (result != VK_SUCCESS)
return result;
}