intel/perf: fix performance counters availability after glFinish

Currently, the Linux kernel gathers performance counters at fixed intervals (~5-10ms), so if an application uses the AMD_performance_monitor extension and asks the GL driver for HW performance counter values immediately after glFinish(), it might not get any data (values == 0). Fix this by moving the "read counters from kernel" code from the "is query ready" callback to the "get counter values" callback, with a loop around it. Unfortunately, this means that the "read counters from kernel" code can spin for up to 10ms. Ideally, the kernel should gather performance counters whenever we ask it for counter values, but for now we have to deal with what we have. Signed-off-by: Marcin Ślusarz <marcin.slusarz@intel.com> Cc: <mesa-stable@lists.freedesktop.org> Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/5788> (cherry picked from commit 2fbab5a1b3998854203c43560a47beac4e1897b7)
author: Marcin Ślusarz <marcin.slusarz@intel.com> 2020-07-10 19:25:10 +0200
committer: Eric Engestrom <eric@engestrom.ch> 2020-08-19 22:28:29 +0200
commit: da36f4f356399a3fc8c18190fccb78677416802f (patch)
tree: 8ddd2b1782f5803c4d91f4b0294a3c92d2cda02d
parent: 204b736d1900e9a1338bb9e3bcde47d00d5a6784 (diff)
6 files changed, 20 insertions, 17 deletions
diff --git a/.pick_status.json b/.pick_status.json
index 6c0a7f30020..2b12351b344 100644
--- a/.pick_status.json
+++ b/.pick_status.json
@@ -3424,7 +3424,7 @@
         "description": "intel/perf: fix performance counters availability after glFinish",
         "nominated": true,
         "nomination_type": 0,
-        "resolution": 0,
+        "resolution": 1,
         "master_sha": null,
         "because_sha": null
     },
diff --git a/src/gallium/drivers/iris/iris_monitor.c b/src/gallium/drivers/iris/iris_monitor.c
index 838c0891b5f..730fae27182 100644
--- a/src/gallium/drivers/iris/iris_monitor.c
+++ b/src/gallium/drivers/iris/iris_monitor.c
@@ -355,7 +355,7 @@ iris_get_monitor_result(struct pipe_context *ctx,
    assert(gen_perf_is_query_ready(perf_ctx, monitor->query, batch));
 
    unsigned bytes_written;
-   gen_perf_get_query_data(perf_ctx, monitor->query,
+   gen_perf_get_query_data(perf_ctx, monitor->query, batch,
                            monitor->result_size,
                            (unsigned*) monitor->result_buffer,
                            &bytes_written);
diff --git a/src/gallium/drivers/iris/iris_performance_query.c b/src/gallium/drivers/iris/iris_performance_query.c
index 0192635601a..73b25081119 100644
--- a/src/gallium/drivers/iris/iris_performance_query.c
+++ b/src/gallium/drivers/iris/iris_performance_query.c
@@ -214,7 +214,8 @@ iris_get_perf_query_data(struct pipe_context *pipe,
    struct gen_perf_query_object *obj = perf_query->query;
    struct gen_perf_context *perf_ctx = ice->perf_ctx;
 
-   gen_perf_get_query_data(perf_ctx, obj, data_size, data, bytes_written);
+   gen_perf_get_query_data(perf_ctx, obj, &ice->batches[IRIS_BATCH_RENDER],
+         data_size, data, bytes_written);
 }
 
 void
diff --git a/src/intel/perf/gen_perf_query.c b/src/intel/perf/gen_perf_query.c
index 57b01203a6f..b9744913b16 100644
--- a/src/intel/perf/gen_perf_query.c
+++ b/src/intel/perf/gen_perf_query.c
@@ -1061,17 +1061,6 @@ gen_perf_wait_query(struct gen_perf_context *perf_ctx,
       perf_cfg->vtbl.batchbuffer_flush(perf_ctx->ctx, __FILE__, __LINE__);
 
    perf_cfg->vtbl.bo_wait_rendering(bo);
-
-   /* Due to a race condition between the OA unit signaling report
-    * availability and the report actually being written into memory,
-    * we need to wait for all the reports to come in before we can
-    * read them.
-    */
-   if (query->queryinfo->kind == GEN_PERF_QUERY_TYPE_OA ||
-       query->queryinfo->kind == GEN_PERF_QUERY_TYPE_RAW) {
-      while (!read_oa_samples_for_query(perf_ctx, query, current_batch))
-         ;
-   }
 }
 
 bool
@@ -1087,8 +1076,8 @@ gen_perf_is_query_ready(struct gen_perf_context *perf_ctx,
       return (query->oa.results_accumulated ||
               (query->oa.bo &&
                !perf_cfg->vtbl.batch_references(current_batch, query->oa.bo) &&
-               !perf_cfg->vtbl.bo_busy(query->oa.bo) &&
-               read_oa_samples_for_query(perf_ctx, query, current_batch)));
+               !perf_cfg->vtbl.bo_busy(query->oa.bo)));
+
    case GEN_PERF_QUERY_TYPE_PIPELINE:
       return (query->pipeline_stats.bo &&
               !perf_cfg->vtbl.batch_references(current_batch, query->pipeline_stats.bo) &&
@@ -1513,6 +1502,7 @@ get_pipeline_stats_data(struct gen_perf_context *perf_ctx,
 void
 gen_perf_get_query_data(struct gen_perf_context *perf_ctx,
                         struct gen_perf_query_object *query,
+                        void *current_batch,
                         int data_size,
                         unsigned *data,
                         unsigned *bytes_written)
@@ -1524,6 +1514,17 @@ gen_perf_get_query_data(struct gen_perf_context *perf_ctx,
    case GEN_PERF_QUERY_TYPE_OA:
    case GEN_PERF_QUERY_TYPE_RAW:
       if (!query->oa.results_accumulated) {
+         /* Due to the sampling frequency of the OA buffer by the i915-perf
+          * driver, there can be a 5ms delay between Mesa seeing the query
+          * complete and i915 making all the OA buffer reports available to us.
+          * We need to wait for all the reports to come in before we can do
+          * the post-processing that removes unrelated deltas.
+          * There is an i915-perf series to address this issue, but it has
+          * not been merged upstream yet.
+          */
+         while (!read_oa_samples_for_query(perf_ctx, query, current_batch))
+            ;
+
          read_gt_frequency(perf_ctx, query);
          uint32_t *begin_report = query->oa.map;
          uint32_t *end_report = query->oa.map + MI_RPC_BO_END_OFFSET_BYTES;
diff --git a/src/intel/perf/gen_perf_query.h b/src/intel/perf/gen_perf_query.h
index a0246501f76..d064a5d0669 100644
--- a/src/intel/perf/gen_perf_query.h
+++ b/src/intel/perf/gen_perf_query.h
@@ -76,6 +76,7 @@ void gen_perf_delete_query(struct gen_perf_context *perf_ctx,
                            struct gen_perf_query_object *query);
 void gen_perf_get_query_data(struct gen_perf_context *perf_ctx,
                              struct gen_perf_query_object *query,
+                             void *current_batch,
                              int data_size,
                              unsigned *data,
                              unsigned *bytes_written);
diff --git a/src/mesa/drivers/dri/i965/brw_performance_query.c b/src/mesa/drivers/dri/i965/brw_performance_query.c
index 7037e25679a..ebe2c78f275 100644
--- a/src/mesa/drivers/dri/i965/brw_performance_query.c
+++ b/src/mesa/drivers/dri/i965/brw_performance_query.c
@@ -323,7 +323,7 @@ brw_get_perf_query_data(struct gl_context *ctx,
     */
    assert(o->Ready);
 
-   gen_perf_get_query_data(brw->perf_ctx, obj,
+   gen_perf_get_query_data(brw->perf_ctx, obj, &brw->batch,
                            data_size, data, bytes_written);
 }
author	Marcin Ślusarz <marcin.slusarz@intel.com>	2020-07-10 19:25:10 +0200
committer	Eric Engestrom <eric@engestrom.ch>	2020-08-19 22:28:29 +0200
commit	da36f4f356399a3fc8c18190fccb78677416802f (patch)
tree	8ddd2b1782f5803c4d91f4b0294a3c92d2cda02d
parent	204b736d1900e9a1338bb9e3bcde47d00d5a6784 (diff)