diff options
author | Lionel Landwerlin <lionel.g.landwerlin@intel.com> | 2020-09-07 15:56:54 +0300 |
---|---|---|
committer | Marge Bot <eric+marge@anholt.net> | 2021-02-02 13:25:54 +0000 |
commit | 8750f43a9077b3b53f54505aaa2cc46fab5d4f90 (patch) | |
tree | 057cef574d07a7d70f320eec83b0cfda75c9e6c3 /src/intel | |
parent | f32d1bf5295ce420779b324c6935e68ac6ad8be4 (diff) |
intel/perf: add performance query layout using MI_SRM
For all supported generations we add a layout describing which registers
to store in order to implement an MI_RPC replacement.
This is because on Gen12 we need to snapshot the OAG registers to get
correct values for the perf equations. There, the MI_RPC instruction
captures the OAR registers, which do not have all the information we need.
v2: Fix commented code for debug (Marcin)
Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/6518>
Diffstat (limited to 'src/intel')
-rw-r--r-- | src/intel/perf/gen_perf.c | 86 | ||||
-rw-r--r-- | src/intel/perf/gen_perf.h | 11 | ||||
-rw-r--r-- | src/intel/perf/gen_perf_query.c | 3 | ||||
-rw-r--r-- | src/intel/perf/gen_perf_regs.h | 26 | ||||
-rw-r--r-- | src/intel/vulkan/genX_query.c | 6 |
5 files changed, 118 insertions, 14 deletions
diff --git a/src/intel/perf/gen_perf.c b/src/intel/perf/gen_perf.c index d50ede2b97f..fde8060ca4b 100644 --- a/src/intel/perf/gen_perf.c +++ b/src/intel/perf/gen_perf.c @@ -1027,9 +1027,16 @@ gen_perf_query_result_read_frequencies(struct gen_perf_query_result *result, &result->unslice_frequency[1]); } +static inline bool +can_use_mi_rpc_bc_counters(const struct gen_device_info *devinfo) +{ + return devinfo->gen <= 11; +} + void gen_perf_query_result_accumulate(struct gen_perf_query_result *result, const struct gen_perf_query_info *query, + const struct gen_device_info *devinfo, const uint32_t *start, const uint32_t *end) { @@ -1061,16 +1068,18 @@ gen_perf_query_result_accumulate(struct gen_perf_query_result *result, result->accumulator + query->a_offset + 32 + i); } - /* 8x 32bit B counters */ - for (i = 0; i < 8; i++) { - accumulate_uint32(start + 48 + i, end + 48 + i, - result->accumulator + query->b_offset + i); - } + if (can_use_mi_rpc_bc_counters(devinfo)) { + /* 8x 32bit B counters */ + for (i = 0; i < 8; i++) { + accumulate_uint32(start + 48 + i, end + 48 + i, + result->accumulator + query->b_offset + i); + } - /* 8x 32bit C counters... */ - for (i = 0; i < 8; i++) { - accumulate_uint32(start + 56 + i, end + 56 + i, - result->accumulator + query->c_offset + i); + /* 8x 32bit C counters... 
*/ + for (i = 0; i < 8; i++) { + accumulate_uint32(start + 56 + i, end + 56 + i, + result->accumulator + query->c_offset + i); + } } break; @@ -1138,6 +1147,10 @@ query_accumulator_offset(const struct gen_perf_query_info *query, switch (type) { case GEN_PERF_QUERY_FIELD_TYPE_SRM_PERFCNT: return query->perfcnt_offset + index; + case GEN_PERF_QUERY_FIELD_TYPE_SRM_OA_B: + return query->b_offset + index; + case GEN_PERF_QUERY_FIELD_TYPE_SRM_OA_C: + return query->c_offset + index; default: unreachable("Invalid register type"); return 0; @@ -1166,7 +1179,7 @@ gen_perf_query_result_accumulate_fields(struct gen_perf_query_result *result, * unrelated deltas, so don't accumulate the begin/end reports here. */ if (!no_oa_accumulate) { - gen_perf_query_result_accumulate(result, query, + gen_perf_query_result_accumulate(result, query, devinfo, start + field->location, end + field->location); } @@ -1205,6 +1218,35 @@ gen_perf_query_result_clear(struct gen_perf_query_result *result) result->hw_id = OA_REPORT_INVALID_CTX_ID; /* invalid */ } +void +gen_perf_query_result_print_fields(const struct gen_perf_query_info *query, + const struct gen_device_info *devinfo, + const void *data) +{ + const struct gen_perf_query_field_layout *layout = &query->perf->query_layout; + + for (uint32_t r = 0; r < layout->n_fields; r++) { + const struct gen_perf_query_field *field = &layout->fields[r]; + const uint32_t *value32 = data + field->location; + + switch (field->type) { + case GEN_PERF_QUERY_FIELD_TYPE_MI_RPC: + fprintf(stderr, "MI_RPC:\n"); + fprintf(stderr, " TS: 0x%08x\n", *(value32 + 1)); + fprintf(stderr, " CLK: 0x%08x\n", *(value32 + 3)); + break; + case GEN_PERF_QUERY_FIELD_TYPE_SRM_OA_B: + fprintf(stderr, "B%u: 0x%08x\n", field->index, *value32); + break; + case GEN_PERF_QUERY_FIELD_TYPE_SRM_OA_C: + fprintf(stderr, "C%u: 0x%08x\n", field->index, *value32); + break; + default: + break; + } + } +} + static int gen_perf_compare_query_names(const void *v1, const void *v2) { @@ -1252,6 
+1294,8 @@ gen_perf_init_query_fields(struct gen_perf_config *perf_cfg, /* MI_RPC requires a 64byte alignment. */ layout->alignment = 64; + layout->fields = rzalloc_array(perf_cfg, struct gen_perf_query_field, 5 + 16); + add_query_register(layout, GEN_PERF_QUERY_FIELD_TYPE_MI_RPC, 0, 256, 0); @@ -1280,6 +1324,28 @@ gen_perf_init_query_fields(struct gen_perf_config *perf_cfg, GEN9_RPSTAT0, 4, 0); } + if (!can_use_mi_rpc_bc_counters(devinfo)) { + if (devinfo->gen >= 8 && devinfo->gen <= 11) { + for (uint32_t i = 0; i < GEN8_N_OA_PERF_B32; i++) { + add_query_register(layout, GEN_PERF_QUERY_FIELD_TYPE_SRM_OA_B, + GEN8_OA_PERF_B32(i), 4, i); + } + for (uint32_t i = 0; i < GEN8_N_OA_PERF_C32; i++) { + add_query_register(layout, GEN_PERF_QUERY_FIELD_TYPE_SRM_OA_C, + GEN8_OA_PERF_C32(i), 4, i); + } + } else if (devinfo->gen == 12) { + for (uint32_t i = 0; i < GEN12_N_OAG_PERF_B32; i++) { + add_query_register(layout, GEN_PERF_QUERY_FIELD_TYPE_SRM_OA_B, + GEN12_OAG_PERF_B32(i), 4, i); + } + for (uint32_t i = 0; i < GEN12_N_OAG_PERF_C32; i++) { + add_query_register(layout, GEN_PERF_QUERY_FIELD_TYPE_SRM_OA_C, + GEN12_OAG_PERF_C32(i), 4, i); + } + } + } + /* Align the whole package to 64bytes so that 2 snapshots can be put * together without extract alignment for the user. 
*/ diff --git a/src/intel/perf/gen_perf.h b/src/intel/perf/gen_perf.h index b5e751b8d62..87d35ee1b11 100644 --- a/src/intel/perf/gen_perf.h +++ b/src/intel/perf/gen_perf.h @@ -266,7 +266,9 @@ struct gen_perf_query_field_layout { enum gen_perf_query_field_type { GEN_PERF_QUERY_FIELD_TYPE_MI_RPC, GEN_PERF_QUERY_FIELD_TYPE_SRM_PERFCNT, - GEN_PERF_QUERY_FIELD_TYPE_SRM_RPSTAT + GEN_PERF_QUERY_FIELD_TYPE_SRM_RPSTAT, + GEN_PERF_QUERY_FIELD_TYPE_SRM_OA_B, + GEN_PERF_QUERY_FIELD_TYPE_SRM_OA_C, } type; /* Index of register in the given type (for instance A31 or B2, @@ -431,6 +433,7 @@ void gen_perf_query_result_read_perfcnts(struct gen_perf_query_result *result, */ void gen_perf_query_result_accumulate(struct gen_perf_query_result *result, const struct gen_perf_query_info *query, + const struct gen_device_info *devinfo, const uint32_t *start, const uint32_t *end); @@ -446,6 +449,12 @@ void gen_perf_query_result_accumulate_fields(struct gen_perf_query_result *resul void gen_perf_query_result_clear(struct gen_perf_query_result *result); +/** Debug helper printing out query data. 
+ */ +void gen_perf_query_result_print_fields(const struct gen_perf_query_info *query, + const struct gen_device_info *devinfo, + const void *data); + static inline size_t gen_perf_query_counter_get_size(const struct gen_perf_query_counter *counter) { diff --git a/src/intel/perf/gen_perf_query.c b/src/intel/perf/gen_perf_query.c index a1204b830be..84854b9b58e 100644 --- a/src/intel/perf/gen_perf_query.c +++ b/src/intel/perf/gen_perf_query.c @@ -1309,6 +1309,7 @@ accumulate_oa_reports(struct gen_perf_context *perf_ctx, if (add) { gen_perf_query_result_accumulate(&query->oa.result, query->queryinfo, + devinfo, last, report); } else { /* We're not adding the delta because we've identified it's not @@ -1337,7 +1338,7 @@ accumulate_oa_reports(struct gen_perf_context *perf_ctx, end: gen_perf_query_result_accumulate(&query->oa.result, query->queryinfo, - last, end); + devinfo, last, end); query->oa.results_accumulated = true; drop_from_unaccumulated_query_list(perf_ctx, query); diff --git a/src/intel/perf/gen_perf_regs.h b/src/intel/perf/gen_perf_regs.h index f97e387e46b..51ac6ba2119 100644 --- a/src/intel/perf/gen_perf_regs.h +++ b/src/intel/perf/gen_perf_regs.h @@ -46,6 +46,32 @@ #define PERF_CNT_2_DW0 0x91c0 #define PERF_CNT_VALUE_MASK ((1ull << 44) - 1) +/* Global OA perf counters */ +#define GEN7_N_OA_PERF_A32 44 +#define GEN7_OA_PERF_A32(idx) (0x2800 + (idx) * 4) + +#define GEN8_OA_PERF_TICKS 0x2910 +#define GEN8_N_OA_PERF_A64 32 +#define GEN8_N_OA_PERF_A32 4 +#define GEN8_N_OA_PERF_B32 8 +#define GEN8_N_OA_PERF_C32 8 +#define GEN8_OA_PERF_A64_LDW(idx) (0x2800 + (idx) * 8) +#define GEN8_OA_PERF_A64_UDW(idx) (0x2800 + (idx) * 8 + 4) +#define GEN8_OA_PERF_A32(idx) (0x2900 + (idx) * 4) +#define GEN8_OA_PERF_B32(idx) (0x2920 + (idx) * 4) +#define GEN8_OA_PERF_C32(idx) (0x2940 + (idx) * 4) + +#define GEN12_OAG_PERF_TICKS 0xda90 +#define GEN12_N_OAG_PERF_A64 32 +#define GEN12_N_OAG_PERF_A32 4 +#define GEN12_N_OAG_PERF_B32 8 +#define GEN12_N_OAG_PERF_C32 8 +#define 
GEN12_OAG_PERF_A64_LDW(idx) (0xd980 + (idx) * 8) +#define GEN12_OAG_PERF_A64_UDW(idx) (0xd980 + (idx) * 8 + 4) +#define GEN12_OAG_PERF_A32(idx) (0xda80 + (idx) * 4) +#define GEN12_OAG_PERF_B32(idx) (0xda94 + (idx) * 4) +#define GEN12_OAG_PERF_C32(idx) (0xdab4 + (idx) * 4) + /* Pipeline statistic counters */ #define IA_VERTICES_COUNT 0x2310 #define IA_PRIMITIVES_COUNT 0x2318 diff --git a/src/intel/vulkan/genX_query.c b/src/intel/vulkan/genX_query.c index ab3f6d0da77..d71b5aae531 100644 --- a/src/intel/vulkan/genX_query.c +++ b/src/intel/vulkan/genX_query.c @@ -518,7 +518,8 @@ VkResult genX(GetQueryPoolResults)( const uint32_t *end = pool->bo->map + khr_perf_query_oa_offset(pool, firstQuery + i, p, true); struct gen_perf_query_result result; gen_perf_query_result_clear(&result); - gen_perf_query_result_accumulate(&result, pool->pass_query[p], begin, end); + gen_perf_query_result_accumulate(&result, pool->pass_query[p], + &device->info, begin, end); anv_perf_write_pass_results(pdevice->perf, pool, p, &result, pData); } break; @@ -536,7 +537,8 @@ VkResult genX(GetQueryPoolResults)( const uint32_t *rpstat_end = query_data + intel_perf_mi_rpc_offset(true); struct gen_perf_query_result result; gen_perf_query_result_clear(&result); - gen_perf_query_result_accumulate(&result, query, oa_begin, oa_end); + gen_perf_query_result_accumulate(&result, query, &device->info, + oa_begin, oa_end); gen_perf_query_result_read_frequencies(&result, &device->info, oa_begin, oa_end); gen_perf_query_result_read_gt_frequency(&result, &device->info, |