diff options
author | Danylo Piliaiev <dpiliaiev@igalia.com> | 2021-12-28 21:44:55 +0200 |
---|---|---|
committer | Marge Bot <emma+marge@anholt.net> | 2022-02-10 15:15:33 +0000 |
commit | b84f0596808574bb0d37355a896eaaf1aafe277f (patch) | |
tree | 01168f649739157bcf639223c47ed281cb686729 | |
parent | 03ab9d895e86668f7b2f8e118cefb7341b8b78b9 (diff) |
freedreno/pps: Expose same counters as blob
Expose most of the counters that the blob exposes. By faking the values of
the counters returned from kgsl, I found the exact underlying counters and
constant coefficients that the blob uses.
Note: the coefficients for counters that depend on time are NOT verified.
Signed-off-by: Danylo Piliaiev <dpiliaiev@igalia.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/14323>
-rw-r--r-- | src/freedreno/ds/fd_pps_driver.cc | 331 |
1 files changed, 318 insertions, 13 deletions
diff --git a/src/freedreno/ds/fd_pps_driver.cc b/src/freedreno/ds/fd_pps_driver.cc index 97305c911ae..deadeae11c9 100644 --- a/src/freedreno/ds/fd_pps_driver.cc +++ b/src/freedreno/ds/fd_pps_driver.cc @@ -16,6 +16,27 @@ namespace pps { +double +safe_div(uint64_t a, uint64_t b) +{ + if (b == 0) + return 0; + + return a / static_cast<double>(b); +} + +float +percent(uint64_t a, uint64_t b) +{ + /* Sometimes we get bogus values but we want for the timeline + * to look nice without higher than 100% values. + */ + if (b == 0 || a > b) + return 0; + + return 100.f * (a / static_cast<double>(b)); +} + uint64_t FreedrenoDriver::get_min_sampling_period_ns() { @@ -45,14 +66,58 @@ FreedrenoDriver::setup_a6xx_counters() auto PERF_CP_ALWAYS_COUNT = countable("PERF_CP_ALWAYS_COUNT"); auto PERF_CP_BUSY_CYCLES = countable("PERF_CP_BUSY_CYCLES"); auto PERF_RB_3D_PIXELS = countable("PERF_RB_3D_PIXELS"); - auto PERF_SP_FS_STAGE_FULL_ALU_INSTRUCTIONS = countable("PERF_SP_FS_STAGE_FULL_ALU_INSTRUCTIONS"); - auto PERF_SP_FS_STAGE_HALF_ALU_INSTRUCTIONS = countable("PERF_SP_FS_STAGE_HALF_ALU_INSTRUCTIONS"); auto PERF_TP_L1_CACHELINE_MISSES = countable("PERF_TP_L1_CACHELINE_MISSES"); + auto PERF_TP_L1_CACHELINE_REQUESTS = countable("PERF_TP_L1_CACHELINE_REQUESTS"); + + auto PERF_TP_OUTPUT_PIXELS = countable("PERF_TP_OUTPUT_PIXELS"); + auto PERF_TP_OUTPUT_PIXELS_ANISO = countable("PERF_TP_OUTPUT_PIXELS_ANISO"); + auto PERF_TP_OUTPUT_PIXELS_BILINEAR = countable("PERF_TP_OUTPUT_PIXELS_BILINEAR"); + auto PERF_TP_OUTPUT_PIXELS_POINT = countable("PERF_TP_OUTPUT_PIXELS_POINT"); + auto PERF_TP_OUTPUT_PIXELS_ZERO_LOD = countable("PERF_TP_OUTPUT_PIXELS_ZERO_LOD"); + + auto PERF_TSE_INPUT_PRIM = countable("PERF_TSE_INPUT_PRIM"); + auto PERF_TSE_CLIPPED_PRIM = countable("PERF_TSE_CLIPPED_PRIM"); + auto PERF_TSE_TRIVAL_REJ_PRIM = countable("PERF_TSE_TRIVAL_REJ_PRIM"); + auto PERF_TSE_OUTPUT_VISIBLE_PRIM = countable("PERF_TSE_OUTPUT_VISIBLE_PRIM"); + auto PERF_SP_BUSY_CYCLES = 
countable("PERF_SP_BUSY_CYCLES"); + auto PERF_SP_ALU_WORKING_CYCLES = countable("PERF_SP_ALU_WORKING_CYCLES"); + auto PERF_SP_EFU_WORKING_CYCLES = countable("PERF_SP_EFU_WORKING_CYCLES"); + auto PERF_SP_VS_STAGE_EFU_INSTRUCTIONS = countable("PERF_SP_VS_STAGE_EFU_INSTRUCTIONS"); + auto PERF_SP_VS_STAGE_FULL_ALU_INSTRUCTIONS = countable("PERF_SP_VS_STAGE_FULL_ALU_INSTRUCTIONS"); + auto PERF_SP_VS_STAGE_TEX_INSTRUCTIONS = countable("PERF_SP_VS_STAGE_TEX_INSTRUCTIONS"); + auto PERF_SP_FS_STAGE_EFU_INSTRUCTIONS = countable("PERF_SP_FS_STAGE_EFU_INSTRUCTIONS"); + auto PERF_SP_FS_STAGE_FULL_ALU_INSTRUCTIONS = countable("PERF_SP_FS_STAGE_FULL_ALU_INSTRUCTIONS"); + auto PERF_SP_FS_STAGE_HALF_ALU_INSTRUCTIONS = countable("PERF_SP_FS_STAGE_HALF_ALU_INSTRUCTIONS"); + auto PERF_SP_STALL_CYCLES_TP = countable("PERF_SP_STALL_CYCLES_TP"); + auto PERF_SP_ANY_EU_WORKING_FS_STAGE = countable("PERF_SP_ANY_EU_WORKING_FS_STAGE"); + auto PERF_SP_ANY_EU_WORKING_VS_STAGE = countable("PERF_SP_ANY_EU_WORKING_VS_STAGE"); + auto PERF_SP_ANY_EU_WORKING_CS_STAGE = countable("PERF_SP_ANY_EU_WORKING_CS_STAGE"); + + auto PERF_UCHE_STALL_CYCLES_ARBITER = countable("PERF_UCHE_STALL_CYCLES_ARBITER"); + auto PERF_UCHE_VBIF_READ_BEATS_TP = countable("PERF_UCHE_VBIF_READ_BEATS_TP"); + auto PERF_UCHE_VBIF_READ_BEATS_VFD = countable("PERF_UCHE_VBIF_READ_BEATS_VFD"); + auto PERF_UCHE_VBIF_READ_BEATS_SP = countable("PERF_UCHE_VBIF_READ_BEATS_SP"); + auto PERF_UCHE_READ_REQUESTS_TP = countable("PERF_UCHE_READ_REQUESTS_TP"); + + auto PERF_PC_STALL_CYCLES_VFD = countable("PERF_PC_STALL_CYCLES_VFD"); + auto PERF_PC_VS_INVOCATIONS = countable("PERF_PC_VS_INVOCATIONS"); + auto PERF_PC_VERTEX_HITS = countable("PERF_PC_VERTEX_HITS"); + + auto PERF_HLSQ_QUADS = countable("PERF_HLSQ_QUADS"); /* Quads (fragments / 4) produced */ + + auto PERF_CP_NUM_PREEMPTIONS = countable("PERF_CP_NUM_PREEMPTIONS"); + auto PERF_CP_PREEMPTION_REACTION_DELAY = countable("PERF_CP_PREEMPTION_REACTION_DELAY"); + + /* TODO: resolve() tells 
there is no PERF_CMPDECMP_VBIF_READ_DATA */ + // auto PERF_CMPDECMP_VBIF_READ_DATA = countable("PERF_CMPDECMP_VBIF_READ_DATA"); /* * And then setup the derived counters that we are exporting to - * pps based on the captured countable values + * pps based on the captured countable values. + * + * We try to expose the same counters as blob: + * https://gpuinspector.dev/docs/gpu-counters/qualcomm */ counter("GPU Frequency", Counter::Units::Hertz, [=]() { @@ -61,29 +126,269 @@ FreedrenoDriver::setup_a6xx_counters() ); counter("GPU % Utilization", Counter::Units::Percent, [=]() { - return 100.0 * (PERF_CP_BUSY_CYCLES / time) / max_freq; + return percent(PERF_CP_BUSY_CYCLES / time, max_freq); } ); - // This one is a bit of a guess, but seems plausible.. - counter("ALU / Fragment", Counter::Units::None, [=]() { + counter("TP L1 Cache Misses", Counter::Units::None, [=]() { + return PERF_TP_L1_CACHELINE_MISSES / time; + } + ); + + counter("Shader Core Utilization", Counter::Units::Percent, [=]() { + return percent(PERF_SP_BUSY_CYCLES / time, max_freq * info->num_sp_cores); + } + ); + + /* TODO: verify */ + counter("(?) % Texture Fetch Stall", Counter::Units::Percent, [=]() { + return percent(PERF_SP_STALL_CYCLES_TP / time, max_freq * info->num_sp_cores); + } + ); + + /* TODO: verify */ + counter("(?) % Vertex Fetch Stall", Counter::Units::Percent, [=]() { + return percent(PERF_PC_STALL_CYCLES_VFD / time, max_freq * info->num_sp_cores); + } + ); + + counter("L1 Texture Cache Miss Per Pixel", Counter::Units::None, [=]() { + return safe_div(PERF_TP_L1_CACHELINE_MISSES, PERF_HLSQ_QUADS * 4); + } + ); + + counter("% Texture L1 Miss", Counter::Units::Percent, [=]() { + return percent(PERF_TP_L1_CACHELINE_MISSES, PERF_TP_L1_CACHELINE_REQUESTS); + } + ); + + counter("% Texture L2 Miss", Counter::Units::Percent, [=]() { + return percent(PERF_UCHE_VBIF_READ_BEATS_TP / 2, PERF_UCHE_READ_REQUESTS_TP); + } + ); + + /* TODO: verify */ + counter("(?) 
% Stalled on System Memory", Counter::Units::Percent, [=]() { + return percent(PERF_UCHE_STALL_CYCLES_ARBITER / time, max_freq * info->num_sp_cores); + } + ); + + counter("Pre-clipped Polygons / Second", Counter::Units::None, [=]() { + return PERF_TSE_INPUT_PRIM * (1.f / time); + } + ); + + counter("% Prims Trivially Rejected", Counter::Units::Percent, [=]() { + return percent(PERF_TSE_TRIVAL_REJ_PRIM, PERF_TSE_INPUT_PRIM); + } + ); + + counter("% Prims Clipped", Counter::Units::Percent, [=]() { + return percent(PERF_TSE_CLIPPED_PRIM, PERF_TSE_INPUT_PRIM); + } + ); + + counter("Average Vertices / Polygon", Counter::Units::None, [=]() { + return PERF_PC_VS_INVOCATIONS / PERF_TSE_INPUT_PRIM; + } + ); + + counter("Reused Vertices / Second", Counter::Units::None, [=]() { + return PERF_PC_VERTEX_HITS * (1.f / time); + } + ); + + counter("Average Polygon Area", Counter::Units::None, [=]() { + return safe_div(PERF_HLSQ_QUADS * 4, PERF_TSE_OUTPUT_VISIBLE_PRIM); + } + ); + + /* TODO: find formula */ + // counter("% Shaders Busy", Counter::Units::Percent, [=]() { + // return 100.0 * 0; + // } + // ); + + counter("Vertices Shaded / Second", Counter::Units::None, [=]() { + return PERF_PC_VS_INVOCATIONS * (1.f / time); + } + ); + + counter("Fragments Shaded / Second", Counter::Units::None, [=]() { + return PERF_HLSQ_QUADS * 4 * (1.f / time); + } + ); + + counter("Vertex Instructions / Second", Counter::Units::None, [=]() { + return (PERF_SP_VS_STAGE_FULL_ALU_INSTRUCTIONS + + PERF_SP_VS_STAGE_EFU_INSTRUCTIONS) * (1.f / time); + } + ); + + counter("Fragment Instructions / Second", Counter::Units::None, [=]() { return (PERF_SP_FS_STAGE_FULL_ALU_INSTRUCTIONS + - PERF_SP_FS_STAGE_HALF_ALU_INSTRUCTIONS / 2) / PERF_RB_3D_PIXELS; + PERF_SP_FS_STAGE_HALF_ALU_INSTRUCTIONS / 2 + + PERF_SP_FS_STAGE_EFU_INSTRUCTIONS) * (1.f / time); } ); - counter("TP L1 Cache Misses", Counter::Units::None, [=]() { - return PERF_TP_L1_CACHELINE_MISSES / time; + counter("Fragment ALU Instructions / Sec 
(Full)", Counter::Units::None, [=]() { + return PERF_SP_FS_STAGE_FULL_ALU_INSTRUCTIONS * (1.f / time); } ); - counter("Shader Core Utilization", Counter::Units::Percent, [=]() { - return 100.0 * (PERF_SP_BUSY_CYCLES / time) / (max_freq * info->num_sp_cores); + counter("Fragment ALU Instructions / Sec (Half)", Counter::Units::None, [=]() { + return PERF_SP_FS_STAGE_HALF_ALU_INSTRUCTIONS * (1.f / time); + } + ); + + counter("Fragment EFU Instructions / Second", Counter::Units::None, [=]() { + return PERF_SP_FS_STAGE_EFU_INSTRUCTIONS * (1.f / time); + } + ); + + counter("Textures / Vertex", Counter::Units::None, [=]() { + return safe_div(PERF_SP_VS_STAGE_TEX_INSTRUCTIONS, PERF_PC_VS_INVOCATIONS); + } + ); + + counter("Textures / Fragment", Counter::Units::None, [=]() { + return safe_div(PERF_TP_OUTPUT_PIXELS, PERF_HLSQ_QUADS * 4); + } + ); + + counter("ALU / Vertex", Counter::Units::None, [=]() { + return safe_div(PERF_SP_VS_STAGE_FULL_ALU_INSTRUCTIONS, PERF_PC_VS_INVOCATIONS); + } + ); + + counter("EFU / Vertex", Counter::Units::None, [=]() { + return safe_div(PERF_SP_VS_STAGE_EFU_INSTRUCTIONS, PERF_PC_VS_INVOCATIONS); + } + ); + + counter("ALU / Fragment", Counter::Units::None, [=]() { + return safe_div(PERF_SP_FS_STAGE_FULL_ALU_INSTRUCTIONS + + PERF_SP_FS_STAGE_HALF_ALU_INSTRUCTIONS / 2, PERF_HLSQ_QUADS); + } + ); + + counter("EFU / Fragment", Counter::Units::None, [=]() { + return safe_div(PERF_SP_FS_STAGE_EFU_INSTRUCTIONS, PERF_HLSQ_QUADS); + } + ); + + counter("% Time Shading Vertices", Counter::Units::Percent, [=]() { + return percent(PERF_SP_ANY_EU_WORKING_VS_STAGE, + (PERF_SP_ANY_EU_WORKING_VS_STAGE + + PERF_SP_ANY_EU_WORKING_FS_STAGE + + PERF_SP_ANY_EU_WORKING_CS_STAGE)); + } + ); + + counter("% Time Shading Fragments", Counter::Units::Percent, [=]() { + return percent(PERF_SP_ANY_EU_WORKING_FS_STAGE, + (PERF_SP_ANY_EU_WORKING_VS_STAGE + + PERF_SP_ANY_EU_WORKING_FS_STAGE + + PERF_SP_ANY_EU_WORKING_CS_STAGE)); + } + ); + + counter("% Time Compute", 
Counter::Units::Percent, [=]() { + return percent(PERF_SP_ANY_EU_WORKING_CS_STAGE, + (PERF_SP_ANY_EU_WORKING_VS_STAGE + + PERF_SP_ANY_EU_WORKING_FS_STAGE + + PERF_SP_ANY_EU_WORKING_CS_STAGE)); + } + ); + + counter("% Shader ALU Capacity Utilized", Counter::Units::Percent, [=]() { + return percent((PERF_SP_VS_STAGE_FULL_ALU_INSTRUCTIONS + + PERF_SP_FS_STAGE_FULL_ALU_INSTRUCTIONS + + PERF_SP_FS_STAGE_HALF_ALU_INSTRUCTIONS / 2) / 64, + PERF_SP_BUSY_CYCLES); + } + ); + + counter("% Time ALUs Working", Counter::Units::Percent, [=]() { + return percent(PERF_SP_ALU_WORKING_CYCLES / 2, PERF_SP_BUSY_CYCLES); } ); - // TODO add more.. see https://gpuinspector.dev/docs/gpu-counters/qualcomm - // for what blob exposes + counter("% Time EFUs Working", Counter::Units::Percent, [=]() { + return percent(PERF_SP_EFU_WORKING_CYCLES / 2, PERF_SP_BUSY_CYCLES); + } + ); + + counter("% Anisotropic Filtered", Counter::Units::Percent, [=]() { + return percent(PERF_TP_OUTPUT_PIXELS_ANISO, PERF_TP_OUTPUT_PIXELS); + } + ); + + counter("% Linear Filtered", Counter::Units::Percent, [=]() { + return percent(PERF_TP_OUTPUT_PIXELS_BILINEAR, PERF_TP_OUTPUT_PIXELS); + } + ); + + counter("% Nearest Filtered", Counter::Units::Percent, [=]() { + return percent(PERF_TP_OUTPUT_PIXELS_POINT, PERF_TP_OUTPUT_PIXELS); + } + ); + + counter("% Non-Base Level Textures", Counter::Units::Percent, [=]() { + return percent(PERF_TP_OUTPUT_PIXELS_ZERO_LOD, PERF_TP_OUTPUT_PIXELS); + } + ); + + /* Reads from KGSL_PERFCOUNTER_GROUP_VBIF countable=63 */ + // counter("Read Total (Bytes/sec)", Counter::Units::Byte, [=]() { + // return * (1.f / time); + // } + // ); + + /* Reads from KGSL_PERFCOUNTER_GROUP_VBIF countable=84 */ + // counter("Write Total (Bytes/sec)", Counter::Units::Byte, [=]() { + // return * (1.f / time); + // } + // ); + + /* Cannot get PERF_CMPDECMP_VBIF_READ_DATA countable */ + // counter("Texture Memory Read BW (Bytes/Second)", Counter::Units::Byte, [=]() { + // return (PERF_CMPDECMP_VBIF_READ_DATA + 
PERF_UCHE_VBIF_READ_BEATS_TP) * (1.f / time); + // } + // ); + + /* TODO: verify */ + counter("(?) Vertex Memory Read (Bytes/Second)", Counter::Units::Byte, [=]() { + return PERF_UCHE_VBIF_READ_BEATS_VFD * 32 * (1.f / time); + } + ); + + /* TODO: verify */ + counter("SP Memory Read (Bytes/Second)", Counter::Units::Byte, [=]() { + return PERF_UCHE_VBIF_READ_BEATS_SP * 32 * (1.f / time); + } + ); + + counter("Avg Bytes / Fragment", Counter::Units::Byte, [=]() { + return safe_div(PERF_UCHE_VBIF_READ_BEATS_TP * 32, PERF_HLSQ_QUADS * 4); + } + ); + + counter("Avg Bytes / Vertex", Counter::Units::Byte, [=]() { + return safe_div(PERF_UCHE_VBIF_READ_BEATS_VFD * 32, PERF_PC_VS_INVOCATIONS); + } + ); + + counter("Preemptions / second", Counter::Units::None, [=]() { + return PERF_CP_NUM_PREEMPTIONS * (1.f / time); + } + ); + + counter("Avg Preemption Delay", Counter::Units::None, [=]() { + return PERF_CP_PREEMPTION_REACTION_DELAY * (1.f / time); + } + ); } /** |