diff options
author | Rhys Perry <pendingchaos02@gmail.com> | 2019-09-13 16:41:00 +0100 |
---|---|---|
committer | Rhys Perry <pendingchaos02@gmail.com> | 2019-10-23 19:11:21 +0100 |
commit | 08d510010b7586387e363460b98e6a45bbe97164 (patch) | |
tree | b505450ea74de3fce98048cabab34aa6c97af9cf /src/amd/compiler/aco_live_var_analysis.cpp | |
parent | 7453c1adff9d8a9e09cd7585e05c4db1c70870be (diff) |
aco: increase accuracy of SGPR limits
SGPRs are allocated in groups of 16 on GFX8/GFX9. GFX10 allocates a fixed
number of SGPRs and has 106 addressable SGPRs.
pipeline-db (Vega):
SGPRS: 5912 -> 6232 (5.41 %)
VGPRS: 1772 -> 1780 (0.45 %)
Spilled SGPRs: 0 -> 0 (0.00 %)
Spilled VGPRs: 0 -> 0 (0.00 %)
Private memory VGPRs: 0 -> 0 (0.00 %)
Scratch size: 0 -> 0 (0.00 %) dwords per thread
Code Size: 88228 -> 87904 (-0.37 %) bytes
LDS: 0 -> 0 (0.00 %) blocks
Max Waves: 559 -> 571 (2.15 %)
pipeline-db (Navi):
SGPRS: 341256 -> 363384 (6.48 %)
VGPRS: 171536 -> 170960 (-0.34 %)
Spilled SGPRs: 832 -> 581 (-30.17 %)
Spilled VGPRs: 0 -> 0 (0.00 %)
Private memory VGPRs: 0 -> 0 (0.00 %)
Scratch size: 0 -> 0 (0.00 %) dwords per thread
Code Size: 14207332 -> 14190872 (-0.12 %) bytes
LDS: 33 -> 33 (0.00 %) blocks
Max Waves: 18072 -> 18251 (0.99 %)
v2: unconditionally count vcc as an extra sgpr on GFX10+
v3: pass SGPRs rounded to 8
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Diffstat (limited to 'src/amd/compiler/aco_live_var_analysis.cpp')
-rw-r--r-- | src/amd/compiler/aco_live_var_analysis.cpp | 60 |
1 files changed, 49 insertions, 11 deletions
diff --git a/src/amd/compiler/aco_live_var_analysis.cpp b/src/amd/compiler/aco_live_var_analysis.cpp index f99e57c8b3a..3fe413256e7 100644 --- a/src/amd/compiler/aco_live_var_analysis.cpp +++ b/src/amd/compiler/aco_live_var_analysis.cpp @@ -28,6 +28,7 @@ */ #include "aco_ir.h" +#include "util/u_math.h" #include <set> #include <vector> @@ -190,25 +191,62 @@ void process_live_temps_per_block(Program *program, live& lives, Block* block, } } /* end namespace */ +uint16_t get_extra_sgprs(Program *program) +{ + if (program->chip_class >= GFX10) { + assert(!program->needs_flat_scr); + assert(!program->needs_xnack_mask); + return 2; + } else if (program->chip_class >= GFX8) { + if (program->needs_flat_scr) + return 6; + else if (program->needs_xnack_mask) + return 4; + else if (program->needs_vcc) + return 2; + else + return 0; + } else { + assert(!program->needs_xnack_mask); + if (program->needs_flat_scr) + return 4; + else if (program->needs_vcc) + return 2; + else + return 0; + } +} + +uint16_t get_sgpr_alloc(Program *program, uint16_t addressable_sgprs) +{ + assert(addressable_sgprs <= program->sgpr_limit); + uint16_t sgprs = addressable_sgprs + get_extra_sgprs(program); + uint16_t granule = program->sgpr_alloc_granule + 1; + return align(std::max(sgprs, granule), granule); +} + +uint16_t get_addr_sgpr_from_waves(Program *program, uint16_t max_waves) +{ + uint16_t sgprs = program->physical_sgprs / max_waves & ~program->sgpr_alloc_granule; + sgprs -= get_extra_sgprs(program); + return std::min(sgprs, program->sgpr_limit); +} + void update_vgpr_sgpr_demand(Program* program, const RegisterDemand new_demand) { // TODO: also take shared mem into account - const int16_t total_sgpr_regs = program->chip_class >= GFX8 ? 800 : 512; - const int16_t max_addressible_sgpr = program->sgpr_limit; - /* VGPRs are allocated in chunks of 4 */ - const int16_t rounded_vgpr_demand = std::max<int16_t>(4, (new_demand.vgpr + 3) & ~3); - /* SGPRs are allocated in chunks of 16 between 8 and 104. 
VCC occupies the last 2 registers */ - const int16_t rounded_sgpr_demand = std::min(std::max<int16_t>(8, (new_demand.sgpr + 2 + 7) & ~7), max_addressible_sgpr); + const int16_t vgpr_alloc = std::max<int16_t>(4, (new_demand.vgpr + 3) & ~3); /* this won't compile, register pressure reduction necessary */ - if (new_demand.vgpr > 256 || new_demand.sgpr > max_addressible_sgpr) { + if (new_demand.vgpr > 256 || new_demand.sgpr > program->sgpr_limit) { program->num_waves = 0; program->max_reg_demand = new_demand; } else { - program->num_waves = std::min<uint16_t>(10, - std::min<uint16_t>(256 / rounded_vgpr_demand, - total_sgpr_regs / rounded_sgpr_demand)); + program->num_waves = program->physical_sgprs / get_sgpr_alloc(program, new_demand.sgpr); + program->num_waves = std::min<uint16_t>(program->num_waves, 256 / vgpr_alloc); + program->num_waves = std::min<uint16_t>(program->num_waves, 10); - program->max_reg_demand = { int16_t((256 / program->num_waves) & ~3), std::min<int16_t>(((total_sgpr_regs / program->num_waves) & ~7) - 2, max_addressible_sgpr)}; + program->max_reg_demand.vgpr = int16_t((256 / program->num_waves) & ~3); + program->max_reg_demand.sgpr = get_addr_sgpr_from_waves(program, program->num_waves); } } |