diff options
author | Rhys Perry <pendingchaos02@gmail.com> | 2019-09-18 20:31:33 +0100 |
---|---|---|
committer | Rhys Perry <pendingchaos02@gmail.com> | 2019-11-25 13:59:11 +0000 |
commit | 459bc77763b283aacde6571a7837f27db2bcd012 (patch) | |
tree | fb8804201456e1798f5e913f710d0cbd51c24c05 | |
parent | 0a759c3be6c88fbdb945d823516172a9867836f8 (diff) |
aco: enable load/store vectorizer
Totals from affected shaders:
SGPRS: 1890373 -> 1900772 (0.55 %)
VGPRS: 1210024 -> 1215244 (0.43 %)
Spilled SGPRs: 828 -> 828 (0.00 %)
Spilled VGPRs: 0 -> 0 (0.00 %)
Private memory VGPRs: 0 -> 0 (0.00 %)
Scratch size: 252 -> 252 (0.00 %) dwords per thread
Code Size: 81937504 -> 74608304 (-8.94 %) bytes
LDS: 746 -> 746 (0.00 %) blocks
Max Waves: 230491 -> 230158 (-0.14 %)
In NieR:Automata and GTA V, the code decrease is especially large: -13.79%
and -15.32%, respectively.
v9: rework the callback function
v10: handle load_shared/store_shared in the callback
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Connor Abbott <cwabbott0@gmail.com> (v9)
-rw-r--r-- | src/amd/compiler/aco_instruction_selection_setup.cpp | 50 |
1 files changed, 32 insertions, 18 deletions
diff --git a/src/amd/compiler/aco_instruction_selection_setup.cpp b/src/amd/compiler/aco_instruction_selection_setup.cpp
index 16b53725408..d663343d747 100644
--- a/src/amd/compiler/aco_instruction_selection_setup.cpp
+++ b/src/amd/compiler/aco_instruction_selection_setup.cpp
@@ -608,23 +608,38 @@ shared_var_info(const struct glsl_type *type, unsigned *size, unsigned *align)
    *align = comp_size;
 }
 
-int
-get_align(nir_variable_mode mode, bool is_store, unsigned bit_size, unsigned num_components)
+static bool
+mem_vectorize_callback(unsigned align, unsigned bit_size,
+                       unsigned num_components, unsigned high_offset,
+                       nir_intrinsic_instr *low, nir_intrinsic_instr *high)
 {
-   /* TODO: ACO doesn't have good support for non-32-bit reads/writes yet */
-   if (bit_size != 32)
-      return -1;
-
-   switch (mode) {
-   case nir_var_mem_ubo:
-   case nir_var_mem_ssbo:
-   //case nir_var_mem_push_const: enable with 1240!
-   case nir_var_mem_shared:
-      /* TODO: what are the alignment requirements for LDS? */
-      return num_components <= 4 ? 4 : -1;
+   if ((bit_size != 32 && bit_size != 64) || num_components > 4)
+      return false;
+
+   /* >128 bit loads are split except with SMEM */
+   if (bit_size * num_components > 128)
+      return false;
+
+   switch (low->intrinsic) {
+   case nir_intrinsic_load_ubo:
+   case nir_intrinsic_load_ssbo:
+   case nir_intrinsic_store_ssbo:
+   case nir_intrinsic_load_push_constant:
+      return align % 4 == 0;
+   case nir_intrinsic_load_deref:
+   case nir_intrinsic_store_deref:
+      assert(nir_src_as_deref(low->src[0])->mode == nir_var_mem_shared);
+      /* fallthrough */
+   case nir_intrinsic_load_shared:
+   case nir_intrinsic_store_shared:
+      if (bit_size * num_components > 64) /* 96 and 128 bit loads require 128 bit alignment and are split otherwise */
+         return align % 16 == 0;
+      else
+         return align % 4 == 0;
    default:
-      return -1;
+      return false;
    }
+   return false;
 }
 
 void
@@ -816,14 +831,13 @@ setup_isel_context(Program* program,
    /* optimize and lower memory operations */
    bool lower_to_scalar = false;
    bool lower_pack = false;
-   // TODO: uncomment this once !1240 is merged
-   /*if (nir_opt_load_store_vectorize(nir,
+   if (nir_opt_load_store_vectorize(nir,
                                     (nir_variable_mode)(nir_var_mem_ssbo | nir_var_mem_ubo |
                                                         nir_var_mem_push_const | nir_var_mem_shared),
-                                    get_align)) {
+                                    mem_vectorize_callback)) {
       lower_to_scalar = true;
       lower_pack = true;
-   }*/
+   }
    if (nir->info.stage == MESA_SHADER_COMPUTE)
       lower_to_scalar |= nir_lower_explicit_io(nir, nir_var_mem_shared, nir_address_format_32bit_offset);
    else