diff options
author | Timur Kristóf <timur.kristof@gmail.com> | 2021-03-20 17:47:05 +0100 |
---|---|---|
committer | Marge Bot <eric+marge@anholt.net> | 2021-05-28 12:14:53 +0000 |
commit | a93092d0edc92eea8e8e96709ad9857f05c45cef (patch) | |
tree | 8d181b0dcf4f1e613829c156fd7fbd33038160dc | |
parent | 0e4747d3fb7ec15f8c1d6b971b1352249e7d95c6 (diff) |
aco: Use s_cbranch_vccz/nz in post-RA optimization.
Add a simple post-RA optimization which takes advantage of the
s_cbranch_vccz and s_cbranch_vccnz instructions (GFX8 and newer only).
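It works on the following pattern, when all three instructions are in the same block, vcc is written by a VOPC instruction, and neither vcc nor exec is overwritten in between: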
vcc = v_cmp ...
scc = s_and vcc, exec
p_cbranch scc
The branch is changed to read vcc directly, so the s_and loses that user and the result looks like this:
vcc = v_cmp ...
p_cbranch vcc
Fossil DB results on Sienna Cichlid:
Totals from 4814 (3.21% of 149839) affected shaders:
CodeSize: 15371176 -> 15345964 (-0.16%)
Instrs: 3028557 -> 3022254 (-0.21%)
Latency: 21872753 -> 21823476 (-0.23%); split: -0.23%, +0.00%
InvThroughput: 4470282 -> 4468691 (-0.04%); split: -0.04%, +0.00%
Signed-off-by: Timur Kristóf <timur.kristof@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/7779>
-rw-r--r-- | src/amd/compiler/aco_instruction_selection.cpp | 2 | ||||
-rw-r--r-- | src/amd/compiler/aco_optimizer_postRA.cpp | 57 | ||||
-rw-r--r-- | src/amd/compiler/tests/test_optimizer_postRA.cpp | 97 |
3 files changed, 154 insertions, 2 deletions
diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp index 9efd01d038b..4c0c8871446 100644 --- a/src/amd/compiler/aco_instruction_selection.cpp +++ b/src/amd/compiler/aco_instruction_selection.cpp @@ -2910,7 +2910,6 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) Temp tmp = dst.bytes() == 8 ? bld.tmp(RegClass::get(dst.type(), 4)) : dst; if (tmp.regClass() == s1) { - // TODO: in a post-RA optimization, we can check if src is in VCC, and directly use VCCNZ bool_to_scalar_condition(ctx, src, tmp); } else if (tmp.type() == RegType::vgpr) { bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(tmp), Operand(0u), Operand(1u), src); @@ -10168,7 +10167,6 @@ static bool visit_if(isel_context *ctx, nir_if *if_stmt) * merge block. **/ - // TODO: in a post-RA optimizer, we could check if the condition is in VCC and omit this instruction assert(cond.regClass() == ctx->program->lane_mask); cond = bool_to_scalar_condition(ctx, cond); diff --git a/src/amd/compiler/aco_optimizer_postRA.cpp b/src/amd/compiler/aco_optimizer_postRA.cpp index 45b61942b12..12a143c0198 100644 --- a/src/amd/compiler/aco_optimizer_postRA.cpp +++ b/src/amd/compiler/aco_optimizer_postRA.cpp @@ -107,10 +107,67 @@ int last_writer_idx(pr_opt_ctx &ctx, const Operand &op) return instr_idx; } +void try_apply_branch_vcc(pr_opt_ctx &ctx, aco_ptr<Instruction> &instr) +{ + /* We are looking for the following pattern: + * + * vcc = ... ; last_vcc_wr + * sX, scc = s_and_bXX vcc, exec ; op0_instr + * (...vcc and exec must not be clobbered inbetween...) + * s_cbranch_XX scc ; instr + * + * If possible, the above is optimized into: + * + * vcc = ... ; last_vcc_wr + * s_cbranch_XX vcc ; instr modified to use vcc + */ + + /* Don't try to optimize this on GFX6-7 because SMEM may corrupt the vccz bit. 
*/ + if (ctx.program->chip_class < GFX8) + return; + + if (instr->format != Format::PSEUDO_BRANCH || + instr->operands.size() == 0 || + instr->operands[0].physReg() != scc) + return; + + int op0_instr_idx = last_writer_idx(ctx, instr->operands[0]); + int last_vcc_wr_idx = last_writer_idx(ctx, vcc, ctx.program->lane_mask); + int last_exec_wr_idx = last_writer_idx(ctx, exec, ctx.program->lane_mask); + + /* We need to make sure: + * - the operand register used by the branch, and VCC were both written in the current block + * - VCC was NOT written after the operand register + * - EXEC is sane and was NOT written after the operand register + */ + if (op0_instr_idx < 0 || last_vcc_wr_idx < 0 || last_vcc_wr_idx > op0_instr_idx || + last_exec_wr_idx > last_vcc_wr_idx || last_exec_wr_idx < not_written_in_block) + return; + + aco_ptr<Instruction> &op0_instr = ctx.current_block->instructions[op0_instr_idx]; + aco_ptr<Instruction> &last_vcc_wr = ctx.current_block->instructions[last_vcc_wr_idx]; + + if ((op0_instr->opcode != aco_opcode::s_and_b64 /* wave64 */ && + op0_instr->opcode != aco_opcode::s_and_b32 /* wave32 */) || + op0_instr->operands[0].physReg() != vcc || + op0_instr->operands[1].physReg() != exec || + !last_vcc_wr->isVOPC()) + return; + + assert(last_vcc_wr->definitions[0].tempId() == op0_instr->operands[0].tempId()); + + /* Reduce the uses of the SCC def */ + ctx.uses[instr->operands[0].tempId()]--; + /* Use VCC instead of SCC in the branch */ + instr->operands[0] = op0_instr->operands[0]; +} + void process_instruction(pr_opt_ctx &ctx, aco_ptr<Instruction> &instr) { ctx.current_instr_idx++; + try_apply_branch_vcc(ctx, instr); + if (instr) save_reg_writes(ctx, instr); } diff --git a/src/amd/compiler/tests/test_optimizer_postRA.cpp b/src/amd/compiler/tests/test_optimizer_postRA.cpp index a8a5acdf54d..16a427a2259 100644 --- a/src/amd/compiler/tests/test_optimizer_postRA.cpp +++ b/src/amd/compiler/tests/test_optimizer_postRA.cpp @@ -25,3 +25,100 @@ #include 
"helpers.h" using namespace aco; + +BEGIN_TEST(optimizer_postRA.vcmp) + PhysReg reg_v0(256); + PhysReg reg_s0(0); + PhysReg reg_s2(2); + PhysReg reg_s4(4); + + //>> v1: %a:v[0] = p_startpgm + ASSERTED bool setup_ok = setup_cs("v1", GFX8); + assert(setup_ok); + + auto &startpgm = bld.instructions->at(0); + assert(startpgm->opcode == aco_opcode::p_startpgm); + startpgm->definitions[0].setFixed(reg_v0); + + Temp v_in = inputs[0]; + + { + /* Recognize when the result of VOPC goes to VCC, and use that for the branching then. */ + + //! s2: %b:vcc = v_cmp_eq_u32 0, %a:v[0] + //! s2: %e:s[2-3] = p_cbranch_z %b:vcc + //! p_unit_test 0, %e:s[2-3] + auto vcmp = bld.vopc(aco_opcode::v_cmp_eq_u32, bld.def(bld.lm, vcc), Operand(0u), Operand(v_in, reg_v0)); + auto sand = bld.sop2(Builder::s_and, bld.def(bld.lm, reg_s0), bld.def(s1, scc), bld.vcc(vcmp), Operand(exec, bld.lm)); + auto br = bld.branch(aco_opcode::p_cbranch_z, bld.def(s2, reg_s2), bld.scc(sand.def(1).getTemp())); + writeout(0, Operand(br, reg_s2)); + } + + //; del b, e + + { + /* When VCC is overwritten inbetween, don't optimize. */ + + //! s2: %b:vcc = v_cmp_eq_u32 0, %a:v[0] + //! s2: %c:s[0-1], s1: %d:scc = s_and_b64 %b:vcc, %x:exec + //! s2: %f:vcc = s_mov_b64 0 + //! s2: %e:s[2-3] = p_cbranch_z %d:scc + //! p_unit_test 1, %e:s[2-3], %f:vcc + auto vcmp = bld.vopc(aco_opcode::v_cmp_eq_u32, bld.def(bld.lm, vcc), Operand(0u), Operand(v_in, reg_v0)); + auto sand = bld.sop2(Builder::s_and, bld.def(bld.lm, reg_s0), bld.def(s1, scc), bld.vcc(vcmp), Operand(exec, bld.lm)); + auto ovrwr = bld.sop1(Builder::s_mov, bld.def(bld.lm, vcc), Operand(0u)); + auto br = bld.branch(aco_opcode::p_cbranch_z, bld.def(s2, reg_s2), bld.scc(sand.def(1).getTemp())); + writeout(1, Operand(br, reg_s2), Operand(ovrwr, vcc)); + } + + //; del b, c, d, e, f + + { + /* When the result of VOPC goes to an SGPR pair other than VCC, don't optimize */ + + //! s2: %b:s[4-5] = v_cmp_eq_u32 0, %a:v[0] + //! 
s2: %c:s[0-1], s1: %d:scc = s_and_b64 %b:s[4-5], %x:exec + //! s2: %e:s[2-3] = p_cbranch_z %d:scc + //! p_unit_test 2, %e:s[2-3] + auto vcmp = bld.vopc_e64(aco_opcode::v_cmp_eq_u32, bld.def(bld.lm, reg_s4), Operand(0u), Operand(v_in, reg_v0)); + auto sand = bld.sop2(Builder::s_and, bld.def(bld.lm, reg_s0), bld.def(s1, scc), Operand(vcmp, reg_s4), Operand(exec, bld.lm)); + auto br = bld.branch(aco_opcode::p_cbranch_z, bld.def(s2, reg_s2), bld.scc(sand.def(1).getTemp())); + writeout(2, Operand(br, reg_s2)); + } + + //; del b, c, d, e + + { + /* When the VCC isn't written by VOPC, don't optimize */ + + //! s2: %b:vcc, s1: %f:scc = s_or_b64 1, %0:s[4-5] + //! s2: %c:s[0-1], s1: %d:scc = s_and_b64 %b:vcc, %x:exec + //! s2: %e:s[2-3] = p_cbranch_z %d:scc + //! p_unit_test 2, %e:s[2-3] + auto salu = bld.sop2(Builder::s_or, bld.def(bld.lm, vcc), bld.def(s1, scc), Operand(1u), Operand(reg_s4, bld.lm)); + auto sand = bld.sop2(Builder::s_and, bld.def(bld.lm, reg_s0), bld.def(s1, scc), Operand(salu, vcc), Operand(exec, bld.lm)); + auto br = bld.branch(aco_opcode::p_cbranch_z, bld.def(s2, reg_s2), bld.scc(sand.def(1).getTemp())); + writeout(2, Operand(br, reg_s2)); + } + + //; del b, c, d, e, f, x + + { + /* When EXEC is overwritten inbetween, don't optimize. */ + + //! s2: %b:vcc = v_cmp_eq_u32 0, %a:v[0] + //! s2: %c:s[0-1], s1: %d:scc = s_and_b64 %b:vcc, %x:exec + //! s2: %f:exec = s_mov_b64 42 + //! s2: %e:s[2-3] = p_cbranch_z %d:scc + //! 
p_unit_test 4, %e:s[2-3], %f:exec + auto vcmp = bld.vopc(aco_opcode::v_cmp_eq_u32, bld.def(bld.lm, vcc), Operand(0u), Operand(v_in, reg_v0)); + auto sand = bld.sop2(Builder::s_and, bld.def(bld.lm, reg_s0), bld.def(s1, scc), bld.vcc(vcmp), Operand(exec, bld.lm)); + auto ovrwr = bld.sop1(Builder::s_mov, bld.def(bld.lm, exec), Operand(42u)); + auto br = bld.branch(aco_opcode::p_cbranch_z, bld.def(s2, reg_s2), bld.scc(sand.def(1).getTemp())); + writeout(4, Operand(br, reg_s2), Operand(ovrwr, exec)); + } + + //; del b, c, d, e, f, x + + finish_optimizer_postRA_test(); +END_TEST |