author     Timur Kristóf <timur.kristof@gmail.com>    2021-03-20 17:47:05 +0100
committer  Marge Bot <eric+marge@anholt.net>          2021-05-28 12:14:53 +0000
commit     a93092d0edc92eea8e8e96709ad9857f05c45cef (patch)
tree       8d181b0dcf4f1e613829c156fd7fbd33038160dc
parent     0e4747d3fb7ec15f8c1d6b971b1352249e7d95c6 (diff)
aco: Use s_cbranch_vccz/nz in post-RA optimization.
A simple post-RA optimization which takes advantage of the
s_cbranch_vccz and s_cbranch_vccnz instructions.

It works on the following pattern:

vcc = v_cmp ...
scc = s_and vcc, exec
p_cbranch scc

The result looks like this:

vcc = v_cmp ...
p_cbranch vcc

Fossil DB results on Sienna Cichlid:
Totals from 4814 (3.21% of 149839) affected shaders:
CodeSize: 15371176 -> 15345964 (-0.16%)
Instrs: 3028557 -> 3022254 (-0.21%)
Latency: 21872753 -> 21823476 (-0.23%); split: -0.23%, +0.00%
InvThroughput: 4470282 -> 4468691 (-0.04%); split: -0.04%, +0.00%

Signed-off-by: Timur Kristóf <timur.kristof@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/7779>
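For illustration, the GFX8+ wave64 machine code for such a branch looks roughly like the sketch below before and after the pass. The registers and the block label are made up for the example; the pass itself only rewrites the scc operand of the branch pseudo-instruction to vcc, and the vccz/vccnz form is then chosen when the pseudo branch is lowered.

    ; before (illustrative registers, wave64)
    v_cmp_eq_u32    vcc, 0, v0           ; VOPC writes the divergent condition to vcc
    s_and_b64       s[0:1], vcc, exec    ; scc = ((vcc & exec) != 0)
    s_cbranch_scc0  BB_merge             ; branch on scc

    ; after (the s_and becomes dead and is dropped)
    v_cmp_eq_u32    vcc, 0, v0
    s_cbranch_vccz  BB_merge             ; vccz is set whenever vcc == 0

The rewrite is safe because a VOPC instruction writes 0 to the bits of inactive lanes, so vcc & exec equals vcc as long as neither vcc nor exec is overwritten before the branch; those are exactly the conditions the new pass checks.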
-rw-r--r--  src/amd/compiler/aco_instruction_selection.cpp     2
-rw-r--r--  src/amd/compiler/aco_optimizer_postRA.cpp          57
-rw-r--r--  src/amd/compiler/tests/test_optimizer_postRA.cpp   97
3 files changed, 154 insertions(+), 2 deletions(-)
diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp
index 9efd01d038b..4c0c8871446 100644
--- a/src/amd/compiler/aco_instruction_selection.cpp
+++ b/src/amd/compiler/aco_instruction_selection.cpp
@@ -2910,7 +2910,6 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
Temp tmp = dst.bytes() == 8 ? bld.tmp(RegClass::get(dst.type(), 4)) : dst;
if (tmp.regClass() == s1) {
- // TODO: in a post-RA optimization, we can check if src is in VCC, and directly use VCCNZ
bool_to_scalar_condition(ctx, src, tmp);
} else if (tmp.type() == RegType::vgpr) {
bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(tmp), Operand(0u), Operand(1u), src);
@@ -10168,7 +10167,6 @@ static bool visit_if(isel_context *ctx, nir_if *if_stmt)
* merge block.
**/
- // TODO: in a post-RA optimizer, we could check if the condition is in VCC and omit this instruction
assert(cond.regClass() == ctx->program->lane_mask);
cond = bool_to_scalar_condition(ctx, cond);
diff --git a/src/amd/compiler/aco_optimizer_postRA.cpp b/src/amd/compiler/aco_optimizer_postRA.cpp
index 45b61942b12..12a143c0198 100644
--- a/src/amd/compiler/aco_optimizer_postRA.cpp
+++ b/src/amd/compiler/aco_optimizer_postRA.cpp
@@ -107,10 +107,67 @@ int last_writer_idx(pr_opt_ctx &ctx, const Operand &op)
return instr_idx;
}
+void try_apply_branch_vcc(pr_opt_ctx &ctx, aco_ptr<Instruction> &instr)
+{
+ /* We are looking for the following pattern:
+ *
+ * vcc = ... ; last_vcc_wr
+ * sX, scc = s_and_bXX vcc, exec ; op0_instr
+ * (...vcc and exec must not be clobbered in between...)
+ * s_cbranch_XX scc ; instr
+ *
+ * If possible, the above is optimized into:
+ *
+ * vcc = ... ; last_vcc_wr
+ * s_cbranch_XX vcc ; instr modified to use vcc
+ */
+
+ /* Don't try to optimize this on GFX6-7 because SMEM may corrupt the vccz bit. */
+ if (ctx.program->chip_class < GFX8)
+ return;
+
+ if (instr->format != Format::PSEUDO_BRANCH ||
+ instr->operands.size() == 0 ||
+ instr->operands[0].physReg() != scc)
+ return;
+
+ int op0_instr_idx = last_writer_idx(ctx, instr->operands[0]);
+ int last_vcc_wr_idx = last_writer_idx(ctx, vcc, ctx.program->lane_mask);
+ int last_exec_wr_idx = last_writer_idx(ctx, exec, ctx.program->lane_mask);
+
+ /* We need to make sure:
+ * - the operand register used by the branch, and VCC were both written in the current block
+ * - VCC was NOT written after the operand register
+ * - EXEC is sane and was NOT written after the operand register
+ */
+ if (op0_instr_idx < 0 || last_vcc_wr_idx < 0 || last_vcc_wr_idx > op0_instr_idx ||
+ last_exec_wr_idx > last_vcc_wr_idx || last_exec_wr_idx < not_written_in_block)
+ return;
+
+ aco_ptr<Instruction> &op0_instr = ctx.current_block->instructions[op0_instr_idx];
+ aco_ptr<Instruction> &last_vcc_wr = ctx.current_block->instructions[last_vcc_wr_idx];
+
+ if ((op0_instr->opcode != aco_opcode::s_and_b64 /* wave64 */ &&
+ op0_instr->opcode != aco_opcode::s_and_b32 /* wave32 */) ||
+ op0_instr->operands[0].physReg() != vcc ||
+ op0_instr->operands[1].physReg() != exec ||
+ !last_vcc_wr->isVOPC())
+ return;
+
+ assert(last_vcc_wr->definitions[0].tempId() == op0_instr->operands[0].tempId());
+
+ /* Reduce the uses of the SCC def */
+ ctx.uses[instr->operands[0].tempId()]--;
+ /* Use VCC instead of SCC in the branch */
+ instr->operands[0] = op0_instr->operands[0];
+}
+
void process_instruction(pr_opt_ctx &ctx, aco_ptr<Instruction> &instr)
{
ctx.current_instr_idx++;
+ try_apply_branch_vcc(ctx, instr);
+
if (instr)
save_reg_writes(ctx, instr);
}
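To see why the exec check in try_apply_branch_vcc is needed, consider a hypothetical sequence in which exec is rewritten between the v_cmp and the s_and (registers are again illustrative):

    v_cmp_eq_u32    vcc, 0, v0           ; vcc is masked by the old exec
    s_mov_b64       exec, s[6:7]         ; exec changes
    s_and_b64       s[0:1], vcc, exec    ; now tests vcc against the new exec
    s_cbranch_scc0  BB_merge

Here vcc can be non-zero while vcc & exec is zero, so s_cbranch_vccz could take a different path than s_cbranch_scc0. This is why the pass bails out when the last exec write is newer than the last vcc write; the test file below covers exactly this case.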
diff --git a/src/amd/compiler/tests/test_optimizer_postRA.cpp b/src/amd/compiler/tests/test_optimizer_postRA.cpp
index a8a5acdf54d..16a427a2259 100644
--- a/src/amd/compiler/tests/test_optimizer_postRA.cpp
+++ b/src/amd/compiler/tests/test_optimizer_postRA.cpp
@@ -25,3 +25,100 @@
#include "helpers.h"
using namespace aco;
+
+BEGIN_TEST(optimizer_postRA.vcmp)
+ PhysReg reg_v0(256);
+ PhysReg reg_s0(0);
+ PhysReg reg_s2(2);
+ PhysReg reg_s4(4);
+
+ //>> v1: %a:v[0] = p_startpgm
+ ASSERTED bool setup_ok = setup_cs("v1", GFX8);
+ assert(setup_ok);
+
+ auto &startpgm = bld.instructions->at(0);
+ assert(startpgm->opcode == aco_opcode::p_startpgm);
+ startpgm->definitions[0].setFixed(reg_v0);
+
+ Temp v_in = inputs[0];
+
+ {
+ /* Recognize when the result of VOPC goes to VCC, and use it directly for the branch. */
+
+ //! s2: %b:vcc = v_cmp_eq_u32 0, %a:v[0]
+ //! s2: %e:s[2-3] = p_cbranch_z %b:vcc
+ //! p_unit_test 0, %e:s[2-3]
+ auto vcmp = bld.vopc(aco_opcode::v_cmp_eq_u32, bld.def(bld.lm, vcc), Operand(0u), Operand(v_in, reg_v0));
+ auto sand = bld.sop2(Builder::s_and, bld.def(bld.lm, reg_s0), bld.def(s1, scc), bld.vcc(vcmp), Operand(exec, bld.lm));
+ auto br = bld.branch(aco_opcode::p_cbranch_z, bld.def(s2, reg_s2), bld.scc(sand.def(1).getTemp()));
+ writeout(0, Operand(br, reg_s2));
+ }
+
+ //; del b, e
+
+ {
+ /* When VCC is overwritten in between, don't optimize. */
+
+ //! s2: %b:vcc = v_cmp_eq_u32 0, %a:v[0]
+ //! s2: %c:s[0-1], s1: %d:scc = s_and_b64 %b:vcc, %x:exec
+ //! s2: %f:vcc = s_mov_b64 0
+ //! s2: %e:s[2-3] = p_cbranch_z %d:scc
+ //! p_unit_test 1, %e:s[2-3], %f:vcc
+ auto vcmp = bld.vopc(aco_opcode::v_cmp_eq_u32, bld.def(bld.lm, vcc), Operand(0u), Operand(v_in, reg_v0));
+ auto sand = bld.sop2(Builder::s_and, bld.def(bld.lm, reg_s0), bld.def(s1, scc), bld.vcc(vcmp), Operand(exec, bld.lm));
+ auto ovrwr = bld.sop1(Builder::s_mov, bld.def(bld.lm, vcc), Operand(0u));
+ auto br = bld.branch(aco_opcode::p_cbranch_z, bld.def(s2, reg_s2), bld.scc(sand.def(1).getTemp()));
+ writeout(1, Operand(br, reg_s2), Operand(ovrwr, vcc));
+ }
+
+ //; del b, c, d, e, f
+
+ {
+ /* When the result of VOPC goes to an SGPR pair other than VCC, don't optimize */
+
+ //! s2: %b:s[4-5] = v_cmp_eq_u32 0, %a:v[0]
+ //! s2: %c:s[0-1], s1: %d:scc = s_and_b64 %b:s[4-5], %x:exec
+ //! s2: %e:s[2-3] = p_cbranch_z %d:scc
+ //! p_unit_test 2, %e:s[2-3]
+ auto vcmp = bld.vopc_e64(aco_opcode::v_cmp_eq_u32, bld.def(bld.lm, reg_s4), Operand(0u), Operand(v_in, reg_v0));
+ auto sand = bld.sop2(Builder::s_and, bld.def(bld.lm, reg_s0), bld.def(s1, scc), Operand(vcmp, reg_s4), Operand(exec, bld.lm));
+ auto br = bld.branch(aco_opcode::p_cbranch_z, bld.def(s2, reg_s2), bld.scc(sand.def(1).getTemp()));
+ writeout(2, Operand(br, reg_s2));
+ }
+
+ //; del b, c, d, e
+
+ {
+ /* When VCC isn't written by VOPC, don't optimize. */
+
+ //! s2: %b:vcc, s1: %f:scc = s_or_b64 1, %0:s[4-5]
+ //! s2: %c:s[0-1], s1: %d:scc = s_and_b64 %b:vcc, %x:exec
+ //! s2: %e:s[2-3] = p_cbranch_z %d:scc
+ //! p_unit_test 3, %e:s[2-3]
+ auto salu = bld.sop2(Builder::s_or, bld.def(bld.lm, vcc), bld.def(s1, scc), Operand(1u), Operand(reg_s4, bld.lm));
+ auto sand = bld.sop2(Builder::s_and, bld.def(bld.lm, reg_s0), bld.def(s1, scc), Operand(salu, vcc), Operand(exec, bld.lm));
+ auto br = bld.branch(aco_opcode::p_cbranch_z, bld.def(s2, reg_s2), bld.scc(sand.def(1).getTemp()));
+ writeout(3, Operand(br, reg_s2));
+ }
+
+ //; del b, c, d, e, f, x
+
+ {
+ /* When EXEC is overwritten in between, don't optimize. */
+
+ //! s2: %b:vcc = v_cmp_eq_u32 0, %a:v[0]
+ //! s2: %c:s[0-1], s1: %d:scc = s_and_b64 %b:vcc, %x:exec
+ //! s2: %f:exec = s_mov_b64 42
+ //! s2: %e:s[2-3] = p_cbranch_z %d:scc
+ //! p_unit_test 4, %e:s[2-3], %f:exec
+ auto vcmp = bld.vopc(aco_opcode::v_cmp_eq_u32, bld.def(bld.lm, vcc), Operand(0u), Operand(v_in, reg_v0));
+ auto sand = bld.sop2(Builder::s_and, bld.def(bld.lm, reg_s0), bld.def(s1, scc), bld.vcc(vcmp), Operand(exec, bld.lm));
+ auto ovrwr = bld.sop1(Builder::s_mov, bld.def(bld.lm, exec), Operand(42u));
+ auto br = bld.branch(aco_opcode::p_cbranch_z, bld.def(s2, reg_s2), bld.scc(sand.def(1).getTemp()));
+ writeout(4, Operand(br, reg_s2), Operand(ovrwr, exec));
+ }
+
+ //; del b, c, d, e, f, x
+
+ finish_optimizer_postRA_test();
+END_TEST