diff options
author | Rhys Perry <pendingchaos02@gmail.com> | 2023-09-29 11:01:45 +0100 |
---|---|---|
committer | Dylan Baker <dylan.c.baker@intel.com> | 2023-10-06 13:44:04 -0700 |
commit | 56ec87a1f7b90a16b09441b3e6bce7d12d3b4639 (patch) | |
tree | 8e7925ec4b9d009bd0d7b3d48d3a4556959293d3 | |
parent | 979d0420d5b7057c183185e117cb4d82cba74ccc (diff) |
aco/optimizer_postRA: don't combine DPP across exec on GFX8/9
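GFX8/9 seem to use FI=0 (fetch-inactive disabled) behaviour, so a DPP mov must not be combined into an instruction that runs after exec has been overwritten.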
fossil-db (vega10):
Totals from 1 (0.00% of 63053) affected shaders:
Instrs: 542 -> 570 (+5.17%)
CodeSize: 2928 -> 3040 (+3.83%)
Latency: 2087 -> 2118 (+1.49%)
InvThroughput: 1103 -> 1143 (+3.63%)
Affected shader is from Cyberpunk 2077 fossil.
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Georg Lehmann <dadschoorse@gmail.com>
Cc: 23.2 <mesa-stable>
Closes: https://gitlab.freedesktop.org/mesa/mesa/-/issues/9784
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/25471>
(cherry picked from commit ea633c128c2af8944e159ec300fca7e7b4db26ad)
-rw-r--r-- | .pick_status.json | 2 | ||||
-rw-r--r-- | src/amd/compiler/aco_assembler.cpp | 2 | ||||
-rw-r--r-- | src/amd/compiler/aco_optimizer_postRA.cpp | 5 | ||||
-rw-r--r-- | src/amd/compiler/tests/test_optimizer_postRA.cpp | 28 |
4 files changed, 35 insertions, 2 deletions
```diff
diff --git a/.pick_status.json b/.pick_status.json
index 1a915289550..43aa14a1fe4 100644
--- a/.pick_status.json
+++ b/.pick_status.json
@@ -2774,7 +2774,7 @@
         "description": "aco/optimizer_postRA: don't combine DPP across exec on GFX8/9",
         "nominated": true,
         "nomination_type": 0,
-        "resolution": 0,
+        "resolution": 1,
         "main_sha": null,
         "because_sha": null,
         "notes": null
diff --git a/src/amd/compiler/aco_assembler.cpp b/src/amd/compiler/aco_assembler.cpp
index 6e9a08822df..bae2ca0fc44 100644
--- a/src/amd/compiler/aco_assembler.cpp
+++ b/src/amd/compiler/aco_assembler.cpp
@@ -796,7 +796,7 @@ emit_instruction(asm_context& ctx, std::vector<uint32_t>& out, Instruction* inst
       encoding |= dpp.abs[0] << 21;
       encoding |= dpp.neg[0] << 20;
       if (ctx.gfx_level >= GFX10)
-         encoding |= 1 << 18; /* set Fetch Inactive to match GFX9 behaviour */
+         encoding |= 1 << 18; /* set Fetch Inactive */
       encoding |= dpp.bound_ctrl << 19;
       encoding |= dpp.dpp_ctrl << 8;
       encoding |= reg(ctx, dpp_op, 8);
diff --git a/src/amd/compiler/aco_optimizer_postRA.cpp b/src/amd/compiler/aco_optimizer_postRA.cpp
index 2b2b01236c9..4ef73792fd3 100644
--- a/src/amd/compiler/aco_optimizer_postRA.cpp
+++ b/src/amd/compiler/aco_optimizer_postRA.cpp
@@ -507,6 +507,11 @@ try_combine_dpp(pr_opt_ctx& ctx, aco_ptr<Instruction>& instr)
       if (is_overwritten_since(ctx, mov->operands[0], op_instr_idx))
          continue;
 
+      /* GFX8/9 don't have fetch-inactive. */
+      if (ctx.program->gfx_level < GFX10 &&
+          is_overwritten_since(ctx, Operand(exec, ctx.program->lane_mask), op_instr_idx))
+         continue;
+
       /* We won't eliminate the DPP mov if the operand is used twice */
       bool op_used_twice = false;
       for (unsigned j = 0; j < instr->operands.size(); j++)
diff --git a/src/amd/compiler/tests/test_optimizer_postRA.cpp b/src/amd/compiler/tests/test_optimizer_postRA.cpp
index 8913397cead..abc59165366 100644
--- a/src/amd/compiler/tests/test_optimizer_postRA.cpp
+++ b/src/amd/compiler/tests/test_optimizer_postRA.cpp
@@ -485,6 +485,34 @@ BEGIN_TEST(optimizer_postRA.dpp)
    finish_optimizer_postRA_test();
 END_TEST
 
+BEGIN_TEST(optimizer_postRA.dpp_across_exec)
+   for (amd_gfx_level gfx : {GFX9, GFX10}) {
+      //>> v1: %a:v[0], v1: %b:v[1] = p_startpgm
+      if (!setup_cs("v1 v1", gfx))
+         continue;
+
+      bld.instructions->at(0)->definitions[0].setFixed(PhysReg(256));
+      bld.instructions->at(0)->definitions[1].setFixed(PhysReg(257));
+
+      PhysReg reg_v2(258);
+      Operand a(inputs[0], PhysReg(256));
+      Operand b(inputs[1], PhysReg(257));
+
+      //~gfx9! v1: %tmp0:v[2] = v_mov_b32 %a:v[0] row_mirror bound_ctrl:1
+      //! s2: %0:exec, s1: %0:scc = s_not_b64 %0:exec
+      //~gfx9! v1: %res0:v[2] = v_add_f32 %tmp0:v[2], %b:v[1]
+      //~gfx10! v1: %res0:v[2] = v_add_f32 %a:v[0], %b:v[1] row_mirror bound_ctrl:1
+      //! p_unit_test 0, %res0:v[2]
+      Temp tmp0 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1, reg_v2), a, dpp_row_mirror);
+      bld.sop1(Builder::s_not, Definition(exec, bld.lm), Definition(scc, s1),
+               Operand(exec, bld.lm));
+      Temp res0 = bld.vop2(aco_opcode::v_add_f32, bld.def(v1, reg_v2), Operand(tmp0, reg_v2), b);
+      writeout(0, Operand(res0, reg_v2));
+
+      finish_optimizer_postRA_test();
+   }
+END_TEST
+
 BEGIN_TEST(optimizer_postRA.dpp_across_cf)
    //>> v1: %a:v[0], v1: %b:v[1], v1: %c:v[2], v1: %d:v[3], s2: %e:s[0-1] = p_startpgm
    if (!setup_cs("v1 v1 v1 v1 s2", GFX10_3))
```