summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorRhys Perry <pendingchaos02@gmail.com>2020-08-04 16:06:56 +0100
committerMarge Bot <eric+marge@anholt.net>2020-08-26 13:26:58 +0000
commitfe2dc41258b1d868b59c44256ef8397f301ed7da (patch)
tree9e7e3f382fe5caaab564998005b09803b456e824
parent156fd58cdacb28a7fca88fc9ffc84c3cdfbbb8f3 (diff)
aco: create long jumps
When the branch offset can't be encoded, we have to use s_setpc_b64. Fixes hang in RPCS3 vertex ubershader. Signed-off-by: Rhys Perry <pendingchaos02@gmail.com> Reviewed-by: Daniel Schürmann <daniel@schuermann.dev> Closes: https://gitlab.freedesktop.org/mesa/mesa/-/issues/3231 Cc: 20.2 <mesa-stable> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/6212>
-rw-r--r--src/amd/compiler/aco_assembler.cpp152
-rw-r--r--src/amd/compiler/aco_builder_h.py2
2 files changed, 126 insertions, 28 deletions
diff --git a/src/amd/compiler/aco_assembler.cpp b/src/amd/compiler/aco_assembler.cpp
index b056cd507b6..17a51cd7478 100644
--- a/src/amd/compiler/aco_assembler.cpp
+++ b/src/amd/compiler/aco_assembler.cpp
@@ -2,6 +2,7 @@
#include <algorithm>
#include "aco_ir.h"
+#include "aco_builder.h"
#include "common/sid.h"
#include "ac_shader_util.h"
#include "util/u_math.h"
@@ -157,8 +158,10 @@ void emit_instruction(asm_context& ctx, std::vector<uint32_t>& out, Instruction*
uint32_t encoding = (0b101111111 << 23);
encoding |= opcode << 16;
encoding |= (uint16_t) sopp->imm;
- if (sopp->block != -1)
+ if (sopp->block != -1) {
+ sopp->pass_flags = 0;
ctx.branches.emplace_back(out.size(), sopp);
+ }
out.push_back(encoding);
break;
}
@@ -752,6 +755,36 @@ void fix_exports(asm_context& ctx, std::vector<uint32_t>& out, Program* program)
}
}
+static void insert_code(asm_context& ctx, std::vector<uint32_t>& out, unsigned insert_before,
+ unsigned insert_count, const uint32_t *insert_data)
+{
+ out.insert(out.begin() + insert_before, insert_data, insert_data + insert_count);
+
+ /* Update the offset of each affected block */
+ for (Block& block : ctx.program->blocks) {
+ if (block.offset >= insert_before)
+ block.offset += insert_count;
+ }
+
+ /* Find first branch after the inserted code */
+ auto branch_it = std::find_if(ctx.branches.begin(), ctx.branches.end(), [insert_before](const auto &branch) -> bool {
+ return (unsigned)branch.first >= insert_before;
+ });
+
+ /* Update the locations of branches */
+ for (; branch_it != ctx.branches.end(); ++branch_it)
+ branch_it->first += insert_count;
+
+ /* Find first constant address after the inserted code */
+ auto caddr_it = std::find_if(ctx.constaddrs.begin(), ctx.constaddrs.end(), [insert_before](const int &caddr_pos) -> bool {
+ return (unsigned)caddr_pos >= insert_before;
+ });
+
+ /* Update the locations of constant addresses */
+ for (; caddr_it != ctx.constaddrs.end(); ++caddr_it)
+ (*caddr_it) += insert_count;
+}
+
static void fix_branches_gfx10(asm_context& ctx, std::vector<uint32_t>& out)
{
/* Branches with an offset of 0x3f are buggy on GFX10, we workaround by inserting NOPs if needed. */
@@ -767,42 +800,107 @@ static void fix_branches_gfx10(asm_context& ctx, std::vector<uint32_t>& out)
if (gfx10_3f_bug) {
/* Insert an s_nop after the branch */
constexpr uint32_t s_nop_0 = 0xbf800000u;
- int s_nop_pos = buggy_branch_it->first + 1;
- auto out_pos = std::next(out.begin(), s_nop_pos);
- out.insert(out_pos, s_nop_0);
-
- /* Update the offset of each affected block */
- for (Block& block : ctx.program->blocks) {
- if (block.offset > (unsigned)buggy_branch_it->first)
- block.offset++;
- }
+ insert_code(ctx, out, buggy_branch_it->first + 1, 1, &s_nop_0);
+ }
+ } while (gfx10_3f_bug);
+}
- /* Update the branches following the current one */
- for (auto branch_it = std::next(buggy_branch_it); branch_it != ctx.branches.end(); ++branch_it)
- branch_it->first++;
+void emit_long_jump(asm_context& ctx, SOPP_instruction *branch, bool backwards, std::vector<uint32_t>& out)
+{
+ Builder bld(ctx.program);
- /* Find first constant address after the inserted instruction */
- auto caddr_it = std::find_if(ctx.constaddrs.begin(), ctx.constaddrs.end(), [s_nop_pos](const int &caddr_pos) -> bool {
- return caddr_pos >= s_nop_pos;
- });
+ Definition def_tmp_lo(branch->definitions[0].physReg(), s1);
+ Operand op_tmp_lo(branch->definitions[0].physReg(), s1);
+ Definition def_tmp_hi(branch->definitions[0].physReg().advance(4), s1);
+ Operand op_tmp_hi(branch->definitions[0].physReg().advance(4), s1);
- /* Update the locations of constant addresses */
- for (; caddr_it != ctx.constaddrs.end(); ++caddr_it)
- (*caddr_it)++;
+ aco_ptr<Instruction> instr;
+ if (branch->opcode != aco_opcode::s_branch) {
+ /* for conditional branches, skip the long jump if the condition is false */
+ aco_opcode inv;
+ switch (branch->opcode) {
+ case aco_opcode::s_cbranch_scc0:
+ inv = aco_opcode::s_cbranch_scc1;
+ break;
+ case aco_opcode::s_cbranch_scc1:
+ inv = aco_opcode::s_cbranch_scc0;
+ break;
+ case aco_opcode::s_cbranch_vccz:
+ inv = aco_opcode::s_cbranch_vccnz;
+ break;
+ case aco_opcode::s_cbranch_vccnz:
+ inv = aco_opcode::s_cbranch_vccz;
+ break;
+ case aco_opcode::s_cbranch_execz:
+ inv = aco_opcode::s_cbranch_execnz;
+ break;
+ case aco_opcode::s_cbranch_execnz:
+ inv = aco_opcode::s_cbranch_execz;
+ break;
+ default:
+ unreachable("Unhandled long jump.");
}
- } while (gfx10_3f_bug);
+ instr.reset(bld.sopp(inv, -1, 7));
+ emit_instruction(ctx, out, instr.get());
+ }
+
+ /* create the new PC and stash SCC in the LSB */
+ instr.reset(bld.sop1(aco_opcode::s_getpc_b64, branch->definitions[0]).instr);
+ emit_instruction(ctx, out, instr.get());
+
+ instr.reset(bld.sop2(aco_opcode::s_addc_u32, def_tmp_lo, op_tmp_lo, Operand(0u)).instr);
+ instr->operands[1].setFixed(PhysReg{255}); /* this operand has to be a literal */
+ emit_instruction(ctx, out, instr.get());
+ branch->pass_flags = out.size();
+
+ instr.reset(bld.sop2(aco_opcode::s_addc_u32, def_tmp_hi, op_tmp_hi, Operand(backwards ? UINT32_MAX : 0u)).instr);
+ emit_instruction(ctx, out, instr.get());
+
+ /* restore SCC and clear the LSB of the new PC */
+ instr.reset(bld.sopc(aco_opcode::s_bitcmp1_b32, def_tmp_lo, op_tmp_lo, Operand(0u)).instr);
+ emit_instruction(ctx, out, instr.get());
+ instr.reset(bld.sop1(aco_opcode::s_bitset0_b32, def_tmp_lo, Operand(0u)).instr);
+ emit_instruction(ctx, out, instr.get());
+
+ /* create the s_setpc_b64 to jump */
+ instr.reset(bld.sop1(aco_opcode::s_setpc_b64, Operand(branch->definitions[0].physReg(), s2)).instr);
+ emit_instruction(ctx, out, instr.get());
}
void fix_branches(asm_context& ctx, std::vector<uint32_t>& out)
{
- if (ctx.chip_class == GFX10)
- fix_branches_gfx10(ctx, out);
+ bool repeat = false;
+ do {
+ repeat = false;
- for (std::pair<int, SOPP_instruction*> &branch : ctx.branches) {
- int offset = (int)ctx.program->blocks[branch.second->block].offset - branch.first - 1;
- out[branch.first] |= (uint16_t) offset;
- }
+ if (ctx.chip_class == GFX10)
+ fix_branches_gfx10(ctx, out);
+
+ for (std::pair<int, SOPP_instruction*> &branch : ctx.branches) {
+ int offset = (int)ctx.program->blocks[branch.second->block].offset - branch.first - 1;
+ if ((offset < INT16_MIN || offset > INT16_MAX) && !branch.second->pass_flags) {
+ std::vector<uint32_t> long_jump;
+ bool backwards = ctx.program->blocks[branch.second->block].offset < (unsigned)branch.first;
+ emit_long_jump(ctx, branch.second, backwards, long_jump);
+
+ out[branch.first] = long_jump[0];
+ insert_code(ctx, out, branch.first + 1, long_jump.size() - 1, long_jump.data() + 1);
+
+ repeat = true;
+ break;
+ }
+
+ if (branch.second->pass_flags) {
+ int after_getpc = branch.first + branch.second->pass_flags - 2;
+ offset = (int)ctx.program->blocks[branch.second->block].offset - after_getpc;
+ out[branch.first + branch.second->pass_flags - 1] = offset * 4;
+ } else {
+ out[branch.first] &= 0xffff0000u;
+ out[branch.first] |= (uint16_t) offset;
+ }
+ }
+ } while (repeat);
}
void fix_constaddrs(asm_context& ctx, std::vector<uint32_t>& out)
diff --git a/src/amd/compiler/aco_builder_h.py b/src/amd/compiler/aco_builder_h.py
index 432b39d7070..1e217dea11b 100644
--- a/src/amd/compiler/aco_builder_h.py
+++ b/src/amd/compiler/aco_builder_h.py
@@ -534,7 +534,7 @@ public:
<%
import itertools
formats = [("pseudo", [Format.PSEUDO], 'Pseudo_instruction', list(itertools.product(range(5), range(5))) + [(8, 1), (1, 8)]),
- ("sop1", [Format.SOP1], 'SOP1_instruction', [(1, 1), (2, 1), (3, 2)]),
+ ("sop1", [Format.SOP1], 'SOP1_instruction', [(0, 1), (1, 0), (1, 1), (2, 1), (3, 2)]),
("sop2", [Format.SOP2], 'SOP2_instruction', itertools.product([1, 2], [2, 3])),
("sopk", [Format.SOPK], 'SOPK_instruction', itertools.product([0, 1, 2], [0, 1])),
("sopp", [Format.SOPP], 'SOPP_instruction', itertools.product([0, 1], [0, 1])),