diff options
Diffstat (limited to 'src/amd/compiler/aco_register_allocation.cpp')
-rw-r--r-- | src/amd/compiler/aco_register_allocation.cpp | 2011 |
1 files changed, 1282 insertions, 729 deletions
diff --git a/src/amd/compiler/aco_register_allocation.cpp b/src/amd/compiler/aco_register_allocation.cpp index d474dfe463a..47fefded1e5 100644 --- a/src/amd/compiler/aco_register_allocation.cpp +++ b/src/amd/compiler/aco_register_allocation.cpp @@ -1,33 +1,19 @@ /* * Copyright © 2018 Valve Corporation * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. 
- * + * SPDX-License-Identifier: MIT */ #include "aco_ir.h" +#include "util/bitset.h" +#include "util/enum_operators.h" + #include <algorithm> #include <array> #include <bitset> #include <map> +#include <optional> #include <set> #include <unordered_map> #include <vector> @@ -37,20 +23,35 @@ namespace { struct ra_ctx; -unsigned get_subdword_operand_stride(chip_class chip, const aco_ptr<Instruction>& instr, +unsigned get_subdword_operand_stride(amd_gfx_level gfx_level, const aco_ptr<Instruction>& instr, unsigned idx, RegClass rc); void add_subdword_operand(ra_ctx& ctx, aco_ptr<Instruction>& instr, unsigned idx, unsigned byte, RegClass rc); std::pair<unsigned, unsigned> get_subdword_definition_info(Program* program, const aco_ptr<Instruction>& instr, RegClass rc); -void add_subdword_definition(Program* program, aco_ptr<Instruction>& instr, PhysReg reg); +void add_subdword_definition(Program* program, aco_ptr<Instruction>& instr, PhysReg reg, + bool allow_16bit_write); struct assignment { PhysReg reg; RegClass rc; - uint8_t assigned = 0; + union { + struct { + bool assigned : 1; + bool vcc : 1; + bool m0 : 1; + }; + uint8_t _ = 0; + }; + uint32_t affinity = 0; assignment() = default; - assignment(PhysReg reg_, RegClass rc_) : reg(reg_), rc(rc_), assigned(-1) {} + assignment(PhysReg reg_, RegClass rc_) : reg(reg_), rc(rc_) { assigned = true; } + void set(const Definition& def) + { + assigned = true; + reg = def.physReg(); + rc = def.regClass(); + } }; struct ra_ctx { @@ -61,16 +62,19 @@ struct ra_ctx { std::vector<std::unordered_map<unsigned, Temp>> renames; std::vector<uint32_t> loop_header; std::unordered_map<unsigned, Temp> orig_names; - std::unordered_map<unsigned, unsigned> affinities; std::unordered_map<unsigned, Instruction*> vectors; std::unordered_map<unsigned, Instruction*> split_vectors; aco_ptr<Instruction> pseudo_dummy; + aco_ptr<Instruction> phi_dummy; uint16_t max_used_sgpr = 0; uint16_t max_used_vgpr = 0; uint16_t sgpr_limit; uint16_t vgpr_limit; 
std::bitset<512> war_hint; - std::bitset<64> defs_done; /* see MAX_ARGS in aco_instruction_selection_setup.cpp */ + + uint16_t sgpr_bounds; + uint16_t vgpr_bounds; + uint16_t num_linear_vgprs; ra_test_policy policy; @@ -78,10 +82,14 @@ struct ra_ctx { : program(program_), assignments(program->peekAllocationId()), renames(program->blocks.size()), policy(policy_) { - pseudo_dummy.reset( - create_instruction<Instruction>(aco_opcode::p_parallelcopy, Format::PSEUDO, 0, 0)); + pseudo_dummy.reset(create_instruction(aco_opcode::p_parallelcopy, Format::PSEUDO, 0, 0)); + phi_dummy.reset(create_instruction(aco_opcode::p_linear_phi, Format::PSEUDO, 0, 0)); sgpr_limit = get_addr_sgpr_from_waves(program, program->min_waves); - vgpr_limit = get_addr_sgpr_from_waves(program, program->min_waves); + vgpr_limit = get_addr_vgpr_from_waves(program, program->min_waves); + + sgpr_bounds = program->max_reg_demand.sgpr; + vgpr_bounds = program->max_reg_demand.vgpr; + num_linear_vgprs = 0; } }; @@ -153,7 +161,7 @@ struct PhysRegInterval { bool intersects(const PhysRegInterval& a, const PhysRegInterval& b) { - return ((a.lo() >= b.lo() && a.lo() < b.hi()) || (a.hi() > b.lo() && a.hi() <= b.hi())); + return a.hi() > b.lo() && b.hi() > a.lo(); } /* Gets the stride for full (non-subdword) registers */ @@ -175,15 +183,24 @@ get_stride(RegClass rc) } PhysRegInterval -get_reg_bounds(Program* program, RegType type) +get_reg_bounds(ra_ctx& ctx, RegType type, bool linear_vgpr) { - if (type == RegType::vgpr) { - return {PhysReg{256}, (unsigned)program->max_reg_demand.vgpr}; + uint16_t linear_vgpr_start = ctx.vgpr_bounds - ctx.num_linear_vgprs; + if (type == RegType::vgpr && linear_vgpr) { + return PhysRegInterval{PhysReg(256 + linear_vgpr_start), ctx.num_linear_vgprs}; + } else if (type == RegType::vgpr) { + return PhysRegInterval{PhysReg(256), linear_vgpr_start}; } else { - return {PhysReg{0}, (unsigned)program->max_reg_demand.sgpr}; + return PhysRegInterval{PhysReg(0), ctx.sgpr_bounds}; } } 
+PhysRegInterval +get_reg_bounds(ra_ctx& ctx, RegClass rc) +{ + return get_reg_bounds(ctx, rc.type(), rc.is_linear_vgpr()); +} + struct DefInfo { PhysRegInterval bounds; uint8_t size; @@ -195,11 +212,11 @@ struct DefInfo { size = rc.size(); stride = get_stride(rc); - bounds = get_reg_bounds(ctx.program, rc.type()); + bounds = get_reg_bounds(ctx, rc); if (rc.is_subdword() && operand >= 0) { /* stride in bytes */ - stride = get_subdword_operand_stride(ctx.program->chip_class, instr, operand, rc); + stride = get_subdword_operand_stride(ctx.program->gfx_level, instr, operand, rc); } else if (rc.is_subdword()) { std::pair<unsigned, unsigned> info = get_subdword_definition_info(ctx.program, instr, rc); stride = info.first; @@ -214,6 +231,20 @@ struct DefInfo { stride = DIV_ROUND_UP(stride, 4); } assert(stride > 0); + } else if (instr->isMIMG() && instr->mimg().d16 && ctx.program->gfx_level <= GFX9) { + /* Workaround GFX9 hardware bug for D16 image instructions: FeatureImageGather4D16Bug + * + * The register use is not calculated correctly, and the hardware assumes a + * full dword per component. Don't use the last registers of the register file. + * Otherwise, the instruction will be skipped. 
+ * + * https://reviews.llvm.org/D81172 + */ + bool imageGather4D16Bug = operand == -1 && rc == v2 && instr->mimg().dmask != 0xF; + assert(ctx.program->gfx_level == GFX9 && "Image D16 on GFX8 not supported."); + + if (imageGather4D16Bug) + bounds.size -= MAX2(rc.bytes() / 4 - ctx.num_linear_vgprs, 0); } } }; @@ -229,7 +260,7 @@ public: uint32_t& operator[](PhysReg index) { return regs[index]; } - unsigned count_zero(PhysRegInterval reg_interval) + unsigned count_zero(PhysRegInterval reg_interval) const { unsigned res = 0; for (PhysReg reg : reg_interval) @@ -238,16 +269,17 @@ public: } /* Returns true if any of the bytes in the given range are allocated or blocked */ - bool test(PhysReg start, unsigned num_bytes) + bool test(PhysReg start, unsigned num_bytes) const { for (PhysReg i = start; i.reg_b < start.reg_b + num_bytes; i = PhysReg(i + 1)) { assert(i <= 511); if (regs[i] & 0x0FFFFFFF) return true; if (regs[i] == 0xF0000000) { - assert(subdword_regs.find(i) != subdword_regs.end()); + auto it = subdword_regs.find(i); + assert(it != subdword_regs.end()); for (unsigned j = i.byte(); i * 4 + j < start.reg_b + num_bytes && j < 4; j++) { - if (subdword_regs[i][j]) + if (it->second[j]) return true; } } @@ -263,24 +295,28 @@ public: fill(start, rc.size(), 0xFFFFFFFF); } - bool is_blocked(PhysReg start) + bool is_blocked(PhysReg start) const { if (regs[start] == 0xFFFFFFFF) return true; if (regs[start] == 0xF0000000) { + auto it = subdword_regs.find(start); + assert(it != subdword_regs.end()); for (unsigned i = start.byte(); i < 4; i++) - if (subdword_regs[start][i] == 0xFFFFFFFF) + if (it->second[i] == 0xFFFFFFFF) return true; } return false; } - bool is_empty_or_blocked(PhysReg start) + bool is_empty_or_blocked(PhysReg start) const { /* Empty is 0, blocked is 0xFFFFFFFF, so to check both we compare the * incremented value to 1 */ if (regs[start] == 0xF0000000) { - return subdword_regs[start][start.byte()] + 1 <= 1; + auto it = subdword_regs.find(start); + assert(it != 
subdword_regs.end()); + return it->second[start.byte()] + 1 <= 1; } return regs[start] + 1 <= 1; } @@ -313,9 +349,9 @@ public: void clear(Definition def) { clear(def.physReg(), def.regClass()); } - unsigned get_id(PhysReg reg) + unsigned get_id(PhysReg reg) const { - return regs[reg] == 0xF0000000 ? subdword_regs[reg][reg.byte()] : regs[reg]; + return regs[reg] == 0xF0000000 ? subdword_regs.at(reg)[reg.byte()] : regs[reg]; } private: @@ -343,24 +379,24 @@ private: } }; -std::set<std::pair<unsigned, unsigned>> find_vars(ra_ctx& ctx, RegisterFile& reg_file, - const PhysRegInterval reg_interval); +std::vector<unsigned> find_vars(ra_ctx& ctx, const RegisterFile& reg_file, + const PhysRegInterval reg_interval); /* helper function for debugging */ UNUSED void print_reg(const RegisterFile& reg_file, PhysReg reg, bool has_adjacent_variable) { if (reg_file[reg] == 0xFFFFFFFF) { - printf("☐"); + printf((const char*)u8"☐"); } else if (reg_file[reg]) { const bool show_subdword_alloc = (reg_file[reg] == 0xF0000000); if (show_subdword_alloc) { - const char* block_chars[] = { + auto block_chars = { // clang-format off - "?", "▘", "▝", "▀", - "▖", "▌", "▞", "▛", - "▗", "▚", "▐", "▜", - "▄", "▙", "▟", "▉" + u8"?", u8"▘", u8"▝", u8"▀", + u8"▖", u8"▌", u8"▞", u8"▛", + u8"▗", u8"▚", u8"▐", u8"▜", + u8"▄", u8"▙", u8"▟", u8"▉" // clang-format on }; unsigned index = 0; @@ -369,27 +405,26 @@ print_reg(const RegisterFile& reg_file, PhysReg reg, bool has_adjacent_variable) index |= 1 << i; } } - printf("%s", block_chars[index]); + printf("%s", (const char*)(block_chars.begin()[index])); } else { /* Indicate filled register slot */ if (!has_adjacent_variable) { - printf("█"); + printf((const char*)u8"█"); } else { /* Use a slightly shorter box to leave a small gap between adjacent variables */ - printf("▉"); + printf((const char*)u8"▉"); } } } else { - printf("·"); + printf((const char*)u8"·"); } } /* helper function for debugging */ UNUSED void -print_regs(ra_ctx& ctx, bool vgprs, 
RegisterFile& reg_file) +print_regs(ra_ctx& ctx, PhysRegInterval regs, const RegisterFile& reg_file) { - PhysRegInterval regs = get_reg_bounds(ctx.program, vgprs ? RegType::vgpr : RegType::sgpr); - char reg_char = vgprs ? 'v' : 's'; + char reg_char = regs.lo().reg() >= 256 ? 'v' : 's'; const int max_regs_per_line = 64; /* print markers */ @@ -428,11 +463,11 @@ print_regs(ra_ctx& ctx, bool vgprs, RegisterFile& reg_file) printf("%u/%u used, %u/%u free\n", regs.size - free_regs, regs.size, free_regs, regs.size); /* print assignments ordered by registers */ - std::map<PhysReg, std::pair<unsigned, unsigned>> - regs_to_vars; /* maps to byte size and temp id */ - for (const auto& size_id : find_vars(ctx, reg_file, regs)) { - auto reg = ctx.assignments[size_id.second].reg; - ASSERTED auto inserted = regs_to_vars.emplace(reg, size_id); + std::map<PhysReg, std::pair<unsigned, unsigned>> regs_to_vars; /* maps to byte size and temp id */ + for (unsigned id : find_vars(ctx, reg_file, regs)) { + const assignment& var = ctx.assignments[id]; + PhysReg reg = var.reg; + ASSERTED auto inserted = regs_to_vars.emplace(reg, std::make_pair(var.rc.bytes(), id)); assert(inserted.second); } @@ -445,11 +480,11 @@ print_regs(ra_ctx& ctx, bool vgprs, RegisterFile& reg_file) ctx.orig_names[size_id.second].id() != size_id.second) { printf("(was %%%d) ", ctx.orig_names[size_id.second].id()); } - printf("= %c[%d", reg_char, first_reg.reg() - regs.lo()); + printf("= %c[%d", reg_char, first_reg.reg() % 256); PhysReg last_reg = first_reg.advance(size_id.first - 1); if (first_reg.reg() != last_reg.reg()) { assert(first_reg.byte() == 0 && last_reg.byte() == 3); - printf("-%d", last_reg.reg() - regs.lo()); + printf("-%d", last_reg.reg() % 256); } printf("]"); if (first_reg.byte() != 0 || last_reg.byte() != 3) { @@ -460,14 +495,14 @@ print_regs(ra_ctx& ctx, bool vgprs, RegisterFile& reg_file) } unsigned -get_subdword_operand_stride(chip_class chip, const aco_ptr<Instruction>& instr, unsigned idx, - 
RegClass rc) +get_subdword_operand_stride(amd_gfx_level gfx_level, const aco_ptr<Instruction>& instr, + unsigned idx, RegClass rc) { if (instr->isPseudo()) { /* v_readfirstlane_b32 cannot use SDWA */ if (instr->opcode == aco_opcode::p_as_uniform) return 4; - else if (chip >= GFX8) + else if (gfx_level >= GFX8) return rc.bytes() % 2 == 0 ? 2 : 1; else return 4; @@ -475,26 +510,27 @@ get_subdword_operand_stride(chip_class chip, const aco_ptr<Instruction>& instr, assert(rc.bytes() <= 2); if (instr->isVALU()) { - if (can_use_SDWA(chip, instr, false)) + if (can_use_SDWA(gfx_level, instr, false)) return rc.bytes(); - if (can_use_opsel(chip, instr->opcode, idx, true)) + if (can_use_opsel(gfx_level, instr->opcode, idx)) return 2; - if (instr->format == Format::VOP3P) + if (instr->isVOP3P()) return 2; } switch (instr->opcode) { case aco_opcode::v_cvt_f32_ubyte0: return 1; case aco_opcode::ds_write_b8: - case aco_opcode::ds_write_b16: return chip >= GFX9 ? 2 : 4; + case aco_opcode::ds_write_b16: return gfx_level >= GFX9 ? 2 : 4; case aco_opcode::buffer_store_byte: case aco_opcode::buffer_store_short: + case aco_opcode::buffer_store_format_d16_x: case aco_opcode::flat_store_byte: case aco_opcode::flat_store_short: case aco_opcode::scratch_store_byte: case aco_opcode::scratch_store_short: case aco_opcode::global_store_byte: - case aco_opcode::global_store_short: return chip >= GFX9 ? 2 : 4; + case aco_opcode::global_store_short: return gfx_level >= GFX9 ? 
2 : 4; default: return 4; } } @@ -503,24 +539,12 @@ void add_subdword_operand(ra_ctx& ctx, aco_ptr<Instruction>& instr, unsigned idx, unsigned byte, RegClass rc) { - chip_class chip = ctx.program->chip_class; + amd_gfx_level gfx_level = ctx.program->gfx_level; if (instr->isPseudo() || byte == 0) return; assert(rc.bytes() <= 2); if (instr->isVALU()) { - /* check if we can use opsel */ - if (instr->format == Format::VOP3) { - assert(byte == 2); - instr->vop3().opsel |= 1 << idx; - return; - } - if (instr->isVOP3P()) { - assert(byte == 2 && !(instr->vop3p().opsel_lo & (1 << idx))); - instr->vop3p().opsel_lo |= 1 << idx; - instr->vop3p().opsel_hi |= 1 << idx; - return; - } if (instr->opcode == aco_opcode::v_cvt_f32_ubyte0) { switch (byte) { case 0: instr->opcode = aco_opcode::v_cvt_f32_ubyte0; break; @@ -532,8 +556,21 @@ add_subdword_operand(ra_ctx& ctx, aco_ptr<Instruction>& instr, unsigned idx, uns } /* use SDWA */ - assert(can_use_SDWA(chip, instr, false)); - convert_to_SDWA(chip, instr); + if (can_use_SDWA(gfx_level, instr, false)) { + convert_to_SDWA(gfx_level, instr); + return; + } + + /* use opsel */ + if (instr->isVOP3P()) { + assert(byte == 2 && !instr->valu().opsel_lo[idx]); + instr->valu().opsel_lo[idx] = true; + instr->valu().opsel_hi[idx] = true; + return; + } + + assert(can_use_opsel(gfx_level, instr->opcode, idx)); + instr->valu().opsel[idx] = true; return; } @@ -546,6 +583,8 @@ add_subdword_operand(ra_ctx& ctx, aco_ptr<Instruction>& instr, unsigned idx, uns instr->opcode = aco_opcode::buffer_store_byte_d16_hi; else if (instr->opcode == aco_opcode::buffer_store_short) instr->opcode = aco_opcode::buffer_store_short_d16_hi; + else if (instr->opcode == aco_opcode::buffer_store_format_d16_x) + instr->opcode = aco_opcode::buffer_store_format_d16_hi_x; else if (instr->opcode == aco_opcode::flat_store_byte) instr->opcode = aco_opcode::flat_store_byte_d16_hi; else if (instr->opcode == aco_opcode::flat_store_short) @@ -567,34 +606,38 @@ 
add_subdword_operand(ra_ctx& ctx, aco_ptr<Instruction>& instr, unsigned idx, uns std::pair<unsigned, unsigned> get_subdword_definition_info(Program* program, const aco_ptr<Instruction>& instr, RegClass rc) { - chip_class chip = program->chip_class; + amd_gfx_level gfx_level = program->gfx_level; if (instr->isPseudo()) { - if (chip >= GFX8) + if (instr->opcode == aco_opcode::p_interp_gfx11) + return std::make_pair(4u, 4u); + else if (gfx_level >= GFX8) return std::make_pair(rc.bytes() % 2 == 0 ? 2 : 1, rc.bytes()); else return std::make_pair(4, rc.size() * 4u); } - if (instr->isVALU() || instr->isVINTRP()) { + if (instr->isVALU()) { assert(rc.bytes() <= 2); - if (can_use_SDWA(chip, instr, false)) + if (can_use_SDWA(gfx_level, instr, false)) return std::make_pair(rc.bytes(), rc.bytes()); unsigned bytes_written = 4u; - if (instr_is_16bit(chip, instr->opcode)) + if (instr_is_16bit(gfx_level, instr->opcode)) bytes_written = 2u; unsigned stride = 4u; if (instr->opcode == aco_opcode::v_fma_mixlo_f16 || - can_use_opsel(chip, instr->opcode, -1, true)) + can_use_opsel(gfx_level, instr->opcode, -1)) stride = 2u; return std::make_pair(stride, bytes_written); } switch (instr->opcode) { + case aco_opcode::v_interp_p2_f16: return std::make_pair(2u, 2u); + /* D16 loads with _hi version */ case aco_opcode::ds_read_u8_d16: case aco_opcode::ds_read_i8_d16: case aco_opcode::ds_read_u16_d16: @@ -609,58 +652,80 @@ get_subdword_definition_info(Program* program, const aco_ptr<Instruction>& instr case aco_opcode::scratch_load_short_d16: case aco_opcode::buffer_load_ubyte_d16: case aco_opcode::buffer_load_sbyte_d16: - case aco_opcode::buffer_load_short_d16: { - assert(chip >= GFX9); + case aco_opcode::buffer_load_short_d16: + case aco_opcode::buffer_load_format_d16_x: { + assert(gfx_level >= GFX9); if (!program->dev.sram_ecc_enabled) return std::make_pair(2u, 2u); else return std::make_pair(2u, 4u); } + /* 3-component D16 loads */ + case aco_opcode::buffer_load_format_d16_xyz: + case 
aco_opcode::tbuffer_load_format_d16_xyz: { + assert(gfx_level >= GFX9); + if (!program->dev.sram_ecc_enabled) + return std::make_pair(4u, 6u); + break; + } - default: return std::make_pair(4, rc.size() * 4u); + default: break; } + + if (instr->isMIMG() && instr->mimg().d16 && !program->dev.sram_ecc_enabled) { + assert(gfx_level >= GFX9); + return std::make_pair(4u, rc.bytes()); + } + + return std::make_pair(4, rc.size() * 4u); } void -add_subdword_definition(Program* program, aco_ptr<Instruction>& instr, PhysReg reg) +add_subdword_definition(Program* program, aco_ptr<Instruction>& instr, PhysReg reg, + bool allow_16bit_write) { if (instr->isPseudo()) return; if (instr->isVALU()) { - chip_class chip = program->chip_class; + amd_gfx_level gfx_level = program->gfx_level; assert(instr->definitions[0].bytes() <= 2); - if (reg.byte() == 0 && instr_is_16bit(chip, instr->opcode)) + if (reg.byte() == 0 && allow_16bit_write && instr_is_16bit(gfx_level, instr->opcode)) return; - /* check if we can use opsel */ - if (instr->format == Format::VOP3) { - assert(reg.byte() == 2); - assert(can_use_opsel(chip, instr->opcode, -1, true)); - instr->vop3().opsel |= (1 << 3); /* dst in high half */ + /* use SDWA */ + if (can_use_SDWA(gfx_level, instr, false)) { + convert_to_SDWA(gfx_level, instr); return; } + assert(allow_16bit_write); + if (instr->opcode == aco_opcode::v_fma_mixlo_f16) { instr->opcode = aco_opcode::v_fma_mixhi_f16; return; } - /* use SDWA */ - assert(can_use_SDWA(chip, instr, false)); - convert_to_SDWA(chip, instr); + /* use opsel */ + assert(reg.byte() == 2); + assert(can_use_opsel(gfx_level, instr->opcode, -1)); + instr->valu().opsel[3] = true; /* dst in high half */ return; } if (reg.byte() == 0) return; + else if (instr->opcode == aco_opcode::v_interp_p2_f16) + instr->opcode = aco_opcode::v_interp_p2_hi_f16; else if (instr->opcode == aco_opcode::buffer_load_ubyte_d16) instr->opcode = aco_opcode::buffer_load_ubyte_d16_hi; else if (instr->opcode == 
aco_opcode::buffer_load_sbyte_d16) instr->opcode = aco_opcode::buffer_load_sbyte_d16_hi; else if (instr->opcode == aco_opcode::buffer_load_short_d16) instr->opcode = aco_opcode::buffer_load_short_d16_hi; + else if (instr->opcode == aco_opcode::buffer_load_format_d16_x) + instr->opcode = aco_opcode::buffer_load_format_d16_hi_x; else if (instr->opcode == aco_opcode::flat_load_ubyte_d16) instr->opcode = aco_opcode::flat_load_ubyte_d16_hi; else if (instr->opcode == aco_opcode::flat_load_sbyte_d16) @@ -697,6 +762,7 @@ adjust_max_used_regs(ra_ctx& ctx, RegClass rc, unsigned reg) if (rc.type() == RegType::vgpr) { assert(reg >= 256); uint16_t hi = reg - 256 + size - 1; + assert(hi <= 255); ctx.max_used_vgpr = std::max(ctx.max_used_vgpr, hi); } else if (reg + rc.size() <= max_addressible_sgpr) { uint16_t hi = reg + size - 1; @@ -707,6 +773,7 @@ adjust_max_used_regs(ra_ctx& ctx, RegClass rc, unsigned reg) enum UpdateRenames { rename_not_killed_ops = 0x1, fill_killed_ops = 0x2, + rename_precolored_ops = 0x4, }; MESA_DEFINE_CPP_ENUM_BITFIELD_OPERATORS(UpdateRenames); @@ -779,28 +846,39 @@ update_renames(ra_ctx& ctx, RegisterFile& reg_file, assert(ctx.assignments.size() == ctx.program->peekAllocationId()); /* check if we moved an operand */ - bool first = true; + bool first[2] = {true, true}; bool fill = true; for (unsigned i = 0; i < instr->operands.size(); i++) { Operand& op = instr->operands[i]; if (!op.isTemp()) continue; if (op.tempId() == copy.first.tempId()) { - bool omit_renaming = !(flags & rename_not_killed_ops) && !op.isKillBeforeDef(); - for (std::pair<Operand, Definition>& pc : parallelcopies) { - PhysReg def_reg = pc.second.physReg(); - omit_renaming &= def_reg > copy.first.physReg() - ? 
(copy.first.physReg() + copy.first.size() <= def_reg.reg()) - : (def_reg + pc.second.size() <= copy.first.physReg().reg()); + /* only rename precolored operands if the copy-location matches */ + bool omit_renaming = (flags & rename_precolored_ops) && op.isFixed() && + op.physReg() != copy.second.physReg(); + + /* Omit renaming in some cases for p_create_vector in order to avoid + * unnecessary shuffle code. */ + if (!(flags & rename_not_killed_ops) && !op.isKillBeforeDef()) { + omit_renaming = true; + for (std::pair<Operand, Definition>& pc : parallelcopies) { + PhysReg def_reg = pc.second.physReg(); + omit_renaming &= def_reg > copy.first.physReg() + ? (copy.first.physReg() + copy.first.size() <= def_reg.reg()) + : (def_reg + pc.second.size() <= copy.first.physReg().reg()); + } } - if (omit_renaming) { - if (first) - op.setFirstKill(true); - else - op.setKill(true); - first = false; + + /* Fix the kill flags */ + if (first[omit_renaming]) + op.setFirstKill(omit_renaming || op.isKill()); + else + op.setKill(omit_renaming || op.isKill()); + first[omit_renaming] = false; + + if (omit_renaming) continue; - } + op.setTemp(copy.second.getTemp()); op.setFixed(copy.second.physReg()); @@ -815,8 +893,8 @@ update_renames(ra_ctx& ctx, RegisterFile& reg_file, } } -std::pair<PhysReg, bool> -get_reg_simple(ra_ctx& ctx, RegisterFile& reg_file, DefInfo info) +std::optional<PhysReg> +get_reg_simple(ra_ctx& ctx, const RegisterFile& reg_file, DefInfo info) { const PhysRegInterval& bounds = info.bounds; uint32_t size = info.size; @@ -829,8 +907,8 @@ get_reg_simple(ra_ctx& ctx, RegisterFile& reg_file, DefInfo info) if (size % new_stride) continue; new_info.stride = new_stride; - std::pair<PhysReg, bool> res = get_reg_simple(ctx, reg_file, new_info); - if (res.second) + std::optional<PhysReg> res = get_reg_simple(ctx, reg_file, new_info); + if (res) return res; } @@ -864,7 +942,7 @@ get_reg_simple(ra_ctx& ctx, RegisterFile& reg_file, DefInfo info) /* early return on exact matches */ if 
(size == gap.size) { adjust_max_used_regs(ctx, rc, gap.lo()); - return {gap.lo(), true}; + return gap.lo(); } /* check if it fits and the gap size is smaller */ @@ -877,7 +955,7 @@ get_reg_simple(ra_ctx& ctx, RegisterFile& reg_file, DefInfo info) } if (best_gap.size == UINT_MAX) - return {{}, false}; + return {}; /* find best position within gap by leaving a good stride for other variables*/ unsigned buffer = best_gap.size - size; @@ -889,7 +967,7 @@ get_reg_simple(ra_ctx& ctx, RegisterFile& reg_file, DefInfo info) } adjust_max_used_regs(ctx, rc, best_gap.lo()); - return {best_gap.lo(), true}; + return best_gap.lo(); } for (PhysRegInterval reg_win = {bounds.lo(), size}; reg_win.hi() <= bounds.hi(); @@ -901,7 +979,7 @@ get_reg_simple(ra_ctx& ctx, RegisterFile& reg_file, DefInfo info) bool is_valid = std::all_of(std::next(reg_win.begin()), reg_win.end(), is_free); if (is_valid) { adjust_max_used_regs(ctx, rc, reg_win.lo()); - return {reg_win.lo(), true}; + return reg_win.lo(); } } @@ -909,7 +987,8 @@ get_reg_simple(ra_ctx& ctx, RegisterFile& reg_file, DefInfo info) * larger instruction encodings or copies * TODO: don't do this in situations where it doesn't benefit */ if (rc.is_subdword()) { - for (std::pair<const uint32_t, std::array<uint32_t, 4>>& entry : reg_file.subdword_regs) { + for (const std::pair<const uint32_t, std::array<uint32_t, 4>>& entry : + reg_file.subdword_regs) { assert(reg_file[PhysReg{entry.first}] == 0xF0000000); if (!bounds.contains({PhysReg{entry.first}, rc.size()})) continue; @@ -928,119 +1007,172 @@ get_reg_simple(ra_ctx& ctx, RegisterFile& reg_file, DefInfo info) PhysReg res{entry.first}; res.reg_b += i; adjust_max_used_regs(ctx, rc, entry.first); - return {res, true}; + return res; } } } } - return {{}, false}; + return {}; } -/* collect variables from a register area and clear reg_file */ -std::set<std::pair<unsigned, unsigned>> -find_vars(ra_ctx& ctx, RegisterFile& reg_file, const PhysRegInterval reg_interval) +/* collect variables from 
a register area */ +std::vector<unsigned> +find_vars(ra_ctx& ctx, const RegisterFile& reg_file, const PhysRegInterval reg_interval) { - std::set<std::pair<unsigned, unsigned>> vars; + std::vector<unsigned> vars; for (PhysReg j : reg_interval) { if (reg_file.is_blocked(j)) continue; if (reg_file[j] == 0xF0000000) { for (unsigned k = 0; k < 4; k++) { - unsigned id = reg_file.subdword_regs[j][k]; - if (id) { - assignment& var = ctx.assignments[id]; - vars.emplace(var.rc.bytes(), id); - } + unsigned id = reg_file.subdword_regs.at(j)[k]; + if (id && (vars.empty() || id != vars.back())) + vars.emplace_back(id); } - } else if (reg_file[j] != 0) { + } else { unsigned id = reg_file[j]; - assignment& var = ctx.assignments[id]; - vars.emplace(var.rc.bytes(), id); + if (id && (vars.empty() || id != vars.back())) + vars.emplace_back(id); } } return vars; } -/* collect variables from a register area and clear reg_file */ -std::set<std::pair<unsigned, unsigned>> +/* collect variables from a register area and clear reg_file + * variables are sorted in decreasing size and + * increasing assigned register + */ +std::vector<unsigned> collect_vars(ra_ctx& ctx, RegisterFile& reg_file, const PhysRegInterval reg_interval) { - std::set<std::pair<unsigned, unsigned>> vars = find_vars(ctx, reg_file, reg_interval); - for (std::pair<unsigned, unsigned> size_id : vars) { - assignment& var = ctx.assignments[size_id.second]; + std::vector<unsigned> ids = find_vars(ctx, reg_file, reg_interval); + std::sort(ids.begin(), ids.end(), + [&](unsigned a, unsigned b) + { + assignment& var_a = ctx.assignments[a]; + assignment& var_b = ctx.assignments[b]; + return var_a.rc.bytes() > var_b.rc.bytes() || + (var_a.rc.bytes() == var_b.rc.bytes() && var_a.reg < var_b.reg); + }); + + for (unsigned id : ids) { + assignment& var = ctx.assignments[id]; reg_file.clear(var.reg, var.rc); } - return vars; + return ids; +} + +std::optional<PhysReg> +get_reg_for_create_vector_copy(ra_ctx& ctx, RegisterFile& reg_file, + 
std::vector<std::pair<Operand, Definition>>& parallelcopies, + aco_ptr<Instruction>& instr, const PhysRegInterval def_reg, + DefInfo info, unsigned id) +{ + PhysReg reg = def_reg.lo(); + /* dead operand: return position in vector */ + for (unsigned i = 0; i < instr->operands.size(); i++) { + if (instr->operands[i].isTemp() && instr->operands[i].tempId() == id && + instr->operands[i].isKillBeforeDef()) { + assert(!reg_file.test(reg, instr->operands[i].bytes())); + if (info.rc.is_subdword() || reg.byte() == 0) + return reg; + else + return {}; + } + reg.reg_b += instr->operands[i].bytes(); + } + + /* GFX9+ has a VGPR swap instruction. */ + if (ctx.program->gfx_level <= GFX8 || info.rc.type() == RegType::sgpr) + return {}; + + /* check if the previous position was in vector */ + assignment& var = ctx.assignments[id]; + if (def_reg.contains(PhysRegInterval{var.reg, info.size})) { + reg = def_reg.lo(); + /* try to use the previous register of the operand */ + for (unsigned i = 0; i < instr->operands.size(); i++) { + if (reg != var.reg) { + reg.reg_b += instr->operands[i].bytes(); + continue; + } + + /* check if we can swap positions */ + if (instr->operands[i].isTemp() && instr->operands[i].isFirstKill() && + instr->operands[i].regClass() == info.rc) { + assignment& op = ctx.assignments[instr->operands[i].tempId()]; + /* if everything matches, create parallelcopy for the killed operand */ + if (!intersects(def_reg, PhysRegInterval{op.reg, op.rc.size()}) && op.reg != scc && + reg_file.get_id(op.reg) == instr->operands[i].tempId()) { + Definition pc_def = Definition(reg, info.rc); + parallelcopies.emplace_back(instr->operands[i], pc_def); + return op.reg; + } + } + return {}; + } + } + return {}; } bool get_regs_for_copies(ra_ctx& ctx, RegisterFile& reg_file, std::vector<std::pair<Operand, Definition>>& parallelcopies, - const std::set<std::pair<unsigned, unsigned>>& vars, - const PhysRegInterval bounds, aco_ptr<Instruction>& instr, + const std::vector<unsigned>& vars, 
aco_ptr<Instruction>& instr, const PhysRegInterval def_reg) { - /* variables are sorted from small sized to large */ - /* NOTE: variables are also sorted by ID. this only affects a very small number of shaders - * slightly though. */ - for (std::set<std::pair<unsigned, unsigned>>::const_reverse_iterator it = vars.rbegin(); - it != vars.rend(); ++it) { - unsigned id = it->second; + /* Variables are sorted from large to small and with increasing assigned register */ + for (unsigned id : vars) { assignment& var = ctx.assignments[id]; + PhysRegInterval bounds = get_reg_bounds(ctx, var.rc); DefInfo info = DefInfo(ctx, ctx.pseudo_dummy, var.rc, -1); uint32_t size = info.size; /* check if this is a dead operand, then we can re-use the space from the definition * also use the correct stride for sub-dword operands */ bool is_dead_operand = false; - for (unsigned i = 0; !is_phi(instr) && i < instr->operands.size(); i++) { - if (instr->operands[i].isTemp() && instr->operands[i].tempId() == id) { - if (instr->operands[i].isKillBeforeDef()) - is_dead_operand = true; - info = DefInfo(ctx, instr, var.rc, i); - break; - } - } - - std::pair<PhysReg, bool> res; - if (is_dead_operand) { - if (instr->opcode == aco_opcode::p_create_vector) { - PhysReg reg(def_reg.lo()); - for (unsigned i = 0; i < instr->operands.size(); i++) { - if (instr->operands[i].isTemp() && instr->operands[i].tempId() == id) { - res = {reg, (!var.rc.is_subdword() || (reg.byte() % info.stride == 0)) && - !reg_file.test(reg, var.rc.bytes())}; - break; + std::optional<PhysReg> res; + if (instr->opcode == aco_opcode::p_create_vector) { + res = + get_reg_for_create_vector_copy(ctx, reg_file, parallelcopies, instr, def_reg, info, id); + } else { + for (unsigned i = 0; !is_phi(instr) && i < instr->operands.size(); i++) { + if (instr->operands[i].isTemp() && instr->operands[i].tempId() == id) { + info = DefInfo(ctx, instr, var.rc, i); + if (instr->operands[i].isKillBeforeDef()) { + info.bounds = def_reg; + res = 
get_reg_simple(ctx, reg_file, info); + is_dead_operand = true; } - reg.reg_b += instr->operands[i].bytes(); + break; } - if (!res.second) - res = {var.reg, !reg_file.test(var.reg, var.rc.bytes())}; - } else { - info.bounds = def_reg; - res = get_reg_simple(ctx, reg_file, info); } - } else { + } + if (!res && !def_reg.size) { + /* If this is before definitions are handled, def_reg may be an empty interval. */ + info.bounds = bounds; + res = get_reg_simple(ctx, reg_file, info); + } else if (!res) { /* Try to find space within the bounds but outside of the definition */ info.bounds = PhysRegInterval::from_until(bounds.lo(), MIN2(def_reg.lo(), bounds.hi())); res = get_reg_simple(ctx, reg_file, info); - if (!res.second && def_reg.hi() <= bounds.hi()) { + if (!res && def_reg.hi() <= bounds.hi()) { unsigned lo = (def_reg.hi() + info.stride - 1) & ~(info.stride - 1); info.bounds = PhysRegInterval::from_until(PhysReg{lo}, bounds.hi()); res = get_reg_simple(ctx, reg_file, info); } } - if (res.second) { + if (res) { /* mark the area as blocked */ - reg_file.block(res.first, var.rc); + reg_file.block(*res, var.rc); /* create parallelcopy pair (without definition id) */ Temp tmp = Temp(id, var.rc); Operand pc_op = Operand(tmp); pc_op.setFixed(var.reg); - Definition pc_def = Definition(res.first, pc_op.regClass()); + Definition pc_def = Definition(*res, pc_op.regClass()); parallelcopies.emplace_back(pc_op, pc_def); continue; } @@ -1075,9 +1207,8 @@ get_regs_for_copies(ra_ctx& ctx, RegisterFile& reg_file, n++; continue; } - /* we cannot split live ranges of linear vgprs inside control flow */ - if (!(ctx.block->kind & block_kind_top_level) && - ctx.assignments[reg_file[j]].rc.is_linear_vgpr()) { + /* we cannot split live ranges of linear vgprs */ + if (ctx.assignments[reg_file[j]].rc.is_linear_vgpr()) { found = false; break; } @@ -1116,13 +1247,13 @@ get_regs_for_copies(ra_ctx& ctx, RegisterFile& reg_file, PhysRegInterval reg_win{best_pos, size}; /* collect variables and block 
reg file */ - std::set<std::pair<unsigned, unsigned>> new_vars = collect_vars(ctx, reg_file, reg_win); + std::vector<unsigned> new_vars = collect_vars(ctx, reg_file, reg_win); /* mark the area as blocked */ reg_file.block(reg_win.lo(), var.rc); adjust_max_used_regs(ctx, var.rc, reg_win.lo()); - if (!get_regs_for_copies(ctx, reg_file, parallelcopies, new_vars, bounds, instr, def_reg)) + if (!get_regs_for_copies(ctx, reg_file, parallelcopies, new_vars, instr, def_reg)) return false; /* create parallelcopy pair (without definition id) */ @@ -1136,8 +1267,8 @@ get_regs_for_copies(ra_ctx& ctx, RegisterFile& reg_file, return true; } -std::pair<PhysReg, bool> -get_reg_impl(ra_ctx& ctx, RegisterFile& reg_file, +std::optional<PhysReg> +get_reg_impl(ra_ctx& ctx, const RegisterFile& reg_file, std::vector<std::pair<Operand, Definition>>& parallelcopies, const DefInfo& info, aco_ptr<Instruction>& instr) { @@ -1166,7 +1297,8 @@ get_reg_impl(ra_ctx& ctx, RegisterFile& reg_file, } } - assert(regs_free >= size); + assert((regs_free + ctx.num_linear_vgprs) >= size); + /* we might have to move dead operands to dst in order to make space */ unsigned op_moves = 0; @@ -1223,10 +1355,8 @@ get_reg_impl(ra_ctx& ctx, RegisterFile& reg_file, break; } - /* we cannot split live ranges of linear vgprs inside control flow */ - //TODO: ensure that live range splits inside control flow are never necessary - if (!(ctx.block->kind & block_kind_top_level) && - ctx.assignments[reg_file[j]].rc.is_linear_vgpr()) { + /* we cannot split live ranges of linear vgprs */ + if (ctx.assignments[reg_file[j]].rc.is_linear_vgpr()) { found = false; break; } @@ -1251,31 +1381,23 @@ get_reg_impl(ra_ctx& ctx, RegisterFile& reg_file, } if (num_moves == 0xFF) - return {{}, false}; + return {}; /* now, we figured the placement for our definition */ RegisterFile tmp_file(reg_file); - std::set<std::pair<unsigned, unsigned>> vars = collect_vars(ctx, tmp_file, best_win); + /* p_create_vector: also re-place killed operands in 
the definition space */ if (instr->opcode == aco_opcode::p_create_vector) { - /* move killed operands which aren't yet at the correct position (GFX9+) - * or which are in the definition space */ - PhysReg reg = best_win.lo(); for (Operand& op : instr->operands) { - if (op.isTemp() && op.isFirstKillBeforeDef() && op.getTemp().type() == rc.type()) { - if (op.physReg() != reg && (ctx.program->chip_class >= GFX9 || - (op.physReg().advance(op.bytes()) > best_win.lo() && - op.physReg() < best_win.hi()))) { - vars.emplace(op.bytes(), op.tempId()); - tmp_file.clear(op); - } else { - tmp_file.fill(op); - } - } - reg.reg_b += op.bytes(); + if (op.isTemp() && op.isFirstKillBeforeDef()) + tmp_file.fill(op); } - } else if (!is_phi(instr)) { - /* re-enable killed operands */ + } + + std::vector<unsigned> vars = collect_vars(ctx, tmp_file, best_win); + + /* re-enable killed operands */ + if (!is_phi(instr) && instr->opcode != aco_opcode::p_create_vector) { for (Operand& op : instr->operands) { if (op.isTemp() && op.isFirstKillBeforeDef()) tmp_file.fill(op); @@ -1283,18 +1405,18 @@ get_reg_impl(ra_ctx& ctx, RegisterFile& reg_file, } std::vector<std::pair<Operand, Definition>> pc; - if (!get_regs_for_copies(ctx, tmp_file, pc, vars, bounds, instr, best_win)) - return {{}, false}; + if (!get_regs_for_copies(ctx, tmp_file, pc, vars, instr, best_win)) + return {}; parallelcopies.insert(parallelcopies.end(), pc.begin(), pc.end()); adjust_max_used_regs(ctx, rc, best_win.lo()); - return {best_win.lo(), true}; + return best_win.lo(); } bool -get_reg_specified(ra_ctx& ctx, RegisterFile& reg_file, RegClass rc, aco_ptr<Instruction>& instr, - PhysReg reg) +get_reg_specified(ra_ctx& ctx, const RegisterFile& reg_file, RegClass rc, + aco_ptr<Instruction>& instr, PhysReg reg) { /* catch out-of-range registers */ if (reg >= PhysReg{512}) @@ -1313,10 +1435,10 @@ get_reg_specified(ra_ctx& ctx, RegisterFile& reg_file, RegClass rc, aco_ptr<Inst return false; PhysRegInterval reg_win = {reg, rc.size()}; 
- PhysRegInterval bounds = get_reg_bounds(ctx.program, rc.type()); + PhysRegInterval bounds = get_reg_bounds(ctx, rc); PhysRegInterval vcc_win = {vcc, 2}; /* VCC is outside the bounds */ - bool is_vcc = rc.type() == RegType::sgpr && vcc_win.contains(reg_win); + bool is_vcc = rc.type() == RegType::sgpr && vcc_win.contains(reg_win) && ctx.program->needs_vcc; bool is_m0 = rc == s1 && reg == m0; if (!bounds.contains(reg_win) && !is_vcc && !is_m0) return false; @@ -1336,17 +1458,24 @@ get_reg_specified(ra_ctx& ctx, RegisterFile& reg_file, RegClass rc, aco_ptr<Inst } bool -increase_register_file(ra_ctx& ctx, RegType type) +increase_register_file(ra_ctx& ctx, RegClass rc) { - if (type == RegType::vgpr && ctx.program->max_reg_demand.vgpr < ctx.vgpr_limit) { - update_vgpr_sgpr_demand(ctx.program, RegisterDemand(ctx.program->max_reg_demand.vgpr + 1, - ctx.program->max_reg_demand.sgpr)); - } else if (type == RegType::sgpr && ctx.program->max_reg_demand.sgpr < ctx.sgpr_limit) { - update_vgpr_sgpr_demand(ctx.program, RegisterDemand(ctx.program->max_reg_demand.vgpr, - ctx.program->max_reg_demand.sgpr + 1)); + if (rc.type() == RegType::vgpr && ctx.num_linear_vgprs == 0 && + ctx.vgpr_bounds < ctx.vgpr_limit) { + /* If vgpr_bounds is less than max_reg_demand.vgpr, this should be a no-op. 
*/ + update_vgpr_sgpr_demand( + ctx.program, RegisterDemand(ctx.vgpr_bounds + 1, ctx.program->max_reg_demand.sgpr)); + + ctx.vgpr_bounds = ctx.program->max_reg_demand.vgpr; + } else if (rc.type() == RegType::sgpr && ctx.program->max_reg_demand.sgpr < ctx.sgpr_limit) { + update_vgpr_sgpr_demand( + ctx.program, RegisterDemand(ctx.program->max_reg_demand.vgpr, ctx.sgpr_bounds + 1)); + + ctx.sgpr_bounds = ctx.program->max_reg_demand.sgpr; } else { return false; } + return true; } @@ -1429,7 +1558,7 @@ compact_relocate_vars(ra_ctx& ctx, const std::vector<IDAndRegClass>& vars, } bool -is_mimg_vaddr_intact(ra_ctx& ctx, RegisterFile& reg_file, Instruction* instr) +is_mimg_vaddr_intact(ra_ctx& ctx, const RegisterFile& reg_file, Instruction* instr) { PhysReg first{512}; for (unsigned i = 0; i < instr->operands.size() - 3u; i++) { @@ -1439,7 +1568,7 @@ is_mimg_vaddr_intact(ra_ctx& ctx, RegisterFile& reg_file, Instruction* instr) PhysReg reg = ctx.assignments[op.tempId()].reg; if (first.reg() == 512) { - PhysRegInterval bounds = get_reg_bounds(ctx.program, RegType::vgpr); + PhysRegInterval bounds = get_reg_bounds(ctx, RegType::vgpr, false); first = reg.advance(i * -4); PhysRegInterval vec = PhysRegInterval{first, instr->operands.size() - 3u}; if (!bounds.contains(vec)) /* not enough space for other operands */ @@ -1460,8 +1589,8 @@ is_mimg_vaddr_intact(ra_ctx& ctx, RegisterFile& reg_file, Instruction* instr) return true; } -std::pair<PhysReg, bool> -get_reg_vector(ra_ctx& ctx, RegisterFile& reg_file, Temp temp, aco_ptr<Instruction>& instr) +std::optional<PhysReg> +get_reg_vector(ra_ctx& ctx, const RegisterFile& reg_file, Temp temp, aco_ptr<Instruction>& instr) { Instruction* vec = ctx.vectors[temp.id()]; unsigned first_operand = vec->format == Format::MIMG ? 
3 : 0; @@ -1487,11 +1616,11 @@ get_reg_vector(ra_ctx& ctx, RegisterFile& reg_file, Temp temp, aco_ptr<Instructi PhysReg reg = ctx.assignments[op.tempId()].reg; reg.reg_b += (our_offset - their_offset); if (get_reg_specified(ctx, reg_file, temp.regClass(), instr, reg)) - return {reg, true}; + return reg; /* return if MIMG vaddr components don't remain vector-aligned */ if (vec->format == Format::MIMG) - return {{}, false}; + return {}; } their_offset += op.bytes(); } @@ -1501,20 +1630,137 @@ get_reg_vector(ra_ctx& ctx, RegisterFile& reg_file, Temp temp, aco_ptr<Instructi */ RegClass vec_rc = RegClass::get(temp.type(), their_offset); DefInfo info(ctx, ctx.pseudo_dummy, vec_rc, -1); - std::pair<PhysReg, bool> res = get_reg_simple(ctx, reg_file, info); - PhysReg reg = res.first; - if (res.second) { - reg.reg_b += our_offset; + std::optional<PhysReg> reg = get_reg_simple(ctx, reg_file, info); + if (reg) { + reg->reg_b += our_offset; /* make sure to only use byte offset if the instruction supports it */ - if (get_reg_specified(ctx, reg_file, temp.regClass(), instr, reg)) - return {reg, true}; + if (get_reg_specified(ctx, reg_file, temp.regClass(), instr, *reg)) + return reg; } } - return {{}, false}; + return {}; } +bool +compact_linear_vgprs(ra_ctx& ctx, const RegisterFile& reg_file, + std::vector<std::pair<Operand, Definition>>& parallelcopies) +{ + PhysRegInterval linear_vgpr_bounds = get_reg_bounds(ctx, RegType::vgpr, true); + int zeros = reg_file.count_zero(linear_vgpr_bounds); + if (zeros == 0) + return false; + + std::vector<IDAndRegClass> vars; + for (unsigned id : find_vars(ctx, reg_file, linear_vgpr_bounds)) + vars.emplace_back(id, ctx.assignments[id].rc); + + ctx.num_linear_vgprs -= zeros; + compact_relocate_vars(ctx, vars, parallelcopies, get_reg_bounds(ctx, RegType::vgpr, true).lo()); + + return true; +} + +/* Allocates a linear VGPR. We allocate them at the end of the register file and keep them separate + * from normal VGPRs. 
This is for two reasons: + * - Because we only ever move linear VGPRs into an empty space or a space previously occupied by a + * linear one, we never have to swap a normal VGPR and a linear one. + * - As linear VGPR's live ranges only start and end on top-level blocks, we never have to move a + * linear VGPR in control flow. + */ PhysReg -get_reg(ra_ctx& ctx, RegisterFile& reg_file, Temp temp, +alloc_linear_vgpr(ra_ctx& ctx, const RegisterFile& reg_file, aco_ptr<Instruction>& instr, + std::vector<std::pair<Operand, Definition>>& parallelcopies) +{ + assert(instr->opcode == aco_opcode::p_start_linear_vgpr); + assert(instr->definitions.size() == 1 && instr->definitions[0].bytes() % 4 == 0); + + RegClass rc = instr->definitions[0].regClass(); + + /* Try to choose an unused space in the linear VGPR bounds. */ + for (unsigned i = rc.size(); i <= ctx.num_linear_vgprs; i++) { + PhysReg reg(256 + ctx.vgpr_bounds - i); + if (!reg_file.test(reg, rc.bytes())) { + adjust_max_used_regs(ctx, rc, reg); + return reg; + } + } + + PhysRegInterval old_normal_bounds = get_reg_bounds(ctx, RegType::vgpr, false); + + /* Compact linear VGPRs, grow the bounds if necessary, and choose a space at the beginning: */ + compact_linear_vgprs(ctx, reg_file, parallelcopies); + + PhysReg reg(256 + ctx.vgpr_bounds - (ctx.num_linear_vgprs + rc.size())); + /* Space that was for normal VGPRs, but is now for linear VGPRs. */ + PhysRegInterval new_win = PhysRegInterval::from_until(reg, MAX2(old_normal_bounds.hi(), reg)); + + RegisterFile tmp_file(reg_file); + PhysRegInterval reg_win{reg, rc.size()}; + std::vector<unsigned> blocking_vars = collect_vars(ctx, tmp_file, new_win); + + /* Re-enable killed operands */ + for (Operand& op : instr->operands) { + if (op.isTemp() && op.isFirstKillBeforeDef()) + tmp_file.fill(op); + } + + /* Find new assignments for blocking vars. 
*/ + std::vector<std::pair<Operand, Definition>> pc; + if (!ctx.policy.skip_optimistic_path && + get_regs_for_copies(ctx, tmp_file, pc, blocking_vars, instr, reg_win)) { + parallelcopies.insert(parallelcopies.end(), pc.begin(), pc.end()); + } else { + /* Fallback algorithm: reallocate all variables at once. */ + std::vector<IDAndRegClass> vars; + for (unsigned id : find_vars(ctx, reg_file, old_normal_bounds)) + vars.emplace_back(id, ctx.assignments[id].rc); + compact_relocate_vars(ctx, vars, parallelcopies, PhysReg(256)); + + std::vector<IDAndRegClass> killed_op_vars; + for (Operand& op : instr->operands) { + if (op.isTemp() && op.isFirstKillBeforeDef() && op.regClass().type() == RegType::vgpr) + killed_op_vars.emplace_back(op.tempId(), op.regClass()); + } + compact_relocate_vars(ctx, killed_op_vars, parallelcopies, reg_win.lo()); + } + + /* If this is updated earlier, a killed operand can't be placed inside the definition. */ + ctx.num_linear_vgprs += rc.size(); + + adjust_max_used_regs(ctx, rc, reg); + return reg; +} + +bool +should_compact_linear_vgprs(ra_ctx& ctx, live& live_vars, const RegisterFile& reg_file) +{ + if (!(ctx.block->kind & block_kind_top_level) || ctx.block->linear_succs.empty()) + return false; + + /* Since we won't be able to copy linear VGPRs to make space when in control flow, we have to + * ensure in advance that there is enough space for normal VGPRs. 
*/ + unsigned max_vgpr_usage = 0; + unsigned next_toplevel = ctx.block->index + 1; + for (; !(ctx.program->blocks[next_toplevel].kind & block_kind_top_level); next_toplevel++) { + max_vgpr_usage = + MAX2(max_vgpr_usage, (unsigned)ctx.program->blocks[next_toplevel].register_demand.vgpr); + } + + std::vector<aco_ptr<Instruction>>& instructions = + ctx.program->blocks[next_toplevel].instructions; + if (!instructions.empty() && is_phi(instructions[0])) { + max_vgpr_usage = + MAX2(max_vgpr_usage, (unsigned)live_vars.register_demand[next_toplevel][0].vgpr); + } + + for (unsigned tmp : find_vars(ctx, reg_file, get_reg_bounds(ctx, RegType::vgpr, true))) + max_vgpr_usage -= ctx.assignments[tmp].rc.size(); + + return max_vgpr_usage > get_reg_bounds(ctx, RegType::vgpr, false).size; +} + +PhysReg +get_reg(ra_ctx& ctx, const RegisterFile& reg_file, Temp temp, std::vector<std::pair<Operand, Definition>>& parallelcopies, aco_ptr<Instruction>& instr, int operand_index = -1) { @@ -1522,30 +1768,41 @@ get_reg(ra_ctx& ctx, RegisterFile& reg_file, Temp temp, if (split_vec != ctx.split_vectors.end()) { unsigned offset = 0; for (Definition def : split_vec->second->definitions) { - auto affinity_it = ctx.affinities.find(def.tempId()); - if (affinity_it != ctx.affinities.end() && ctx.assignments[affinity_it->second].assigned) { - PhysReg reg = ctx.assignments[affinity_it->second].reg; - reg.reg_b -= offset; - if (get_reg_specified(ctx, reg_file, temp.regClass(), instr, reg)) - return reg; + if (ctx.assignments[def.tempId()].affinity) { + assignment& affinity = ctx.assignments[ctx.assignments[def.tempId()].affinity]; + if (affinity.assigned) { + PhysReg reg = affinity.reg; + reg.reg_b -= offset; + if (get_reg_specified(ctx, reg_file, temp.regClass(), instr, reg)) + return reg; + } } offset += def.bytes(); } } - if (ctx.affinities.find(temp.id()) != ctx.affinities.end() && - ctx.assignments[ctx.affinities[temp.id()]].assigned) { - PhysReg reg = 
ctx.assignments[ctx.affinities[temp.id()]].reg; - if (get_reg_specified(ctx, reg_file, temp.regClass(), instr, reg)) - return reg; + if (ctx.assignments[temp.id()].affinity) { + assignment& affinity = ctx.assignments[ctx.assignments[temp.id()].affinity]; + if (affinity.assigned) { + if (get_reg_specified(ctx, reg_file, temp.regClass(), instr, affinity.reg)) + return affinity.reg; + } + } + if (ctx.assignments[temp.id()].vcc) { + if (get_reg_specified(ctx, reg_file, temp.regClass(), instr, vcc)) + return vcc; + } + if (ctx.assignments[temp.id()].m0) { + if (get_reg_specified(ctx, reg_file, temp.regClass(), instr, m0) && can_write_m0(instr)) + return m0; } - std::pair<PhysReg, bool> res; + std::optional<PhysReg> res; if (ctx.vectors.find(temp.id()) != ctx.vectors.end()) { res = get_reg_vector(ctx, reg_file, temp, instr); - if (res.second) - return res.first; + if (res) + return *res; } DefInfo info(ctx, instr, temp.regClass(), operand_index); @@ -1554,24 +1811,39 @@ get_reg(ra_ctx& ctx, RegisterFile& reg_file, Temp temp, /* try to find space without live-range splits */ res = get_reg_simple(ctx, reg_file, info); - if (res.second) - return res.first; + if (res) + return *res; } /* try to find space with live-range splits */ res = get_reg_impl(ctx, reg_file, parallelcopies, info, instr); - if (res.second) - return res.first; + if (res) + return *res; - /* try using more registers */ + /* try compacting the linear vgprs to make more space */ + std::vector<std::pair<Operand, Definition>> pc; + if (info.rc.type() == RegType::vgpr && (ctx.block->kind & block_kind_top_level) && + compact_linear_vgprs(ctx, reg_file, pc)) { + parallelcopies.insert(parallelcopies.end(), pc.begin(), pc.end()); + + /* We don't need to fill the copy definitions in because we don't care about the linear VGPR + * space here. 
*/ + RegisterFile tmp_file(reg_file); + for (std::pair<Operand, Definition>& copy : pc) + tmp_file.clear(copy.first); + + return get_reg(ctx, tmp_file, temp, parallelcopies, instr, operand_index); + } /* We should only fail here because keeping under the limit would require * too many moves. */ assert(reg_file.count_zero(info.bounds) >= info.size); - if (!increase_register_file(ctx, info.rc.type())) { - /* fallback algorithm: reallocate all variables at once */ + /* try using more registers */ + if (!increase_register_file(ctx, info.rc)) { + /* fallback algorithm: reallocate all variables at once (linear VGPRs should already be + * compact at the end) */ unsigned def_size = info.rc.size(); for (Definition def : instr->definitions) { if (ctx.assignments[def.tempId()].assigned && def.regClass().type() == info.rc.type()) @@ -1584,12 +1856,12 @@ get_reg(ra_ctx& ctx, RegisterFile& reg_file, Temp temp, killed_op_size += op.regClass().size(); } - const PhysRegInterval regs = get_reg_bounds(ctx.program, info.rc.type()); + const PhysRegInterval regs = get_reg_bounds(ctx, info.rc); /* reallocate passthrough variables and non-killed operands */ std::vector<IDAndRegClass> vars; - for (const std::pair<unsigned, unsigned>& var : find_vars(ctx, reg_file, regs)) - vars.emplace_back(var.second, ctx.assignments[var.second].rc); + for (unsigned id : find_vars(ctx, reg_file, regs)) + vars.emplace_back(id, ctx.assignments[id].rc); vars.emplace_back(0xffffffff, RegClass(info.rc.type(), MAX2(def_size, killed_op_size))); PhysReg space = compact_relocate_vars(ctx, vars, parallelcopies, regs.lo()); @@ -1616,7 +1888,7 @@ get_reg(ra_ctx& ctx, RegisterFile& reg_file, Temp temp, } PhysReg -get_reg_create_vector(ra_ctx& ctx, RegisterFile& reg_file, Temp temp, +get_reg_create_vector(ra_ctx& ctx, const RegisterFile& reg_file, Temp temp, std::vector<std::pair<Operand, Definition>>& parallelcopies, aco_ptr<Instruction>& instr) { @@ -1625,13 +1897,14 @@ get_reg_create_vector(ra_ctx& ctx, 
RegisterFile& reg_file, Temp temp, uint32_t size = rc.size(); uint32_t bytes = rc.bytes(); uint32_t stride = get_stride(rc); - PhysRegInterval bounds = get_reg_bounds(ctx.program, rc.type()); + PhysRegInterval bounds = get_reg_bounds(ctx, rc); // TODO: improve p_create_vector for sub-dword vectors PhysReg best_pos{0xFFF}; unsigned num_moves = 0xFF; bool best_avoid = true; + uint32_t correct_pos_mask = 0; /* test for each operand which definition placement causes the least shuffle instructions */ for (unsigned i = 0, offset = 0; i < instr->operands.size(); @@ -1667,6 +1940,7 @@ get_reg_create_vector(ra_ctx& ctx, RegisterFile& reg_file, Temp temp, /* count variables to be moved and check "avoid" */ bool avoid = false; + bool linear_vgpr = false; for (PhysReg j : reg_win) { if (reg_file[j] != 0) { if (reg_file[j] == 0xF0000000) { @@ -1677,28 +1951,28 @@ get_reg_create_vector(ra_ctx& ctx, RegisterFile& reg_file, Temp temp, k += reg_file.test(reg, 1); } else { k += 4; - /* we cannot split live ranges of linear vgprs inside control flow */ - if (ctx.assignments[reg_file[j]].rc.is_linear_vgpr()) { - if (ctx.block->kind & block_kind_top_level) - avoid = true; - else - break; - } + linear_vgpr |= ctx.assignments[reg_file[j]].rc.is_linear_vgpr(); } } avoid |= ctx.war_hint[j]; } + + /* we cannot split live ranges of linear vgprs */ + if (linear_vgpr) + continue; + if (avoid && !best_avoid) continue; /* count operands in wrong positions */ + uint32_t correct_pos_mask_new = 0; for (unsigned j = 0, offset2 = 0; j < instr->operands.size(); offset2 += instr->operands[j].bytes(), j++) { - if (j == i || !instr->operands[j].isTemp() || - instr->operands[j].getTemp().type() != rc.type()) - continue; - if (instr->operands[j].physReg().reg_b != reg_win.lo() * 4 + offset2) - k += instr->operands[j].bytes(); + Operand& op = instr->operands[j]; + if (op.isTemp() && op.physReg().reg_b == reg_win.lo() * 4 + offset2) + correct_pos_mask_new |= 1 << j; + else + k += op.bytes(); } bool aligned = 
rc == RegClass::v4 && reg_win.lo() % 4 == 0; if (k > num_moves || (!aligned && k == num_moves)) @@ -1707,49 +1981,39 @@ get_reg_create_vector(ra_ctx& ctx, RegisterFile& reg_file, Temp temp, best_pos = reg_win.lo(); num_moves = k; best_avoid = avoid; + correct_pos_mask = correct_pos_mask_new; } - if (num_moves >= bytes) + /* too many moves: try the generic get_reg() function */ + if (num_moves >= 2 * bytes) { return get_reg(ctx, reg_file, temp, parallelcopies, instr); + } else if (num_moves > bytes) { + DefInfo info(ctx, instr, rc, -1); + std::optional<PhysReg> res = get_reg_simple(ctx, reg_file, info); + if (res) + return *res; + } /* re-enable killed operands which are in the wrong position */ RegisterFile tmp_file(reg_file); - for (unsigned i = 0, offset = 0; i < instr->operands.size(); - offset += instr->operands[i].bytes(), i++) { - if (instr->operands[i].isTemp() && instr->operands[i].isFirstKillBeforeDef() && - instr->operands[i].physReg().reg_b != best_pos.reg_b + offset) - tmp_file.fill(instr->operands[i]); + for (Operand& op : instr->operands) { + if (op.isTemp() && op.isFirstKillBeforeDef()) + tmp_file.fill(op); + } + for (unsigned i = 0; i < instr->operands.size(); i++) { + if ((correct_pos_mask >> i) & 1u && instr->operands[i].isKill()) + tmp_file.clear(instr->operands[i]); } /* collect variables to be moved */ - std::set<std::pair<unsigned, unsigned>> vars = - collect_vars(ctx, tmp_file, PhysRegInterval{best_pos, size}); + std::vector<unsigned> vars = collect_vars(ctx, tmp_file, PhysRegInterval{best_pos, size}); - for (unsigned i = 0, offset = 0; i < instr->operands.size(); - offset += instr->operands[i].bytes(), i++) { - if (!instr->operands[i].isTemp() || !instr->operands[i].isFirstKillBeforeDef() || - instr->operands[i].getTemp().type() != rc.type()) - continue; - bool correct_pos = instr->operands[i].physReg().reg_b == best_pos.reg_b + offset; - /* GFX9+: move killed operands which aren't yet at the correct position - * Moving all killed operands 
generally leads to more register swaps. - * This is only done on GFX9+ because of the cheap v_swap instruction. - */ - if (ctx.program->chip_class >= GFX9 && !correct_pos) { - vars.emplace(instr->operands[i].bytes(), instr->operands[i].tempId()); - tmp_file.clear(instr->operands[i]); - /* fill operands which are in the correct position to avoid overwriting */ - } else if (correct_pos) { - tmp_file.fill(instr->operands[i]); - } - } bool success = false; std::vector<std::pair<Operand, Definition>> pc; - success = - get_regs_for_copies(ctx, tmp_file, pc, vars, bounds, instr, PhysRegInterval{best_pos, size}); + success = get_regs_for_copies(ctx, tmp_file, pc, vars, instr, PhysRegInterval{best_pos, size}); if (!success) { - if (!increase_register_file(ctx, temp.type())) { + if (!increase_register_file(ctx, temp.regClass())) { /* use the fallback algorithm in get_reg() */ return get_reg(ctx, reg_file, temp, parallelcopies, instr); } @@ -1774,7 +2038,7 @@ handle_pseudo(ra_ctx& ctx, const RegisterFile& reg_file, Instruction* instr) case aco_opcode::p_create_vector: case aco_opcode::p_split_vector: case aco_opcode::p_parallelcopy: - case aco_opcode::p_wqm: break; + case aco_opcode::p_start_linear_vgpr: break; default: return; } @@ -1794,10 +2058,11 @@ handle_pseudo(ra_ctx& ctx, const RegisterFile& reg_file, Instruction* instr) reads_subdword = true; } bool needs_scratch_reg = (writes_linear && reads_linear && reg_file[scc]) || - (ctx.program->chip_class <= GFX7 && reads_subdword); + (ctx.program->gfx_level <= GFX7 && reads_subdword); if (!needs_scratch_reg) return; + instr->pseudo().needs_scratch_reg = true; instr->pseudo().tmp_in_scc = reg_file[scc]; int reg = ctx.max_used_sgpr; @@ -1818,27 +2083,11 @@ handle_pseudo(ra_ctx& ctx, const RegisterFile& reg_file, Instruction* instr) } bool -operand_can_use_reg(chip_class chip, aco_ptr<Instruction>& instr, unsigned idx, PhysReg reg, +operand_can_use_reg(amd_gfx_level gfx_level, aco_ptr<Instruction>& instr, unsigned idx, PhysReg 
reg, RegClass rc) { - if (instr->operands[idx].isFixed()) - return instr->operands[idx].physReg() == reg; - - bool is_writelane = instr->opcode == aco_opcode::v_writelane_b32 || - instr->opcode == aco_opcode::v_writelane_b32_e64; - if (chip <= GFX9 && is_writelane && idx <= 1) { - /* v_writelane_b32 can take two sgprs but only if one is m0. */ - bool is_other_sgpr = - instr->operands[!idx].isTemp() && - (!instr->operands[!idx].isFixed() || instr->operands[!idx].physReg() != m0); - if (is_other_sgpr && instr->operands[!idx].tempId() != instr->operands[idx].tempId()) { - instr->operands[idx].setFixed(m0); - return reg == m0; - } - } - if (reg.byte()) { - unsigned stride = get_subdword_operand_stride(chip, instr, idx, rc); + unsigned stride = get_subdword_operand_stride(gfx_level, instr, idx, rc); if (reg.byte() % stride) return false; } @@ -1848,7 +2097,7 @@ operand_can_use_reg(chip_class chip, aco_ptr<Instruction>& instr, unsigned idx, return reg != scc && reg != exec && (reg != m0 || idx == 1 || idx == 3) && /* offset can be m0 */ (reg != vcc || (instr->definitions.empty() && idx == 2) || - chip >= GFX10); /* sdata can be vcc */ + gfx_level >= GFX10); /* sdata can be vcc */ default: // TODO: there are more instructions with restrictions on registers return true; @@ -1856,41 +2105,82 @@ operand_can_use_reg(chip_class chip, aco_ptr<Instruction>& instr, unsigned idx, } void -get_reg_for_operand(ra_ctx& ctx, RegisterFile& register_file, - std::vector<std::pair<Operand, Definition>>& parallelcopy, - aco_ptr<Instruction>& instr, Operand& operand, unsigned operand_index) +handle_fixed_operands(ra_ctx& ctx, RegisterFile& register_file, + std::vector<std::pair<Operand, Definition>>& parallelcopy, + aco_ptr<Instruction>& instr) { - /* check if the operand is fixed */ - PhysReg src = ctx.assignments[operand.tempId()].reg; - PhysReg dst; - if (operand.isFixed()) { - assert(operand.physReg() != src); + assert(instr->operands.size() <= 128); - /* check if target reg is blocked, 
and move away the blocking var */ - if (register_file.test(operand.physReg(), operand.bytes())) { - PhysRegInterval target{operand.physReg(), operand.size()}; + RegisterFile tmp_file(register_file); - RegisterFile tmp_file(register_file); + BITSET_DECLARE(mask, 128) = {0}; - std::set<std::pair<unsigned, unsigned>> blocking_vars = - collect_vars(ctx, tmp_file, target); + for (unsigned i = 0; i < instr->operands.size(); i++) { + Operand& op = instr->operands[i]; + + if (!op.isTemp() || !op.isFixed()) + continue; - tmp_file.clear(src, operand.regClass()); // TODO: try to avoid moving block vars to src - tmp_file.block(operand.physReg(), operand.regClass()); + PhysReg src = ctx.assignments[op.tempId()].reg; + adjust_max_used_regs(ctx, op.regClass(), op.physReg()); - DefInfo info(ctx, instr, operand.regClass(), -1); - get_regs_for_copies(ctx, tmp_file, parallelcopy, blocking_vars, info.bounds, instr, - PhysRegInterval()); + if (op.physReg() == src) { + tmp_file.block(op.physReg(), op.regClass()); + continue; } - dst = operand.physReg(); - } else { - dst = get_reg(ctx, register_file, operand.getTemp(), parallelcopy, instr, operand_index); - update_renames( - ctx, register_file, parallelcopy, instr, - instr->opcode != aco_opcode::p_create_vector ? 
rename_not_killed_ops : (UpdateRenames)0); + unsigned j; + bool found = false; + BITSET_FOREACH_SET (j, mask, i) { + if (instr->operands[j].tempId() == op.tempId() && + instr->operands[j].physReg() == op.physReg()) { + found = true; + break; + } + } + if (found) + continue; /* the copy is already added to the list */ + + /* clear from register_file so fixed operands are not collected be collect_vars() */ + tmp_file.clear(src, op.regClass()); // TODO: try to avoid moving block vars to src + + BITSET_SET(mask, i); + + Operand pc_op(instr->operands[i].getTemp(), src); + Definition pc_def = Definition(op.physReg(), pc_op.regClass()); + parallelcopy.emplace_back(pc_op, pc_def); } + if (BITSET_IS_EMPTY(mask)) + return; + + unsigned i; + std::vector<unsigned> blocking_vars; + BITSET_FOREACH_SET (i, mask, instr->operands.size()) { + Operand& op = instr->operands[i]; + PhysRegInterval target{op.physReg(), op.size()}; + std::vector<unsigned> blocking_vars2 = collect_vars(ctx, tmp_file, target); + blocking_vars.insert(blocking_vars.end(), blocking_vars2.begin(), blocking_vars2.end()); + + /* prevent get_regs_for_copies() from using these registers */ + tmp_file.block(op.physReg(), op.regClass()); + } + + get_regs_for_copies(ctx, tmp_file, parallelcopy, blocking_vars, instr, PhysRegInterval()); + update_renames(ctx, register_file, parallelcopy, instr, + rename_not_killed_ops | fill_killed_ops | rename_precolored_ops); +} + +void +get_reg_for_operand(ra_ctx& ctx, RegisterFile& register_file, + std::vector<std::pair<Operand, Definition>>& parallelcopy, + aco_ptr<Instruction>& instr, Operand& operand, unsigned operand_index) +{ + /* clear the operand in case it's only a stride mismatch */ + PhysReg src = ctx.assignments[operand.tempId()].reg; + register_file.clear(src, operand.regClass()); + PhysReg dst = get_reg(ctx, register_file, operand.getTemp(), parallelcopy, instr, operand_index); + Operand pc_op = operand; pc_op.setFixed(src); Definition pc_def = Definition(dst, 
pc_op.regClass()); @@ -1898,6 +2188,151 @@ get_reg_for_operand(ra_ctx& ctx, RegisterFile& register_file, update_renames(ctx, register_file, parallelcopy, instr, rename_not_killed_ops | fill_killed_ops); } +PhysReg +get_reg_phi(ra_ctx& ctx, IDSet& live_in, RegisterFile& register_file, + std::vector<aco_ptr<Instruction>>& instructions, Block& block, + aco_ptr<Instruction>& phi, Temp tmp) +{ + std::vector<std::pair<Operand, Definition>> parallelcopy; + PhysReg reg = get_reg(ctx, register_file, tmp, parallelcopy, phi); + update_renames(ctx, register_file, parallelcopy, phi, rename_not_killed_ops); + + /* process parallelcopy */ + for (std::pair<Operand, Definition> pc : parallelcopy) { + /* see if it's a copy from a different phi */ + // TODO: prefer moving some previous phis over live-ins + // TODO: somehow prevent phis fixed before the RA from being updated (shouldn't be a + // problem in practice since they can only be fixed to exec) + Instruction* prev_phi = NULL; + std::vector<aco_ptr<Instruction>>::iterator phi_it; + for (phi_it = instructions.begin(); phi_it != instructions.end(); ++phi_it) { + if ((*phi_it)->definitions[0].tempId() == pc.first.tempId()) + prev_phi = phi_it->get(); + } + if (prev_phi) { + /* if so, just update that phi's register */ + prev_phi->definitions[0].setFixed(pc.second.physReg()); + register_file.fill(prev_phi->definitions[0]); + ctx.assignments[prev_phi->definitions[0].tempId()] = {pc.second.physReg(), + pc.second.regClass()}; + continue; + } + + /* rename */ + std::unordered_map<unsigned, Temp>::iterator orig_it = ctx.orig_names.find(pc.first.tempId()); + Temp orig = orig_it != ctx.orig_names.end() ? orig_it->second : pc.first.getTemp(); + ctx.orig_names[pc.second.tempId()] = orig; + ctx.renames[block.index][orig.id()] = pc.second.getTemp(); + + /* otherwise, this is a live-in and we need to create a new phi + * to move it in this block's predecessors */ + aco_opcode opcode = + pc.first.getTemp().is_linear() ? 
aco_opcode::p_linear_phi : aco_opcode::p_phi; + Block::edge_vec& preds = + pc.first.getTemp().is_linear() ? block.linear_preds : block.logical_preds; + aco_ptr<Instruction> new_phi{create_instruction(opcode, Format::PSEUDO, preds.size(), 1)}; + new_phi->definitions[0] = pc.second; + for (unsigned i = 0; i < preds.size(); i++) + new_phi->operands[i] = Operand(pc.first); + instructions.emplace_back(std::move(new_phi)); + + /* Remove from live_in, because handle_loop_phis() would re-create this phi later if this is + * a loop header. + */ + live_in.erase(orig.id()); + } + + return reg; +} + +void +get_regs_for_phis(ra_ctx& ctx, Block& block, RegisterFile& register_file, + std::vector<aco_ptr<Instruction>>& instructions, IDSet& live_in) +{ + /* move all phis to instructions */ + for (aco_ptr<Instruction>& phi : block.instructions) { + if (!is_phi(phi)) + break; + if (!phi->definitions[0].isKill()) + instructions.emplace_back(std::move(phi)); + } + + /* assign phis with all-matching registers to that register */ + for (aco_ptr<Instruction>& phi : instructions) { + Definition& definition = phi->definitions[0]; + if (definition.isFixed()) + continue; + + if (!phi->operands[0].isTemp()) + continue; + + PhysReg reg = phi->operands[0].physReg(); + auto OpsSame = [=](const Operand& op) -> bool + { return op.isTemp() && (!op.isFixed() || op.physReg() == reg); }; + bool all_same = std::all_of(phi->operands.cbegin() + 1, phi->operands.cend(), OpsSame); + if (!all_same) + continue; + + if (!get_reg_specified(ctx, register_file, definition.regClass(), phi, reg)) + continue; + + definition.setFixed(reg); + register_file.fill(definition); + ctx.assignments[definition.tempId()].set(definition); + } + + /* try to find a register that is used by at least one operand */ + for (aco_ptr<Instruction>& phi : instructions) { + Definition& definition = phi->definitions[0]; + if (definition.isFixed()) + continue; + + /* use affinity if available */ + if 
(ctx.assignments[definition.tempId()].affinity && + ctx.assignments[ctx.assignments[definition.tempId()].affinity].assigned) { + assignment& affinity = ctx.assignments[ctx.assignments[definition.tempId()].affinity]; + assert(affinity.rc == definition.regClass()); + if (get_reg_specified(ctx, register_file, definition.regClass(), phi, affinity.reg)) { + definition.setFixed(affinity.reg); + register_file.fill(definition); + ctx.assignments[definition.tempId()].set(definition); + continue; + } + } + + /* by going backwards, we aim to avoid copies in else-blocks */ + for (int i = phi->operands.size() - 1; i >= 0; i--) { + const Operand& op = phi->operands[i]; + if (!op.isTemp() || !op.isFixed()) + continue; + + PhysReg reg = op.physReg(); + if (get_reg_specified(ctx, register_file, definition.regClass(), phi, reg)) { + definition.setFixed(reg); + register_file.fill(definition); + ctx.assignments[definition.tempId()].set(definition); + break; + } + } + } + + /* find registers for phis where the register was blocked or no operand was assigned */ + + /* Don't use iterators because get_reg_phi() can add phis to the end of the vector. */ + for (unsigned i = 0; i < instructions.size(); i++) { + aco_ptr<Instruction>& phi = instructions[i]; + Definition& definition = phi->definitions[0]; + if (definition.isFixed()) + continue; + + definition.setFixed( + get_reg_phi(ctx, live_in, register_file, instructions, block, phi, definition.getTemp())); + + register_file.fill(definition); + ctx.assignments[definition.tempId()].set(definition); + } +} + Temp read_variable(ra_ctx& ctx, Temp val, unsigned block_idx) { @@ -1911,7 +2346,7 @@ read_variable(ra_ctx& ctx, Temp val, unsigned block_idx) Temp handle_live_in(ra_ctx& ctx, Temp val, Block* block) { - std::vector<unsigned>& preds = val.is_linear() ? block->linear_preds : block->logical_preds; + Block::edge_vec& preds = val.is_linear() ? 
block->linear_preds : block->logical_preds; if (preds.size() == 0) return val; @@ -1939,20 +2374,18 @@ handle_live_in(ra_ctx& ctx, Temp val, Block* block) /* the variable has been renamed differently in the predecessors: we need to insert a phi */ aco_opcode opcode = val.is_linear() ? aco_opcode::p_linear_phi : aco_opcode::p_phi; - aco_ptr<Instruction> phi{ - create_instruction<Pseudo_instruction>(opcode, Format::PSEUDO, preds.size(), 1)}; + aco_ptr<Instruction> phi{create_instruction(opcode, Format::PSEUDO, preds.size(), 1)}; new_val = ctx.program->allocateTmp(val.regClass()); phi->definitions[0] = Definition(new_val); + ctx.assignments.emplace_back(); + assert(ctx.assignments.size() == ctx.program->peekAllocationId()); for (unsigned i = 0; i < preds.size(); i++) { /* update the operands so that it uses the new affinity */ phi->operands[i] = Operand(ops[i]); assert(ctx.assignments[ops[i].id()].assigned); + assert(ops[i].regClass() == new_val.regClass()); phi->operands[i].setFixed(ctx.assignments[ops[i].id()].reg); - if (ops[i].regClass() == new_val.regClass()) - ctx.affinities[new_val.id()] = ops[i].id(); } - ctx.assignments.emplace_back(); - assert(ctx.assignments.size() == ctx.program->peekAllocationId()); block->instructions.insert(block->instructions.begin(), std::move(phi)); } @@ -2002,7 +2435,7 @@ handle_loop_phis(ra_ctx& ctx, const IDSet& live_in, uint32_t loop_header_idx, aco_ptr<Instruction>& phi = loop_header.instructions[i]; if (!is_phi(phi)) break; - const std::vector<unsigned>& preds = + const Block::edge_vec& preds = phi->opcode == aco_opcode::p_phi ? 
loop_header.logical_preds : loop_header.linear_preds; for (unsigned j = 1; j < phi->operands.size(); j++) { Operand& op = phi->operands[j]; @@ -2093,7 +2526,7 @@ init_reg_file(ra_ctx& ctx, const std::vector<IDSet>& live_out_per_block, Block& for (aco_ptr<Instruction>& instr : block.instructions) { if (!is_phi(instr)) break; - const std::vector<unsigned>& preds = + const Block::edge_vec& preds = instr->opcode == aco_opcode::p_phi ? block.logical_preds : block.linear_preds; for (unsigned i = 0; i < instr->operands.size(); i++) { @@ -2125,8 +2558,8 @@ init_reg_file(ra_ctx& ctx, const std::vector<IDSet>& live_out_per_block, Block& void get_affinities(ra_ctx& ctx, std::vector<IDSet>& live_out_per_block) { - std::vector<std::vector<Temp>> phi_ressources; - std::unordered_map<unsigned, unsigned> temp_to_phi_ressources; + std::vector<std::vector<Temp>> phi_resources; + std::unordered_map<unsigned, unsigned> temp_to_phi_resources; for (auto block_rit = ctx.program->blocks.rbegin(); block_rit != ctx.program->blocks.rend(); block_rit++) { @@ -2138,46 +2571,48 @@ get_affinities(ra_ctx& ctx, std::vector<IDSet>& live_out_per_block) std::vector<aco_ptr<Instruction>>::reverse_iterator rit; for (rit = block.instructions.rbegin(); rit != block.instructions.rend(); ++rit) { aco_ptr<Instruction>& instr = *rit; - if (is_phi(instr)) { - if (instr->definitions[0].isKill() || instr->definitions[0].isFixed()) { - live.erase(instr->definitions[0].tempId()); - continue; - } - /* collect information about affinity-related temporaries */ - std::vector<Temp> affinity_related; - /* affinity_related[0] is the last seen affinity-related temp */ - affinity_related.emplace_back(instr->definitions[0].getTemp()); - affinity_related.emplace_back(instr->definitions[0].getTemp()); - for (const Operand& op : instr->operands) { - if (op.isTemp() && op.isKill() && - op.regClass() == instr->definitions[0].regClass()) { - affinity_related.emplace_back(op.getTemp()); - temp_to_phi_ressources[op.tempId()] = 
phi_ressources.size(); - } - } - phi_ressources.emplace_back(std::move(affinity_related)); - } else { - /* add vector affinities */ - if (instr->opcode == aco_opcode::p_create_vector) { - for (const Operand& op : instr->operands) { - if (op.isTemp() && op.isFirstKill() && - op.getTemp().type() == instr->definitions[0].getTemp().type()) - ctx.vectors[op.tempId()] = instr.get(); - } - } else if (instr->format == Format::MIMG && instr->operands.size() > 4) { - for (unsigned i = 3; i < instr->operands.size(); i++) - ctx.vectors[instr->operands[i].tempId()] = instr.get(); - } - - if (instr->opcode == aco_opcode::p_split_vector && - instr->operands[0].isFirstKillBeforeDef()) - ctx.split_vectors[instr->operands[0].tempId()] = instr.get(); + if (is_phi(instr)) + break; - /* add operands to live variables */ + /* add vector affinities */ + if (instr->opcode == aco_opcode::p_create_vector) { for (const Operand& op : instr->operands) { - if (op.isTemp()) - live.insert(op.tempId()); + if (op.isTemp() && op.isFirstKill() && + op.getTemp().type() == instr->definitions[0].getTemp().type()) + ctx.vectors[op.tempId()] = instr.get(); } + } else if (instr->format == Format::MIMG && instr->operands.size() > 4 && + !instr->mimg().strict_wqm) { + for (unsigned i = 3; i < instr->operands.size(); i++) + ctx.vectors[instr->operands[i].tempId()] = instr.get(); + } else if (instr->opcode == aco_opcode::p_split_vector && + instr->operands[0].isFirstKillBeforeDef()) { + ctx.split_vectors[instr->operands[0].tempId()] = instr.get(); + } else if (instr->isVOPC() && !instr->isVOP3()) { + if (!instr->isSDWA() || ctx.program->gfx_level == GFX8) + ctx.assignments[instr->definitions[0].tempId()].vcc = true; + } else if (instr->isVOP2() && !instr->isVOP3()) { + if (instr->operands.size() == 3 && instr->operands[2].isTemp() && + instr->operands[2].regClass().type() == RegType::sgpr) + ctx.assignments[instr->operands[2].tempId()].vcc = true; + if (instr->definitions.size() == 2) + 
ctx.assignments[instr->definitions[1].tempId()].vcc = true; + } else if (instr->opcode == aco_opcode::s_and_b32 || + instr->opcode == aco_opcode::s_and_b64) { + /* If SCC is used by a branch, we might be able to use + * s_cbranch_vccz/s_cbranch_vccnz if the operand is VCC. + */ + if (!instr->definitions[1].isKill() && instr->operands[0].isTemp() && + instr->operands[1].isFixed() && instr->operands[1].physReg() == exec) + ctx.assignments[instr->operands[0].tempId()].vcc = true; + } else if (instr->opcode == aco_opcode::s_sendmsg) { + ctx.assignments[instr->operands[0].tempId()].m0 = true; + } + + /* add operands to live variables */ + for (const Operand& op : instr->operands) { + if (op.isTemp()) + live.insert(op.tempId()); } /* erase definitions from live */ @@ -2188,10 +2623,10 @@ get_affinities(ra_ctx& ctx, std::vector<IDSet>& live_out_per_block) live.erase(def.tempId()); /* mark last-seen phi operand */ std::unordered_map<unsigned, unsigned>::iterator it = - temp_to_phi_ressources.find(def.tempId()); - if (it != temp_to_phi_ressources.end() && - def.regClass() == phi_ressources[it->second][0].regClass()) { - phi_ressources[it->second][0] = def.getTemp(); + temp_to_phi_resources.find(def.tempId()); + if (it != temp_to_phi_resources.end() && + def.regClass() == phi_resources[it->second][0].regClass()) { + phi_resources[it->second][0] = def.getTemp(); /* try to coalesce phi affinities with parallelcopies */ Operand op = Operand(); switch (instr->opcode) { @@ -2204,7 +2639,7 @@ get_affinities(ra_ctx& ctx, std::vector<IDSet>& live_out_per_block) case aco_opcode::v_fma_f32: case aco_opcode::v_fma_f16: case aco_opcode::v_pk_fma_f16: - if (ctx.program->chip_class < GFX10) + if (ctx.program->gfx_level < GFX10) continue; FALLTHROUGH; case aco_opcode::v_mad_f32: @@ -2214,193 +2649,371 @@ get_affinities(ra_ctx& ctx, std::vector<IDSet>& live_out_per_block) op = instr->operands[2]; break; + case aco_opcode::v_mad_legacy_f32: + case aco_opcode::v_fma_legacy_f32: + if 
(instr->usesModifiers() || !ctx.program->dev.has_mac_legacy32) + continue; + op = instr->operands[2]; + break; + default: continue; } if (op.isTemp() && op.isFirstKillBeforeDef() && def.regClass() == op.regClass()) { - phi_ressources[it->second].emplace_back(op.getTemp()); - temp_to_phi_ressources[op.tempId()] = it->second; + phi_resources[it->second].emplace_back(op.getTemp()); + temp_to_phi_resources[op.tempId()] = it->second; + } + } + } + } + + /* collect phi affinities */ + for (; rit != block.instructions.rend(); ++rit) { + aco_ptr<Instruction>& instr = *rit; + assert(is_phi(instr)); + + live.erase(instr->definitions[0].tempId()); + if (instr->definitions[0].isKill() || instr->definitions[0].isFixed()) + continue; + + assert(instr->definitions[0].isTemp()); + std::unordered_map<unsigned, unsigned>::iterator it = + temp_to_phi_resources.find(instr->definitions[0].tempId()); + unsigned index = phi_resources.size(); + std::vector<Temp>* affinity_related; + if (it != temp_to_phi_resources.end()) { + index = it->second; + phi_resources[index][0] = instr->definitions[0].getTemp(); + affinity_related = &phi_resources[index]; + } else { + phi_resources.emplace_back(std::vector<Temp>{instr->definitions[0].getTemp()}); + affinity_related = &phi_resources.back(); + } + + for (const Operand& op : instr->operands) { + if (op.isTemp() && op.isKill() && op.regClass() == instr->definitions[0].regClass()) { + affinity_related->emplace_back(op.getTemp()); + if (block.kind & block_kind_loop_header) + continue; + temp_to_phi_resources[op.tempId()] = index; + } + } + } + + /* visit the loop header phis first in order to create nested affinities */ + if (block.kind & block_kind_loop_exit) { + /* find loop header */ + auto header_rit = block_rit; + while ((header_rit + 1)->loop_nest_depth > block.loop_nest_depth) + header_rit++; + + for (aco_ptr<Instruction>& phi : header_rit->instructions) { + if (!is_phi(phi)) + break; + if (phi->definitions[0].isKill() || 
phi->definitions[0].isFixed()) + continue; + + /* create an (empty) merge-set for the phi-related variables */ + auto it = temp_to_phi_resources.find(phi->definitions[0].tempId()); + unsigned index = phi_resources.size(); + if (it == temp_to_phi_resources.end()) { + temp_to_phi_resources[phi->definitions[0].tempId()] = index; + phi_resources.emplace_back(std::vector<Temp>{phi->definitions[0].getTemp()}); + } else { + index = it->second; + } + for (unsigned i = 1; i < phi->operands.size(); i++) { + const Operand& op = phi->operands[i]; + if (op.isTemp() && op.isKill() && op.regClass() == phi->definitions[0].regClass()) { + temp_to_phi_resources[op.tempId()] = index; } } } } } /* create affinities */ - for (std::vector<Temp>& vec : phi_ressources) { - assert(vec.size() > 1); + for (std::vector<Temp>& vec : phi_resources) { for (unsigned i = 1; i < vec.size(); i++) if (vec[i].id() != vec[0].id()) - ctx.affinities[vec[i].id()] = vec[0].id(); + ctx.assignments[vec[i].id()].affinity = vec[0].id(); } } -} /* end namespace */ +void +optimize_encoding_vop2(Program* program, ra_ctx& ctx, RegisterFile& register_file, + aco_ptr<Instruction>& instr) +{ + /* try to optimize v_mad_f32 -> v_mac_f32 */ + if ((instr->opcode != aco_opcode::v_mad_f32 && + (instr->opcode != aco_opcode::v_fma_f32 || program->gfx_level < GFX10) && + instr->opcode != aco_opcode::v_mad_f16 && instr->opcode != aco_opcode::v_mad_legacy_f16 && + (instr->opcode != aco_opcode::v_fma_f16 || program->gfx_level < GFX10) && + (instr->opcode != aco_opcode::v_pk_fma_f16 || program->gfx_level < GFX10) && + (instr->opcode != aco_opcode::v_mad_legacy_f32 || !program->dev.has_mac_legacy32) && + (instr->opcode != aco_opcode::v_fma_legacy_f32 || !program->dev.has_mac_legacy32) && + (instr->opcode != aco_opcode::v_dot4_i32_i8 || program->family == CHIP_VEGA20)) || + !instr->operands[2].isTemp() || !instr->operands[2].isKillBeforeDef() || + instr->operands[2].getTemp().type() != RegType::vgpr || + 
(!instr->operands[0].isOfType(RegType::vgpr) && + !instr->operands[1].isOfType(RegType::vgpr)) || + instr->operands[2].physReg().byte() != 0 || instr->valu().opsel[2]) + return; + + if (instr->isVOP3P() && (instr->valu().opsel_lo != 0 || instr->valu().opsel_hi != 0x7)) + return; + + if ((instr->operands[0].physReg().byte() != 0 || instr->operands[1].physReg().byte() != 0 || + instr->valu().opsel) && + program->gfx_level < GFX11) + return; + + unsigned im_mask = instr->isDPP16() ? 0x3 : 0; + if (instr->valu().omod || instr->valu().clamp || (instr->valu().abs & ~im_mask) || + (instr->valu().neg & ~im_mask)) + return; + + if (!instr->operands[1].isOfType(RegType::vgpr)) + instr->valu().swapOperands(0, 1); + + if (!instr->operands[0].isOfType(RegType::vgpr) && instr->valu().opsel[0]) + return; + + unsigned def_id = instr->definitions[0].tempId(); + if (ctx.assignments[def_id].affinity) { + assignment& affinity = ctx.assignments[ctx.assignments[def_id].affinity]; + if (affinity.assigned && affinity.reg != instr->operands[2].physReg() && + !register_file.test(affinity.reg, instr->operands[2].bytes())) + return; + } + + instr->format = (Format)(((unsigned)withoutVOP3(instr->format) & ~(unsigned)Format::VOP3P) | + (unsigned)Format::VOP2); + instr->valu().opsel_hi = 0; + switch (instr->opcode) { + case aco_opcode::v_mad_f32: instr->opcode = aco_opcode::v_mac_f32; break; + case aco_opcode::v_fma_f32: instr->opcode = aco_opcode::v_fmac_f32; break; + case aco_opcode::v_mad_f16: + case aco_opcode::v_mad_legacy_f16: instr->opcode = aco_opcode::v_mac_f16; break; + case aco_opcode::v_fma_f16: instr->opcode = aco_opcode::v_fmac_f16; break; + case aco_opcode::v_pk_fma_f16: instr->opcode = aco_opcode::v_pk_fmac_f16; break; + case aco_opcode::v_dot4_i32_i8: instr->opcode = aco_opcode::v_dot4c_i32_i8; break; + case aco_opcode::v_mad_legacy_f32: instr->opcode = aco_opcode::v_mac_legacy_f32; break; + case aco_opcode::v_fma_legacy_f32: instr->opcode = aco_opcode::v_fmac_legacy_f32; break; 
+ default: break; + } +} void -register_allocation(Program* program, std::vector<IDSet>& live_out_per_block, ra_test_policy policy) +optimize_encoding_sopk(Program* program, ra_ctx& ctx, RegisterFile& register_file, + aco_ptr<Instruction>& instr) { - ra_ctx ctx(program, policy); - get_affinities(ctx, live_out_per_block); + /* try to optimize sop2 with literal source to sopk */ + if (instr->opcode != aco_opcode::s_add_i32 && instr->opcode != aco_opcode::s_mul_i32 && + instr->opcode != aco_opcode::s_cselect_b32) + return; - /* state of register file after phis */ - std::vector<std::bitset<128>> sgpr_live_in(program->blocks.size()); + uint32_t literal_idx = 0; - for (Block& block : program->blocks) { - ctx.block = &block; + if (instr->opcode != aco_opcode::s_cselect_b32 && instr->operands[1].isLiteral()) + literal_idx = 1; - /* initialize register file */ - RegisterFile register_file = init_reg_file(ctx, live_out_per_block, block); - ctx.war_hint.reset(); + if (!instr->operands[!literal_idx].isTemp() || + !instr->operands[!literal_idx].isKillBeforeDef() || + instr->operands[!literal_idx].getTemp().type() != RegType::sgpr || + instr->operands[!literal_idx].physReg() >= 128) + return; - std::vector<aco_ptr<Instruction>> instructions; - std::vector<aco_ptr<Instruction>>::iterator instr_it; + if (!instr->operands[literal_idx].isLiteral()) + return; - /* this is a slight adjustment from the paper as we already have phi nodes: - * We consider them incomplete phis and only handle the definition. 
*/ + const uint32_t i16_mask = 0xffff8000u; + uint32_t value = instr->operands[literal_idx].constantValue(); + if ((value & i16_mask) && (value & i16_mask) != i16_mask) + return; - /* look up the affinities */ - for (instr_it = block.instructions.begin(); instr_it != block.instructions.end(); - ++instr_it) { - aco_ptr<Instruction>& phi = *instr_it; - if (!is_phi(phi)) - break; - Definition& definition = phi->definitions[0]; - if (definition.isKill() || definition.isFixed()) - continue; + unsigned def_id = instr->definitions[0].tempId(); + if (ctx.assignments[def_id].affinity) { + assignment& affinity = ctx.assignments[ctx.assignments[def_id].affinity]; + if (affinity.assigned && affinity.reg != instr->operands[!literal_idx].physReg() && + !register_file.test(affinity.reg, instr->operands[!literal_idx].bytes())) + return; + } - if (ctx.affinities.find(definition.tempId()) != ctx.affinities.end() && - ctx.assignments[ctx.affinities[definition.tempId()]].assigned) { - assert(ctx.assignments[ctx.affinities[definition.tempId()]].rc == - definition.regClass()); - PhysReg reg = ctx.assignments[ctx.affinities[definition.tempId()]].reg; - if (reg == scc) { - /* only use scc if all operands are already placed there */ - bool use_scc = - std::all_of(phi->operands.begin(), phi->operands.end(), - [](const Operand& op) - { return op.isTemp() && op.isFixed() && op.physReg() == scc; }); - if (!use_scc) - continue; - } + instr->format = Format::SOPK; + SALU_instruction* instr_sopk = &instr->salu(); - /* only assign if register is still free */ - if (!register_file.test(reg, definition.bytes())) { - definition.setFixed(reg); - register_file.fill(definition); - ctx.assignments[definition.tempId()] = {definition.physReg(), definition.regClass()}; - } + instr_sopk->imm = instr_sopk->operands[literal_idx].constantValue() & 0xffff; + if (literal_idx == 0) + std::swap(instr_sopk->operands[0], instr_sopk->operands[1]); + if (instr_sopk->operands.size() > 2) + 
std::swap(instr_sopk->operands[1], instr_sopk->operands[2]); + instr_sopk->operands.pop_back(); + + switch (instr_sopk->opcode) { + case aco_opcode::s_add_i32: instr_sopk->opcode = aco_opcode::s_addk_i32; break; + case aco_opcode::s_mul_i32: instr_sopk->opcode = aco_opcode::s_mulk_i32; break; + case aco_opcode::s_cselect_b32: instr_sopk->opcode = aco_opcode::s_cmovk_i32; break; + default: unreachable("illegal instruction"); + } +} + +void +optimize_encoding(Program* program, ra_ctx& ctx, RegisterFile& register_file, + aco_ptr<Instruction>& instr) +{ + if (instr->isVALU()) + optimize_encoding_vop2(program, ctx, register_file, instr); + if (instr->isSALU()) + optimize_encoding_sopk(program, ctx, register_file, instr); +} + +void +emit_parallel_copy_internal(ra_ctx& ctx, std::vector<std::pair<Operand, Definition>>& parallelcopy, + aco_ptr<Instruction>& instr, + std::vector<aco_ptr<Instruction>>& instructions, bool temp_in_scc, + RegisterFile& register_file) +{ + if (parallelcopy.empty()) + return; + + aco_ptr<Instruction> pc; + pc.reset(create_instruction(aco_opcode::p_parallelcopy, Format::PSEUDO, parallelcopy.size(), + parallelcopy.size())); + bool linear_vgpr = false; + bool sgpr_operands_alias_defs = false; + uint64_t sgpr_operands[4] = {0, 0, 0, 0}; + for (unsigned i = 0; i < parallelcopy.size(); i++) { + linear_vgpr |= parallelcopy[i].first.regClass().is_linear_vgpr(); + + if (temp_in_scc && parallelcopy[i].first.isTemp() && + parallelcopy[i].first.getTemp().type() == RegType::sgpr) { + if (!sgpr_operands_alias_defs) { + unsigned reg = parallelcopy[i].first.physReg().reg(); + unsigned size = parallelcopy[i].first.getTemp().size(); + sgpr_operands[reg / 64u] |= u_bit_consecutive64(reg % 64u, size); + + reg = parallelcopy[i].second.physReg().reg(); + size = parallelcopy[i].second.getTemp().size(); + if (sgpr_operands[reg / 64u] & u_bit_consecutive64(reg % 64u, size)) + sgpr_operands_alias_defs = true; } } - /* find registers for phis without affinity or where the 
register was blocked */ - for (instr_it = block.instructions.begin(); instr_it != block.instructions.end(); - ++instr_it) { - aco_ptr<Instruction>& phi = *instr_it; - if (!is_phi(phi)) - break; + pc->operands[i] = parallelcopy[i].first; + pc->definitions[i] = parallelcopy[i].second; + assert(pc->operands[i].size() == pc->definitions[i].size()); - Definition& definition = phi->definitions[0]; - if (definition.isKill()) - continue; + /* it might happen that the operand is already renamed. we have to restore the + * original name. */ + std::unordered_map<unsigned, Temp>::iterator it = + ctx.orig_names.find(pc->operands[i].tempId()); + Temp orig = it != ctx.orig_names.end() ? it->second : pc->operands[i].getTemp(); + ctx.orig_names[pc->definitions[i].tempId()] = orig; + ctx.renames[ctx.block->index][orig.id()] = pc->definitions[i].getTemp(); + } - if (!definition.isFixed()) { - std::vector<std::pair<Operand, Definition>> parallelcopy; - /* try to find a register that is used by at least one operand */ - for (int i = phi->operands.size() - 1; i >= 0; i--) { - /* by going backwards, we aim to avoid copies in else-blocks */ - const Operand& op = phi->operands[i]; - if (!op.isTemp() || !op.isFixed()) - continue; - PhysReg reg = op.physReg(); - /* we tried this already on the previous loop */ - if (reg == scc) - continue; - if (get_reg_specified(ctx, register_file, definition.regClass(), phi, reg)) { - definition.setFixed(reg); - break; - } - } - if (!definition.isFixed()) { - definition.setFixed( - get_reg(ctx, register_file, definition.getTemp(), parallelcopy, phi)); - update_renames(ctx, register_file, parallelcopy, phi, rename_not_killed_ops); - } + if (temp_in_scc && (sgpr_operands_alias_defs || linear_vgpr)) { + /* disable definitions and re-enable operands */ + RegisterFile tmp_file(register_file); + for (const Definition& def : instr->definitions) { + if (def.isTemp() && !def.isKill()) + tmp_file.clear(def); + } + for (const Operand& op : instr->operands) { + if 
(op.isTemp() && op.isFirstKill()) + tmp_file.block(op.physReg(), op.regClass()); + } - /* process parallelcopy */ - for (std::pair<Operand, Definition> pc : parallelcopy) { - /* see if it's a copy from a different phi */ - // TODO: prefer moving some previous phis over live-ins - // TODO: somehow prevent phis fixed before the RA from being updated (shouldn't be a - // problem in practice since they can only be fixed to exec) - Instruction* prev_phi = NULL; - std::vector<aco_ptr<Instruction>>::iterator phi_it; - for (phi_it = instructions.begin(); phi_it != instructions.end(); ++phi_it) { - if ((*phi_it)->definitions[0].tempId() == pc.first.tempId()) - prev_phi = phi_it->get(); - } - phi_it = instr_it; - while (!prev_phi && is_phi(*++phi_it)) { - if ((*phi_it)->definitions[0].tempId() == pc.first.tempId()) - prev_phi = phi_it->get(); - } - if (prev_phi) { - /* if so, just update that phi's register */ - register_file.clear(prev_phi->definitions[0]); - prev_phi->definitions[0].setFixed(pc.second.physReg()); - ctx.assignments[prev_phi->definitions[0].tempId()] = {pc.second.physReg(), - pc.second.regClass()}; - register_file.fill(prev_phi->definitions[0]); - continue; - } + handle_pseudo(ctx, tmp_file, pc.get()); + } else { + pc->pseudo().needs_scratch_reg = sgpr_operands_alias_defs || linear_vgpr; + pc->pseudo().tmp_in_scc = false; + } - /* rename */ - std::unordered_map<unsigned, Temp>::iterator orig_it = - ctx.orig_names.find(pc.first.tempId()); - Temp orig = pc.first.getTemp(); - if (orig_it != ctx.orig_names.end()) - orig = orig_it->second; - else - ctx.orig_names[pc.second.tempId()] = orig; - ctx.renames[block.index][orig.id()] = pc.second.getTemp(); - - /* otherwise, this is a live-in and we need to create a new phi - * to move it in this block's predecessors */ - aco_opcode opcode = - pc.first.getTemp().is_linear() ? aco_opcode::p_linear_phi : aco_opcode::p_phi; - std::vector<unsigned>& preds = - pc.first.getTemp().is_linear() ? 
block.linear_preds : block.logical_preds; - aco_ptr<Instruction> new_phi{ - create_instruction<Pseudo_instruction>(opcode, Format::PSEUDO, preds.size(), 1)}; - new_phi->definitions[0] = pc.second; - for (unsigned i = 0; i < preds.size(); i++) - new_phi->operands[i] = Operand(pc.first); - instructions.emplace_back(std::move(new_phi)); - - /* Remove from live_out_per_block (now used for live-in), because handle_loop_phis() - * would re-create this phi later if this is a loop header. - */ - live_out_per_block[block.index].erase(orig.id()); - } + instructions.emplace_back(std::move(pc)); - register_file.fill(definition); - ctx.assignments[definition.tempId()] = {definition.physReg(), definition.regClass()}; - } + parallelcopy.clear(); +} - /* update phi affinities */ - for (const Operand& op : phi->operands) { - if (op.isTemp() && op.regClass() == phi->definitions[0].regClass()) - ctx.affinities[op.tempId()] = definition.tempId(); +void +emit_parallel_copy(ra_ctx& ctx, std::vector<std::pair<Operand, Definition>>& parallelcopy, + aco_ptr<Instruction>& instr, std::vector<aco_ptr<Instruction>>& instructions, + bool temp_in_scc, RegisterFile& register_file) +{ + if (parallelcopy.empty()) + return; + + std::vector<std::pair<Operand, Definition>> linear_vgpr; + if (ctx.num_linear_vgprs) { + unsigned next = 0; + for (unsigned i = 0; i < parallelcopy.size(); i++) { + if (parallelcopy[i].first.regClass().is_linear_vgpr()) { + linear_vgpr.push_back(parallelcopy[i]); + continue; } - instructions.emplace_back(std::move(*instr_it)); + if (next != i) + parallelcopy[next] = parallelcopy[i]; + next++; } + parallelcopy.resize(next); + } + + /* Because of how linear VGPRs are allocated, we should never have to move a linear VGPR into the + * space of a normal one. This means the copy can be done entirely before normal VGPR copies. 
*/ + emit_parallel_copy_internal(ctx, linear_vgpr, instr, instructions, temp_in_scc, + register_file); + emit_parallel_copy_internal(ctx, parallelcopy, instr, instructions, temp_in_scc, + register_file); +} + +} /* end namespace */ + +void +register_allocation(Program* program, live& live_vars, ra_test_policy policy) +{ + std::vector<IDSet>& live_out_per_block = live_vars.live_out; + ra_ctx ctx(program, policy); + get_affinities(ctx, live_out_per_block); - /* fill in sgpr_live_in */ - for (unsigned i = 0; i <= ctx.max_used_sgpr; i++) - sgpr_live_in[block.index][i] = register_file[PhysReg{i}]; - sgpr_live_in[block.index][127] = register_file[scc]; + for (Block& block : program->blocks) { + ctx.block = &block; + + /* initialize register file */ + RegisterFile register_file = init_reg_file(ctx, live_out_per_block, block); + ctx.war_hint.reset(); + + std::vector<aco_ptr<Instruction>> instructions; + instructions.reserve(block.instructions.size()); + + /* this is a slight adjustment from the paper as we already have phi nodes: + * We consider them incomplete phis and only handle the definition. */ + get_regs_for_phis(ctx, block, register_file, instructions, live_out_per_block[block.index]); + + /* If this is a merge block, the state of the register file at the branch instruction of the + * predecessors corresponds to the state after phis at the merge block. So, we allocate a + * register for the predecessor's branch definitions as if there was a phi. 
+ */ + if (!block.linear_preds.empty() && + (block.linear_preds.size() != 1 || + program->blocks[block.linear_preds[0]].linear_succs.size() == 1)) { + PhysReg br_reg = get_reg_phi(ctx, live_out_per_block[block.index], register_file, + instructions, block, ctx.phi_dummy, Temp(0, s2)); + for (unsigned pred : block.linear_preds) { + program->blocks[pred].scc_live_out = register_file[scc]; + aco_ptr<Instruction>& br = program->blocks[pred].instructions.back(); + + assert(br->definitions.size() == 1 && br->definitions[0].regClass() == s2 && + br->definitions[0].isKill()); + + br->definitions[0].setFixed(br_reg); + } + } /* Handle all other instructions of the block */ + auto NonPhi = [](aco_ptr<Instruction>& instr) -> bool { return instr && !is_phi(instr); }; + std::vector<aco_ptr<Instruction>>::iterator instr_it = + std::find_if(block.instructions.begin(), block.instructions.end(), NonPhi); for (; instr_it != block.instructions.end(); ++instr_it) { aco_ptr<Instruction>& instr = *instr_it; @@ -2438,12 +3051,18 @@ register_allocation(Program* program, std::vector<IDSet>& live_out_per_block, ra } std::vector<std::pair<Operand, Definition>> parallelcopy; + bool temp_in_scc = register_file[scc]; - assert(!is_phi(instr)); + if (instr->opcode == aco_opcode::p_branch) { + /* unconditional branches are handled after phis of the target */ + instructions.emplace_back(std::move(instr)); + break; + } - bool temp_in_scc = register_file[scc]; + assert(!is_phi(instr)); /* handle operands */ + bool fixed = false; for (unsigned i = 0; i < instr->operands.size(); ++i) { auto& operand = instr->operands[i]; if (!operand.isTemp()) @@ -2453,13 +3072,37 @@ register_allocation(Program* program, std::vector<IDSet>& live_out_per_block, ra operand.setTemp(read_variable(ctx, operand.getTemp(), block.index)); assert(ctx.assignments[operand.tempId()].assigned); + fixed |= + operand.isFixed() && ctx.assignments[operand.tempId()].reg != operand.physReg(); + } + + bool is_writelane = instr->opcode == 
aco_opcode::v_writelane_b32 || + instr->opcode == aco_opcode::v_writelane_b32_e64; + if (program->gfx_level <= GFX9 && is_writelane && instr->operands[0].isTemp() && + instr->operands[1].isTemp()) { + /* v_writelane_b32 can take two sgprs but only if one is m0. */ + if (ctx.assignments[instr->operands[0].tempId()].reg != m0 && + ctx.assignments[instr->operands[1].tempId()].reg != m0) { + instr->operands[0].setFixed(m0); + fixed = true; + } + } + + if (fixed) + handle_fixed_operands(ctx, register_file, parallelcopy, instr); + + for (unsigned i = 0; i < instr->operands.size(); ++i) { + auto& operand = instr->operands[i]; + if (!operand.isTemp() || operand.isFixed()) + continue; + PhysReg reg = ctx.assignments[operand.tempId()].reg; - if (operand_can_use_reg(program->chip_class, instr, i, reg, operand.regClass())) + if (operand_can_use_reg(program->gfx_level, instr, i, reg, operand.regClass())) operand.setFixed(reg); else get_reg_for_operand(ctx, register_file, parallelcopy, instr, operand, i); - if (instr->isEXP() || (instr->isVMEM() && i == 3 && ctx.program->chip_class == GFX6) || + if (instr->isEXP() || (instr->isVMEM() && i == 3 && ctx.program->gfx_level == GFX6) || (instr->isDS() && instr->ds().gds)) { for (unsigned j = 0; j < operand.size(); j++) ctx.war_hint.set(operand.physReg().reg() + j); @@ -2472,59 +3115,17 @@ register_allocation(Program* program, std::vector<IDSet>& live_out_per_block, ra register_file.clear(op); } - /* try to optimize v_mad_f32 -> v_mac_f32 */ - if ((instr->opcode == aco_opcode::v_mad_f32 || - (instr->opcode == aco_opcode::v_fma_f32 && program->chip_class >= GFX10) || - instr->opcode == aco_opcode::v_mad_f16 || - instr->opcode == aco_opcode::v_mad_legacy_f16 || - (instr->opcode == aco_opcode::v_fma_f16 && program->chip_class >= GFX10) || - (instr->opcode == aco_opcode::v_pk_fma_f16 && program->chip_class >= GFX10) || - (instr->opcode == aco_opcode::v_dot4_i32_i8 && program->family != CHIP_VEGA20)) && - instr->operands[2].isTemp() && 
instr->operands[2].isKillBeforeDef() && - instr->operands[2].getTemp().type() == RegType::vgpr && instr->operands[1].isTemp() && - instr->operands[1].getTemp().type() == RegType::vgpr && !instr->usesModifiers() && - instr->operands[0].physReg().byte() == 0 && instr->operands[1].physReg().byte() == 0 && - instr->operands[2].physReg().byte() == 0) { - unsigned def_id = instr->definitions[0].tempId(); - auto it = ctx.affinities.find(def_id); - if (it == ctx.affinities.end() || !ctx.assignments[it->second].assigned || - instr->operands[2].physReg() == ctx.assignments[it->second].reg || - register_file.test(ctx.assignments[it->second].reg, instr->operands[2].bytes())) { - instr->format = Format::VOP2; - switch (instr->opcode) { - case aco_opcode::v_mad_f32: instr->opcode = aco_opcode::v_mac_f32; break; - case aco_opcode::v_fma_f32: instr->opcode = aco_opcode::v_fmac_f32; break; - case aco_opcode::v_mad_f16: - case aco_opcode::v_mad_legacy_f16: instr->opcode = aco_opcode::v_mac_f16; break; - case aco_opcode::v_fma_f16: instr->opcode = aco_opcode::v_fmac_f16; break; - case aco_opcode::v_pk_fma_f16: instr->opcode = aco_opcode::v_pk_fmac_f16; break; - case aco_opcode::v_dot4_i32_i8: instr->opcode = aco_opcode::v_dot4c_i32_i8; break; - default: break; - } - } - } - - /* handle definitions which must have the same register as an operand */ - if (instr->opcode == aco_opcode::v_interp_p2_f32 || - instr->opcode == aco_opcode::v_mac_f32 || instr->opcode == aco_opcode::v_fmac_f32 || - instr->opcode == aco_opcode::v_mac_f16 || instr->opcode == aco_opcode::v_fmac_f16 || - instr->opcode == aco_opcode::v_pk_fmac_f16 || - instr->opcode == aco_opcode::v_writelane_b32 || - instr->opcode == aco_opcode::v_writelane_b32_e64 || - instr->opcode == aco_opcode::v_dot4c_i32_i8) { - instr->definitions[0].setFixed(instr->operands[2].physReg()); - } else if (instr->opcode == aco_opcode::s_addk_i32 || - instr->opcode == aco_opcode::s_mulk_i32) { - 
instr->definitions[0].setFixed(instr->operands[0].physReg()); - } else if (instr->isMUBUF() && instr->definitions.size() == 1 && - instr->operands.size() == 4) { - instr->definitions[0].setFixed(instr->operands[3].physReg()); - } else if (instr->isMIMG() && instr->definitions.size() == 1 && - !instr->operands[2].isUndefined()) { - instr->definitions[0].setFixed(instr->operands[2].physReg()); - } + optimize_encoding(program, ctx, register_file, instr); - ctx.defs_done.reset(); + /* Handle definitions which must have the same register as an operand. + * We expect that the definition has the same size as the operand, otherwise the new + * location for the operand (if it's not killed) might intersect with the old one. + * We can't read from the old location because it's corrupted, and we can't write the new + * location because that's used by a live-through operand. + */ + int op_fixed_to_def = get_op_fixed_to_def(instr.get()); + if (op_fixed_to_def != -1) + instr->definitions[0].setFixed(instr->operands[op_fixed_to_def].physReg()); /* handle fixed definitions first */ for (unsigned i = 0; i < instr->definitions.size(); ++i) { @@ -2538,8 +3139,7 @@ register_allocation(Program* program, std::vector<IDSet>& live_out_per_block, ra const PhysRegInterval def_regs{definition.physReg(), definition.size()}; /* create parallelcopy pair to move blocking vars */ - std::set<std::pair<unsigned, unsigned>> vars = - collect_vars(ctx, register_file, def_regs); + std::vector<unsigned> vars = collect_vars(ctx, register_file, def_regs); RegisterFile tmp_file(register_file); /* re-enable the killed operands, so that we don't move the blocking vars there */ @@ -2549,19 +3149,16 @@ register_allocation(Program* program, std::vector<IDSet>& live_out_per_block, ra } ASSERTED bool success = false; - DefInfo info(ctx, instr, definition.regClass(), -1); - success = get_regs_for_copies(ctx, tmp_file, parallelcopy, vars, info.bounds, instr, - def_regs); + success = get_regs_for_copies(ctx, 
tmp_file, parallelcopy, vars, instr, def_regs); assert(success); update_renames(ctx, register_file, parallelcopy, instr, (UpdateRenames)0); } - ctx.defs_done.set(i); if (!definition.isTemp()) continue; - ctx.assignments[definition.tempId()] = {definition.physReg(), definition.regClass()}; + ctx.assignments[definition.tempId()].set(definition); register_file.fill(definition); } @@ -2573,18 +3170,30 @@ register_allocation(Program* program, std::vector<IDSet>& live_out_per_block, ra continue; /* find free reg */ - if (definition->hasHint() && - get_reg_specified(ctx, register_file, definition->regClass(), instr, - definition->physReg())) { - definition->setFixed(definition->physReg()); + if (instr->opcode == aco_opcode::p_start_linear_vgpr) { + /* Allocation of linear VGPRs is special. */ + definition->setFixed(alloc_linear_vgpr(ctx, register_file, instr, parallelcopy)); + update_renames(ctx, register_file, parallelcopy, instr, rename_not_killed_ops); } else if (instr->opcode == aco_opcode::p_split_vector) { PhysReg reg = instr->operands[0].physReg(); + RegClass rc = definition->regClass(); for (unsigned j = 0; j < i; j++) reg.reg_b += instr->definitions[j].bytes(); - if (get_reg_specified(ctx, register_file, definition->regClass(), instr, reg)) + if (get_reg_specified(ctx, register_file, rc, instr, reg)) { definition->setFixed(reg); - } else if (instr->opcode == aco_opcode::p_wqm || - instr->opcode == aco_opcode::p_parallelcopy) { + } else if (i == 0) { + RegClass vec_rc = RegClass::get(rc.type(), instr->operands[0].bytes()); + DefInfo info(ctx, ctx.pseudo_dummy, vec_rc, -1); + std::optional<PhysReg> res = get_reg_simple(ctx, register_file, info); + if (res && get_reg_specified(ctx, register_file, rc, instr, *res)) + definition->setFixed(*res); + } else if (instr->definitions[i - 1].isFixed()) { + reg = instr->definitions[i - 1].physReg(); + reg.reg_b += instr->definitions[i - 1].bytes(); + if (get_reg_specified(ctx, register_file, rc, instr, reg)) + 
definition->setFixed(reg); + } + } else if (instr->opcode == aco_opcode::p_parallelcopy) { PhysReg reg = instr->operands[i].physReg(); if (instr->operands[i].isTemp() && instr->operands[i].getTemp().type() == definition->getTemp().type() && @@ -2600,6 +3209,14 @@ register_allocation(Program* program, std::vector<IDSet>& live_out_per_block, ra parallelcopy, instr); update_renames(ctx, register_file, parallelcopy, instr, (UpdateRenames)0); definition->setFixed(reg); + } else if (instr_info.classes[(int)instr->opcode] == instr_class::wmma && + instr->operands[2].isTemp() && instr->operands[2].isKill() && + instr->operands[2].regClass() == definition->regClass()) { + /* For WMMA, the dest needs to either be equal to operands[2], or not overlap it. + * Here we set a policy of forcing them the same if operands[2] gets killed (and + * otherwise they don't overlap). This may not be optimal if RA would select a + * different location due to affinity, but that gets complicated very quickly. */ + definition->setFixed(instr->operands[2].physReg()); } if (!definition->isFixed()) { @@ -2608,7 +3225,8 @@ register_allocation(Program* program, std::vector<IDSet>& live_out_per_block, ra PhysReg reg = get_reg(ctx, register_file, tmp, parallelcopy, instr); definition->setFixed(reg); if (reg.byte() || register_file.test(reg, 4)) { - add_subdword_definition(program, instr, reg); + bool allow_16bit_write = reg.byte() % 2 == 0 && !register_file.test(reg, 2); + add_subdword_definition(program, instr, reg, allow_16bit_write); definition = &instr->definitions[i]; /* add_subdword_definition can invalidate the reference */ } @@ -2624,8 +3242,7 @@ register_allocation(Program* program, std::vector<IDSet>& live_out_per_block, ra definition->isFixed() && ((definition->getTemp().type() == RegType::vgpr && definition->physReg() >= 256) || (definition->getTemp().type() != RegType::vgpr && definition->physReg() < 256))); - ctx.defs_done.set(i); - ctx.assignments[definition->tempId()] = 
{definition->physReg(), definition->regClass()}; + ctx.assignments[definition->tempId()].set(*definition); register_file.fill(*definition); } @@ -2645,87 +3262,30 @@ register_allocation(Program* program, std::vector<IDSet>& live_out_per_block, ra add_subdword_operand(ctx, instr, i, op.physReg().byte(), op.regClass()); } - /* emit parallelcopy */ - if (!parallelcopy.empty()) { - aco_ptr<Pseudo_instruction> pc; - pc.reset(create_instruction<Pseudo_instruction>(aco_opcode::p_parallelcopy, - Format::PSEUDO, parallelcopy.size(), - parallelcopy.size())); - bool linear_vgpr = false; - bool sgpr_operands_alias_defs = false; - uint64_t sgpr_operands[4] = {0, 0, 0, 0}; - for (unsigned i = 0; i < parallelcopy.size(); i++) { - linear_vgpr |= parallelcopy[i].first.regClass().is_linear_vgpr(); - - if (temp_in_scc && parallelcopy[i].first.isTemp() && - parallelcopy[i].first.getTemp().type() == RegType::sgpr) { - if (!sgpr_operands_alias_defs) { - unsigned reg = parallelcopy[i].first.physReg().reg(); - unsigned size = parallelcopy[i].first.getTemp().size(); - sgpr_operands[reg / 64u] |= u_bit_consecutive64(reg % 64u, size); - - reg = parallelcopy[i].second.physReg().reg(); - size = parallelcopy[i].second.getTemp().size(); - if (sgpr_operands[reg / 64u] & u_bit_consecutive64(reg % 64u, size)) - sgpr_operands_alias_defs = true; - } - } - - pc->operands[i] = parallelcopy[i].first; - pc->definitions[i] = parallelcopy[i].second; - assert(pc->operands[i].size() == pc->definitions[i].size()); - - /* it might happen that the operand is already renamed. we have to restore the - * original name. */ - std::unordered_map<unsigned, Temp>::iterator it = - ctx.orig_names.find(pc->operands[i].tempId()); - Temp orig = it != ctx.orig_names.end() ? 
it->second : pc->operands[i].getTemp(); - ctx.orig_names[pc->definitions[i].tempId()] = orig; - ctx.renames[block.index][orig.id()] = pc->definitions[i].getTemp(); - } - - if (temp_in_scc && (sgpr_operands_alias_defs || linear_vgpr)) { - /* disable definitions and re-enable operands */ - RegisterFile tmp_file(register_file); - for (const Definition& def : instr->definitions) { - if (def.isTemp() && !def.isKill()) - tmp_file.clear(def); - } - for (const Operand& op : instr->operands) { - if (op.isTemp() && op.isFirstKill()) - tmp_file.block(op.physReg(), op.regClass()); - } - - handle_pseudo(ctx, tmp_file, pc.get()); - } else { - pc->tmp_in_scc = false; - } - - instructions.emplace_back(std::move(pc)); - } + emit_parallel_copy(ctx, parallelcopy, instr, instructions, temp_in_scc, register_file); /* some instructions need VOP3 encoding if operand/definition is not assigned to VCC */ bool instr_needs_vop3 = !instr->isVOP3() && - ((instr->format == Format::VOPC && !(instr->definitions[0].physReg() == vcc)) || - (instr->opcode == aco_opcode::v_cndmask_b32 && - !(instr->operands[2].physReg() == vcc)) || + ((withoutDPP(instr->format) == Format::VOPC && + instr->definitions[0].physReg() != vcc) || + (instr->opcode == aco_opcode::v_cndmask_b32 && instr->operands[2].physReg() != vcc) || ((instr->opcode == aco_opcode::v_add_co_u32 || instr->opcode == aco_opcode::v_addc_co_u32 || instr->opcode == aco_opcode::v_sub_co_u32 || instr->opcode == aco_opcode::v_subb_co_u32 || instr->opcode == aco_opcode::v_subrev_co_u32 || instr->opcode == aco_opcode::v_subbrev_co_u32) && - !(instr->definitions[1].physReg() == vcc)) || + instr->definitions[1].physReg() != vcc) || ((instr->opcode == aco_opcode::v_addc_co_u32 || instr->opcode == aco_opcode::v_subb_co_u32 || instr->opcode == aco_opcode::v_subbrev_co_u32) && - !(instr->operands[2].physReg() == vcc))); + instr->operands[2].physReg() != vcc)); if (instr_needs_vop3) { /* if the first operand is a literal, we have to move it to a reg */ if 
(instr->operands.size() && instr->operands[0].isLiteral() && - program->chip_class < GFX10) { + program->gfx_level < GFX10) { bool can_sgpr = true; /* check, if we have to move to vgpr */ for (const Operand& op : instr->operands) { @@ -2749,11 +3309,9 @@ register_allocation(Program* program, std::vector<IDSet>& live_out_per_block, ra aco_ptr<Instruction> mov; if (can_sgpr) - mov.reset(create_instruction<SOP1_instruction>(aco_opcode::s_mov_b32, - Format::SOP1, 1, 1)); + mov.reset(create_instruction(aco_opcode::s_mov_b32, Format::SOP1, 1, 1)); else - mov.reset(create_instruction<VOP1_instruction>(aco_opcode::v_mov_b32, - Format::VOP1, 1, 1)); + mov.reset(create_instruction(aco_opcode::v_mov_b32, Format::VOP1, 1, 1)); mov->operands[0] = instr->operands[0]; mov->definitions[0] = Definition(tmp); mov->definitions[0].setFixed(reg); @@ -2766,47 +3324,42 @@ register_allocation(Program* program, std::vector<IDSet>& live_out_per_block, ra } /* change the instruction to VOP3 to enable an arbitrary register pair as dst */ - aco_ptr<Instruction> tmp = std::move(instr); - Format format = asVOP3(tmp->format); - instr.reset(create_instruction<VOP3_instruction>( - tmp->opcode, format, tmp->operands.size(), tmp->definitions.size())); - std::copy(tmp->operands.begin(), tmp->operands.end(), instr->operands.begin()); - std::copy(tmp->definitions.begin(), tmp->definitions.end(), instr->definitions.begin()); + instr->format = asVOP3(instr->format); } instructions.emplace_back(std::move(*instr_it)); } /* end for Instr */ - block.instructions = std::move(instructions); - } /* end for BB */ + if ((block.kind & block_kind_top_level) && block.linear_succs.empty()) { + /* Reset this for block_kind_resume. 
*/ + ctx.num_linear_vgprs = 0; - /* find scc spill registers which may be needed for parallelcopies created by phis */ - for (Block& block : program->blocks) { - if (block.linear_preds.size() <= 1) - continue; + ASSERTED PhysRegInterval vgpr_bounds = get_reg_bounds(ctx, RegType::vgpr, false); + ASSERTED PhysRegInterval sgpr_bounds = get_reg_bounds(ctx, RegType::sgpr, false); + assert(register_file.count_zero(vgpr_bounds) == ctx.vgpr_bounds); + assert(register_file.count_zero(sgpr_bounds) == ctx.sgpr_bounds); + } else if (should_compact_linear_vgprs(ctx, live_vars, register_file)) { + aco_ptr<Instruction> br = std::move(instructions.back()); + instructions.pop_back(); - std::bitset<128> regs = sgpr_live_in[block.index]; - if (!regs[127]) - continue; + bool temp_in_scc = + register_file[scc] || (!br->operands.empty() && br->operands[0].physReg() == scc); - /* choose a register */ - int16_t reg = 0; - for (; reg < ctx.program->max_reg_demand.sgpr && regs[reg]; reg++) - ; - assert(reg < ctx.program->max_reg_demand.sgpr); - adjust_max_used_regs(ctx, s1, reg); + std::vector<std::pair<Operand, Definition>> parallelcopy; + compact_linear_vgprs(ctx, register_file, parallelcopy); + update_renames(ctx, register_file, parallelcopy, br, rename_not_killed_ops); + emit_parallel_copy_internal(ctx, parallelcopy, br, instructions, temp_in_scc, register_file); - /* update predecessors */ - for (unsigned& pred_index : block.linear_preds) { - Block& pred = program->blocks[pred_index]; - pred.scc_live_out = true; - pred.scratch_sgpr = PhysReg{(uint16_t)reg}; + instructions.push_back(std::move(br)); } - } + + block.instructions = std::move(instructions); + } /* end for BB */ /* num_gpr = rnd_up(max_used_gpr + 1) */ - program->config->num_vgprs = get_vgpr_alloc(program, ctx.max_used_vgpr + 1); + program->config->num_vgprs = + std::min<uint16_t>(get_vgpr_alloc(program, ctx.max_used_vgpr + 1), 256); program->config->num_sgprs = get_sgpr_alloc(program, ctx.max_used_sgpr + 1); 
program->progress = CompilationProgress::after_ra; |