Diffstat (limited to 'src/amd/compiler/aco_register_allocation.cpp')
-rw-r--r--  src/amd/compiler/aco_register_allocation.cpp | 2011
1 file changed, 1282 insertions(+), 729 deletions(-)
diff --git a/src/amd/compiler/aco_register_allocation.cpp b/src/amd/compiler/aco_register_allocation.cpp
index d474dfe463a..47fefded1e5 100644
--- a/src/amd/compiler/aco_register_allocation.cpp
+++ b/src/amd/compiler/aco_register_allocation.cpp
@@ -1,33 +1,19 @@
/*
* Copyright © 2018 Valve Corporation
*
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- *
+ * SPDX-License-Identifier: MIT
*/
#include "aco_ir.h"
+#include "util/bitset.h"
+#include "util/enum_operators.h"
+
#include <algorithm>
#include <array>
#include <bitset>
#include <map>
+#include <optional>
#include <set>
#include <unordered_map>
#include <vector>
@@ -37,20 +23,35 @@ namespace {
struct ra_ctx;
-unsigned get_subdword_operand_stride(chip_class chip, const aco_ptr<Instruction>& instr,
+unsigned get_subdword_operand_stride(amd_gfx_level gfx_level, const aco_ptr<Instruction>& instr,
unsigned idx, RegClass rc);
void add_subdword_operand(ra_ctx& ctx, aco_ptr<Instruction>& instr, unsigned idx, unsigned byte,
RegClass rc);
std::pair<unsigned, unsigned>
get_subdword_definition_info(Program* program, const aco_ptr<Instruction>& instr, RegClass rc);
-void add_subdword_definition(Program* program, aco_ptr<Instruction>& instr, PhysReg reg);
+void add_subdword_definition(Program* program, aco_ptr<Instruction>& instr, PhysReg reg,
+ bool allow_16bit_write);
struct assignment {
PhysReg reg;
RegClass rc;
- uint8_t assigned = 0;
+ union {
+ struct {
+ bool assigned : 1;
+ bool vcc : 1;
+ bool m0 : 1;
+ };
+ uint8_t _ = 0;
+ };
+ uint32_t affinity = 0;
assignment() = default;
- assignment(PhysReg reg_, RegClass rc_) : reg(reg_), rc(rc_), assigned(-1) {}
+ assignment(PhysReg reg_, RegClass rc_) : reg(reg_), rc(rc_) { assigned = true; }
+ void set(const Definition& def)
+ {
+ assigned = true;
+ reg = def.physReg();
+ rc = def.regClass();
+ }
};
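
The new assignment struct packs its three status flags into a single byte: the anonymous union lets the default member initializer (uint8_t _ = 0) clear every bitfield at once while keeping each flag individually addressable. A standalone sketch of the pattern — identifiers invented, and the read through the packed byte relies on the anonymous-struct-in-union extension and type-punning behavior that GCC/Clang accept, as the ACO code itself does:

    #include <cassert>
    #include <cstdint>

    struct flags_demo {
       union {
          struct {
             bool assigned : 1;
             bool vcc : 1;
             bool m0 : 1;
          };
          uint8_t all = 0; /* default-initializes the whole union to zero */
       };
    };

    int main()
    {
       flags_demo f;
       assert(!f.assigned && !f.vcc && !f.m0); /* all flags clear by default */
       f.vcc = true;
       assert(f.all != 0); /* punning read of the packed byte; OK on GCC/Clang */
    }
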
struct ra_ctx {
@@ -61,16 +62,19 @@ struct ra_ctx {
std::vector<std::unordered_map<unsigned, Temp>> renames;
std::vector<uint32_t> loop_header;
std::unordered_map<unsigned, Temp> orig_names;
- std::unordered_map<unsigned, unsigned> affinities;
std::unordered_map<unsigned, Instruction*> vectors;
std::unordered_map<unsigned, Instruction*> split_vectors;
aco_ptr<Instruction> pseudo_dummy;
+ aco_ptr<Instruction> phi_dummy;
uint16_t max_used_sgpr = 0;
uint16_t max_used_vgpr = 0;
uint16_t sgpr_limit;
uint16_t vgpr_limit;
std::bitset<512> war_hint;
- std::bitset<64> defs_done; /* see MAX_ARGS in aco_instruction_selection_setup.cpp */
+
+ uint16_t sgpr_bounds;
+ uint16_t vgpr_bounds;
+ uint16_t num_linear_vgprs;
ra_test_policy policy;
@@ -78,10 +82,14 @@ struct ra_ctx {
: program(program_), assignments(program->peekAllocationId()),
renames(program->blocks.size()), policy(policy_)
{
- pseudo_dummy.reset(
- create_instruction<Instruction>(aco_opcode::p_parallelcopy, Format::PSEUDO, 0, 0));
+ pseudo_dummy.reset(create_instruction(aco_opcode::p_parallelcopy, Format::PSEUDO, 0, 0));
+ phi_dummy.reset(create_instruction(aco_opcode::p_linear_phi, Format::PSEUDO, 0, 0));
sgpr_limit = get_addr_sgpr_from_waves(program, program->min_waves);
- vgpr_limit = get_addr_sgpr_from_waves(program, program->min_waves);
+ vgpr_limit = get_addr_vgpr_from_waves(program, program->min_waves);
+
+ sgpr_bounds = program->max_reg_demand.sgpr;
+ vgpr_bounds = program->max_reg_demand.vgpr;
+ num_linear_vgprs = 0;
}
};
@@ -153,7 +161,7 @@ struct PhysRegInterval {
bool
intersects(const PhysRegInterval& a, const PhysRegInterval& b)
{
- return ((a.lo() >= b.lo() && a.lo() < b.hi()) || (a.hi() > b.lo() && a.hi() <= b.hi()));
+ return a.hi() > b.lo() && b.hi() > a.lo();
}
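
The rewritten intersects() is the canonical overlap test for half-open intervals: [a.lo, a.hi) and [b.lo, b.hi) overlap exactly when each starts before the other ends. The old formulation only tested whether one of a's endpoints fell inside b, so it returned false when a strictly contained b. A minimal sketch with plain unsigned bounds standing in for PhysReg:

    #include <cassert>

    struct interval {
       unsigned lo, hi; /* half-open: [lo, hi) */
    };

    bool intersects(interval a, interval b)
    {
       return a.hi > b.lo && b.hi > a.lo;
    }

    int main()
    {
       assert(intersects({0, 4}, {1, 2}));  /* a contains b: the old test missed this */
       assert(!intersects({0, 2}, {2, 4})); /* touching half-open intervals don't overlap */
    }
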
/* Gets the stride for full (non-subdword) registers */
@@ -175,15 +183,24 @@ get_stride(RegClass rc)
}
PhysRegInterval
-get_reg_bounds(Program* program, RegType type)
+get_reg_bounds(ra_ctx& ctx, RegType type, bool linear_vgpr)
{
- if (type == RegType::vgpr) {
- return {PhysReg{256}, (unsigned)program->max_reg_demand.vgpr};
+ uint16_t linear_vgpr_start = ctx.vgpr_bounds - ctx.num_linear_vgprs;
+ if (type == RegType::vgpr && linear_vgpr) {
+ return PhysRegInterval{PhysReg(256 + linear_vgpr_start), ctx.num_linear_vgprs};
+ } else if (type == RegType::vgpr) {
+ return PhysRegInterval{PhysReg(256), linear_vgpr_start};
} else {
- return {PhysReg{0}, (unsigned)program->max_reg_demand.sgpr};
+ return PhysRegInterval{PhysReg(0), ctx.sgpr_bounds};
}
}
+PhysRegInterval
+get_reg_bounds(ra_ctx& ctx, RegClass rc)
+{
+ return get_reg_bounds(ctx, rc.type(), rc.is_linear_vgpr());
+}
+
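
get_reg_bounds() now carves the linear VGPRs out of the top of the VGPR file: normal VGPRs occupy [256, 256 + vgpr_bounds - num_linear_vgprs) and linear ones the window above it. A worked sketch of the arithmetic with invented numbers:

    #include <cassert>

    int main()
    {
       unsigned vgpr_bounds = 64;     /* current allocatable VGPR count */
       unsigned num_linear_vgprs = 4; /* carved out at the top */
       unsigned linear_start = vgpr_bounds - num_linear_vgprs;

       /* normal VGPRs: v[0..59], encoded as registers [256, 316) */
       assert(256 + linear_start == 316);
       /* linear VGPRs: v[60..63], encoded as registers [316, 320) */
       assert(256 + linear_start + num_linear_vgprs == 320);
    }
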
struct DefInfo {
PhysRegInterval bounds;
uint8_t size;
@@ -195,11 +212,11 @@ struct DefInfo {
size = rc.size();
stride = get_stride(rc);
- bounds = get_reg_bounds(ctx.program, rc.type());
+ bounds = get_reg_bounds(ctx, rc);
if (rc.is_subdword() && operand >= 0) {
/* stride in bytes */
- stride = get_subdword_operand_stride(ctx.program->chip_class, instr, operand, rc);
+ stride = get_subdword_operand_stride(ctx.program->gfx_level, instr, operand, rc);
} else if (rc.is_subdword()) {
std::pair<unsigned, unsigned> info = get_subdword_definition_info(ctx.program, instr, rc);
stride = info.first;
@@ -214,6 +231,20 @@ struct DefInfo {
stride = DIV_ROUND_UP(stride, 4);
}
assert(stride > 0);
+ } else if (instr->isMIMG() && instr->mimg().d16 && ctx.program->gfx_level <= GFX9) {
+ /* Workaround GFX9 hardware bug for D16 image instructions: FeatureImageGather4D16Bug
+ *
+ * The register use is not calculated correctly, and the hardware assumes a
+ * full dword per component. Don't use the last registers of the register file.
+ * Otherwise, the instruction will be skipped.
+ *
+ * https://reviews.llvm.org/D81172
+ */
+ bool imageGather4D16Bug = operand == -1 && rc == v2 && instr->mimg().dmask != 0xF;
+ assert(ctx.program->gfx_level == GFX9 && "Image D16 on GFX8 not supported.");
+
+ if (imageGather4D16Bug)
+ bounds.size -= MAX2(rc.bytes() / 4 - ctx.num_linear_vgprs, 0);
}
}
};
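
For the gather4 D16 workaround, a v2 definition with a partial dmask reserves the top rc.bytes() / 4 registers of the file, less any linear VGPRs already blocking the top. A worked instance with invented bounds, written with signed arithmetic so the clamp against 0 is explicit:

    #include <algorithm>
    #include <cassert>

    int main()
    {
       int bounds_size = 64;
       int rc_bytes = 8; /* a v2 definition, dmask != 0xF */
       int num_linear_vgprs = 0;
       bounds_size -= std::max(rc_bytes / 4 - num_linear_vgprs, 0);
       assert(bounds_size == 62); /* the two highest VGPRs stay unused for this def */
    }
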
@@ -229,7 +260,7 @@ public:
uint32_t& operator[](PhysReg index) { return regs[index]; }
- unsigned count_zero(PhysRegInterval reg_interval)
+ unsigned count_zero(PhysRegInterval reg_interval) const
{
unsigned res = 0;
for (PhysReg reg : reg_interval)
@@ -238,16 +269,17 @@ public:
}
/* Returns true if any of the bytes in the given range are allocated or blocked */
- bool test(PhysReg start, unsigned num_bytes)
+ bool test(PhysReg start, unsigned num_bytes) const
{
for (PhysReg i = start; i.reg_b < start.reg_b + num_bytes; i = PhysReg(i + 1)) {
assert(i <= 511);
if (regs[i] & 0x0FFFFFFF)
return true;
if (regs[i] == 0xF0000000) {
- assert(subdword_regs.find(i) != subdword_regs.end());
+ auto it = subdword_regs.find(i);
+ assert(it != subdword_regs.end());
for (unsigned j = i.byte(); i * 4 + j < start.reg_b + num_bytes && j < 4; j++) {
- if (subdword_regs[i][j])
+ if (it->second[j])
return true;
}
}
@@ -263,24 +295,28 @@ public:
fill(start, rc.size(), 0xFFFFFFFF);
}
- bool is_blocked(PhysReg start)
+ bool is_blocked(PhysReg start) const
{
if (regs[start] == 0xFFFFFFFF)
return true;
if (regs[start] == 0xF0000000) {
+ auto it = subdword_regs.find(start);
+ assert(it != subdword_regs.end());
for (unsigned i = start.byte(); i < 4; i++)
- if (subdword_regs[start][i] == 0xFFFFFFFF)
+ if (it->second[i] == 0xFFFFFFFF)
return true;
}
return false;
}
- bool is_empty_or_blocked(PhysReg start)
+ bool is_empty_or_blocked(PhysReg start) const
{
/* Empty is 0, blocked is 0xFFFFFFFF, so to check both we compare the
* incremented value to 1 */
if (regs[start] == 0xF0000000) {
- return subdword_regs[start][start.byte()] + 1 <= 1;
+ auto it = subdword_regs.find(start);
+ assert(it != subdword_regs.end());
+ return it->second[start.byte()] + 1 <= 1;
}
return regs[start] + 1 <= 1;
}
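
The "+ 1 <= 1" trick in is_empty_or_blocked() works because empty slots hold 0 and blocked slots hold 0xFFFFFFFF, and unsigned wrap-around maps exactly these two values into {0, 1} after an increment. A self-contained check:

    #include <cassert>
    #include <cstdint>

    int main()
    {
       uint32_t empty = 0, blocked = 0xFFFFFFFFu, assigned = 42;
       assert(empty + 1 <= 1);       /* 0 -> 1 */
       assert(blocked + 1 <= 1);     /* 0xFFFFFFFF wraps to 0 */
       assert(!(assigned + 1 <= 1)); /* any real temp id fails the test */
    }
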
@@ -313,9 +349,9 @@ public:
void clear(Definition def) { clear(def.physReg(), def.regClass()); }
- unsigned get_id(PhysReg reg)
+ unsigned get_id(PhysReg reg) const
{
- return regs[reg] == 0xF0000000 ? subdword_regs[reg][reg.byte()] : regs[reg];
+ return regs[reg] == 0xF0000000 ? subdword_regs.at(reg)[reg.byte()] : regs[reg];
}
private:
@@ -343,24 +379,24 @@ private:
}
};
-std::set<std::pair<unsigned, unsigned>> find_vars(ra_ctx& ctx, RegisterFile& reg_file,
- const PhysRegInterval reg_interval);
+std::vector<unsigned> find_vars(ra_ctx& ctx, const RegisterFile& reg_file,
+ const PhysRegInterval reg_interval);
/* helper function for debugging */
UNUSED void
print_reg(const RegisterFile& reg_file, PhysReg reg, bool has_adjacent_variable)
{
if (reg_file[reg] == 0xFFFFFFFF) {
- printf("☐");
+ printf((const char*)u8"☐");
} else if (reg_file[reg]) {
const bool show_subdword_alloc = (reg_file[reg] == 0xF0000000);
if (show_subdword_alloc) {
- const char* block_chars[] = {
+ auto block_chars = {
// clang-format off
- "?", "▘", "▝", "▀",
- "▖", "▌", "▞", "▛",
- "▗", "▚", "▐", "▜",
- "▄", "▙", "▟", "▉"
+ u8"?", u8"▘", u8"▝", u8"▀",
+ u8"▖", u8"▌", u8"▞", u8"▛",
+ u8"▗", u8"▚", u8"▐", u8"▜",
+ u8"▄", u8"▙", u8"▟", u8"▉"
// clang-format on
};
unsigned index = 0;
@@ -369,27 +405,26 @@ print_reg(const RegisterFile& reg_file, PhysReg reg, bool has_adjacent_variable)
index |= 1 << i;
}
}
- printf("%s", block_chars[index]);
+ printf("%s", (const char*)(block_chars.begin()[index]));
} else {
/* Indicate filled register slot */
if (!has_adjacent_variable) {
- printf("█");
+ printf((const char*)u8"█");
} else {
/* Use a slightly shorter box to leave a small gap between adjacent variables */
- printf("▉");
+ printf((const char*)u8"▉");
}
}
} else {
- printf("·");
+ printf((const char*)u8"·");
}
}
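
The new (const char*) casts exist because since C++20 a u8"..." literal has type const char8_t[] and no longer converts to const char*, so it cannot be passed as printf's format string without a cast. A minimal demonstration (compile with -std=c++20):

    #include <cstdio>

    int main()
    {
       /* u8"..." is const char8_t[] in C++20; using it directly as the format
        * string would not compile, and the cast restores the old behavior. */
       printf((const char*)u8"█\n");
    }
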
/* helper function for debugging */
UNUSED void
-print_regs(ra_ctx& ctx, bool vgprs, RegisterFile& reg_file)
+print_regs(ra_ctx& ctx, PhysRegInterval regs, const RegisterFile& reg_file)
{
- PhysRegInterval regs = get_reg_bounds(ctx.program, vgprs ? RegType::vgpr : RegType::sgpr);
- char reg_char = vgprs ? 'v' : 's';
+ char reg_char = regs.lo().reg() >= 256 ? 'v' : 's';
const int max_regs_per_line = 64;
/* print markers */
@@ -428,11 +463,11 @@ print_regs(ra_ctx& ctx, bool vgprs, RegisterFile& reg_file)
printf("%u/%u used, %u/%u free\n", regs.size - free_regs, regs.size, free_regs, regs.size);
/* print assignments ordered by registers */
- std::map<PhysReg, std::pair<unsigned, unsigned>>
- regs_to_vars; /* maps to byte size and temp id */
- for (const auto& size_id : find_vars(ctx, reg_file, regs)) {
- auto reg = ctx.assignments[size_id.second].reg;
- ASSERTED auto inserted = regs_to_vars.emplace(reg, size_id);
+ std::map<PhysReg, std::pair<unsigned, unsigned>> regs_to_vars; /* maps to byte size and temp id */
+ for (unsigned id : find_vars(ctx, reg_file, regs)) {
+ const assignment& var = ctx.assignments[id];
+ PhysReg reg = var.reg;
+ ASSERTED auto inserted = regs_to_vars.emplace(reg, std::make_pair(var.rc.bytes(), id));
assert(inserted.second);
}
@@ -445,11 +480,11 @@ print_regs(ra_ctx& ctx, bool vgprs, RegisterFile& reg_file)
ctx.orig_names[size_id.second].id() != size_id.second) {
printf("(was %%%d) ", ctx.orig_names[size_id.second].id());
}
- printf("= %c[%d", reg_char, first_reg.reg() - regs.lo());
+ printf("= %c[%d", reg_char, first_reg.reg() % 256);
PhysReg last_reg = first_reg.advance(size_id.first - 1);
if (first_reg.reg() != last_reg.reg()) {
assert(first_reg.byte() == 0 && last_reg.byte() == 3);
- printf("-%d", last_reg.reg() - regs.lo());
+ printf("-%d", last_reg.reg() % 256);
}
printf("]");
if (first_reg.byte() != 0 || last_reg.byte() != 3) {
@@ -460,14 +495,14 @@ print_regs(ra_ctx& ctx, bool vgprs, RegisterFile& reg_file)
}
unsigned
-get_subdword_operand_stride(chip_class chip, const aco_ptr<Instruction>& instr, unsigned idx,
- RegClass rc)
+get_subdword_operand_stride(amd_gfx_level gfx_level, const aco_ptr<Instruction>& instr,
+ unsigned idx, RegClass rc)
{
if (instr->isPseudo()) {
/* v_readfirstlane_b32 cannot use SDWA */
if (instr->opcode == aco_opcode::p_as_uniform)
return 4;
- else if (chip >= GFX8)
+ else if (gfx_level >= GFX8)
return rc.bytes() % 2 == 0 ? 2 : 1;
else
return 4;
@@ -475,26 +510,27 @@ get_subdword_operand_stride(chip_class chip, const aco_ptr<Instruction>& instr,
assert(rc.bytes() <= 2);
if (instr->isVALU()) {
- if (can_use_SDWA(chip, instr, false))
+ if (can_use_SDWA(gfx_level, instr, false))
return rc.bytes();
- if (can_use_opsel(chip, instr->opcode, idx, true))
+ if (can_use_opsel(gfx_level, instr->opcode, idx))
return 2;
- if (instr->format == Format::VOP3P)
+ if (instr->isVOP3P())
return 2;
}
switch (instr->opcode) {
case aco_opcode::v_cvt_f32_ubyte0: return 1;
case aco_opcode::ds_write_b8:
- case aco_opcode::ds_write_b16: return chip >= GFX9 ? 2 : 4;
+ case aco_opcode::ds_write_b16: return gfx_level >= GFX9 ? 2 : 4;
case aco_opcode::buffer_store_byte:
case aco_opcode::buffer_store_short:
+ case aco_opcode::buffer_store_format_d16_x:
case aco_opcode::flat_store_byte:
case aco_opcode::flat_store_short:
case aco_opcode::scratch_store_byte:
case aco_opcode::scratch_store_short:
case aco_opcode::global_store_byte:
- case aco_opcode::global_store_short: return chip >= GFX9 ? 2 : 4;
+ case aco_opcode::global_store_short: return gfx_level >= GFX9 ? 2 : 4;
default: return 4;
}
}
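
The stride returned here is in bytes and constrains where a sub-dword operand may start within its dword: a byte offset is legal only if it is a multiple of the stride, mirroring the reg.byte() % stride check in operand_can_use_reg() further down. A small sketch:

    #include <cassert>

    bool byte_offset_ok(unsigned byte, unsigned stride)
    {
       return byte % stride == 0;
    }

    int main()
    {
       assert(byte_offset_ok(2, 2));  /* high half via opsel: stride 2 suffices */
       assert(!byte_offset_ok(1, 2)); /* odd byte offsets need stride 1 (SDWA) */
       assert(byte_offset_ok(0, 4));  /* stride 4 allows only dword-aligned operands */
    }
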
@@ -503,24 +539,12 @@ void
add_subdword_operand(ra_ctx& ctx, aco_ptr<Instruction>& instr, unsigned idx, unsigned byte,
RegClass rc)
{
- chip_class chip = ctx.program->chip_class;
+ amd_gfx_level gfx_level = ctx.program->gfx_level;
if (instr->isPseudo() || byte == 0)
return;
assert(rc.bytes() <= 2);
if (instr->isVALU()) {
- /* check if we can use opsel */
- if (instr->format == Format::VOP3) {
- assert(byte == 2);
- instr->vop3().opsel |= 1 << idx;
- return;
- }
- if (instr->isVOP3P()) {
- assert(byte == 2 && !(instr->vop3p().opsel_lo & (1 << idx)));
- instr->vop3p().opsel_lo |= 1 << idx;
- instr->vop3p().opsel_hi |= 1 << idx;
- return;
- }
if (instr->opcode == aco_opcode::v_cvt_f32_ubyte0) {
switch (byte) {
case 0: instr->opcode = aco_opcode::v_cvt_f32_ubyte0; break;
@@ -532,8 +556,21 @@ add_subdword_operand(ra_ctx& ctx, aco_ptr<Instruction>& instr, unsigned idx, uns
}
/* use SDWA */
- assert(can_use_SDWA(chip, instr, false));
- convert_to_SDWA(chip, instr);
+ if (can_use_SDWA(gfx_level, instr, false)) {
+ convert_to_SDWA(gfx_level, instr);
+ return;
+ }
+
+ /* use opsel */
+ if (instr->isVOP3P()) {
+ assert(byte == 2 && !instr->valu().opsel_lo[idx]);
+ instr->valu().opsel_lo[idx] = true;
+ instr->valu().opsel_hi[idx] = true;
+ return;
+ }
+
+ assert(can_use_opsel(gfx_level, instr->opcode, idx));
+ instr->valu().opsel[idx] = true;
return;
}
@@ -546,6 +583,8 @@ add_subdword_operand(ra_ctx& ctx, aco_ptr<Instruction>& instr, unsigned idx, uns
instr->opcode = aco_opcode::buffer_store_byte_d16_hi;
else if (instr->opcode == aco_opcode::buffer_store_short)
instr->opcode = aco_opcode::buffer_store_short_d16_hi;
+ else if (instr->opcode == aco_opcode::buffer_store_format_d16_x)
+ instr->opcode = aco_opcode::buffer_store_format_d16_hi_x;
else if (instr->opcode == aco_opcode::flat_store_byte)
instr->opcode = aco_opcode::flat_store_byte_d16_hi;
else if (instr->opcode == aco_opcode::flat_store_short)
@@ -567,34 +606,38 @@ add_subdword_operand(ra_ctx& ctx, aco_ptr<Instruction>& instr, unsigned idx, uns
std::pair<unsigned, unsigned>
get_subdword_definition_info(Program* program, const aco_ptr<Instruction>& instr, RegClass rc)
{
- chip_class chip = program->chip_class;
+ amd_gfx_level gfx_level = program->gfx_level;
if (instr->isPseudo()) {
- if (chip >= GFX8)
+ if (instr->opcode == aco_opcode::p_interp_gfx11)
+ return std::make_pair(4u, 4u);
+ else if (gfx_level >= GFX8)
return std::make_pair(rc.bytes() % 2 == 0 ? 2 : 1, rc.bytes());
else
return std::make_pair(4, rc.size() * 4u);
}
- if (instr->isVALU() || instr->isVINTRP()) {
+ if (instr->isVALU()) {
assert(rc.bytes() <= 2);
- if (can_use_SDWA(chip, instr, false))
+ if (can_use_SDWA(gfx_level, instr, false))
return std::make_pair(rc.bytes(), rc.bytes());
unsigned bytes_written = 4u;
- if (instr_is_16bit(chip, instr->opcode))
+ if (instr_is_16bit(gfx_level, instr->opcode))
bytes_written = 2u;
unsigned stride = 4u;
if (instr->opcode == aco_opcode::v_fma_mixlo_f16 ||
- can_use_opsel(chip, instr->opcode, -1, true))
+ can_use_opsel(gfx_level, instr->opcode, -1))
stride = 2u;
return std::make_pair(stride, bytes_written);
}
switch (instr->opcode) {
+ case aco_opcode::v_interp_p2_f16: return std::make_pair(2u, 2u);
+ /* D16 loads with _hi version */
case aco_opcode::ds_read_u8_d16:
case aco_opcode::ds_read_i8_d16:
case aco_opcode::ds_read_u16_d16:
@@ -609,58 +652,80 @@ get_subdword_definition_info(Program* program, const aco_ptr<Instruction>& instr
case aco_opcode::scratch_load_short_d16:
case aco_opcode::buffer_load_ubyte_d16:
case aco_opcode::buffer_load_sbyte_d16:
- case aco_opcode::buffer_load_short_d16: {
- assert(chip >= GFX9);
+ case aco_opcode::buffer_load_short_d16:
+ case aco_opcode::buffer_load_format_d16_x: {
+ assert(gfx_level >= GFX9);
if (!program->dev.sram_ecc_enabled)
return std::make_pair(2u, 2u);
else
return std::make_pair(2u, 4u);
}
+ /* 3-component D16 loads */
+ case aco_opcode::buffer_load_format_d16_xyz:
+ case aco_opcode::tbuffer_load_format_d16_xyz: {
+ assert(gfx_level >= GFX9);
+ if (!program->dev.sram_ecc_enabled)
+ return std::make_pair(4u, 6u);
+ break;
+ }
- default: return std::make_pair(4, rc.size() * 4u);
+ default: break;
}
+
+ if (instr->isMIMG() && instr->mimg().d16 && !program->dev.sram_ecc_enabled) {
+ assert(gfx_level >= GFX9);
+ return std::make_pair(4u, rc.bytes());
+ }
+
+ return std::make_pair(4, rc.size() * 4u);
}
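
get_subdword_definition_info() returns (stride, bytes_written): the placement granularity of the definition and how many bytes the instruction actually clobbers, which can exceed the definition size, e.g. for D16 loads on SRAM-ECC parts. An illustrative reading of such a pair, with invented values:

    #include <cassert>

    int main()
    {
       unsigned stride = 2, bytes_written = 4; /* like a D16 load with sram_ecc_enabled */
       unsigned def_bytes = 2;

       assert(2 % stride == 0);           /* byte 2 is a stride-legal start... */
       assert(bytes_written > def_bytes); /* ...but the full dword is clobbered, so the
                                           * allocator must keep the other half free */
    }
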
void
-add_subdword_definition(Program* program, aco_ptr<Instruction>& instr, PhysReg reg)
+add_subdword_definition(Program* program, aco_ptr<Instruction>& instr, PhysReg reg,
+ bool allow_16bit_write)
{
if (instr->isPseudo())
return;
if (instr->isVALU()) {
- chip_class chip = program->chip_class;
+ amd_gfx_level gfx_level = program->gfx_level;
assert(instr->definitions[0].bytes() <= 2);
- if (reg.byte() == 0 && instr_is_16bit(chip, instr->opcode))
+ if (reg.byte() == 0 && allow_16bit_write && instr_is_16bit(gfx_level, instr->opcode))
return;
- /* check if we can use opsel */
- if (instr->format == Format::VOP3) {
- assert(reg.byte() == 2);
- assert(can_use_opsel(chip, instr->opcode, -1, true));
- instr->vop3().opsel |= (1 << 3); /* dst in high half */
+ /* use SDWA */
+ if (can_use_SDWA(gfx_level, instr, false)) {
+ convert_to_SDWA(gfx_level, instr);
return;
}
+ assert(allow_16bit_write);
+
if (instr->opcode == aco_opcode::v_fma_mixlo_f16) {
instr->opcode = aco_opcode::v_fma_mixhi_f16;
return;
}
- /* use SDWA */
- assert(can_use_SDWA(chip, instr, false));
- convert_to_SDWA(chip, instr);
+ /* use opsel */
+ assert(reg.byte() == 2);
+ assert(can_use_opsel(gfx_level, instr->opcode, -1));
+ instr->valu().opsel[3] = true; /* dst in high half */
return;
}
if (reg.byte() == 0)
return;
+ else if (instr->opcode == aco_opcode::v_interp_p2_f16)
+ instr->opcode = aco_opcode::v_interp_p2_hi_f16;
else if (instr->opcode == aco_opcode::buffer_load_ubyte_d16)
instr->opcode = aco_opcode::buffer_load_ubyte_d16_hi;
else if (instr->opcode == aco_opcode::buffer_load_sbyte_d16)
instr->opcode = aco_opcode::buffer_load_sbyte_d16_hi;
else if (instr->opcode == aco_opcode::buffer_load_short_d16)
instr->opcode = aco_opcode::buffer_load_short_d16_hi;
+ else if (instr->opcode == aco_opcode::buffer_load_format_d16_x)
+ instr->opcode = aco_opcode::buffer_load_format_d16_hi_x;
else if (instr->opcode == aco_opcode::flat_load_ubyte_d16)
instr->opcode = aco_opcode::flat_load_ubyte_d16_hi;
else if (instr->opcode == aco_opcode::flat_load_sbyte_d16)
@@ -697,6 +762,7 @@ adjust_max_used_regs(ra_ctx& ctx, RegClass rc, unsigned reg)
if (rc.type() == RegType::vgpr) {
assert(reg >= 256);
uint16_t hi = reg - 256 + size - 1;
+ assert(hi <= 255);
ctx.max_used_vgpr = std::max(ctx.max_used_vgpr, hi);
} else if (reg + rc.size() <= max_addressible_sgpr) {
uint16_t hi = reg + size - 1;
@@ -707,6 +773,7 @@ adjust_max_used_regs(ra_ctx& ctx, RegClass rc, unsigned reg)
enum UpdateRenames {
rename_not_killed_ops = 0x1,
fill_killed_ops = 0x2,
+ rename_precolored_ops = 0x4,
};
MESA_DEFINE_CPP_ENUM_BITFIELD_OPERATORS(UpdateRenames);
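
MESA_DEFINE_CPP_ENUM_BITFIELD_OPERATORS (from util/enum_operators.h, newly included at the top) generates the bitwise operators that let UpdateRenames be combined and tested as type-safe flags. A rough illustrative reimplementation, not the actual macro expansion:

    #include <cassert>

    enum UpdateRenames {
       rename_not_killed_ops = 0x1,
       fill_killed_ops = 0x2,
       rename_precolored_ops = 0x4,
    };

    inline UpdateRenames operator|(UpdateRenames a, UpdateRenames b)
    {
       return (UpdateRenames)((unsigned)a | (unsigned)b);
    }

    inline unsigned operator&(UpdateRenames a, UpdateRenames b)
    {
       return (unsigned)a & (unsigned)b;
    }

    int main()
    {
       UpdateRenames flags = rename_not_killed_ops | rename_precolored_ops;
       assert(flags & rename_precolored_ops);
       assert(!(flags & fill_killed_ops));
    }
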
@@ -779,28 +846,39 @@ update_renames(ra_ctx& ctx, RegisterFile& reg_file,
assert(ctx.assignments.size() == ctx.program->peekAllocationId());
/* check if we moved an operand */
- bool first = true;
+ bool first[2] = {true, true};
bool fill = true;
for (unsigned i = 0; i < instr->operands.size(); i++) {
Operand& op = instr->operands[i];
if (!op.isTemp())
continue;
if (op.tempId() == copy.first.tempId()) {
- bool omit_renaming = !(flags & rename_not_killed_ops) && !op.isKillBeforeDef();
- for (std::pair<Operand, Definition>& pc : parallelcopies) {
- PhysReg def_reg = pc.second.physReg();
- omit_renaming &= def_reg > copy.first.physReg()
- ? (copy.first.physReg() + copy.first.size() <= def_reg.reg())
- : (def_reg + pc.second.size() <= copy.first.physReg().reg());
+ /* only rename precolored operands if the copy-location matches */
+ bool omit_renaming = (flags & rename_precolored_ops) && op.isFixed() &&
+ op.physReg() != copy.second.physReg();
+
+ /* Omit renaming in some cases for p_create_vector in order to avoid
+ * unnecessary shuffle code. */
+ if (!(flags & rename_not_killed_ops) && !op.isKillBeforeDef()) {
+ omit_renaming = true;
+ for (std::pair<Operand, Definition>& pc : parallelcopies) {
+ PhysReg def_reg = pc.second.physReg();
+ omit_renaming &= def_reg > copy.first.physReg()
+ ? (copy.first.physReg() + copy.first.size() <= def_reg.reg())
+ : (def_reg + pc.second.size() <= copy.first.physReg().reg());
+ }
}
- if (omit_renaming) {
- if (first)
- op.setFirstKill(true);
- else
- op.setKill(true);
- first = false;
+
+ /* Fix the kill flags */
+ if (first[omit_renaming])
+ op.setFirstKill(omit_renaming || op.isKill());
+ else
+ op.setKill(omit_renaming || op.isKill());
+ first[omit_renaming] = false;
+
+ if (omit_renaming)
continue;
- }
+
op.setTemp(copy.second.getTemp());
op.setFixed(copy.second.physReg());
@@ -815,8 +893,8 @@ update_renames(ra_ctx& ctx, RegisterFile& reg_file,
}
}
-std::pair<PhysReg, bool>
-get_reg_simple(ra_ctx& ctx, RegisterFile& reg_file, DefInfo info)
+std::optional<PhysReg>
+get_reg_simple(ra_ctx& ctx, const RegisterFile& reg_file, DefInfo info)
{
const PhysRegInterval& bounds = info.bounds;
uint32_t size = info.size;
@@ -829,8 +907,8 @@ get_reg_simple(ra_ctx& ctx, RegisterFile& reg_file, DefInfo info)
if (size % new_stride)
continue;
new_info.stride = new_stride;
- std::pair<PhysReg, bool> res = get_reg_simple(ctx, reg_file, new_info);
- if (res.second)
+ std::optional<PhysReg> res = get_reg_simple(ctx, reg_file, new_info);
+ if (res)
return res;
}
@@ -864,7 +942,7 @@ get_reg_simple(ra_ctx& ctx, RegisterFile& reg_file, DefInfo info)
/* early return on exact matches */
if (size == gap.size) {
adjust_max_used_regs(ctx, rc, gap.lo());
- return {gap.lo(), true};
+ return gap.lo();
}
/* check if it fits and the gap size is smaller */
@@ -877,7 +955,7 @@ get_reg_simple(ra_ctx& ctx, RegisterFile& reg_file, DefInfo info)
}
if (best_gap.size == UINT_MAX)
- return {{}, false};
+ return {};
/* find best position within gap by leaving a good stride for other variables */
unsigned buffer = best_gap.size - size;
@@ -889,7 +967,7 @@ get_reg_simple(ra_ctx& ctx, RegisterFile& reg_file, DefInfo info)
}
adjust_max_used_regs(ctx, rc, best_gap.lo());
- return {best_gap.lo(), true};
+ return best_gap.lo();
}
for (PhysRegInterval reg_win = {bounds.lo(), size}; reg_win.hi() <= bounds.hi();
@@ -901,7 +979,7 @@ get_reg_simple(ra_ctx& ctx, RegisterFile& reg_file, DefInfo info)
bool is_valid = std::all_of(std::next(reg_win.begin()), reg_win.end(), is_free);
if (is_valid) {
adjust_max_used_regs(ctx, rc, reg_win.lo());
- return {reg_win.lo(), true};
+ return reg_win.lo();
}
}
@@ -909,7 +987,8 @@ get_reg_simple(ra_ctx& ctx, RegisterFile& reg_file, DefInfo info)
* larger instruction encodings or copies
* TODO: don't do this in situations where it doesn't benefit */
if (rc.is_subdword()) {
- for (std::pair<const uint32_t, std::array<uint32_t, 4>>& entry : reg_file.subdword_regs) {
+ for (const std::pair<const uint32_t, std::array<uint32_t, 4>>& entry :
+ reg_file.subdword_regs) {
assert(reg_file[PhysReg{entry.first}] == 0xF0000000);
if (!bounds.contains({PhysReg{entry.first}, rc.size()}))
continue;
@@ -928,119 +1007,172 @@ get_reg_simple(ra_ctx& ctx, RegisterFile& reg_file, DefInfo info)
PhysReg res{entry.first};
res.reg_b += i;
adjust_max_used_regs(ctx, rc, entry.first);
- return {res, true};
+ return res;
}
}
}
}
- return {{}, false};
+ return {};
}
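
This hunk completes the std::pair<PhysReg, bool> to std::optional<PhysReg> migration (hence the new <optional> include): failure becomes an empty optional instead of a {dummy, false} pair, and callers test and dereference the optional directly. The pattern in isolation, with unsigned standing in for PhysReg:

    #include <cassert>
    #include <optional>

    std::optional<unsigned> find_slot(bool ok)
    {
       if (!ok)
          return {}; /* was: return {{}, false}; */
       return 42u;   /* was: return {reg, true}; */
    }

    int main()
    {
       std::optional<unsigned> res = find_slot(true);
       assert(res && *res == 42u); /* was: res.second and res.first */
       assert(!find_slot(false));
    }
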
-/* collect variables from a register area and clear reg_file */
-std::set<std::pair<unsigned, unsigned>>
-find_vars(ra_ctx& ctx, RegisterFile& reg_file, const PhysRegInterval reg_interval)
+/* collect variables from a register area */
+std::vector<unsigned>
+find_vars(ra_ctx& ctx, const RegisterFile& reg_file, const PhysRegInterval reg_interval)
{
- std::set<std::pair<unsigned, unsigned>> vars;
+ std::vector<unsigned> vars;
for (PhysReg j : reg_interval) {
if (reg_file.is_blocked(j))
continue;
if (reg_file[j] == 0xF0000000) {
for (unsigned k = 0; k < 4; k++) {
- unsigned id = reg_file.subdword_regs[j][k];
- if (id) {
- assignment& var = ctx.assignments[id];
- vars.emplace(var.rc.bytes(), id);
- }
+ unsigned id = reg_file.subdword_regs.at(j)[k];
+ if (id && (vars.empty() || id != vars.back()))
+ vars.emplace_back(id);
}
- } else if (reg_file[j] != 0) {
+ } else {
unsigned id = reg_file[j];
- assignment& var = ctx.assignments[id];
- vars.emplace(var.rc.bytes(), id);
+ if (id && (vars.empty() || id != vars.back()))
+ vars.emplace_back(id);
}
}
return vars;
}
-/* collect variables from a register area and clear reg_file */
-std::set<std::pair<unsigned, unsigned>>
+/* collect variables from a register area and clear reg_file
+ * variables are sorted in decreasing size and
+ * increasing assigned register
+ */
+std::vector<unsigned>
collect_vars(ra_ctx& ctx, RegisterFile& reg_file, const PhysRegInterval reg_interval)
{
- std::set<std::pair<unsigned, unsigned>> vars = find_vars(ctx, reg_file, reg_interval);
- for (std::pair<unsigned, unsigned> size_id : vars) {
- assignment& var = ctx.assignments[size_id.second];
+ std::vector<unsigned> ids = find_vars(ctx, reg_file, reg_interval);
+ std::sort(ids.begin(), ids.end(),
+ [&](unsigned a, unsigned b)
+ {
+ assignment& var_a = ctx.assignments[a];
+ assignment& var_b = ctx.assignments[b];
+ return var_a.rc.bytes() > var_b.rc.bytes() ||
+ (var_a.rc.bytes() == var_b.rc.bytes() && var_a.reg < var_b.reg);
+ });
+
+ for (unsigned id : ids) {
+ assignment& var = ctx.assignments[id];
reg_file.clear(var.reg, var.rc);
}
- return vars;
+ return ids;
+}
+
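
The comparator in collect_vars() orders by decreasing size first, then by increasing assigned register, so large variables are re-placed before the small ones that can fill the gaps around them. The same ordering reduced to plain data:

    #include <algorithm>
    #include <cassert>
    #include <vector>

    struct var_demo {
       unsigned bytes, reg;
    };

    int main()
    {
       std::vector<var_demo> vars = {{4, 10}, {8, 2}, {4, 3}};
       std::sort(vars.begin(), vars.end(),
                 [](const var_demo& a, const var_demo& b)
                 { return a.bytes > b.bytes || (a.bytes == b.bytes && a.reg < b.reg); });
       assert(vars[0].bytes == 8);                    /* largest first */
       assert(vars[1].reg == 3 && vars[2].reg == 10); /* equal sizes: lower register first */
    }
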
+std::optional<PhysReg>
+get_reg_for_create_vector_copy(ra_ctx& ctx, RegisterFile& reg_file,
+ std::vector<std::pair<Operand, Definition>>& parallelcopies,
+ aco_ptr<Instruction>& instr, const PhysRegInterval def_reg,
+ DefInfo info, unsigned id)
+{
+ PhysReg reg = def_reg.lo();
+ /* dead operand: return position in vector */
+ for (unsigned i = 0; i < instr->operands.size(); i++) {
+ if (instr->operands[i].isTemp() && instr->operands[i].tempId() == id &&
+ instr->operands[i].isKillBeforeDef()) {
+ assert(!reg_file.test(reg, instr->operands[i].bytes()));
+ if (info.rc.is_subdword() || reg.byte() == 0)
+ return reg;
+ else
+ return {};
+ }
+ reg.reg_b += instr->operands[i].bytes();
+ }
+
+ /* GFX9+ has a VGPR swap instruction. */
+ if (ctx.program->gfx_level <= GFX8 || info.rc.type() == RegType::sgpr)
+ return {};
+
+ /* check if the previous position was in vector */
+ assignment& var = ctx.assignments[id];
+ if (def_reg.contains(PhysRegInterval{var.reg, info.size})) {
+ reg = def_reg.lo();
+ /* try to use the previous register of the operand */
+ for (unsigned i = 0; i < instr->operands.size(); i++) {
+ if (reg != var.reg) {
+ reg.reg_b += instr->operands[i].bytes();
+ continue;
+ }
+
+ /* check if we can swap positions */
+ if (instr->operands[i].isTemp() && instr->operands[i].isFirstKill() &&
+ instr->operands[i].regClass() == info.rc) {
+ assignment& op = ctx.assignments[instr->operands[i].tempId()];
+ /* if everything matches, create parallelcopy for the killed operand */
+ if (!intersects(def_reg, PhysRegInterval{op.reg, op.rc.size()}) && op.reg != scc &&
+ reg_file.get_id(op.reg) == instr->operands[i].tempId()) {
+ Definition pc_def = Definition(reg, info.rc);
+ parallelcopies.emplace_back(instr->operands[i], pc_def);
+ return op.reg;
+ }
+ }
+ return {};
+ }
+ }
+ return {};
}
bool
get_regs_for_copies(ra_ctx& ctx, RegisterFile& reg_file,
std::vector<std::pair<Operand, Definition>>& parallelcopies,
- const std::set<std::pair<unsigned, unsigned>>& vars,
- const PhysRegInterval bounds, aco_ptr<Instruction>& instr,
+ const std::vector<unsigned>& vars, aco_ptr<Instruction>& instr,
const PhysRegInterval def_reg)
{
- /* variables are sorted from small sized to large */
- /* NOTE: variables are also sorted by ID. this only affects a very small number of shaders
- * slightly though. */
- for (std::set<std::pair<unsigned, unsigned>>::const_reverse_iterator it = vars.rbegin();
- it != vars.rend(); ++it) {
- unsigned id = it->second;
+ /* Variables are sorted from large to small and with increasing assigned register */
+ for (unsigned id : vars) {
assignment& var = ctx.assignments[id];
+ PhysRegInterval bounds = get_reg_bounds(ctx, var.rc);
DefInfo info = DefInfo(ctx, ctx.pseudo_dummy, var.rc, -1);
uint32_t size = info.size;
/* check if this is a dead operand, then we can re-use the space from the definition
* also use the correct stride for sub-dword operands */
bool is_dead_operand = false;
- for (unsigned i = 0; !is_phi(instr) && i < instr->operands.size(); i++) {
- if (instr->operands[i].isTemp() && instr->operands[i].tempId() == id) {
- if (instr->operands[i].isKillBeforeDef())
- is_dead_operand = true;
- info = DefInfo(ctx, instr, var.rc, i);
- break;
- }
- }
-
- std::pair<PhysReg, bool> res;
- if (is_dead_operand) {
- if (instr->opcode == aco_opcode::p_create_vector) {
- PhysReg reg(def_reg.lo());
- for (unsigned i = 0; i < instr->operands.size(); i++) {
- if (instr->operands[i].isTemp() && instr->operands[i].tempId() == id) {
- res = {reg, (!var.rc.is_subdword() || (reg.byte() % info.stride == 0)) &&
- !reg_file.test(reg, var.rc.bytes())};
- break;
+ std::optional<PhysReg> res;
+ if (instr->opcode == aco_opcode::p_create_vector) {
+ res =
+ get_reg_for_create_vector_copy(ctx, reg_file, parallelcopies, instr, def_reg, info, id);
+ } else {
+ for (unsigned i = 0; !is_phi(instr) && i < instr->operands.size(); i++) {
+ if (instr->operands[i].isTemp() && instr->operands[i].tempId() == id) {
+ info = DefInfo(ctx, instr, var.rc, i);
+ if (instr->operands[i].isKillBeforeDef()) {
+ info.bounds = def_reg;
+ res = get_reg_simple(ctx, reg_file, info);
+ is_dead_operand = true;
}
- reg.reg_b += instr->operands[i].bytes();
+ break;
}
- if (!res.second)
- res = {var.reg, !reg_file.test(var.reg, var.rc.bytes())};
- } else {
- info.bounds = def_reg;
- res = get_reg_simple(ctx, reg_file, info);
}
- } else {
+ }
+ if (!res && !def_reg.size) {
+ /* If this is before definitions are handled, def_reg may be an empty interval. */
+ info.bounds = bounds;
+ res = get_reg_simple(ctx, reg_file, info);
+ } else if (!res) {
/* Try to find space within the bounds but outside of the definition */
info.bounds = PhysRegInterval::from_until(bounds.lo(), MIN2(def_reg.lo(), bounds.hi()));
res = get_reg_simple(ctx, reg_file, info);
- if (!res.second && def_reg.hi() <= bounds.hi()) {
+ if (!res && def_reg.hi() <= bounds.hi()) {
unsigned lo = (def_reg.hi() + info.stride - 1) & ~(info.stride - 1);
info.bounds = PhysRegInterval::from_until(PhysReg{lo}, bounds.hi());
res = get_reg_simple(ctx, reg_file, info);
}
}
- if (res.second) {
+ if (res) {
/* mark the area as blocked */
- reg_file.block(res.first, var.rc);
+ reg_file.block(*res, var.rc);
/* create parallelcopy pair (without definition id) */
Temp tmp = Temp(id, var.rc);
Operand pc_op = Operand(tmp);
pc_op.setFixed(var.reg);
- Definition pc_def = Definition(res.first, pc_op.regClass());
+ Definition pc_def = Definition(*res, pc_op.regClass());
parallelcopies.emplace_back(pc_op, pc_def);
continue;
}
@@ -1075,9 +1207,8 @@ get_regs_for_copies(ra_ctx& ctx, RegisterFile& reg_file,
n++;
continue;
}
- /* we cannot split live ranges of linear vgprs inside control flow */
- if (!(ctx.block->kind & block_kind_top_level) &&
- ctx.assignments[reg_file[j]].rc.is_linear_vgpr()) {
+ /* we cannot split live ranges of linear vgprs */
+ if (ctx.assignments[reg_file[j]].rc.is_linear_vgpr()) {
found = false;
break;
}
@@ -1116,13 +1247,13 @@ get_regs_for_copies(ra_ctx& ctx, RegisterFile& reg_file,
PhysRegInterval reg_win{best_pos, size};
/* collect variables and block reg file */
- std::set<std::pair<unsigned, unsigned>> new_vars = collect_vars(ctx, reg_file, reg_win);
+ std::vector<unsigned> new_vars = collect_vars(ctx, reg_file, reg_win);
/* mark the area as blocked */
reg_file.block(reg_win.lo(), var.rc);
adjust_max_used_regs(ctx, var.rc, reg_win.lo());
- if (!get_regs_for_copies(ctx, reg_file, parallelcopies, new_vars, bounds, instr, def_reg))
+ if (!get_regs_for_copies(ctx, reg_file, parallelcopies, new_vars, instr, def_reg))
return false;
/* create parallelcopy pair (without definition id) */
@@ -1136,8 +1267,8 @@ get_regs_for_copies(ra_ctx& ctx, RegisterFile& reg_file,
return true;
}
-std::pair<PhysReg, bool>
-get_reg_impl(ra_ctx& ctx, RegisterFile& reg_file,
+std::optional<PhysReg>
+get_reg_impl(ra_ctx& ctx, const RegisterFile& reg_file,
std::vector<std::pair<Operand, Definition>>& parallelcopies, const DefInfo& info,
aco_ptr<Instruction>& instr)
{
@@ -1166,7 +1297,8 @@ get_reg_impl(ra_ctx& ctx, RegisterFile& reg_file,
}
}
- assert(regs_free >= size);
+ assert((regs_free + ctx.num_linear_vgprs) >= size);
+
/* we might have to move dead operands to dst in order to make space */
unsigned op_moves = 0;
@@ -1223,10 +1355,8 @@ get_reg_impl(ra_ctx& ctx, RegisterFile& reg_file,
break;
}
- /* we cannot split live ranges of linear vgprs inside control flow */
- //TODO: ensure that live range splits inside control flow are never necessary
- if (!(ctx.block->kind & block_kind_top_level) &&
- ctx.assignments[reg_file[j]].rc.is_linear_vgpr()) {
+ /* we cannot split live ranges of linear vgprs */
+ if (ctx.assignments[reg_file[j]].rc.is_linear_vgpr()) {
found = false;
break;
}
@@ -1251,31 +1381,23 @@ get_reg_impl(ra_ctx& ctx, RegisterFile& reg_file,
}
if (num_moves == 0xFF)
- return {{}, false};
+ return {};
/* now, we figured the placement for our definition */
RegisterFile tmp_file(reg_file);
- std::set<std::pair<unsigned, unsigned>> vars = collect_vars(ctx, tmp_file, best_win);
+ /* p_create_vector: also re-place killed operands in the definition space */
if (instr->opcode == aco_opcode::p_create_vector) {
- /* move killed operands which aren't yet at the correct position (GFX9+)
- * or which are in the definition space */
- PhysReg reg = best_win.lo();
for (Operand& op : instr->operands) {
- if (op.isTemp() && op.isFirstKillBeforeDef() && op.getTemp().type() == rc.type()) {
- if (op.physReg() != reg && (ctx.program->chip_class >= GFX9 ||
- (op.physReg().advance(op.bytes()) > best_win.lo() &&
- op.physReg() < best_win.hi()))) {
- vars.emplace(op.bytes(), op.tempId());
- tmp_file.clear(op);
- } else {
- tmp_file.fill(op);
- }
- }
- reg.reg_b += op.bytes();
+ if (op.isTemp() && op.isFirstKillBeforeDef())
+ tmp_file.fill(op);
}
- } else if (!is_phi(instr)) {
- /* re-enable killed operands */
+ }
+
+ std::vector<unsigned> vars = collect_vars(ctx, tmp_file, best_win);
+
+ /* re-enable killed operands */
+ if (!is_phi(instr) && instr->opcode != aco_opcode::p_create_vector) {
for (Operand& op : instr->operands) {
if (op.isTemp() && op.isFirstKillBeforeDef())
tmp_file.fill(op);
@@ -1283,18 +1405,18 @@ get_reg_impl(ra_ctx& ctx, RegisterFile& reg_file,
}
std::vector<std::pair<Operand, Definition>> pc;
- if (!get_regs_for_copies(ctx, tmp_file, pc, vars, bounds, instr, best_win))
- return {{}, false};
+ if (!get_regs_for_copies(ctx, tmp_file, pc, vars, instr, best_win))
+ return {};
parallelcopies.insert(parallelcopies.end(), pc.begin(), pc.end());
adjust_max_used_regs(ctx, rc, best_win.lo());
- return {best_win.lo(), true};
+ return best_win.lo();
}
bool
-get_reg_specified(ra_ctx& ctx, RegisterFile& reg_file, RegClass rc, aco_ptr<Instruction>& instr,
- PhysReg reg)
+get_reg_specified(ra_ctx& ctx, const RegisterFile& reg_file, RegClass rc,
+ aco_ptr<Instruction>& instr, PhysReg reg)
{
/* catch out-of-range registers */
if (reg >= PhysReg{512})
@@ -1313,10 +1435,10 @@ get_reg_specified(ra_ctx& ctx, RegisterFile& reg_file, RegClass rc, aco_ptr<Inst
return false;
PhysRegInterval reg_win = {reg, rc.size()};
- PhysRegInterval bounds = get_reg_bounds(ctx.program, rc.type());
+ PhysRegInterval bounds = get_reg_bounds(ctx, rc);
PhysRegInterval vcc_win = {vcc, 2};
/* VCC is outside the bounds */
- bool is_vcc = rc.type() == RegType::sgpr && vcc_win.contains(reg_win);
+ bool is_vcc = rc.type() == RegType::sgpr && vcc_win.contains(reg_win) && ctx.program->needs_vcc;
bool is_m0 = rc == s1 && reg == m0;
if (!bounds.contains(reg_win) && !is_vcc && !is_m0)
return false;
@@ -1336,17 +1458,24 @@ get_reg_specified(ra_ctx& ctx, RegisterFile& reg_file, RegClass rc, aco_ptr<Inst
}
bool
-increase_register_file(ra_ctx& ctx, RegType type)
+increase_register_file(ra_ctx& ctx, RegClass rc)
{
- if (type == RegType::vgpr && ctx.program->max_reg_demand.vgpr < ctx.vgpr_limit) {
- update_vgpr_sgpr_demand(ctx.program, RegisterDemand(ctx.program->max_reg_demand.vgpr + 1,
- ctx.program->max_reg_demand.sgpr));
- } else if (type == RegType::sgpr && ctx.program->max_reg_demand.sgpr < ctx.sgpr_limit) {
- update_vgpr_sgpr_demand(ctx.program, RegisterDemand(ctx.program->max_reg_demand.vgpr,
- ctx.program->max_reg_demand.sgpr + 1));
+ if (rc.type() == RegType::vgpr && ctx.num_linear_vgprs == 0 &&
+ ctx.vgpr_bounds < ctx.vgpr_limit) {
+ /* If vgpr_bounds is less than max_reg_demand.vgpr, this should be a no-op. */
+ update_vgpr_sgpr_demand(
+ ctx.program, RegisterDemand(ctx.vgpr_bounds + 1, ctx.program->max_reg_demand.sgpr));
+
+ ctx.vgpr_bounds = ctx.program->max_reg_demand.vgpr;
+ } else if (rc.type() == RegType::sgpr && ctx.program->max_reg_demand.sgpr < ctx.sgpr_limit) {
+ update_vgpr_sgpr_demand(
+ ctx.program, RegisterDemand(ctx.program->max_reg_demand.vgpr, ctx.sgpr_bounds + 1));
+
+ ctx.sgpr_bounds = ctx.program->max_reg_demand.sgpr;
} else {
return false;
}
+
return true;
}
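
increase_register_file() grows the allocatable bound one register at a time up to the wave-count-derived limit, and refuses to grow the VGPR file while linear VGPRs occupy its top. A toy model of that policy, with invented numbers:

    #include <cassert>

    struct growth_demo {
       unsigned bounds, limit, num_linear;
       bool grow()
       {
          if (num_linear == 0 && bounds < limit) {
             bounds++;
             return true;
          }
          return false;
       }
    };

    int main()
    {
       growth_demo vgpr{24, 256, 0};
       assert(vgpr.grow() && vgpr.bounds == 25);
       vgpr.num_linear = 2;
       assert(!vgpr.grow()); /* blocked by the linear VGPR carve-out */
    }
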
@@ -1429,7 +1558,7 @@ compact_relocate_vars(ra_ctx& ctx, const std::vector<IDAndRegClass>& vars,
}
bool
-is_mimg_vaddr_intact(ra_ctx& ctx, RegisterFile& reg_file, Instruction* instr)
+is_mimg_vaddr_intact(ra_ctx& ctx, const RegisterFile& reg_file, Instruction* instr)
{
PhysReg first{512};
for (unsigned i = 0; i < instr->operands.size() - 3u; i++) {
@@ -1439,7 +1568,7 @@ is_mimg_vaddr_intact(ra_ctx& ctx, RegisterFile& reg_file, Instruction* instr)
PhysReg reg = ctx.assignments[op.tempId()].reg;
if (first.reg() == 512) {
- PhysRegInterval bounds = get_reg_bounds(ctx.program, RegType::vgpr);
+ PhysRegInterval bounds = get_reg_bounds(ctx, RegType::vgpr, false);
first = reg.advance(i * -4);
PhysRegInterval vec = PhysRegInterval{first, instr->operands.size() - 3u};
if (!bounds.contains(vec)) /* not enough space for other operands */
@@ -1460,8 +1589,8 @@ is_mimg_vaddr_intact(ra_ctx& ctx, RegisterFile& reg_file, Instruction* instr)
return true;
}
-std::pair<PhysReg, bool>
-get_reg_vector(ra_ctx& ctx, RegisterFile& reg_file, Temp temp, aco_ptr<Instruction>& instr)
+std::optional<PhysReg>
+get_reg_vector(ra_ctx& ctx, const RegisterFile& reg_file, Temp temp, aco_ptr<Instruction>& instr)
{
Instruction* vec = ctx.vectors[temp.id()];
unsigned first_operand = vec->format == Format::MIMG ? 3 : 0;
@@ -1487,11 +1616,11 @@ get_reg_vector(ra_ctx& ctx, RegisterFile& reg_file, Temp temp, aco_ptr<Instructi
PhysReg reg = ctx.assignments[op.tempId()].reg;
reg.reg_b += (our_offset - their_offset);
if (get_reg_specified(ctx, reg_file, temp.regClass(), instr, reg))
- return {reg, true};
+ return reg;
/* return if MIMG vaddr components don't remain vector-aligned */
if (vec->format == Format::MIMG)
- return {{}, false};
+ return {};
}
their_offset += op.bytes();
}
@@ -1501,20 +1630,137 @@ get_reg_vector(ra_ctx& ctx, RegisterFile& reg_file, Temp temp, aco_ptr<Instructi
*/
RegClass vec_rc = RegClass::get(temp.type(), their_offset);
DefInfo info(ctx, ctx.pseudo_dummy, vec_rc, -1);
- std::pair<PhysReg, bool> res = get_reg_simple(ctx, reg_file, info);
- PhysReg reg = res.first;
- if (res.second) {
- reg.reg_b += our_offset;
+ std::optional<PhysReg> reg = get_reg_simple(ctx, reg_file, info);
+ if (reg) {
+ reg->reg_b += our_offset;
/* make sure to only use byte offset if the instruction supports it */
- if (get_reg_specified(ctx, reg_file, temp.regClass(), instr, reg))
- return {reg, true};
+ if (get_reg_specified(ctx, reg_file, temp.regClass(), instr, *reg))
+ return reg;
}
}
- return {{}, false};
+ return {};
}
+bool
+compact_linear_vgprs(ra_ctx& ctx, const RegisterFile& reg_file,
+ std::vector<std::pair<Operand, Definition>>& parallelcopies)
+{
+ PhysRegInterval linear_vgpr_bounds = get_reg_bounds(ctx, RegType::vgpr, true);
+ int zeros = reg_file.count_zero(linear_vgpr_bounds);
+ if (zeros == 0)
+ return false;
+
+ std::vector<IDAndRegClass> vars;
+ for (unsigned id : find_vars(ctx, reg_file, linear_vgpr_bounds))
+ vars.emplace_back(id, ctx.assignments[id].rc);
+
+ ctx.num_linear_vgprs -= zeros;
+ compact_relocate_vars(ctx, vars, parallelcopies, get_reg_bounds(ctx, RegType::vgpr, true).lo());
+
+ return true;
+}
+
+/* Allocates a linear VGPR. We allocate them at the end of the register file and keep them separate
+ * from normal VGPRs. This is for two reasons:
+ * - Because we only ever move linear VGPRs into an empty space or a space previously occupied by a
+ * linear one, we never have to swap a normal VGPR and a linear one.
+ * - As linear VGPRs' live ranges only start and end on top-level blocks, we never have to move a
+ * linear VGPR in control flow.
+ */
PhysReg
-get_reg(ra_ctx& ctx, RegisterFile& reg_file, Temp temp,
+alloc_linear_vgpr(ra_ctx& ctx, const RegisterFile& reg_file, aco_ptr<Instruction>& instr,
+ std::vector<std::pair<Operand, Definition>>& parallelcopies)
+{
+ assert(instr->opcode == aco_opcode::p_start_linear_vgpr);
+ assert(instr->definitions.size() == 1 && instr->definitions[0].bytes() % 4 == 0);
+
+ RegClass rc = instr->definitions[0].regClass();
+
+ /* Try to choose an unused space in the linear VGPR bounds. */
+ for (unsigned i = rc.size(); i <= ctx.num_linear_vgprs; i++) {
+ PhysReg reg(256 + ctx.vgpr_bounds - i);
+ if (!reg_file.test(reg, rc.bytes())) {
+ adjust_max_used_regs(ctx, rc, reg);
+ return reg;
+ }
+ }
+
+ PhysRegInterval old_normal_bounds = get_reg_bounds(ctx, RegType::vgpr, false);
+
+ /* Compact linear VGPRs, grow the bounds if necessary, and choose a space at the beginning: */
+ compact_linear_vgprs(ctx, reg_file, parallelcopies);
+
+ PhysReg reg(256 + ctx.vgpr_bounds - (ctx.num_linear_vgprs + rc.size()));
+ /* Space that was for normal VGPRs, but is now for linear VGPRs. */
+ PhysRegInterval new_win = PhysRegInterval::from_until(reg, MAX2(old_normal_bounds.hi(), reg));
+
+ RegisterFile tmp_file(reg_file);
+ PhysRegInterval reg_win{reg, rc.size()};
+ std::vector<unsigned> blocking_vars = collect_vars(ctx, tmp_file, new_win);
+
+ /* Re-enable killed operands */
+ for (Operand& op : instr->operands) {
+ if (op.isTemp() && op.isFirstKillBeforeDef())
+ tmp_file.fill(op);
+ }
+
+ /* Find new assignments for blocking vars. */
+ std::vector<std::pair<Operand, Definition>> pc;
+ if (!ctx.policy.skip_optimistic_path &&
+ get_regs_for_copies(ctx, tmp_file, pc, blocking_vars, instr, reg_win)) {
+ parallelcopies.insert(parallelcopies.end(), pc.begin(), pc.end());
+ } else {
+ /* Fallback algorithm: reallocate all variables at once. */
+ std::vector<IDAndRegClass> vars;
+ for (unsigned id : find_vars(ctx, reg_file, old_normal_bounds))
+ vars.emplace_back(id, ctx.assignments[id].rc);
+ compact_relocate_vars(ctx, vars, parallelcopies, PhysReg(256));
+
+ std::vector<IDAndRegClass> killed_op_vars;
+ for (Operand& op : instr->operands) {
+ if (op.isTemp() && op.isFirstKillBeforeDef() && op.regClass().type() == RegType::vgpr)
+ killed_op_vars.emplace_back(op.tempId(), op.regClass());
+ }
+ compact_relocate_vars(ctx, killed_op_vars, parallelcopies, reg_win.lo());
+ }
+
+ /* If this is updated earlier, a killed operand can't be placed inside the definition. */
+ ctx.num_linear_vgprs += rc.size();
+
+ adjust_max_used_regs(ctx, rc, reg);
+ return reg;
+}
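
The placement in alloc_linear_vgpr() grows the linear region downward from the top of the VGPR file; num_linear_vgprs is only bumped afterwards so that killed operands can still be re-placed inside the definition window. The arithmetic with invented numbers:

    #include <cassert>

    int main()
    {
       unsigned vgpr_bounds = 64, num_linear_vgprs = 2, size = 3;
       unsigned reg = 256 + vgpr_bounds - (num_linear_vgprs + size);
       assert(reg == 315);       /* the new linear VGPR occupies v[59..61] */
       num_linear_vgprs += size; /* updated only after the copies are placed */
       assert(256 + vgpr_bounds - num_linear_vgprs == reg);
    }
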
+
+bool
+should_compact_linear_vgprs(ra_ctx& ctx, live& live_vars, const RegisterFile& reg_file)
+{
+ if (!(ctx.block->kind & block_kind_top_level) || ctx.block->linear_succs.empty())
+ return false;
+
+ /* Since we won't be able to copy linear VGPRs to make space when in control flow, we have to
+ * ensure in advance that there is enough space for normal VGPRs. */
+ unsigned max_vgpr_usage = 0;
+ unsigned next_toplevel = ctx.block->index + 1;
+ for (; !(ctx.program->blocks[next_toplevel].kind & block_kind_top_level); next_toplevel++) {
+ max_vgpr_usage =
+ MAX2(max_vgpr_usage, (unsigned)ctx.program->blocks[next_toplevel].register_demand.vgpr);
+ }
+
+ std::vector<aco_ptr<Instruction>>& instructions =
+ ctx.program->blocks[next_toplevel].instructions;
+ if (!instructions.empty() && is_phi(instructions[0])) {
+ max_vgpr_usage =
+ MAX2(max_vgpr_usage, (unsigned)live_vars.register_demand[next_toplevel][0].vgpr);
+ }
+
+ for (unsigned tmp : find_vars(ctx, reg_file, get_reg_bounds(ctx, RegType::vgpr, true)))
+ max_vgpr_usage -= ctx.assignments[tmp].rc.size();
+
+ return max_vgpr_usage > get_reg_bounds(ctx, RegType::vgpr, false).size;
+}
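
The demand figure in should_compact_linear_vgprs() includes linear VGPRs, so the sizes of the ones still live are subtracted before comparing against the normal-VGPR window. A worked instance with invented numbers:

    #include <cassert>

    int main()
    {
       unsigned max_vgpr_usage = 64; /* peak demand up to the next top-level block */
       unsigned live_linear = 4;     /* linear VGPRs still live */
       unsigned vgpr_bounds = 66, num_linear_vgprs = 6; /* 2 of the 6 slots are dead */

       max_vgpr_usage -= live_linear;                           /* 60 normal VGPRs needed */
       unsigned normal_window = vgpr_bounds - num_linear_vgprs; /* 60 available */
       assert(!(max_vgpr_usage > normal_window)); /* exactly fits: no compaction yet */
       assert(max_vgpr_usage + 1 > normal_window); /* one more and compacting frees
                                                    * the two dead linear slots */
    }
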
+
+PhysReg
+get_reg(ra_ctx& ctx, const RegisterFile& reg_file, Temp temp,
std::vector<std::pair<Operand, Definition>>& parallelcopies, aco_ptr<Instruction>& instr,
int operand_index = -1)
{
@@ -1522,30 +1768,41 @@ get_reg(ra_ctx& ctx, RegisterFile& reg_file, Temp temp,
if (split_vec != ctx.split_vectors.end()) {
unsigned offset = 0;
for (Definition def : split_vec->second->definitions) {
- auto affinity_it = ctx.affinities.find(def.tempId());
- if (affinity_it != ctx.affinities.end() && ctx.assignments[affinity_it->second].assigned) {
- PhysReg reg = ctx.assignments[affinity_it->second].reg;
- reg.reg_b -= offset;
- if (get_reg_specified(ctx, reg_file, temp.regClass(), instr, reg))
- return reg;
+ if (ctx.assignments[def.tempId()].affinity) {
+ assignment& affinity = ctx.assignments[ctx.assignments[def.tempId()].affinity];
+ if (affinity.assigned) {
+ PhysReg reg = affinity.reg;
+ reg.reg_b -= offset;
+ if (get_reg_specified(ctx, reg_file, temp.regClass(), instr, reg))
+ return reg;
+ }
}
offset += def.bytes();
}
}
- if (ctx.affinities.find(temp.id()) != ctx.affinities.end() &&
- ctx.assignments[ctx.affinities[temp.id()]].assigned) {
- PhysReg reg = ctx.assignments[ctx.affinities[temp.id()]].reg;
- if (get_reg_specified(ctx, reg_file, temp.regClass(), instr, reg))
- return reg;
+ if (ctx.assignments[temp.id()].affinity) {
+ assignment& affinity = ctx.assignments[ctx.assignments[temp.id()].affinity];
+ if (affinity.assigned) {
+ if (get_reg_specified(ctx, reg_file, temp.regClass(), instr, affinity.reg))
+ return affinity.reg;
+ }
+ }
+ if (ctx.assignments[temp.id()].vcc) {
+ if (get_reg_specified(ctx, reg_file, temp.regClass(), instr, vcc))
+ return vcc;
+ }
+ if (ctx.assignments[temp.id()].m0) {
+ if (get_reg_specified(ctx, reg_file, temp.regClass(), instr, m0) && can_write_m0(instr))
+ return m0;
}
- std::pair<PhysReg, bool> res;
+ std::optional<PhysReg> res;
if (ctx.vectors.find(temp.id()) != ctx.vectors.end()) {
res = get_reg_vector(ctx, reg_file, temp, instr);
- if (res.second)
- return res.first;
+ if (res)
+ return *res;
}
DefInfo info(ctx, instr, temp.regClass(), operand_index);
@@ -1554,24 +1811,39 @@ get_reg(ra_ctx& ctx, RegisterFile& reg_file, Temp temp,
/* try to find space without live-range splits */
res = get_reg_simple(ctx, reg_file, info);
- if (res.second)
- return res.first;
+ if (res)
+ return *res;
}
/* try to find space with live-range splits */
res = get_reg_impl(ctx, reg_file, parallelcopies, info, instr);
- if (res.second)
- return res.first;
+ if (res)
+ return *res;
- /* try using more registers */
+ /* try compacting the linear vgprs to make more space */
+ std::vector<std::pair<Operand, Definition>> pc;
+ if (info.rc.type() == RegType::vgpr && (ctx.block->kind & block_kind_top_level) &&
+ compact_linear_vgprs(ctx, reg_file, pc)) {
+ parallelcopies.insert(parallelcopies.end(), pc.begin(), pc.end());
+
+ /* We don't need to fill the copy definitions in because we don't care about the linear VGPR
+ * space here. */
+ RegisterFile tmp_file(reg_file);
+ for (std::pair<Operand, Definition>& copy : pc)
+ tmp_file.clear(copy.first);
+
+ return get_reg(ctx, tmp_file, temp, parallelcopies, instr, operand_index);
+ }
/* We should only fail here because keeping under the limit would require
* too many moves. */
assert(reg_file.count_zero(info.bounds) >= info.size);
- if (!increase_register_file(ctx, info.rc.type())) {
- /* fallback algorithm: reallocate all variables at once */
+ /* try using more registers */
+ if (!increase_register_file(ctx, info.rc)) {
+ /* fallback algorithm: reallocate all variables at once (linear VGPRs should already be
+ * compact at the end) */
unsigned def_size = info.rc.size();
for (Definition def : instr->definitions) {
if (ctx.assignments[def.tempId()].assigned && def.regClass().type() == info.rc.type())
@@ -1584,12 +1856,12 @@ get_reg(ra_ctx& ctx, RegisterFile& reg_file, Temp temp,
killed_op_size += op.regClass().size();
}
- const PhysRegInterval regs = get_reg_bounds(ctx.program, info.rc.type());
+ const PhysRegInterval regs = get_reg_bounds(ctx, info.rc);
/* reallocate passthrough variables and non-killed operands */
std::vector<IDAndRegClass> vars;
- for (const std::pair<unsigned, unsigned>& var : find_vars(ctx, reg_file, regs))
- vars.emplace_back(var.second, ctx.assignments[var.second].rc);
+ for (unsigned id : find_vars(ctx, reg_file, regs))
+ vars.emplace_back(id, ctx.assignments[id].rc);
vars.emplace_back(0xffffffff, RegClass(info.rc.type(), MAX2(def_size, killed_op_size)));
PhysReg space = compact_relocate_vars(ctx, vars, parallelcopies, regs.lo());
@@ -1616,7 +1888,7 @@ get_reg(ra_ctx& ctx, RegisterFile& reg_file, Temp temp,
}
PhysReg
-get_reg_create_vector(ra_ctx& ctx, RegisterFile& reg_file, Temp temp,
+get_reg_create_vector(ra_ctx& ctx, const RegisterFile& reg_file, Temp temp,
std::vector<std::pair<Operand, Definition>>& parallelcopies,
aco_ptr<Instruction>& instr)
{
@@ -1625,13 +1897,14 @@ get_reg_create_vector(ra_ctx& ctx, RegisterFile& reg_file, Temp temp,
uint32_t size = rc.size();
uint32_t bytes = rc.bytes();
uint32_t stride = get_stride(rc);
- PhysRegInterval bounds = get_reg_bounds(ctx.program, rc.type());
+ PhysRegInterval bounds = get_reg_bounds(ctx, rc);
// TODO: improve p_create_vector for sub-dword vectors
PhysReg best_pos{0xFFF};
unsigned num_moves = 0xFF;
bool best_avoid = true;
+ uint32_t correct_pos_mask = 0;
/* test for each operand which definition placement causes the least shuffle instructions */
for (unsigned i = 0, offset = 0; i < instr->operands.size();
@@ -1667,6 +1940,7 @@ get_reg_create_vector(ra_ctx& ctx, RegisterFile& reg_file, Temp temp,
/* count variables to be moved and check "avoid" */
bool avoid = false;
+ bool linear_vgpr = false;
for (PhysReg j : reg_win) {
if (reg_file[j] != 0) {
if (reg_file[j] == 0xF0000000) {
@@ -1677,28 +1951,28 @@ get_reg_create_vector(ra_ctx& ctx, RegisterFile& reg_file, Temp temp,
k += reg_file.test(reg, 1);
} else {
k += 4;
- /* we cannot split live ranges of linear vgprs inside control flow */
- if (ctx.assignments[reg_file[j]].rc.is_linear_vgpr()) {
- if (ctx.block->kind & block_kind_top_level)
- avoid = true;
- else
- break;
- }
+ linear_vgpr |= ctx.assignments[reg_file[j]].rc.is_linear_vgpr();
}
}
avoid |= ctx.war_hint[j];
}
+
+ /* we cannot split live ranges of linear vgprs */
+ if (linear_vgpr)
+ continue;
+
if (avoid && !best_avoid)
continue;
/* count operands in wrong positions */
+ uint32_t correct_pos_mask_new = 0;
for (unsigned j = 0, offset2 = 0; j < instr->operands.size();
offset2 += instr->operands[j].bytes(), j++) {
- if (j == i || !instr->operands[j].isTemp() ||
- instr->operands[j].getTemp().type() != rc.type())
- continue;
- if (instr->operands[j].physReg().reg_b != reg_win.lo() * 4 + offset2)
- k += instr->operands[j].bytes();
+ Operand& op = instr->operands[j];
+ if (op.isTemp() && op.physReg().reg_b == reg_win.lo() * 4 + offset2)
+ correct_pos_mask_new |= 1 << j;
+ else
+ k += op.bytes();
}
bool aligned = rc == RegClass::v4 && reg_win.lo() % 4 == 0;
if (k > num_moves || (!aligned && k == num_moves))
@@ -1707,49 +1981,39 @@ get_reg_create_vector(ra_ctx& ctx, RegisterFile& reg_file, Temp temp,
best_pos = reg_win.lo();
num_moves = k;
best_avoid = avoid;
+ correct_pos_mask = correct_pos_mask_new;
}
- if (num_moves >= bytes)
+ /* too many moves: try the generic get_reg() function */
+ if (num_moves >= 2 * bytes) {
return get_reg(ctx, reg_file, temp, parallelcopies, instr);
+ } else if (num_moves > bytes) {
+ DefInfo info(ctx, instr, rc, -1);
+ std::optional<PhysReg> res = get_reg_simple(ctx, reg_file, info);
+ if (res)
+ return *res;
+ }
/* re-enable killed operands which are in the wrong position */
RegisterFile tmp_file(reg_file);
- for (unsigned i = 0, offset = 0; i < instr->operands.size();
- offset += instr->operands[i].bytes(), i++) {
- if (instr->operands[i].isTemp() && instr->operands[i].isFirstKillBeforeDef() &&
- instr->operands[i].physReg().reg_b != best_pos.reg_b + offset)
- tmp_file.fill(instr->operands[i]);
+ for (Operand& op : instr->operands) {
+ if (op.isTemp() && op.isFirstKillBeforeDef())
+ tmp_file.fill(op);
+ }
+ for (unsigned i = 0; i < instr->operands.size(); i++) {
+ if ((correct_pos_mask >> i) & 1u && instr->operands[i].isKill())
+ tmp_file.clear(instr->operands[i]);
}
/* collect variables to be moved */
- std::set<std::pair<unsigned, unsigned>> vars =
- collect_vars(ctx, tmp_file, PhysRegInterval{best_pos, size});
+ std::vector<unsigned> vars = collect_vars(ctx, tmp_file, PhysRegInterval{best_pos, size});
- for (unsigned i = 0, offset = 0; i < instr->operands.size();
- offset += instr->operands[i].bytes(), i++) {
- if (!instr->operands[i].isTemp() || !instr->operands[i].isFirstKillBeforeDef() ||
- instr->operands[i].getTemp().type() != rc.type())
- continue;
- bool correct_pos = instr->operands[i].physReg().reg_b == best_pos.reg_b + offset;
- /* GFX9+: move killed operands which aren't yet at the correct position
- * Moving all killed operands generally leads to more register swaps.
- * This is only done on GFX9+ because of the cheap v_swap instruction.
- */
- if (ctx.program->chip_class >= GFX9 && !correct_pos) {
- vars.emplace(instr->operands[i].bytes(), instr->operands[i].tempId());
- tmp_file.clear(instr->operands[i]);
- /* fill operands which are in the correct position to avoid overwriting */
- } else if (correct_pos) {
- tmp_file.fill(instr->operands[i]);
- }
- }
bool success = false;
std::vector<std::pair<Operand, Definition>> pc;
- success =
- get_regs_for_copies(ctx, tmp_file, pc, vars, bounds, instr, PhysRegInterval{best_pos, size});
+ success = get_regs_for_copies(ctx, tmp_file, pc, vars, instr, PhysRegInterval{best_pos, size});
if (!success) {
- if (!increase_register_file(ctx, temp.type())) {
+ if (!increase_register_file(ctx, temp.regClass())) {
/* use the fallback algorithm in get_reg() */
return get_reg(ctx, reg_file, temp, parallelcopies, instr);
}
@@ -1774,7 +2038,7 @@ handle_pseudo(ra_ctx& ctx, const RegisterFile& reg_file, Instruction* instr)
case aco_opcode::p_create_vector:
case aco_opcode::p_split_vector:
case aco_opcode::p_parallelcopy:
- case aco_opcode::p_wqm: break;
+ case aco_opcode::p_start_linear_vgpr: break;
default: return;
}
@@ -1794,10 +2058,11 @@ handle_pseudo(ra_ctx& ctx, const RegisterFile& reg_file, Instruction* instr)
reads_subdword = true;
}
bool needs_scratch_reg = (writes_linear && reads_linear && reg_file[scc]) ||
- (ctx.program->chip_class <= GFX7 && reads_subdword);
+ (ctx.program->gfx_level <= GFX7 && reads_subdword);
if (!needs_scratch_reg)
return;
+ instr->pseudo().needs_scratch_reg = true;
instr->pseudo().tmp_in_scc = reg_file[scc];
int reg = ctx.max_used_sgpr;
@@ -1818,27 +2083,11 @@ handle_pseudo(ra_ctx& ctx, const RegisterFile& reg_file, Instruction* instr)
}
bool
-operand_can_use_reg(chip_class chip, aco_ptr<Instruction>& instr, unsigned idx, PhysReg reg,
+operand_can_use_reg(amd_gfx_level gfx_level, aco_ptr<Instruction>& instr, unsigned idx, PhysReg reg,
RegClass rc)
{
- if (instr->operands[idx].isFixed())
- return instr->operands[idx].physReg() == reg;
-
- bool is_writelane = instr->opcode == aco_opcode::v_writelane_b32 ||
- instr->opcode == aco_opcode::v_writelane_b32_e64;
- if (chip <= GFX9 && is_writelane && idx <= 1) {
- /* v_writelane_b32 can take two sgprs but only if one is m0. */
- bool is_other_sgpr =
- instr->operands[!idx].isTemp() &&
- (!instr->operands[!idx].isFixed() || instr->operands[!idx].physReg() != m0);
- if (is_other_sgpr && instr->operands[!idx].tempId() != instr->operands[idx].tempId()) {
- instr->operands[idx].setFixed(m0);
- return reg == m0;
- }
- }
-
if (reg.byte()) {
- unsigned stride = get_subdword_operand_stride(chip, instr, idx, rc);
+ unsigned stride = get_subdword_operand_stride(gfx_level, instr, idx, rc);
if (reg.byte() % stride)
return false;
}
@@ -1848,7 +2097,7 @@ operand_can_use_reg(chip_class chip, aco_ptr<Instruction>& instr, unsigned idx,
return reg != scc && reg != exec &&
(reg != m0 || idx == 1 || idx == 3) && /* offset can be m0 */
(reg != vcc || (instr->definitions.empty() && idx == 2) ||
- chip >= GFX10); /* sdata can be vcc */
+ gfx_level >= GFX10); /* sdata can be vcc */
default:
// TODO: there are more instructions with restrictions on registers
return true;
@@ -1856,41 +2105,82 @@ operand_can_use_reg(chip_class chip, aco_ptr<Instruction>& instr, unsigned idx,
}
void
-get_reg_for_operand(ra_ctx& ctx, RegisterFile& register_file,
- std::vector<std::pair<Operand, Definition>>& parallelcopy,
- aco_ptr<Instruction>& instr, Operand& operand, unsigned operand_index)
+handle_fixed_operands(ra_ctx& ctx, RegisterFile& register_file,
+ std::vector<std::pair<Operand, Definition>>& parallelcopy,
+ aco_ptr<Instruction>& instr)
{
- /* check if the operand is fixed */
- PhysReg src = ctx.assignments[operand.tempId()].reg;
- PhysReg dst;
- if (operand.isFixed()) {
- assert(operand.physReg() != src);
+ assert(instr->operands.size() <= 128);
- /* check if target reg is blocked, and move away the blocking var */
- if (register_file.test(operand.physReg(), operand.bytes())) {
- PhysRegInterval target{operand.physReg(), operand.size()};
+ RegisterFile tmp_file(register_file);
- RegisterFile tmp_file(register_file);
+ BITSET_DECLARE(mask, 128) = {0};
- std::set<std::pair<unsigned, unsigned>> blocking_vars =
- collect_vars(ctx, tmp_file, target);
+ for (unsigned i = 0; i < instr->operands.size(); i++) {
+ Operand& op = instr->operands[i];
+
+ if (!op.isTemp() || !op.isFixed())
+ continue;
- tmp_file.clear(src, operand.regClass()); // TODO: try to avoid moving block vars to src
- tmp_file.block(operand.physReg(), operand.regClass());
+ PhysReg src = ctx.assignments[op.tempId()].reg;
+ adjust_max_used_regs(ctx, op.regClass(), op.physReg());
- DefInfo info(ctx, instr, operand.regClass(), -1);
- get_regs_for_copies(ctx, tmp_file, parallelcopy, blocking_vars, info.bounds, instr,
- PhysRegInterval());
+ if (op.physReg() == src) {
+ tmp_file.block(op.physReg(), op.regClass());
+ continue;
}
- dst = operand.physReg();
- } else {
- dst = get_reg(ctx, register_file, operand.getTemp(), parallelcopy, instr, operand_index);
- update_renames(
- ctx, register_file, parallelcopy, instr,
- instr->opcode != aco_opcode::p_create_vector ? rename_not_killed_ops : (UpdateRenames)0);
+ unsigned j;
+ bool found = false;
+ BITSET_FOREACH_SET (j, mask, i) {
+ if (instr->operands[j].tempId() == op.tempId() &&
+ instr->operands[j].physReg() == op.physReg()) {
+ found = true;
+ break;
+ }
+ }
+ if (found)
+ continue; /* the copy is already added to the list */
+
+      /* clear from register_file so fixed operands are not collected by collect_vars() */
+ tmp_file.clear(src, op.regClass()); // TODO: try to avoid moving block vars to src
+
+ BITSET_SET(mask, i);
+
+ Operand pc_op(instr->operands[i].getTemp(), src);
+ Definition pc_def = Definition(op.physReg(), pc_op.regClass());
+ parallelcopy.emplace_back(pc_op, pc_def);
}
+ if (BITSET_IS_EMPTY(mask))
+ return;
+
+ unsigned i;
+ std::vector<unsigned> blocking_vars;
+ BITSET_FOREACH_SET (i, mask, instr->operands.size()) {
+ Operand& op = instr->operands[i];
+ PhysRegInterval target{op.physReg(), op.size()};
+ std::vector<unsigned> blocking_vars2 = collect_vars(ctx, tmp_file, target);
+ blocking_vars.insert(blocking_vars.end(), blocking_vars2.begin(), blocking_vars2.end());
+
+ /* prevent get_regs_for_copies() from using these registers */
+ tmp_file.block(op.physReg(), op.regClass());
+ }
+
+ get_regs_for_copies(ctx, tmp_file, parallelcopy, blocking_vars, instr, PhysRegInterval());
+ update_renames(ctx, register_file, parallelcopy, instr,
+ rename_not_killed_ops | fill_killed_ops | rename_precolored_ops);
+}
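
handle_fixed_operands() queues at most one parallelcopy per (temp, target) pair: the 128-bit mask remembers which operands already produced a copy, so a temporary that is precolored to the same register in several operand slots is only moved once. A standalone sketch of that deduplication, assuming std::bitset in place of the util/bitset.h macros and a toy FixedOp type:

    #include <bitset>
    #include <cassert>
    #include <utility>
    #include <vector>

    struct FixedOp {
       unsigned temp_id;
       unsigned src_reg; /* current assignment */
       unsigned dst_reg; /* precolored target register */
    };

    /* Returns the (src, dst) copies needed to satisfy all fixed operands. */
    std::vector<std::pair<unsigned, unsigned>>
    queue_fixed_copies(const std::vector<FixedOp>& ops)
    {
       assert(ops.size() <= 128); /* mirrors the assert in the hunk above */
       std::bitset<128> mask;     /* operands for which a copy was queued */
       std::vector<std::pair<unsigned, unsigned>> copies;
       for (unsigned i = 0; i < ops.size(); i++) {
          if (ops[i].dst_reg == ops[i].src_reg)
             continue; /* already in place */
          bool found = false;
          for (unsigned j = 0; j < i && !found; j++) {
             /* the same temp moving to the same register needs one copy only */
             found = mask[j] && ops[j].temp_id == ops[i].temp_id &&
                     ops[j].dst_reg == ops[i].dst_reg;
          }
          if (found)
             continue;
          mask.set(i);
          copies.emplace_back(ops[i].src_reg, ops[i].dst_reg);
       }
       return copies;
    }
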
+
+void
+get_reg_for_operand(ra_ctx& ctx, RegisterFile& register_file,
+ std::vector<std::pair<Operand, Definition>>& parallelcopy,
+ aco_ptr<Instruction>& instr, Operand& operand, unsigned operand_index)
+{
+ /* clear the operand in case it's only a stride mismatch */
+ PhysReg src = ctx.assignments[operand.tempId()].reg;
+ register_file.clear(src, operand.regClass());
+ PhysReg dst = get_reg(ctx, register_file, operand.getTemp(), parallelcopy, instr, operand_index);
+
Operand pc_op = operand;
pc_op.setFixed(src);
Definition pc_def = Definition(dst, pc_op.regClass());
@@ -1898,6 +2188,151 @@ get_reg_for_operand(ra_ctx& ctx, RegisterFile& register_file,
update_renames(ctx, register_file, parallelcopy, instr, rename_not_killed_ops | fill_killed_ops);
}
+PhysReg
+get_reg_phi(ra_ctx& ctx, IDSet& live_in, RegisterFile& register_file,
+ std::vector<aco_ptr<Instruction>>& instructions, Block& block,
+ aco_ptr<Instruction>& phi, Temp tmp)
+{
+ std::vector<std::pair<Operand, Definition>> parallelcopy;
+ PhysReg reg = get_reg(ctx, register_file, tmp, parallelcopy, phi);
+ update_renames(ctx, register_file, parallelcopy, phi, rename_not_killed_ops);
+
+ /* process parallelcopy */
+ for (std::pair<Operand, Definition> pc : parallelcopy) {
+ /* see if it's a copy from a different phi */
+ // TODO: prefer moving some previous phis over live-ins
+ // TODO: somehow prevent phis fixed before the RA from being updated (shouldn't be a
+ // problem in practice since they can only be fixed to exec)
+ Instruction* prev_phi = NULL;
+ std::vector<aco_ptr<Instruction>>::iterator phi_it;
+ for (phi_it = instructions.begin(); phi_it != instructions.end(); ++phi_it) {
+ if ((*phi_it)->definitions[0].tempId() == pc.first.tempId())
+ prev_phi = phi_it->get();
+ }
+ if (prev_phi) {
+ /* if so, just update that phi's register */
+ prev_phi->definitions[0].setFixed(pc.second.physReg());
+ register_file.fill(prev_phi->definitions[0]);
+ ctx.assignments[prev_phi->definitions[0].tempId()] = {pc.second.physReg(),
+ pc.second.regClass()};
+ continue;
+ }
+
+ /* rename */
+ std::unordered_map<unsigned, Temp>::iterator orig_it = ctx.orig_names.find(pc.first.tempId());
+ Temp orig = orig_it != ctx.orig_names.end() ? orig_it->second : pc.first.getTemp();
+ ctx.orig_names[pc.second.tempId()] = orig;
+ ctx.renames[block.index][orig.id()] = pc.second.getTemp();
+
+ /* otherwise, this is a live-in and we need to create a new phi
+ * to move it in this block's predecessors */
+ aco_opcode opcode =
+ pc.first.getTemp().is_linear() ? aco_opcode::p_linear_phi : aco_opcode::p_phi;
+ Block::edge_vec& preds =
+ pc.first.getTemp().is_linear() ? block.linear_preds : block.logical_preds;
+ aco_ptr<Instruction> new_phi{create_instruction(opcode, Format::PSEUDO, preds.size(), 1)};
+ new_phi->definitions[0] = pc.second;
+ for (unsigned i = 0; i < preds.size(); i++)
+ new_phi->operands[i] = Operand(pc.first);
+ instructions.emplace_back(std::move(new_phi));
+
+ /* Remove from live_in, because handle_loop_phis() would re-create this phi later if this is
+ * a loop header.
+ */
+ live_in.erase(orig.id());
+ }
+
+ return reg;
+}
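
The rename step in get_reg_phi() keeps two maps consistent: orig_names resolves any renamed temporary back to its pre-RA name, and the per-block renames map points that original name at its newest copy, which is what read_variable() consults later. A compact sketch of the bookkeeping, simplified to plain unsigned ids instead of ACO's Temp:

    #include <unordered_map>
    #include <vector>

    struct RenameState {
       /* newest name of an original temp, per block */
       std::vector<std::unordered_map<unsigned, unsigned>> renames;
       /* maps any renamed temp back to the name the program started with */
       std::unordered_map<unsigned, unsigned> orig_names;

       void add_rename(unsigned block, unsigned old_id, unsigned new_id)
       {
          /* old_id may itself be a rename: follow it back to the original */
          auto it = orig_names.find(old_id);
          unsigned orig = it != orig_names.end() ? it->second : old_id;
          orig_names[new_id] = orig;
          renames[block][orig] = new_id;
       }

       unsigned read(unsigned block, unsigned id) const
       {
          auto it = renames[block].find(id);
          return it != renames[block].end() ? it->second : id;
       }
    };
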
+
+void
+get_regs_for_phis(ra_ctx& ctx, Block& block, RegisterFile& register_file,
+ std::vector<aco_ptr<Instruction>>& instructions, IDSet& live_in)
+{
+ /* move all phis to instructions */
+ for (aco_ptr<Instruction>& phi : block.instructions) {
+ if (!is_phi(phi))
+ break;
+ if (!phi->definitions[0].isKill())
+ instructions.emplace_back(std::move(phi));
+ }
+
+ /* assign phis with all-matching registers to that register */
+ for (aco_ptr<Instruction>& phi : instructions) {
+ Definition& definition = phi->definitions[0];
+ if (definition.isFixed())
+ continue;
+
+ if (!phi->operands[0].isTemp())
+ continue;
+
+ PhysReg reg = phi->operands[0].physReg();
+ auto OpsSame = [=](const Operand& op) -> bool
+ { return op.isTemp() && (!op.isFixed() || op.physReg() == reg); };
+ bool all_same = std::all_of(phi->operands.cbegin() + 1, phi->operands.cend(), OpsSame);
+ if (!all_same)
+ continue;
+
+ if (!get_reg_specified(ctx, register_file, definition.regClass(), phi, reg))
+ continue;
+
+ definition.setFixed(reg);
+ register_file.fill(definition);
+ ctx.assignments[definition.tempId()].set(definition);
+ }
+
+ /* try to find a register that is used by at least one operand */
+ for (aco_ptr<Instruction>& phi : instructions) {
+ Definition& definition = phi->definitions[0];
+ if (definition.isFixed())
+ continue;
+
+ /* use affinity if available */
+ if (ctx.assignments[definition.tempId()].affinity &&
+ ctx.assignments[ctx.assignments[definition.tempId()].affinity].assigned) {
+ assignment& affinity = ctx.assignments[ctx.assignments[definition.tempId()].affinity];
+ assert(affinity.rc == definition.regClass());
+ if (get_reg_specified(ctx, register_file, definition.regClass(), phi, affinity.reg)) {
+ definition.setFixed(affinity.reg);
+ register_file.fill(definition);
+ ctx.assignments[definition.tempId()].set(definition);
+ continue;
+ }
+ }
+
+ /* by going backwards, we aim to avoid copies in else-blocks */
+ for (int i = phi->operands.size() - 1; i >= 0; i--) {
+ const Operand& op = phi->operands[i];
+ if (!op.isTemp() || !op.isFixed())
+ continue;
+
+ PhysReg reg = op.physReg();
+ if (get_reg_specified(ctx, register_file, definition.regClass(), phi, reg)) {
+ definition.setFixed(reg);
+ register_file.fill(definition);
+ ctx.assignments[definition.tempId()].set(definition);
+ break;
+ }
+ }
+ }
+
+ /* find registers for phis where the register was blocked or no operand was assigned */
+
+ /* Don't use iterators because get_reg_phi() can add phis to the end of the vector. */
+ for (unsigned i = 0; i < instructions.size(); i++) {
+ aco_ptr<Instruction>& phi = instructions[i];
+ Definition& definition = phi->definitions[0];
+ if (definition.isFixed())
+ continue;
+
+ definition.setFixed(
+ get_reg_phi(ctx, live_in, register_file, instructions, block, phi, definition.getTemp()));
+
+ register_file.fill(definition);
+ ctx.assignments[definition.tempId()].set(definition);
+ }
+}
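
The first pass of get_regs_for_phis() hands a phi its operands' register whenever no fixed operand disagrees. A standalone sketch of that std::all_of test, modeling each operand's register as std::optional<unsigned> (an illustration, not the driver's Operand API):

    #include <algorithm>
    #include <optional>
    #include <vector>

    /* Returns the register all fixed operands agree on, if any. */
    std::optional<unsigned>
    common_phi_reg(const std::vector<std::optional<unsigned>>& operand_regs)
    {
       if (operand_regs.empty() || !operand_regs[0])
          return std::nullopt;
       unsigned reg = *operand_regs[0];
       bool all_same = std::all_of(operand_regs.begin() + 1, operand_regs.end(),
                                   [=](const std::optional<unsigned>& r)
                                   { return !r || *r == reg; });
       return all_same ? std::optional<unsigned>(reg) : std::nullopt;
    }
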
+
Temp
read_variable(ra_ctx& ctx, Temp val, unsigned block_idx)
{
@@ -1911,7 +2346,7 @@ read_variable(ra_ctx& ctx, Temp val, unsigned block_idx)
Temp
handle_live_in(ra_ctx& ctx, Temp val, Block* block)
{
- std::vector<unsigned>& preds = val.is_linear() ? block->linear_preds : block->logical_preds;
+ Block::edge_vec& preds = val.is_linear() ? block->linear_preds : block->logical_preds;
if (preds.size() == 0)
return val;
@@ -1939,20 +2374,18 @@ handle_live_in(ra_ctx& ctx, Temp val, Block* block)
/* the variable has been renamed differently in the predecessors: we need to insert a phi */
aco_opcode opcode = val.is_linear() ? aco_opcode::p_linear_phi : aco_opcode::p_phi;
- aco_ptr<Instruction> phi{
- create_instruction<Pseudo_instruction>(opcode, Format::PSEUDO, preds.size(), 1)};
+ aco_ptr<Instruction> phi{create_instruction(opcode, Format::PSEUDO, preds.size(), 1)};
new_val = ctx.program->allocateTmp(val.regClass());
phi->definitions[0] = Definition(new_val);
+ ctx.assignments.emplace_back();
+ assert(ctx.assignments.size() == ctx.program->peekAllocationId());
for (unsigned i = 0; i < preds.size(); i++) {
/* update the operands so that it uses the new affinity */
phi->operands[i] = Operand(ops[i]);
assert(ctx.assignments[ops[i].id()].assigned);
+ assert(ops[i].regClass() == new_val.regClass());
phi->operands[i].setFixed(ctx.assignments[ops[i].id()].reg);
- if (ops[i].regClass() == new_val.regClass())
- ctx.affinities[new_val.id()] = ops[i].id();
}
- ctx.assignments.emplace_back();
- assert(ctx.assignments.size() == ctx.program->peekAllocationId());
block->instructions.insert(block->instructions.begin(), std::move(phi));
}
@@ -2002,7 +2435,7 @@ handle_loop_phis(ra_ctx& ctx, const IDSet& live_in, uint32_t loop_header_idx,
aco_ptr<Instruction>& phi = loop_header.instructions[i];
if (!is_phi(phi))
break;
- const std::vector<unsigned>& preds =
+ const Block::edge_vec& preds =
phi->opcode == aco_opcode::p_phi ? loop_header.logical_preds : loop_header.linear_preds;
for (unsigned j = 1; j < phi->operands.size(); j++) {
Operand& op = phi->operands[j];
@@ -2093,7 +2526,7 @@ init_reg_file(ra_ctx& ctx, const std::vector<IDSet>& live_out_per_block, Block&
for (aco_ptr<Instruction>& instr : block.instructions) {
if (!is_phi(instr))
break;
- const std::vector<unsigned>& preds =
+ const Block::edge_vec& preds =
instr->opcode == aco_opcode::p_phi ? block.logical_preds : block.linear_preds;
for (unsigned i = 0; i < instr->operands.size(); i++) {
@@ -2125,8 +2558,8 @@ init_reg_file(ra_ctx& ctx, const std::vector<IDSet>& live_out_per_block, Block&
void
get_affinities(ra_ctx& ctx, std::vector<IDSet>& live_out_per_block)
{
- std::vector<std::vector<Temp>> phi_ressources;
- std::unordered_map<unsigned, unsigned> temp_to_phi_ressources;
+ std::vector<std::vector<Temp>> phi_resources;
+ std::unordered_map<unsigned, unsigned> temp_to_phi_resources;
for (auto block_rit = ctx.program->blocks.rbegin(); block_rit != ctx.program->blocks.rend();
block_rit++) {
@@ -2138,46 +2571,48 @@ get_affinities(ra_ctx& ctx, std::vector<IDSet>& live_out_per_block)
std::vector<aco_ptr<Instruction>>::reverse_iterator rit;
for (rit = block.instructions.rbegin(); rit != block.instructions.rend(); ++rit) {
aco_ptr<Instruction>& instr = *rit;
- if (is_phi(instr)) {
- if (instr->definitions[0].isKill() || instr->definitions[0].isFixed()) {
- live.erase(instr->definitions[0].tempId());
- continue;
- }
- /* collect information about affinity-related temporaries */
- std::vector<Temp> affinity_related;
- /* affinity_related[0] is the last seen affinity-related temp */
- affinity_related.emplace_back(instr->definitions[0].getTemp());
- affinity_related.emplace_back(instr->definitions[0].getTemp());
- for (const Operand& op : instr->operands) {
- if (op.isTemp() && op.isKill() &&
- op.regClass() == instr->definitions[0].regClass()) {
- affinity_related.emplace_back(op.getTemp());
- temp_to_phi_ressources[op.tempId()] = phi_ressources.size();
- }
- }
- phi_ressources.emplace_back(std::move(affinity_related));
- } else {
- /* add vector affinities */
- if (instr->opcode == aco_opcode::p_create_vector) {
- for (const Operand& op : instr->operands) {
- if (op.isTemp() && op.isFirstKill() &&
- op.getTemp().type() == instr->definitions[0].getTemp().type())
- ctx.vectors[op.tempId()] = instr.get();
- }
- } else if (instr->format == Format::MIMG && instr->operands.size() > 4) {
- for (unsigned i = 3; i < instr->operands.size(); i++)
- ctx.vectors[instr->operands[i].tempId()] = instr.get();
- }
-
- if (instr->opcode == aco_opcode::p_split_vector &&
- instr->operands[0].isFirstKillBeforeDef())
- ctx.split_vectors[instr->operands[0].tempId()] = instr.get();
+ if (is_phi(instr))
+ break;
- /* add operands to live variables */
+ /* add vector affinities */
+ if (instr->opcode == aco_opcode::p_create_vector) {
for (const Operand& op : instr->operands) {
- if (op.isTemp())
- live.insert(op.tempId());
+ if (op.isTemp() && op.isFirstKill() &&
+ op.getTemp().type() == instr->definitions[0].getTemp().type())
+ ctx.vectors[op.tempId()] = instr.get();
}
+ } else if (instr->format == Format::MIMG && instr->operands.size() > 4 &&
+ !instr->mimg().strict_wqm) {
+ for (unsigned i = 3; i < instr->operands.size(); i++)
+ ctx.vectors[instr->operands[i].tempId()] = instr.get();
+ } else if (instr->opcode == aco_opcode::p_split_vector &&
+ instr->operands[0].isFirstKillBeforeDef()) {
+ ctx.split_vectors[instr->operands[0].tempId()] = instr.get();
+ } else if (instr->isVOPC() && !instr->isVOP3()) {
+ if (!instr->isSDWA() || ctx.program->gfx_level == GFX8)
+ ctx.assignments[instr->definitions[0].tempId()].vcc = true;
+ } else if (instr->isVOP2() && !instr->isVOP3()) {
+ if (instr->operands.size() == 3 && instr->operands[2].isTemp() &&
+ instr->operands[2].regClass().type() == RegType::sgpr)
+ ctx.assignments[instr->operands[2].tempId()].vcc = true;
+ if (instr->definitions.size() == 2)
+ ctx.assignments[instr->definitions[1].tempId()].vcc = true;
+ } else if (instr->opcode == aco_opcode::s_and_b32 ||
+ instr->opcode == aco_opcode::s_and_b64) {
+ /* If SCC is used by a branch, we might be able to use
+ * s_cbranch_vccz/s_cbranch_vccnz if the operand is VCC.
+ */
+ if (!instr->definitions[1].isKill() && instr->operands[0].isTemp() &&
+ instr->operands[1].isFixed() && instr->operands[1].physReg() == exec)
+ ctx.assignments[instr->operands[0].tempId()].vcc = true;
+ } else if (instr->opcode == aco_opcode::s_sendmsg) {
+ ctx.assignments[instr->operands[0].tempId()].m0 = true;
+ }
+
+ /* add operands to live variables */
+ for (const Operand& op : instr->operands) {
+ if (op.isTemp())
+ live.insert(op.tempId());
}
/* erase definitions from live */
@@ -2188,10 +2623,10 @@ get_affinities(ra_ctx& ctx, std::vector<IDSet>& live_out_per_block)
live.erase(def.tempId());
/* mark last-seen phi operand */
std::unordered_map<unsigned, unsigned>::iterator it =
- temp_to_phi_ressources.find(def.tempId());
- if (it != temp_to_phi_ressources.end() &&
- def.regClass() == phi_ressources[it->second][0].regClass()) {
- phi_ressources[it->second][0] = def.getTemp();
+ temp_to_phi_resources.find(def.tempId());
+ if (it != temp_to_phi_resources.end() &&
+ def.regClass() == phi_resources[it->second][0].regClass()) {
+ phi_resources[it->second][0] = def.getTemp();
/* try to coalesce phi affinities with parallelcopies */
Operand op = Operand();
switch (instr->opcode) {
@@ -2204,7 +2639,7 @@ get_affinities(ra_ctx& ctx, std::vector<IDSet>& live_out_per_block)
case aco_opcode::v_fma_f32:
case aco_opcode::v_fma_f16:
case aco_opcode::v_pk_fma_f16:
- if (ctx.program->chip_class < GFX10)
+ if (ctx.program->gfx_level < GFX10)
continue;
FALLTHROUGH;
case aco_opcode::v_mad_f32:
@@ -2214,193 +2649,371 @@ get_affinities(ra_ctx& ctx, std::vector<IDSet>& live_out_per_block)
op = instr->operands[2];
break;
+ case aco_opcode::v_mad_legacy_f32:
+ case aco_opcode::v_fma_legacy_f32:
+ if (instr->usesModifiers() || !ctx.program->dev.has_mac_legacy32)
+ continue;
+ op = instr->operands[2];
+ break;
+
default: continue;
}
if (op.isTemp() && op.isFirstKillBeforeDef() && def.regClass() == op.regClass()) {
- phi_ressources[it->second].emplace_back(op.getTemp());
- temp_to_phi_ressources[op.tempId()] = it->second;
+ phi_resources[it->second].emplace_back(op.getTemp());
+ temp_to_phi_resources[op.tempId()] = it->second;
+ }
+ }
+ }
+ }
+
+ /* collect phi affinities */
+ for (; rit != block.instructions.rend(); ++rit) {
+ aco_ptr<Instruction>& instr = *rit;
+ assert(is_phi(instr));
+
+ live.erase(instr->definitions[0].tempId());
+ if (instr->definitions[0].isKill() || instr->definitions[0].isFixed())
+ continue;
+
+ assert(instr->definitions[0].isTemp());
+ std::unordered_map<unsigned, unsigned>::iterator it =
+ temp_to_phi_resources.find(instr->definitions[0].tempId());
+ unsigned index = phi_resources.size();
+ std::vector<Temp>* affinity_related;
+ if (it != temp_to_phi_resources.end()) {
+ index = it->second;
+ phi_resources[index][0] = instr->definitions[0].getTemp();
+ affinity_related = &phi_resources[index];
+ } else {
+ phi_resources.emplace_back(std::vector<Temp>{instr->definitions[0].getTemp()});
+ affinity_related = &phi_resources.back();
+ }
+
+ for (const Operand& op : instr->operands) {
+ if (op.isTemp() && op.isKill() && op.regClass() == instr->definitions[0].regClass()) {
+ affinity_related->emplace_back(op.getTemp());
+ if (block.kind & block_kind_loop_header)
+ continue;
+ temp_to_phi_resources[op.tempId()] = index;
+ }
+ }
+ }
+
+ /* visit the loop header phis first in order to create nested affinities */
+ if (block.kind & block_kind_loop_exit) {
+ /* find loop header */
+ auto header_rit = block_rit;
+ while ((header_rit + 1)->loop_nest_depth > block.loop_nest_depth)
+ header_rit++;
+
+ for (aco_ptr<Instruction>& phi : header_rit->instructions) {
+ if (!is_phi(phi))
+ break;
+ if (phi->definitions[0].isKill() || phi->definitions[0].isFixed())
+ continue;
+
+ /* create an (empty) merge-set for the phi-related variables */
+ auto it = temp_to_phi_resources.find(phi->definitions[0].tempId());
+ unsigned index = phi_resources.size();
+ if (it == temp_to_phi_resources.end()) {
+ temp_to_phi_resources[phi->definitions[0].tempId()] = index;
+ phi_resources.emplace_back(std::vector<Temp>{phi->definitions[0].getTemp()});
+ } else {
+ index = it->second;
+ }
+ for (unsigned i = 1; i < phi->operands.size(); i++) {
+ const Operand& op = phi->operands[i];
+ if (op.isTemp() && op.isKill() && op.regClass() == phi->definitions[0].regClass()) {
+ temp_to_phi_resources[op.tempId()] = index;
}
}
}
}
}
/* create affinities */
- for (std::vector<Temp>& vec : phi_ressources) {
- assert(vec.size() > 1);
+ for (std::vector<Temp>& vec : phi_resources) {
for (unsigned i = 1; i < vec.size(); i++)
if (vec[i].id() != vec[0].id())
- ctx.affinities[vec[i].id()] = vec[0].id();
+ ctx.assignments[vec[i].id()].affinity = vec[0].id();
}
}
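
Each phi_resources entry is a merge set whose first element has been updated to the last definition seen, so the loop above only has to flatten the sets into per-temp links pointing at that representative. A sketch of the flattening with plain ids, using a map where the driver stores the link in assignment::affinity:

    #include <unordered_map>
    #include <vector>

    std::unordered_map<unsigned, unsigned>
    build_affinities(const std::vector<std::vector<unsigned>>& merge_sets)
    {
       std::unordered_map<unsigned, unsigned> affinity;
       for (const std::vector<unsigned>& set : merge_sets) {
          /* set[0] is the representative every other member points at */
          for (unsigned i = 1; i < set.size(); i++) {
             if (set[i] != set[0])
                affinity[set[i]] = set[0];
          }
       }
       return affinity;
    }
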
-} /* end namespace */
+void
+optimize_encoding_vop2(Program* program, ra_ctx& ctx, RegisterFile& register_file,
+ aco_ptr<Instruction>& instr)
+{
+ /* try to optimize v_mad_f32 -> v_mac_f32 */
+ if ((instr->opcode != aco_opcode::v_mad_f32 &&
+ (instr->opcode != aco_opcode::v_fma_f32 || program->gfx_level < GFX10) &&
+ instr->opcode != aco_opcode::v_mad_f16 && instr->opcode != aco_opcode::v_mad_legacy_f16 &&
+ (instr->opcode != aco_opcode::v_fma_f16 || program->gfx_level < GFX10) &&
+ (instr->opcode != aco_opcode::v_pk_fma_f16 || program->gfx_level < GFX10) &&
+ (instr->opcode != aco_opcode::v_mad_legacy_f32 || !program->dev.has_mac_legacy32) &&
+ (instr->opcode != aco_opcode::v_fma_legacy_f32 || !program->dev.has_mac_legacy32) &&
+ (instr->opcode != aco_opcode::v_dot4_i32_i8 || program->family == CHIP_VEGA20)) ||
+ !instr->operands[2].isTemp() || !instr->operands[2].isKillBeforeDef() ||
+ instr->operands[2].getTemp().type() != RegType::vgpr ||
+ (!instr->operands[0].isOfType(RegType::vgpr) &&
+ !instr->operands[1].isOfType(RegType::vgpr)) ||
+ instr->operands[2].physReg().byte() != 0 || instr->valu().opsel[2])
+ return;
+
+ if (instr->isVOP3P() && (instr->valu().opsel_lo != 0 || instr->valu().opsel_hi != 0x7))
+ return;
+
+ if ((instr->operands[0].physReg().byte() != 0 || instr->operands[1].physReg().byte() != 0 ||
+ instr->valu().opsel) &&
+ program->gfx_level < GFX11)
+ return;
+
+ unsigned im_mask = instr->isDPP16() ? 0x3 : 0;
+ if (instr->valu().omod || instr->valu().clamp || (instr->valu().abs & ~im_mask) ||
+ (instr->valu().neg & ~im_mask))
+ return;
+
+ if (!instr->operands[1].isOfType(RegType::vgpr))
+ instr->valu().swapOperands(0, 1);
+
+ if (!instr->operands[0].isOfType(RegType::vgpr) && instr->valu().opsel[0])
+ return;
+
+ unsigned def_id = instr->definitions[0].tempId();
+ if (ctx.assignments[def_id].affinity) {
+ assignment& affinity = ctx.assignments[ctx.assignments[def_id].affinity];
+ if (affinity.assigned && affinity.reg != instr->operands[2].physReg() &&
+ !register_file.test(affinity.reg, instr->operands[2].bytes()))
+ return;
+ }
+
+ instr->format = (Format)(((unsigned)withoutVOP3(instr->format) & ~(unsigned)Format::VOP3P) |
+ (unsigned)Format::VOP2);
+ instr->valu().opsel_hi = 0;
+ switch (instr->opcode) {
+ case aco_opcode::v_mad_f32: instr->opcode = aco_opcode::v_mac_f32; break;
+ case aco_opcode::v_fma_f32: instr->opcode = aco_opcode::v_fmac_f32; break;
+ case aco_opcode::v_mad_f16:
+ case aco_opcode::v_mad_legacy_f16: instr->opcode = aco_opcode::v_mac_f16; break;
+ case aco_opcode::v_fma_f16: instr->opcode = aco_opcode::v_fmac_f16; break;
+ case aco_opcode::v_pk_fma_f16: instr->opcode = aco_opcode::v_pk_fmac_f16; break;
+ case aco_opcode::v_dot4_i32_i8: instr->opcode = aco_opcode::v_dot4c_i32_i8; break;
+ case aco_opcode::v_mad_legacy_f32: instr->opcode = aco_opcode::v_mac_legacy_f32; break;
+ case aco_opcode::v_fma_legacy_f32: instr->opcode = aco_opcode::v_fmac_legacy_f32; break;
+ default: break;
+ }
+}
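
Stripped of the opcode and modifier checks, the structural requirement for this rewrite is that the accumulator (src2) is a killed, dword-aligned VGPR and that a VGPR can be placed in src1, swapping src0/src1 if needed. A toy sketch of just that precondition (ToySrc is illustrative; the real code also restricts sub-dword offsets of src0/src1 before GFX11):

    #include <utility>

    struct ToySrc {
       bool is_vgpr;
       bool kill_before_def;
       unsigned byte; /* sub-dword offset of the register assignment */
    };

    bool can_use_mac(ToySrc& src0, ToySrc& src1, const ToySrc& src2)
    {
       if (!src2.is_vgpr || !src2.kill_before_def || src2.byte != 0)
          return false; /* accumulator must be a killed, aligned VGPR */
       if (!src0.is_vgpr && !src1.is_vgpr)
          return false; /* at least one multiplicand must be a VGPR */
       if (!src1.is_vgpr)
          std::swap(src0, src1); /* VOP2 wants the VGPR in src1 */
       return true;
    }
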
void
-register_allocation(Program* program, std::vector<IDSet>& live_out_per_block, ra_test_policy policy)
+optimize_encoding_sopk(Program* program, ra_ctx& ctx, RegisterFile& register_file,
+ aco_ptr<Instruction>& instr)
{
- ra_ctx ctx(program, policy);
- get_affinities(ctx, live_out_per_block);
+ /* try to optimize sop2 with literal source to sopk */
+ if (instr->opcode != aco_opcode::s_add_i32 && instr->opcode != aco_opcode::s_mul_i32 &&
+ instr->opcode != aco_opcode::s_cselect_b32)
+ return;
- /* state of register file after phis */
- std::vector<std::bitset<128>> sgpr_live_in(program->blocks.size());
+ uint32_t literal_idx = 0;
- for (Block& block : program->blocks) {
- ctx.block = &block;
+ if (instr->opcode != aco_opcode::s_cselect_b32 && instr->operands[1].isLiteral())
+ literal_idx = 1;
- /* initialize register file */
- RegisterFile register_file = init_reg_file(ctx, live_out_per_block, block);
- ctx.war_hint.reset();
+ if (!instr->operands[!literal_idx].isTemp() ||
+ !instr->operands[!literal_idx].isKillBeforeDef() ||
+ instr->operands[!literal_idx].getTemp().type() != RegType::sgpr ||
+ instr->operands[!literal_idx].physReg() >= 128)
+ return;
- std::vector<aco_ptr<Instruction>> instructions;
- std::vector<aco_ptr<Instruction>>::iterator instr_it;
+ if (!instr->operands[literal_idx].isLiteral())
+ return;
- /* this is a slight adjustment from the paper as we already have phi nodes:
- * We consider them incomplete phis and only handle the definition. */
+ const uint32_t i16_mask = 0xffff8000u;
+ uint32_t value = instr->operands[literal_idx].constantValue();
+ if ((value & i16_mask) && (value & i16_mask) != i16_mask)
+ return;
- /* look up the affinities */
- for (instr_it = block.instructions.begin(); instr_it != block.instructions.end();
- ++instr_it) {
- aco_ptr<Instruction>& phi = *instr_it;
- if (!is_phi(phi))
- break;
- Definition& definition = phi->definitions[0];
- if (definition.isKill() || definition.isFixed())
- continue;
+ unsigned def_id = instr->definitions[0].tempId();
+ if (ctx.assignments[def_id].affinity) {
+ assignment& affinity = ctx.assignments[ctx.assignments[def_id].affinity];
+ if (affinity.assigned && affinity.reg != instr->operands[!literal_idx].physReg() &&
+ !register_file.test(affinity.reg, instr->operands[!literal_idx].bytes()))
+ return;
+ }
- if (ctx.affinities.find(definition.tempId()) != ctx.affinities.end() &&
- ctx.assignments[ctx.affinities[definition.tempId()]].assigned) {
- assert(ctx.assignments[ctx.affinities[definition.tempId()]].rc ==
- definition.regClass());
- PhysReg reg = ctx.assignments[ctx.affinities[definition.tempId()]].reg;
- if (reg == scc) {
- /* only use scc if all operands are already placed there */
- bool use_scc =
- std::all_of(phi->operands.begin(), phi->operands.end(),
- [](const Operand& op)
- { return op.isTemp() && op.isFixed() && op.physReg() == scc; });
- if (!use_scc)
- continue;
- }
+ instr->format = Format::SOPK;
+ SALU_instruction* instr_sopk = &instr->salu();
- /* only assign if register is still free */
- if (!register_file.test(reg, definition.bytes())) {
- definition.setFixed(reg);
- register_file.fill(definition);
- ctx.assignments[definition.tempId()] = {definition.physReg(), definition.regClass()};
- }
+ instr_sopk->imm = instr_sopk->operands[literal_idx].constantValue() & 0xffff;
+ if (literal_idx == 0)
+ std::swap(instr_sopk->operands[0], instr_sopk->operands[1]);
+ if (instr_sopk->operands.size() > 2)
+ std::swap(instr_sopk->operands[1], instr_sopk->operands[2]);
+ instr_sopk->operands.pop_back();
+
+ switch (instr_sopk->opcode) {
+ case aco_opcode::s_add_i32: instr_sopk->opcode = aco_opcode::s_addk_i32; break;
+ case aco_opcode::s_mul_i32: instr_sopk->opcode = aco_opcode::s_mulk_i32; break;
+ case aco_opcode::s_cselect_b32: instr_sopk->opcode = aco_opcode::s_cmovk_i32; break;
+ default: unreachable("illegal instruction");
+ }
+}
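
The literal check above accepts exactly the values that survive a sign-extending round trip through a 16-bit immediate: bits 15..31 must be all zero or all one. The same test in isolation, with a few sanity checks:

    #include <cassert>
    #include <cstdint>

    static bool fits_simm16(uint32_t value)
    {
       const uint32_t i16_mask = 0xffff8000u;
       return (value & i16_mask) == 0 || (value & i16_mask) == i16_mask;
    }

    int main()
    {
       assert(fits_simm16(0x00007fffu));  /* 32767, largest positive simm16 */
       assert(fits_simm16(0xffff8000u));  /* -32768, smallest negative */
       assert(!fits_simm16(0x00008000u)); /* 32768 needs 17 bits */
       assert(!fits_simm16(0xfffe0000u)); /* would sign-extend incorrectly */
       return 0;
    }
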
+
+void
+optimize_encoding(Program* program, ra_ctx& ctx, RegisterFile& register_file,
+ aco_ptr<Instruction>& instr)
+{
+ if (instr->isVALU())
+ optimize_encoding_vop2(program, ctx, register_file, instr);
+ if (instr->isSALU())
+ optimize_encoding_sopk(program, ctx, register_file, instr);
+}
+
+void
+emit_parallel_copy_internal(ra_ctx& ctx, std::vector<std::pair<Operand, Definition>>& parallelcopy,
+ aco_ptr<Instruction>& instr,
+ std::vector<aco_ptr<Instruction>>& instructions, bool temp_in_scc,
+ RegisterFile& register_file)
+{
+ if (parallelcopy.empty())
+ return;
+
+ aco_ptr<Instruction> pc;
+ pc.reset(create_instruction(aco_opcode::p_parallelcopy, Format::PSEUDO, parallelcopy.size(),
+ parallelcopy.size()));
+ bool linear_vgpr = false;
+ bool sgpr_operands_alias_defs = false;
+ uint64_t sgpr_operands[4] = {0, 0, 0, 0};
+ for (unsigned i = 0; i < parallelcopy.size(); i++) {
+ linear_vgpr |= parallelcopy[i].first.regClass().is_linear_vgpr();
+
+ if (temp_in_scc && parallelcopy[i].first.isTemp() &&
+ parallelcopy[i].first.getTemp().type() == RegType::sgpr) {
+ if (!sgpr_operands_alias_defs) {
+ unsigned reg = parallelcopy[i].first.physReg().reg();
+ unsigned size = parallelcopy[i].first.getTemp().size();
+ sgpr_operands[reg / 64u] |= u_bit_consecutive64(reg % 64u, size);
+
+ reg = parallelcopy[i].second.physReg().reg();
+ size = parallelcopy[i].second.getTemp().size();
+ if (sgpr_operands[reg / 64u] & u_bit_consecutive64(reg % 64u, size))
+ sgpr_operands_alias_defs = true;
}
}
- /* find registers for phis without affinity or where the register was blocked */
- for (instr_it = block.instructions.begin(); instr_it != block.instructions.end();
- ++instr_it) {
- aco_ptr<Instruction>& phi = *instr_it;
- if (!is_phi(phi))
- break;
+ pc->operands[i] = parallelcopy[i].first;
+ pc->definitions[i] = parallelcopy[i].second;
+ assert(pc->operands[i].size() == pc->definitions[i].size());
- Definition& definition = phi->definitions[0];
- if (definition.isKill())
- continue;
+ /* it might happen that the operand is already renamed. we have to restore the
+ * original name. */
+ std::unordered_map<unsigned, Temp>::iterator it =
+ ctx.orig_names.find(pc->operands[i].tempId());
+ Temp orig = it != ctx.orig_names.end() ? it->second : pc->operands[i].getTemp();
+ ctx.orig_names[pc->definitions[i].tempId()] = orig;
+ ctx.renames[ctx.block->index][orig.id()] = pc->definitions[i].getTemp();
+ }
- if (!definition.isFixed()) {
- std::vector<std::pair<Operand, Definition>> parallelcopy;
- /* try to find a register that is used by at least one operand */
- for (int i = phi->operands.size() - 1; i >= 0; i--) {
- /* by going backwards, we aim to avoid copies in else-blocks */
- const Operand& op = phi->operands[i];
- if (!op.isTemp() || !op.isFixed())
- continue;
- PhysReg reg = op.physReg();
- /* we tried this already on the previous loop */
- if (reg == scc)
- continue;
- if (get_reg_specified(ctx, register_file, definition.regClass(), phi, reg)) {
- definition.setFixed(reg);
- break;
- }
- }
- if (!definition.isFixed()) {
- definition.setFixed(
- get_reg(ctx, register_file, definition.getTemp(), parallelcopy, phi));
- update_renames(ctx, register_file, parallelcopy, phi, rename_not_killed_ops);
- }
+ if (temp_in_scc && (sgpr_operands_alias_defs || linear_vgpr)) {
+ /* disable definitions and re-enable operands */
+ RegisterFile tmp_file(register_file);
+ for (const Definition& def : instr->definitions) {
+ if (def.isTemp() && !def.isKill())
+ tmp_file.clear(def);
+ }
+ for (const Operand& op : instr->operands) {
+ if (op.isTemp() && op.isFirstKill())
+ tmp_file.block(op.physReg(), op.regClass());
+ }
- /* process parallelcopy */
- for (std::pair<Operand, Definition> pc : parallelcopy) {
- /* see if it's a copy from a different phi */
- // TODO: prefer moving some previous phis over live-ins
- // TODO: somehow prevent phis fixed before the RA from being updated (shouldn't be a
- // problem in practice since they can only be fixed to exec)
- Instruction* prev_phi = NULL;
- std::vector<aco_ptr<Instruction>>::iterator phi_it;
- for (phi_it = instructions.begin(); phi_it != instructions.end(); ++phi_it) {
- if ((*phi_it)->definitions[0].tempId() == pc.first.tempId())
- prev_phi = phi_it->get();
- }
- phi_it = instr_it;
- while (!prev_phi && is_phi(*++phi_it)) {
- if ((*phi_it)->definitions[0].tempId() == pc.first.tempId())
- prev_phi = phi_it->get();
- }
- if (prev_phi) {
- /* if so, just update that phi's register */
- register_file.clear(prev_phi->definitions[0]);
- prev_phi->definitions[0].setFixed(pc.second.physReg());
- ctx.assignments[prev_phi->definitions[0].tempId()] = {pc.second.physReg(),
- pc.second.regClass()};
- register_file.fill(prev_phi->definitions[0]);
- continue;
- }
+ handle_pseudo(ctx, tmp_file, pc.get());
+ } else {
+ pc->pseudo().needs_scratch_reg = sgpr_operands_alias_defs || linear_vgpr;
+ pc->pseudo().tmp_in_scc = false;
+ }
- /* rename */
- std::unordered_map<unsigned, Temp>::iterator orig_it =
- ctx.orig_names.find(pc.first.tempId());
- Temp orig = pc.first.getTemp();
- if (orig_it != ctx.orig_names.end())
- orig = orig_it->second;
- else
- ctx.orig_names[pc.second.tempId()] = orig;
- ctx.renames[block.index][orig.id()] = pc.second.getTemp();
-
- /* otherwise, this is a live-in and we need to create a new phi
- * to move it in this block's predecessors */
- aco_opcode opcode =
- pc.first.getTemp().is_linear() ? aco_opcode::p_linear_phi : aco_opcode::p_phi;
- std::vector<unsigned>& preds =
- pc.first.getTemp().is_linear() ? block.linear_preds : block.logical_preds;
- aco_ptr<Instruction> new_phi{
- create_instruction<Pseudo_instruction>(opcode, Format::PSEUDO, preds.size(), 1)};
- new_phi->definitions[0] = pc.second;
- for (unsigned i = 0; i < preds.size(); i++)
- new_phi->operands[i] = Operand(pc.first);
- instructions.emplace_back(std::move(new_phi));
-
- /* Remove from live_out_per_block (now used for live-in), because handle_loop_phis()
- * would re-create this phi later if this is a loop header.
- */
- live_out_per_block[block.index].erase(orig.id());
- }
+ instructions.emplace_back(std::move(pc));
- register_file.fill(definition);
- ctx.assignments[definition.tempId()] = {definition.physReg(), definition.regClass()};
- }
+ parallelcopy.clear();
+}
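
The aliasing test above ORs every copied SGPR source range into four 64-bit words covering s0..s255 and trips as soon as a destination range lands on an already-set bit; together with a live SCC, that forces the scratch-register path via handle_pseudo(). A standalone sketch with a local stand-in for Mesa's u_bit_consecutive64 (like the caller, it assumes a range never crosses a 64-bit word):

    #include <cassert>
    #include <cstdint>

    static uint64_t bit_consecutive64(unsigned start, unsigned count)
    {
       assert(start + count <= 64);
       if (count == 64)
          return ~(uint64_t)0; /* avoid the undefined 1ull << 64 */
       return (((uint64_t)1 << count) - 1) << start;
    }

    struct SgprMask {
       uint64_t words[4] = {0, 0, 0, 0}; /* one bit per SGPR, s0..s255 */

       void add(unsigned reg, unsigned size)
       {
          words[reg / 64u] |= bit_consecutive64(reg % 64u, size);
       }
       bool overlaps(unsigned reg, unsigned size) const
       {
          return (words[reg / 64u] & bit_consecutive64(reg % 64u, size)) != 0;
       }
    };
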
- /* update phi affinities */
- for (const Operand& op : phi->operands) {
- if (op.isTemp() && op.regClass() == phi->definitions[0].regClass())
- ctx.affinities[op.tempId()] = definition.tempId();
+void
+emit_parallel_copy(ra_ctx& ctx, std::vector<std::pair<Operand, Definition>>& parallelcopy,
+ aco_ptr<Instruction>& instr, std::vector<aco_ptr<Instruction>>& instructions,
+ bool temp_in_scc, RegisterFile& register_file)
+{
+ if (parallelcopy.empty())
+ return;
+
+ std::vector<std::pair<Operand, Definition>> linear_vgpr;
+ if (ctx.num_linear_vgprs) {
+ unsigned next = 0;
+ for (unsigned i = 0; i < parallelcopy.size(); i++) {
+ if (parallelcopy[i].first.regClass().is_linear_vgpr()) {
+ linear_vgpr.push_back(parallelcopy[i]);
+ continue;
}
- instructions.emplace_back(std::move(*instr_it));
+ if (next != i)
+ parallelcopy[next] = parallelcopy[i];
+ next++;
}
+ parallelcopy.resize(next);
+ }
+
+ /* Because of how linear VGPRs are allocated, we should never have to move a linear VGPR into the
+ * space of a normal one. This means the copy can be done entirely before normal VGPR copies. */
+ emit_parallel_copy_internal(ctx, linear_vgpr, instr, instructions, temp_in_scc,
+ register_file);
+ emit_parallel_copy_internal(ctx, parallelcopy, instr, instructions, temp_in_scc,
+ register_file);
+}
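
emit_parallel_copy() peels the linear-VGPR copies into their own list while compacting the remaining entries in place, a hand-rolled stable partition matching the next/i loop above. A generic sketch of the same move:

    #include <vector>

    /* Removes all elements matching pred from `copies` (preserving order)
     * and returns them as a separate list. */
    template <typename Copy, typename Pred>
    std::vector<Copy> extract_if(std::vector<Copy>& copies, Pred pred)
    {
       std::vector<Copy> extracted;
       unsigned next = 0;
       for (unsigned i = 0; i < copies.size(); i++) {
          if (pred(copies[i])) {
             extracted.push_back(copies[i]);
             continue;
          }
          if (next != i)
             copies[next] = copies[i];
          next++;
       }
       copies.resize(next);
       return extracted;
    }

Emitting the extracted linear-VGPR copies first is safe because, as the comment above notes, a linear VGPR never has to move into the space of a normal one.
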
+
+} /* end namespace */
+
+void
+register_allocation(Program* program, live& live_vars, ra_test_policy policy)
+{
+ std::vector<IDSet>& live_out_per_block = live_vars.live_out;
+ ra_ctx ctx(program, policy);
+ get_affinities(ctx, live_out_per_block);
- /* fill in sgpr_live_in */
- for (unsigned i = 0; i <= ctx.max_used_sgpr; i++)
- sgpr_live_in[block.index][i] = register_file[PhysReg{i}];
- sgpr_live_in[block.index][127] = register_file[scc];
+ for (Block& block : program->blocks) {
+ ctx.block = &block;
+
+ /* initialize register file */
+ RegisterFile register_file = init_reg_file(ctx, live_out_per_block, block);
+ ctx.war_hint.reset();
+
+ std::vector<aco_ptr<Instruction>> instructions;
+ instructions.reserve(block.instructions.size());
+
+ /* this is a slight adjustment from the paper as we already have phi nodes:
+ * We consider them incomplete phis and only handle the definition. */
+ get_regs_for_phis(ctx, block, register_file, instructions, live_out_per_block[block.index]);
+
+ /* If this is a merge block, the state of the register file at the branch instruction of the
+ * predecessors corresponds to the state after phis at the merge block. So, we allocate a
+ * register for the predecessor's branch definitions as if there was a phi.
+ */
+ if (!block.linear_preds.empty() &&
+ (block.linear_preds.size() != 1 ||
+ program->blocks[block.linear_preds[0]].linear_succs.size() == 1)) {
+ PhysReg br_reg = get_reg_phi(ctx, live_out_per_block[block.index], register_file,
+ instructions, block, ctx.phi_dummy, Temp(0, s2));
+ for (unsigned pred : block.linear_preds) {
+ program->blocks[pred].scc_live_out = register_file[scc];
+ aco_ptr<Instruction>& br = program->blocks[pred].instructions.back();
+
+ assert(br->definitions.size() == 1 && br->definitions[0].regClass() == s2 &&
+ br->definitions[0].isKill());
+
+ br->definitions[0].setFixed(br_reg);
+ }
+ }
/* Handle all other instructions of the block */
+ auto NonPhi = [](aco_ptr<Instruction>& instr) -> bool { return instr && !is_phi(instr); };
+ std::vector<aco_ptr<Instruction>>::iterator instr_it =
+ std::find_if(block.instructions.begin(), block.instructions.end(), NonPhi);
for (; instr_it != block.instructions.end(); ++instr_it) {
aco_ptr<Instruction>& instr = *instr_it;
@@ -2438,12 +3051,18 @@ register_allocation(Program* program, std::vector<IDSet>& live_out_per_block, ra
}
std::vector<std::pair<Operand, Definition>> parallelcopy;
+ bool temp_in_scc = register_file[scc];
- assert(!is_phi(instr));
+ if (instr->opcode == aco_opcode::p_branch) {
+ /* unconditional branches are handled after phis of the target */
+ instructions.emplace_back(std::move(instr));
+ break;
+ }
- bool temp_in_scc = register_file[scc];
+ assert(!is_phi(instr));
/* handle operands */
+ bool fixed = false;
for (unsigned i = 0; i < instr->operands.size(); ++i) {
auto& operand = instr->operands[i];
if (!operand.isTemp())
@@ -2453,13 +3072,37 @@ register_allocation(Program* program, std::vector<IDSet>& live_out_per_block, ra
operand.setTemp(read_variable(ctx, operand.getTemp(), block.index));
assert(ctx.assignments[operand.tempId()].assigned);
+ fixed |=
+ operand.isFixed() && ctx.assignments[operand.tempId()].reg != operand.physReg();
+ }
+
+ bool is_writelane = instr->opcode == aco_opcode::v_writelane_b32 ||
+ instr->opcode == aco_opcode::v_writelane_b32_e64;
+ if (program->gfx_level <= GFX9 && is_writelane && instr->operands[0].isTemp() &&
+ instr->operands[1].isTemp()) {
+ /* v_writelane_b32 can take two sgprs but only if one is m0. */
+ if (ctx.assignments[instr->operands[0].tempId()].reg != m0 &&
+ ctx.assignments[instr->operands[1].tempId()].reg != m0) {
+ instr->operands[0].setFixed(m0);
+ fixed = true;
+ }
+ }
+
+ if (fixed)
+ handle_fixed_operands(ctx, register_file, parallelcopy, instr);
+
+ for (unsigned i = 0; i < instr->operands.size(); ++i) {
+ auto& operand = instr->operands[i];
+ if (!operand.isTemp() || operand.isFixed())
+ continue;
+
PhysReg reg = ctx.assignments[operand.tempId()].reg;
- if (operand_can_use_reg(program->chip_class, instr, i, reg, operand.regClass()))
+ if (operand_can_use_reg(program->gfx_level, instr, i, reg, operand.regClass()))
operand.setFixed(reg);
else
get_reg_for_operand(ctx, register_file, parallelcopy, instr, operand, i);
- if (instr->isEXP() || (instr->isVMEM() && i == 3 && ctx.program->chip_class == GFX6) ||
+ if (instr->isEXP() || (instr->isVMEM() && i == 3 && ctx.program->gfx_level == GFX6) ||
(instr->isDS() && instr->ds().gds)) {
for (unsigned j = 0; j < operand.size(); j++)
ctx.war_hint.set(operand.physReg().reg() + j);
@@ -2472,59 +3115,17 @@ register_allocation(Program* program, std::vector<IDSet>& live_out_per_block, ra
register_file.clear(op);
}
- /* try to optimize v_mad_f32 -> v_mac_f32 */
- if ((instr->opcode == aco_opcode::v_mad_f32 ||
- (instr->opcode == aco_opcode::v_fma_f32 && program->chip_class >= GFX10) ||
- instr->opcode == aco_opcode::v_mad_f16 ||
- instr->opcode == aco_opcode::v_mad_legacy_f16 ||
- (instr->opcode == aco_opcode::v_fma_f16 && program->chip_class >= GFX10) ||
- (instr->opcode == aco_opcode::v_pk_fma_f16 && program->chip_class >= GFX10) ||
- (instr->opcode == aco_opcode::v_dot4_i32_i8 && program->family != CHIP_VEGA20)) &&
- instr->operands[2].isTemp() && instr->operands[2].isKillBeforeDef() &&
- instr->operands[2].getTemp().type() == RegType::vgpr && instr->operands[1].isTemp() &&
- instr->operands[1].getTemp().type() == RegType::vgpr && !instr->usesModifiers() &&
- instr->operands[0].physReg().byte() == 0 && instr->operands[1].physReg().byte() == 0 &&
- instr->operands[2].physReg().byte() == 0) {
- unsigned def_id = instr->definitions[0].tempId();
- auto it = ctx.affinities.find(def_id);
- if (it == ctx.affinities.end() || !ctx.assignments[it->second].assigned ||
- instr->operands[2].physReg() == ctx.assignments[it->second].reg ||
- register_file.test(ctx.assignments[it->second].reg, instr->operands[2].bytes())) {
- instr->format = Format::VOP2;
- switch (instr->opcode) {
- case aco_opcode::v_mad_f32: instr->opcode = aco_opcode::v_mac_f32; break;
- case aco_opcode::v_fma_f32: instr->opcode = aco_opcode::v_fmac_f32; break;
- case aco_opcode::v_mad_f16:
- case aco_opcode::v_mad_legacy_f16: instr->opcode = aco_opcode::v_mac_f16; break;
- case aco_opcode::v_fma_f16: instr->opcode = aco_opcode::v_fmac_f16; break;
- case aco_opcode::v_pk_fma_f16: instr->opcode = aco_opcode::v_pk_fmac_f16; break;
- case aco_opcode::v_dot4_i32_i8: instr->opcode = aco_opcode::v_dot4c_i32_i8; break;
- default: break;
- }
- }
- }
-
- /* handle definitions which must have the same register as an operand */
- if (instr->opcode == aco_opcode::v_interp_p2_f32 ||
- instr->opcode == aco_opcode::v_mac_f32 || instr->opcode == aco_opcode::v_fmac_f32 ||
- instr->opcode == aco_opcode::v_mac_f16 || instr->opcode == aco_opcode::v_fmac_f16 ||
- instr->opcode == aco_opcode::v_pk_fmac_f16 ||
- instr->opcode == aco_opcode::v_writelane_b32 ||
- instr->opcode == aco_opcode::v_writelane_b32_e64 ||
- instr->opcode == aco_opcode::v_dot4c_i32_i8) {
- instr->definitions[0].setFixed(instr->operands[2].physReg());
- } else if (instr->opcode == aco_opcode::s_addk_i32 ||
- instr->opcode == aco_opcode::s_mulk_i32) {
- instr->definitions[0].setFixed(instr->operands[0].physReg());
- } else if (instr->isMUBUF() && instr->definitions.size() == 1 &&
- instr->operands.size() == 4) {
- instr->definitions[0].setFixed(instr->operands[3].physReg());
- } else if (instr->isMIMG() && instr->definitions.size() == 1 &&
- !instr->operands[2].isUndefined()) {
- instr->definitions[0].setFixed(instr->operands[2].physReg());
- }
+ optimize_encoding(program, ctx, register_file, instr);
- ctx.defs_done.reset();
+ /* Handle definitions which must have the same register as an operand.
+ * We expect that the definition has the same size as the operand, otherwise the new
+ * location for the operand (if it's not killed) might intersect with the old one.
+ * We can't read from the old location because it's corrupted, and we can't write the new
+ * location because that's used by a live-through operand.
+ */
+ int op_fixed_to_def = get_op_fixed_to_def(instr.get());
+ if (op_fixed_to_def != -1)
+ instr->definitions[0].setFixed(instr->operands[op_fixed_to_def].physReg());
/* handle fixed definitions first */
for (unsigned i = 0; i < instr->definitions.size(); ++i) {
@@ -2538,8 +3139,7 @@ register_allocation(Program* program, std::vector<IDSet>& live_out_per_block, ra
const PhysRegInterval def_regs{definition.physReg(), definition.size()};
/* create parallelcopy pair to move blocking vars */
- std::set<std::pair<unsigned, unsigned>> vars =
- collect_vars(ctx, register_file, def_regs);
+ std::vector<unsigned> vars = collect_vars(ctx, register_file, def_regs);
RegisterFile tmp_file(register_file);
/* re-enable the killed operands, so that we don't move the blocking vars there */
@@ -2549,19 +3149,16 @@ register_allocation(Program* program, std::vector<IDSet>& live_out_per_block, ra
}
ASSERTED bool success = false;
- DefInfo info(ctx, instr, definition.regClass(), -1);
- success = get_regs_for_copies(ctx, tmp_file, parallelcopy, vars, info.bounds, instr,
- def_regs);
+ success = get_regs_for_copies(ctx, tmp_file, parallelcopy, vars, instr, def_regs);
assert(success);
update_renames(ctx, register_file, parallelcopy, instr, (UpdateRenames)0);
}
- ctx.defs_done.set(i);
if (!definition.isTemp())
continue;
- ctx.assignments[definition.tempId()] = {definition.physReg(), definition.regClass()};
+ ctx.assignments[definition.tempId()].set(definition);
register_file.fill(definition);
}
@@ -2573,18 +3170,30 @@ register_allocation(Program* program, std::vector<IDSet>& live_out_per_block, ra
continue;
/* find free reg */
- if (definition->hasHint() &&
- get_reg_specified(ctx, register_file, definition->regClass(), instr,
- definition->physReg())) {
- definition->setFixed(definition->physReg());
+ if (instr->opcode == aco_opcode::p_start_linear_vgpr) {
+ /* Allocation of linear VGPRs is special. */
+ definition->setFixed(alloc_linear_vgpr(ctx, register_file, instr, parallelcopy));
+ update_renames(ctx, register_file, parallelcopy, instr, rename_not_killed_ops);
} else if (instr->opcode == aco_opcode::p_split_vector) {
PhysReg reg = instr->operands[0].physReg();
+ RegClass rc = definition->regClass();
for (unsigned j = 0; j < i; j++)
reg.reg_b += instr->definitions[j].bytes();
- if (get_reg_specified(ctx, register_file, definition->regClass(), instr, reg))
+ if (get_reg_specified(ctx, register_file, rc, instr, reg)) {
definition->setFixed(reg);
- } else if (instr->opcode == aco_opcode::p_wqm ||
- instr->opcode == aco_opcode::p_parallelcopy) {
+ } else if (i == 0) {
+ RegClass vec_rc = RegClass::get(rc.type(), instr->operands[0].bytes());
+ DefInfo info(ctx, ctx.pseudo_dummy, vec_rc, -1);
+ std::optional<PhysReg> res = get_reg_simple(ctx, register_file, info);
+ if (res && get_reg_specified(ctx, register_file, rc, instr, *res))
+ definition->setFixed(*res);
+ } else if (instr->definitions[i - 1].isFixed()) {
+ reg = instr->definitions[i - 1].physReg();
+ reg.reg_b += instr->definitions[i - 1].bytes();
+ if (get_reg_specified(ctx, register_file, rc, instr, reg))
+ definition->setFixed(reg);
+ }
+ } else if (instr->opcode == aco_opcode::p_parallelcopy) {
PhysReg reg = instr->operands[i].physReg();
if (instr->operands[i].isTemp() &&
instr->operands[i].getTemp().type() == definition->getTemp().type() &&
@@ -2600,6 +3209,14 @@ register_allocation(Program* program, std::vector<IDSet>& live_out_per_block, ra
parallelcopy, instr);
update_renames(ctx, register_file, parallelcopy, instr, (UpdateRenames)0);
definition->setFixed(reg);
+ } else if (instr_info.classes[(int)instr->opcode] == instr_class::wmma &&
+ instr->operands[2].isTemp() && instr->operands[2].isKill() &&
+ instr->operands[2].regClass() == definition->regClass()) {
+ /* For WMMA, the dest needs to either be equal to operands[2], or not overlap it.
+ * Here we set a policy of forcing them the same if operands[2] gets killed (and
+ * otherwise they don't overlap). This may not be optimal if RA would select a
+ * different location due to affinity, but that gets complicated very quickly. */
+ definition->setFixed(instr->operands[2].physReg());
}
if (!definition->isFixed()) {
@@ -2608,7 +3225,8 @@ register_allocation(Program* program, std::vector<IDSet>& live_out_per_block, ra
PhysReg reg = get_reg(ctx, register_file, tmp, parallelcopy, instr);
definition->setFixed(reg);
if (reg.byte() || register_file.test(reg, 4)) {
- add_subdword_definition(program, instr, reg);
+ bool allow_16bit_write = reg.byte() % 2 == 0 && !register_file.test(reg, 2);
+ add_subdword_definition(program, instr, reg, allow_16bit_write);
definition = &instr->definitions[i]; /* add_subdword_definition can invalidate
the reference */
}
@@ -2624,8 +3242,7 @@ register_allocation(Program* program, std::vector<IDSet>& live_out_per_block, ra
definition->isFixed() &&
((definition->getTemp().type() == RegType::vgpr && definition->physReg() >= 256) ||
(definition->getTemp().type() != RegType::vgpr && definition->physReg() < 256)));
- ctx.defs_done.set(i);
- ctx.assignments[definition->tempId()] = {definition->physReg(), definition->regClass()};
+ ctx.assignments[definition->tempId()].set(*definition);
register_file.fill(*definition);
}
@@ -2645,87 +3262,30 @@ register_allocation(Program* program, std::vector<IDSet>& live_out_per_block, ra
add_subdword_operand(ctx, instr, i, op.physReg().byte(), op.regClass());
}
- /* emit parallelcopy */
- if (!parallelcopy.empty()) {
- aco_ptr<Pseudo_instruction> pc;
- pc.reset(create_instruction<Pseudo_instruction>(aco_opcode::p_parallelcopy,
- Format::PSEUDO, parallelcopy.size(),
- parallelcopy.size()));
- bool linear_vgpr = false;
- bool sgpr_operands_alias_defs = false;
- uint64_t sgpr_operands[4] = {0, 0, 0, 0};
- for (unsigned i = 0; i < parallelcopy.size(); i++) {
- linear_vgpr |= parallelcopy[i].first.regClass().is_linear_vgpr();
-
- if (temp_in_scc && parallelcopy[i].first.isTemp() &&
- parallelcopy[i].first.getTemp().type() == RegType::sgpr) {
- if (!sgpr_operands_alias_defs) {
- unsigned reg = parallelcopy[i].first.physReg().reg();
- unsigned size = parallelcopy[i].first.getTemp().size();
- sgpr_operands[reg / 64u] |= u_bit_consecutive64(reg % 64u, size);
-
- reg = parallelcopy[i].second.physReg().reg();
- size = parallelcopy[i].second.getTemp().size();
- if (sgpr_operands[reg / 64u] & u_bit_consecutive64(reg % 64u, size))
- sgpr_operands_alias_defs = true;
- }
- }
-
- pc->operands[i] = parallelcopy[i].first;
- pc->definitions[i] = parallelcopy[i].second;
- assert(pc->operands[i].size() == pc->definitions[i].size());
-
- /* it might happen that the operand is already renamed. we have to restore the
- * original name. */
- std::unordered_map<unsigned, Temp>::iterator it =
- ctx.orig_names.find(pc->operands[i].tempId());
- Temp orig = it != ctx.orig_names.end() ? it->second : pc->operands[i].getTemp();
- ctx.orig_names[pc->definitions[i].tempId()] = orig;
- ctx.renames[block.index][orig.id()] = pc->definitions[i].getTemp();
- }
-
- if (temp_in_scc && (sgpr_operands_alias_defs || linear_vgpr)) {
- /* disable definitions and re-enable operands */
- RegisterFile tmp_file(register_file);
- for (const Definition& def : instr->definitions) {
- if (def.isTemp() && !def.isKill())
- tmp_file.clear(def);
- }
- for (const Operand& op : instr->operands) {
- if (op.isTemp() && op.isFirstKill())
- tmp_file.block(op.physReg(), op.regClass());
- }
-
- handle_pseudo(ctx, tmp_file, pc.get());
- } else {
- pc->tmp_in_scc = false;
- }
-
- instructions.emplace_back(std::move(pc));
- }
+ emit_parallel_copy(ctx, parallelcopy, instr, instructions, temp_in_scc, register_file);
/* some instructions need VOP3 encoding if operand/definition is not assigned to VCC */
bool instr_needs_vop3 =
!instr->isVOP3() &&
- ((instr->format == Format::VOPC && !(instr->definitions[0].physReg() == vcc)) ||
- (instr->opcode == aco_opcode::v_cndmask_b32 &&
- !(instr->operands[2].physReg() == vcc)) ||
+ ((withoutDPP(instr->format) == Format::VOPC &&
+ instr->definitions[0].physReg() != vcc) ||
+ (instr->opcode == aco_opcode::v_cndmask_b32 && instr->operands[2].physReg() != vcc) ||
((instr->opcode == aco_opcode::v_add_co_u32 ||
instr->opcode == aco_opcode::v_addc_co_u32 ||
instr->opcode == aco_opcode::v_sub_co_u32 ||
instr->opcode == aco_opcode::v_subb_co_u32 ||
instr->opcode == aco_opcode::v_subrev_co_u32 ||
instr->opcode == aco_opcode::v_subbrev_co_u32) &&
- !(instr->definitions[1].physReg() == vcc)) ||
+ instr->definitions[1].physReg() != vcc) ||
((instr->opcode == aco_opcode::v_addc_co_u32 ||
instr->opcode == aco_opcode::v_subb_co_u32 ||
instr->opcode == aco_opcode::v_subbrev_co_u32) &&
- !(instr->operands[2].physReg() == vcc)));
+ instr->operands[2].physReg() != vcc));
if (instr_needs_vop3) {
/* if the first operand is a literal, we have to move it to a reg */
if (instr->operands.size() && instr->operands[0].isLiteral() &&
- program->chip_class < GFX10) {
+ program->gfx_level < GFX10) {
bool can_sgpr = true;
            /* check if we have to move to vgpr */
for (const Operand& op : instr->operands) {
@@ -2749,11 +3309,9 @@ register_allocation(Program* program, std::vector<IDSet>& live_out_per_block, ra
aco_ptr<Instruction> mov;
if (can_sgpr)
- mov.reset(create_instruction<SOP1_instruction>(aco_opcode::s_mov_b32,
- Format::SOP1, 1, 1));
+ mov.reset(create_instruction(aco_opcode::s_mov_b32, Format::SOP1, 1, 1));
else
- mov.reset(create_instruction<VOP1_instruction>(aco_opcode::v_mov_b32,
- Format::VOP1, 1, 1));
+ mov.reset(create_instruction(aco_opcode::v_mov_b32, Format::VOP1, 1, 1));
mov->operands[0] = instr->operands[0];
mov->definitions[0] = Definition(tmp);
mov->definitions[0].setFixed(reg);
@@ -2766,47 +3324,42 @@ register_allocation(Program* program, std::vector<IDSet>& live_out_per_block, ra
}
/* change the instruction to VOP3 to enable an arbitrary register pair as dst */
- aco_ptr<Instruction> tmp = std::move(instr);
- Format format = asVOP3(tmp->format);
- instr.reset(create_instruction<VOP3_instruction>(
- tmp->opcode, format, tmp->operands.size(), tmp->definitions.size()));
- std::copy(tmp->operands.begin(), tmp->operands.end(), instr->operands.begin());
- std::copy(tmp->definitions.begin(), tmp->definitions.end(), instr->definitions.begin());
+ instr->format = asVOP3(instr->format);
}
instructions.emplace_back(std::move(*instr_it));
} /* end for Instr */
- block.instructions = std::move(instructions);
- } /* end for BB */
+ if ((block.kind & block_kind_top_level) && block.linear_succs.empty()) {
+ /* Reset this for block_kind_resume. */
+ ctx.num_linear_vgprs = 0;
- /* find scc spill registers which may be needed for parallelcopies created by phis */
- for (Block& block : program->blocks) {
- if (block.linear_preds.size() <= 1)
- continue;
+ ASSERTED PhysRegInterval vgpr_bounds = get_reg_bounds(ctx, RegType::vgpr, false);
+ ASSERTED PhysRegInterval sgpr_bounds = get_reg_bounds(ctx, RegType::sgpr, false);
+ assert(register_file.count_zero(vgpr_bounds) == ctx.vgpr_bounds);
+ assert(register_file.count_zero(sgpr_bounds) == ctx.sgpr_bounds);
+ } else if (should_compact_linear_vgprs(ctx, live_vars, register_file)) {
+ aco_ptr<Instruction> br = std::move(instructions.back());
+ instructions.pop_back();
- std::bitset<128> regs = sgpr_live_in[block.index];
- if (!regs[127])
- continue;
+ bool temp_in_scc =
+ register_file[scc] || (!br->operands.empty() && br->operands[0].physReg() == scc);
- /* choose a register */
- int16_t reg = 0;
- for (; reg < ctx.program->max_reg_demand.sgpr && regs[reg]; reg++)
- ;
- assert(reg < ctx.program->max_reg_demand.sgpr);
- adjust_max_used_regs(ctx, s1, reg);
+ std::vector<std::pair<Operand, Definition>> parallelcopy;
+ compact_linear_vgprs(ctx, register_file, parallelcopy);
+ update_renames(ctx, register_file, parallelcopy, br, rename_not_killed_ops);
+ emit_parallel_copy_internal(ctx, parallelcopy, br, instructions, temp_in_scc, register_file);
- /* update predecessors */
- for (unsigned& pred_index : block.linear_preds) {
- Block& pred = program->blocks[pred_index];
- pred.scc_live_out = true;
- pred.scratch_sgpr = PhysReg{(uint16_t)reg};
+ instructions.push_back(std::move(br));
}
- }
+
+ block.instructions = std::move(instructions);
+ } /* end for BB */
/* num_gpr = rnd_up(max_used_gpr + 1) */
- program->config->num_vgprs = get_vgpr_alloc(program, ctx.max_used_vgpr + 1);
+ program->config->num_vgprs =
+ std::min<uint16_t>(get_vgpr_alloc(program, ctx.max_used_vgpr + 1), 256);
program->config->num_sgprs = get_sgpr_alloc(program, ctx.max_used_sgpr + 1);
program->progress = CompilationProgress::after_ra;