1 file changed, 1282 insertions, 729 deletions
diff --git a/src/amd/compiler/aco_register_allocation.cpp b/src/amd/compiler/aco_register_allocation.cpp
index d474dfe463a..47fefded1e5 100644
--- a/src/amd/compiler/aco_register_allocation.cpp
+++ b/src/amd/compiler/aco_register_allocation.cpp
@@ -1,33 +1,19 @@
 /*
  * Copyright © 2018 Valve Corporation
  *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- *
+ * SPDX-License-Identifier: MIT
  */
 
 #include "aco_ir.h"
 
+#include "util/bitset.h"
+#include "util/enum_operators.h"
+
 #include <algorithm>
 #include <array>
 #include <bitset>
 #include <map>
+#include <optional>
 #include <set>
 #include <unordered_map>
 #include <vector>
@@ -37,20 +23,35 @@ namespace {
 
 struct ra_ctx;
 
-unsigned get_subdword_operand_stride(chip_class chip, const aco_ptr<Instruction>& instr,
+unsigned get_subdword_operand_stride(amd_gfx_level gfx_level, const aco_ptr<Instruction>& instr,
                                      unsigned idx, RegClass rc);
 void add_subdword_operand(ra_ctx& ctx, aco_ptr<Instruction>& instr, unsigned idx, unsigned byte,
                           RegClass rc);
 std::pair<unsigned, unsigned>
 get_subdword_definition_info(Program* program, const aco_ptr<Instruction>& instr, RegClass rc);
-void add_subdword_definition(Program* program, aco_ptr<Instruction>& instr, PhysReg reg);
+void add_subdword_definition(Program* program, aco_ptr<Instruction>& instr, PhysReg reg,
+                             bool allow_16bit_write);
 
 struct assignment {
    PhysReg reg;
    RegClass rc;
-   uint8_t assigned = 0;
+   union {
+      struct {
+         bool assigned : 1;
+         bool vcc : 1;
+         bool m0 : 1;
+      };
+      uint8_t _ = 0;
+   };
+   uint32_t affinity = 0;
    assignment() = default;
-   assignment(PhysReg reg_, RegClass rc_) : reg(reg_), rc(rc_), assigned(-1) {}
+   assignment(PhysReg reg_, RegClass rc_) : reg(reg_), rc(rc_) { assigned = true; }
+   void set(const Definition& def)
+   {
+      assigned = true;
+      reg = def.physReg();
+      rc = def.regClass();
+   }
 };
 
 struct ra_ctx {
@@ -61,16 +62,19 @@ struct ra_ctx {
    std::vector<std::unordered_map<unsigned, Temp>> renames;
    std::vector<uint32_t> loop_header;
    std::unordered_map<unsigned, Temp> orig_names;
-   std::unordered_map<unsigned, unsigned> affinities;
    std::unordered_map<unsigned, Instruction*> vectors;
    std::unordered_map<unsigned, Instruction*> split_vectors;
    aco_ptr<Instruction> pseudo_dummy;
+   aco_ptr<Instruction> phi_dummy;
    uint16_t max_used_sgpr = 0;
    uint16_t max_used_vgpr = 0;
    uint16_t sgpr_limit;
    uint16_t vgpr_limit;
    std::bitset<512> war_hint;
-   std::bitset<64> defs_done; /* see MAX_ARGS in aco_instruction_selection_setup.cpp */
+
+   uint16_t sgpr_bounds;
+   uint16_t vgpr_bounds;
+   uint16_t num_linear_vgprs;
 
    ra_test_policy policy;
 
@@ -78,10 +82,14 @@ struct ra_ctx {
        : program(program_), assignments(program->peekAllocationId()),
          renames(program->blocks.size()), policy(policy_)
    {
-      pseudo_dummy.reset(
-         create_instruction<Instruction>(aco_opcode::p_parallelcopy, Format::PSEUDO, 0, 0));
+      pseudo_dummy.reset(create_instruction(aco_opcode::p_parallelcopy, Format::PSEUDO, 0, 0));
+      phi_dummy.reset(create_instruction(aco_opcode::p_linear_phi, Format::PSEUDO, 0, 0));
       sgpr_limit = get_addr_sgpr_from_waves(program, program->min_waves);
-      vgpr_limit = get_addr_sgpr_from_waves(program, program->min_waves);
+      vgpr_limit = get_addr_vgpr_from_waves(program, program->min_waves);
+
+      sgpr_bounds = program->max_reg_demand.sgpr;
+      vgpr_bounds = program->max_reg_demand.vgpr;
+      num_linear_vgprs = 0;
    }
 };
 
@@ -153,7 +161,7 @@ struct PhysRegInterval {
 bool
 intersects(const PhysRegInterval& a, const PhysRegInterval& b)
 {
-   return ((a.lo() >= b.lo() && a.lo() < b.hi()) || (a.hi() > b.lo() && a.hi() <= b.hi()));
+   return a.hi() > b.lo() && b.hi() > a.lo();
 }
 
 /* Gets the stride for full (non-subdword) registers */
@@ -175,15 +183,24 @@ get_stride(RegClass rc)
 }
 
 PhysRegInterval
-get_reg_bounds(Program* program, RegType type)
+get_reg_bounds(ra_ctx& ctx, RegType type, bool linear_vgpr)
 {
-   if (type == RegType::vgpr) {
-      return {PhysReg{256}, (unsigned)program->max_reg_demand.vgpr};
+   uint16_t linear_vgpr_start = ctx.vgpr_bounds - ctx.num_linear_vgprs;
+   if (type == RegType::vgpr && linear_vgpr) {
+      return PhysRegInterval{PhysReg(256 + linear_vgpr_start), ctx.num_linear_vgprs};
+   } else if (type == RegType::vgpr) {
+      return PhysRegInterval{PhysReg(256), linear_vgpr_start};
    } else {
-      return {PhysReg{0}, (unsigned)program->max_reg_demand.sgpr};
+      return PhysRegInterval{PhysReg(0), ctx.sgpr_bounds};
    }
 }
 
+PhysRegInterval
+get_reg_bounds(ra_ctx& ctx, RegClass rc)
+{
+   return get_reg_bounds(ctx, rc.type(), rc.is_linear_vgpr());
+}
+
 struct DefInfo {
    PhysRegInterval bounds;
    uint8_t size;
@@ -195,11 +212,11 @@ struct DefInfo {
       size = rc.size();
       stride = get_stride(rc);
 
-      bounds = get_reg_bounds(ctx.program, rc.type());
+      bounds = get_reg_bounds(ctx, rc);
 
       if (rc.is_subdword() && operand >= 0) {
          /* stride in bytes */
-         stride = get_subdword_operand_stride(ctx.program->chip_class, instr, operand, rc);
+         stride = get_subdword_operand_stride(ctx.program->gfx_level, instr, operand, rc);
       } else if (rc.is_subdword()) {
          std::pair<unsigned, unsigned> info = get_subdword_definition_info(ctx.program, instr, rc);
          stride = info.first;
@@ -214,6 +231,20 @@ struct DefInfo {
                stride = DIV_ROUND_UP(stride, 4);
          }
          assert(stride > 0);
+      } else if (instr->isMIMG() && instr->mimg().d16 && ctx.program->gfx_level <= GFX9) {
+         /* Workaround GFX9 hardware bug for D16 image instructions: FeatureImageGather4D16Bug
+          *
+          * The register use is not calculated correctly, and the hardware assumes a
+          * full dword per component. Don't use the last registers of the register file.
+          * Otherwise, the instruction will be skipped. The reduction is computed in signed
+          * arithmetic: it must clamp to 0 once the linear VGPRs already cover those registers.
+          * https://reviews.llvm.org/D81172
+          */
+         bool imageGather4D16Bug = operand == -1 && rc == v2 && instr->mimg().dmask != 0xF;
+         assert(ctx.program->gfx_level == GFX9 && "Image D16 on GFX8 not supported.");
+
+         if (imageGather4D16Bug)
+            bounds.size -= MAX2((int)(rc.bytes() / 4) - (int)ctx.num_linear_vgprs, 0);
       }
    }
 };
@@ -229,7 +260,7 @@ public:
 
    uint32_t& operator[](PhysReg index) { return regs[index]; }
 
-   unsigned count_zero(PhysRegInterval reg_interval)
+   unsigned count_zero(PhysRegInterval reg_interval) const
    {
       unsigned res = 0;
       for (PhysReg reg : reg_interval)
@@ -238,16 +269,17 @@ public:
    }
 
    /* Returns true if any of the bytes in the given range are allocated or blocked */
-   bool test(PhysReg start, unsigned num_bytes)
+   bool test(PhysReg start, unsigned num_bytes) const
    {
       for (PhysReg i = start; i.reg_b < start.reg_b + num_bytes; i = PhysReg(i + 1)) {
          assert(i <= 511);
          if (regs[i] & 0x0FFFFFFF)
             return true;
          if (regs[i] == 0xF0000000) {
-            assert(subdword_regs.find(i) != subdword_regs.end());
+            auto it = subdword_regs.find(i);
+            assert(it != subdword_regs.end());
             for (unsigned j = i.byte(); i * 4 + j < start.reg_b + num_bytes && j < 4; j++) {
-               if (subdword_regs[i][j])
+               if (it->second[j])
                   return true;
             }
          }
@@ -263,24 +295,28 @@ public:
          fill(start, rc.size(), 0xFFFFFFFF);
    }
 
-   bool is_blocked(PhysReg start)
+   bool is_blocked(PhysReg start) const
    {
       if (regs[start] == 0xFFFFFFFF)
          return true;
       if (regs[start] == 0xF0000000) {
+         auto it = subdword_regs.find(start);
+         assert(it != subdword_regs.end());
          for (unsigned i = start.byte(); i < 4; i++)
-            if (subdword_regs[start][i] == 0xFFFFFFFF)
+            if (it->second[i] == 0xFFFFFFFF)
                return true;
       }
       return false;
    }
 
-   bool is_empty_or_blocked(PhysReg start)
+   bool is_empty_or_blocked(PhysReg start) const
    {
       /* Empty is 0, blocked is 0xFFFFFFFF, so to check both we compare the
        * incremented value to 1 */
       if (regs[start] == 0xF0000000) {
-         return subdword_regs[start][start.byte()] + 1 <= 1;
+         auto it = subdword_regs.find(start);
+         assert(it != subdword_regs.end());
+         return it->second[start.byte()] + 1 <= 1;
       }
       return regs[start] + 1 <= 1;
    }
@@ -313,9 +349,9 @@ public:
 
    void clear(Definition def) { clear(def.physReg(), def.regClass()); }
 
-   unsigned get_id(PhysReg reg)
+   unsigned get_id(PhysReg reg) const
    {
-      return regs[reg] == 0xF0000000 ? subdword_regs[reg][reg.byte()] : regs[reg];
+      return regs[reg] == 0xF0000000 ? subdword_regs.at(reg)[reg.byte()] : regs[reg];
    }
 
 private:
@@ -343,24 +379,24 @@ private:
    }
 };
 
-std::set<std::pair<unsigned, unsigned>> find_vars(ra_ctx& ctx, RegisterFile& reg_file,
-                                                  const PhysRegInterval reg_interval);
+std::vector<unsigned> find_vars(ra_ctx& ctx, const RegisterFile& reg_file,
+                                const PhysRegInterval reg_interval);
 
 /* helper function for debugging */
 UNUSED void
 print_reg(const RegisterFile& reg_file, PhysReg reg, bool has_adjacent_variable)
 {
    if (reg_file[reg] == 0xFFFFFFFF) {
-      printf("☐");
+      printf((const char*)u8"☐");
    } else if (reg_file[reg]) {
       const bool show_subdword_alloc = (reg_file[reg] == 0xF0000000);
       if (show_subdword_alloc) {
-         const char* block_chars[] = {
+         auto block_chars = {
             // clang-format off
-            "?", "▘", "▝", "▀",
-            "▖", "▌", "▞", "▛",
-            "▗", "▚", "▐", "▜",
-            "▄", "▙", "▟", "▉"
+            u8"?", u8"▘", u8"▝", u8"▀",
+            u8"▖", u8"▌", u8"▞", u8"▛",
+            u8"▗", u8"▚", u8"▐", u8"▜",
+            u8"▄", u8"▙", u8"▟", u8"▉"
             // clang-format on
          };
          unsigned index = 0;
@@ -369,27 +405,26 @@ print_reg(const RegisterFile& reg_file, PhysReg reg, bool has_adjacent_variable)
                index |= 1 << i;
             }
          }
-         printf("%s", block_chars[index]);
+         printf("%s", (const char*)(block_chars.begin()[index]));
       } else {
          /* Indicate filled register slot */
          if (!has_adjacent_variable) {
-            printf("█");
+            printf((const char*)u8"█");
          } else {
             /* Use a slightly shorter box to leave a small gap between adjacent variables */
-            printf("▉");
+            printf((const char*)u8"▉");
          }
       }
    } else {
-      printf("·");
+      printf((const char*)u8"·");
    }
 }
 
 /* helper function for debugging */
 UNUSED void
-print_regs(ra_ctx& ctx, bool vgprs, RegisterFile& reg_file)
+print_regs(ra_ctx& ctx, PhysRegInterval regs, const RegisterFile& reg_file)
 {
-   PhysRegInterval regs = get_reg_bounds(ctx.program, vgprs ? RegType::vgpr : RegType::sgpr);
-   char reg_char = vgprs ? 'v' : 's';
+   char reg_char = regs.lo().reg() >= 256 ? 'v' : 's';
    const int max_regs_per_line = 64;
 
    /* print markers */
@@ -428,11 +463,11 @@ print_regs(ra_ctx& ctx, bool vgprs, RegisterFile& reg_file)
    printf("%u/%u used, %u/%u free\n", regs.size - free_regs, regs.size, free_regs, regs.size);
 
    /* print assignments ordered by registers */
-   std::map<PhysReg, std::pair<unsigned, unsigned>>
-      regs_to_vars; /* maps to byte size and temp id */
-   for (const auto& size_id : find_vars(ctx, reg_file, regs)) {
-      auto reg = ctx.assignments[size_id.second].reg;
-      ASSERTED auto inserted = regs_to_vars.emplace(reg, size_id);
+   std::map<PhysReg, std::pair<unsigned, unsigned>> regs_to_vars; /* maps to byte size and temp id */
+   for (unsigned id : find_vars(ctx, reg_file, regs)) {
+      const assignment& var = ctx.assignments[id];
+      PhysReg reg = var.reg;
+      ASSERTED auto inserted = regs_to_vars.emplace(reg, std::make_pair(var.rc.bytes(), id));
       assert(inserted.second);
    }
 
@@ -445,11 +480,11 @@ print_regs(ra_ctx& ctx, bool vgprs, RegisterFile& reg_file)
           ctx.orig_names[size_id.second].id() != size_id.second) {
          printf("(was %%%d) ", ctx.orig_names[size_id.second].id());
       }
-      printf("= %c[%d", reg_char, first_reg.reg() - regs.lo());
+      printf("= %c[%d", reg_char, first_reg.reg() % 256);
       PhysReg last_reg = first_reg.advance(size_id.first - 1);
       if (first_reg.reg() != last_reg.reg()) {
          assert(first_reg.byte() == 0 && last_reg.byte() == 3);
-         printf("-%d", last_reg.reg() - regs.lo());
+         printf("-%d", last_reg.reg() % 256);
       }
       printf("]");
       if (first_reg.byte() != 0 || last_reg.byte() != 3) {
@@ -460,14 +495,14 @@ print_regs(ra_ctx& ctx, bool vgprs, RegisterFile& reg_file)
 }
 
 unsigned
-get_subdword_operand_stride(chip_class chip, const aco_ptr<Instruction>& instr, unsigned idx,
-                            RegClass rc)
+get_subdword_operand_stride(amd_gfx_level gfx_level, const aco_ptr<Instruction>& instr,
+                            unsigned idx, RegClass rc)
 {
    if (instr->isPseudo()) {
       /* v_readfirstlane_b32 cannot use SDWA */
       if (instr->opcode == aco_opcode::p_as_uniform)
          return 4;
-      else if (chip >= GFX8)
+      else if (gfx_level >= GFX8)
          return rc.bytes() % 2 == 0 ? 2 : 1;
       else
          return 4;
@@ -475,26 +510,27 @@ get_subdword_operand_stride(chip_class chip, const aco_ptr<Instruction>& instr,
 
    assert(rc.bytes() <= 2);
    if (instr->isVALU()) {
-      if (can_use_SDWA(chip, instr, false))
+      if (can_use_SDWA(gfx_level, instr, false))
          return rc.bytes();
-      if (can_use_opsel(chip, instr->opcode, idx, true))
+      if (can_use_opsel(gfx_level, instr->opcode, idx))
          return 2;
-      if (instr->format == Format::VOP3P)
+      if (instr->isVOP3P())
          return 2;
    }
 
    switch (instr->opcode) {
    case aco_opcode::v_cvt_f32_ubyte0: return 1;
    case aco_opcode::ds_write_b8:
-   case aco_opcode::ds_write_b16: return chip >= GFX9 ? 2 : 4;
+   case aco_opcode::ds_write_b16: return gfx_level >= GFX9 ? 2 : 4;
    case aco_opcode::buffer_store_byte:
    case aco_opcode::buffer_store_short:
+   case aco_opcode::buffer_store_format_d16_x:
    case aco_opcode::flat_store_byte:
    case aco_opcode::flat_store_short:
    case aco_opcode::scratch_store_byte:
    case aco_opcode::scratch_store_short:
    case aco_opcode::global_store_byte:
-   case aco_opcode::global_store_short: return chip >= GFX9 ? 2 : 4;
+   case aco_opcode::global_store_short: return gfx_level >= GFX9 ? 2 : 4;
    default: return 4;
    }
 }
@@ -503,24 +539,12 @@ void
 add_subdword_operand(ra_ctx& ctx, aco_ptr<Instruction>& instr, unsigned idx, unsigned byte,
                      RegClass rc)
 {
-   chip_class chip = ctx.program->chip_class;
+   amd_gfx_level gfx_level = ctx.program->gfx_level;
    if (instr->isPseudo() || byte == 0)
       return;
 
    assert(rc.bytes() <= 2);
    if (instr->isVALU()) {
-      /* check if we can use opsel */
-      if (instr->format == Format::VOP3) {
-         assert(byte == 2);
-         instr->vop3().opsel |= 1 << idx;
-         return;
-      }
-      if (instr->isVOP3P()) {
-         assert(byte == 2 && !(instr->vop3p().opsel_lo & (1 << idx)));
-         instr->vop3p().opsel_lo |= 1 << idx;
-         instr->vop3p().opsel_hi |= 1 << idx;
-         return;
-      }
       if (instr->opcode == aco_opcode::v_cvt_f32_ubyte0) {
          switch (byte) {
          case 0: instr->opcode = aco_opcode::v_cvt_f32_ubyte0; break;
@@ -532,8 +556,21 @@ add_subdword_operand(ra_ctx& ctx, aco_ptr<Instruction>& instr, unsigned idx, uns
       }
 
       /* use SDWA */
-      assert(can_use_SDWA(chip, instr, false));
-      convert_to_SDWA(chip, instr);
+      if (can_use_SDWA(gfx_level, instr, false)) {
+         convert_to_SDWA(gfx_level, instr);
+         return;
+      }
+
+      /* use opsel */
+      if (instr->isVOP3P()) {
+         assert(byte == 2 && !instr->valu().opsel_lo[idx]);
+         instr->valu().opsel_lo[idx] = true;
+         instr->valu().opsel_hi[idx] = true;
+         return;
+      }
+
+      assert(can_use_opsel(gfx_level, instr->opcode, idx));
+      instr->valu().opsel[idx] = true;
       return;
    }
 
@@ -546,6 +583,8 @@ add_subdword_operand(ra_ctx& ctx, aco_ptr<Instruction>& instr, unsigned idx, uns
       instr->opcode = aco_opcode::buffer_store_byte_d16_hi;
    else if (instr->opcode == aco_opcode::buffer_store_short)
       instr->opcode = aco_opcode::buffer_store_short_d16_hi;
+   else if (instr->opcode == aco_opcode::buffer_store_format_d16_x)
+      instr->opcode = aco_opcode::buffer_store_format_d16_hi_x;
    else if (instr->opcode == aco_opcode::flat_store_byte)
       instr->opcode = aco_opcode::flat_store_byte_d16_hi;
    else if (instr->opcode == aco_opcode::flat_store_short)
@@ -567,34 +606,38 @@ add_subdword_operand(ra_ctx& ctx, aco_ptr<Instruction>& instr, unsigned idx, uns
 std::pair<unsigned, unsigned>
 get_subdword_definition_info(Program* program, const aco_ptr<Instruction>& instr, RegClass rc)
 {
-   chip_class chip = program->chip_class;
+   amd_gfx_level gfx_level = program->gfx_level;
 
    if (instr->isPseudo()) {
-      if (chip >= GFX8)
+      if (instr->opcode == aco_opcode::p_interp_gfx11)
+         return std::make_pair(4u, 4u);
+      else if (gfx_level >= GFX8)
          return std::make_pair(rc.bytes() % 2 == 0 ? 2 : 1, rc.bytes());
       else
          return std::make_pair(4, rc.size() * 4u);
    }
 
-   if (instr->isVALU() || instr->isVINTRP()) {
+   if (instr->isVALU()) {
       assert(rc.bytes() <= 2);
 
-      if (can_use_SDWA(chip, instr, false))
+      if (can_use_SDWA(gfx_level, instr, false))
          return std::make_pair(rc.bytes(), rc.bytes());
 
       unsigned bytes_written = 4u;
-      if (instr_is_16bit(chip, instr->opcode))
+      if (instr_is_16bit(gfx_level, instr->opcode))
          bytes_written = 2u;
 
       unsigned stride = 4u;
       if (instr->opcode == aco_opcode::v_fma_mixlo_f16 ||
-          can_use_opsel(chip, instr->opcode, -1, true))
+          can_use_opsel(gfx_level, instr->opcode, -1))
          stride = 2u;
 
       return std::make_pair(stride, bytes_written);
    }
 
    switch (instr->opcode) {
+   case aco_opcode::v_interp_p2_f16: return std::make_pair(2u, 2u);
+   /* D16 loads with _hi version */
    case aco_opcode::ds_read_u8_d16:
    case aco_opcode::ds_read_i8_d16:
    case aco_opcode::ds_read_u16_d16:
@@ -609,58 +652,80 @@ get_subdword_definition_info(Program* program, const aco_ptr<Instruction>& instr
    case aco_opcode::scratch_load_short_d16:
    case aco_opcode::buffer_load_ubyte_d16:
    case aco_opcode::buffer_load_sbyte_d16:
-   case aco_opcode::buffer_load_short_d16: {
-      assert(chip >= GFX9);
+   case aco_opcode::buffer_load_short_d16:
+   case aco_opcode::buffer_load_format_d16_x: {
+      assert(gfx_level >= GFX9);
       if (!program->dev.sram_ecc_enabled)
          return std::make_pair(2u, 2u);
       else
          return std::make_pair(2u, 4u);
    }
+   /* 3-component D16 loads */
+   case aco_opcode::buffer_load_format_d16_xyz:
+   case aco_opcode::tbuffer_load_format_d16_xyz: {
+      assert(gfx_level >= GFX9);
+      if (!program->dev.sram_ecc_enabled)
+         return std::make_pair(4u, 6u);
+      break;
+   }
 
-   default: return std::make_pair(4, rc.size() * 4u);
+   default: break;
    }
+
+   if (instr->isMIMG() && instr->mimg().d16 && !program->dev.sram_ecc_enabled) {
+      assert(gfx_level >= GFX9);
+      return std::make_pair(4u, rc.bytes());
+   }
+
+   return std::make_pair(4, rc.size() * 4u);
 }
 
 void
-add_subdword_definition(Program* program, aco_ptr<Instruction>& instr, PhysReg reg)
+add_subdword_definition(Program* program, aco_ptr<Instruction>& instr, PhysReg reg,
+                        bool allow_16bit_write)
 {
    if (instr->isPseudo())
       return;
 
    if (instr->isVALU()) {
-      chip_class chip = program->chip_class;
+      amd_gfx_level gfx_level = program->gfx_level;
       assert(instr->definitions[0].bytes() <= 2);
 
-      if (reg.byte() == 0 && instr_is_16bit(chip, instr->opcode))
+      if (reg.byte() == 0 && allow_16bit_write && instr_is_16bit(gfx_level, instr->opcode))
          return;
 
-      /* check if we can use opsel */
-      if (instr->format == Format::VOP3) {
-         assert(reg.byte() == 2);
-         assert(can_use_opsel(chip, instr->opcode, -1, true));
-         instr->vop3().opsel |= (1 << 3); /* dst in high half */
+      /* use SDWA */
+      if (can_use_SDWA(gfx_level, instr, false)) {
+         convert_to_SDWA(gfx_level, instr);
          return;
       }
 
+      assert(allow_16bit_write);
+
       if (instr->opcode == aco_opcode::v_fma_mixlo_f16) {
          instr->opcode = aco_opcode::v_fma_mixhi_f16;
          return;
       }
 
-      /* use SDWA */
-      assert(can_use_SDWA(chip, instr, false));
-      convert_to_SDWA(chip, instr);
+      /* use opsel */
+      assert(reg.byte() == 2);
+      assert(can_use_opsel(gfx_level, instr->opcode, -1));
+      instr->valu().opsel[3] = true; /* dst in high half */
       return;
    }
 
    if (reg.byte() == 0)
       return;
+   else if (instr->opcode == aco_opcode::v_interp_p2_f16)
+      instr->opcode = aco_opcode::v_interp_p2_hi_f16;
    else if (instr->opcode == aco_opcode::buffer_load_ubyte_d16)
       instr->opcode = aco_opcode::buffer_load_ubyte_d16_hi;
    else if (instr->opcode == aco_opcode::buffer_load_sbyte_d16)
       instr->opcode = aco_opcode::buffer_load_sbyte_d16_hi;
    else if (instr->opcode == aco_opcode::buffer_load_short_d16)
       instr->opcode = aco_opcode::buffer_load_short_d16_hi;
+   else if (instr->opcode == aco_opcode::buffer_load_format_d16_x)
+      instr->opcode = aco_opcode::buffer_load_format_d16_hi_x;
    else if (instr->opcode == aco_opcode::flat_load_ubyte_d16)
       instr->opcode = aco_opcode::flat_load_ubyte_d16_hi;
    else if (instr->opcode == aco_opcode::flat_load_sbyte_d16)
@@ -697,6 +762,7 @@ adjust_max_used_regs(ra_ctx& ctx, RegClass rc, unsigned reg)
    if (rc.type() == RegType::vgpr) {
       assert(reg >= 256);
       uint16_t hi = reg - 256 + size - 1;
+      assert(hi <= 255);
       ctx.max_used_vgpr = std::max(ctx.max_used_vgpr, hi);
    } else if (reg + rc.size() <= max_addressible_sgpr) {
       uint16_t hi = reg + size - 1;
@@ -707,6 +773,7 @@ adjust_max_used_regs(ra_ctx& ctx, RegClass rc, unsigned reg)
 enum UpdateRenames {
    rename_not_killed_ops = 0x1,
    fill_killed_ops = 0x2,
+   rename_precolored_ops = 0x4,
 };
 MESA_DEFINE_CPP_ENUM_BITFIELD_OPERATORS(UpdateRenames);
 
@@ -779,28 +846,39 @@ update_renames(ra_ctx& ctx, RegisterFile& reg_file,
       assert(ctx.assignments.size() == ctx.program->peekAllocationId());
 
       /* check if we moved an operand */
-      bool first = true;
+      bool first[2] = {true, true};
       bool fill = true;
       for (unsigned i = 0; i < instr->operands.size(); i++) {
          Operand& op = instr->operands[i];
          if (!op.isTemp())
             continue;
          if (op.tempId() == copy.first.tempId()) {
-            bool omit_renaming = !(flags & rename_not_killed_ops) && !op.isKillBeforeDef();
-            for (std::pair<Operand, Definition>& pc : parallelcopies) {
-               PhysReg def_reg = pc.second.physReg();
-               omit_renaming &= def_reg > copy.first.physReg()
-                                   ? (copy.first.physReg() + copy.first.size() <= def_reg.reg())
-                                   : (def_reg + pc.second.size() <= copy.first.physReg().reg());
+            /* only rename precolored operands if the copy-location matches */
+            bool omit_renaming = (flags & rename_precolored_ops) && op.isFixed() &&
+                                 op.physReg() != copy.second.physReg();
+
+            /* Omit renaming in some cases for p_create_vector in order to avoid
+             * unnecessary shuffle code. */
+            if (!(flags & rename_not_killed_ops) && !op.isKillBeforeDef()) {
+               omit_renaming = true;
+               for (std::pair<Operand, Definition>& pc : parallelcopies) {
+                  PhysReg def_reg = pc.second.physReg();
+                  omit_renaming &= def_reg > copy.first.physReg()
+                                      ? (copy.first.physReg() + copy.first.size() <= def_reg.reg())
+                                      : (def_reg + pc.second.size() <= copy.first.physReg().reg());
+               }
             }
-            if (omit_renaming) {
-               if (first)
-                  op.setFirstKill(true);
-               else
-                  op.setKill(true);
-               first = false;
+
+            /* Fix the kill flags */
+            if (first[omit_renaming])
+               op.setFirstKill(omit_renaming || op.isKill());
+            else
+               op.setKill(omit_renaming || op.isKill());
+            first[omit_renaming] = false;
+
+            if (omit_renaming)
                continue;
-            }
+
             op.setTemp(copy.second.getTemp());
             op.setFixed(copy.second.physReg());
 
@@ -815,8 +893,8 @@ update_renames(ra_ctx& ctx, RegisterFile& reg_file,
    }
 }
 
-std::pair<PhysReg, bool>
-get_reg_simple(ra_ctx& ctx, RegisterFile& reg_file, DefInfo info)
+std::optional<PhysReg>
+get_reg_simple(ra_ctx& ctx, const RegisterFile& reg_file, DefInfo info)
 {
    const PhysRegInterval& bounds = info.bounds;
    uint32_t size = info.size;
@@ -829,8 +907,8 @@ get_reg_simple(ra_ctx& ctx, RegisterFile& reg_file, DefInfo info)
       if (size % new_stride)
          continue;
       new_info.stride = new_stride;
-      std::pair<PhysReg, bool> res = get_reg_simple(ctx, reg_file, new_info);
-      if (res.second)
+      std::optional<PhysReg> res = get_reg_simple(ctx, reg_file, new_info);
+      if (res)
          return res;
    }
 
@@ -864,7 +942,7 @@ get_reg_simple(ra_ctx& ctx, RegisterFile& reg_file, DefInfo info)
          /* early return on exact matches */
          if (size == gap.size) {
             adjust_max_used_regs(ctx, rc, gap.lo());
-            return {gap.lo(), true};
+            return gap.lo();
          }
 
          /* check if it fits and the gap size is smaller */
@@ -877,7 +955,7 @@ get_reg_simple(ra_ctx& ctx, RegisterFile& reg_file, DefInfo info)
       }
 
       if (best_gap.size == UINT_MAX)
-         return {{}, false};
+         return {};
 
       /* find best position within gap by leaving a good stride for other variables*/
       unsigned buffer = best_gap.size - size;
@@ -889,7 +967,7 @@ get_reg_simple(ra_ctx& ctx, RegisterFile& reg_file, DefInfo info)
       }
 
       adjust_max_used_regs(ctx, rc, best_gap.lo());
-      return {best_gap.lo(), true};
+      return best_gap.lo();
    }
 
    for (PhysRegInterval reg_win = {bounds.lo(), size}; reg_win.hi() <= bounds.hi();
@@ -901,7 +979,7 @@ get_reg_simple(ra_ctx& ctx, RegisterFile& reg_file, DefInfo info)
       bool is_valid = std::all_of(std::next(reg_win.begin()), reg_win.end(), is_free);
       if (is_valid) {
          adjust_max_used_regs(ctx, rc, reg_win.lo());
-         return {reg_win.lo(), true};
+         return reg_win.lo();
       }
    }
 
@@ -909,7 +987,8 @@ get_reg_simple(ra_ctx& ctx, RegisterFile& reg_file, DefInfo info)
     * larger instruction encodings or copies
     * TODO: don't do this in situations where it doesn't benefit */
    if (rc.is_subdword()) {
-      for (std::pair<const uint32_t, std::array<uint32_t, 4>>& entry : reg_file.subdword_regs) {
+      for (const std::pair<const uint32_t, std::array<uint32_t, 4>>& entry :
+           reg_file.subdword_regs) {
          assert(reg_file[PhysReg{entry.first}] == 0xF0000000);
          if (!bounds.contains({PhysReg{entry.first}, rc.size()}))
             continue;
@@ -928,119 +1007,172 @@ get_reg_simple(ra_ctx& ctx, RegisterFile& reg_file, DefInfo info)
                PhysReg res{entry.first};
                res.reg_b += i;
                adjust_max_used_regs(ctx, rc, entry.first);
-               return {res, true};
+               return res;
             }
          }
       }
    }
 
-   return {{}, false};
+   return {};
 }
 
-/* collect variables from a register area and clear reg_file */
-std::set<std::pair<unsigned, unsigned>>
-find_vars(ra_ctx& ctx, RegisterFile& reg_file, const PhysRegInterval reg_interval)
+/* collect variables from a register area */
+std::vector<unsigned>
+find_vars(ra_ctx& ctx, const RegisterFile& reg_file, const PhysRegInterval reg_interval)
 {
-   std::set<std::pair<unsigned, unsigned>> vars;
+   std::vector<unsigned> vars;
    for (PhysReg j : reg_interval) {
       if (reg_file.is_blocked(j))
          continue;
       if (reg_file[j] == 0xF0000000) {
          for (unsigned k = 0; k < 4; k++) {
-            unsigned id = reg_file.subdword_regs[j][k];
-            if (id) {
-               assignment& var = ctx.assignments[id];
-               vars.emplace(var.rc.bytes(), id);
-            }
+            unsigned id = reg_file.subdword_regs.at(j)[k];
+            if (id && (vars.empty() || id != vars.back()))
+               vars.emplace_back(id);
          }
-      } else if (reg_file[j] != 0) {
+      } else {
          unsigned id = reg_file[j];
-         assignment& var = ctx.assignments[id];
-         vars.emplace(var.rc.bytes(), id);
+         if (id && (vars.empty() || id != vars.back()))
+            vars.emplace_back(id);
       }
    }
    return vars;
 }
 
-/* collect variables from a register area and clear reg_file */
-std::set<std::pair<unsigned, unsigned>>
+/* Collect the variables from a register area and clear them from reg_file.
+ * The returned IDs are sorted by decreasing size and, for equal sizes,
+ * by increasing assigned register.
+ */
+std::vector<unsigned>
 collect_vars(ra_ctx& ctx, RegisterFile& reg_file, const PhysRegInterval reg_interval)
 {
-   std::set<std::pair<unsigned, unsigned>> vars = find_vars(ctx, reg_file, reg_interval);
-   for (std::pair<unsigned, unsigned> size_id : vars) {
-      assignment& var = ctx.assignments[size_id.second];
+   std::vector<unsigned> ids = find_vars(ctx, reg_file, reg_interval);
+   std::sort(ids.begin(), ids.end(),
+             [&](unsigned a, unsigned b)
+             {
+                assignment& var_a = ctx.assignments[a];
+                assignment& var_b = ctx.assignments[b];
+                return var_a.rc.bytes() > var_b.rc.bytes() ||
+                       (var_a.rc.bytes() == var_b.rc.bytes() && var_a.reg < var_b.reg);
+             });
+
+   for (unsigned id : ids) {
+      assignment& var = ctx.assignments[id];
       reg_file.clear(var.reg, var.rc);
    }
-   return vars;
+   return ids;
+}
+
+std::optional<PhysReg>
+get_reg_for_create_vector_copy(ra_ctx& ctx, RegisterFile& reg_file,
+                               std::vector<std::pair<Operand, Definition>>& parallelcopies,
+                               aco_ptr<Instruction>& instr, const PhysRegInterval def_reg,
+                               DefInfo info, unsigned id)
+{
+   PhysReg reg = def_reg.lo();
+   /* dead operand: re-use its position in the vector, if the alignment allows it */
+   for (unsigned i = 0; i < instr->operands.size(); i++) {
+      if (instr->operands[i].isTemp() && instr->operands[i].tempId() == id &&
+          instr->operands[i].isKillBeforeDef()) {
+         assert(!reg_file.test(reg, instr->operands[i].bytes()));
+         if (info.rc.is_subdword() || reg.byte() == 0)
+            return reg;
+         else
+            return {};
+      }
+      reg.reg_b += instr->operands[i].bytes();
+   }
+
+   /* Only GFX9+ has a VGPR swap instruction: don't try to swap positions otherwise. */
+   if (ctx.program->gfx_level <= GFX8 || info.rc.type() == RegType::sgpr)
+      return {};
+
+   /* check if the variable is currently located inside the definition space of the vector */
+   assignment& var = ctx.assignments[id];
+   if (def_reg.contains(PhysRegInterval{var.reg, info.size})) {
+      reg = def_reg.lo();
+      /* try to use the previous register of the operand */
+      for (unsigned i = 0; i < instr->operands.size(); i++) {
+         if (reg != var.reg) {
+            reg.reg_b += instr->operands[i].bytes();
+            continue;
+         }
+
+         /* check if we can swap positions */
+         if (instr->operands[i].isTemp() && instr->operands[i].isFirstKill() &&
+             instr->operands[i].regClass() == info.rc) {
+            assignment& op = ctx.assignments[instr->operands[i].tempId()];
+            /* if everything matches, create parallelcopy for the killed operand */
+            if (!intersects(def_reg, PhysRegInterval{op.reg, op.rc.size()}) && op.reg != scc &&
+                reg_file.get_id(op.reg) == instr->operands[i].tempId()) {
+               Definition pc_def = Definition(reg, info.rc);
+               parallelcopies.emplace_back(instr->operands[i], pc_def);
+               return op.reg;
+            }
+         }
+         return {};
+      }
+   }
+   return {};
 }
 
 bool
 get_regs_for_copies(ra_ctx& ctx, RegisterFile& reg_file,
                     std::vector<std::pair<Operand, Definition>>& parallelcopies,
-                    const std::set<std::pair<unsigned, unsigned>>& vars,
-                    const PhysRegInterval bounds, aco_ptr<Instruction>& instr,
+                    const std::vector<unsigned>& vars, aco_ptr<Instruction>& instr,
                     const PhysRegInterval def_reg)
 {
-   /* variables are sorted from small sized to large */
-   /* NOTE: variables are also sorted by ID. this only affects a very small number of shaders
-    * slightly though. */
-   for (std::set<std::pair<unsigned, unsigned>>::const_reverse_iterator it = vars.rbegin();
-        it != vars.rend(); ++it) {
-      unsigned id = it->second;
+   /* Variables are sorted from large to small and with increasing assigned register */
+   for (unsigned id : vars) {
       assignment& var = ctx.assignments[id];
+      PhysRegInterval bounds = get_reg_bounds(ctx, var.rc);
       DefInfo info = DefInfo(ctx, ctx.pseudo_dummy, var.rc, -1);
       uint32_t size = info.size;
 
       /* check if this is a dead operand, then we can re-use the space from the definition
        * also use the correct stride for sub-dword operands */
       bool is_dead_operand = false;
-      for (unsigned i = 0; !is_phi(instr) && i < instr->operands.size(); i++) {
-         if (instr->operands[i].isTemp() && instr->operands[i].tempId() == id) {
-            if (instr->operands[i].isKillBeforeDef())
-               is_dead_operand = true;
-            info = DefInfo(ctx, instr, var.rc, i);
-            break;
-         }
-      }
-
-      std::pair<PhysReg, bool> res;
-      if (is_dead_operand) {
-         if (instr->opcode == aco_opcode::p_create_vector) {
-            PhysReg reg(def_reg.lo());
-            for (unsigned i = 0; i < instr->operands.size(); i++) {
-               if (instr->operands[i].isTemp() && instr->operands[i].tempId() == id) {
-                  res = {reg, (!var.rc.is_subdword() || (reg.byte() % info.stride == 0)) &&
-                                 !reg_file.test(reg, var.rc.bytes())};
-                  break;
+      std::optional<PhysReg> res;
+      if (instr->opcode == aco_opcode::p_create_vector) {
+         res =
+            get_reg_for_create_vector_copy(ctx, reg_file, parallelcopies, instr, def_reg, info, id);
+      } else {
+         for (unsigned i = 0; !is_phi(instr) && i < instr->operands.size(); i++) {
+            if (instr->operands[i].isTemp() && instr->operands[i].tempId() == id) {
+               info = DefInfo(ctx, instr, var.rc, i);
+               if (instr->operands[i].isKillBeforeDef()) {
+                  info.bounds = def_reg;
+                  res = get_reg_simple(ctx, reg_file, info);
+                  is_dead_operand = true;
                }
-               reg.reg_b += instr->operands[i].bytes();
+               break;
             }
-            if (!res.second)
-               res = {var.reg, !reg_file.test(var.reg, var.rc.bytes())};
-         } else {
-            info.bounds = def_reg;
-            res = get_reg_simple(ctx, reg_file, info);
          }
-      } else {
+      }
+      if (!res && !def_reg.size) {
+         /* If this is before definitions are handled, def_reg may be an empty interval. */
+         info.bounds = bounds;
+         res = get_reg_simple(ctx, reg_file, info);
+      } else if (!res) {
          /* Try to find space within the bounds but outside of the definition */
          info.bounds = PhysRegInterval::from_until(bounds.lo(), MIN2(def_reg.lo(), bounds.hi()));
          res = get_reg_simple(ctx, reg_file, info);
-         if (!res.second && def_reg.hi() <= bounds.hi()) {
+         if (!res && def_reg.hi() <= bounds.hi()) {
             unsigned lo = (def_reg.hi() + info.stride - 1) & ~(info.stride - 1);
             info.bounds = PhysRegInterval::from_until(PhysReg{lo}, bounds.hi());
             res = get_reg_simple(ctx, reg_file, info);
          }
       }
 
-      if (res.second) {
+      if (res) {
          /* mark the area as blocked */
-         reg_file.block(res.first, var.rc);
+         reg_file.block(*res, var.rc);
 
          /* create parallelcopy pair (without definition id) */
          Temp tmp = Temp(id, var.rc);
          Operand pc_op = Operand(tmp);
          pc_op.setFixed(var.reg);
-         Definition pc_def = Definition(res.first, pc_op.regClass());
+         Definition pc_def = Definition(*res, pc_op.regClass());
          parallelcopies.emplace_back(pc_op, pc_def);
          continue;
       }
@@ -1075,9 +1207,8 @@ get_regs_for_copies(ra_ctx& ctx, RegisterFile& reg_file,
                n++;
                continue;
             }
-            /* we cannot split live ranges of linear vgprs inside control flow */
-            if (!(ctx.block->kind & block_kind_top_level) &&
-                ctx.assignments[reg_file[j]].rc.is_linear_vgpr()) {
+            /* we cannot split live ranges of linear vgprs */
+            if (ctx.assignments[reg_file[j]].rc.is_linear_vgpr()) {
                found = false;
                break;
             }
@@ -1116,13 +1247,13 @@ get_regs_for_copies(ra_ctx& ctx, RegisterFile& reg_file,
       PhysRegInterval reg_win{best_pos, size};
 
       /* collect variables and block reg file */
-      std::set<std::pair<unsigned, unsigned>> new_vars = collect_vars(ctx, reg_file, reg_win);
+      std::vector<unsigned> new_vars = collect_vars(ctx, reg_file, reg_win);
 
       /* mark the area as blocked */
       reg_file.block(reg_win.lo(), var.rc);
       adjust_max_used_regs(ctx, var.rc, reg_win.lo());
 
-      if (!get_regs_for_copies(ctx, reg_file, parallelcopies, new_vars, bounds, instr, def_reg))
+      if (!get_regs_for_copies(ctx, reg_file, parallelcopies, new_vars, instr, def_reg))
          return false;
 
       /* create parallelcopy pair (without definition id) */
@@ -1136,8 +1267,8 @@ get_regs_for_copies(ra_ctx& ctx, RegisterFile& reg_file,
    return true;
 }
 
-std::pair<PhysReg, bool>
-get_reg_impl(ra_ctx& ctx, RegisterFile& reg_file,
+std::optional<PhysReg>
+get_reg_impl(ra_ctx& ctx, const RegisterFile& reg_file,
              std::vector<std::pair<Operand, Definition>>& parallelcopies, const DefInfo& info,
              aco_ptr<Instruction>& instr)
 {
@@ -1166,7 +1297,8 @@ get_reg_impl(ra_ctx& ctx, RegisterFile& reg_file,
       }
    }
 
-   assert(regs_free >= size);
+   assert((regs_free + ctx.num_linear_vgprs) >= size);
+
    /* we might have to move dead operands to dst in order to make space */
    unsigned op_moves = 0;
 
@@ -1223,10 +1355,8 @@ get_reg_impl(ra_ctx& ctx, RegisterFile& reg_file,
             break;
          }
 
-         /* we cannot split live ranges of linear vgprs inside control flow */
-         //TODO: ensure that live range splits inside control flow are never necessary
-         if (!(ctx.block->kind & block_kind_top_level) &&
-             ctx.assignments[reg_file[j]].rc.is_linear_vgpr()) {
+         /* we cannot split live ranges of linear vgprs */
+         if (ctx.assignments[reg_file[j]].rc.is_linear_vgpr()) {
             found = false;
             break;
          }
@@ -1251,31 +1381,23 @@ get_reg_impl(ra_ctx& ctx, RegisterFile& reg_file,
    }
 
    if (num_moves == 0xFF)
-      return {{}, false};
+      return {};
 
    /* now, we figured the placement for our definition */
    RegisterFile tmp_file(reg_file);
-   std::set<std::pair<unsigned, unsigned>> vars = collect_vars(ctx, tmp_file, best_win);
 
+   /* p_create_vector: killed operands in the definition space must be moved, too */
    if (instr->opcode == aco_opcode::p_create_vector) {
-      /* move killed operands which aren't yet at the correct position (GFX9+)
-       * or which are in the definition space */
-      PhysReg reg = best_win.lo();
       for (Operand& op : instr->operands) {
-         if (op.isTemp() && op.isFirstKillBeforeDef() && op.getTemp().type() == rc.type()) {
-            if (op.physReg() != reg && (ctx.program->chip_class >= GFX9 ||
-                                        (op.physReg().advance(op.bytes()) > best_win.lo() &&
-                                         op.physReg() < best_win.hi()))) {
-               vars.emplace(op.bytes(), op.tempId());
-               tmp_file.clear(op);
-            } else {
-               tmp_file.fill(op);
-            }
-         }
-         reg.reg_b += op.bytes();
+         if (op.isTemp() && op.isFirstKillBeforeDef())
+            tmp_file.fill(op);
       }
-   } else if (!is_phi(instr)) {
-      /* re-enable killed operands */
+   }
+
+   std::vector<unsigned> vars = collect_vars(ctx, tmp_file, best_win);
+
+   /* re-enable killed operands */
+   if (!is_phi(instr) && instr->opcode != aco_opcode::p_create_vector) {
       for (Operand& op : instr->operands) {
          if (op.isTemp() && op.isFirstKillBeforeDef())
             tmp_file.fill(op);
@@ -1283,18 +1405,18 @@ get_reg_impl(ra_ctx& ctx, RegisterFile& reg_file,
    }
 
    std::vector<std::pair<Operand, Definition>> pc;
-   if (!get_regs_for_copies(ctx, tmp_file, pc, vars, bounds, instr, best_win))
-      return {{}, false};
+   if (!get_regs_for_copies(ctx, tmp_file, pc, vars, instr, best_win))
+      return {};
 
    parallelcopies.insert(parallelcopies.end(), pc.begin(), pc.end());
 
    adjust_max_used_regs(ctx, rc, best_win.lo());
-   return {best_win.lo(), true};
+   return best_win.lo();
 }
 
 bool
-get_reg_specified(ra_ctx& ctx, RegisterFile& reg_file, RegClass rc, aco_ptr<Instruction>& instr,
-                  PhysReg reg)
+get_reg_specified(ra_ctx& ctx, const RegisterFile& reg_file, RegClass rc,
+                  aco_ptr<Instruction>& instr, PhysReg reg)
 {
    /* catch out-of-range registers */
    if (reg >= PhysReg{512})
@@ -1313,10 +1435,10 @@ get_reg_specified(ra_ctx& ctx, RegisterFile& reg_file, RegClass rc, aco_ptr<Inst
       return false;
 
    PhysRegInterval reg_win = {reg, rc.size()};
-   PhysRegInterval bounds = get_reg_bounds(ctx.program, rc.type());
+   PhysRegInterval bounds = get_reg_bounds(ctx, rc);
    PhysRegInterval vcc_win = {vcc, 2};
    /* VCC is outside the bounds */
-   bool is_vcc = rc.type() == RegType::sgpr && vcc_win.contains(reg_win);
+   bool is_vcc = rc.type() == RegType::sgpr && vcc_win.contains(reg_win) && ctx.program->needs_vcc;
    bool is_m0 = rc == s1 && reg == m0;
    if (!bounds.contains(reg_win) && !is_vcc && !is_m0)
       return false;
@@ -1336,17 +1458,24 @@ get_reg_specified(ra_ctx& ctx, RegisterFile& reg_file, RegClass rc, aco_ptr<Inst
 }
 
 bool
-increase_register_file(ra_ctx& ctx, RegType type)
+increase_register_file(ra_ctx& ctx, RegClass rc)
 {
-   if (type == RegType::vgpr && ctx.program->max_reg_demand.vgpr < ctx.vgpr_limit) {
-      update_vgpr_sgpr_demand(ctx.program, RegisterDemand(ctx.program->max_reg_demand.vgpr + 1,
-                                                          ctx.program->max_reg_demand.sgpr));
-   } else if (type == RegType::sgpr && ctx.program->max_reg_demand.sgpr < ctx.sgpr_limit) {
-      update_vgpr_sgpr_demand(ctx.program, RegisterDemand(ctx.program->max_reg_demand.vgpr,
-                                                          ctx.program->max_reg_demand.sgpr + 1));
+   if (rc.type() == RegType::vgpr && ctx.num_linear_vgprs == 0 &&
+       ctx.vgpr_bounds < ctx.vgpr_limit) {
+      /* If vgpr_bounds is less than max_reg_demand.vgpr, this should be a no-op. */
+      update_vgpr_sgpr_demand(
+         ctx.program, RegisterDemand(ctx.vgpr_bounds + 1, ctx.program->max_reg_demand.sgpr));
+
+      ctx.vgpr_bounds = ctx.program->max_reg_demand.vgpr;
+   } else if (rc.type() == RegType::sgpr && ctx.program->max_reg_demand.sgpr < ctx.sgpr_limit) {
+      update_vgpr_sgpr_demand(
+         ctx.program, RegisterDemand(ctx.program->max_reg_demand.vgpr, ctx.sgpr_bounds + 1));
+
+      ctx.sgpr_bounds = ctx.program->max_reg_demand.sgpr;
    } else {
       return false;
    }
+
    return true;
 }
 
@@ -1429,7 +1558,7 @@ compact_relocate_vars(ra_ctx& ctx, const std::vector<IDAndRegClass>& vars,
 }
 
 bool
-is_mimg_vaddr_intact(ra_ctx& ctx, RegisterFile& reg_file, Instruction* instr)
+is_mimg_vaddr_intact(ra_ctx& ctx, const RegisterFile& reg_file, Instruction* instr)
 {
    PhysReg first{512};
    for (unsigned i = 0; i < instr->operands.size() - 3u; i++) {
@@ -1439,7 +1568,7 @@ is_mimg_vaddr_intact(ra_ctx& ctx, RegisterFile& reg_file, Instruction* instr)
          PhysReg reg = ctx.assignments[op.tempId()].reg;
 
          if (first.reg() == 512) {
-            PhysRegInterval bounds = get_reg_bounds(ctx.program, RegType::vgpr);
+            PhysRegInterval bounds = get_reg_bounds(ctx, RegType::vgpr, false);
             first = reg.advance(i * -4);
             PhysRegInterval vec = PhysRegInterval{first, instr->operands.size() - 3u};
             if (!bounds.contains(vec)) /* not enough space for other operands */
@@ -1460,8 +1589,8 @@ is_mimg_vaddr_intact(ra_ctx& ctx, RegisterFile& reg_file, Instruction* instr)
    return true;
 }
 
-std::pair<PhysReg, bool>
-get_reg_vector(ra_ctx& ctx, RegisterFile& reg_file, Temp temp, aco_ptr<Instruction>& instr)
+std::optional<PhysReg>
+get_reg_vector(ra_ctx& ctx, const RegisterFile& reg_file, Temp temp, aco_ptr<Instruction>& instr)
 {
    Instruction* vec = ctx.vectors[temp.id()];
    unsigned first_operand = vec->format == Format::MIMG ? 3 : 0;
@@ -1487,11 +1616,11 @@ get_reg_vector(ra_ctx& ctx, RegisterFile& reg_file, Temp temp, aco_ptr<Instructi
             PhysReg reg = ctx.assignments[op.tempId()].reg;
             reg.reg_b += (our_offset - their_offset);
             if (get_reg_specified(ctx, reg_file, temp.regClass(), instr, reg))
-               return {reg, true};
+               return reg;
 
             /* return if MIMG vaddr components don't remain vector-aligned */
             if (vec->format == Format::MIMG)
-               return {{}, false};
+               return {};
          }
          their_offset += op.bytes();
       }
@@ -1501,20 +1630,137 @@ get_reg_vector(ra_ctx& ctx, RegisterFile& reg_file, Temp temp, aco_ptr<Instructi
        */
       RegClass vec_rc = RegClass::get(temp.type(), their_offset);
       DefInfo info(ctx, ctx.pseudo_dummy, vec_rc, -1);
-      std::pair<PhysReg, bool> res = get_reg_simple(ctx, reg_file, info);
-      PhysReg reg = res.first;
-      if (res.second) {
-         reg.reg_b += our_offset;
+      std::optional<PhysReg> reg = get_reg_simple(ctx, reg_file, info);
+      if (reg) {
+         reg->reg_b += our_offset;
          /* make sure to only use byte offset if the instruction supports it */
-         if (get_reg_specified(ctx, reg_file, temp.regClass(), instr, reg))
-            return {reg, true};
+         if (get_reg_specified(ctx, reg_file, temp.regClass(), instr, *reg))
+            return reg;
       }
    }
-   return {{}, false};
+   return {};
 }
 
+bool
+compact_linear_vgprs(ra_ctx& ctx, const RegisterFile& reg_file,
+                     std::vector<std::pair<Operand, Definition>>& parallelcopies)
+{
+   PhysRegInterval linear_vgpr_bounds = get_reg_bounds(ctx, RegType::vgpr, true);
+   int zeros = reg_file.count_zero(linear_vgpr_bounds);
+   if (zeros == 0)
+      return false;
+
+   std::vector<IDAndRegClass> vars;
+   for (unsigned id : find_vars(ctx, reg_file, linear_vgpr_bounds))
+      vars.emplace_back(id, ctx.assignments[id].rc);
+
+   ctx.num_linear_vgprs -= zeros;
+   compact_relocate_vars(ctx, vars, parallelcopies, get_reg_bounds(ctx, RegType::vgpr, true).lo());
+
+   return true;
+}
+
+/* Allocates a linear VGPR. We allocate them at the end of the register file and keep them separate
+ * from normal VGPRs. This is for two reasons:
+ * - Because we only ever move linear VGPRs into an empty space or a space previously occupied by a
+ *   linear one, we never have to swap a normal VGPR and a linear one.
+ * - As the live ranges of linear VGPRs only start and end on top-level blocks, we never have to
+ *   move a linear VGPR in control flow.
+ */
 PhysReg
-get_reg(ra_ctx& ctx, RegisterFile& reg_file, Temp temp,
+alloc_linear_vgpr(ra_ctx& ctx, const RegisterFile& reg_file, aco_ptr<Instruction>& instr,
+                  std::vector<std::pair<Operand, Definition>>& parallelcopies)
+{
+   assert(instr->opcode == aco_opcode::p_start_linear_vgpr);
+   assert(instr->definitions.size() == 1 && instr->definitions[0].bytes() % 4 == 0);
+
+   RegClass rc = instr->definitions[0].regClass();
+
+   /* Try to choose an unused space in the linear VGPR bounds. */
+   for (unsigned i = rc.size(); i <= ctx.num_linear_vgprs; i++) {
+      PhysReg reg(256 + ctx.vgpr_bounds - i);
+      if (!reg_file.test(reg, rc.bytes())) {
+         adjust_max_used_regs(ctx, rc, reg);
+         return reg;
+      }
+   }
+
+   PhysRegInterval old_normal_bounds = get_reg_bounds(ctx, RegType::vgpr, false);
+
+   /* Compact linear VGPRs, grow the bounds if necessary, and choose a space at the beginning: */
+   compact_linear_vgprs(ctx, reg_file, parallelcopies);
+
+   PhysReg reg(256 + ctx.vgpr_bounds - (ctx.num_linear_vgprs + rc.size()));
+   /* Space that was for normal VGPRs, but is now for linear VGPRs. */
+   PhysRegInterval new_win = PhysRegInterval::from_until(reg, MAX2(old_normal_bounds.hi(), reg));
+
+   RegisterFile tmp_file(reg_file);
+   PhysRegInterval reg_win{reg, rc.size()};
+   std::vector<unsigned> blocking_vars = collect_vars(ctx, tmp_file, new_win);
+
+   /* Re-enable killed operands */
+   for (Operand& op : instr->operands) {
+      if (op.isTemp() && op.isFirstKillBeforeDef())
+         tmp_file.fill(op);
+   }
+
+   /* Find new assignments for blocking vars. */
+   std::vector<std::pair<Operand, Definition>> pc;
+   if (!ctx.policy.skip_optimistic_path &&
+       get_regs_for_copies(ctx, tmp_file, pc, blocking_vars, instr, reg_win)) {
+      parallelcopies.insert(parallelcopies.end(), pc.begin(), pc.end());
+   } else {
+      /* Fallback algorithm: reallocate all variables at once. */
+      std::vector<IDAndRegClass> vars;
+      for (unsigned id : find_vars(ctx, reg_file, old_normal_bounds))
+         vars.emplace_back(id, ctx.assignments[id].rc);
+      compact_relocate_vars(ctx, vars, parallelcopies, PhysReg(256));
+
+      std::vector<IDAndRegClass> killed_op_vars;
+      for (Operand& op : instr->operands) {
+         if (op.isTemp() && op.isFirstKillBeforeDef() && op.regClass().type() == RegType::vgpr)
+            killed_op_vars.emplace_back(op.tempId(), op.regClass());
+      }
+      compact_relocate_vars(ctx, killed_op_vars, parallelcopies, reg_win.lo());
+   }
+
+   /* If this is updated earlier, a killed operand can't be placed inside the definition. */
+   ctx.num_linear_vgprs += rc.size();
+
+   adjust_max_used_regs(ctx, rc, reg);
+   return reg;
+}
+
+bool
+should_compact_linear_vgprs(ra_ctx& ctx, live& live_vars, const RegisterFile& reg_file)
+{
+   if (!(ctx.block->kind & block_kind_top_level) || ctx.block->linear_succs.empty())
+      return false;
+
+   /* Since we won't be able to copy linear VGPRs to make space when in control flow, we have to
+    * ensure in advance that there is enough space for normal VGPRs. */
+   unsigned max_vgpr_usage = 0;
+   unsigned next_toplevel = ctx.block->index + 1;
+   for (; !(ctx.program->blocks[next_toplevel].kind & block_kind_top_level); next_toplevel++) {
+      max_vgpr_usage =
+         MAX2(max_vgpr_usage, (unsigned)ctx.program->blocks[next_toplevel].register_demand.vgpr);
+   }
+
+   std::vector<aco_ptr<Instruction>>& instructions =
+      ctx.program->blocks[next_toplevel].instructions;
+   if (!instructions.empty() && is_phi(instructions[0])) {
+      max_vgpr_usage =
+         MAX2(max_vgpr_usage, (unsigned)live_vars.register_demand[next_toplevel][0].vgpr);
+   }
+
+   for (unsigned tmp : find_vars(ctx, reg_file, get_reg_bounds(ctx, RegType::vgpr, true)))
+      max_vgpr_usage -= ctx.assignments[tmp].rc.size();
+
+   return max_vgpr_usage > get_reg_bounds(ctx, RegType::vgpr, false).size;
+}
+
+PhysReg
+get_reg(ra_ctx& ctx, const RegisterFile& reg_file, Temp temp,
         std::vector<std::pair<Operand, Definition>>& parallelcopies, aco_ptr<Instruction>& instr,
         int operand_index = -1)
 {
@@ -1522,30 +1768,41 @@ get_reg(ra_ctx& ctx, RegisterFile& reg_file, Temp temp,
    if (split_vec != ctx.split_vectors.end()) {
       unsigned offset = 0;
       for (Definition def : split_vec->second->definitions) {
-         auto affinity_it = ctx.affinities.find(def.tempId());
-         if (affinity_it != ctx.affinities.end() && ctx.assignments[affinity_it->second].assigned) {
-            PhysReg reg = ctx.assignments[affinity_it->second].reg;
-            reg.reg_b -= offset;
-            if (get_reg_specified(ctx, reg_file, temp.regClass(), instr, reg))
-               return reg;
+         if (ctx.assignments[def.tempId()].affinity) {
+            assignment& affinity = ctx.assignments[ctx.assignments[def.tempId()].affinity];
+            if (affinity.assigned) {
+               PhysReg reg = affinity.reg;
+               reg.reg_b -= offset;
+               if (get_reg_specified(ctx, reg_file, temp.regClass(), instr, reg))
+                  return reg;
+            }
          }
          offset += def.bytes();
       }
    }
 
-   if (ctx.affinities.find(temp.id()) != ctx.affinities.end() &&
-       ctx.assignments[ctx.affinities[temp.id()]].assigned) {
-      PhysReg reg = ctx.assignments[ctx.affinities[temp.id()]].reg;
-      if (get_reg_specified(ctx, reg_file, temp.regClass(), instr, reg))
-         return reg;
+   if (ctx.assignments[temp.id()].affinity) {
+      assignment& affinity = ctx.assignments[ctx.assignments[temp.id()].affinity];
+      if (affinity.assigned) {
+         if (get_reg_specified(ctx, reg_file, temp.regClass(), instr, affinity.reg))
+            return affinity.reg;
+      }
+   }
+   if (ctx.assignments[temp.id()].vcc) {
+      if (get_reg_specified(ctx, reg_file, temp.regClass(), instr, vcc))
+         return vcc;
+   }
+   if (ctx.assignments[temp.id()].m0) {
+      if (get_reg_specified(ctx, reg_file, temp.regClass(), instr, m0) && can_write_m0(instr))
+         return m0;
    }
 
-   std::pair<PhysReg, bool> res;
+   std::optional<PhysReg> res;
 
    if (ctx.vectors.find(temp.id()) != ctx.vectors.end()) {
       res = get_reg_vector(ctx, reg_file, temp, instr);
-      if (res.second)
-         return res.first;
+      if (res)
+         return *res;
    }
 
    DefInfo info(ctx, instr, temp.regClass(), operand_index);
@@ -1554,24 +1811,39 @@ get_reg(ra_ctx& ctx, RegisterFile& reg_file, Temp temp,
       /* try to find space without live-range splits */
       res = get_reg_simple(ctx, reg_file, info);
 
-      if (res.second)
-         return res.first;
+      if (res)
+         return *res;
    }
 
    /* try to find space with live-range splits */
    res = get_reg_impl(ctx, reg_file, parallelcopies, info, instr);
 
-   if (res.second)
-      return res.first;
+   if (res)
+      return *res;
 
-   /* try using more registers */
+   /* try compacting the linear vgprs to make more space */
+   std::vector<std::pair<Operand, Definition>> pc;
+   if (info.rc.type() == RegType::vgpr && (ctx.block->kind & block_kind_top_level) &&
+       compact_linear_vgprs(ctx, reg_file, pc)) {
+      parallelcopies.insert(parallelcopies.end(), pc.begin(), pc.end());
+
+      /* We don't need to fill the copy definitions in because we don't care about the linear VGPR
+       * space here. */
+      RegisterFile tmp_file(reg_file);
+      for (std::pair<Operand, Definition>& copy : pc)
+         tmp_file.clear(copy.first);
+
+      return get_reg(ctx, tmp_file, temp, parallelcopies, instr, operand_index);
+   }
 
    /* We should only fail here because keeping under the limit would require
     * too many moves. */
    assert(reg_file.count_zero(info.bounds) >= info.size);
 
-   if (!increase_register_file(ctx, info.rc.type())) {
-      /* fallback algorithm: reallocate all variables at once */
+   /* try using more registers */
+   if (!increase_register_file(ctx, info.rc)) {
+      /* fallback algorithm: reallocate all variables at once (linear VGPRs should already be
+       * compact at the end) */
       unsigned def_size = info.rc.size();
       for (Definition def : instr->definitions) {
          if (ctx.assignments[def.tempId()].assigned && def.regClass().type() == info.rc.type())
@@ -1584,12 +1856,12 @@ get_reg(ra_ctx& ctx, RegisterFile& reg_file, Temp temp,
             killed_op_size += op.regClass().size();
       }
 
-      const PhysRegInterval regs = get_reg_bounds(ctx.program, info.rc.type());
+      const PhysRegInterval regs = get_reg_bounds(ctx, info.rc);
 
       /* reallocate passthrough variables and non-killed operands */
       std::vector<IDAndRegClass> vars;
-      for (const std::pair<unsigned, unsigned>& var : find_vars(ctx, reg_file, regs))
-         vars.emplace_back(var.second, ctx.assignments[var.second].rc);
+      for (unsigned id : find_vars(ctx, reg_file, regs))
+         vars.emplace_back(id, ctx.assignments[id].rc);
       vars.emplace_back(0xffffffff, RegClass(info.rc.type(), MAX2(def_size, killed_op_size)));
 
       PhysReg space = compact_relocate_vars(ctx, vars, parallelcopies, regs.lo());
@@ -1616,7 +1888,7 @@ get_reg(ra_ctx& ctx, RegisterFile& reg_file, Temp temp,
 }
 
 PhysReg
-get_reg_create_vector(ra_ctx& ctx, RegisterFile& reg_file, Temp temp,
+get_reg_create_vector(ra_ctx& ctx, const RegisterFile& reg_file, Temp temp,
                       std::vector<std::pair<Operand, Definition>>& parallelcopies,
                       aco_ptr<Instruction>& instr)
 {
@@ -1625,13 +1897,14 @@ get_reg_create_vector(ra_ctx& ctx, RegisterFile& reg_file, Temp temp,
    uint32_t size = rc.size();
    uint32_t bytes = rc.bytes();
    uint32_t stride = get_stride(rc);
-   PhysRegInterval bounds = get_reg_bounds(ctx.program, rc.type());
+   PhysRegInterval bounds = get_reg_bounds(ctx, rc);
 
    // TODO: improve p_create_vector for sub-dword vectors
 
    PhysReg best_pos{0xFFF};
    unsigned num_moves = 0xFF;
    bool best_avoid = true;
+   uint32_t correct_pos_mask = 0;
 
    /* test for each operand which definition placement causes the least shuffle instructions */
    for (unsigned i = 0, offset = 0; i < instr->operands.size();
@@ -1667,6 +1940,7 @@ get_reg_create_vector(ra_ctx& ctx, RegisterFile& reg_file, Temp temp,
 
       /* count variables to be moved and check "avoid" */
       bool avoid = false;
+      bool linear_vgpr = false;
       for (PhysReg j : reg_win) {
          if (reg_file[j] != 0) {
             if (reg_file[j] == 0xF0000000) {
@@ -1677,28 +1951,28 @@ get_reg_create_vector(ra_ctx& ctx, RegisterFile& reg_file, Temp temp,
                   k += reg_file.test(reg, 1);
             } else {
                k += 4;
-               /* we cannot split live ranges of linear vgprs inside control flow */
-               if (ctx.assignments[reg_file[j]].rc.is_linear_vgpr()) {
-                  if (ctx.block->kind & block_kind_top_level)
-                     avoid = true;
-                  else
-                     break;
-               }
+               linear_vgpr |= ctx.assignments[reg_file[j]].rc.is_linear_vgpr();
             }
          }
          avoid |= ctx.war_hint[j];
       }
+
+      /* we cannot split live ranges of linear vgprs */
+      if (linear_vgpr)
+         continue;
+
       if (avoid && !best_avoid)
          continue;
 
       /* count operands in wrong positions */
+      uint32_t correct_pos_mask_new = 0;
       for (unsigned j = 0, offset2 = 0; j < instr->operands.size();
            offset2 += instr->operands[j].bytes(), j++) {
-         if (j == i || !instr->operands[j].isTemp() ||
-             instr->operands[j].getTemp().type() != rc.type())
-            continue;
-         if (instr->operands[j].physReg().reg_b != reg_win.lo() * 4 + offset2)
-            k += instr->operands[j].bytes();
+         Operand& op = instr->operands[j];
+         if (op.isTemp() && op.physReg().reg_b == reg_win.lo() * 4 + offset2)
+            correct_pos_mask_new |= 1 << j;
+         else
+            k += op.bytes();
       }
       bool aligned = rc == RegClass::v4 && reg_win.lo() % 4 == 0;
       if (k > num_moves || (!aligned && k == num_moves))
@@ -1707,49 +1981,39 @@ get_reg_create_vector(ra_ctx& ctx, RegisterFile& reg_file, Temp temp,
       best_pos = reg_win.lo();
       num_moves = k;
       best_avoid = avoid;
+      correct_pos_mask = correct_pos_mask_new;
    }
 
-   if (num_moves >= bytes)
+   /* too many moves: try the generic get_reg() function */
+   if (num_moves >= 2 * bytes) {
       return get_reg(ctx, reg_file, temp, parallelcopies, instr);
+   } else if (num_moves > bytes) {
+      DefInfo info(ctx, instr, rc, -1);
+      std::optional<PhysReg> res = get_reg_simple(ctx, reg_file, info);
+      if (res)
+         return *res;
+   }
 
    /* re-enable killed operands which are in the wrong position */
    RegisterFile tmp_file(reg_file);
-   for (unsigned i = 0, offset = 0; i < instr->operands.size();
-        offset += instr->operands[i].bytes(), i++) {
-      if (instr->operands[i].isTemp() && instr->operands[i].isFirstKillBeforeDef() &&
-          instr->operands[i].physReg().reg_b != best_pos.reg_b + offset)
-         tmp_file.fill(instr->operands[i]);
+   for (Operand& op : instr->operands) {
+      if (op.isTemp() && op.isFirstKillBeforeDef())
+         tmp_file.fill(op);
+   }
+   for (unsigned i = 0; i < instr->operands.size(); i++) {
+      if ((correct_pos_mask >> i) & 1u && instr->operands[i].isKill())
+         tmp_file.clear(instr->operands[i]);
    }
 
    /* collect variables to be moved */
-   std::set<std::pair<unsigned, unsigned>> vars =
-      collect_vars(ctx, tmp_file, PhysRegInterval{best_pos, size});
+   std::vector<unsigned> vars = collect_vars(ctx, tmp_file, PhysRegInterval{best_pos, size});
 
-   for (unsigned i = 0, offset = 0; i < instr->operands.size();
-        offset += instr->operands[i].bytes(), i++) {
-      if (!instr->operands[i].isTemp() || !instr->operands[i].isFirstKillBeforeDef() ||
-          instr->operands[i].getTemp().type() != rc.type())
-         continue;
-      bool correct_pos = instr->operands[i].physReg().reg_b == best_pos.reg_b + offset;
-      /* GFX9+: move killed operands which aren't yet at the correct position
-       * Moving all killed operands generally leads to more register swaps.
-       * This is only done on GFX9+ because of the cheap v_swap instruction.
-       */
-      if (ctx.program->chip_class >= GFX9 && !correct_pos) {
-         vars.emplace(instr->operands[i].bytes(), instr->operands[i].tempId());
-         tmp_file.clear(instr->operands[i]);
-         /* fill operands which are in the correct position to avoid overwriting */
-      } else if (correct_pos) {
-         tmp_file.fill(instr->operands[i]);
-      }
-   }
    bool success = false;
    std::vector<std::pair<Operand, Definition>> pc;
-   success =
-      get_regs_for_copies(ctx, tmp_file, pc, vars, bounds, instr, PhysRegInterval{best_pos, size});
+   success = get_regs_for_copies(ctx, tmp_file, pc, vars, instr, PhysRegInterval{best_pos, size});
 
    if (!success) {
-      if (!increase_register_file(ctx, temp.type())) {
+      if (!increase_register_file(ctx, temp.regClass())) {
          /* use the fallback algorithm in get_reg() */
          return get_reg(ctx, reg_file, temp, parallelcopies, instr);
       }
@@ -1774,7 +2038,7 @@ handle_pseudo(ra_ctx& ctx, const RegisterFile& reg_file, Instruction* instr)
    case aco_opcode::p_create_vector:
    case aco_opcode::p_split_vector:
    case aco_opcode::p_parallelcopy:
-   case aco_opcode::p_wqm: break;
+   case aco_opcode::p_start_linear_vgpr: break;
    default: return;
    }
 
@@ -1794,10 +2058,11 @@ handle_pseudo(ra_ctx& ctx, const RegisterFile& reg_file, Instruction* instr)
          reads_subdword = true;
    }
    bool needs_scratch_reg = (writes_linear && reads_linear && reg_file[scc]) ||
-                            (ctx.program->chip_class <= GFX7 && reads_subdword);
+                            (ctx.program->gfx_level <= GFX7 && reads_subdword);
    if (!needs_scratch_reg)
       return;
 
+   instr->pseudo().needs_scratch_reg = true;
    instr->pseudo().tmp_in_scc = reg_file[scc];
 
    int reg = ctx.max_used_sgpr;
@@ -1818,27 +2083,11 @@ handle_pseudo(ra_ctx& ctx, const RegisterFile& reg_file, Instruction* instr)
 }
 
 bool
-operand_can_use_reg(chip_class chip, aco_ptr<Instruction>& instr, unsigned idx, PhysReg reg,
+operand_can_use_reg(amd_gfx_level gfx_level, aco_ptr<Instruction>& instr, unsigned idx, PhysReg reg,
                     RegClass rc)
 {
-   if (instr->operands[idx].isFixed())
-      return instr->operands[idx].physReg() == reg;
-
-   bool is_writelane = instr->opcode == aco_opcode::v_writelane_b32 ||
-                       instr->opcode == aco_opcode::v_writelane_b32_e64;
-   if (chip <= GFX9 && is_writelane && idx <= 1) {
-      /* v_writelane_b32 can take two sgprs but only if one is m0. */
-      bool is_other_sgpr =
-         instr->operands[!idx].isTemp() &&
-         (!instr->operands[!idx].isFixed() || instr->operands[!idx].physReg() != m0);
-      if (is_other_sgpr && instr->operands[!idx].tempId() != instr->operands[idx].tempId()) {
-         instr->operands[idx].setFixed(m0);
-         return reg == m0;
-      }
-   }
-
    if (reg.byte()) {
-      unsigned stride = get_subdword_operand_stride(chip, instr, idx, rc);
+      unsigned stride = get_subdword_operand_stride(gfx_level, instr, idx, rc);
       if (reg.byte() % stride)
          return false;
    }
@@ -1848,7 +2097,7 @@ operand_can_use_reg(chip_class chip, aco_ptr<Instruction>& instr, unsigned idx,
       return reg != scc && reg != exec &&
              (reg != m0 || idx == 1 || idx == 3) && /* offset can be m0 */
              (reg != vcc || (instr->definitions.empty() && idx == 2) ||
-              chip >= GFX10); /* sdata can be vcc */
+              gfx_level >= GFX10); /* sdata can be vcc */
    default:
       // TODO: there are more instructions with restrictions on registers
       return true;
@@ -1856,41 +2105,82 @@ operand_can_use_reg(chip_class chip, aco_ptr<Instruction>& instr, unsigned idx,
 }
 
 void
-get_reg_for_operand(ra_ctx& ctx, RegisterFile& register_file,
-                    std::vector<std::pair<Operand, Definition>>& parallelcopy,
-                    aco_ptr<Instruction>& instr, Operand& operand, unsigned operand_index)
+handle_fixed_operands(ra_ctx& ctx, RegisterFile& register_file,
+                      std::vector<std::pair<Operand, Definition>>& parallelcopy,
+                      aco_ptr<Instruction>& instr)
 {
-   /* check if the operand is fixed */
-   PhysReg src = ctx.assignments[operand.tempId()].reg;
-   PhysReg dst;
-   if (operand.isFixed()) {
-      assert(operand.physReg() != src);
+   assert(instr->operands.size() <= 128);
 
-      /* check if target reg is blocked, and move away the blocking var */
-      if (register_file.test(operand.physReg(), operand.bytes())) {
-         PhysRegInterval target{operand.physReg(), operand.size()};
+   RegisterFile tmp_file(register_file);
 
-         RegisterFile tmp_file(register_file);
+   BITSET_DECLARE(mask, 128) = {0};
 
-         std::set<std::pair<unsigned, unsigned>> blocking_vars =
-            collect_vars(ctx, tmp_file, target);
+   for (unsigned i = 0; i < instr->operands.size(); i++) {
+      Operand& op = instr->operands[i];
+
+      if (!op.isTemp() || !op.isFixed())
+         continue;
 
-         tmp_file.clear(src, operand.regClass()); // TODO: try to avoid moving block vars to src
-         tmp_file.block(operand.physReg(), operand.regClass());
+      PhysReg src = ctx.assignments[op.tempId()].reg;
+      adjust_max_used_regs(ctx, op.regClass(), op.physReg());
 
-         DefInfo info(ctx, instr, operand.regClass(), -1);
-         get_regs_for_copies(ctx, tmp_file, parallelcopy, blocking_vars, info.bounds, instr,
-                             PhysRegInterval());
+      if (op.physReg() == src) {
+         tmp_file.block(op.physReg(), op.regClass());
+         continue;
       }
-      dst = operand.physReg();
 
-   } else {
-      dst = get_reg(ctx, register_file, operand.getTemp(), parallelcopy, instr, operand_index);
-      update_renames(
-         ctx, register_file, parallelcopy, instr,
-         instr->opcode != aco_opcode::p_create_vector ? rename_not_killed_ops : (UpdateRenames)0);
+      unsigned j;
+      bool found = false;
+      BITSET_FOREACH_SET (j, mask, i) {
+         if (instr->operands[j].tempId() == op.tempId() &&
+             instr->operands[j].physReg() == op.physReg()) {
+            found = true;
+            break;
+         }
+      }
+      if (found)
+         continue; /* the copy is already added to the list */
+
+      /* clear from register_file so fixed operands are not collected by collect_vars() */
+      tmp_file.clear(src, op.regClass()); // TODO: try to avoid moving block vars to src
+
+      BITSET_SET(mask, i);
+
+      Operand pc_op(instr->operands[i].getTemp(), src);
+      Definition pc_def = Definition(op.physReg(), pc_op.regClass());
+      parallelcopy.emplace_back(pc_op, pc_def);
    }
 
+   if (BITSET_IS_EMPTY(mask))
+      return;
+
+   unsigned i;
+   std::vector<unsigned> blocking_vars;
+   BITSET_FOREACH_SET (i, mask, instr->operands.size()) {
+      Operand& op = instr->operands[i];
+      PhysRegInterval target{op.physReg(), op.size()};
+      std::vector<unsigned> blocking_vars2 = collect_vars(ctx, tmp_file, target);
+      blocking_vars.insert(blocking_vars.end(), blocking_vars2.begin(), blocking_vars2.end());
+
+      /* prevent get_regs_for_copies() from using these registers */
+      tmp_file.block(op.physReg(), op.regClass());
+   }
+
+   get_regs_for_copies(ctx, tmp_file, parallelcopy, blocking_vars, instr, PhysRegInterval());
+   update_renames(ctx, register_file, parallelcopy, instr,
+                  rename_not_killed_ops | fill_killed_ops | rename_precolored_ops);
+}
+
+void
+get_reg_for_operand(ra_ctx& ctx, RegisterFile& register_file,
+                    std::vector<std::pair<Operand, Definition>>& parallelcopy,
+                    aco_ptr<Instruction>& instr, Operand& operand, unsigned operand_index)
+{
+   /* clear the operand in case it's only a stride mismatch */
+   PhysReg src = ctx.assignments[operand.tempId()].reg;
+   register_file.clear(src, operand.regClass());
+   PhysReg dst = get_reg(ctx, register_file, operand.getTemp(), parallelcopy, instr, operand_index);
+
    Operand pc_op = operand;
    pc_op.setFixed(src);
    Definition pc_def = Definition(dst, pc_op.regClass());
@@ -1898,6 +2188,151 @@ get_reg_for_operand(ra_ctx& ctx, RegisterFile& register_file,
    update_renames(ctx, register_file, parallelcopy, instr, rename_not_killed_ops | fill_killed_ops);
 }
 
+PhysReg
+get_reg_phi(ra_ctx& ctx, IDSet& live_in, RegisterFile& register_file,
+            std::vector<aco_ptr<Instruction>>& instructions, Block& block,
+            aco_ptr<Instruction>& phi, Temp tmp)
+{
+   std::vector<std::pair<Operand, Definition>> parallelcopy;
+   PhysReg reg = get_reg(ctx, register_file, tmp, parallelcopy, phi);
+   update_renames(ctx, register_file, parallelcopy, phi, rename_not_killed_ops);
+
+   /* process parallelcopy */
+   for (std::pair<Operand, Definition> pc : parallelcopy) {
+      /* see if it's a copy from a different phi */
+      // TODO: prefer moving some previous phis over live-ins
+      // TODO: somehow prevent phis fixed before the RA from being updated (shouldn't be a
+      // problem in practice since they can only be fixed to exec)
+      Instruction* prev_phi = NULL;
+      std::vector<aco_ptr<Instruction>>::iterator phi_it;
+      for (phi_it = instructions.begin(); phi_it != instructions.end(); ++phi_it) {
+         if ((*phi_it)->definitions[0].tempId() == pc.first.tempId())
+            prev_phi = phi_it->get();
+      }
+      if (prev_phi) {
+         /* if so, just update that phi's register */
+         prev_phi->definitions[0].setFixed(pc.second.physReg());
+         register_file.fill(prev_phi->definitions[0]);
+         ctx.assignments[prev_phi->definitions[0].tempId()] = {pc.second.physReg(),
+                                                               pc.second.regClass()};
+         continue;
+      }
+
+      /* rename */
+      std::unordered_map<unsigned, Temp>::iterator orig_it = ctx.orig_names.find(pc.first.tempId());
+      Temp orig = orig_it != ctx.orig_names.end() ? orig_it->second : pc.first.getTemp();
+      ctx.orig_names[pc.second.tempId()] = orig;
+      ctx.renames[block.index][orig.id()] = pc.second.getTemp();
+
+      /* otherwise, this is a live-in and we need to create a new phi
+       * to move it in this block's predecessors */
+      aco_opcode opcode =
+         pc.first.getTemp().is_linear() ? aco_opcode::p_linear_phi : aco_opcode::p_phi;
+      Block::edge_vec& preds =
+         pc.first.getTemp().is_linear() ? block.linear_preds : block.logical_preds;
+      aco_ptr<Instruction> new_phi{create_instruction(opcode, Format::PSEUDO, preds.size(), 1)};
+      new_phi->definitions[0] = pc.second;
+      for (unsigned i = 0; i < preds.size(); i++)
+         new_phi->operands[i] = Operand(pc.first);
+      instructions.emplace_back(std::move(new_phi));
+
+      /* Remove from live_in, because handle_loop_phis() would re-create this phi later if this is
+       * a loop header.
+       */
+      live_in.erase(orig.id());
+   }
+
+   return reg;
+}
+
+void
+get_regs_for_phis(ra_ctx& ctx, Block& block, RegisterFile& register_file,
+                  std::vector<aco_ptr<Instruction>>& instructions, IDSet& live_in)
+{
+   /* move all phis to instructions */
+   for (aco_ptr<Instruction>& phi : block.instructions) {
+      if (!is_phi(phi))
+         break;
+      if (!phi->definitions[0].isKill())
+         instructions.emplace_back(std::move(phi));
+   }
+
+   /* assign phis with all-matching registers to that register */
+   for (aco_ptr<Instruction>& phi : instructions) {
+      Definition& definition = phi->definitions[0];
+      if (definition.isFixed())
+         continue;
+
+      if (!phi->operands[0].isTemp())
+         continue;
+
+      PhysReg reg = phi->operands[0].physReg();
+      auto OpsSame = [=](const Operand& op) -> bool
+      { return op.isTemp() && (!op.isFixed() || op.physReg() == reg); };
+      bool all_same = std::all_of(phi->operands.cbegin() + 1, phi->operands.cend(), OpsSame);
+      if (!all_same)
+         continue;
+
+      if (!get_reg_specified(ctx, register_file, definition.regClass(), phi, reg))
+         continue;
+
+      definition.setFixed(reg);
+      register_file.fill(definition);
+      ctx.assignments[definition.tempId()].set(definition);
+   }
+
+   /* try to find a register that is used by at least one operand */
+   for (aco_ptr<Instruction>& phi : instructions) {
+      Definition& definition = phi->definitions[0];
+      if (definition.isFixed())
+         continue;
+
+      /* use affinity if available */
+      if (ctx.assignments[definition.tempId()].affinity &&
+          ctx.assignments[ctx.assignments[definition.tempId()].affinity].assigned) {
+         assignment& affinity = ctx.assignments[ctx.assignments[definition.tempId()].affinity];
+         assert(affinity.rc == definition.regClass());
+         if (get_reg_specified(ctx, register_file, definition.regClass(), phi, affinity.reg)) {
+            definition.setFixed(affinity.reg);
+            register_file.fill(definition);
+            ctx.assignments[definition.tempId()].set(definition);
+            continue;
+         }
+      }
+
+      /* by going backwards, we aim to avoid copies in else-blocks */
+      for (int i = phi->operands.size() - 1; i >= 0; i--) {
+         const Operand& op = phi->operands[i];
+         if (!op.isTemp() || !op.isFixed())
+            continue;
+
+         PhysReg reg = op.physReg();
+         if (get_reg_specified(ctx, register_file, definition.regClass(), phi, reg)) {
+            definition.setFixed(reg);
+            register_file.fill(definition);
+            ctx.assignments[definition.tempId()].set(definition);
+            break;
+         }
+      }
+   }
+
+   /* find registers for phis where the register was blocked or no operand was assigned */
+
+   /* Don't use iterators because get_reg_phi() can add phis to the end of the vector. */
+   for (unsigned i = 0; i < instructions.size(); i++) {
+      aco_ptr<Instruction>& phi = instructions[i];
+      Definition& definition = phi->definitions[0];
+      if (definition.isFixed())
+         continue;
+
+      definition.setFixed(
+         get_reg_phi(ctx, live_in, register_file, instructions, block, phi, definition.getTemp()));
+
+      register_file.fill(definition);
+      ctx.assignments[definition.tempId()].set(definition);
+   }
+}
+
 Temp
 read_variable(ra_ctx& ctx, Temp val, unsigned block_idx)
 {
@@ -1911,7 +2346,7 @@ read_variable(ra_ctx& ctx, Temp val, unsigned block_idx)
 Temp
 handle_live_in(ra_ctx& ctx, Temp val, Block* block)
 {
-   std::vector<unsigned>& preds = val.is_linear() ? block->linear_preds : block->logical_preds;
+   Block::edge_vec& preds = val.is_linear() ? block->linear_preds : block->logical_preds;
    if (preds.size() == 0)
       return val;
 
@@ -1939,20 +2374,18 @@ handle_live_in(ra_ctx& ctx, Temp val, Block* block)
 
       /* the variable has been renamed differently in the predecessors: we need to insert a phi */
       aco_opcode opcode = val.is_linear() ? aco_opcode::p_linear_phi : aco_opcode::p_phi;
-      aco_ptr<Instruction> phi{
-         create_instruction<Pseudo_instruction>(opcode, Format::PSEUDO, preds.size(), 1)};
+      aco_ptr<Instruction> phi{create_instruction(opcode, Format::PSEUDO, preds.size(), 1)};
       new_val = ctx.program->allocateTmp(val.regClass());
       phi->definitions[0] = Definition(new_val);
+      ctx.assignments.emplace_back();
+      assert(ctx.assignments.size() == ctx.program->peekAllocationId());
       for (unsigned i = 0; i < preds.size(); i++) {
          /* update the operands so that it uses the new affinity */
          phi->operands[i] = Operand(ops[i]);
          assert(ctx.assignments[ops[i].id()].assigned);
+         assert(ops[i].regClass() == new_val.regClass());
          phi->operands[i].setFixed(ctx.assignments[ops[i].id()].reg);
-         if (ops[i].regClass() == new_val.regClass())
-            ctx.affinities[new_val.id()] = ops[i].id();
       }
-      ctx.assignments.emplace_back();
-      assert(ctx.assignments.size() == ctx.program->peekAllocationId());
       block->instructions.insert(block->instructions.begin(), std::move(phi));
    }
 
@@ -2002,7 +2435,7 @@ handle_loop_phis(ra_ctx& ctx, const IDSet& live_in, uint32_t loop_header_idx,
       aco_ptr<Instruction>& phi = loop_header.instructions[i];
       if (!is_phi(phi))
          break;
-      const std::vector<unsigned>& preds =
+      const Block::edge_vec& preds =
          phi->opcode == aco_opcode::p_phi ? loop_header.logical_preds : loop_header.linear_preds;
       for (unsigned j = 1; j < phi->operands.size(); j++) {
          Operand& op = phi->operands[j];
@@ -2093,7 +2526,7 @@ init_reg_file(ra_ctx& ctx, const std::vector<IDSet>& live_out_per_block, Block&
       for (aco_ptr<Instruction>& instr : block.instructions) {
          if (!is_phi(instr))
             break;
-         const std::vector<unsigned>& preds =
+         const Block::edge_vec& preds =
             instr->opcode == aco_opcode::p_phi ? block.logical_preds : block.linear_preds;
 
          for (unsigned i = 0; i < instr->operands.size(); i++) {
@@ -2125,8 +2558,8 @@ init_reg_file(ra_ctx& ctx, const std::vector<IDSet>& live_out_per_block, Block&
 void
 get_affinities(ra_ctx& ctx, std::vector<IDSet>& live_out_per_block)
 {
-   std::vector<std::vector<Temp>> phi_ressources;
-   std::unordered_map<unsigned, unsigned> temp_to_phi_ressources;
+   std::vector<std::vector<Temp>> phi_resources;
+   std::unordered_map<unsigned, unsigned> temp_to_phi_resources;
 
    for (auto block_rit = ctx.program->blocks.rbegin(); block_rit != ctx.program->blocks.rend();
         block_rit++) {
@@ -2138,46 +2571,48 @@ get_affinities(ra_ctx& ctx, std::vector<IDSet>& live_out_per_block)
       std::vector<aco_ptr<Instruction>>::reverse_iterator rit;
       for (rit = block.instructions.rbegin(); rit != block.instructions.rend(); ++rit) {
          aco_ptr<Instruction>& instr = *rit;
-         if (is_phi(instr)) {
-            if (instr->definitions[0].isKill() || instr->definitions[0].isFixed()) {
-               live.erase(instr->definitions[0].tempId());
-               continue;
-            }
-            /* collect information about affinity-related temporaries */
-            std::vector<Temp> affinity_related;
-            /* affinity_related[0] is the last seen affinity-related temp */
-            affinity_related.emplace_back(instr->definitions[0].getTemp());
-            affinity_related.emplace_back(instr->definitions[0].getTemp());
-            for (const Operand& op : instr->operands) {
-               if (op.isTemp() && op.isKill() &&
-                   op.regClass() == instr->definitions[0].regClass()) {
-                  affinity_related.emplace_back(op.getTemp());
-                  temp_to_phi_ressources[op.tempId()] = phi_ressources.size();
-               }
-            }
-            phi_ressources.emplace_back(std::move(affinity_related));
-         } else {
-            /* add vector affinities */
-            if (instr->opcode == aco_opcode::p_create_vector) {
-               for (const Operand& op : instr->operands) {
-                  if (op.isTemp() && op.isFirstKill() &&
-                      op.getTemp().type() == instr->definitions[0].getTemp().type())
-                     ctx.vectors[op.tempId()] = instr.get();
-               }
-            } else if (instr->format == Format::MIMG && instr->operands.size() > 4) {
-               for (unsigned i = 3; i < instr->operands.size(); i++)
-                  ctx.vectors[instr->operands[i].tempId()] = instr.get();
-            }
-
-            if (instr->opcode == aco_opcode::p_split_vector &&
-                instr->operands[0].isFirstKillBeforeDef())
-               ctx.split_vectors[instr->operands[0].tempId()] = instr.get();
+         if (is_phi(instr))
+            break;
 
-            /* add operands to live variables */
+         /* add vector affinities */
+         if (instr->opcode == aco_opcode::p_create_vector) {
             for (const Operand& op : instr->operands) {
-               if (op.isTemp())
-                  live.insert(op.tempId());
+               if (op.isTemp() && op.isFirstKill() &&
+                   op.getTemp().type() == instr->definitions[0].getTemp().type())
+                  ctx.vectors[op.tempId()] = instr.get();
             }
+         } else if (instr->format == Format::MIMG && instr->operands.size() > 4 &&
+                    !instr->mimg().strict_wqm) {
+            for (unsigned i = 3; i < instr->operands.size(); i++)
+               ctx.vectors[instr->operands[i].tempId()] = instr.get();
+         } else if (instr->opcode == aco_opcode::p_split_vector &&
+                    instr->operands[0].isFirstKillBeforeDef()) {
+            ctx.split_vectors[instr->operands[0].tempId()] = instr.get();
+         } else if (instr->isVOPC() && !instr->isVOP3()) {
+            if (!instr->isSDWA() || ctx.program->gfx_level == GFX8)
+               ctx.assignments[instr->definitions[0].tempId()].vcc = true;
+         } else if (instr->isVOP2() && !instr->isVOP3()) {
+            if (instr->operands.size() == 3 && instr->operands[2].isTemp() &&
+                instr->operands[2].regClass().type() == RegType::sgpr)
+               ctx.assignments[instr->operands[2].tempId()].vcc = true;
+            if (instr->definitions.size() == 2)
+               ctx.assignments[instr->definitions[1].tempId()].vcc = true;
+         } else if (instr->opcode == aco_opcode::s_and_b32 ||
+                    instr->opcode == aco_opcode::s_and_b64) {
+            /* If SCC is used by a branch, we might be able to use
+             * s_cbranch_vccz/s_cbranch_vccnz if the operand is VCC.
+             */
+            if (!instr->definitions[1].isKill() && instr->operands[0].isTemp() &&
+                instr->operands[1].isFixed() && instr->operands[1].physReg() == exec)
+               ctx.assignments[instr->operands[0].tempId()].vcc = true;
+         } else if (instr->opcode == aco_opcode::s_sendmsg) {
+            ctx.assignments[instr->operands[0].tempId()].m0 = true;
+         }
+
+         /* add operands to live variables */
+         for (const Operand& op : instr->operands) {
+            if (op.isTemp())
+               live.insert(op.tempId());
          }
 
          /* erase definitions from live */
@@ -2188,10 +2623,10 @@ get_affinities(ra_ctx& ctx, std::vector<IDSet>& live_out_per_block)
             live.erase(def.tempId());
             /* mark last-seen phi operand */
             std::unordered_map<unsigned, unsigned>::iterator it =
-               temp_to_phi_ressources.find(def.tempId());
-            if (it != temp_to_phi_ressources.end() &&
-                def.regClass() == phi_ressources[it->second][0].regClass()) {
-               phi_ressources[it->second][0] = def.getTemp();
+               temp_to_phi_resources.find(def.tempId());
+            if (it != temp_to_phi_resources.end() &&
+                def.regClass() == phi_resources[it->second][0].regClass()) {
+               phi_resources[it->second][0] = def.getTemp();
                /* try to coalesce phi affinities with parallelcopies */
                Operand op = Operand();
                switch (instr->opcode) {
@@ -2204,7 +2639,7 @@ get_affinities(ra_ctx& ctx, std::vector<IDSet>& live_out_per_block)
                case aco_opcode::v_fma_f32:
                case aco_opcode::v_fma_f16:
                case aco_opcode::v_pk_fma_f16:
-                  if (ctx.program->chip_class < GFX10)
+                  if (ctx.program->gfx_level < GFX10)
                      continue;
                   FALLTHROUGH;
                case aco_opcode::v_mad_f32:
@@ -2214,193 +2649,371 @@ get_affinities(ra_ctx& ctx, std::vector<IDSet>& live_out_per_block)
                   op = instr->operands[2];
                   break;
 
+               case aco_opcode::v_mad_legacy_f32:
+               case aco_opcode::v_fma_legacy_f32:
+                  if (instr->usesModifiers() || !ctx.program->dev.has_mac_legacy32)
+                     continue;
+                  op = instr->operands[2];
+                  break;
+
                default: continue;
                }
 
                if (op.isTemp() && op.isFirstKillBeforeDef() && def.regClass() == op.regClass()) {
-                  phi_ressources[it->second].emplace_back(op.getTemp());
-                  temp_to_phi_ressources[op.tempId()] = it->second;
+                  phi_resources[it->second].emplace_back(op.getTemp());
+                  temp_to_phi_resources[op.tempId()] = it->second;
+               }
+            }
+         }
+      }
+
+      /* collect phi affinities */
+      for (; rit != block.instructions.rend(); ++rit) {
+         aco_ptr<Instruction>& instr = *rit;
+         assert(is_phi(instr));
+
+         live.erase(instr->definitions[0].tempId());
+         if (instr->definitions[0].isKill() || instr->definitions[0].isFixed())
+            continue;
+
+         assert(instr->definitions[0].isTemp());
+         std::unordered_map<unsigned, unsigned>::iterator it =
+            temp_to_phi_resources.find(instr->definitions[0].tempId());
+         unsigned index = phi_resources.size();
+         std::vector<Temp>* affinity_related;
+         if (it != temp_to_phi_resources.end()) {
+            index = it->second;
+            phi_resources[index][0] = instr->definitions[0].getTemp();
+            affinity_related = &phi_resources[index];
+         } else {
+            phi_resources.emplace_back(std::vector<Temp>{instr->definitions[0].getTemp()});
+            affinity_related = &phi_resources.back();
+         }
+
+         for (const Operand& op : instr->operands) {
+            if (op.isTemp() && op.isKill() && op.regClass() == instr->definitions[0].regClass()) {
+               affinity_related->emplace_back(op.getTemp());
+               if (block.kind & block_kind_loop_header)
+                  continue;
+               temp_to_phi_resources[op.tempId()] = index;
+            }
+         }
+      }
+
+      /* visit the loop header phis first in order to create nested affinities */
+      if (block.kind & block_kind_loop_exit) {
+         /* find loop header */
+         auto header_rit = block_rit;
+         while ((header_rit + 1)->loop_nest_depth > block.loop_nest_depth)
+            header_rit++;
+
+         for (aco_ptr<Instruction>& phi : header_rit->instructions) {
+            if (!is_phi(phi))
+               break;
+            if (phi->definitions[0].isKill() || phi->definitions[0].isFixed())
+               continue;
+
+            /* create an (empty) merge-set for the phi-related variables */
+            auto it = temp_to_phi_resources.find(phi->definitions[0].tempId());
+            unsigned index = phi_resources.size();
+            if (it == temp_to_phi_resources.end()) {
+               temp_to_phi_resources[phi->definitions[0].tempId()] = index;
+               phi_resources.emplace_back(std::vector<Temp>{phi->definitions[0].getTemp()});
+            } else {
+               index = it->second;
+            }
+            for (unsigned i = 1; i < phi->operands.size(); i++) {
+               const Operand& op = phi->operands[i];
+               if (op.isTemp() && op.isKill() && op.regClass() == phi->definitions[0].regClass()) {
+                  temp_to_phi_resources[op.tempId()] = index;
                }
             }
          }
       }
    }
    /* create affinities */
-   for (std::vector<Temp>& vec : phi_ressources) {
-      assert(vec.size() > 1);
+   for (std::vector<Temp>& vec : phi_resources) {
       for (unsigned i = 1; i < vec.size(); i++)
          if (vec[i].id() != vec[0].id())
-            ctx.affinities[vec[i].id()] = vec[0].id();
+            ctx.assignments[vec[i].id()].affinity = vec[0].id();
    }
 }
 
-} /* end namespace */
+void
+optimize_encoding_vop2(Program* program, ra_ctx& ctx, RegisterFile& register_file,
+                       aco_ptr<Instruction>& instr)
+{
+   /* Try to re-encode mad/fma/dot4 as VOP2 mac/fmac/dot4c, e.g. v_mad_f32 -> v_mac_f32. */
+   if ((instr->opcode != aco_opcode::v_mad_f32 &&
+        (instr->opcode != aco_opcode::v_fma_f32 || program->gfx_level < GFX10) &&
+        instr->opcode != aco_opcode::v_mad_f16 && instr->opcode != aco_opcode::v_mad_legacy_f16 &&
+        (instr->opcode != aco_opcode::v_fma_f16 || program->gfx_level < GFX10) &&
+        (instr->opcode != aco_opcode::v_pk_fma_f16 || program->gfx_level < GFX10) &&
+        (instr->opcode != aco_opcode::v_mad_legacy_f32 || !program->dev.has_mac_legacy32) &&
+        (instr->opcode != aco_opcode::v_fma_legacy_f32 || !program->dev.has_mac_legacy32) &&
+        (instr->opcode != aco_opcode::v_dot4_i32_i8 || program->family == CHIP_VEGA20)) ||
+       !instr->operands[2].isTemp() || !instr->operands[2].isKillBeforeDef() ||
+       instr->operands[2].getTemp().type() != RegType::vgpr ||
+       (!instr->operands[0].isOfType(RegType::vgpr) &&
+        !instr->operands[1].isOfType(RegType::vgpr)) ||
+       instr->operands[2].physReg().byte() != 0 || instr->valu().opsel[2])
+      return;
+
+   if (instr->isVOP3P() && (instr->valu().opsel_lo != 0 || instr->valu().opsel_hi != 0x7))
+      return;
+
+   if ((instr->operands[0].physReg().byte() != 0 || instr->operands[1].physReg().byte() != 0 ||
+        instr->valu().opsel) &&
+       program->gfx_level < GFX11)
+      return; /* before GFX11: no byte-offset sources and no opsel */
+
+   unsigned im_mask = instr->isDPP16() ? 0x3 : 0; /* abs/neg bits tolerated with DPP16 */
+   if (instr->valu().omod || instr->valu().clamp || (instr->valu().abs & ~im_mask) ||
+       (instr->valu().neg & ~im_mask))
+      return; /* omod, clamp, or abs/neg bits outside im_mask */
+
+   if (!instr->operands[1].isOfType(RegType::vgpr))
+      instr->valu().swapOperands(0, 1); /* so that operand 1 is the VGPR one */
+
+   if (!instr->operands[0].isOfType(RegType::vgpr) && instr->valu().opsel[0])
+      return;
+
+   unsigned def_id = instr->definitions[0].tempId();
+   if (ctx.assignments[def_id].affinity) {
+      assignment& affinity = ctx.assignments[ctx.assignments[def_id].affinity];
+      if (affinity.assigned && affinity.reg != instr->operands[2].physReg() &&
+          !register_file.test(affinity.reg, instr->operands[2].bytes()))
+         return; /* presumably keeps the def free to take the affinity's register */
+   }
+
+   instr->format = (Format)(((unsigned)withoutVOP3(instr->format) & ~(unsigned)Format::VOP3P) |
+                            (unsigned)Format::VOP2); /* clear VOP3P bits, set the VOP2 bit */
+   instr->valu().opsel_hi = 0;
+   switch (instr->opcode) {
+   case aco_opcode::v_mad_f32: instr->opcode = aco_opcode::v_mac_f32; break;
+   case aco_opcode::v_fma_f32: instr->opcode = aco_opcode::v_fmac_f32; break;
+   case aco_opcode::v_mad_f16:
+   case aco_opcode::v_mad_legacy_f16: instr->opcode = aco_opcode::v_mac_f16; break;
+   case aco_opcode::v_fma_f16: instr->opcode = aco_opcode::v_fmac_f16; break;
+   case aco_opcode::v_pk_fma_f16: instr->opcode = aco_opcode::v_pk_fmac_f16; break;
+   case aco_opcode::v_dot4_i32_i8: instr->opcode = aco_opcode::v_dot4c_i32_i8; break;
+   case aco_opcode::v_mad_legacy_f32: instr->opcode = aco_opcode::v_mac_legacy_f32; break;
+   case aco_opcode::v_fma_legacy_f32: instr->opcode = aco_opcode::v_fmac_legacy_f32; break;
+   default: break;
+   }
+}
 
 void
-register_allocation(Program* program, std::vector<IDSet>& live_out_per_block, ra_test_policy policy)
+optimize_encoding_sopk(Program* program, ra_ctx& ctx, RegisterFile& register_file,
+                       aco_ptr<Instruction>& instr)
 {
-   ra_ctx ctx(program, policy);
-   get_affinities(ctx, live_out_per_block);
+   /* Try to re-encode s_add_i32/s_mul_i32/s_cselect_b32 with a literal source as SOPK. */
+   if (instr->opcode != aco_opcode::s_add_i32 && instr->opcode != aco_opcode::s_mul_i32 &&
+       instr->opcode != aco_opcode::s_cselect_b32)
+      return;
 
-   /* state of register file after phis */
-   std::vector<std::bitset<128>> sgpr_live_in(program->blocks.size());
+   uint32_t literal_idx = 0;
 
-   for (Block& block : program->blocks) {
-      ctx.block = &block;
+   if (instr->opcode != aco_opcode::s_cselect_b32 && instr->operands[1].isLiteral())
+      literal_idx = 1;
 
-      /* initialize register file */
-      RegisterFile register_file = init_reg_file(ctx, live_out_per_block, block);
-      ctx.war_hint.reset();
+   if (!instr->operands[!literal_idx].isTemp() ||
+       !instr->operands[!literal_idx].isKillBeforeDef() ||
+       instr->operands[!literal_idx].getTemp().type() != RegType::sgpr ||
+       instr->operands[!literal_idx].physReg() >= 128)
+      return;
 
-      std::vector<aco_ptr<Instruction>> instructions;
-      std::vector<aco_ptr<Instruction>>::iterator instr_it;
+   if (!instr->operands[literal_idx].isLiteral())
+      return;
 
-      /* this is a slight adjustment from the paper as we already have phi nodes:
-       * We consider them incomplete phis and only handle the definition. */
+   const uint32_t i16_mask = 0xffff8000u; /* all 0 or all 1 <=> value fits in int16 */
+   uint32_t value = instr->operands[literal_idx].constantValue();
+   if ((value & i16_mask) && (value & i16_mask) != i16_mask)
+      return;
 
-      /* look up the affinities */
-      for (instr_it = block.instructions.begin(); instr_it != block.instructions.end();
-           ++instr_it) {
-         aco_ptr<Instruction>& phi = *instr_it;
-         if (!is_phi(phi))
-            break;
-         Definition& definition = phi->definitions[0];
-         if (definition.isKill() || definition.isFixed())
-            continue;
+   unsigned def_id = instr->definitions[0].tempId();
+   if (ctx.assignments[def_id].affinity) {
+      assignment& affinity = ctx.assignments[ctx.assignments[def_id].affinity];
+      if (affinity.assigned && affinity.reg != instr->operands[!literal_idx].physReg() &&
+          !register_file.test(affinity.reg, instr->operands[!literal_idx].bytes()))
+         return; /* presumably keeps the def free to take the affinity's register */
+   }
 
-         if (ctx.affinities.find(definition.tempId()) != ctx.affinities.end() &&
-             ctx.assignments[ctx.affinities[definition.tempId()]].assigned) {
-            assert(ctx.assignments[ctx.affinities[definition.tempId()]].rc ==
-                   definition.regClass());
-            PhysReg reg = ctx.assignments[ctx.affinities[definition.tempId()]].reg;
-            if (reg == scc) {
-               /* only use scc if all operands are already placed there */
-               bool use_scc =
-                  std::all_of(phi->operands.begin(), phi->operands.end(),
-                              [](const Operand& op)
-                              { return op.isTemp() && op.isFixed() && op.physReg() == scc; });
-               if (!use_scc)
-                  continue;
-            }
+   instr->format = Format::SOPK;
+   SALU_instruction* instr_sopk = &instr->salu();
 
-            /* only assign if register is still free */
-            if (!register_file.test(reg, definition.bytes())) {
-               definition.setFixed(reg);
-               register_file.fill(definition);
-               ctx.assignments[definition.tempId()] = {definition.physReg(), definition.regClass()};
-            }
+   instr_sopk->imm = instr_sopk->operands[literal_idx].constantValue() & 0xffff;
+   if (literal_idx == 0)
+      std::swap(instr_sopk->operands[0], instr_sopk->operands[1]);
+   if (instr_sopk->operands.size() > 2)
+      std::swap(instr_sopk->operands[1], instr_sopk->operands[2]); /* literal goes last */
+   instr_sopk->operands.pop_back(); /* drop the literal, its low 16 bits are now in imm */
+
+   switch (instr_sopk->opcode) {
+   case aco_opcode::s_add_i32: instr_sopk->opcode = aco_opcode::s_addk_i32; break;
+   case aco_opcode::s_mul_i32: instr_sopk->opcode = aco_opcode::s_mulk_i32; break;
+   case aco_opcode::s_cselect_b32: instr_sopk->opcode = aco_opcode::s_cmovk_i32; break;
+   default: unreachable("illegal instruction");
+   }
+}
+
+void
+optimize_encoding(Program* program, ra_ctx& ctx, RegisterFile& register_file,
+                  aco_ptr<Instruction>& instr)
+{
+   if (instr->isVALU()) /* VALU: attempt the VOP2 re-encoding */
+      optimize_encoding_vop2(program, ctx, register_file, instr);
+   if (instr->isSALU()) /* SALU: attempt the SOPK re-encoding */
+      optimize_encoding_sopk(program, ctx, register_file, instr);
+}
+
+void
+emit_parallel_copy_internal(ra_ctx& ctx, std::vector<std::pair<Operand, Definition>>& parallelcopy,
+                            aco_ptr<Instruction>& instr,
+                            std::vector<aco_ptr<Instruction>>& instructions, bool temp_in_scc,
+                            RegisterFile& register_file)
+{
+   if (parallelcopy.empty())
+      return;
+
+   aco_ptr<Instruction> pc;
+   pc.reset(create_instruction(aco_opcode::p_parallelcopy, Format::PSEUDO, parallelcopy.size(),
+                               parallelcopy.size()));
+   bool linear_vgpr = false; /* set if any copy source is a linear VGPR */
+   bool sgpr_operands_alias_defs = false; /* a copy dest overlaps an SGPR source seen so far */
+   uint64_t sgpr_operands[4] = {0, 0, 0, 0}; /* bitset of SGPRs read by the copies so far */
+   for (unsigned i = 0; i < parallelcopy.size(); i++) {
+      linear_vgpr |= parallelcopy[i].first.regClass().is_linear_vgpr();
+
+      if (temp_in_scc && parallelcopy[i].first.isTemp() &&
+          parallelcopy[i].first.getTemp().type() == RegType::sgpr) {
+         if (!sgpr_operands_alias_defs) {
+            unsigned reg = parallelcopy[i].first.physReg().reg();
+            unsigned size = parallelcopy[i].first.getTemp().size();
+            sgpr_operands[reg / 64u] |= u_bit_consecutive64(reg % 64u, size);
+
+            reg = parallelcopy[i].second.physReg().reg();
+            size = parallelcopy[i].second.getTemp().size();
+            if (sgpr_operands[reg / 64u] & u_bit_consecutive64(reg % 64u, size))
+               sgpr_operands_alias_defs = true;
          }
       }
 
-      /* find registers for phis without affinity or where the register was blocked */
-      for (instr_it = block.instructions.begin(); instr_it != block.instructions.end();
-           ++instr_it) {
-         aco_ptr<Instruction>& phi = *instr_it;
-         if (!is_phi(phi))
-            break;
+      pc->operands[i] = parallelcopy[i].first;
+      pc->definitions[i] = parallelcopy[i].second;
+      assert(pc->operands[i].size() == pc->definitions[i].size());
 
-         Definition& definition = phi->definitions[0];
-         if (definition.isKill())
-            continue;
+      /* The operand may already have been renamed: look up its original name so that
+       * the rename for this block is recorded against it. */
+      std::unordered_map<unsigned, Temp>::iterator it =
+         ctx.orig_names.find(pc->operands[i].tempId());
+      Temp orig = it != ctx.orig_names.end() ? it->second : pc->operands[i].getTemp();
+      ctx.orig_names[pc->definitions[i].tempId()] = orig;
+      ctx.renames[ctx.block->index][orig.id()] = pc->definitions[i].getTemp();
+   }
 
-         if (!definition.isFixed()) {
-            std::vector<std::pair<Operand, Definition>> parallelcopy;
-            /* try to find a register that is used by at least one operand */
-            for (int i = phi->operands.size() - 1; i >= 0; i--) {
-               /* by going backwards, we aim to avoid copies in else-blocks */
-               const Operand& op = phi->operands[i];
-               if (!op.isTemp() || !op.isFixed())
-                  continue;
-               PhysReg reg = op.physReg();
-               /* we tried this already on the previous loop */
-               if (reg == scc)
-                  continue;
-               if (get_reg_specified(ctx, register_file, definition.regClass(), phi, reg)) {
-                  definition.setFixed(reg);
-                  break;
-               }
-            }
-            if (!definition.isFixed()) {
-               definition.setFixed(
-                  get_reg(ctx, register_file, definition.getTemp(), parallelcopy, phi));
-               update_renames(ctx, register_file, parallelcopy, phi, rename_not_killed_ops);
-            }
+   if (temp_in_scc && (sgpr_operands_alias_defs || linear_vgpr)) {
+      /* on a copy: clear instr's non-killed definitions, re-block its first-kill operands */
+      RegisterFile tmp_file(register_file);
+      for (const Definition& def : instr->definitions) {
+         if (def.isTemp() && !def.isKill())
+            tmp_file.clear(def);
+      }
+      for (const Operand& op : instr->operands) {
+         if (op.isTemp() && op.isFirstKill())
+            tmp_file.block(op.physReg(), op.regClass());
+      }
 
-            /* process parallelcopy */
-            for (std::pair<Operand, Definition> pc : parallelcopy) {
-               /* see if it's a copy from a different phi */
-               // TODO: prefer moving some previous phis over live-ins
-               // TODO: somehow prevent phis fixed before the RA from being updated (shouldn't be a
-               // problem in practice since they can only be fixed to exec)
-               Instruction* prev_phi = NULL;
-               std::vector<aco_ptr<Instruction>>::iterator phi_it;
-               for (phi_it = instructions.begin(); phi_it != instructions.end(); ++phi_it) {
-                  if ((*phi_it)->definitions[0].tempId() == pc.first.tempId())
-                     prev_phi = phi_it->get();
-               }
-               phi_it = instr_it;
-               while (!prev_phi && is_phi(*++phi_it)) {
-                  if ((*phi_it)->definitions[0].tempId() == pc.first.tempId())
-                     prev_phi = phi_it->get();
-               }
-               if (prev_phi) {
-                  /* if so, just update that phi's register */
-                  register_file.clear(prev_phi->definitions[0]);
-                  prev_phi->definitions[0].setFixed(pc.second.physReg());
-                  ctx.assignments[prev_phi->definitions[0].tempId()] = {pc.second.physReg(),
-                                                                        pc.second.regClass()};
-                  register_file.fill(prev_phi->definitions[0]);
-                  continue;
-               }
+      handle_pseudo(ctx, tmp_file, pc.get());
+   } else {
+      pc->pseudo().needs_scratch_reg = sgpr_operands_alias_defs || linear_vgpr;
+      pc->pseudo().tmp_in_scc = false;
+   }
 
-               /* rename */
-               std::unordered_map<unsigned, Temp>::iterator orig_it =
-                  ctx.orig_names.find(pc.first.tempId());
-               Temp orig = pc.first.getTemp();
-               if (orig_it != ctx.orig_names.end())
-                  orig = orig_it->second;
-               else
-                  ctx.orig_names[pc.second.tempId()] = orig;
-               ctx.renames[block.index][orig.id()] = pc.second.getTemp();
-
-               /* otherwise, this is a live-in and we need to create a new phi
-                * to move it in this block's predecessors */
-               aco_opcode opcode =
-                  pc.first.getTemp().is_linear() ? aco_opcode::p_linear_phi : aco_opcode::p_phi;
-               std::vector<unsigned>& preds =
-                  pc.first.getTemp().is_linear() ? block.linear_preds : block.logical_preds;
-               aco_ptr<Instruction> new_phi{
-                  create_instruction<Pseudo_instruction>(opcode, Format::PSEUDO, preds.size(), 1)};
-               new_phi->definitions[0] = pc.second;
-               for (unsigned i = 0; i < preds.size(); i++)
-                  new_phi->operands[i] = Operand(pc.first);
-               instructions.emplace_back(std::move(new_phi));
-
-               /* Remove from live_out_per_block (now used for live-in), because handle_loop_phis()
-                * would re-create this phi later if this is a loop header.
-                */
-               live_out_per_block[block.index].erase(orig.id());
-            }
+   instructions.emplace_back(std::move(pc));
 
-            register_file.fill(definition);
-            ctx.assignments[definition.tempId()] = {definition.physReg(), definition.regClass()};
-         }
+   parallelcopy.clear(); /* the pairs have been consumed by the emitted instruction */
+}
 
-         /* update phi affinities */
-         for (const Operand& op : phi->operands) {
-            if (op.isTemp() && op.regClass() == phi->definitions[0].regClass())
-               ctx.affinities[op.tempId()] = definition.tempId();
+void
+emit_parallel_copy(ra_ctx& ctx, std::vector<std::pair<Operand, Definition>>& parallelcopy,
+                   aco_ptr<Instruction>& instr, std::vector<aco_ptr<Instruction>>& instructions,
+                   bool temp_in_scc, RegisterFile& register_file)
+{
+   if (parallelcopy.empty())
+      return;
+
+   std::vector<std::pair<Operand, Definition>> linear_vgpr; /* copies from linear VGPRs */
+   if (ctx.num_linear_vgprs) {
+      unsigned next = 0; /* write index used to compact the remaining copies in place */
+      for (unsigned i = 0; i < parallelcopy.size(); i++) {
+         if (parallelcopy[i].first.regClass().is_linear_vgpr()) {
+            linear_vgpr.push_back(parallelcopy[i]);
+            continue;
          }
 
-         instructions.emplace_back(std::move(*instr_it));
+         if (next != i)
+            parallelcopy[next] = parallelcopy[i];
+         next++;
       }
+      parallelcopy.resize(next);
+   }
+
+   /* Because of how linear VGPRs are allocated, we should never have to move a linear VGPR into the
+    * space of a normal one. This means the copy can be done entirely before normal VGPR copies. */
+   emit_parallel_copy_internal(ctx, linear_vgpr, instr, instructions, temp_in_scc,
+                               register_file);
+   emit_parallel_copy_internal(ctx, parallelcopy, instr, instructions, temp_in_scc,
+                               register_file);
+}
+
+} /* end namespace */
+
+void
+register_allocation(Program* program, live& live_vars, ra_test_policy policy)
+{
+   std::vector<IDSet>& live_out_per_block = live_vars.live_out;
+   ra_ctx ctx(program, policy);
+   get_affinities(ctx, live_out_per_block);
 
-      /* fill in sgpr_live_in */
-      for (unsigned i = 0; i <= ctx.max_used_sgpr; i++)
-         sgpr_live_in[block.index][i] = register_file[PhysReg{i}];
-      sgpr_live_in[block.index][127] = register_file[scc];
+   for (Block& block : program->blocks) {
+      ctx.block = &block;
+
+      /* initialize register file */
+      RegisterFile register_file = init_reg_file(ctx, live_out_per_block, block);
+      ctx.war_hint.reset();
+
+      std::vector<aco_ptr<Instruction>> instructions;
+      instructions.reserve(block.instructions.size());
+
+      /* this is a slight adjustment from the paper as we already have phi nodes:
+       * We consider them incomplete phis and only handle the definition. */
+      get_regs_for_phis(ctx, block, register_file, instructions, live_out_per_block[block.index]);
+
+      /* If this is a merge block, the state of the register file at the branch instruction of the
+       * predecessors corresponds to the state after phis at the merge block. So, we allocate a
+       * register for the predecessor's branch definitions as if there was a phi.
+       */
+      if (!block.linear_preds.empty() &&
+          (block.linear_preds.size() != 1 ||
+           program->blocks[block.linear_preds[0]].linear_succs.size() == 1)) {
+         PhysReg br_reg = get_reg_phi(ctx, live_out_per_block[block.index], register_file,
+                                      instructions, block, ctx.phi_dummy, Temp(0, s2));
+         for (unsigned pred : block.linear_preds) {
+            program->blocks[pred].scc_live_out = register_file[scc];
+            aco_ptr<Instruction>& br = program->blocks[pred].instructions.back();
+
+            assert(br->definitions.size() == 1 && br->definitions[0].regClass() == s2 &&
+                   br->definitions[0].isKill());
+
+            br->definitions[0].setFixed(br_reg);
+         }
+      }
 
       /* Handle all other instructions of the block */
+      auto NonPhi = [](aco_ptr<Instruction>& instr) -> bool { return instr && !is_phi(instr); };
+      std::vector<aco_ptr<Instruction>>::iterator instr_it =
+         std::find_if(block.instructions.begin(), block.instructions.end(), NonPhi);
       for (; instr_it != block.instructions.end(); ++instr_it) {
          aco_ptr<Instruction>& instr = *instr_it;
 
@@ -2438,12 +3051,18 @@ register_allocation(Program* program, std::vector<IDSet>& live_out_per_block, ra
          }
 
          std::vector<std::pair<Operand, Definition>> parallelcopy;
+         bool temp_in_scc = register_file[scc];
 
-         assert(!is_phi(instr));
+         if (instr->opcode == aco_opcode::p_branch) {
+            /* unconditional branches are handled after phis of the target */
+            instructions.emplace_back(std::move(instr));
+            break;
+         }
 
-         bool temp_in_scc = register_file[scc];
+         assert(!is_phi(instr));
 
          /* handle operands */
+         bool fixed = false;
          for (unsigned i = 0; i < instr->operands.size(); ++i) {
             auto& operand = instr->operands[i];
             if (!operand.isTemp())
@@ -2453,13 +3072,37 @@ register_allocation(Program* program, std::vector<IDSet>& live_out_per_block, ra
             operand.setTemp(read_variable(ctx, operand.getTemp(), block.index));
             assert(ctx.assignments[operand.tempId()].assigned);
 
+            fixed |=
+               operand.isFixed() && ctx.assignments[operand.tempId()].reg != operand.physReg();
+         }
+
+         bool is_writelane = instr->opcode == aco_opcode::v_writelane_b32 ||
+                             instr->opcode == aco_opcode::v_writelane_b32_e64;
+         if (program->gfx_level <= GFX9 && is_writelane && instr->operands[0].isTemp() &&
+             instr->operands[1].isTemp()) {
+            /* v_writelane_b32 can take two sgprs but only if one is m0. */
+            if (ctx.assignments[instr->operands[0].tempId()].reg != m0 &&
+                ctx.assignments[instr->operands[1].tempId()].reg != m0) {
+               instr->operands[0].setFixed(m0);
+               fixed = true;
+            }
+         }
+
+         if (fixed)
+            handle_fixed_operands(ctx, register_file, parallelcopy, instr);
+
+         for (unsigned i = 0; i < instr->operands.size(); ++i) {
+            auto& operand = instr->operands[i];
+            if (!operand.isTemp() || operand.isFixed())
+               continue;
+
             PhysReg reg = ctx.assignments[operand.tempId()].reg;
-            if (operand_can_use_reg(program->chip_class, instr, i, reg, operand.regClass()))
+            if (operand_can_use_reg(program->gfx_level, instr, i, reg, operand.regClass()))
                operand.setFixed(reg);
             else
                get_reg_for_operand(ctx, register_file, parallelcopy, instr, operand, i);
 
-            if (instr->isEXP() || (instr->isVMEM() && i == 3 && ctx.program->chip_class == GFX6) ||
+            if (instr->isEXP() || (instr->isVMEM() && i == 3 && ctx.program->gfx_level == GFX6) ||
                 (instr->isDS() && instr->ds().gds)) {
                for (unsigned j = 0; j < operand.size(); j++)
                   ctx.war_hint.set(operand.physReg().reg() + j);
@@ -2472,59 +3115,17 @@ register_allocation(Program* program, std::vector<IDSet>& live_out_per_block, ra
                register_file.clear(op);
          }
 
-         /* try to optimize v_mad_f32 -> v_mac_f32 */
-         if ((instr->opcode == aco_opcode::v_mad_f32 ||
-              (instr->opcode == aco_opcode::v_fma_f32 && program->chip_class >= GFX10) ||
-              instr->opcode == aco_opcode::v_mad_f16 ||
-              instr->opcode == aco_opcode::v_mad_legacy_f16 ||
-              (instr->opcode == aco_opcode::v_fma_f16 && program->chip_class >= GFX10) ||
-              (instr->opcode == aco_opcode::v_pk_fma_f16 && program->chip_class >= GFX10) ||
-              (instr->opcode == aco_opcode::v_dot4_i32_i8 && program->family != CHIP_VEGA20)) &&
-             instr->operands[2].isTemp() && instr->operands[2].isKillBeforeDef() &&
-             instr->operands[2].getTemp().type() == RegType::vgpr && instr->operands[1].isTemp() &&
-             instr->operands[1].getTemp().type() == RegType::vgpr && !instr->usesModifiers() &&
-             instr->operands[0].physReg().byte() == 0 && instr->operands[1].physReg().byte() == 0 &&
-             instr->operands[2].physReg().byte() == 0) {
-            unsigned def_id = instr->definitions[0].tempId();
-            auto it = ctx.affinities.find(def_id);
-            if (it == ctx.affinities.end() || !ctx.assignments[it->second].assigned ||
-                instr->operands[2].physReg() == ctx.assignments[it->second].reg ||
-                register_file.test(ctx.assignments[it->second].reg, instr->operands[2].bytes())) {
-               instr->format = Format::VOP2;
-               switch (instr->opcode) {
-               case aco_opcode::v_mad_f32: instr->opcode = aco_opcode::v_mac_f32; break;
-               case aco_opcode::v_fma_f32: instr->opcode = aco_opcode::v_fmac_f32; break;
-               case aco_opcode::v_mad_f16:
-               case aco_opcode::v_mad_legacy_f16: instr->opcode = aco_opcode::v_mac_f16; break;
-               case aco_opcode::v_fma_f16: instr->opcode = aco_opcode::v_fmac_f16; break;
-               case aco_opcode::v_pk_fma_f16: instr->opcode = aco_opcode::v_pk_fmac_f16; break;
-               case aco_opcode::v_dot4_i32_i8: instr->opcode = aco_opcode::v_dot4c_i32_i8; break;
-               default: break;
-               }
-            }
-         }
-
-         /* handle definitions which must have the same register as an operand */
-         if (instr->opcode == aco_opcode::v_interp_p2_f32 ||
-             instr->opcode == aco_opcode::v_mac_f32 || instr->opcode == aco_opcode::v_fmac_f32 ||
-             instr->opcode == aco_opcode::v_mac_f16 || instr->opcode == aco_opcode::v_fmac_f16 ||
-             instr->opcode == aco_opcode::v_pk_fmac_f16 ||
-             instr->opcode == aco_opcode::v_writelane_b32 ||
-             instr->opcode == aco_opcode::v_writelane_b32_e64 ||
-             instr->opcode == aco_opcode::v_dot4c_i32_i8) {
-            instr->definitions[0].setFixed(instr->operands[2].physReg());
-         } else if (instr->opcode == aco_opcode::s_addk_i32 ||
-                    instr->opcode == aco_opcode::s_mulk_i32) {
-            instr->definitions[0].setFixed(instr->operands[0].physReg());
-         } else if (instr->isMUBUF() && instr->definitions.size() == 1 &&
-                    instr->operands.size() == 4) {
-            instr->definitions[0].setFixed(instr->operands[3].physReg());
-         } else if (instr->isMIMG() && instr->definitions.size() == 1 &&
-                    !instr->operands[2].isUndefined()) {
-            instr->definitions[0].setFixed(instr->operands[2].physReg());
-         }
+         optimize_encoding(program, ctx, register_file, instr);
 
-         ctx.defs_done.reset();
+         /* Handle definitions which must have the same register as an operand.
+          * We expect that the definition has the same size as the operand, otherwise the new
+          * location for the operand (if it's not killed) might intersect with the old one.
+          * We can't read from the old location because it's corrupted, and we can't write the new
+          * location because that's used by a live-through operand.
+          */
+         int op_fixed_to_def = get_op_fixed_to_def(instr.get());
+         if (op_fixed_to_def != -1)
+            instr->definitions[0].setFixed(instr->operands[op_fixed_to_def].physReg());
 
          /* handle fixed definitions first */
          for (unsigned i = 0; i < instr->definitions.size(); ++i) {
@@ -2538,8 +3139,7 @@ register_allocation(Program* program, std::vector<IDSet>& live_out_per_block, ra
                const PhysRegInterval def_regs{definition.physReg(), definition.size()};
 
                /* create parallelcopy pair to move blocking vars */
-               std::set<std::pair<unsigned, unsigned>> vars =
-                  collect_vars(ctx, register_file, def_regs);
+               std::vector<unsigned> vars = collect_vars(ctx, register_file, def_regs);
 
                RegisterFile tmp_file(register_file);
                /* re-enable the killed operands, so that we don't move the blocking vars there */
@@ -2549,19 +3149,16 @@ register_allocation(Program* program, std::vector<IDSet>& live_out_per_block, ra
                }
 
                ASSERTED bool success = false;
-               DefInfo info(ctx, instr, definition.regClass(), -1);
-               success = get_regs_for_copies(ctx, tmp_file, parallelcopy, vars, info.bounds, instr,
-                                             def_regs);
+               success = get_regs_for_copies(ctx, tmp_file, parallelcopy, vars, instr, def_regs);
                assert(success);
 
                update_renames(ctx, register_file, parallelcopy, instr, (UpdateRenames)0);
             }
-            ctx.defs_done.set(i);
 
             if (!definition.isTemp())
                continue;
 
-            ctx.assignments[definition.tempId()] = {definition.physReg(), definition.regClass()};
+            ctx.assignments[definition.tempId()].set(definition);
             register_file.fill(definition);
          }
 
@@ -2573,18 +3170,30 @@ register_allocation(Program* program, std::vector<IDSet>& live_out_per_block, ra
                continue;
 
             /* find free reg */
-            if (definition->hasHint() &&
-                get_reg_specified(ctx, register_file, definition->regClass(), instr,
-                                  definition->physReg())) {
-               definition->setFixed(definition->physReg());
+            if (instr->opcode == aco_opcode::p_start_linear_vgpr) {
+               /* Allocation of linear VGPRs is special. */
+               definition->setFixed(alloc_linear_vgpr(ctx, register_file, instr, parallelcopy));
+               update_renames(ctx, register_file, parallelcopy, instr, rename_not_killed_ops);
             } else if (instr->opcode == aco_opcode::p_split_vector) {
                PhysReg reg = instr->operands[0].physReg();
+               RegClass rc = definition->regClass();
                for (unsigned j = 0; j < i; j++)
                   reg.reg_b += instr->definitions[j].bytes();
-               if (get_reg_specified(ctx, register_file, definition->regClass(), instr, reg))
+               if (get_reg_specified(ctx, register_file, rc, instr, reg)) {
                   definition->setFixed(reg);
-            } else if (instr->opcode == aco_opcode::p_wqm ||
-                       instr->opcode == aco_opcode::p_parallelcopy) {
+               } else if (i == 0) {
+                  RegClass vec_rc = RegClass::get(rc.type(), instr->operands[0].bytes());
+                  DefInfo info(ctx, ctx.pseudo_dummy, vec_rc, -1);
+                  std::optional<PhysReg> res = get_reg_simple(ctx, register_file, info);
+                  if (res && get_reg_specified(ctx, register_file, rc, instr, *res))
+                     definition->setFixed(*res);
+               } else if (instr->definitions[i - 1].isFixed()) {
+                  reg = instr->definitions[i - 1].physReg();
+                  reg.reg_b += instr->definitions[i - 1].bytes();
+                  if (get_reg_specified(ctx, register_file, rc, instr, reg))
+                     definition->setFixed(reg);
+               }
+            } else if (instr->opcode == aco_opcode::p_parallelcopy) {
                PhysReg reg = instr->operands[i].physReg();
                if (instr->operands[i].isTemp() &&
                    instr->operands[i].getTemp().type() == definition->getTemp().type() &&
@@ -2600,6 +3209,14 @@ register_allocation(Program* program, std::vector<IDSet>& live_out_per_block, ra
                                                    parallelcopy, instr);
                update_renames(ctx, register_file, parallelcopy, instr, (UpdateRenames)0);
                definition->setFixed(reg);
+            } else if (instr_info.classes[(int)instr->opcode] == instr_class::wmma &&
+                       instr->operands[2].isTemp() && instr->operands[2].isKill() &&
+                       instr->operands[2].regClass() == definition->regClass()) {
+               /* For WMMA, the dest needs to either be equal to operands[2], or not overlap it.
+                * Here we set a policy of forcing them the same if operands[2] gets killed (and
+                * otherwise they don't overlap). This may not be optimal if RA would select a
+                * different location due to affinity, but that gets complicated very quickly. */
+               definition->setFixed(instr->operands[2].physReg());
             }
 
             if (!definition->isFixed()) {
@@ -2608,7 +3225,8 @@ register_allocation(Program* program, std::vector<IDSet>& live_out_per_block, ra
                   PhysReg reg = get_reg(ctx, register_file, tmp, parallelcopy, instr);
                   definition->setFixed(reg);
                   if (reg.byte() || register_file.test(reg, 4)) {
-                     add_subdword_definition(program, instr, reg);
+                     bool allow_16bit_write = reg.byte() % 2 == 0 && !register_file.test(reg, 2);
+                     add_subdword_definition(program, instr, reg, allow_16bit_write);
                      definition = &instr->definitions[i]; /* add_subdword_definition can invalidate
                                                              the reference */
                   }
@@ -2624,8 +3242,7 @@ register_allocation(Program* program, std::vector<IDSet>& live_out_per_block, ra
                definition->isFixed() &&
                ((definition->getTemp().type() == RegType::vgpr && definition->physReg() >= 256) ||
                 (definition->getTemp().type() != RegType::vgpr && definition->physReg() < 256)));
-            ctx.defs_done.set(i);
-            ctx.assignments[definition->tempId()] = {definition->physReg(), definition->regClass()};
+            ctx.assignments[definition->tempId()].set(*definition);
             register_file.fill(*definition);
          }
 
@@ -2645,87 +3262,30 @@ register_allocation(Program* program, std::vector<IDSet>& live_out_per_block, ra
                add_subdword_operand(ctx, instr, i, op.physReg().byte(), op.regClass());
          }
 
-         /* emit parallelcopy */
-         if (!parallelcopy.empty()) {
-            aco_ptr<Pseudo_instruction> pc;
-            pc.reset(create_instruction<Pseudo_instruction>(aco_opcode::p_parallelcopy,
-                                                            Format::PSEUDO, parallelcopy.size(),
-                                                            parallelcopy.size()));
-            bool linear_vgpr = false;
-            bool sgpr_operands_alias_defs = false;
-            uint64_t sgpr_operands[4] = {0, 0, 0, 0};
-            for (unsigned i = 0; i < parallelcopy.size(); i++) {
-               linear_vgpr |= parallelcopy[i].first.regClass().is_linear_vgpr();
-
-               if (temp_in_scc && parallelcopy[i].first.isTemp() &&
-                   parallelcopy[i].first.getTemp().type() == RegType::sgpr) {
-                  if (!sgpr_operands_alias_defs) {
-                     unsigned reg = parallelcopy[i].first.physReg().reg();
-                     unsigned size = parallelcopy[i].first.getTemp().size();
-                     sgpr_operands[reg / 64u] |= u_bit_consecutive64(reg % 64u, size);
-
-                     reg = parallelcopy[i].second.physReg().reg();
-                     size = parallelcopy[i].second.getTemp().size();
-                     if (sgpr_operands[reg / 64u] & u_bit_consecutive64(reg % 64u, size))
-                        sgpr_operands_alias_defs = true;
-                  }
-               }
-
-               pc->operands[i] = parallelcopy[i].first;
-               pc->definitions[i] = parallelcopy[i].second;
-               assert(pc->operands[i].size() == pc->definitions[i].size());
-
-               /* it might happen that the operand is already renamed. we have to restore the
-                * original name. */
-               std::unordered_map<unsigned, Temp>::iterator it =
-                  ctx.orig_names.find(pc->operands[i].tempId());
-               Temp orig = it != ctx.orig_names.end() ? it->second : pc->operands[i].getTemp();
-               ctx.orig_names[pc->definitions[i].tempId()] = orig;
-               ctx.renames[block.index][orig.id()] = pc->definitions[i].getTemp();
-            }
-
-            if (temp_in_scc && (sgpr_operands_alias_defs || linear_vgpr)) {
-               /* disable definitions and re-enable operands */
-               RegisterFile tmp_file(register_file);
-               for (const Definition& def : instr->definitions) {
-                  if (def.isTemp() && !def.isKill())
-                     tmp_file.clear(def);
-               }
-               for (const Operand& op : instr->operands) {
-                  if (op.isTemp() && op.isFirstKill())
-                     tmp_file.block(op.physReg(), op.regClass());
-               }
-
-               handle_pseudo(ctx, tmp_file, pc.get());
-            } else {
-               pc->tmp_in_scc = false;
-            }
-
-            instructions.emplace_back(std::move(pc));
-         }
+         emit_parallel_copy(ctx, parallelcopy, instr, instructions, temp_in_scc, register_file);
 
          /* some instructions need VOP3 encoding if operand/definition is not assigned to VCC */
          bool instr_needs_vop3 =
             !instr->isVOP3() &&
-            ((instr->format == Format::VOPC && !(instr->definitions[0].physReg() == vcc)) ||
-             (instr->opcode == aco_opcode::v_cndmask_b32 &&
-              !(instr->operands[2].physReg() == vcc)) ||
+            ((withoutDPP(instr->format) == Format::VOPC &&
+              instr->definitions[0].physReg() != vcc) ||
+             (instr->opcode == aco_opcode::v_cndmask_b32 && instr->operands[2].physReg() != vcc) ||
              ((instr->opcode == aco_opcode::v_add_co_u32 ||
                instr->opcode == aco_opcode::v_addc_co_u32 ||
                instr->opcode == aco_opcode::v_sub_co_u32 ||
                instr->opcode == aco_opcode::v_subb_co_u32 ||
                instr->opcode == aco_opcode::v_subrev_co_u32 ||
                instr->opcode == aco_opcode::v_subbrev_co_u32) &&
-              !(instr->definitions[1].physReg() == vcc)) ||
+              instr->definitions[1].physReg() != vcc) ||
              ((instr->opcode == aco_opcode::v_addc_co_u32 ||
                instr->opcode == aco_opcode::v_subb_co_u32 ||
                instr->opcode == aco_opcode::v_subbrev_co_u32) &&
-              !(instr->operands[2].physReg() == vcc)));
+              instr->operands[2].physReg() != vcc));
          if (instr_needs_vop3) {
 
             /* if the first operand is a literal, we have to move it to a reg */
             if (instr->operands.size() && instr->operands[0].isLiteral() &&
-                program->chip_class < GFX10) {
+                program->gfx_level < GFX10) {
                bool can_sgpr = true;
                /* check, if we have to move to vgpr */
                for (const Operand& op : instr->operands) {
@@ -2749,11 +3309,9 @@ register_allocation(Program* program, std::vector<IDSet>& live_out_per_block, ra
 
                aco_ptr<Instruction> mov;
                if (can_sgpr)
-                  mov.reset(create_instruction<SOP1_instruction>(aco_opcode::s_mov_b32,
-                                                                 Format::SOP1, 1, 1));
+                  mov.reset(create_instruction(aco_opcode::s_mov_b32, Format::SOP1, 1, 1));
                else
-                  mov.reset(create_instruction<VOP1_instruction>(aco_opcode::v_mov_b32,
-                                                                 Format::VOP1, 1, 1));
+                  mov.reset(create_instruction(aco_opcode::v_mov_b32, Format::VOP1, 1, 1));
                mov->operands[0] = instr->operands[0];
                mov->definitions[0] = Definition(tmp);
                mov->definitions[0].setFixed(reg);
@@ -2766,47 +3324,42 @@ register_allocation(Program* program, std::vector<IDSet>& live_out_per_block, ra
             }
 
             /* change the instruction to VOP3 to enable an arbitrary register pair as dst */
-            aco_ptr<Instruction> tmp = std::move(instr);
-            Format format = asVOP3(tmp->format);
-            instr.reset(create_instruction<VOP3_instruction>(
-               tmp->opcode, format, tmp->operands.size(), tmp->definitions.size()));
-            std::copy(tmp->operands.begin(), tmp->operands.end(), instr->operands.begin());
-            std::copy(tmp->definitions.begin(), tmp->definitions.end(), instr->definitions.begin());
+            instr->format = asVOP3(instr->format);
          }
 
          instructions.emplace_back(std::move(*instr_it));
 
       } /* end for Instr */
 
-      block.instructions = std::move(instructions);
-   } /* end for BB */
+      if ((block.kind & block_kind_top_level) && block.linear_succs.empty()) {
+         /* Reset this for block_kind_resume. */
+         ctx.num_linear_vgprs = 0;
 
-   /* find scc spill registers which may be needed for parallelcopies created by phis */
-   for (Block& block : program->blocks) {
-      if (block.linear_preds.size() <= 1)
-         continue;
+         ASSERTED PhysRegInterval vgpr_bounds = get_reg_bounds(ctx, RegType::vgpr, false);
+         ASSERTED PhysRegInterval sgpr_bounds = get_reg_bounds(ctx, RegType::sgpr, false);
+         assert(register_file.count_zero(vgpr_bounds) == ctx.vgpr_bounds);
+         assert(register_file.count_zero(sgpr_bounds) == ctx.sgpr_bounds);
+      } else if (should_compact_linear_vgprs(ctx, live_vars, register_file)) {
+         aco_ptr<Instruction> br = std::move(instructions.back());
+         instructions.pop_back();
 
-      std::bitset<128> regs = sgpr_live_in[block.index];
-      if (!regs[127])
-         continue;
+         bool temp_in_scc =
+            register_file[scc] || (!br->operands.empty() && br->operands[0].physReg() == scc);
 
-      /* choose a register */
-      int16_t reg = 0;
-      for (; reg < ctx.program->max_reg_demand.sgpr && regs[reg]; reg++)
-         ;
-      assert(reg < ctx.program->max_reg_demand.sgpr);
-      adjust_max_used_regs(ctx, s1, reg);
+         std::vector<std::pair<Operand, Definition>> parallelcopy;
+         compact_linear_vgprs(ctx, register_file, parallelcopy);
+         update_renames(ctx, register_file, parallelcopy, br, rename_not_killed_ops);
+         emit_parallel_copy_internal(ctx, parallelcopy, br, instructions, temp_in_scc, register_file);
 
-      /* update predecessors */
-      for (unsigned& pred_index : block.linear_preds) {
-         Block& pred = program->blocks[pred_index];
-         pred.scc_live_out = true;
-         pred.scratch_sgpr = PhysReg{(uint16_t)reg};
+         instructions.push_back(std::move(br));
       }
-   }
+
+      block.instructions = std::move(instructions);
+   } /* end for BB */
 
    /* num_gpr = rnd_up(max_used_gpr + 1) */
-   program->config->num_vgprs = get_vgpr_alloc(program, ctx.max_used_vgpr + 1);
+   program->config->num_vgprs =
+      std::min<uint16_t>(get_vgpr_alloc(program, ctx.max_used_vgpr + 1), 256);
    program->config->num_sgprs = get_sgpr_alloc(program, ctx.max_used_sgpr + 1);
 
    program->progress = CompilationProgress::after_ra;