Diffstat (limited to 'src/amd/compiler/aco_insert_waitcnt.cpp')
-rw-r--r-- | src/amd/compiler/aco_insert_waitcnt.cpp | 701 |
1 file changed, 552 insertions, 149 deletions
diff --git a/src/amd/compiler/aco_insert_waitcnt.cpp b/src/amd/compiler/aco_insert_waitcnt.cpp
index d7fc87c126d..ae94582f6ce 100644
--- a/src/amd/compiler/aco_insert_waitcnt.cpp
+++ b/src/amd/compiler/aco_insert_waitcnt.cpp
@@ -1,27 +1,10 @@
 /*
  * Copyright © 2018 Valve Corporation
  *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- *
+ * SPDX-License-Identifier: MIT
  */
+#include "aco_builder.h"
 #include "aco_ir.h"
 
 #include "common/sid.h"
@@ -29,6 +12,7 @@
 #include <map>
 #include <stack>
 #include <vector>
+#include <optional>
 
 namespace aco {
 
@@ -68,7 +52,11 @@ enum wait_event : uint16_t {
    event_gds_gpr_lock = 1 << 9,
    event_vmem_gpr_lock = 1 << 10,
    event_sendmsg = 1 << 11,
-   num_events = 12,
+   event_ldsdir = 1 << 12,
+   event_valu = 1 << 13,
+   event_trans = 1 << 14,
+   event_salu = 1 << 15,
+   num_events = 16,
 };
 
 enum counter_type : uint8_t {
@@ -76,15 +64,106 @@ enum counter_type : uint8_t {
    counter_lgkm = 1 << 1,
    counter_vm = 1 << 2,
    counter_vs = 1 << 3,
-   num_counters = 4,
+   counter_alu = 1 << 4,
+   num_counters = 5,
+};
+
+enum vmem_type : uint8_t {
+   vmem_nosampler = 1 << 0,
+   vmem_sampler = 1 << 1,
+   vmem_bvh = 1 << 2,
 };
 
-static const uint16_t exp_events =
-   event_exp_pos | event_exp_param | event_exp_mrt_null | event_gds_gpr_lock | event_vmem_gpr_lock;
+static const uint16_t exp_events = event_exp_pos | event_exp_param | event_exp_mrt_null |
+                                   event_gds_gpr_lock | event_vmem_gpr_lock | event_ldsdir;
 static const uint16_t lgkm_events = event_smem | event_lds | event_gds | event_flat | event_sendmsg;
 static const uint16_t vm_events = event_vmem | event_flat;
 static const uint16_t vs_events = event_vmem_store;
 
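The hunks above extend the per-event and per-counter bookkeeping; the wait_imm type they keep referring to is defined in aco_ir.h. As a reference for reading the rest of the diff, here is a simplified model of it — illustrative only, the real definition differs in detail: an unset counter is an all-ones sentinel, and combining two pending waits keeps the stricter (smaller) value per counter.

#include <algorithm>
#include <cstdint>

/* Simplified stand-in for aco::wait_imm (illustrative, not the real layout). */
struct wait_imm_model {
   static constexpr uint16_t unset_counter = 0xffff;
   uint16_t vm = unset_counter;   /* outstanding VMEM loads */
   uint16_t exp = unset_counter;  /* outstanding exports / GPR locks */
   uint16_t lgkm = unset_counter; /* outstanding LDS/GDS/constant/message ops */
   uint16_t vs = unset_counter;   /* outstanding VMEM stores (GFX10+) */

   bool combine(const wait_imm_model& other)
   {
      bool changed = other.vm < vm || other.exp < exp || other.lgkm < lgkm || other.vs < vs;
      vm = std::min(vm, other.vm);
      exp = std::min(exp, other.exp);
      lgkm = std::min(lgkm, other.lgkm);
      vs = std::min(vs, other.vs);
      return changed;
   }
};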
+/* On GFX11+ the SIMD frontend doesn't switch to issuing instructions from a different
+ * wave if there is an ALU stall. Hence we have an instruction (s_delay_alu) to signal
+ * that we should switch to a different wave, which contains info on dependencies as to
+ * when we can switch back.
+ *
+ * This seems to apply only to ALU->ALU dependencies, as other instructions have better
+ * integration with the frontend.
+ *
+ * Note that if we do not emit s_delay_alu, things will still be correct, but the wave
+ * will stall in the ALU (and the ALU will be doing nothing else). We rely on this, as
+ * our cycle info is likely wrong at times (necessarily so, e.g. wave64 VALU
+ * instructions can take a different number of cycles based on the exec mask).
+ */
+struct alu_delay_info {
+   /* These are the values directly above the max representable value, i.e. the wait
+    * would turn into a no-op when we try to wait for something further back than
+    * this.
+    */
+   static constexpr int8_t valu_nop = 5;
+   static constexpr int8_t trans_nop = 4;
+
+   /* How many VALU instructions ago this value was written */
+   int8_t valu_instrs = valu_nop;
+   /* Cycles until the writing VALU instruction is finished */
+   int8_t valu_cycles = 0;
+
+   /* How many Transcendent instructions ago this value was written */
+   int8_t trans_instrs = trans_nop;
+   /* Cycles until the writing Transcendent instruction is finished */
+   int8_t trans_cycles = 0;
+
+   /* Cycles until the writing SALU instruction is finished */
+   int8_t salu_cycles = 0;
+
+   bool combine(const alu_delay_info& other)
+   {
+      bool changed = other.valu_instrs < valu_instrs || other.trans_instrs < trans_instrs ||
+                     other.salu_cycles > salu_cycles || other.valu_cycles > valu_cycles ||
+                     other.trans_cycles > trans_cycles;
+      valu_instrs = std::min(valu_instrs, other.valu_instrs);
+      trans_instrs = std::min(trans_instrs, other.trans_instrs);
+      salu_cycles = std::max(salu_cycles, other.salu_cycles);
+      valu_cycles = std::max(valu_cycles, other.valu_cycles);
+      trans_cycles = std::max(trans_cycles, other.trans_cycles);
+      return changed;
+   }
+
+   /* Needs to be called after any change to keep the data consistent. */
+   void fixup()
+   {
+      if (valu_instrs >= valu_nop || valu_cycles <= 0) {
+         valu_instrs = valu_nop;
+         valu_cycles = 0;
+      }
+
+      if (trans_instrs >= trans_nop || trans_cycles <= 0) {
+         trans_instrs = trans_nop;
+         trans_cycles = 0;
+      }
+
+      salu_cycles = std::max<int8_t>(salu_cycles, 0);
+   }
+
+   /* Returns true if a wait would be a no-op. */
+   bool empty() const
+   {
+      return valu_instrs == valu_nop && trans_instrs == trans_nop && salu_cycles == 0;
+   }
+
+   UNUSED void print(FILE* output) const
+   {
+      if (valu_instrs != valu_nop)
+         fprintf(output, "valu_instrs: %u\n", valu_instrs);
+      if (valu_cycles)
+         fprintf(output, "valu_cycles: %u\n", valu_cycles);
+      if (trans_instrs != trans_nop)
+         fprintf(output, "trans_instrs: %u\n", trans_instrs);
+      if (trans_cycles)
+         fprintf(output, "trans_cycles: %u\n", trans_cycles);
+      if (salu_cycles)
+         fprintf(output, "salu_cycles: %u\n", salu_cycles);
+   }
+};
+
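To make the merge semantics of alu_delay_info::combine() concrete, here is a small self-contained demo with made-up values, trimmed to the VALU fields: at a control-flow join, instruction distances are merged with std::min and remaining cycles with std::max, so the result is conservative for both paths.

#include <algorithm>
#include <cstdint>
#include <cstdio>

struct delay_demo {
   int8_t valu_instrs = 5; /* valu_nop: nothing to wait for */
   int8_t valu_cycles = 0;

   void combine(const delay_demo& other)
   {
      valu_instrs = std::min(valu_instrs, other.valu_instrs); /* closer producer wins */
      valu_cycles = std::max(valu_cycles, other.valu_cycles); /* longer stall wins */
   }
};

int main()
{
   delay_demo a{1, 2}; /* path A: value written by the previous VALU instruction */
   delay_demo b{3, 1}; /* path B: value written three VALU instructions ago */
   a.combine(b);
   printf("%d %d\n", a.valu_instrs, a.valu_cycles); /* prints "1 2" */
}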
 uint8_t
 get_counters_for_event(wait_event ev)
 {
@@ -100,39 +179,42 @@ get_counters_for_event(wait_event ev)
    case event_exp_param:
    case event_exp_mrt_null:
    case event_gds_gpr_lock:
-   case event_vmem_gpr_lock: return counter_exp;
+   case event_vmem_gpr_lock:
+   case event_ldsdir: return counter_exp;
+   case event_valu:
+   case event_trans:
+   case event_salu: return counter_alu;
    default: return 0;
    }
 }
 
 struct wait_entry {
    wait_imm imm;
+   alu_delay_info delay;
    uint16_t events;  /* use wait_event notion */
    uint8_t counters; /* use counter_type notion */
    bool wait_on_read : 1;
    bool logical : 1;
-   bool has_vmem_nosampler : 1;
-   bool has_vmem_sampler : 1;
+   uint8_t vmem_types : 4;
 
-   wait_entry(wait_event event_, wait_imm imm_, bool logical_, bool wait_on_read_)
-      : imm(imm_), events(event_), counters(get_counters_for_event(event_)),
-        wait_on_read(wait_on_read_), logical(logical_), has_vmem_nosampler(false),
-        has_vmem_sampler(false)
+   wait_entry(wait_event event_, wait_imm imm_, alu_delay_info delay_, bool logical_,
+              bool wait_on_read_)
+      : imm(imm_), delay(delay_), events(event_), counters(get_counters_for_event(event_)),
+        wait_on_read(wait_on_read_), logical(logical_), vmem_types(0)
    {}
 
    bool join(const wait_entry& other)
   {
      bool changed = (other.events & ~events) || (other.counters & ~counters) ||
-                     (other.wait_on_read && !wait_on_read) ||
-                     (other.has_vmem_nosampler && !has_vmem_nosampler) ||
-                     (other.has_vmem_sampler && !has_vmem_sampler);
+                     (other.wait_on_read && !wait_on_read) || (other.vmem_types & ~vmem_types) ||
+                     (!other.logical && logical);
      events |= other.events;
      counters |= other.counters;
      changed |= imm.combine(other.imm);
+      changed |= delay.combine(other.delay);
      wait_on_read |= other.wait_on_read;
-      has_vmem_nosampler |= other.has_vmem_nosampler;
-      has_vmem_sampler |= other.has_vmem_sampler;
-      assert(logical == other.logical);
+      vmem_types |= other.vmem_types;
+      logical &= other.logical;
      return changed;
   }
@@ -148,14 +230,12 @@ struct wait_entry {
      if (counter == counter_vm) {
         imm.vm = wait_imm::unset_counter;
         events &= ~event_vmem;
-         has_vmem_nosampler = false;
-         has_vmem_sampler = false;
+         vmem_types = 0;
      }
 
      if (counter == counter_exp) {
         imm.exp = wait_imm::unset_counter;
-         events &= ~(event_exp_pos | event_exp_param | event_exp_mrt_null | event_gds_gpr_lock |
-                     event_vmem_gpr_lock);
+         events &= ~exp_events;
      }
 
      if (counter == counter_vs) {
@@ -165,22 +245,44 @@ struct wait_entry {
 
      if (!(counters & counter_lgkm) && !(counters & counter_vm))
         events &= ~event_flat;
+
+      if (counter == counter_alu) {
+         delay = alu_delay_info();
+         events &= ~(event_valu | event_trans | event_salu);
+      }
+   }
+
+   UNUSED void print(FILE* output) const
+   {
+      fprintf(output, "logical: %u\n", logical);
+      imm.print(output);
+      delay.print(output);
+      if (events)
+         fprintf(output, "events: %u\n", events);
+      if (counters)
+         fprintf(output, "counters: %u\n", counters);
+      if (!wait_on_read)
+         fprintf(output, "wait_on_read: %u\n", wait_on_read);
+      if (!logical)
+         fprintf(output, "logical: %u\n", logical);
+      if (vmem_types)
+         fprintf(output, "vmem_types: %u\n", vmem_types);
    }
 };
 
 struct wait_ctx {
    Program* program;
-   enum chip_class chip_class;
+   enum amd_gfx_level gfx_level;
    uint16_t max_vm_cnt;
    uint16_t max_exp_cnt;
    uint16_t max_lgkm_cnt;
    uint16_t max_vs_cnt;
    uint16_t unordered_events = event_smem | event_flat;
 
-   uint8_t vm_cnt = 0;
-   uint8_t exp_cnt = 0;
-   uint8_t lgkm_cnt = 0;
-   uint8_t vs_cnt = 0;
+   bool vm_nonzero = false;
+   bool exp_nonzero = false;
+   bool lgkm_nonzero = false;
+   bool vs_nonzero = false;
    bool pending_flat_lgkm = false;
    bool pending_flat_vm = false;
    bool pending_s_buffer_store = false; /* GFX10 workaround */
@@ -192,24 +294,24 @@ struct wait_ctx {
    wait_ctx() {}
    wait_ctx(Program* program_)
-      : program(program_), chip_class(program_->chip_class),
-        max_vm_cnt(program_->chip_class >= GFX9 ? 62 : 14), max_exp_cnt(6),
-        max_lgkm_cnt(program_->chip_class >= GFX10 ? 62 : 14),
-        max_vs_cnt(program_->chip_class >= GFX10 ? 62 : 0),
-        unordered_events(event_smem | (program_->chip_class < GFX10 ? event_flat : 0))
+      : program(program_), gfx_level(program_->gfx_level),
+        max_vm_cnt(program_->gfx_level >= GFX9 ? 62 : 14), max_exp_cnt(6),
+        max_lgkm_cnt(program_->gfx_level >= GFX10 ? 62 : 14),
+        max_vs_cnt(program_->gfx_level >= GFX10 ? 62 : 0),
+        unordered_events(event_smem | (program_->gfx_level < GFX10 ? event_flat : 0))
    {}
 
    bool join(const wait_ctx* other, bool logical)
    {
-      bool changed = other->exp_cnt > exp_cnt || other->vm_cnt > vm_cnt ||
-                     other->lgkm_cnt > lgkm_cnt || other->vs_cnt > vs_cnt ||
+      bool changed = other->exp_nonzero > exp_nonzero || other->vm_nonzero > vm_nonzero ||
+                     other->lgkm_nonzero > lgkm_nonzero || other->vs_nonzero > vs_nonzero ||
                      (other->pending_flat_lgkm && !pending_flat_lgkm) ||
                      (other->pending_flat_vm && !pending_flat_vm);
 
-      exp_cnt = std::max(exp_cnt, other->exp_cnt);
-      vm_cnt = std::max(vm_cnt, other->vm_cnt);
-      lgkm_cnt = std::max(lgkm_cnt, other->lgkm_cnt);
-      vs_cnt = std::max(vs_cnt, other->vs_cnt);
+      exp_nonzero |= other->exp_nonzero;
+      vm_nonzero |= other->vm_nonzero;
+      lgkm_nonzero |= other->lgkm_nonzero;
+      vs_nonzero |= other->vs_nonzero;
       pending_flat_lgkm |= other->pending_flat_lgkm;
       pending_flat_vm |= other->pending_flat_vm;
       pending_s_buffer_store |= other->pending_s_buffer_store;
@@ -240,10 +342,48 @@ struct wait_ctx {
    {
       entry.remove_counter(counter);
    }
+
+   UNUSED void print(FILE* output) const
+   {
+      fprintf(output, "exp_nonzero: %u\n", exp_nonzero);
+      fprintf(output, "vm_nonzero: %u\n", vm_nonzero);
+      fprintf(output, "lgkm_nonzero: %u\n", lgkm_nonzero);
+      fprintf(output, "vs_nonzero: %u\n", vs_nonzero);
+      fprintf(output, "pending_flat_lgkm: %u\n", pending_flat_lgkm);
+      fprintf(output, "pending_flat_vm: %u\n", pending_flat_vm);
+      for (const auto& entry : gpr_map) {
+         fprintf(output, "gpr_map[%c%u] = {\n", entry.first.reg() >= 256 ? 'v' : 's',
+                 entry.first.reg() & 0xff);
+         entry.second.print(output);
+         fprintf(output, "}\n");
+      }
+
+      for (unsigned i = 0; i < storage_count; i++) {
+         if (!barrier_imm[i].empty() || barrier_events[i]) {
+            fprintf(output, "barriers[%u] = {\n", i);
+            barrier_imm[i].print(output);
+            fprintf(output, "events: %u\n", barrier_events[i]);
+            fprintf(output, "}\n");
+         }
+      }
+   }
 };
 
+uint8_t
+get_vmem_type(Instruction* instr)
+{
+   if (instr->opcode == aco_opcode::image_bvh64_intersect_ray)
+      return vmem_bvh;
+   else if (instr->isMIMG() && !instr->operands[1].isUndefined() &&
+            instr->operands[1].regClass() == s4)
+      return vmem_sampler;
+   else if (instr->isVMEM() || instr->isScratch() || instr->isGlobal())
+      return vmem_nosampler;
+   return 0;
+}
+
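A hypothetical illustration of why check_instr() below compares vmem_types for exact equality instead of mere overlap: different kinds of VMEM returns (sampler, non-sampler, BVH) are not ordered against each other, so overwriting a register without a wait is only safe when everything outstanding on that register is of the same kind as the new access.

#include <cstdint>

enum vmem_type_demo : uint8_t { demo_nosampler = 1 << 0, demo_sampler = 1 << 1 };

int main()
{
   uint8_t outstanding = demo_sampler; /* e.g. an image_sample still writing v0 */
   uint8_t new_type = demo_nosampler;  /* a buffer_load about to overwrite v0 */
   bool skip_wait = new_type && outstanding == new_type;
   return skip_wait ? 1 : 0; /* 0 here: the types differ, so a vmcnt wait is required */
}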
 void
-check_instr(wait_ctx& ctx, wait_imm& wait, Instruction* instr)
+check_instr(wait_ctx& ctx, wait_imm& wait, alu_delay_info& delay, Instruction* instr)
 {
    for (const Operand op : instr->operands) {
       if (op.isConstant() || op.isUndefined())
@@ -257,6 +397,8 @@ check_instr(wait_ctx& ctx, wait_imm& wait, Instruction* instr)
            continue;
 
         wait.combine(it->second.imm);
+         if (instr->isVALU() || instr->isSALU())
+            delay.combine(it->second.delay);
      }
    }
 
@@ -270,11 +412,9 @@ check_instr(wait_ctx& ctx, wait_imm& wait, Instruction* instr)
         continue;
 
      /* Vector Memory reads and writes return in the order they were issued */
-      bool has_sampler = instr->isMIMG() && !instr->operands[1].isUndefined() &&
-                         instr->operands[1].regClass() == s4;
-      if (instr->isVMEM() && ((it->second.events & vm_events) == event_vmem) &&
-          it->second.has_vmem_nosampler == !has_sampler &&
-          it->second.has_vmem_sampler == has_sampler)
+      uint8_t vmem_type = get_vmem_type(instr);
+      if (vmem_type && ((it->second.events & vm_events) == event_vmem) &&
+          it->second.vmem_types == vmem_type)
         continue;
 
      /* LDS reads and writes return in the order they were issued. same for GDS */
@@ -290,17 +430,39 @@
 bool
 parse_wait_instr(wait_ctx& ctx, wait_imm& imm, Instruction* instr)
 {
-   if (instr->opcode == aco_opcode::s_waitcnt_vscnt &&
-       instr->definitions[0].physReg() == sgpr_null) {
-      imm.vs = std::min<uint8_t>(imm.vs, instr->sopk().imm);
+   if (instr->opcode == aco_opcode::s_waitcnt_vscnt && instr->operands[0].physReg() == sgpr_null) {
+      imm.vs = std::min<uint8_t>(imm.vs, instr->salu().imm);
      return true;
    } else if (instr->opcode == aco_opcode::s_waitcnt) {
-      imm.combine(wait_imm(ctx.chip_class, instr->sopp().imm));
+      imm.combine(wait_imm(ctx.gfx_level, instr->salu().imm));
      return true;
    }
    return false;
 }
 
+bool
+parse_delay_alu(wait_ctx& ctx, alu_delay_info& delay, Instruction* instr)
+{
+   if (instr->opcode != aco_opcode::s_delay_alu)
+      return false;
+
+   unsigned imm[2] = {instr->salu().imm & 0xf, (instr->salu().imm >> 7) & 0xf};
+   for (unsigned i = 0; i < 2; ++i) {
+      alu_delay_wait wait = (alu_delay_wait)imm[i];
+      if (wait >= alu_delay_wait::VALU_DEP_1 && wait <= alu_delay_wait::VALU_DEP_4)
+         delay.valu_instrs = imm[i] - (uint32_t)alu_delay_wait::VALU_DEP_1 + 1;
+      else if (wait >= alu_delay_wait::TRANS32_DEP_1 && wait <= alu_delay_wait::TRANS32_DEP_3)
+         delay.trans_instrs = imm[i] - (uint32_t)alu_delay_wait::TRANS32_DEP_1 + 1;
+      else if (wait >= alu_delay_wait::SALU_CYCLE_1)
+         delay.salu_cycles = imm[i] - (uint32_t)alu_delay_wait::SALU_CYCLE_1 + 1;
+   }
+
+   delay.valu_cycles = instr->pass_flags & 0xffff;
+   delay.trans_cycles = instr->pass_flags >> 16;
+
+   return true;
+}
+
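parse_delay_alu() above and the skip-field merge at the end of the pass both depend on the layout of the s_delay_alu immediate. The following sketch spells that layout out the way the pass uses it — field names are informal, and the authoritative encoding is in the RDNA3 ISA documentation: bits [3:0] hold the first wait condition, bits [6:4] a skip distance, and bits [10:7] a second wait condition.

#include <cstdint>

constexpr uint16_t encode_delay(unsigned dep0, unsigned skip, unsigned dep1)
{
   return (dep0 & 0xf) | ((skip & 0x7) << 4) | ((dep1 & 0xf) << 7);
}

/* The two decodes below match parse_delay_alu(): imm & 0xf and (imm >> 7) & 0xf. */
constexpr unsigned first_condition(uint16_t imm) { return imm & 0xf; }
constexpr unsigned second_condition(uint16_t imm) { return (imm >> 7) & 0xf; }

static_assert(first_condition(encode_delay(2, 1, 3)) == 2, "low field round-trips");
static_assert(second_condition(encode_delay(2, 1, 3)) == 3, "high field round-trips");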
 void
 perform_barrier(wait_ctx& ctx, wait_imm& imm, memory_sync_info sync, unsigned semantics)
 {
@@ -332,41 +494,89 @@ perform_barrier(wait_ctx& ctx, wait_imm& imm, memory_sync_info sync, unsigned se
 void
 force_waitcnt(wait_ctx& ctx, wait_imm& imm)
 {
-   if (ctx.vm_cnt)
+   if (ctx.vm_nonzero)
      imm.vm = 0;
-   if (ctx.exp_cnt)
+   if (ctx.exp_nonzero)
      imm.exp = 0;
-   if (ctx.lgkm_cnt)
+   if (ctx.lgkm_nonzero)
      imm.lgkm = 0;
 
-   if (ctx.chip_class >= GFX10) {
-      if (ctx.vs_cnt)
+   if (ctx.gfx_level >= GFX10) {
+      if (ctx.vs_nonzero)
        imm.vs = 0;
    }
 }
 
 void
-kill(wait_imm& imm, Instruction* instr, wait_ctx& ctx, memory_sync_info sync_info)
+update_alu(wait_ctx& ctx, bool is_valu, bool is_trans, bool clear, int cycles)
 {
-   if (debug_flags & DEBUG_FORCE_WAITCNT) {
+   std::map<PhysReg, wait_entry>::iterator it = ctx.gpr_map.begin();
+   while (it != ctx.gpr_map.end()) {
+      wait_entry& entry = it->second;
+
+      if (clear) {
+         entry.remove_counter(counter_alu);
+      } else {
+         entry.delay.valu_instrs += is_valu ? 1 : 0;
+         entry.delay.trans_instrs += is_trans ? 1 : 0;
+         entry.delay.salu_cycles -= cycles;
+         entry.delay.valu_cycles -= cycles;
+         entry.delay.trans_cycles -= cycles;
+
+         entry.delay.fixup();
+         if (it->second.delay.empty())
+            entry.remove_counter(counter_alu);
+      }
+
+      if (!entry.counters)
+         it = ctx.gpr_map.erase(it);
+      else
+         it++;
+   }
+}
+
+void
+kill(wait_imm& imm, alu_delay_info& delay, Instruction* instr, wait_ctx& ctx,
+     memory_sync_info sync_info)
+{
+   if (instr->opcode == aco_opcode::s_setpc_b64 || (debug_flags & DEBUG_FORCE_WAITCNT)) {
      /* Force emitting waitcnt states right after the instruction if there is
-       * something to wait for.
+       * something to wait for. This is also applied for s_setpc_b64 to ensure
+       * waitcnt states are inserted before jumping to the PS epilog.
       */
-      return force_waitcnt(ctx, imm);
+      force_waitcnt(ctx, imm);
    }
 
-   if (ctx.exp_cnt || ctx.vm_cnt || ctx.lgkm_cnt)
-      check_instr(ctx, imm, instr);
+   /* Make sure POPS coherent memory accesses have reached the L2 cache before letting the
+    * overlapping waves proceed into the ordered section.
+    */
+   if (ctx.program->has_pops_overlapped_waves_wait &&
+       (ctx.gfx_level >= GFX11 ? instr->isEXP() && instr->exp().done
+                               : (instr->opcode == aco_opcode::s_sendmsg &&
+                                  instr->salu().imm == sendmsg_ordered_ps_done))) {
+      if (ctx.vm_nonzero)
+         imm.vm = 0;
+      if (ctx.gfx_level >= GFX10 && ctx.vs_nonzero)
+         imm.vs = 0;
+      /* Await SMEM loads too, as it's possible for an application to create them, like using a
+       * scalarization loop - pointless and suboptimal for an inherently divergent address of
+       * per-pixel data, but it can still be done at least synthetically and must be handled
+       * correctly.
+       */
+      if (ctx.program->has_smem_buffer_or_global_loads && ctx.lgkm_nonzero)
+         imm.lgkm = 0;
+   }
+
+   check_instr(ctx, imm, delay, instr);
 
    /* It's required to wait for scalar stores before "writing back" data.
    * It shouldn't cost anything anyways since we're about to do s_endpgm.
    */
-   if (ctx.lgkm_cnt && instr->opcode == aco_opcode::s_dcache_wb) {
-      assert(ctx.chip_class >= GFX8);
+   if (ctx.lgkm_nonzero && instr->opcode == aco_opcode::s_dcache_wb) {
+      assert(ctx.gfx_level >= GFX8);
      imm.lgkm = 0;
    }
 
-   if (ctx.chip_class >= GFX10 && instr->isSMEM()) {
+   if (ctx.gfx_level >= GFX10 && instr->isSMEM()) {
      /* GFX10: A store followed by a load at the same address causes a problem because
       * the load doesn't load the correct values unless we wait for the store first.
       * This is NOT mitigated by an s_nop.
@@ -379,19 +589,9 @@ kill(wait_imm& imm, Instruction* instr, wait_ctx& ctx, memory_sync_inf
      }
    }
 
-   if (ctx.program->early_rast && instr->opcode == aco_opcode::exp) {
-      if (instr->exp().dest >= V_008DFC_SQ_EXP_POS && instr->exp().dest < V_008DFC_SQ_EXP_PRIM) {
-
-         /* With early_rast, the HW will start clipping and rasterization after the 1st DONE pos
-          * export. Wait for all stores (and atomics) to complete, so PS can read them.
-          * TODO: This only really applies to DONE pos exports.
-          *       Consider setting the DONE bit earlier.
-          */
-         if (ctx.vs_cnt > 0)
-            imm.vs = 0;
-         if (ctx.vm_cnt > 0)
-            imm.vm = 0;
-      }
+   if (instr->opcode == aco_opcode::ds_ordered_count &&
+       ((instr->ds().offset1 | (instr->ds().offset0 >> 8)) & 0x1)) {
+      imm.combine(ctx.barrier_imm[ffs(storage_gds) - 1]);
    }
 
    if (instr->opcode == aco_opcode::p_barrier)
@@ -399,17 +599,17 @@ kill(wait_imm& imm, Instruction* instr, wait_ctx& ctx, memory_sync_inf
    else
      perform_barrier(ctx, imm, sync_info, semantic_release);
 
-   if (!imm.empty()) {
+   if (!imm.empty() || !delay.empty()) {
      if (ctx.pending_flat_vm && imm.vm != wait_imm::unset_counter)
        imm.vm = 0;
      if (ctx.pending_flat_lgkm && imm.lgkm != wait_imm::unset_counter)
        imm.lgkm = 0;
 
      /* reset counters */
-      ctx.exp_cnt = std::min(ctx.exp_cnt, imm.exp);
-      ctx.vm_cnt = std::min(ctx.vm_cnt, imm.vm);
-      ctx.lgkm_cnt = std::min(ctx.lgkm_cnt, imm.lgkm);
-      ctx.vs_cnt = std::min(ctx.vs_cnt, imm.vs);
+      ctx.exp_nonzero &= imm.exp != 0;
+      ctx.vm_nonzero &= imm.vm != 0;
+      ctx.lgkm_nonzero &= imm.lgkm != 0;
+      ctx.vs_nonzero &= imm.vs != 0;
 
      /* update barrier wait imms */
      for (unsigned i = 0; i < storage_count; i++) {
@@ -435,6 +635,11 @@ kill(wait_imm& imm, Instruction* instr, wait_ctx& ctx, memory_sync_inf
        bar_ev &= ~event_flat;
      }
 
+      if (ctx.program->gfx_level >= GFX11) {
+         update_alu(ctx, false, false, false,
+                    MAX3(delay.salu_cycles, delay.valu_cycles, delay.trans_cycles));
+      }
+
      /* remove all gprs with higher counter from map */
      std::map<PhysReg, wait_entry>::iterator it = ctx.gpr_map.begin();
      while (it != ctx.gpr_map.end()) {
@@ -446,6 +651,13 @@ kill(wait_imm& imm, Instruction* instr, wait_ctx& ctx, memory_sync_inf
           ctx.wait_and_remove_from_entry(it->first, it->second, counter_lgkm);
        if (imm.vs != wait_imm::unset_counter && imm.vs <= it->second.imm.vs)
           ctx.wait_and_remove_from_entry(it->first, it->second, counter_vs);
+        if (delay.valu_instrs <= it->second.delay.valu_instrs)
+           it->second.delay.valu_instrs = alu_delay_info::valu_nop;
+        if (delay.trans_instrs <= it->second.delay.trans_instrs)
+           it->second.delay.trans_instrs = alu_delay_info::trans_nop;
+        it->second.delay.fixup();
+        if (it->second.delay.empty())
+           ctx.wait_and_remove_from_entry(it->first, it->second, counter_alu);
        if (!it->second.counters)
          it = ctx.gpr_map.erase(it);
        else
@@ -502,14 +714,14 @@ update_counters(wait_ctx& ctx, wait_event event, memory_sync_info sync = memory_
 {
    uint8_t counters = get_counters_for_event(event);
 
-   if (counters & counter_lgkm && ctx.lgkm_cnt <= ctx.max_lgkm_cnt)
-      ctx.lgkm_cnt++;
-   if (counters & counter_vm && ctx.vm_cnt <= ctx.max_vm_cnt)
-      ctx.vm_cnt++;
-   if (counters & counter_exp && ctx.exp_cnt <= ctx.max_exp_cnt)
-      ctx.exp_cnt++;
-   if (counters & counter_vs && ctx.vs_cnt <= ctx.max_vs_cnt)
-      ctx.vs_cnt++;
+   if (counters & counter_lgkm)
+      ctx.lgkm_nonzero = true;
+   if (counters & counter_vm)
+      ctx.vm_nonzero = true;
+   if (counters & counter_exp)
+      ctx.exp_nonzero = true;
+   if (counters & counter_vs)
+      ctx.vs_nonzero = true;
 
    update_barrier_imm(ctx, counters, event, sync);
 
@@ -547,12 +759,10 @@ update_counters(wait_ctx& ctx, wait_event event, memory_sync_info sync = memory_
 void
 update_counters_for_flat_load(wait_ctx& ctx, memory_sync_info sync = memory_sync_info())
 {
-   assert(ctx.chip_class < GFX10);
+   assert(ctx.gfx_level < GFX10);
 
-   if (ctx.lgkm_cnt <= ctx.max_lgkm_cnt)
-      ctx.lgkm_cnt++;
-   if (ctx.vm_cnt <= ctx.max_vm_cnt)
-      ctx.vm_cnt++;
+   ctx.lgkm_nonzero = true;
+   ctx.vm_nonzero = true;
 
    update_barrier_imm(ctx, counter_vm | counter_lgkm, event_flat, sync);
 
@@ -568,7 +778,7 @@ update_counters_for_flat_load(wait_ctx& ctx, memory_sync
 
 void
 insert_wait_entry(wait_ctx& ctx, PhysReg reg, RegClass rc, wait_event event, bool wait_on_read,
-                  bool has_sampler = false)
+                  uint8_t vmem_types = 0, unsigned cycles = 0, bool force_linear = false)
 {
    uint16_t counters = get_counters_for_event(event);
    wait_imm imm;
@@ -581,9 +791,19 @@ insert_wait_entry(wait_ctx& ctx, PhysReg reg, RegClass rc, wait_event event, boo
    if (counters & counter_vs)
      imm.vs = 0;
 
-   wait_entry new_entry(event, imm, !rc.is_linear(), wait_on_read);
-   new_entry.has_vmem_nosampler = (event & event_vmem) && !has_sampler;
-   new_entry.has_vmem_sampler = (event & event_vmem) && has_sampler;
+   alu_delay_info delay;
+   if (event == event_valu) {
+      delay.valu_instrs = 0;
+      delay.valu_cycles = cycles;
+   } else if (event == event_trans) {
+      delay.trans_instrs = 0;
+      delay.trans_cycles = cycles;
+   } else if (event == event_salu) {
+      delay.salu_cycles = cycles;
+   }
+
+   wait_entry new_entry(event, imm, delay, !rc.is_linear() && !force_linear, wait_on_read);
+   new_entry.vmem_types |= vmem_types;
 
    for (unsigned i = 0; i < rc.size(); i++) {
      auto it = ctx.gpr_map.emplace(PhysReg{reg.reg() + i}, new_entry);
@@ -593,16 +813,52 @@ insert_wait_entry(wait_ctx& ctx, PhysReg reg, RegClass rc, wait_event event, boo
 }
 
 void
-insert_wait_entry(wait_ctx& ctx, Operand op, wait_event event, bool has_sampler = false)
+insert_wait_entry(wait_ctx& ctx, Operand op, wait_event event, uint8_t vmem_types = 0)
 {
    if (!op.isConstant() && !op.isUndefined())
-      insert_wait_entry(ctx, op.physReg(), op.regClass(), event, false, has_sampler);
+      insert_wait_entry(ctx, op.physReg(), op.regClass(), event, false, vmem_types, 0);
 }
 
 void
-insert_wait_entry(wait_ctx& ctx, Definition def, wait_event event, bool has_sampler = false)
+insert_wait_entry(wait_ctx& ctx, Definition def, wait_event event, uint8_t vmem_types = 0,
+                  unsigned cycles = 0)
 {
-   insert_wait_entry(ctx, def.physReg(), def.regClass(), event, true, has_sampler);
+   /* We can't safely write to unwritten destination VGPR lanes with DS/VMEM on GFX11 without
+    * waiting for the load to finish.
+    * Also, follow linear control flow for ALU because it's unlikely that the hardware does per-lane
+    * dependency checks.
+    */
+   uint32_t ds_vmem_events = event_lds | event_gds | event_vmem | event_flat;
+   uint32_t alu_events = event_trans | event_valu | event_salu;
+   bool force_linear = ctx.gfx_level >= GFX11 && (event & (ds_vmem_events | alu_events));
+
+   insert_wait_entry(ctx, def.physReg(), def.regClass(), event, true, vmem_types, cycles,
+                     force_linear);
+}
+
+void
+gen_alu(Instruction* instr, wait_ctx& ctx)
+{
+   Instruction_cycle_info cycle_info = get_cycle_info(*ctx.program, *instr);
+   bool is_valu = instr->isVALU();
+   bool is_trans = instr->isTrans();
+   bool clear = instr->isEXP() || instr->isDS() || instr->isMIMG() || instr->isFlatLike() ||
+                instr->isMUBUF() || instr->isMTBUF();
+
+   wait_event event = (wait_event)0;
+   if (is_trans)
+      event = event_trans;
+   else if (is_valu)
+      event = event_valu;
+   else if (instr->isSALU())
+      event = event_salu;
+
+   if (event != (wait_event)0) {
+      for (const Definition& def : instr->definitions)
+         insert_wait_entry(ctx, def, event, 0, cycle_info.latency);
+   }
+   update_alu(ctx, is_valu && instr_info.classes[(int)instr->opcode] != instr_class::wmma,
+              is_trans, clear, cycle_info.issue_cycles);
 }
 
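gen_alu() above records two different time scales from get_cycle_info(): a new dependency entry starts out with the producer's result latency, while update_alu() ages every live entry by the issue cycles of each subsequent ALU instruction. A toy model with invented numbers:

/* Illustrative only: how many latency cycles remain after n single-issue instructions. */
int remaining_latency(int latency, int issue_cycles, int num_following_instrs)
{
   int remaining = latency; /* what insert_wait_entry() stores via cycle_info.latency */
   for (int i = 0; i < num_following_instrs && remaining > 0; i++)
      remaining -= issue_cycles; /* what update_alu() subtracts per instruction */
   return remaining > 0 ? remaining : 0; /* 0 means no s_delay_alu is needed anymore */
}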
 void
@@ -634,7 +890,7 @@ gen(Instruction* instr, wait_ctx& ctx)
    }
    case Format::FLAT: {
      FLAT_instruction& flat = instr->flat();
-      if (ctx.chip_class < GFX10 && !instr->definitions.empty())
+      if (ctx.gfx_level < GFX10 && !instr->definitions.empty())
        update_counters_for_flat_load(ctx, flat.sync);
      else
        update_counters(ctx, event_flat, flat.sync);
@@ -649,7 +905,7 @@ gen(Instruction* instr, wait_ctx& ctx)
 
      if (!instr->definitions.empty())
        insert_wait_entry(ctx, instr->definitions[0], event_smem);
-      else if (ctx.chip_class >= GFX10 && !smem.sync.can_reorder())
+      else if (ctx.gfx_level >= GFX10 && !smem.sync.can_reorder())
        ctx.pending_s_buffer_store = true;
 
      break;
@@ -670,26 +926,28 @@ gen(Instruction* instr, wait_ctx& ctx)
      }
      break;
    }
+   case Format::LDSDIR: {
+      LDSDIR_instruction& ldsdir = instr->ldsdir();
+      update_counters(ctx, event_ldsdir, ldsdir.sync);
+      insert_wait_entry(ctx, instr->definitions[0], event_ldsdir);
+      break;
+   }
    case Format::MUBUF:
    case Format::MTBUF:
    case Format::MIMG:
-   case Format::GLOBAL: {
+   case Format::GLOBAL:
+   case Format::SCRATCH: {
      wait_event ev =
-         !instr->definitions.empty() || ctx.chip_class < GFX10 ? event_vmem : event_vmem_store;
+         !instr->definitions.empty() || ctx.gfx_level < GFX10 ? event_vmem : event_vmem_store;
      update_counters(ctx, ev, get_sync_info(instr));
 
-      bool has_sampler = instr->isMIMG() && !instr->operands[1].isUndefined() &&
-                         instr->operands[1].regClass() == s4;
-
      if (!instr->definitions.empty())
-         insert_wait_entry(ctx, instr->definitions[0], ev, has_sampler);
+         insert_wait_entry(ctx, instr->definitions[0], ev, get_vmem_type(instr));
 
-      if (ctx.chip_class == GFX6 && instr->format != Format::MIMG && instr->operands.size() == 4) {
-         ctx.exp_cnt++;
+      if (ctx.gfx_level == GFX6 && instr->format != Format::MIMG && instr->operands.size() == 4) {
        update_counters(ctx, event_vmem_gpr_lock);
        insert_wait_entry(ctx, instr->operands[3], event_vmem_gpr_lock);
-      } else if (ctx.chip_class == GFX6 && instr->isMIMG() && !instr->operands[2].isUndefined()) {
-         ctx.exp_cnt++;
+      } else if (ctx.gfx_level == GFX6 && instr->isMIMG() && !instr->operands[2].isUndefined()) {
        update_counters(ctx, event_vmem_gpr_lock);
        insert_wait_entry(ctx, instr->operands[2], event_vmem_gpr_lock);
      }
@@ -701,6 +959,14 @@ gen(Instruction* instr, wait_ctx& ctx)
      update_counters(ctx, event_sendmsg);
      break;
    }
+   case Format::SOP1: {
+      if (instr->opcode == aco_opcode::s_sendmsg_rtn_b32 ||
+          instr->opcode == aco_opcode::s_sendmsg_rtn_b64) {
+         update_counters(ctx, event_sendmsg);
+         insert_wait_entry(ctx, instr->definitions[0], event_sendmsg);
+      }
+      break;
+   }
    default: break;
    }
 }
@@ -709,50 +975,146 @@
 void
 emit_waitcnt(wait_ctx& ctx, std::vector<aco_ptr<Instruction>>& instructions, wait_imm& imm)
 {
    if (imm.vs != wait_imm::unset_counter) {
-      assert(ctx.chip_class >= GFX10);
-      SOPK_instruction* waitcnt_vs =
-         create_instruction<SOPK_instruction>(aco_opcode::s_waitcnt_vscnt, Format::SOPK, 0, 1);
-      waitcnt_vs->definitions[0] = Definition(sgpr_null, s1);
-      waitcnt_vs->imm = imm.vs;
+      assert(ctx.gfx_level >= GFX10);
+      Instruction* waitcnt_vs = create_instruction(aco_opcode::s_waitcnt_vscnt, Format::SOPK, 1, 0);
+      waitcnt_vs->operands[0] = Operand(sgpr_null, s1);
+      waitcnt_vs->salu().imm = imm.vs;
      instructions.emplace_back(waitcnt_vs);
      imm.vs = wait_imm::unset_counter;
    }
    if (!imm.empty()) {
-      SOPP_instruction* waitcnt =
-         create_instruction<SOPP_instruction>(aco_opcode::s_waitcnt, Format::SOPP, 0, 0);
-      waitcnt->imm = imm.pack(ctx.chip_class);
-      waitcnt->block = -1;
+      Instruction* waitcnt = create_instruction(aco_opcode::s_waitcnt, Format::SOPP, 0, 0);
+      waitcnt->salu().imm = imm.pack(ctx.gfx_level);
      instructions.emplace_back(waitcnt);
    }
    imm = wait_imm();
 }
 
 void
+emit_delay_alu(wait_ctx& ctx, std::vector<aco_ptr<Instruction>>& instructions,
+               alu_delay_info& delay)
+{
+   uint32_t imm = 0;
+   if (delay.trans_instrs != delay.trans_nop) {
+      imm |= (uint32_t)alu_delay_wait::TRANS32_DEP_1 + delay.trans_instrs - 1;
+   }
+
+   if (delay.valu_instrs != delay.valu_nop) {
+      imm |= ((uint32_t)alu_delay_wait::VALU_DEP_1 + delay.valu_instrs - 1) << (imm ? 7 : 0);
+   }
+
+   /* Note that we can only put 2 wait conditions in the instruction, so if we have all 3 we just
+    * drop the SALU one. Here we rely on the fact that this doesn't affect correctness, so
+    * occasionally getting it wrong isn't an issue. */
+   if (delay.salu_cycles && imm <= 0xf) {
+      unsigned cycles = std::min<uint8_t>(3, delay.salu_cycles);
+      imm |= ((uint32_t)alu_delay_wait::SALU_CYCLE_1 + cycles - 1) << (imm ? 7 : 0);
+   }
+
+   Instruction* inst = create_instruction(aco_opcode::s_delay_alu, Format::SOPP, 0, 0);
+   inst->salu().imm = imm;
+   inst->pass_flags = (delay.valu_cycles | (delay.trans_cycles << 16));
+   instructions.emplace_back(inst);
+   delay = alu_delay_info();
+}
+
+bool
+check_clause_raw(std::bitset<512>& regs_written, Instruction* instr)
+{
+   for (Operand op : instr->operands) {
+      if (op.isConstant())
+         continue;
+      for (unsigned i = 0; i < op.size(); i++) {
+         if (regs_written[op.physReg().reg() + i])
+            return false;
+      }
+   }
+
+   for (Definition def : instr->definitions) {
+      for (unsigned i = 0; i < def.size(); i++)
+         regs_written[def.physReg().reg() + i] = 1;
+   }
+
+   return true;
+}
+
+void
 handle_block(Program* program, Block& block, wait_ctx& ctx)
 {
    std::vector<aco_ptr<Instruction>> new_instructions;
 
    wait_imm queued_imm;
+   alu_delay_info queued_delay;
+
+   size_t clause_end = 0;
+   for (size_t i = 0; i < block.instructions.size(); i++) {
+      aco_ptr<Instruction>& instr = block.instructions[i];
 
-   for (aco_ptr<Instruction>& instr : block.instructions) {
      bool is_wait = parse_wait_instr(ctx, queued_imm, instr.get());
+      bool is_delay_alu = parse_delay_alu(ctx, queued_delay, instr.get());
 
      memory_sync_info sync_info = get_sync_info(instr.get());
-      kill(queued_imm, instr.get(), ctx, sync_info);
+      kill(queued_imm, queued_delay, instr.get(), ctx, sync_info);
+
+      /* At the start of a possible clause, also emit waitcnts for each instruction to avoid
+       * splitting the clause.
+       */
+      if (i >= clause_end || !queued_imm.empty()) {
+         std::optional<std::bitset<512>> regs_written;
+         for (clause_end = i + 1; clause_end < block.instructions.size(); clause_end++) {
+            Instruction* next = block.instructions[clause_end].get();
+            if (!should_form_clause(instr.get(), next))
+               break;
+
+            if (!regs_written) {
+               regs_written.emplace();
+               check_clause_raw(*regs_written, instr.get());
+            }
+
+            if (!check_clause_raw(*regs_written, next))
+               break;
+
+            kill(queued_imm, queued_delay, next, ctx, get_sync_info(next));
+         }
+      }
 
+      if (program->gfx_level >= GFX11)
+         gen_alu(instr.get(), ctx);
      gen(instr.get(), ctx);
 
-      if (instr->format != Format::PSEUDO_BARRIER && !is_wait) {
+      if (instr->format != Format::PSEUDO_BARRIER && !is_wait && !is_delay_alu) {
+         if (instr->isVINTERP_INREG() && queued_imm.exp != wait_imm::unset_counter) {
+            instr->vinterp_inreg().wait_exp = MIN2(instr->vinterp_inreg().wait_exp, queued_imm.exp);
+            queued_imm.exp = wait_imm::unset_counter;
+         }
+
        if (!queued_imm.empty())
          emit_waitcnt(ctx, new_instructions, queued_imm);
+        if (!queued_delay.empty())
+           emit_delay_alu(ctx, new_instructions, queued_delay);
+
+        bool is_ordered_count_acquire =
+           instr->opcode == aco_opcode::ds_ordered_count &&
+           !((instr->ds().offset1 | (instr->ds().offset0 >> 8)) & 0x1);
 
        new_instructions.emplace_back(std::move(instr));
        perform_barrier(ctx, queued_imm, sync_info, semantic_acquire);
+
+        if (is_ordered_count_acquire)
+           queued_imm.combine(ctx.barrier_imm[ffs(storage_gds) - 1]);
      }
    }
 
+   /* For the last block of a program which has a succeeding shader part, wait for all memory
+    * operations to finish before continuing to the next shader part.
+ */ + if (block.kind & block_kind_end_with_regs) + force_waitcnt(ctx, queued_imm); + if (!queued_imm.empty()) emit_waitcnt(ctx, new_instructions, queued_imm); + if (!queued_delay.empty()) + emit_delay_alu(ctx, new_instructions, queued_delay); block.instructions.swap(new_instructions); } @@ -767,11 +1129,30 @@ insert_wait_states(Program* program) std::vector<wait_ctx> in_ctx(program->blocks.size(), wait_ctx(program)); std::vector<wait_ctx> out_ctx(program->blocks.size(), wait_ctx(program)); - std::stack<unsigned> loop_header_indices; + std::stack<unsigned, std::vector<unsigned>> loop_header_indices; unsigned loop_progress = 0; + if (program->pending_lds_access) { + update_barrier_imm(in_ctx[0], get_counters_for_event(event_lds), event_lds, + memory_sync_info(storage_shared)); + } + + for (Definition def : program->args_pending_vmem) { + update_counters(in_ctx[0], event_vmem); + insert_wait_entry(in_ctx[0], def, event_vmem); + } + for (unsigned i = 0; i < program->blocks.size();) { Block& current = program->blocks[i++]; + + if (current.kind & block_kind_discard_early_exit) { + /* Because the jump to the discard early exit block may happen anywhere in a block, it's + * not possible to join it with its predecessors this way. + * We emit all required waits when emitting the discard block. + */ + continue; + } + wait_ctx ctx = in_ctx[current.index]; if (current.kind & block_kind_loop_header) { @@ -801,11 +1182,6 @@ insert_wait_states(Program* program) in_ctx[current.index] = ctx; } - if (current.instructions.empty()) { - out_ctx[current.index] = std::move(ctx); - continue; - } - loop_progress = std::max<unsigned>(loop_progress, current.loop_nest_depth); done[current.index] = true; @@ -813,6 +1189,33 @@ insert_wait_states(Program* program) out_ctx[current.index] = std::move(ctx); } + + /* Combine s_delay_alu using the skip field. */ + if (program->gfx_level >= GFX11) { + for (Block& block : program->blocks) { + int i = 0; + int prev_delay_alu = -1; + for (aco_ptr<Instruction>& instr : block.instructions) { + if (instr->opcode != aco_opcode::s_delay_alu) { + block.instructions[i++] = std::move(instr); + continue; + } + + uint16_t imm = instr->salu().imm; + int skip = i - prev_delay_alu - 1; + if (imm >> 7 || prev_delay_alu < 0 || skip >= 6) { + if (imm >> 7 == 0) + prev_delay_alu = i; + block.instructions[i++] = std::move(instr); + continue; + } + + block.instructions[prev_delay_alu]->salu().imm |= (skip << 4) | (imm << 7); + prev_delay_alu = -1; + } + block.instructions.resize(i); + } + } } } // namespace aco |