Diffstat (limited to 'src/amd/compiler/aco_insert_waitcnt.cpp')
-rw-r--r-- | src/amd/compiler/aco_insert_waitcnt.cpp | 822
1 file changed, 485 insertions(+), 337 deletions(-)
diff --git a/src/amd/compiler/aco_insert_waitcnt.cpp b/src/amd/compiler/aco_insert_waitcnt.cpp
index d7fc87c126d..248b7f51d83 100644
--- a/src/amd/compiler/aco_insert_waitcnt.cpp
+++ b/src/amd/compiler/aco_insert_waitcnt.cpp
@@ -1,27 +1,10 @@
 /*
  * Copyright © 2018 Valve Corporation
  *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- *
+ * SPDX-License-Identifier: MIT
  */
+#include "aco_builder.h"
 #include "aco_ir.h"
 
 #include "common/sid.h"
@@ -29,6 +12,7 @@
 #include <map>
 #include <stack>
 #include <vector>
+#include <optional>
 
 namespace aco {
 
@@ -55,7 +39,7 @@ namespace {
 /* Instructions of the same event will finish in-order except for smem
  * and maybe flat. Instructions of different events may not finish in-order.
  */
-enum wait_event : uint16_t {
+enum wait_event : uint32_t {
    event_smem = 1 << 0,
    event_lds = 1 << 1,
    event_gds = 1 << 2,
@@ -68,119 +52,129 @@ enum wait_event : uint16_t {
    event_gds_gpr_lock = 1 << 9,
    event_vmem_gpr_lock = 1 << 10,
    event_sendmsg = 1 << 11,
-   num_events = 12,
+   event_ldsdir = 1 << 12,
+   event_vmem_sample = 1 << 13, /* GFX12+ */
+   event_vmem_bvh = 1 << 14,    /* GFX12+ */
+   num_events = 15,
 };
 
 enum counter_type : uint8_t {
-   counter_exp = 1 << 0,
-   counter_lgkm = 1 << 1,
-   counter_vm = 1 << 2,
-   counter_vs = 1 << 3,
-   num_counters = 4,
+   counter_exp = 1 << wait_type_exp,
+   counter_lgkm = 1 << wait_type_lgkm,
+   counter_vm = 1 << wait_type_vm,
+   counter_vs = 1 << wait_type_vs,
+   counter_sample = 1 << wait_type_sample,
+   counter_bvh = 1 << wait_type_bvh,
+   counter_km = 1 << wait_type_km,
+   num_counters = wait_type_num,
 };
 
-static const uint16_t exp_events =
-   event_exp_pos | event_exp_param | event_exp_mrt_null | event_gds_gpr_lock | event_vmem_gpr_lock;
-static const uint16_t lgkm_events = event_smem | event_lds | event_gds | event_flat | event_sendmsg;
-static const uint16_t vm_events = event_vmem | event_flat;
-static const uint16_t vs_events = event_vmem_store;
-
-uint8_t
-get_counters_for_event(wait_event ev)
-{
-   switch (ev) {
-   case event_smem:
-   case event_lds:
-   case event_gds:
-   case event_sendmsg: return counter_lgkm;
-   case event_vmem: return counter_vm;
-   case event_vmem_store: return counter_vs;
-   case event_flat: return counter_vm | counter_lgkm;
-   case event_exp_pos:
-   case event_exp_param:
-   case event_exp_mrt_null:
-   case event_gds_gpr_lock:
-   case event_vmem_gpr_lock: return counter_exp;
-   default: return 0;
-   }
-}
-
 struct wait_entry {
    wait_imm imm;
-   uint16_t events;  /* use wait_event notion */
+   uint32_t events;         /* use wait_event notion */
+   uint32_t logical_events; /* use wait_event notion */
    uint8_t counters; /* use counter_type notion */
    bool wait_on_read : 1;
-   bool logical : 1;
-   bool has_vmem_nosampler : 1;
-   bool has_vmem_sampler : 1;
-
-   wait_entry(wait_event event_, wait_imm imm_, bool logical_, bool wait_on_read_)
-      : imm(imm_), events(event_), counters(get_counters_for_event(event_)),
-        wait_on_read(wait_on_read_), logical(logical_), has_vmem_nosampler(false),
-        has_vmem_sampler(false)
+   uint8_t vmem_types : 4; /* use vmem_type notion. for counter_vm. */
+   uint8_t vm_mask : 2;    /* which halves of the VGPR event_vmem uses */
+
+   wait_entry(wait_event event_, wait_imm imm_, uint8_t counters_, bool wait_on_read_)
+      : imm(imm_), events(event_), logical_events(event_), counters(counters_),
+        wait_on_read(wait_on_read_), vmem_types(0), vm_mask(0)
    {}
 
    bool join(const wait_entry& other)
    {
       bool changed = (other.events & ~events) || (other.counters & ~counters) ||
-                     (other.wait_on_read && !wait_on_read) ||
-                     (other.has_vmem_nosampler && !has_vmem_nosampler) ||
-                     (other.has_vmem_sampler && !has_vmem_sampler);
+                     (other.wait_on_read && !wait_on_read) || (other.vmem_types & ~vmem_types) ||
+                     (other.vm_mask & ~vm_mask);
       events |= other.events;
       counters |= other.counters;
       changed |= imm.combine(other.imm);
       wait_on_read |= other.wait_on_read;
-      has_vmem_nosampler |= other.has_vmem_nosampler;
-      has_vmem_sampler |= other.has_vmem_sampler;
-      assert(logical == other.logical);
+      vmem_types |= other.vmem_types;
+      vm_mask |= other.vm_mask;
       return changed;
    }
 
-   void remove_counter(counter_type counter)
+   void remove_wait(wait_type type, uint32_t type_events)
    {
-      counters &= ~counter;
+      counters &= ~(1 << type);
+      imm[type] = wait_imm::unset_counter;
 
-      if (counter == counter_lgkm) {
-         imm.lgkm = wait_imm::unset_counter;
-         events &= ~(event_smem | event_lds | event_gds | event_sendmsg);
-      }
+      events &= ~type_events | event_flat;
+      if (!(counters & counter_lgkm) && !(counters & counter_vm))
+         events &= ~(type_events & event_flat);
 
-      if (counter == counter_vm) {
-         imm.vm = wait_imm::unset_counter;
-         events &= ~event_vmem;
-         has_vmem_nosampler = false;
-         has_vmem_sampler = false;
-      }
+      logical_events &= events;
+      if (type == wait_type_vm)
+         vmem_types = 0;
+      if (type_events & event_vmem)
+         vm_mask = 0;
+   }
+
+   UNUSED void print(FILE* output) const
+   {
+      imm.print(output);
+      if (events)
+         fprintf(output, "events: %u\n", events);
+      if (logical_events)
+         fprintf(output, "logical_events: %u\n", logical_events);
+      if (counters)
+         fprintf(output, "counters: %u\n", counters);
+      if (!wait_on_read)
+         fprintf(output, "wait_on_read: %u\n", wait_on_read);
+      if (vmem_types)
+         fprintf(output, "vmem_types: %u\n", vmem_types);
+      if (vm_mask)
+         fprintf(output, "vm_mask: %u\n", vm_mask);
+   }
+};
+
+struct target_info {
+   wait_imm max_cnt;
+   uint32_t events[wait_type_num] = {};
+   uint16_t unordered_events;
 
-      if (counter == counter_exp) {
-         imm.exp = wait_imm::unset_counter;
-         events &= ~(event_exp_pos | event_exp_param | event_exp_mrt_null | event_gds_gpr_lock |
-                     event_vmem_gpr_lock);
+   target_info(enum amd_gfx_level gfx_level)
+   {
+      max_cnt = wait_imm::max(gfx_level);
+      for (unsigned i = 0; i < wait_type_num; i++)
+         max_cnt[i] = max_cnt[i] ? max_cnt[i] - 1 : 0;
+
+      events[wait_type_exp] = event_exp_pos | event_exp_param | event_exp_mrt_null |
+                              event_gds_gpr_lock | event_vmem_gpr_lock | event_ldsdir;
+      events[wait_type_lgkm] = event_smem | event_lds | event_gds | event_flat | event_sendmsg;
+      events[wait_type_vm] = event_vmem | event_flat;
+      events[wait_type_vs] = event_vmem_store;
+      if (gfx_level >= GFX12) {
+         events[wait_type_sample] = event_vmem_sample;
+         events[wait_type_bvh] = event_vmem_bvh;
+         events[wait_type_km] = event_smem | event_sendmsg;
+         events[wait_type_lgkm] &= ~events[wait_type_km];
       }
 
-      if (counter == counter_vs) {
-         imm.vs = wait_imm::unset_counter;
-         events &= ~event_vmem_store;
+      for (unsigned i = 0; i < wait_type_num; i++) {
+         u_foreach_bit (j, events[i])
+            counters[j] |= (1 << i);
       }
-      if (!(counters & counter_lgkm) && !(counters & counter_vm))
-         events &= ~event_flat;
+
+      unordered_events = event_smem | (gfx_level < GFX10 ? event_flat : 0);
    }
+
+   uint8_t get_counters_for_event(wait_event event) const { return counters[ffs(event) - 1]; }
+
+private:
+   /* Bitfields of counters affected by each event */
+   uint8_t counters[num_events] = {};
 };
 
 struct wait_ctx {
    Program* program;
-   enum chip_class chip_class;
-   uint16_t max_vm_cnt;
-   uint16_t max_exp_cnt;
-   uint16_t max_lgkm_cnt;
-   uint16_t max_vs_cnt;
-   uint16_t unordered_events = event_smem | event_flat;
-
-   uint8_t vm_cnt = 0;
-   uint8_t exp_cnt = 0;
-   uint8_t lgkm_cnt = 0;
-   uint8_t vs_cnt = 0;
+   enum amd_gfx_level gfx_level;
+   const target_info* info;
+
+   uint32_t nonzero = 0;
    bool pending_flat_lgkm = false;
    bool pending_flat_vm = false;
    bool pending_s_buffer_store = false; /* GFX10 workaround */
@@ -191,57 +185,156 @@ struct wait_ctx {
    std::map<PhysReg, wait_entry> gpr_map;
 
    wait_ctx() {}
-   wait_ctx(Program* program_)
-       : program(program_), chip_class(program_->chip_class),
-         max_vm_cnt(program_->chip_class >= GFX9 ? 62 : 14), max_exp_cnt(6),
-         max_lgkm_cnt(program_->chip_class >= GFX10 ? 62 : 14),
-         max_vs_cnt(program_->chip_class >= GFX10 ? 62 : 0),
-         unordered_events(event_smem | (program_->chip_class < GFX10 ? event_flat : 0))
+   wait_ctx(Program* program_, const target_info* info_)
+       : program(program_), gfx_level(program_->gfx_level), info(info_)
    {}
 
-   bool join(const wait_ctx* other, bool logical)
+   bool join(const wait_ctx* other, bool logical, bool logical_merge)
    {
-      bool changed = other->exp_cnt > exp_cnt || other->vm_cnt > vm_cnt ||
-                     other->lgkm_cnt > lgkm_cnt || other->vs_cnt > vs_cnt ||
-                     (other->pending_flat_lgkm && !pending_flat_lgkm) ||
-                     (other->pending_flat_vm && !pending_flat_vm);
-
-      exp_cnt = std::max(exp_cnt, other->exp_cnt);
-      vm_cnt = std::max(vm_cnt, other->vm_cnt);
-      lgkm_cnt = std::max(lgkm_cnt, other->lgkm_cnt);
-      vs_cnt = std::max(vs_cnt, other->vs_cnt);
+      bool changed = (other->pending_flat_lgkm && !pending_flat_lgkm) ||
+                     (other->pending_flat_vm && !pending_flat_vm) || (~nonzero & other->nonzero);
+
+      nonzero |= other->nonzero;
       pending_flat_lgkm |= other->pending_flat_lgkm;
       pending_flat_vm |= other->pending_flat_vm;
       pending_s_buffer_store |= other->pending_s_buffer_store;
 
-      for (const auto& entry : other->gpr_map) {
-         if (entry.second.logical != logical)
-            continue;
-
-         using iterator = std::map<PhysReg, wait_entry>::iterator;
-         const std::pair<iterator, bool> insert_pair = gpr_map.insert(entry);
-         if (insert_pair.second) {
-            changed = true;
-         } else {
-            changed |= insert_pair.first->second.join(entry.second);
+      using iterator = std::map<PhysReg, wait_entry>::iterator;
+
+      if (logical == logical_merge) {
+         for (const auto& entry : other->gpr_map) {
+            const std::pair<iterator, bool> insert_pair = gpr_map.insert(entry);
+            if (insert_pair.second) {
+               insert_pair.first->second.logical_events = 0;
+               changed = true;
+            } else {
+               changed |= insert_pair.first->second.join(entry.second);
+            }
          }
       }
 
-      for (unsigned i = 0; i < storage_count; i++) {
-         changed |= barrier_imm[i].combine(other->barrier_imm[i]);
-         changed |= (other->barrier_events[i] & ~barrier_events[i]) != 0;
-         barrier_events[i] |= other->barrier_events[i];
+      if (logical) {
+         for (const auto& entry : other->gpr_map) {
+            iterator it = gpr_map.find(entry.first);
+            if (it != gpr_map.end()) {
+               changed |= (entry.second.logical_events & ~it->second.logical_events) != 0;
+               it->second.logical_events |= entry.second.logical_events;
+            }
+         }
+
+         for (unsigned i = 0; i < storage_count; i++) {
+            changed |= barrier_imm[i].combine(other->barrier_imm[i]);
+            changed |= (other->barrier_events[i] & ~barrier_events[i]) != 0;
+            barrier_events[i] |= other->barrier_events[i];
+         }
       }
 
       return changed;
    }
 
-   void wait_and_remove_from_entry(PhysReg reg, wait_entry& entry, counter_type counter)
+   UNUSED void print(FILE* output) const
    {
-      entry.remove_counter(counter);
+      for (unsigned i = 0; i < wait_type_num; i++)
+         fprintf(output, "nonzero[%u]: %u\n", i, nonzero & (1 << i) ? 1 : 0);
+      fprintf(output, "pending_flat_lgkm: %u\n", pending_flat_lgkm);
+      fprintf(output, "pending_flat_vm: %u\n", pending_flat_vm);
+      for (const auto& entry : gpr_map) {
+         fprintf(output, "gpr_map[%c%u] = {\n", entry.first.reg() >= 256 ? 'v' : 's',
+                 entry.first.reg() & 0xff);
+         entry.second.print(output);
+         fprintf(output, "}\n");
+      }
+
+      for (unsigned i = 0; i < storage_count; i++) {
+         if (!barrier_imm[i].empty() || barrier_events[i]) {
+            fprintf(output, "barriers[%u] = {\n", i);
+            barrier_imm[i].print(output);
+            fprintf(output, "events: %u\n", barrier_events[i]);
+            fprintf(output, "}\n");
+         }
+      }
+   }
 };
 
+wait_event
+get_vmem_event(wait_ctx& ctx, Instruction* instr, uint8_t type)
+{
+   if (instr->definitions.empty() && ctx.gfx_level >= GFX10)
+      return event_vmem_store;
+   wait_event ev = event_vmem;
+   if (ctx.gfx_level >= GFX12 && type != vmem_nosampler)
+      ev = type == vmem_bvh ? event_vmem_bvh : event_vmem_sample;
+   return ev;
+}
+
+uint32_t
+get_vmem_mask(wait_ctx& ctx, Instruction* instr)
+{
+   if (ctx.program->dev.sram_ecc_enabled)
+      return 0xffffffff;
+   switch (instr->opcode) {
+   case aco_opcode::buffer_load_format_d16_x:
+   case aco_opcode::buffer_load_ubyte_d16:
+   case aco_opcode::buffer_load_sbyte_d16:
+   case aco_opcode::buffer_load_short_d16:
+   case aco_opcode::tbuffer_load_format_d16_x:
+   case aco_opcode::flat_load_ubyte_d16:
+   case aco_opcode::flat_load_sbyte_d16:
+   case aco_opcode::flat_load_short_d16:
+   case aco_opcode::global_load_ubyte_d16:
+   case aco_opcode::global_load_sbyte_d16:
+   case aco_opcode::global_load_short_d16:
+   case aco_opcode::scratch_load_ubyte_d16:
+   case aco_opcode::scratch_load_sbyte_d16:
+   case aco_opcode::scratch_load_short_d16: return 0x1;
+   case aco_opcode::buffer_load_ubyte_d16_hi:
+   case aco_opcode::buffer_load_sbyte_d16_hi:
+   case aco_opcode::buffer_load_short_d16_hi:
+   case aco_opcode::buffer_load_format_d16_hi_x:
+   case aco_opcode::flat_load_ubyte_d16_hi:
+   case aco_opcode::flat_load_sbyte_d16_hi:
+   case aco_opcode::flat_load_short_d16_hi:
+   case aco_opcode::global_load_ubyte_d16_hi:
+   case aco_opcode::global_load_sbyte_d16_hi:
+   case aco_opcode::global_load_short_d16_hi:
+   case aco_opcode::scratch_load_ubyte_d16_hi:
+   case aco_opcode::scratch_load_sbyte_d16_hi:
+   case aco_opcode::scratch_load_short_d16_hi: return 0x2;
+   case aco_opcode::buffer_load_format_d16_xyz:
+   case aco_opcode::tbuffer_load_format_d16_xyz: return 0x7;
+   default: return 0xffffffff;
+   }
+}
+
+wait_imm
+get_imm(wait_ctx& ctx, PhysReg reg, wait_entry& entry)
+{
+   if (reg.reg() >= 256) {
+      uint32_t events = entry.logical_events;
+
+      /* ALU can't safely write to unwritten destination VGPR lanes with DS/VMEM on GFX11+ without
+       * waiting for the load to finish, even if none of the lanes are involved in the load.
+       */
+      if (ctx.gfx_level >= GFX11) {
+         uint32_t ds_vmem_events =
+            event_lds | event_gds | event_vmem | event_vmem_sample | event_vmem_bvh | event_flat;
+         events |= ds_vmem_events;
+      }
+
+      uint32_t counters = 0;
+      u_foreach_bit (i, entry.events & events)
+         counters |= ctx.info->get_counters_for_event((wait_event)(1 << i));
+
+      wait_imm imm;
+      u_foreach_bit (i, entry.counters & counters)
+         imm[i] = entry.imm[i];
+
+      return imm;
+   } else {
+      return entry.imm;
+   }
+}
+
 void
 check_instr(wait_ctx& ctx, wait_imm& wait, Instruction* instr)
 {
@@ -251,12 +344,9 @@ check_instr(wait_ctx& ctx, wait_imm& wait, Instruction* instr)
 
       /* check consecutively read gprs */
       for (unsigned j = 0; j < op.size(); j++) {
-         PhysReg reg{op.physReg() + j};
-         std::map<PhysReg, wait_entry>::iterator it = ctx.gpr_map.find(reg);
-         if (it == ctx.gpr_map.end() || !it->second.wait_on_read)
-            continue;
-
-         wait.combine(it->second.imm);
+         std::map<PhysReg, wait_entry>::iterator it = ctx.gpr_map.find(PhysReg{op.physReg() + j});
+         if (it != ctx.gpr_map.end() && it->second.wait_on_read)
+            wait.combine(get_imm(ctx, PhysReg{op.physReg() + j}, it->second));
       }
    }
 
@@ -269,38 +359,46 @@
          if (it == ctx.gpr_map.end())
            continue;
 
-         /* Vector Memory reads and writes return in the order they were issued */
-         bool has_sampler = instr->isMIMG() && !instr->operands[1].isUndefined() &&
-                            instr->operands[1].regClass() == s4;
-         if (instr->isVMEM() && ((it->second.events & vm_events) == event_vmem) &&
-             it->second.has_vmem_nosampler == !has_sampler &&
-             it->second.has_vmem_sampler == has_sampler)
-            continue;
+         wait_imm reg_imm = get_imm(ctx, reg, it->second);
+
+         /* Vector Memory reads and writes decrease the counter in the order they were issued.
+          * Before GFX12, they also write VGPRs in order if they're of the same type.
+          * We can do this for GFX12 and different types for GFX11 if we know that the two
+          * VMEM loads do not write the same register half or the same lanes.
+          */
+         uint8_t vmem_type = get_vmem_type(ctx.gfx_level, ctx.program->family, instr);
+         if (vmem_type) {
+            wait_event event = get_vmem_event(ctx, instr, vmem_type);
+            wait_type type = (wait_type)(ffs(ctx.info->get_counters_for_event(event)) - 1);
+
+            bool event_matches = (it->second.events & ctx.info->events[type]) == event;
+            /* wait_type_vm/counter_vm can have several different vmem_types */
+            bool type_matches = type != wait_type_vm || (it->second.vmem_types == vmem_type &&
+                                                         util_bitcount(vmem_type) == 1);
+
+            bool different_halves = false;
+            if (event == event_vmem && event_matches) {
+               uint32_t mask = (get_vmem_mask(ctx, instr) >> (j * 2)) & 0x3;
+               different_halves = !(mask & it->second.vm_mask);
+            }
+
+            bool different_lanes = (it->second.logical_events & ctx.info->events[type]) == 0;
+
+            if ((event_matches && type_matches && ctx.gfx_level < GFX12) || different_halves ||
+                different_lanes)
+               reg_imm[type] = wait_imm::unset_counter;
+         }
 
          /* LDS reads and writes return in the order they were issued. same for GDS */
-         if (instr->isDS() &&
-             (it->second.events & lgkm_events) == (instr->ds().gds ? event_gds : event_lds))
-            continue;
+         if (instr->isDS() && (it->second.events & ctx.info->events[wait_type_lgkm]) ==
+                                 (instr->ds().gds ? event_gds : event_lds))
+            reg_imm.lgkm = wait_imm::unset_counter;
 
-         wait.combine(it->second.imm);
+         wait.combine(reg_imm);
       }
    }
 }
 
-bool
-parse_wait_instr(wait_ctx& ctx, wait_imm& imm, Instruction* instr)
-{
-   if (instr->opcode == aco_opcode::s_waitcnt_vscnt &&
-       instr->definitions[0].physReg() == sgpr_null) {
-      imm.vs = std::min<uint8_t>(imm.vs, instr->sopk().imm);
-      return true;
-   } else if (instr->opcode == aco_opcode::s_waitcnt) {
-      imm.combine(wait_imm(ctx.chip_class, instr->sopp().imm));
-      return true;
-   }
-   return false;
-}
-
 void
 perform_barrier(wait_ctx& ctx, wait_imm& imm, memory_sync_info sync, unsigned semantics)
 {
@@ -318,9 +416,9 @@ perform_barrier(wait_ctx& ctx, wait_imm& imm, memory_sync_info sync, unsigned se
       if (bar_scope_lds <= subgroup_scope)
          events &= ~event_lds;
 
-      /* in non-WGP, the L1 (L0 on GFX10+) cache keeps all memory operations
+      /* Until GFX11, in non-WGP, the L1 (L0 on GFX10+) cache keeps all memory operations
        * in-order for the same workgroup */
-      if (!ctx.program->wgp_mode && sync.scope <= scope_workgroup)
+      if (ctx.gfx_level < GFX11 && !ctx.program->wgp_mode && sync.scope <= scope_workgroup)
          events &= ~(event_vmem | event_vmem_store | event_smem);
 
       if (events)
@@ -332,41 +430,60 @@ perform_barrier(wait_ctx& ctx, wait_imm& imm, memory_sync_info sync, unsigned se
 void
 force_waitcnt(wait_ctx& ctx, wait_imm& imm)
 {
-   if (ctx.vm_cnt)
-      imm.vm = 0;
-   if (ctx.exp_cnt)
-      imm.exp = 0;
-   if (ctx.lgkm_cnt)
-      imm.lgkm = 0;
-
-   if (ctx.chip_class >= GFX10) {
-      if (ctx.vs_cnt)
-         imm.vs = 0;
-   }
+   u_foreach_bit (i, ctx.nonzero)
+      imm[i] = 0;
 }
 
 void
 kill(wait_imm& imm, Instruction* instr, wait_ctx& ctx, memory_sync_info sync_info)
 {
-   if (debug_flags & DEBUG_FORCE_WAITCNT) {
+   if (instr->opcode == aco_opcode::s_setpc_b64 || (debug_flags & DEBUG_FORCE_WAITCNT)) {
       /* Force emitting waitcnt states right after the instruction if there is
-       * something to wait for.
+       * something to wait for. This is also applied for s_setpc_b64 to ensure
+       * waitcnt states are inserted before jumping to the PS epilog.
+       */
+      force_waitcnt(ctx, imm);
+   }
+
+   /* sendmsg(dealloc_vgprs) releases scratch, so this isn't safe if there is an in-progress
+    * scratch store.
+    */
+   if (ctx.gfx_level >= GFX11 && instr->opcode == aco_opcode::s_sendmsg &&
+       instr->salu().imm == sendmsg_dealloc_vgprs) {
+      imm.combine(ctx.barrier_imm[ffs(storage_scratch) - 1]);
+      imm.combine(ctx.barrier_imm[ffs(storage_vgpr_spill) - 1]);
+   }
+
+   /* Make sure POPS coherent memory accesses have reached the L2 cache before letting the
+    * overlapping waves proceed into the ordered section.
+    */
+   if (ctx.program->has_pops_overlapped_waves_wait &&
+       (ctx.gfx_level >= GFX11 ? instr->isEXP() && instr->exp().done
+                               : (instr->opcode == aco_opcode::s_sendmsg &&
+                                  instr->salu().imm == sendmsg_ordered_ps_done))) {
+      uint8_t c = counter_vm | counter_vs;
+      /* Await SMEM loads too, as it's possible for an application to create them, like using a
+       * scalarization loop - pointless and suboptimal for an inherently divergent address of
+       * per-pixel data, but it can still be done at least synthetically and must be handled
+       * correctly.
        */
-      return force_waitcnt(ctx, imm);
+      if (ctx.program->has_smem_buffer_or_global_loads)
+         c |= counter_lgkm;
+
+      u_foreach_bit (i, c & ctx.nonzero)
+         imm[i] = 0;
    }
 
-   if (ctx.exp_cnt || ctx.vm_cnt || ctx.lgkm_cnt)
-      check_instr(ctx, imm, instr);
+   check_instr(ctx, imm, instr);
 
    /* It's required to wait for scalar stores before "writing back" data.
    * It shouldn't cost anything anyways since we're about to do s_endpgm.
    */
-   if (ctx.lgkm_cnt && instr->opcode == aco_opcode::s_dcache_wb) {
-      assert(ctx.chip_class >= GFX8);
+   if ((ctx.nonzero & BITFIELD_BIT(wait_type_lgkm)) && instr->opcode == aco_opcode::s_dcache_wb) {
+      assert(ctx.gfx_level >= GFX8);
       imm.lgkm = 0;
    }
 
-   if (ctx.chip_class >= GFX10 && instr->isSMEM()) {
+   if (ctx.gfx_level >= GFX10 && instr->isSMEM()) {
       /* GFX10: A store followed by a load at the same address causes a problem because
        * the load doesn't load the correct values unless we wait for the store first.
        * This is NOT mitigated by an s_nop.
@@ -379,19 +496,9 @@ kill(wait_imm& imm, Instruction* instr, wait_ctx& ctx, memory_sync_info sync_inf
       }
    }
 
-   if (ctx.program->early_rast && instr->opcode == aco_opcode::exp) {
-      if (instr->exp().dest >= V_008DFC_SQ_EXP_POS && instr->exp().dest < V_008DFC_SQ_EXP_PRIM) {
-
-         /* With early_rast, the HW will start clipping and rasterization after the 1st DONE pos
-          * export. Wait for all stores (and atomics) to complete, so PS can read them.
-          * TODO: This only really applies to DONE pos exports.
-          *       Consider setting the DONE bit earlier.
-          */
-         if (ctx.vs_cnt > 0)
-            imm.vs = 0;
-         if (ctx.vm_cnt > 0)
-            imm.vm = 0;
-      }
+   if (instr->opcode == aco_opcode::ds_ordered_count &&
+       ((instr->ds().offset1 | (instr->ds().offset0 >> 8)) & 0x1)) {
+      imm.combine(ctx.barrier_imm[ffs(storage_gds) - 1]);
    }
 
    if (instr->opcode == aco_opcode::p_barrier)
@@ -406,30 +513,18 @@ kill(wait_imm& imm, Instruction* instr, wait_ctx& ctx, memory_sync_info sync_inf
       imm.lgkm = 0;
 
    /* reset counters */
-   ctx.exp_cnt = std::min(ctx.exp_cnt, imm.exp);
-   ctx.vm_cnt = std::min(ctx.vm_cnt, imm.vm);
-   ctx.lgkm_cnt = std::min(ctx.lgkm_cnt, imm.lgkm);
-   ctx.vs_cnt = std::min(ctx.vs_cnt, imm.vs);
+   for (unsigned i = 0; i < wait_type_num; i++)
+      ctx.nonzero &= imm[i] == 0 ? ~BITFIELD_BIT(i) : UINT32_MAX;
 
    /* update barrier wait imms */
    for (unsigned i = 0; i < storage_count; i++) {
       wait_imm& bar = ctx.barrier_imm[i];
       uint16_t& bar_ev = ctx.barrier_events[i];
-      if (bar.exp != wait_imm::unset_counter && imm.exp <= bar.exp) {
-         bar.exp = wait_imm::unset_counter;
-         bar_ev &= ~exp_events;
-      }
-      if (bar.vm != wait_imm::unset_counter && imm.vm <= bar.vm) {
-         bar.vm = wait_imm::unset_counter;
-         bar_ev &= ~(vm_events & ~event_flat);
-      }
-      if (bar.lgkm != wait_imm::unset_counter && imm.lgkm <= bar.lgkm) {
-         bar.lgkm = wait_imm::unset_counter;
-         bar_ev &= ~(lgkm_events & ~event_flat);
-      }
-      if (bar.vs != wait_imm::unset_counter && imm.vs <= bar.vs) {
-         bar.vs = wait_imm::unset_counter;
-         bar_ev &= ~vs_events;
+      for (unsigned j = 0; j < wait_type_num; j++) {
+         if (bar[j] != wait_imm::unset_counter && imm[j] <= bar[j]) {
+            bar[j] = wait_imm::unset_counter;
+            bar_ev &= ~ctx.info->events[j] | event_flat;
+         }
       }
       if (bar.vm == wait_imm::unset_counter && bar.lgkm == wait_imm::unset_counter)
          bar_ev &= ~event_flat;
@@ -438,14 +533,10 @@ kill(wait_imm& imm, Instruction* instr, wait_ctx& ctx, memory_sync_info sync_inf
    /* remove all gprs with higher counter from map */
    std::map<PhysReg, wait_entry>::iterator it = ctx.gpr_map.begin();
    while (it != ctx.gpr_map.end()) {
-      if (imm.exp != wait_imm::unset_counter && imm.exp <= it->second.imm.exp)
-         ctx.wait_and_remove_from_entry(it->first, it->second, counter_exp);
-      if (imm.vm != wait_imm::unset_counter && imm.vm <= it->second.imm.vm)
-         ctx.wait_and_remove_from_entry(it->first, it->second, counter_vm);
-      if (imm.lgkm != wait_imm::unset_counter && imm.lgkm <= it->second.imm.lgkm)
-         ctx.wait_and_remove_from_entry(it->first, it->second, counter_lgkm);
-      if (imm.vs != wait_imm::unset_counter && imm.vs <= it->second.imm.vs)
-         ctx.wait_and_remove_from_entry(it->first, it->second, counter_vs);
+      for (unsigned i = 0; i < wait_type_num; i++) {
+         if (imm[i] != wait_imm::unset_counter && imm[i] <= it->second.imm[i])
+            it->second.remove_wait((wait_type)i, ctx.info->events[i]);
+      }
       if (!it->second.counters)
          it = ctx.gpr_map.erase(it);
       else
@@ -462,37 +553,24 @@ kill(wait_imm& imm, Instruction* instr, wait_ctx& ctx, memory_sync_info sync_inf
 }
 
 void
-update_barrier_counter(uint8_t* ctr, unsigned max)
-{
-   if (*ctr != wait_imm::unset_counter && *ctr < max)
-      (*ctr)++;
-}
-
-void
 update_barrier_imm(wait_ctx& ctx, uint8_t counters, wait_event event, memory_sync_info sync)
 {
    for (unsigned i = 0; i < storage_count; i++) {
       wait_imm& bar = ctx.barrier_imm[i];
       uint16_t& bar_ev = ctx.barrier_events[i];
-      if (sync.storage & (1 << i) && !(sync.semantics & semantic_private)) {
+
+      /* We re-use barrier_imm/barrier_events to wait for all scratch stores to finish. */
+      bool ignore_private = i == (ffs(storage_scratch) - 1) || i == (ffs(storage_vgpr_spill) - 1);
+
+      if (sync.storage & (1 << i) && (!(sync.semantics & semantic_private) || ignore_private)) {
         bar_ev |= event;
-         if (counters & counter_lgkm)
-            bar.lgkm = 0;
-         if (counters & counter_vm)
-            bar.vm = 0;
-         if (counters & counter_exp)
-            bar.exp = 0;
-         if (counters & counter_vs)
-            bar.vs = 0;
-      } else if (!(bar_ev & ctx.unordered_events) && !(ctx.unordered_events & event)) {
-         if (counters & counter_lgkm && (bar_ev & lgkm_events) == event)
-            update_barrier_counter(&bar.lgkm, ctx.max_lgkm_cnt);
-         if (counters & counter_vm && (bar_ev & vm_events) == event)
-            update_barrier_counter(&bar.vm, ctx.max_vm_cnt);
-         if (counters & counter_exp && (bar_ev & exp_events) == event)
-            update_barrier_counter(&bar.exp, ctx.max_exp_cnt);
-         if (counters & counter_vs && (bar_ev & vs_events) == event)
-            update_barrier_counter(&bar.vs, ctx.max_vs_cnt);
+         u_foreach_bit (j, counters)
+            bar[j] = 0;
+      } else if (!(bar_ev & ctx.info->unordered_events) && !(ctx.info->unordered_events & event)) {
+         u_foreach_bit (j, counters) {
+            if (bar[j] != wait_imm::unset_counter && (bar_ev & ctx.info->events[j]) == event)
+               bar[j] = std::min<uint16_t>(bar[j] + 1, ctx.info->max_cnt[j]);
+         }
       }
    }
 }
@@ -500,20 +578,13 @@
 void
 update_counters(wait_ctx& ctx, wait_event event, memory_sync_info sync = memory_sync_info())
 {
-   uint8_t counters = get_counters_for_event(event);
+   uint8_t counters = ctx.info->get_counters_for_event(event);
 
-   if (counters & counter_lgkm && ctx.lgkm_cnt <= ctx.max_lgkm_cnt)
-      ctx.lgkm_cnt++;
-   if (counters & counter_vm && ctx.vm_cnt <= ctx.max_vm_cnt)
-      ctx.vm_cnt++;
-   if (counters & counter_exp && ctx.exp_cnt <= ctx.max_exp_cnt)
-      ctx.exp_cnt++;
-   if (counters & counter_vs && ctx.vs_cnt <= ctx.max_vs_cnt)
-      ctx.vs_cnt++;
+   ctx.nonzero |= counters;
 
    update_barrier_imm(ctx, counters, event, sync);
 
-   if (ctx.unordered_events & event)
+   if (ctx.info->unordered_events & event)
       return;
 
    if (ctx.pending_flat_lgkm)
@@ -524,35 +595,24 @@ update_counters(wait_ctx& ctx, wait_event event, memory_sync_info sync = memory_
 
    for (std::pair<const PhysReg, wait_entry>& e : ctx.gpr_map) {
       wait_entry& entry = e.second;
 
-      if (entry.events & ctx.unordered_events)
+      if (entry.events & ctx.info->unordered_events)
         continue;
 
      assert(entry.events);
 
-      if ((counters & counter_exp) && (entry.events & exp_events) == event &&
-          entry.imm.exp < ctx.max_exp_cnt)
-         entry.imm.exp++;
-      if ((counters & counter_lgkm) && (entry.events & lgkm_events) == event &&
-          entry.imm.lgkm < ctx.max_lgkm_cnt)
-         entry.imm.lgkm++;
-      if ((counters & counter_vm) && (entry.events & vm_events) == event &&
-          entry.imm.vm < ctx.max_vm_cnt)
-         entry.imm.vm++;
-      if ((counters & counter_vs) && (entry.events & vs_events) == event &&
-          entry.imm.vs < ctx.max_vs_cnt)
-         entry.imm.vs++;
+      u_foreach_bit (i, counters) {
+         if ((entry.events & ctx.info->events[i]) == event)
+            entry.imm[i] = std::min<uint16_t>(entry.imm[i] + 1, ctx.info->max_cnt[i]);
+      }
   }
 }
 
 void
 update_counters_for_flat_load(wait_ctx& ctx, memory_sync_info sync = memory_sync_info())
 {
-   assert(ctx.chip_class < GFX10);
+   assert(ctx.gfx_level < GFX10);
 
-   if (ctx.lgkm_cnt <= ctx.max_lgkm_cnt)
-      ctx.lgkm_cnt++;
-   if (ctx.vm_cnt <= ctx.max_vm_cnt)
-      ctx.vm_cnt++;
+   ctx.nonzero |= BITFIELD_BIT(wait_type_lgkm) | BITFIELD_BIT(wait_type_vm);
 
    update_barrier_imm(ctx, counter_vm | counter_lgkm, event_flat, sync);
 
@@ -568,41 +628,40 @@ update_counters_for_flat_load(wait_ctx& ctx, memory_sync_info sync = memory_sync
 void
 insert_wait_entry(wait_ctx& ctx, PhysReg reg, RegClass rc, wait_event event, bool wait_on_read,
-                  bool has_sampler = false)
+                  uint8_t vmem_types = 0, uint32_t vm_mask = 0)
 {
-   uint16_t counters = get_counters_for_event(event);
+   uint16_t counters = ctx.info->get_counters_for_event(event);
 
    wait_imm imm;
-   if (counters & counter_lgkm)
-      imm.lgkm = 0;
-   if (counters & counter_vm)
-      imm.vm = 0;
-   if (counters & counter_exp)
-      imm.exp = 0;
-   if (counters & counter_vs)
-      imm.vs = 0;
+   u_foreach_bit (i, counters)
+      imm[i] = 0;
 
-   wait_entry new_entry(event, imm, !rc.is_linear(), wait_on_read);
-   new_entry.has_vmem_nosampler = (event & event_vmem) && !has_sampler;
-   new_entry.has_vmem_sampler = (event & event_vmem) && has_sampler;
+   wait_entry new_entry(event, imm, counters, wait_on_read);
+   if (counters & counter_vm)
+      new_entry.vmem_types |= vmem_types;
 
-   for (unsigned i = 0; i < rc.size(); i++) {
+   for (unsigned i = 0; i < rc.size(); i++, vm_mask >>= 2) {
+      new_entry.vm_mask = vm_mask & 0x3;
      auto it = ctx.gpr_map.emplace(PhysReg{reg.reg() + i}, new_entry);
-      if (!it.second)
+      if (!it.second) {
         it.first->second.join(new_entry);
+         it.first->second.logical_events |= event;
+      }
   }
 }
 
 void
-insert_wait_entry(wait_ctx& ctx, Operand op, wait_event event, bool has_sampler = false)
+insert_wait_entry(wait_ctx& ctx, Operand op, wait_event event, uint8_t vmem_types = 0,
+                  uint32_t vm_mask = 0)
 {
    if (!op.isConstant() && !op.isUndefined())
-      insert_wait_entry(ctx, op.physReg(), op.regClass(), event, false, has_sampler);
+      insert_wait_entry(ctx, op.physReg(), op.regClass(), event, false, vmem_types, vm_mask);
 }
 
 void
-insert_wait_entry(wait_ctx& ctx, Definition def, wait_event event, bool has_sampler = false)
+insert_wait_entry(wait_ctx& ctx, Definition def, wait_event event, uint8_t vmem_types = 0,
+                  uint32_t vm_mask = 0)
 {
-   insert_wait_entry(ctx, def.physReg(), def.regClass(), event, true, has_sampler);
+   insert_wait_entry(ctx, def.physReg(), def.regClass(), event, true, vmem_types, vm_mask);
 }
 
 void
@@ -634,7 +693,7 @@ gen(Instruction* instr, wait_ctx& ctx)
    }
    case Format::FLAT: {
       FLAT_instruction& flat = instr->flat();
-      if (ctx.chip_class < GFX10 && !instr->definitions.empty())
+      if (ctx.gfx_level < GFX10 && !instr->definitions.empty())
          update_counters_for_flat_load(ctx, flat.sync);
      else
         update_counters(ctx, event_flat, flat.sync);
@@ -649,7 +708,7 @@ gen(Instruction* instr, wait_ctx& ctx)
 
      if (!instr->definitions.empty())
        insert_wait_entry(ctx, instr->definitions[0], event_smem);
-      else if (ctx.chip_class >= GFX10 && !smem.sync.can_reorder())
+      else if (ctx.gfx_level >= GFX10 && !smem.sync.can_reorder())
        ctx.pending_s_buffer_store = true;
 
     break;
@@ -670,26 +729,30 @@ gen(Instruction* instr, wait_ctx& ctx)
      }
      break;
   }
+   case Format::LDSDIR: {
+      LDSDIR_instruction& ldsdir = instr->ldsdir();
+      update_counters(ctx, event_ldsdir, ldsdir.sync);
+      insert_wait_entry(ctx, instr->definitions[0], event_ldsdir);
+      break;
+   }
   case Format::MUBUF:
   case Format::MTBUF:
   case Format::MIMG:
-   case Format::GLOBAL: {
-      wait_event ev =
-         !instr->definitions.empty() || ctx.chip_class < GFX10 ? event_vmem : event_vmem_store;
-      update_counters(ctx, ev, get_sync_info(instr));
+   case Format::GLOBAL:
+   case Format::SCRATCH: {
+      uint8_t type = get_vmem_type(ctx.gfx_level, ctx.program->family, instr);
+      wait_event ev = get_vmem_event(ctx, instr, type);
+      uint32_t mask = ev == event_vmem ? get_vmem_mask(ctx, instr) : 0;
 
-      bool has_sampler = instr->isMIMG() && !instr->operands[1].isUndefined() &&
-                         instr->operands[1].regClass() == s4;
+      update_counters(ctx, ev, get_sync_info(instr));
 
-      if (!instr->definitions.empty())
-         insert_wait_entry(ctx, instr->definitions[0], ev, has_sampler);
+      for (auto& definition : instr->definitions)
+         insert_wait_entry(ctx, definition, ev, type, mask);
 
-      if (ctx.chip_class == GFX6 && instr->format != Format::MIMG && instr->operands.size() == 4) {
-         ctx.exp_cnt++;
+      if (ctx.gfx_level == GFX6 && instr->format != Format::MIMG && instr->operands.size() == 4) {
         update_counters(ctx, event_vmem_gpr_lock);
         insert_wait_entry(ctx, instr->operands[3], event_vmem_gpr_lock);
-      } else if (ctx.chip_class == GFX6 && instr->isMIMG() && !instr->operands[2].isUndefined()) {
-         ctx.exp_cnt++;
+      } else if (ctx.gfx_level == GFX6 && instr->isMIMG() && !instr->operands[2].isUndefined()) {
        update_counters(ctx, event_vmem_gpr_lock);
        insert_wait_entry(ctx, instr->operands[2], event_vmem_gpr_lock);
      }
@@ -701,6 +764,14 @@ gen(Instruction* instr, wait_ctx& ctx)
      update_counters(ctx, event_sendmsg);
      break;
   }
+   case Format::SOP1: {
+      if (instr->opcode == aco_opcode::s_sendmsg_rtn_b32 ||
+          instr->opcode == aco_opcode::s_sendmsg_rtn_b64) {
+         update_counters(ctx, event_sendmsg);
+         insert_wait_entry(ctx, instr->definitions[0], event_sendmsg);
+      }
+      break;
+   }
   default: break;
   }
 }
@@ -708,23 +779,28 @@
 void
 emit_waitcnt(wait_ctx& ctx, std::vector<aco_ptr<Instruction>>& instructions, wait_imm& imm)
 {
-   if (imm.vs != wait_imm::unset_counter) {
-      assert(ctx.chip_class >= GFX10);
-      SOPK_instruction* waitcnt_vs =
-         create_instruction<SOPK_instruction>(aco_opcode::s_waitcnt_vscnt, Format::SOPK, 0, 1);
-      waitcnt_vs->definitions[0] = Definition(sgpr_null, s1);
-      waitcnt_vs->imm = imm.vs;
-      instructions.emplace_back(waitcnt_vs);
-      imm.vs = wait_imm::unset_counter;
+   Builder bld(ctx.program, &instructions);
+   imm.build_waitcnt(bld);
+}
+
+bool
+check_clause_raw(std::bitset<512>& regs_written, Instruction* instr)
+{
+   for (Operand op : instr->operands) {
+      if (op.isConstant())
+         continue;
+      for (unsigned i = 0; i < op.size(); i++) {
+         if (regs_written[op.physReg().reg() + i])
+            return false;
+      }
    }
-   if (!imm.empty()) {
-      SOPP_instruction* waitcnt =
-         create_instruction<SOPP_instruction>(aco_opcode::s_waitcnt, Format::SOPP, 0, 0);
-      waitcnt->imm = imm.pack(ctx.chip_class);
-      waitcnt->block = -1;
-      instructions.emplace_back(waitcnt);
+
+   for (Definition def : instr->definitions) {
+      for (unsigned i = 0; i < def.size(); i++)
+         regs_written[def.physReg().reg() + i] = 1;
    }
-   imm = wait_imm();
+
+   return true;
 }
 
 void
@@ -734,23 +810,66 @@ handle_block(Program* program, Block& block, wait_ctx& ctx)
 
    wait_imm queued_imm;
 
-   for (aco_ptr<Instruction>& instr : block.instructions) {
-      bool is_wait = parse_wait_instr(ctx, queued_imm, instr.get());
+   size_t clause_end = 0;
+   for (size_t i = 0; i < block.instructions.size(); i++) {
+      aco_ptr<Instruction>& instr = block.instructions[i];
+
+      bool is_wait = queued_imm.unpack(ctx.gfx_level, instr.get());
 
       memory_sync_info sync_info = get_sync_info(instr.get());
       kill(queued_imm, instr.get(), ctx, sync_info);
 
+      /* At the start of a possible clause, also emit waitcnts for each instruction to avoid
+       * splitting the clause.
+       */
+      if (i >= clause_end || !queued_imm.empty()) {
+         std::optional<std::bitset<512>> regs_written;
+         for (clause_end = i + 1; clause_end < block.instructions.size(); clause_end++) {
+            Instruction* next = block.instructions[clause_end].get();
+            if (!should_form_clause(instr.get(), next))
+               break;
+
+            if (!regs_written) {
+               regs_written.emplace();
+               check_clause_raw(*regs_written, instr.get());
+            }
+
+            if (!check_clause_raw(*regs_written, next))
+               break;
+
+            kill(queued_imm, next, ctx, get_sync_info(next));
+         }
+      }
+
       gen(instr.get(), ctx);
 
       if (instr->format != Format::PSEUDO_BARRIER && !is_wait) {
+         if (instr->isVINTERP_INREG() && queued_imm.exp != wait_imm::unset_counter) {
+            instr->vinterp_inreg().wait_exp = MIN2(instr->vinterp_inreg().wait_exp, queued_imm.exp);
+            queued_imm.exp = wait_imm::unset_counter;
+         }
+
         if (!queued_imm.empty())
            emit_waitcnt(ctx, new_instructions, queued_imm);
 
+         bool is_ordered_count_acquire =
+            instr->opcode == aco_opcode::ds_ordered_count &&
+            !((instr->ds().offset1 | (instr->ds().offset0 >> 8)) & 0x1);
+
        new_instructions.emplace_back(std::move(instr));
        perform_barrier(ctx, queued_imm, sync_info, semantic_acquire);
+
+         if (is_ordered_count_acquire)
+            queued_imm.combine(ctx.barrier_imm[ffs(storage_gds) - 1]);
      }
   }
 
+   /* For the last block of a program that has a succeeding shader part, wait for all memory ops
+    * to finish before moving on to the next shader part.
+    */
+   if (block.kind & block_kind_end_with_regs)
+      force_waitcnt(ctx, queued_imm);
+
    if (!queued_imm.empty())
       emit_waitcnt(ctx, new_instructions, queued_imm);
 
@@ -760,18 +879,39 @@ handle_block(Program* program, Block& block, wait_ctx& ctx)
 }
 
 } /* end namespace */
 
 void
-insert_wait_states(Program* program)
+insert_waitcnt(Program* program)
 {
+   target_info info(program->gfx_level);
+
    /* per BB ctx */
    std::vector<bool> done(program->blocks.size());
-   std::vector<wait_ctx> in_ctx(program->blocks.size(), wait_ctx(program));
-   std::vector<wait_ctx> out_ctx(program->blocks.size(), wait_ctx(program));
+   std::vector<wait_ctx> in_ctx(program->blocks.size(), wait_ctx(program, &info));
+   std::vector<wait_ctx> out_ctx(program->blocks.size(), wait_ctx(program, &info));
 
-   std::stack<unsigned> loop_header_indices;
+   std::stack<unsigned, std::vector<unsigned>> loop_header_indices;
    unsigned loop_progress = 0;
 
+   if (program->pending_lds_access) {
+      update_barrier_imm(in_ctx[0], info.get_counters_for_event(event_lds), event_lds,
+                         memory_sync_info(storage_shared));
+   }
+
+   for (Definition def : program->args_pending_vmem) {
+      update_counters(in_ctx[0], event_vmem);
+      insert_wait_entry(in_ctx[0], def, event_vmem, vmem_nosampler, 0xffffffff);
+   }
+
    for (unsigned i = 0; i < program->blocks.size();) {
       Block& current = program->blocks[i++];
+
+      if (current.kind & block_kind_discard_early_exit) {
+         /* Because the jump to the discard early exit block may happen anywhere in a block, it's
+          * not possible to join it with its predecessors this way.
+          * We emit all required waits when emitting the discard block.
+          */
+         continue;
+      }
+
       wait_ctx ctx = in_ctx[current.index];
 
       if (current.kind & block_kind_loop_header) {
@@ -788,11 +928,24 @@ insert_wait_states(Program* program)
         continue;
      }
 
+      /* Sometimes the counter for an entry is incremented or removed on all logical predecessors,
+       * so it might be better to join entries using the logical predecessors instead of the linear
+       * ones.
+       */
+      bool logical_merge =
+         current.logical_preds.size() > 1 &&
+         std::any_of(current.linear_preds.begin(), current.linear_preds.end(),
+                     [&](unsigned pred)
+                     {
+                        return std::find(current.logical_preds.begin(),
+                                         current.logical_preds.end(),
+                                         pred) == current.logical_preds.end();
+                     });
+
       bool changed = false;
       for (unsigned b : current.linear_preds)
-         changed |= ctx.join(&out_ctx[b], false);
+         changed |= ctx.join(&out_ctx[b], false, logical_merge);
       for (unsigned b : current.logical_preds)
-         changed |= ctx.join(&out_ctx[b], true);
+         changed |= ctx.join(&out_ctx[b], true, logical_merge);
 
       if (done[current.index] && !changed) {
          in_ctx[current.index] = std::move(ctx);
@@ -801,11 +954,6 @@ insert_wait_states(Program* program)
         in_ctx[current.index] = ctx;
      }
 
-      if (current.instructions.empty()) {
-         out_ctx[current.index] = std::move(ctx);
-         continue;
-      }
-
      loop_progress = std::max<unsigned>(loop_progress, current.loop_nest_depth);
      done[current.index] = true;
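The core of this rewrite is that the hard-coded get_counters_for_event() switch becomes a table in target_info: the constructor declares which events feed each wait type, then inverts that mapping once so the per-event lookup is a single array access (counters[ffs(event) - 1]). A minimal standalone sketch of the same inversion, with simplified enums and __builtin_ctz standing in for ffs(event) - 1 (an illustration under those assumptions, not the Mesa source):

#include <cstdint>
#include <cstdio>

/* Simplified stand-ins for the wait_type/wait_event enums in the diff. */
enum wait_type { wait_type_exp, wait_type_lgkm, wait_type_vm, wait_type_num };

enum wait_event : uint32_t {
   event_smem = 1u << 0,
   event_lds = 1u << 1,
   event_flat = 1u << 2,
   event_vmem = 1u << 3,
   num_events = 4,
};

struct target_info_sketch {
   uint32_t events[wait_type_num] = {}; /* counter -> events that decrement it */
   uint8_t counters[num_events] = {};   /* event -> counters it affects */

   target_info_sketch()
   {
      events[wait_type_lgkm] = event_smem | event_lds | event_flat;
      events[wait_type_vm] = event_vmem | event_flat;
      /* Invert the mapping once so per-event lookups are a plain array access. */
      for (unsigned i = 0; i < wait_type_num; i++) {
         for (unsigned j = 0; j < num_events; j++) {
            if (events[i] & (1u << j))
               counters[j] |= 1u << i;
         }
      }
   }

   uint8_t get_counters_for_event(wait_event ev) const
   {
      return counters[__builtin_ctz(ev)]; /* plays the role of ffs(ev) - 1 */
   }
};

int main()
{
   target_info_sketch info;
   /* FLAT decrements both lgkmcnt and vmcnt, so two counter bits are set. */
   printf("flat -> 0x%x\n", info.get_counters_for_event(event_flat));
   printf("vmem -> 0x%x\n", info.get_counters_for_event(event_vmem));
}

With this shape, supporting a new counter (the diff adds GFX12's sample, bvh and km wait types) only touches the events[] table instead of every switch in the pass.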
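Relatedly, the four per-context counters (vm_cnt, exp_cnt, lgkm_cnt, vs_cnt) collapse into a single nonzero bitmask, because the pass only ever asked whether a counter might be nonzero, never its exact value. force_waitcnt() then becomes a loop over set bits. A sketch of that bookkeeping, using plain bit tests where the diff uses Mesa's u_foreach_bit (assumed semantics: iterate set bit indices):

#include <cstdint>
#include <cstdio>

enum wait_type { wait_type_exp, wait_type_lgkm, wait_type_vm, wait_type_vs, wait_type_num };

struct imm_sketch {
   /* 0xffff mirrors wait_imm::unset_counter: no wait requested. */
   uint16_t cnt[wait_type_num] = {0xffff, 0xffff, 0xffff, 0xffff};
};

/* force_waitcnt: request a zero wait for every counter that might be nonzero */
void force_waitcnt(uint32_t nonzero, imm_sketch& imm)
{
   for (unsigned i = 0; i < wait_type_num; i++) {
      if (nonzero & (1u << i))
         imm.cnt[i] = 0;
   }
}

int main()
{
   uint32_t nonzero = (1u << wait_type_vm) | (1u << wait_type_lgkm);
   imm_sketch imm;
   force_waitcnt(nonzero, imm);
   for (unsigned i = 0; i < wait_type_num; i++)
      printf("cnt[%u] = 0x%x\n", i, imm.cnt[i]); /* vm/lgkm forced to 0 */
}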
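update_barrier_imm() and update_counters() now age pending waits generically: each newer in-order operation of the same event means an entry that previously required "counter <= N" now requires "counter <= N+1", clamped to the architectural maximum taken from wait_imm::max(). A standalone sketch of that saturating increment (illustrative types, not ACO's):

#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <vector>

constexpr uint16_t unset_counter = 0xffff; /* mirrors wait_imm::unset_counter */

struct pending_entry {
   uint16_t imm = unset_counter; /* wait until outstanding ops of this type <= imm */
};

void age_entries(std::vector<pending_entry>& entries, uint16_t max_cnt)
{
   for (pending_entry& e : entries) {
      /* Saturate: a very old entry simply means "wait for everything of this type". */
      if (e.imm != unset_counter)
         e.imm = std::min<uint16_t>(e.imm + 1, max_cnt);
   }
}

int main()
{
   std::vector<pending_entry> entries{{0}, {5}, {unset_counter}};
   age_entries(entries, 62); /* e.g. a GFX9+ vmcnt maximum of 63 minus one */
   for (const pending_entry& e : entries)
      printf("%u\n", e.imm); /* prints 1, 6, 65535 */
}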
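check_clause_raw() is new in this diff: before extending a memory clause, the pass records every register written by clause members in a std::bitset<512> spanning the physical register file (VGPRs start at offset 256) and stops extending the clause as soon as a later candidate reads one of them, since that read-after-write would force a waitcnt in the middle of the clause. A simplified model of the same check, with a hypothetical toy_instr type standing in for ACO's Instruction:

#include <bitset>
#include <cstdio>
#include <utility>
#include <vector>

struct toy_instr {
   std::vector<std::pair<unsigned, unsigned>> reads;  /* (first reg, size in dwords) */
   std::vector<std::pair<unsigned, unsigned>> writes; /* (first reg, size in dwords) */
};

bool check_clause_raw(std::bitset<512>& regs_written, const toy_instr& instr)
{
   for (auto [reg, size] : instr.reads) {
      for (unsigned i = 0; i < size; i++) {
         if (regs_written[reg + i])
            return false; /* reads the result of an earlier clause member */
      }
   }
   for (auto [reg, size] : instr.writes) {
      for (unsigned i = 0; i < size; i++)
         regs_written[reg + i] = 1;
   }
   return true;
}

int main()
{
   std::bitset<512> written;
   toy_instr load_a{{{0, 2}}, {{256, 1}}};   /* reads s[0:1], writes v0 */
   toy_instr load_b{{{0, 2}}, {{257, 1}}};   /* independent: clause can grow */
   toy_instr load_c{{{256, 1}}, {{258, 1}}}; /* reads v0: would split the clause */
   printf("%d %d %d\n", check_clause_raw(written, load_a),
          check_clause_raw(written, load_b), check_clause_raw(written, load_c));
}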