Diffstat (limited to 'src/amd/compiler/aco_insert_waitcnt.cpp')
-rw-r--r-- | src/amd/compiler/aco_insert_waitcnt.cpp | 701 |
1 file changed, 552 insertions, 149 deletions
diff --git a/src/amd/compiler/aco_insert_waitcnt.cpp b/src/amd/compiler/aco_insert_waitcnt.cpp
index d7fc87c126d..ae94582f6ce 100644
--- a/src/amd/compiler/aco_insert_waitcnt.cpp
+++ b/src/amd/compiler/aco_insert_waitcnt.cpp
@@ -1,27 +1,10 @@
 /*
  * Copyright © 2018 Valve Corporation
  *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- *
+ * SPDX-License-Identifier: MIT
  */
+#include "aco_builder.h"
 #include "aco_ir.h"
 
 #include "common/sid.h"
@@ -29,6 +12,7 @@
 #include <map>
 #include <stack>
 #include <vector>
+#include <optional>
 
 namespace aco {
 
@@ -68,7 +52,11 @@ enum wait_event : uint16_t {
    event_gds_gpr_lock = 1 << 9,
    event_vmem_gpr_lock = 1 << 10,
    event_sendmsg = 1 << 11,
-   num_events = 12,
+   event_ldsdir = 1 << 12,
+   event_valu = 1 << 13,
+   event_trans = 1 << 14,
+   event_salu = 1 << 15,
+   num_events = 16,
 };
 
 enum counter_type : uint8_t {
@@ -76,15 +64,106 @@ enum counter_type : uint8_t {
    counter_lgkm = 1 << 1,
    counter_vm = 1 << 2,
    counter_vs = 1 << 3,
-   num_counters = 4,
+   counter_alu = 1 << 4,
+   num_counters = 5,
+};
+
+enum vmem_type : uint8_t {
+   vmem_nosampler = 1 << 0,
+   vmem_sampler = 1 << 1,
+   vmem_bvh = 1 << 2,
 };
 
-static const uint16_t exp_events =
-   event_exp_pos | event_exp_param | event_exp_mrt_null | event_gds_gpr_lock | event_vmem_gpr_lock;
+static const uint16_t exp_events = event_exp_pos | event_exp_param | event_exp_mrt_null |
+                                   event_gds_gpr_lock | event_vmem_gpr_lock | event_ldsdir;
 static const uint16_t lgkm_events = event_smem | event_lds | event_gds | event_flat | event_sendmsg;
 static const uint16_t vm_events = event_vmem | event_flat;
 static const uint16_t vs_events = event_vmem_store;
 
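The hunks above extend the per-event and per-counter bookkeeping; the wait_imm type they keep referring to is defined in aco_ir.h. As a reference for reading the rest of the diff, here is a simplified model of it — illustrative only, the real definition differs in detail: an unset counter is an all-ones sentinel, and combining two pending waits keeps the stricter (smaller) value per counter.

#include <algorithm>
#include <cstdint>

/* Simplified stand-in for aco::wait_imm (illustrative, not the real layout). */
struct wait_imm_model {
   static constexpr uint16_t unset_counter = 0xffff;
   uint16_t vm = unset_counter;   /* outstanding VMEM loads */
   uint16_t exp = unset_counter;  /* outstanding exports / GPR locks */
   uint16_t lgkm = unset_counter; /* outstanding LDS/GDS/constant/message ops */
   uint16_t vs = unset_counter;   /* outstanding VMEM stores (GFX10+) */

   bool combine(const wait_imm_model& other)
   {
      bool changed = other.vm < vm || other.exp < exp || other.lgkm < lgkm || other.vs < vs;
      vm = std::min(vm, other.vm);
      exp = std::min(exp, other.exp);
      lgkm = std::min(lgkm, other.lgkm);
      vs = std::min(vs, other.vs);
      return changed;
   }
};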
+/* On GFX11+ the SIMD frontend doesn't switch to issuing instructions from a different
+ * wave if there is an ALU stall. Hence we have an instruction (s_delay_alu) to signal
+ * that we should switch to a different wave, which contains info on dependencies as to
+ * when we can switch back.
+ *
+ * This seems to apply only to ALU->ALU dependencies, as other instructions have better
+ * integration with the frontend.
+ *
+ * Note that if we do not emit s_delay_alu, things will still be correct, but the wave
+ * will stall in the ALU (and the ALU will be doing nothing else). We rely on this, as
+ * our cycle info is likely wrong at times (necessarily so, e.g. wave64 VALU
+ * instructions can take a different number of cycles based on the exec mask).
+ */
+struct alu_delay_info {
+   /* These are the values directly above the max representable value, i.e. the wait
+    * would turn into a no-op when we try to wait for something further back than
+    * this.
+    */
+   static constexpr int8_t valu_nop = 5;
+   static constexpr int8_t trans_nop = 4;
+
+   /* How many VALU instructions ago this value was written */
+   int8_t valu_instrs = valu_nop;
+   /* Cycles until the writing VALU instruction is finished */
+   int8_t valu_cycles = 0;
+
+   /* How many Transcendent instructions ago this value was written */
+   int8_t trans_instrs = trans_nop;
+   /* Cycles until the writing Transcendent instruction is finished */
+   int8_t trans_cycles = 0;
+
+   /* Cycles until the writing SALU instruction is finished */
+   int8_t salu_cycles = 0;
+
+   bool combine(const alu_delay_info& other)
+   {
+      bool changed = other.valu_instrs < valu_instrs || other.trans_instrs < trans_instrs ||
+                     other.salu_cycles > salu_cycles || other.valu_cycles > valu_cycles ||
+                     other.trans_cycles > trans_cycles;
+      valu_instrs = std::min(valu_instrs, other.valu_instrs);
+      trans_instrs = std::min(trans_instrs, other.trans_instrs);
+      salu_cycles = std::max(salu_cycles, other.salu_cycles);
+      valu_cycles = std::max(valu_cycles, other.valu_cycles);
+      trans_cycles = std::max(trans_cycles, other.trans_cycles);
+      return changed;
+   }
+
+   /* Needs to be called after any change to keep the data consistent. */
+   void fixup()
+   {
+      if (valu_instrs >= valu_nop || valu_cycles <= 0) {
+         valu_instrs = valu_nop;
+         valu_cycles = 0;
+      }
+
+      if (trans_instrs >= trans_nop || trans_cycles <= 0) {
+         trans_instrs = trans_nop;
+         trans_cycles = 0;
+      }
+
+      salu_cycles = std::max<int8_t>(salu_cycles, 0);
+   }
+
+   /* Returns true if a wait would be a no-op. */
+   bool empty() const
+   {
+      return valu_instrs == valu_nop && trans_instrs == trans_nop && salu_cycles == 0;
+   }
+
+   UNUSED void print(FILE* output) const
+   {
+      if (valu_instrs != valu_nop)
+         fprintf(output, "valu_instrs: %u\n", valu_instrs);
+      if (valu_cycles)
+         fprintf(output, "valu_cycles: %u\n", valu_cycles);
+      if (trans_instrs != trans_nop)
+         fprintf(output, "trans_instrs: %u\n", trans_instrs);
+      if (trans_cycles)
+         fprintf(output, "trans_cycles: %u\n", trans_cycles);
+      if (salu_cycles)
+         fprintf(output, "salu_cycles: %u\n", salu_cycles);
+   }
+};
+
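To make the merge semantics of alu_delay_info::combine() concrete, here is a small self-contained demo with made-up values, trimmed to the VALU fields: at a control-flow join, instruction distances are merged with std::min and remaining cycles with std::max, so the result is conservative for both paths.

#include <algorithm>
#include <cstdint>
#include <cstdio>

struct delay_demo {
   int8_t valu_instrs = 5; /* valu_nop: nothing to wait for */
   int8_t valu_cycles = 0;

   void combine(const delay_demo& other)
   {
      valu_instrs = std::min(valu_instrs, other.valu_instrs); /* closer producer wins */
      valu_cycles = std::max(valu_cycles, other.valu_cycles); /* longer stall wins */
   }
};

int main()
{
   delay_demo a{1, 2}; /* path A: value written by the previous VALU instruction */
   delay_demo b{3, 1}; /* path B: value written three VALU instructions ago */
   a.combine(b);
   printf("%d %d\n", a.valu_instrs, a.valu_cycles); /* prints "1 2" */
}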
 uint8_t
 get_counters_for_event(wait_event ev)
 {
@@ -100,39 +179,42 @@ get_counters_for_event(wait_event ev)
    case event_exp_param:
    case event_exp_mrt_null:
    case event_gds_gpr_lock:
-   case event_vmem_gpr_lock: return counter_exp;
+   case event_vmem_gpr_lock:
+   case event_ldsdir: return counter_exp;
+   case event_valu:
+   case event_trans:
+   case event_salu: return counter_alu;
    default: return 0;
    }
 }
 
 struct wait_entry {
    wait_imm imm;
+   alu_delay_info delay;
    uint16_t events;  /* use wait_event notion */
    uint8_t counters; /* use counter_type notion */
    bool wait_on_read : 1;
    bool logical : 1;
-   bool has_vmem_nosampler : 1;
-   bool has_vmem_sampler : 1;
+   uint8_t vmem_types : 4;
 
-   wait_entry(wait_event event_, wait_imm imm_, bool logical_, bool wait_on_read_)
-      : imm(imm_), events(event_), counters(get_counters_for_event(event_)),
-        wait_on_read(wait_on_read_), logical(logical_), has_vmem_nosampler(false),
-        has_vmem_sampler(false)
+   wait_entry(wait_event event_, wait_imm imm_, alu_delay_info delay_, bool logical_,
+              bool wait_on_read_)
+      : imm(imm_), delay(delay_), events(event_), counters(get_counters_for_event(event_)),
+        wait_on_read(wait_on_read_), logical(logical_), vmem_types(0)
    {}
 
    bool join(const wait_entry& other)
   {
      bool changed = (other.events & ~events) || (other.counters & ~counters) ||
-                     (other.wait_on_read && !wait_on_read) ||
-                     (other.has_vmem_nosampler && !has_vmem_nosampler) ||
-                     (other.has_vmem_sampler && !has_vmem_sampler);
+                     (other.wait_on_read && !wait_on_read) || (other.vmem_types & ~vmem_types) ||
+                     (!other.logical && logical);
      events |= other.events;
      counters |= other.counters;
      changed |= imm.combine(other.imm);
+      changed |= delay.combine(other.delay);
      wait_on_read |= other.wait_on_read;
-      has_vmem_nosampler |= other.has_vmem_nosampler;
-      has_vmem_sampler |= other.has_vmem_sampler;
-      assert(logical == other.logical);
+      vmem_types |= other.vmem_types;
+      logical &= other.logical;
      return changed;
   }
@@ -148,14 +230,12 @@ struct wait_entry {
      if (counter == counter_vm) {
         imm.vm = wait_imm::unset_counter;
         events &= ~event_vmem;
-         has_vmem_nosampler = false;
-         has_vmem_sampler = false;
+         vmem_types = 0;
      }
 
      if (counter == counter_exp) {
         imm.exp = wait_imm::unset_counter;
-         events &= ~(event_exp_pos | event_exp_param | event_exp_mrt_null | event_gds_gpr_lock |
-                     event_vmem_gpr_lock);
+         events &= ~exp_events;
      }
 
      if (counter == counter_vs) {
@@ -165,22 +245,44 @@ struct wait_entry {
 
      if (!(counters & counter_lgkm) && !(counters & counter_vm))
         events &= ~event_flat;
+
+      if (counter == counter_alu) {
+         delay = alu_delay_info();
+         events &= ~(event_valu | event_trans | event_salu);
+      }
+   }
+
+   UNUSED void print(FILE* output) const
+   {
+      fprintf(output, "logical: %u\n", logical);
+      imm.print(output);
+      delay.print(output);
+      if (events)
+         fprintf(output, "events: %u\n", events);
+      if (counters)
+         fprintf(output, "counters: %u\n", counters);
+      if (!wait_on_read)
+         fprintf(output, "wait_on_read: %u\n", wait_on_read);
+      if (!logical)
+         fprintf(output, "logical: %u\n", logical);
+      if (vmem_types)
+         fprintf(output, "vmem_types: %u\n", vmem_types);
    }
 };
 
 struct wait_ctx {
    Program* program;
-   enum chip_class chip_class;
+   enum amd_gfx_level gfx_level;
    uint16_t max_vm_cnt;
    uint16_t max_exp_cnt;
    uint16_t max_lgkm_cnt;
    uint16_t max_vs_cnt;
    uint16_t unordered_events = event_smem | event_flat;
 
-   uint8_t vm_cnt = 0;
-   uint8_t exp_cnt = 0;
-   uint8_t lgkm_cnt = 0;
-   uint8_t vs_cnt = 0;
+   bool vm_nonzero = false;
+   bool exp_nonzero = false;
+   bool lgkm_nonzero = false;
+   bool vs_nonzero = false;
    bool pending_flat_lgkm = false;
    bool pending_flat_vm = false;
    bool pending_s_buffer_store = false; /* GFX10 workaround */
@@ -192,24 +294,24 @@ struct wait_ctx {
    wait_ctx() {}
    wait_ctx(Program* program_)
-      : program(program_), chip_class(program_->chip_class),
-        max_vm_cnt(program_->chip_class >= GFX9 ? 62 : 14), max_exp_cnt(6),
-        max_lgkm_cnt(program_->chip_class >= GFX10 ? 62 : 14),
-        max_vs_cnt(program_->chip_class >= GFX10 ? 62 : 0),
-        unordered_events(event_smem | (program_->chip_class < GFX10 ? event_flat : 0))
+      : program(program_), gfx_level(program_->gfx_level),
+        max_vm_cnt(program_->gfx_level >= GFX9 ? 62 : 14), max_exp_cnt(6),
+        max_lgkm_cnt(program_->gfx_level >= GFX10 ? 62 : 14),
+        max_vs_cnt(program_->gfx_level >= GFX10 ? 62 : 0),
+        unordered_events(event_smem | (program_->gfx_level < GFX10 ? event_flat : 0))
    {}
 
    bool join(const wait_ctx* other, bool logical)
    {
-      bool changed = other->exp_cnt > exp_cnt || other->vm_cnt > vm_cnt ||
-                     other->lgkm_cnt > lgkm_cnt || other->vs_cnt > vs_cnt ||
+      bool changed = other->exp_nonzero > exp_nonzero || other->vm_nonzero > vm_nonzero ||
+                     other->lgkm_nonzero > lgkm_nonzero || other->vs_nonzero > vs_nonzero ||
                      (other->pending_flat_lgkm && !pending_flat_lgkm) ||
                      (other->pending_flat_vm && !pending_flat_vm);
 
-      exp_cnt = std::max(exp_cnt, other->exp_cnt);
-      vm_cnt = std::max(vm_cnt, other->vm_cnt);
-      lgkm_cnt = std::max(lgkm_cnt, other->lgkm_cnt);
-      vs_cnt = std::max(vs_cnt, other->vs_cnt);
+      exp_nonzero |= other->exp_nonzero;
+      vm_nonzero |= other->vm_nonzero;
+      lgkm_nonzero |= other->lgkm_nonzero;
+      vs_nonzero |= other->vs_nonzero;
       pending_flat_lgkm |= other->pending_flat_lgkm;
       pending_flat_vm |= other->pending_flat_vm;
       pending_s_buffer_store |= other->pending_s_buffer_store;
@@ -240,10 +342,48 @@ struct wait_ctx {
    {
       entry.remove_counter(counter);
    }
+
+   UNUSED void print(FILE* output) const
+   {
+      fprintf(output, "exp_nonzero: %u\n", exp_nonzero);
+      fprintf(output, "vm_nonzero: %u\n", vm_nonzero);
+      fprintf(output, "lgkm_nonzero: %u\n", lgkm_nonzero);
+      fprintf(output, "vs_nonzero: %u\n", vs_nonzero);
+      fprintf(output, "pending_flat_lgkm: %u\n", pending_flat_lgkm);
+      fprintf(output, "pending_flat_vm: %u\n", pending_flat_vm);
+      for (const auto& entry : gpr_map) {
+         fprintf(output, "gpr_map[%c%u] = {\n", entry.first.reg() >= 256 ? 'v' : 's',
+                 entry.first.reg() & 0xff);
+         entry.second.print(output);
+         fprintf(output, "}\n");
+      }
+
+      for (unsigned i = 0; i < storage_count; i++) {
+         if (!barrier_imm[i].empty() || barrier_events[i]) {
+            fprintf(output, "barriers[%u] = {\n", i);
+            barrier_imm[i].print(output);
+            fprintf(output, "events: %u\n", barrier_events[i]);
+            fprintf(output, "}\n");
+         }
+      }
+   }
 };
 
+uint8_t
+get_vmem_type(Instruction* instr)
+{
+   if (instr->opcode == aco_opcode::image_bvh64_intersect_ray)
+      return vmem_bvh;
+   else if (instr->isMIMG() && !instr->operands[1].isUndefined() &&
+            instr->operands[1].regClass() == s4)
+      return vmem_sampler;
+   else if (instr->isVMEM() || instr->isScratch() || instr->isGlobal())
+      return vmem_nosampler;
+   return 0;
+}
+
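A hypothetical illustration of why check_instr() below compares vmem_types for exact equality instead of mere overlap: different kinds of VMEM returns (sampler, non-sampler, BVH) are not ordered against each other, so overwriting a register without a wait is only safe when everything outstanding on that register is of the same kind as the new access.

#include <cstdint>

enum vmem_type_demo : uint8_t { demo_nosampler = 1 << 0, demo_sampler = 1 << 1 };

int main()
{
   uint8_t outstanding = demo_sampler; /* e.g. an image_sample still writing v0 */
   uint8_t new_type = demo_nosampler;  /* a buffer_load about to overwrite v0 */
   bool skip_wait = new_type && outstanding == new_type;
   return skip_wait ? 1 : 0; /* 0 here: the types differ, so a vmcnt wait is required */
}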
 void
-check_instr(wait_ctx& ctx, wait_imm& wait, Instruction* instr)
+check_instr(wait_ctx& ctx, wait_imm& wait, alu_delay_info& delay, Instruction* instr)
 {
    for (const Operand op : instr->operands) {
       if (op.isConstant() || op.isUndefined())
@@ -257,6 +397,8 @@ check_instr(wait_ctx& ctx, wait_imm& wait, Instruction* instr)
            continue;
 
         wait.combine(it->second.imm);
+         if (instr->isVALU() || instr->isSALU())
+            delay.combine(it->second.delay);
      }
    }
 
@@ -270,11 +412,9 @@ check_instr(wait_ctx& ctx, wait_imm& wait, Instruction* instr)
         continue;
 
      /* Vector Memory reads and writes return in the order they were issued */
-      bool has_sampler = instr->isMIMG() && !instr->operands[1].isUndefined() &&
-                         instr->operands[1].regClass() == s4;
-      if (instr->isVMEM() && ((it->second.events & vm_events) == event_vmem) &&
-          it->second.has_vmem_nosampler == !has_sampler &&
-          it->second.has_vmem_sampler == has_sampler)
+      uint8_t vmem_type = get_vmem_type(instr);
+      if (vmem_type && ((it->second.events & vm_events) == event_vmem) &&
+          it->second.vmem_types == vmem_type)
         continue;
 
      /* LDS reads and writes return in the order they were issued. same for GDS */
@@ -290,17 +430,39 @@
 bool
 parse_wait_instr(wait_ctx& ctx, wait_imm& imm, Instruction* instr)
 {
-   if (instr->opcode == aco_opcode::s_waitcnt_vscnt &&
-       instr->definitions[0].physReg() == sgpr_null) {
-      imm.vs = std::min<uint8_t>(imm.vs, instr->sopk().imm);
+   if (instr->opcode == aco_opcode::s_waitcnt_vscnt && instr->operands[0].physReg() == sgpr_null) {
+      imm.vs = std::min<uint8_t>(imm.vs, instr->salu().imm);
      return true;
    } else if (instr->opcode == aco_opcode::s_waitcnt) {
-      imm.combine(wait_imm(ctx.chip_class, instr->sopp().imm));
+      imm.combine(wait_imm(ctx.gfx_level, instr->salu().imm));
      return true;
    }
    return false;
 }
 
+bool
+parse_delay_alu(wait_ctx& ctx, alu_delay_info& delay, Instruction* instr)
+{
+   if (instr->opcode != aco_opcode::s_delay_alu)
+      return false;
+
+   unsigned imm[2] = {instr->salu().imm & 0xf, (instr->salu().imm >> 7) & 0xf};
+   for (unsigned i = 0; i < 2; ++i) {
+      alu_delay_wait wait = (alu_delay_wait)imm[i];
+      if (wait >= alu_delay_wait::VALU_DEP_1 && wait <= alu_delay_wait::VALU_DEP_4)
+         delay.valu_instrs = imm[i] - (uint32_t)alu_delay_wait::VALU_DEP_1 + 1;
+      else if (wait >= alu_delay_wait::TRANS32_DEP_1 && wait <= alu_delay_wait::TRANS32_DEP_3)
+         delay.trans_instrs = imm[i] - (uint32_t)alu_delay_wait::TRANS32_DEP_1 + 1;
+      else if (wait >= alu_delay_wait::SALU_CYCLE_1)
+         delay.salu_cycles = imm[i] - (uint32_t)alu_delay_wait::SALU_CYCLE_1 + 1;
+   }
+
+   delay.valu_cycles = instr->pass_flags & 0xffff;
+   delay.trans_cycles = instr->pass_flags >> 16;
+
+   return true;
+}
+
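parse_delay_alu() above and the skip-field merge at the end of the pass both depend on the layout of the s_delay_alu immediate. The following sketch spells that layout out the way the pass uses it — field names are informal, and the authoritative encoding is in the RDNA3 ISA documentation: bits [3:0] hold the first wait condition, bits [6:4] a skip distance, and bits [10:7] a second wait condition.

#include <cstdint>

constexpr uint16_t encode_delay(unsigned dep0, unsigned skip, unsigned dep1)
{
   return (dep0 & 0xf) | ((skip & 0x7) << 4) | ((dep1 & 0xf) << 7);
}

/* The two decodes below match parse_delay_alu(): imm & 0xf and (imm >> 7) & 0xf. */
constexpr unsigned first_condition(uint16_t imm) { return imm & 0xf; }
constexpr unsigned second_condition(uint16_t imm) { return (imm >> 7) & 0xf; }

static_assert(first_condition(encode_delay(2, 1, 3)) == 2, "low field round-trips");
static_assert(second_condition(encode_delay(2, 1, 3)) == 3, "high field round-trips");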
 void
 perform_barrier(wait_ctx& ctx, wait_imm& imm, memory_sync_info sync, unsigned semantics)
 {
@@ -332,41 +494,89 @@ perform_barrier(wait_ctx& ctx, wait_imm& imm, memory_sync_info sync, unsigned se
 void
 force_waitcnt(wait_ctx& ctx, wait_imm& imm)
 {
-   if (ctx.vm_cnt)
+   if (ctx.vm_nonzero)
      imm.vm = 0;
-   if (ctx.exp_cnt)
+   if (ctx.exp_nonzero)
      imm.exp = 0;
-   if (ctx.lgkm_cnt)
+   if (ctx.lgkm_nonzero)
      imm.lgkm = 0;
 
-   if (ctx.chip_class >= GFX10) {
-      if (ctx.vs_cnt)
+   if (ctx.gfx_level >= GFX10) {
+      if (ctx.vs_nonzero)
        imm.vs = 0;
    }
 }
 
 void
-kill(wait_imm& imm, Instruction* instr, wait_ctx& ctx, memory_sync_info sync_info)
+update_alu(wait_ctx& ctx, bool is_valu, bool is_trans, bool clear, int cycles)
 {
-   if (debug_flags & DEBUG_FORCE_WAITCNT) {
+   std::map<PhysReg, wait_entry>::iterator it = ctx.gpr_map.begin();
+   while (it != ctx.gpr_map.end()) {
+      wait_entry& entry = it->second;
+
+      if (clear) {
+         entry.remove_counter(counter_alu);
+      } else {
+         entry.delay.valu_instrs += is_valu ? 1 : 0;
+         entry.delay.trans_instrs += is_trans ? 1 : 0;
+         entry.delay.salu_cycles -= cycles;
+         entry.delay.valu_cycles -= cycles;
+         entry.delay.trans_cycles -= cycles;
+
+         entry.delay.fixup();
+         if (it->second.delay.empty())
+            entry.remove_counter(counter_alu);
+      }
+
+      if (!entry.counters)
+         it = ctx.gpr_map.erase(it);
+      else
+         it++;
+   }
+}
+
+void
+kill(wait_imm& imm, alu_delay_info& delay, Instruction* instr, wait_ctx& ctx,
+     memory_sync_info sync_info)
+{
+   if (instr->opcode == aco_opcode::s_setpc_b64 || (debug_flags & DEBUG_FORCE_WAITCNT)) {
      /* Force emitting waitcnt states right after the instruction if there is
-       * something to wait for.
+       * something to wait for. This is also applied for s_setpc_b64 to ensure
+       * waitcnt states are inserted before jumping to the PS epilog.
       */
-      return force_waitcnt(ctx, imm);
+      force_waitcnt(ctx, imm);
    }
 
-   if (ctx.exp_cnt || ctx.vm_cnt || ctx.lgkm_cnt)
-      check_instr(ctx, imm, instr);
+   /* Make sure POPS coherent memory accesses have reached the L2 cache before letting the
+    * overlapping waves proceed into the ordered section.
+    */
+   if (ctx.program->has_pops_overlapped_waves_wait &&
+       (ctx.gfx_level >= GFX11 ? instr->isEXP() && instr->exp().done
+                               : (instr->opcode == aco_opcode::s_sendmsg &&
+                                  instr->salu().imm == sendmsg_ordered_ps_done))) {
+      if (ctx.vm_nonzero)
+         imm.vm = 0;
+      if (ctx.gfx_level >= GFX10 && ctx.vs_nonzero)
+         imm.vs = 0;
+      /* Await SMEM loads too, as it's possible for an application to create them, like using a
+       * scalarization loop - pointless and suboptimal for an inherently divergent address of
+       * per-pixel data, but it can still be done at least synthetically and must be handled
+       * correctly.
+       */
+      if (ctx.program->has_smem_buffer_or_global_loads && ctx.lgkm_nonzero)
+         imm.lgkm = 0;
+   }
+
+   check_instr(ctx, imm, delay, instr);
 
    /* It's required to wait for scalar stores before "writing back" data.
    * It shouldn't cost anything anyways since we're about to do s_endpgm.
    */
-   if (ctx.lgkm_cnt && instr->opcode == aco_opcode::s_dcache_wb) {
-      assert(ctx.chip_class >= GFX8);
+   if (ctx.lgkm_nonzero && instr->opcode == aco_opcode::s_dcache_wb) {
+      assert(ctx.gfx_level >= GFX8);
      imm.lgkm = 0;
    }
 
-   if (ctx.chip_class >= GFX10 && instr->isSMEM()) {
+   if (ctx.gfx_level >= GFX10 && instr->isSMEM()) {
      /* GFX10: A store followed by a load at the same address causes a problem because
       * the load doesn't load the correct values unless we wait for the store first.
       * This is NOT mitigated by an s_nop.
@@ -379,19 +589,9 @@ kill(wait_imm& imm, Instruction* instr, wait_ctx& ctx, memory_sync_inf
      }
    }
 
-   if (ctx.program->early_rast && instr->opcode == aco_opcode::exp) {
-      if (instr->exp().dest >= V_008DFC_SQ_EXP_POS && instr->exp().dest < V_008DFC_SQ_EXP_PRIM) {
-
-         /* With early_rast, the HW will start clipping and rasterization after the 1st DONE pos
-          * export. Wait for all stores (and atomics) to complete, so PS can read them.
-          * TODO: This only really applies to DONE pos exports.
-          *       Consider setting the DONE bit earlier.
-          */
-         if (ctx.vs_cnt > 0)
-            imm.vs = 0;
-         if (ctx.vm_cnt > 0)
-            imm.vm = 0;
-      }
+   if (instr->opcode == aco_opcode::ds_ordered_count &&
+       ((instr->ds().offset1 | (instr->ds().offset0 >> 8)) & 0x1)) {
+      imm.combine(ctx.barrier_imm[ffs(storage_gds) - 1]);
    }
 
    if (instr->opcode == aco_opcode::p_barrier)
@@ -399,17 +599,17 @@ kill(wait_imm& imm, Instruction* instr, wait_ctx& ctx, memory_sync_inf
    else
      perform_barrier(ctx, imm, sync_info, semantic_release);
 
-   if (!imm.empty()) {
+   if (!imm.empty() || !delay.empty()) {
      if (ctx.pending_flat_vm && imm.vm != wait_imm::unset_counter)
        imm.vm = 0;
      if (ctx.pending_flat_lgkm && imm.lgkm != wait_imm::unset_counter)
        imm.lgkm = 0;
 
      /* reset counters */
-      ctx.exp_cnt = std::min(ctx.exp_cnt, imm.exp);
-      ctx.vm_cnt = std::min(ctx.vm_cnt, imm.vm);
-      ctx.lgkm_cnt = std::min(ctx.lgkm_cnt, imm.lgkm);
-      ctx.vs_cnt = std::min(ctx.vs_cnt, imm.vs);
+      ctx.exp_nonzero &= imm.exp != 0;
+      ctx.vm_nonzero &= imm.vm != 0;
+      ctx.lgkm_nonzero &= imm.lgkm != 0;
+      ctx.vs_nonzero &= imm.vs != 0;
 
      /* update barrier wait imms */
      for (unsigned i = 0; i < storage_count; i++) {
@@ -435,6 +635,11 @@ kill(wait_imm& imm, Instruction* instr, wait_ctx& ctx, memory_sync_inf
        bar_ev &= ~event_flat;
      }
 
+      if (ctx.program->gfx_level >= GFX11) {
+         update_alu(ctx, false, false, false,
+                    MAX3(delay.salu_cycles, delay.valu_cycles, delay.trans_cycles));
+      }
+
      /* remove all gprs with higher counter from map */
      std::map<PhysReg, wait_entry>::iterator it = ctx.gpr_map.begin();
      while (it != ctx.gpr_map.end()) {
@@ -446,6 +651,13 @@ kill(wait_imm& imm, Instruction* instr, wait_ctx& ctx, memory_sync_inf
           ctx.wait_and_remove_from_entry(it->first, it->second, counter_lgkm);
        if (imm.vs != wait_imm::unset_counter && imm.vs <= it->second.imm.vs)
           ctx.wait_and_remove_from_entry(it->first, it->second, counter_vs);
+        if (delay.valu_instrs <= it->second.delay.valu_instrs)
+           it->second.delay.valu_instrs = alu_delay_info::valu_nop;
+        if (delay.trans_instrs <= it->second.delay.trans_instrs)
+           it->second.delay.trans_instrs = alu_delay_info::trans_nop;
+        it->second.delay.fixup();
+        if (it->second.delay.empty())
+           ctx.wait_and_remove_from_entry(it->first, it->second, counter_alu);
        if (!it->second.counters)
          it = ctx.gpr_map.erase(it);
        else
@@ -502,14 +714,14 @@ update_counters(wait_ctx& ctx, wait_event event, memory_sync_info sync = memory_
 {
    uint8_t counters = get_counters_for_event(event);
 
-   if (counters & counter_lgkm && ctx.lgkm_cnt <= ctx.max_lgkm_cnt)
-      ctx.lgkm_cnt++;
-   if (counters & counter_vm && ctx.vm_cnt <= ctx.max_vm_cnt)
-      ctx.vm_cnt++;
-   if (counters & counter_exp && ctx.exp_cnt <= ctx.max_exp_cnt)
-      ctx.exp_cnt++;
-   if (counters & counter_vs && ctx.vs_cnt <= ctx.max_vs_cnt)
-      ctx.vs_cnt++;
+   if (counters & counter_lgkm)
+      ctx.lgkm_nonzero = true;
+   if (counters & counter_vm)
+      ctx.vm_nonzero = true;
+   if (counters & counter_exp)
+      ctx.exp_nonzero = true;
+   if (counters & counter_vs)
+      ctx.vs_nonzero = true;
 
    update_barrier_imm(ctx, counters, event, sync);
 
@@ -547,12 +759,10 @@ update_counters(wait_ctx& ctx, wait_event event, memory_sync_info sync = memory_
 void
 update_counters_for_flat_load(wait_ctx& ctx, memory_sync_info sync = memory_sync_info())
 {
-   assert(ctx.chip_class < GFX10);
+   assert(ctx.gfx_level < GFX10);
 
-   if (ctx.lgkm_cnt <= ctx.max_lgkm_cnt)
-      ctx.lgkm_cnt++;
-   if (ctx.vm_cnt <= ctx.max_vm_cnt)
-      ctx.vm_cnt++;
+   ctx.lgkm_nonzero = true;
+   ctx.vm_nonzero = true;
 
    update_barrier_imm(ctx, counter_vm | counter_lgkm, event_flat, sync);
 
@@ -568,7 +778,7 @@ update_counters_for_flat_load(wait_ctx& ctx, memory_sync
 
 void
 insert_wait_entry(wait_ctx& ctx, PhysReg reg, RegClass rc, wait_event event, bool wait_on_read,
-                  bool has_sampler = false)
+                  uint8_t vmem_types = 0, unsigned cycles = 0, bool force_linear = false)
 {
    uint16_t counters = get_counters_for_event(event);
    wait_imm imm;
@@ -581,9 +791,19 @@ insert_wait_entry(wait_ctx& ctx, PhysReg reg, RegClass rc, wait_event event, boo
    if (counters & counter_vs)
      imm.vs = 0;
 
-   wait_entry new_entry(event, imm, !rc.is_linear(), wait_on_read);
-   new_entry.has_vmem_nosampler = (event & event_vmem) && !has_sampler;
-   new_entry.has_vmem_sampler = (event & event_vmem) && has_sampler;
+   alu_delay_info delay;
+   if (event == event_valu) {
+      delay.valu_instrs = 0;
+      delay.valu_cycles = cycles;
+   } else if (event == event_trans) {
+      delay.trans_instrs = 0;
+      delay.trans_cycles = cycles;
+   } else if (event == event_salu) {
+      delay.salu_cycles = cycles;
+   }
+
+   wait_entry new_entry(event, imm, delay, !rc.is_linear() && !force_linear, wait_on_read);
+   new_entry.vmem_types |= vmem_types;
 
    for (unsigned i = 0; i < rc.size(); i++) {
      auto it = ctx.gpr_map.emplace(PhysReg{reg.reg() + i}, new_entry);
@@ -593,16 +813,52 @@ insert_wait_entry(wait_ctx& ctx, PhysReg reg, RegClass rc, wait_event event, boo
 }
 
 void
-insert_wait_entry(wait_ctx& ctx, Operand op, wait_event event, bool has_sampler = false)
+insert_wait_entry(wait_ctx& ctx, Operand op, wait_event event, uint8_t vmem_types = 0)
 {
    if (!op.isConstant() && !op.isUndefined())
-      insert_wait_entry(ctx, op.physReg(), op.regClass(), event, false, has_sampler);
+      insert_wait_entry(ctx, op.physReg(), op.regClass(), event, false, vmem_types, 0);
 }
 
 void
-insert_wait_entry(wait_ctx& ctx, Definition def, wait_event event, bool has_sampler = false)
+insert_wait_entry(wait_ctx& ctx, Definition def, wait_event event, uint8_t vmem_types = 0,
+                  unsigned cycles = 0)
 {
-   insert_wait_entry(ctx, def.physReg(), def.regClass(), event, true, has_sampler);
+   /* We can't safely write to unwritten destination VGPR lanes with DS/VMEM on GFX11 without
+    * waiting for the load to finish.
+    * Also, follow linear control flow for ALU because it's unlikely that the hardware does per-lane
+    * dependency checks.
+    */
+   uint32_t ds_vmem_events = event_lds | event_gds | event_vmem | event_flat;
+   uint32_t alu_events = event_trans | event_valu | event_salu;
+   bool force_linear = ctx.gfx_level >= GFX11 && (event & (ds_vmem_events | alu_events));
+
+   insert_wait_entry(ctx, def.physReg(), def.regClass(), event, true, vmem_types, cycles,
+                     force_linear);
+}
+
+void
+gen_alu(Instruction* instr, wait_ctx& ctx)
+{
+   Instruction_cycle_info cycle_info = get_cycle_info(*ctx.program, *instr);
+   bool is_valu = instr->isVALU();
+   bool is_trans = instr->isTrans();
+   bool clear = instr->isEXP() || instr->isDS() || instr->isMIMG() || instr->isFlatLike() ||
+                instr->isMUBUF() || instr->isMTBUF();
+
+   wait_event event = (wait_event)0;
+   if (is_trans)
+      event = event_trans;
+   else if (is_valu)
+      event = event_valu;
+   else if (instr->isSALU())
+      event = event_salu;
+
+   if (event != (wait_event)0) {
+      for (const Definition& def : instr->definitions)
+         insert_wait_entry(ctx, def, event, 0, cycle_info.latency);
+   }
+   update_alu(ctx, is_valu && instr_info.classes[(int)instr->opcode] != instr_class::wmma,
+              is_trans, clear, cycle_info.issue_cycles);
 }
 
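gen_alu() above records two different time scales from get_cycle_info(): a new dependency entry starts out with the producer's result latency, while update_alu() ages every live entry by the issue cycles of each subsequent ALU instruction. A toy model with invented numbers:

/* Illustrative only: how many latency cycles remain after n single-issue instructions. */
int remaining_latency(int latency, int issue_cycles, int num_following_instrs)
{
   int remaining = latency; /* what insert_wait_entry() stores via cycle_info.latency */
   for (int i = 0; i < num_following_instrs && remaining > 0; i++)
      remaining -= issue_cycles; /* what update_alu() subtracts per instruction */
   return remaining > 0 ? remaining : 0; /* 0 means no s_delay_alu is needed anymore */
}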
 void
@@ -634,7 +890,7 @@ gen(Instruction* instr, wait_ctx& ctx)
    }
    case Format::FLAT: {
      FLAT_instruction& flat = instr->flat();
-      if (ctx.chip_class < GFX10 && !instr->definitions.empty())
+      if (ctx.gfx_level < GFX10 && !instr->definitions.empty())
        update_counters_for_flat_load(ctx, flat.sync);
      else
        update_counters(ctx, event_flat, flat.sync);
@@ -649,7 +905,7 @@ gen(Instruction* instr, wait_ctx& ctx)
 
      if (!instr->definitions.empty())
        insert_wait_entry(ctx, instr->definitions[0], event_smem);
-      else if (ctx.chip_class >= GFX10 && !smem.sync.can_reorder())
+      else if (ctx.gfx_level >= GFX10 && !smem.sync.can_reorder())
        ctx.pending_s_buffer_store = true;
 
      break;
@@ -670,26 +926,28 @@ gen(Instruction* instr, wait_ctx& ctx)
      }
      break;
    }
+   case Format::LDSDIR: {
+      LDSDIR_instruction& ldsdir = instr->ldsdir();
+      update_counters(ctx, event_ldsdir, ldsdir.sync);
+      insert_wait_entry(ctx, instr->definitions[0], event_ldsdir);
+      break;
+   }
    case Format::MUBUF:
    case Format::MTBUF:
    case Format::MIMG:
-   case Format::GLOBAL: {
+   case Format::GLOBAL:
+   case Format::SCRATCH: {
      wait_event ev =
-         !instr->definitions.empty() || ctx.chip_class < GFX10 ? event_vmem : event_vmem_store;
+         !instr->definitions.empty() || ctx.gfx_level < GFX10 ? event_vmem : event_vmem_store;
      update_counters(ctx, ev, get_sync_info(instr));
 
-      bool has_sampler = instr->isMIMG() && !instr->operands[1].isUndefined() &&
-                         instr->operands[1].regClass() == s4;
-
      if (!instr->definitions.empty())
-         insert_wait_entry(ctx, instr->definitions[0], ev, has_sampler);
+         insert_wait_entry(ctx, instr->definitions[0], ev, get_vmem_type(instr));
 
-      if (ctx.chip_class == GFX6 && instr->format != Format::MIMG && instr->operands.size() == 4) {
-         ctx.exp_cnt++;
+      if (ctx.gfx_level == GFX6 && instr->format != Format::MIMG && instr->operands.size() == 4) {
        update_counters(ctx, event_vmem_gpr_lock);
        insert_wait_entry(ctx, instr->operands[3], event_vmem_gpr_lock);
-      } else if (ctx.chip_class == GFX6 && instr->isMIMG() && !instr->operands[2].isUndefined()) {
-         ctx.exp_cnt++;
+      } else if (ctx.gfx_level == GFX6 && instr->isMIMG() && !instr->operands[2].isUndefined()) {
        update_counters(ctx, event_vmem_gpr_lock);
        insert_wait_entry(ctx, instr->operands[2], event_vmem_gpr_lock);
      }
@@ -701,6 +959,14 @@ gen(Instruction* instr, wait_ctx& ctx)
      update_counters(ctx, event_sendmsg);
      break;
    }
+   case Format::SOP1: {
+      if (instr->opcode == aco_opcode::s_sendmsg_rtn_b32 ||
+          instr->opcode == aco_opcode::s_sendmsg_rtn_b64) {
+         update_counters(ctx, event_sendmsg);
+         insert_wait_entry(ctx, instr->definitions[0], event_sendmsg);
+      }
+      break;
+   }
    default: break;
    }
 }
@@ -709,50 +975,146 @@
 void
 emit_waitcnt(wait_ctx& ctx, std::vector<aco_ptr<Instruction>>& instructions, wait_imm& imm)
 {
    if (imm.vs != wait_imm::unset_counter) {
-      assert(ctx.chip_class >= GFX10);
-      SOPK_instruction* waitcnt_vs =
-         create_instruction<SOPK_instruction>(aco_opcode::s_waitcnt_vscnt, Format::SOPK, 0, 1);
-      waitcnt_vs->definitions[0] = Definition(sgpr_null, s1);
-      waitcnt_vs->imm = imm.vs;
+      assert(ctx.gfx_level >= GFX10);
+      Instruction* waitcnt_vs = create_instruction(aco_opcode::s_waitcnt_vscnt, Format::SOPK, 1, 0);
+      waitcnt_vs->operands[0] = Operand(sgpr_null, s1);
+      waitcnt_vs->salu().imm = imm.vs;
      instructions.emplace_back(waitcnt_vs);
      imm.vs = wait_imm::unset_counter;
    }
    if (!imm.empty()) {
-      SOPP_instruction* waitcnt =
-         create_instruction<SOPP_instruction>(aco_opcode::s_waitcnt, Format::SOPP, 0, 0);
-      waitcnt->imm = imm.pack(ctx.chip_class);
-      waitcnt->block = -1;
+      Instruction* waitcnt = create_instruction(aco_opcode::s_waitcnt, Format::SOPP, 0, 0);
+      waitcnt->salu().imm = imm.pack(ctx.gfx_level);
      instructions.emplace_back(waitcnt);
    }
    imm = wait_imm();
 }
 
 void
+emit_delay_alu(wait_ctx& ctx, std::vector<aco_ptr<Instruction>>& instructions,
+               alu_delay_info& delay)
+{
+   uint32_t imm = 0;
+   if (delay.trans_instrs != delay.trans_nop) {
+      imm |= (uint32_t)alu_delay_wait::TRANS32_DEP_1 + delay.trans_instrs - 1;
+   }
+
+   if (delay.valu_instrs != delay.valu_nop) {
+      imm |= ((uint32_t)alu_delay_wait::VALU_DEP_1 + delay.valu_instrs - 1) << (imm ? 7 : 0);
+   }
+
+   /* Note that we can only put 2 wait conditions in the instruction, so if we have all 3 we just
+    * drop the SALU one. Here we rely on the fact that this doesn't affect correctness, so
+    * occasionally getting it wrong isn't an issue. */
+   if (delay.salu_cycles && imm <= 0xf) {
+      unsigned cycles = std::min<uint8_t>(3, delay.salu_cycles);
+      imm |= ((uint32_t)alu_delay_wait::SALU_CYCLE_1 + cycles - 1) << (imm ? 7 : 0);
+   }
+
+   Instruction* inst = create_instruction(aco_opcode::s_delay_alu, Format::SOPP, 0, 0);
+   inst->salu().imm = imm;
+   inst->pass_flags = (delay.valu_cycles | (delay.trans_cycles << 16));
+   instructions.emplace_back(inst);
+   delay = alu_delay_info();
+}
+
+bool
+check_clause_raw(std::bitset<512>& regs_written, Instruction* instr)
+{
+   for (Operand op : instr->operands) {
+      if (op.isConstant())
+         continue;
+      for (unsigned i = 0; i < op.size(); i++) {
+         if (regs_written[op.physReg().reg() + i])
+            return false;
+      }
+   }
+
+   for (Definition def : instr->definitions) {
+      for (unsigned i = 0; i < def.size(); i++)
+         regs_written[def.physReg().reg() + i] = 1;
+   }
+
+   return true;
+}
+
+void
 handle_block(Program* program, Block& block, wait_ctx& ctx)
 {
    std::vector<aco_ptr<Instruction>> new_instructions;
 
    wait_imm queued_imm;
+   alu_delay_info queued_delay;
+
+   size_t clause_end = 0;
+   for (size_t i = 0; i < block.instructions.size(); i++) {
+      aco_ptr<Instruction>& instr = block.instructions[i];
 
-   for (aco_ptr<Instruction>& instr : block.instructions) {
      bool is_wait = parse_wait_instr(ctx, queued_imm, instr.get());
+      bool is_delay_alu = parse_delay_alu(ctx, queued_delay, instr.get());
 
      memory_sync_info sync_info = get_sync_info(instr.get());
-      kill(queued_imm, instr.get(), ctx, sync_info);
+      kill(queued_imm, queued_delay, instr.get(), ctx, sync_info);
+
+      /* At the start of a possible clause, also emit waitcnts for each instruction to avoid
+       * splitting the clause.
+       */
+      if (i >= clause_end || !queued_imm.empty()) {
+         std::optional<std::bitset<512>> regs_written;
+         for (clause_end = i + 1; clause_end < block.instructions.size(); clause_end++) {
+            Instruction* next = block.instructions[clause_end].get();
+            if (!should_form_clause(instr.get(), next))
+               break;
+
+            if (!regs_written) {
+               regs_written.emplace();
+               check_clause_raw(*regs_written, instr.get());
+            }
+
+            if (!check_clause_raw(*regs_written, next))
+               break;
+
+            kill(queued_imm, queued_delay, next, ctx, get_sync_info(next));
+         }
+      }
 
+      if (program->gfx_level >= GFX11)
+         gen_alu(instr.get(), ctx);
      gen(instr.get(), ctx);
 
-      if (instr->format != Format::PSEUDO_BARRIER && !is_wait) {
+      if (instr->format != Format::PSEUDO_BARRIER && !is_wait && !is_delay_alu) {
+         if (instr->isVINTERP_INREG() && queued_imm.exp != wait_imm::unset_counter) {
+            instr->vinterp_inreg().wait_exp = MIN2(instr->vinterp_inreg().wait_exp, queued_imm.exp);
+            queued_imm.exp = wait_imm::unset_counter;
+         }
+
        if (!queued_imm.empty())
          emit_waitcnt(ctx, new_instructions, queued_imm);
+        if (!queued_delay.empty())
+           emit_delay_alu(ctx, new_instructions, queued_delay);
+
+        bool is_ordered_count_acquire =
+           instr->opcode == aco_opcode::ds_ordered_count &&
+           !((instr->ds().offset1 | (instr->ds().offset0 >> 8)) & 0x1);
 
        new_instructions.emplace_back(std::move(instr));
        perform_barrier(ctx, queued_imm, sync_info, semantic_acquire);
+
+        if (is_ordered_count_acquire)
+           queued_imm.combine(ctx.barrier_imm[ffs(storage_gds) - 1]);
      }
    }
 
+   /* For the last block of a program which has a succeeding shader part, wait for all memory
+    * operations to finish before continuing to the next shader part.
+ */ + if (block.kind & block_kind_end_with_regs) + force_waitcnt(ctx, queued_imm); + if (!queued_imm.empty()) emit_waitcnt(ctx, new_instructions, queued_imm); + if (!queued_delay.empty()) + emit_delay_alu(ctx, new_instructions, queued_delay); block.instructions.swap(new_instructions); } @@ -767,11 +1129,30 @@ insert_wait_states(Program* program) std::vector<wait_ctx> in_ctx(program->blocks.size(), wait_ctx(program)); std::vector<wait_ctx> out_ctx(program->blocks.size(), wait_ctx(program)); - std::stack<unsigned> loop_header_indices; + std::stack<unsigned, std::vector<unsigned>> loop_header_indices; unsigned loop_progress = 0; + if (program->pending_lds_access) { + update_barrier_imm(in_ctx[0], get_counters_for_event(event_lds), event_lds, + memory_sync_info(storage_shared)); + } + + for (Definition def : program->args_pending_vmem) { + update_counters(in_ctx[0], event_vmem); + insert_wait_entry(in_ctx[0], def, event_vmem); + } + for (unsigned i = 0; i < program->blocks.size();) { Block& current = program->blocks[i++]; + + if (current.kind & block_kind_discard_early_exit) { + /* Because the jump to the discard early exit block may happen anywhere in a block, it's + * not possible to join it with its predecessors this way. + * We emit all required waits when emitting the discard block. + */ + continue; + } + wait_ctx ctx = in_ctx[current.index]; if (current.kind & block_kind_loop_header) { @@ -801,11 +1182,6 @@ insert_wait_states(Program* program) in_ctx[current.index] = ctx; } - if (current.instructions.empty()) { - out_ctx[current.index] = std::move(ctx); - continue; - } - loop_progress = std::max<unsigned>(loop_progress, current.loop_nest_depth); done[current.index] = true; @@ -813,6 +1189,33 @@ insert_wait_states(Program* program) out_ctx[current.index] = std::move(ctx); } + + /* Combine s_delay_alu using the skip field. */ + if (program->gfx_level >= GFX11) { + for (Block& block : program->blocks) { + int i = 0; + int prev_delay_alu = -1; + for (aco_ptr<Instruction>& instr : block.instructions) { + if (instr->opcode != aco_opcode::s_delay_alu) { + block.instructions[i++] = std::move(instr); + continue; + } + + uint16_t imm = instr->salu().imm; + int skip = i - prev_delay_alu - 1; + if (imm >> 7 || prev_delay_alu < 0 || skip >= 6) { + if (imm >> 7 == 0) + prev_delay_alu = i; + block.instructions[i++] = std::move(instr); + continue; + } + + block.instructions[prev_delay_alu]->salu().imm |= (skip << 4) | (imm << 7); + prev_delay_alu = -1; + } + block.instructions.resize(i); + } + } } } // namespace aco |