Diffstat (limited to 'src/amd/compiler/aco_scheduler.cpp')
-rw-r--r-- | src/amd/compiler/aco_scheduler.cpp | 223 |
1 file changed, 164 insertions, 59 deletions
diff --git a/src/amd/compiler/aco_scheduler.cpp b/src/amd/compiler/aco_scheduler.cpp
index 2f72ba0cc96..20a08ac7636 100644
--- a/src/amd/compiler/aco_scheduler.cpp
+++ b/src/amd/compiler/aco_scheduler.cpp
@@ -1,25 +1,7 @@
 /*
  * Copyright © 2018 Valve Corporation
  *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- *
+ * SPDX-License-Identifier: MIT
  */
 
 #include "aco_builder.h"
@@ -37,7 +19,7 @@
 #define SMEM_MAX_MOVES (64 - ctx.num_waves * 4)
 #define VMEM_MAX_MOVES (256 - ctx.num_waves * 16)
 /* creating clauses decreases def-use distances, so make it less aggressive the lower num_waves is */
-#define VMEM_CLAUSE_MAX_GRAB_DIST (ctx.num_waves * 8)
+#define VMEM_CLAUSE_MAX_GRAB_DIST (ctx.num_waves * 2)
 #define POS_EXP_MAX_MOVES 512
 
 namespace aco {
@@ -122,6 +104,7 @@ struct MoveState {
 };
 
 struct sched_ctx {
+   amd_gfx_level gfx_level;
    int16_t num_waves;
    int16_t last_SMEM_stall;
    int last_SMEM_dep_idx;
@@ -420,21 +403,21 @@ MoveState::upwards_skip(UpwardsCursor& cursor)
 }
 
 bool
-is_gs_or_done_sendmsg(const Instruction* instr)
+is_done_sendmsg(amd_gfx_level gfx_level, const Instruction* instr)
 {
-   if (instr->opcode == aco_opcode::s_sendmsg) {
-      uint16_t imm = instr->sopp().imm;
-      return (imm & sendmsg_id_mask) == _sendmsg_gs || (imm & sendmsg_id_mask) == _sendmsg_gs_done;
-   }
+   if (gfx_level <= GFX10_3 && instr->opcode == aco_opcode::s_sendmsg)
+      return (instr->salu().imm & sendmsg_id_mask) == sendmsg_gs_done;
    return false;
 }
 
 bool
-is_done_sendmsg(const Instruction* instr)
+is_pos_prim_export(amd_gfx_level gfx_level, const Instruction* instr)
 {
-   if (instr->opcode == aco_opcode::s_sendmsg)
-      return (instr->sopp().imm & sendmsg_id_mask) == _sendmsg_gs_done;
-   return false;
+   /* Because of NO_PC_EXPORT=1, a done=1 position or primitive export can launch PS waves before
+    * the NGG/VS wave finishes if there are no parameter exports.
+    */
+   return instr->opcode == aco_opcode::exp && instr->exp().dest >= V_008DFC_SQ_EXP_POS &&
+          instr->exp().dest <= V_008DFC_SQ_EXP_PRIM && gfx_level >= GFX10;
 }
 
 memory_sync_info
@@ -464,29 +447,35 @@ struct memory_event_set {
 };
 
 struct hazard_query {
+   amd_gfx_level gfx_level;
    bool contains_spill;
    bool contains_sendmsg;
    bool uses_exec;
+   bool writes_exec;
    memory_event_set mem_events;
    unsigned aliasing_storage;      /* storage classes which are accessed (non-SMEM) */
    unsigned aliasing_storage_smem; /* storage classes which are accessed (SMEM) */
 };
 
 void
-init_hazard_query(hazard_query* query)
+init_hazard_query(const sched_ctx& ctx, hazard_query* query)
 {
+   query->gfx_level = ctx.gfx_level;
    query->contains_spill = false;
    query->contains_sendmsg = false;
    query->uses_exec = false;
+   query->writes_exec = false;
    memset(&query->mem_events, 0, sizeof(query->mem_events));
    query->aliasing_storage = 0;
    query->aliasing_storage_smem = 0;
 }
 
 void
-add_memory_event(memory_event_set* set, Instruction* instr, memory_sync_info* sync)
+add_memory_event(amd_gfx_level gfx_level, memory_event_set* set, Instruction* instr,
+                 memory_sync_info* sync)
 {
-   set->has_control_barrier |= is_done_sendmsg(instr);
+   set->has_control_barrier |= is_done_sendmsg(gfx_level, instr);
+   set->has_control_barrier |= is_pos_prim_export(gfx_level, instr);
    if (instr->opcode == aco_opcode::p_barrier) {
       Pseudo_barrier_instruction& bar = instr->barrier();
       if (bar.sync.semantics & semantic_acquire)
@@ -521,10 +510,14 @@ add_to_hazard_query(hazard_query* query, Instruction* instr)
       query->contains_spill = true;
    query->contains_sendmsg |= instr->opcode == aco_opcode::s_sendmsg;
    query->uses_exec |= needs_exec_mask(instr);
+   for (const Definition& def : instr->definitions) {
+      if (def.isFixed() && def.physReg() == exec)
+         query->writes_exec = true;
+   }
 
    memory_sync_info sync = get_sync_info_with_hack(instr);
 
-   add_memory_event(&query->mem_events, instr, &sync);
+   add_memory_event(query->gfx_level, &query->mem_events, instr, &sync);
 
    if (!(sync.semantics & semantic_can_reorder)) {
       unsigned storage = sync.storage;
@@ -560,26 +553,56 @@ perform_hazard_query(hazard_query* query, Instruction* instr, bool upwards)
    if (!upwards && instr->opcode == aco_opcode::p_exit_early_if)
       return hazard_fail_unreorderable;
 
-   if (query->uses_exec) {
+   /* In Primitive Ordered Pixel Shading, await overlapped waves as late as possible, and notify
+    * overlapping waves that they can continue execution as early as possible.
+    */
+   if (upwards) {
+      if (instr->opcode == aco_opcode::p_pops_gfx9_add_exiting_wave_id ||
+          (instr->opcode == aco_opcode::s_wait_event &&
+           !(instr->salu().imm & wait_event_imm_dont_wait_export_ready))) {
+         return hazard_fail_unreorderable;
+      }
+   } else {
+      if (instr->opcode == aco_opcode::p_pops_gfx9_ordered_section_done) {
+         return hazard_fail_unreorderable;
+      }
+   }
+
+   if (query->uses_exec || query->writes_exec) {
       for (const Definition& def : instr->definitions) {
          if (def.isFixed() && def.physReg() == exec)
            return hazard_fail_exec;
      }
   }
-
-   /* don't move exports so that they stay closer together */
-   if (instr->isEXP())
+   if (query->writes_exec && needs_exec_mask(instr))
+      return hazard_fail_exec;
+
+   /* Don't move exports so that they stay closer together.
+    * Since GFX11, export order matters. MRTZ must come first,
+    * then color exports sorted from first to last.
+    * Also, with Primitive Ordered Pixel Shading on GFX11+, the `done` export must not be moved
+    * above the memory accesses before the queue family scope (more precisely, fragment interlock
+    * scope, but it's not available in ACO) release barrier that is expected to be inserted before
+    * the export, as well as before any `s_wait_event export_ready` which enters the ordered
+    * section, because the `done` export exits the ordered section.
+    */
+   if (instr->isEXP() || instr->opcode == aco_opcode::p_dual_src_export_gfx11)
       return hazard_fail_export;
 
    /* don't move non-reorderable instructions */
    if (instr->opcode == aco_opcode::s_memtime || instr->opcode == aco_opcode::s_memrealtime ||
-       instr->opcode == aco_opcode::s_setprio || instr->opcode == aco_opcode::s_getreg_b32)
+       instr->opcode == aco_opcode::s_setprio || instr->opcode == aco_opcode::s_getreg_b32 ||
+       instr->opcode == aco_opcode::p_init_scratch ||
+       instr->opcode == aco_opcode::p_jump_to_epilog ||
+       instr->opcode == aco_opcode::s_sendmsg_rtn_b32 ||
+       instr->opcode == aco_opcode::s_sendmsg_rtn_b64 ||
+       instr->opcode == aco_opcode::p_end_with_regs)
       return hazard_fail_unreorderable;
 
    memory_event_set instr_set;
    memset(&instr_set, 0, sizeof(instr_set));
    memory_sync_info sync = get_sync_info_with_hack(instr);
-   add_memory_event(&instr_set, instr, &sync);
+   add_memory_event(query->gfx_level, &instr_set, instr, &sync);
 
    memory_event_set* first = &instr_set;
    memory_event_set* second = &query->mem_events;
@@ -613,7 +636,7 @@
    /* Don't move memory accesses to before control barriers. I don't think
     * this is necessary for the Vulkan memory model, but it might be for GLSL450. */
    unsigned control_classes =
-      storage_buffer | storage_atomic_counter | storage_image | storage_shared;
+      storage_buffer | storage_image | storage_shared | storage_task_payload;
    if (first->has_control_barrier &&
        ((second->access_atomic | second->access_relaxed) & control_classes))
       return hazard_fail_barrier;
@@ -648,12 +671,14 @@ schedule_SMEM(sched_ctx& ctx, Block* block, std::vector<RegisterDemand>& registe
    int16_t k = 0;
 
    /* don't move s_memtime/s_memrealtime */
-   if (current->opcode == aco_opcode::s_memtime || current->opcode == aco_opcode::s_memrealtime)
+   if (current->opcode == aco_opcode::s_memtime || current->opcode == aco_opcode::s_memrealtime ||
+       current->opcode == aco_opcode::s_sendmsg_rtn_b32 ||
+       current->opcode == aco_opcode::s_sendmsg_rtn_b64)
       return;
 
    /* first, check if we have instructions before current to move down */
    hazard_query hq;
-   init_hazard_query(&hq);
+   init_hazard_query(ctx, &hq);
    add_to_hazard_query(&hq, current);
 
    DownwardsCursor cursor = ctx.mv.downwards_init(idx, false, false);
@@ -675,11 +700,12 @@ schedule_SMEM(sched_ctx& ctx, Block* block, std::vector<RegisterDemand>& registe
         break;
 
      /* only move VMEM instructions below descriptor loads. be more aggressive at higher num_waves
       * to help create more vmem clauses */
-      if (candidate->isVMEM() && (cursor.insert_idx - cursor.source_idx > (ctx.num_waves * 4) ||
-                                  current->operands[0].size() == 4))
+      if ((candidate->isVMEM() || candidate->isFlatLike()) &&
+          (cursor.insert_idx - cursor.source_idx > (ctx.num_waves * 4) ||
+           current->operands[0].size() == 4))
         break;
      /* don't move descriptor loads below buffer loads */
-      if (candidate->format == Format::SMEM && current->operands[0].size() == 4 &&
+      if (candidate->isSMEM() && !candidate->operands.empty() && current->operands[0].size() == 4 &&
          candidate->operands[0].size() == 2)
         break;
@@ -694,7 +720,7 @@
         break;
 
      /* don't use LDS/GDS instructions to hide latency since it can
-       * significanly worsen LDS scheduling */
+       * significantly worsen LDS scheduling */
      if (candidate->isDS() || !can_move_down) {
         add_to_hazard_query(&hq, candidate.get());
         ctx.mv.downwards_skip(cursor);
@@ -732,7 +758,7 @@
      /* check if candidate depends on current */
      bool is_dependency = !found_dependency && !ctx.mv.upwards_check_deps(up_cursor);
      /* no need to steal from following VMEM instructions */
-      if (is_dependency && candidate->isVMEM())
+      if (is_dependency && (candidate->isVMEM() || candidate->isFlatLike()))
         break;
 
      if (found_dependency) {
@@ -748,7 +774,7 @@
      if (is_dependency) {
         if (!found_dependency) {
            ctx.mv.upwards_update_insert_idx(up_cursor);
-            init_hazard_query(&hq);
+            init_hazard_query(ctx, &hq);
            found_dependency = true;
         }
      }
@@ -765,7 +791,7 @@
      MoveResult res = ctx.mv.upwards_move(up_cursor);
      if (res == move_fail_ssa || res == move_fail_rar) {
         /* no need to steal from following VMEM instructions */
-         if (res == move_fail_ssa && candidate->isVMEM())
+         if (res == move_fail_ssa && (candidate->isVMEM() || candidate->isFlatLike()))
            break;
         add_to_hazard_query(&hq, candidate.get());
         ctx.mv.upwards_skip(up_cursor);
@@ -788,13 +814,14 @@ schedule_VMEM(sched_ctx& ctx, Block* block, std::vector<RegisterDemand>& registe
    int window_size = VMEM_WINDOW_SIZE;
    int max_moves = VMEM_MAX_MOVES;
    int clause_max_grab_dist = VMEM_CLAUSE_MAX_GRAB_DIST;
+   bool only_clauses = false;
    int16_t k = 0;
 
    /* first, check if we have instructions before current to move down */
    hazard_query indep_hq;
    hazard_query clause_hq;
-   init_hazard_query(&indep_hq);
-   init_hazard_query(&clause_hq);
+   init_hazard_query(ctx, &indep_hq);
+   init_hazard_query(ctx, &clause_hq);
    add_to_hazard_query(&indep_hq, current);
 
    DownwardsCursor cursor = ctx.mv.downwards_init(idx, true, true);
@@ -822,12 +849,28 @@
         /* We can't easily tell how much this will decrease the def-to-use
          * distances, so just use how far it will be moved as a heuristic. */
         part_of_clause =
-            grab_dist < clause_max_grab_dist && should_form_clause(current, candidate.get());
+            grab_dist < clause_max_grab_dist + k && should_form_clause(current, candidate.get());
      }
 
      /* if current depends on candidate, add additional dependencies and continue */
      bool can_move_down = !is_vmem || part_of_clause || candidate->definitions.empty();
-
+      if (only_clauses) {
+         /* In case of high register pressure, only try to form clauses,
+          * and only if the previous clause is not larger
+          * than the current one will be.
+          */
+         if (part_of_clause) {
+            int clause_size = cursor.insert_idx - cursor.insert_idx_clause;
+            int prev_clause_size = 1;
+            while (should_form_clause(current,
+                                      block->instructions[candidate_idx - prev_clause_size].get()))
+               prev_clause_size++;
+            if (prev_clause_size > clause_size + 1)
+               break;
+         } else {
+            can_move_down = false;
+         }
+      }
      HazardResult haz =
         perform_hazard_query(part_of_clause ? &clause_hq : &indep_hq, candidate.get(), false);
      if (haz == hazard_fail_reorder_ds || haz == hazard_fail_spill ||
@@ -838,6 +881,8 @@
         break;
 
      if (!can_move_down) {
+         if (part_of_clause)
+            break;
         add_to_hazard_query(&indep_hq, candidate.get());
         add_to_hazard_query(&clause_hq, candidate.get());
         ctx.mv.downwards_skip(cursor);
@@ -847,12 +892,20 @@
      Instruction* candidate_ptr = candidate.get();
      MoveResult res = ctx.mv.downwards_move(cursor, part_of_clause);
      if (res == move_fail_ssa || res == move_fail_rar) {
+         if (part_of_clause)
+            break;
         add_to_hazard_query(&indep_hq, candidate.get());
         add_to_hazard_query(&clause_hq, candidate.get());
         ctx.mv.downwards_skip(cursor);
         continue;
      } else if (res == move_fail_pressure) {
-         break;
+         only_clauses = true;
+         if (part_of_clause)
+            break;
+         add_to_hazard_query(&indep_hq, candidate.get());
+         add_to_hazard_query(&clause_hq, candidate.get());
+         ctx.mv.downwards_skip(cursor);
+         continue;
      }
      if (part_of_clause)
         add_to_hazard_query(&indep_hq, candidate_ptr);
@@ -893,7 +946,7 @@
      if (is_dependency) {
         if (!found_dependency) {
            ctx.mv.upwards_update_insert_idx(up_cursor);
-            init_hazard_query(&indep_hq);
+            init_hazard_query(ctx, &indep_hq);
            found_dependency = true;
         }
      } else if (is_vmem) {
@@ -937,7 +990,7 @@ schedule_position_export(sched_ctx& ctx, Block* block, std::vector<RegisterDeman
    DownwardsCursor cursor = ctx.mv.downwards_init(idx, true, false);
 
    hazard_query hq;
-   init_hazard_query(&hq);
+   init_hazard_query(ctx, &hq);
    add_to_hazard_query(&hq, current);
 
    for (int candidate_idx = idx - 1; k < max_moves && candidate_idx > (int)idx - window_size;
@@ -972,6 +1025,37 @@
    }
 }
 
+unsigned
+schedule_VMEM_store(sched_ctx& ctx, Block* block, std::vector<RegisterDemand>& register_demand,
+                    Instruction* current, int idx)
+{
+   hazard_query hq;
+   init_hazard_query(ctx, &hq);
+
+   DownwardsCursor cursor = ctx.mv.downwards_init(idx, true, true);
+   int skip = 0;
+
+   for (int i = 0; (i - skip) < VMEM_CLAUSE_MAX_GRAB_DIST; i++) {
+      aco_ptr<Instruction>& candidate = block->instructions[cursor.source_idx];
+      if (candidate->opcode == aco_opcode::p_logical_start)
+         break;
+
+      if (!should_form_clause(current, candidate.get())) {
+         add_to_hazard_query(&hq, candidate.get());
+         ctx.mv.downwards_skip(cursor);
+         continue;
+      }
+
+      if (perform_hazard_query(&hq, candidate.get(), false) != hazard_success ||
+          ctx.mv.downwards_move(cursor, true) != move_success)
+         break;
+
+      skip++;
+   }
+
+   return skip;
+}
+
 void
 schedule_block(sched_ctx& ctx, Program* program, Block* block, live& live_vars)
 {
@@ -981,9 +1065,13 @@
    ctx.mv.register_demand = live_vars.register_demand[block->index].data();
 
    /* go through all instructions and find memory loads */
+   unsigned num_stores = 0;
    for (unsigned idx = 0; idx < block->instructions.size(); idx++) {
       Instruction* current = block->instructions[idx].get();
 
+      if (current->opcode == aco_opcode::p_logical_end)
+         break;
+
       if (block->kind & block_kind_export_end && current->isEXP() && ctx.schedule_pos_exports) {
          unsigned target = current->exp().dest;
          if (target >= V_008DFC_SQ_EXP_POS && target < V_008DFC_SQ_EXP_PRIM) {
@@ -993,8 +1081,10 @@
         }
      }
 
-      if (current->definitions.empty())
+      if (current->definitions.empty()) {
+         num_stores += current->isVMEM() || current->isFlatLike() ? 1 : 0;
         continue;
+      }
 
      if (current->isVMEM() || current->isFlatLike()) {
         ctx.mv.current = current;
@@ -1007,6 +1097,19 @@
      }
   }
 
+   /* GFX11 benefits from creating VMEM store clauses. */
+   if (num_stores > 1 && program->gfx_level >= GFX11) {
+      for (int idx = block->instructions.size() - 1; idx >= 0; idx--) {
+         Instruction* current = block->instructions[idx].get();
+         if (!current->definitions.empty() || !(current->isVMEM() || current->isFlatLike()))
+            continue;
+
+         ctx.mv.current = current;
+         idx -=
+            schedule_VMEM_store(ctx, block, live_vars.register_demand[block->index], current, idx);
+      }
+   }
+
    /* resummarize the block's register demand */
    block->register_demand = RegisterDemand();
    for (unsigned idx = 0; idx < block->instructions.size(); idx++) {
@@ -1024,6 +1127,7 @@
    demand.vgpr += program->config->num_shared_vgprs / 2;
 
    sched_ctx ctx;
+   ctx.gfx_level = program->gfx_level;
    ctx.mv.depends_on.resize(program->peekAllocationId());
    ctx.mv.RAR_dependencies.resize(program->peekAllocationId());
    ctx.mv.RAR_dependencies_clause.resize(program->peekAllocationId());
@@ -1042,6 +1146,7 @@
      ctx.num_waves = 7 * wave_fac;
   ctx.num_waves = std::max<uint16_t>(ctx.num_waves, program->min_waves);
   ctx.num_waves = std::min<uint16_t>(ctx.num_waves, program->num_waves);
+   ctx.num_waves = max_suitable_waves(program, ctx.num_waves);
 
   /* VMEM_MAX_MOVES and such assume pre-GFX10 wave count */
   ctx.num_waves = std::max<uint16_t>(ctx.num_waves / wave_fac, 1);
@@ -1054,8 +1159,8 @@
    * Schedule less aggressively when early primitive export is used, and
    * keep the position export at the very bottom when late primitive export is used.
    */
-   if (program->info->has_ngg_culling && program->stage.num_sw_stages() == 1) {
-      if (!program->info->has_ngg_early_prim_export)
+   if (program->info.has_ngg_culling && program->stage.num_sw_stages() == 1) {
+      if (!program->info.has_ngg_early_prim_export)
        ctx.schedule_pos_exports = false;
     else
        ctx.schedule_pos_export_div = 4;
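
Note on the patch: the subtlest new logic above is the only_clauses fallback in schedule_VMEM. The following is a self-contained toy model of its clause-size check, not ACO's real code -- Instr and the matching rule in should_form_clause are simplified stand-ins. Once a downwards move has failed because of register pressure, the scheduler stops trying to hide latency and only grabs a candidate if the clause it would be torn out of is no larger than the clause being built at the insertion point:

/* Toy model of the only_clauses heuristic; all types are simplified
 * stand-ins for ACO's real classes. */
#include <cstdio>
#include <vector>

struct Instr {
   int resource; /* toy rule: instructions clause together when this matches */
};

static bool
should_form_clause(const Instr& a, const Instr& b)
{
   return a.resource == b.resource;
}

int
main()
{
   /* a three-instruction clause {7,7,7}, an unrelated access {3}, and the
    * instruction currently being scheduled {7} */
   std::vector<Instr> block = {{7}, {7}, {7}, {3}, {7}};
   const Instr& current = block[4];
   int candidate_idx = 2; /* bottom of the existing {7,7,7} clause */
   int clause_size = 1;   /* size of the clause formed so far at the insert point */

   /* measure the clause the candidate belongs to by walking upwards */
   int prev_clause_size = 1;
   while (candidate_idx - prev_clause_size >= 0 &&
          should_form_clause(current, block[candidate_idx - prev_clause_size]))
      prev_clause_size++;

   /* mirrors `if (prev_clause_size > clause_size + 1) break;` in the patch:
    * don't split a larger clause to grow a smaller one */
   if (prev_clause_size > clause_size + 1)
      printf("stop: would split a clause of %d to grow one of %d\n",
             prev_clause_size, clause_size + 1);
   else
      printf("grab the candidate into the new clause\n");
}

In the real pass, clause_size is derived from the cursor (cursor.insert_idx - cursor.insert_idx_clause) and the upwards walk is bounded by the block's p_logical_start pseudo-instruction; the explicit bounds check above merely keeps the toy self-contained.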