Diffstat (limited to 'src/amd/compiler/aco_scheduler.cpp')
-rw-r--r--   src/amd/compiler/aco_scheduler.cpp   223
1 file changed, 164 insertions(+), 59 deletions(-)
diff --git a/src/amd/compiler/aco_scheduler.cpp b/src/amd/compiler/aco_scheduler.cpp
index 2f72ba0cc96..20a08ac7636 100644
--- a/src/amd/compiler/aco_scheduler.cpp
+++ b/src/amd/compiler/aco_scheduler.cpp
@@ -1,25 +1,7 @@
/*
* Copyright © 2018 Valve Corporation
*
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- *
+ * SPDX-License-Identifier: MIT
*/
#include "aco_builder.h"
@@ -37,7 +19,7 @@
#define SMEM_MAX_MOVES (64 - ctx.num_waves * 4)
#define VMEM_MAX_MOVES (256 - ctx.num_waves * 16)
/* creating clauses decreases def-use distances, so make it less aggressive the lower num_waves is */
-#define VMEM_CLAUSE_MAX_GRAB_DIST (ctx.num_waves * 8)
+#define VMEM_CLAUSE_MAX_GRAB_DIST (ctx.num_waves * 2)
#define POS_EXP_MAX_MOVES 512
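To get a feel for the budgets these macros yield, here is a standalone sketch; the wave count of 8 is a made-up example, while the real value comes from the scheduler context:

#include <cstdio>

int main()
{
   int num_waves = 8; /* hypothetical occupancy; the scheduler reads this from sched_ctx */
   printf("SMEM_MAX_MOVES            = %d\n", 64 - num_waves * 4);   /* 32 */
   printf("VMEM_MAX_MOVES            = %d\n", 256 - num_waves * 16); /* 128 */
   /* the clause grab distance now scales with num_waves * 2 instead of * 8,
    * so low-occupancy shaders give up less def-use distance to form clauses */
   printf("VMEM_CLAUSE_MAX_GRAB_DIST = %d (previously %d)\n", num_waves * 2, num_waves * 8);
   return 0;
}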
namespace aco {
@@ -122,6 +104,7 @@ struct MoveState {
};
struct sched_ctx {
+ amd_gfx_level gfx_level;
int16_t num_waves;
int16_t last_SMEM_stall;
int last_SMEM_dep_idx;
@@ -420,21 +403,21 @@ MoveState::upwards_skip(UpwardsCursor& cursor)
}
bool
-is_gs_or_done_sendmsg(const Instruction* instr)
+is_done_sendmsg(amd_gfx_level gfx_level, const Instruction* instr)
{
- if (instr->opcode == aco_opcode::s_sendmsg) {
- uint16_t imm = instr->sopp().imm;
- return (imm & sendmsg_id_mask) == _sendmsg_gs || (imm & sendmsg_id_mask) == _sendmsg_gs_done;
- }
+ if (gfx_level <= GFX10_3 && instr->opcode == aco_opcode::s_sendmsg)
+ return (instr->salu().imm & sendmsg_id_mask) == sendmsg_gs_done;
return false;
}
bool
-is_done_sendmsg(const Instruction* instr)
+is_pos_prim_export(amd_gfx_level gfx_level, const Instruction* instr)
{
- if (instr->opcode == aco_opcode::s_sendmsg)
- return (instr->sopp().imm & sendmsg_id_mask) == _sendmsg_gs_done;
- return false;
+ /* Because of NO_PC_EXPORT=1, a done=1 position or primitive export can launch PS waves before
+ * the NGG/VS wave finishes if there are no parameter exports.
+ */
+ return instr->opcode == aco_opcode::exp && instr->exp().dest >= V_008DFC_SQ_EXP_POS &&
+ instr->exp().dest <= V_008DFC_SQ_EXP_PRIM && gfx_level >= GFX10;
}
memory_sync_info
@@ -464,29 +447,35 @@ struct memory_event_set {
};
struct hazard_query {
+ amd_gfx_level gfx_level;
bool contains_spill;
bool contains_sendmsg;
bool uses_exec;
+ bool writes_exec;
memory_event_set mem_events;
unsigned aliasing_storage; /* storage classes which are accessed (non-SMEM) */
unsigned aliasing_storage_smem; /* storage classes which are accessed (SMEM) */
};
void
-init_hazard_query(hazard_query* query)
+init_hazard_query(const sched_ctx& ctx, hazard_query* query)
{
+ query->gfx_level = ctx.gfx_level;
query->contains_spill = false;
query->contains_sendmsg = false;
query->uses_exec = false;
+ query->writes_exec = false;
memset(&query->mem_events, 0, sizeof(query->mem_events));
query->aliasing_storage = 0;
query->aliasing_storage_smem = 0;
}
void
-add_memory_event(memory_event_set* set, Instruction* instr, memory_sync_info* sync)
+add_memory_event(amd_gfx_level gfx_level, memory_event_set* set, Instruction* instr,
+ memory_sync_info* sync)
{
- set->has_control_barrier |= is_done_sendmsg(instr);
+ set->has_control_barrier |= is_done_sendmsg(gfx_level, instr);
+ set->has_control_barrier |= is_pos_prim_export(gfx_level, instr);
if (instr->opcode == aco_opcode::p_barrier) {
Pseudo_barrier_instruction& bar = instr->barrier();
if (bar.sync.semantics & semantic_acquire)
@@ -521,10 +510,14 @@ add_to_hazard_query(hazard_query* query, Instruction* instr)
query->contains_spill = true;
query->contains_sendmsg |= instr->opcode == aco_opcode::s_sendmsg;
query->uses_exec |= needs_exec_mask(instr);
+ for (const Definition& def : instr->definitions) {
+ if (def.isFixed() && def.physReg() == exec)
+ query->writes_exec = true;
+ }
memory_sync_info sync = get_sync_info_with_hack(instr);
- add_memory_event(&query->mem_events, instr, &sync);
+ add_memory_event(query->gfx_level, &query->mem_events, instr, &sync);
if (!(sync.semantics & semantic_can_reorder)) {
unsigned storage = sync.storage;
@@ -560,26 +553,56 @@ perform_hazard_query(hazard_query* query, Instruction* instr, bool upwards)
if (!upwards && instr->opcode == aco_opcode::p_exit_early_if)
return hazard_fail_unreorderable;
- if (query->uses_exec) {
+ /* In Primitive Ordered Pixel Shading, await overlapped waves as late as possible, and notify
+ * overlapping waves that they can continue execution as early as possible.
+ */
+ if (upwards) {
+ if (instr->opcode == aco_opcode::p_pops_gfx9_add_exiting_wave_id ||
+ (instr->opcode == aco_opcode::s_wait_event &&
+ !(instr->salu().imm & wait_event_imm_dont_wait_export_ready))) {
+ return hazard_fail_unreorderable;
+ }
+ } else {
+ if (instr->opcode == aco_opcode::p_pops_gfx9_ordered_section_done) {
+ return hazard_fail_unreorderable;
+ }
+ }
+
+ if (query->uses_exec || query->writes_exec) {
for (const Definition& def : instr->definitions) {
if (def.isFixed() && def.physReg() == exec)
return hazard_fail_exec;
}
}
-
- /* don't move exports so that they stay closer together */
- if (instr->isEXP())
+ if (query->writes_exec && needs_exec_mask(instr))
+ return hazard_fail_exec;
+
+ /* Don't move exports so that they stay closer together.
+ * Since GFX11, export order matters. MRTZ must come first,
+ * then color exports sorted from first to last.
+ * Also, with Primitive Ordered Pixel Shading on GFX11+, the `done` export must not be moved
+ * above the memory accesses before the queue family scope (more precisely, fragment interlock
+ * scope, but it's not available in ACO) release barrier that is expected to be inserted before
+ * the export, as well as before any `s_wait_event export_ready` which enters the ordered
+ * section, because the `done` export exits the ordered section.
+ */
+ if (instr->isEXP() || instr->opcode == aco_opcode::p_dual_src_export_gfx11)
return hazard_fail_export;
/* don't move non-reorderable instructions */
if (instr->opcode == aco_opcode::s_memtime || instr->opcode == aco_opcode::s_memrealtime ||
- instr->opcode == aco_opcode::s_setprio || instr->opcode == aco_opcode::s_getreg_b32)
+ instr->opcode == aco_opcode::s_setprio || instr->opcode == aco_opcode::s_getreg_b32 ||
+ instr->opcode == aco_opcode::p_init_scratch ||
+ instr->opcode == aco_opcode::p_jump_to_epilog ||
+ instr->opcode == aco_opcode::s_sendmsg_rtn_b32 ||
+ instr->opcode == aco_opcode::s_sendmsg_rtn_b64 ||
+ instr->opcode == aco_opcode::p_end_with_regs)
return hazard_fail_unreorderable;
memory_event_set instr_set;
memset(&instr_set, 0, sizeof(instr_set));
memory_sync_info sync = get_sync_info_with_hack(instr);
- add_memory_event(&instr_set, instr, &sync);
+ add_memory_event(query->gfx_level, &instr_set, instr, &sync);
memory_event_set* first = &instr_set;
memory_event_set* second = &query->mem_events;
@@ -613,7 +636,7 @@ perform_hazard_query(hazard_query* query, Instruction* instr, bool upwards)
/* Don't move memory accesses to before control barriers. I don't think
* this is necessary for the Vulkan memory model, but it might be for GLSL450. */
unsigned control_classes =
- storage_buffer | storage_atomic_counter | storage_image | storage_shared;
+ storage_buffer | storage_image | storage_shared | storage_task_payload;
if (first->has_control_barrier &&
((second->access_atomic | second->access_relaxed) & control_classes))
return hazard_fail_barrier;
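The exec handling above is two-sided: nothing that redefines exec may cross a window that reads or writes it, and a window that writes exec may not be crossed by anything that needs the exec mask. A minimal standalone sketch of that rule, reduced to booleans (the real query inspects fixed physical-register definitions):

#include <cassert>

struct WindowSummary { bool uses_exec; bool writes_exec; };
struct Candidate     { bool uses_exec; bool writes_exec; };

/* moving `c` across the instructions summarized in `w` is a hazard if either
 * side writes exec while the other side reads or writes it */
bool exec_hazard(const WindowSummary& w, const Candidate& c)
{
   if ((w.uses_exec || w.writes_exec) && c.writes_exec)
      return true;
   if (w.writes_exec && c.uses_exec)
      return true;
   return false;
}

int main()
{
   assert(exec_hazard({true, false}, {false, true}));  /* candidate would rewrite an exec the window uses */
   assert(exec_hazard({false, true}, {true, false}));  /* window rewrites the exec the candidate needs */
   assert(!exec_hazard({true, false}, {true, false})); /* both only read exec: reordering is fine */
   return 0;
}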
@@ -648,12 +671,14 @@ schedule_SMEM(sched_ctx& ctx, Block* block, std::vector<RegisterDemand>& registe
int16_t k = 0;
/* don't move s_memtime/s_memrealtime */
- if (current->opcode == aco_opcode::s_memtime || current->opcode == aco_opcode::s_memrealtime)
+ if (current->opcode == aco_opcode::s_memtime || current->opcode == aco_opcode::s_memrealtime ||
+ current->opcode == aco_opcode::s_sendmsg_rtn_b32 ||
+ current->opcode == aco_opcode::s_sendmsg_rtn_b64)
return;
/* first, check if we have instructions before current to move down */
hazard_query hq;
- init_hazard_query(&hq);
+ init_hazard_query(ctx, &hq);
add_to_hazard_query(&hq, current);
DownwardsCursor cursor = ctx.mv.downwards_init(idx, false, false);
@@ -675,11 +700,12 @@ schedule_SMEM(sched_ctx& ctx, Block* block, std::vector<RegisterDemand>& registe
break;
/* only move VMEM instructions below descriptor loads. be more aggressive at higher num_waves
* to help create more vmem clauses */
- if (candidate->isVMEM() && (cursor.insert_idx - cursor.source_idx > (ctx.num_waves * 4) ||
- current->operands[0].size() == 4))
+ if ((candidate->isVMEM() || candidate->isFlatLike()) &&
+ (cursor.insert_idx - cursor.source_idx > (ctx.num_waves * 4) ||
+ current->operands[0].size() == 4))
break;
/* don't move descriptor loads below buffer loads */
- if (candidate->format == Format::SMEM && current->operands[0].size() == 4 &&
+ if (candidate->isSMEM() && !candidate->operands.empty() && current->operands[0].size() == 4 &&
candidate->operands[0].size() == 2)
break;
@@ -694,7 +720,7 @@ schedule_SMEM(sched_ctx& ctx, Block* block, std::vector<RegisterDemand>& registe
break;
/* don't use LDS/GDS instructions to hide latency since it can
- * significanly worsen LDS scheduling */
+ * significantly worsen LDS scheduling */
if (candidate->isDS() || !can_move_down) {
add_to_hazard_query(&hq, candidate.get());
ctx.mv.downwards_skip(cursor);
@@ -732,7 +758,7 @@ schedule_SMEM(sched_ctx& ctx, Block* block, std::vector<RegisterDemand>& registe
/* check if candidate depends on current */
bool is_dependency = !found_dependency && !ctx.mv.upwards_check_deps(up_cursor);
/* no need to steal from following VMEM instructions */
- if (is_dependency && candidate->isVMEM())
+ if (is_dependency && (candidate->isVMEM() || candidate->isFlatLike()))
break;
if (found_dependency) {
@@ -748,7 +774,7 @@ schedule_SMEM(sched_ctx& ctx, Block* block, std::vector<RegisterDemand>& registe
if (is_dependency) {
if (!found_dependency) {
ctx.mv.upwards_update_insert_idx(up_cursor);
- init_hazard_query(&hq);
+ init_hazard_query(ctx, &hq);
found_dependency = true;
}
}
@@ -765,7 +791,7 @@ schedule_SMEM(sched_ctx& ctx, Block* block, std::vector<RegisterDemand>& registe
MoveResult res = ctx.mv.upwards_move(up_cursor);
if (res == move_fail_ssa || res == move_fail_rar) {
/* no need to steal from following VMEM instructions */
- if (res == move_fail_ssa && candidate->isVMEM())
+ if (res == move_fail_ssa && (candidate->isVMEM() || candidate->isFlatLike()))
break;
add_to_hazard_query(&hq, candidate.get());
ctx.mv.upwards_skip(up_cursor);
@@ -788,13 +814,14 @@ schedule_VMEM(sched_ctx& ctx, Block* block, std::vector<RegisterDemand>& registe
int window_size = VMEM_WINDOW_SIZE;
int max_moves = VMEM_MAX_MOVES;
int clause_max_grab_dist = VMEM_CLAUSE_MAX_GRAB_DIST;
+ bool only_clauses = false;
int16_t k = 0;
/* first, check if we have instructions before current to move down */
hazard_query indep_hq;
hazard_query clause_hq;
- init_hazard_query(&indep_hq);
- init_hazard_query(&clause_hq);
+ init_hazard_query(ctx, &indep_hq);
+ init_hazard_query(ctx, &clause_hq);
add_to_hazard_query(&indep_hq, current);
DownwardsCursor cursor = ctx.mv.downwards_init(idx, true, true);
@@ -822,12 +849,28 @@ schedule_VMEM(sched_ctx& ctx, Block* block, std::vector<RegisterDemand>& registe
/* We can't easily tell how much this will decrease the def-to-use
* distances, so just use how far it will be moved as a heuristic. */
part_of_clause =
- grab_dist < clause_max_grab_dist && should_form_clause(current, candidate.get());
+ grab_dist < clause_max_grab_dist + k && should_form_clause(current, candidate.get());
}
/* if current depends on candidate, add additional dependencies and continue */
bool can_move_down = !is_vmem || part_of_clause || candidate->definitions.empty();
-
+ if (only_clauses) {
+ /* In case of high register pressure, only try to form clauses,
+ * and only if the previous clause is not larger
+ * than the current one will be.
+ */
+ if (part_of_clause) {
+ int clause_size = cursor.insert_idx - cursor.insert_idx_clause;
+ int prev_clause_size = 1;
+ while (should_form_clause(current,
+ block->instructions[candidate_idx - prev_clause_size].get()))
+ prev_clause_size++;
+ if (prev_clause_size > clause_size + 1)
+ break;
+ } else {
+ can_move_down = false;
+ }
+ }
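The prev_clause_size walk above compares the clause the candidate already sits in against the clause being assembled at the insert point, so a wide existing clause is not broken up for a narrower one. A standalone sketch of that comparison, with a vector of flags standing in for should_form_clause() and made-up sizes:

#include <cstdio>
#include <vector>

int main()
{
   /* 1 = would form a clause with `current`; oldest instruction first */
   std::vector<int> forms_clause = {0, 1, 1, 1, 0};
   int candidate_idx = 3; /* hypothetical candidate inside the scheduling window */
   int clause_size = 1;   /* insert_idx - insert_idx_clause: clause built so far */

   /* measure the clause the candidate would be torn out of */
   int prev_clause_size = 1;
   while (candidate_idx - prev_clause_size >= 0 &&
          forms_clause[candidate_idx - prev_clause_size])
      prev_clause_size++;

   if (prev_clause_size > clause_size + 1)
      printf("stop: would break a %d-wide clause for a %d-wide one\n",
             prev_clause_size, clause_size + 1);
   else
      printf("move: the new clause ends up at least as large\n");
   return 0;
}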
HazardResult haz =
perform_hazard_query(part_of_clause ? &clause_hq : &indep_hq, candidate.get(), false);
if (haz == hazard_fail_reorder_ds || haz == hazard_fail_spill ||
@@ -838,6 +881,8 @@ schedule_VMEM(sched_ctx& ctx, Block* block, std::vector<RegisterDemand>& registe
break;
if (!can_move_down) {
+ if (part_of_clause)
+ break;
add_to_hazard_query(&indep_hq, candidate.get());
add_to_hazard_query(&clause_hq, candidate.get());
ctx.mv.downwards_skip(cursor);
@@ -847,12 +892,20 @@ schedule_VMEM(sched_ctx& ctx, Block* block, std::vector<RegisterDemand>& registe
Instruction* candidate_ptr = candidate.get();
MoveResult res = ctx.mv.downwards_move(cursor, part_of_clause);
if (res == move_fail_ssa || res == move_fail_rar) {
+ if (part_of_clause)
+ break;
add_to_hazard_query(&indep_hq, candidate.get());
add_to_hazard_query(&clause_hq, candidate.get());
ctx.mv.downwards_skip(cursor);
continue;
} else if (res == move_fail_pressure) {
- break;
+ only_clauses = true;
+ if (part_of_clause)
+ break;
+ add_to_hazard_query(&indep_hq, candidate.get());
+ add_to_hazard_query(&clause_hq, candidate.get());
+ ctx.mv.downwards_skip(cursor);
+ continue;
}
if (part_of_clause)
add_to_hazard_query(&indep_hq, candidate_ptr);
@@ -893,7 +946,7 @@ schedule_VMEM(sched_ctx& ctx, Block* block, std::vector<RegisterDemand>& registe
if (is_dependency) {
if (!found_dependency) {
ctx.mv.upwards_update_insert_idx(up_cursor);
- init_hazard_query(&indep_hq);
+ init_hazard_query(ctx, &indep_hq);
found_dependency = true;
}
} else if (is_vmem) {
@@ -937,7 +990,7 @@ schedule_position_export(sched_ctx& ctx, Block* block, std::vector<RegisterDeman
DownwardsCursor cursor = ctx.mv.downwards_init(idx, true, false);
hazard_query hq;
- init_hazard_query(&hq);
+ init_hazard_query(ctx, &hq);
add_to_hazard_query(&hq, current);
for (int candidate_idx = idx - 1; k < max_moves && candidate_idx > (int)idx - window_size;
@@ -972,6 +1025,37 @@ schedule_position_export(sched_ctx& ctx, Block* block, std::vector<RegisterDeman
}
}
+unsigned
+schedule_VMEM_store(sched_ctx& ctx, Block* block, std::vector<RegisterDemand>& register_demand,
+ Instruction* current, int idx)
+{
+ hazard_query hq;
+ init_hazard_query(ctx, &hq);
+
+ DownwardsCursor cursor = ctx.mv.downwards_init(idx, true, true);
+ int skip = 0;
+
+ for (int i = 0; (i - skip) < VMEM_CLAUSE_MAX_GRAB_DIST; i++) {
+ aco_ptr<Instruction>& candidate = block->instructions[cursor.source_idx];
+ if (candidate->opcode == aco_opcode::p_logical_start)
+ break;
+
+ if (!should_form_clause(current, candidate.get())) {
+ add_to_hazard_query(&hq, candidate.get());
+ ctx.mv.downwards_skip(cursor);
+ continue;
+ }
+
+ if (perform_hazard_query(&hq, candidate.get(), false) != hazard_success ||
+ ctx.mv.downwards_move(cursor, true) != move_success)
+ break;
+
+ skip++;
+ }
+
+ return skip;
+}
+
void
schedule_block(sched_ctx& ctx, Program* program, Block* block, live& live_vars)
{
@@ -981,9 +1065,13 @@ schedule_block(sched_ctx& ctx, Program* program, Block* block, live& live_vars)
ctx.mv.register_demand = live_vars.register_demand[block->index].data();
/* go through all instructions and find memory loads */
+ unsigned num_stores = 0;
for (unsigned idx = 0; idx < block->instructions.size(); idx++) {
Instruction* current = block->instructions[idx].get();
+ if (current->opcode == aco_opcode::p_logical_end)
+ break;
+
if (block->kind & block_kind_export_end && current->isEXP() && ctx.schedule_pos_exports) {
unsigned target = current->exp().dest;
if (target >= V_008DFC_SQ_EXP_POS && target < V_008DFC_SQ_EXP_PRIM) {
@@ -993,8 +1081,10 @@ schedule_block(sched_ctx& ctx, Program* program, Block* block, live& live_vars)
}
}
- if (current->definitions.empty())
+ if (current->definitions.empty()) {
+ num_stores += current->isVMEM() || current->isFlatLike() ? 1 : 0;
continue;
+ }
if (current->isVMEM() || current->isFlatLike()) {
ctx.mv.current = current;
@@ -1007,6 +1097,19 @@ schedule_block(sched_ctx& ctx, Program* program, Block* block, live& live_vars)
}
}
+ /* GFX11 benefits from creating VMEM store clauses. */
+ if (num_stores > 1 && program->gfx_level >= GFX11) {
+ for (int idx = block->instructions.size() - 1; idx >= 0; idx--) {
+ Instruction* current = block->instructions[idx].get();
+ if (!current->definitions.empty() || !(current->isVMEM() || current->isFlatLike()))
+ continue;
+
+ ctx.mv.current = current;
+ idx -=
+ schedule_VMEM_store(ctx, block, live_vars.register_demand[block->index], current, idx);
+ }
+ }
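A standalone sketch of what this backwards pass does to a block, with strings standing in for instructions and a prefix test standing in for should_form_clause(); the instruction names are made up:

#include <cstdio>
#include <string>
#include <vector>

int main()
{
   /* hypothetical block: stores separated by independent ALU work */
   std::vector<std::string> block = {"store_c", "alu", "store_a", "alu", "store_b"};

   for (int idx = (int)block.size() - 1; idx >= 0; idx--) {
      if (block[idx].rfind("store", 0) != 0)
         continue; /* only stores (instructions without definitions) seed a clause */

      /* pull earlier clause-compatible stores down so they sit directly above this one */
      int clause_start = idx;
      for (int src = clause_start - 1; src >= 0; src--) {
         if (block[src].rfind("store", 0) != 0)
            continue;
         std::string moved = block[src];
         block.erase(block.begin() + src);
         clause_start--; /* erasing below the clause shifts it left by one */
         block.insert(block.begin() + clause_start, moved);
      }
      idx = clause_start; /* like `idx -= skip`: don't revisit stores already in the clause */
   }

   for (const std::string& s : block)
      printf("%s ", s.c_str()); /* prints: alu alu store_c store_a store_b */
   printf("\n");
   return 0;
}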
+
/* resummarize the block's register demand */
block->register_demand = RegisterDemand();
for (unsigned idx = 0; idx < block->instructions.size(); idx++) {
@@ -1024,6 +1127,7 @@ schedule_program(Program* program, live& live_vars)
demand.vgpr += program->config->num_shared_vgprs / 2;
sched_ctx ctx;
+ ctx.gfx_level = program->gfx_level;
ctx.mv.depends_on.resize(program->peekAllocationId());
ctx.mv.RAR_dependencies.resize(program->peekAllocationId());
ctx.mv.RAR_dependencies_clause.resize(program->peekAllocationId());
@@ -1042,6 +1146,7 @@ schedule_program(Program* program, live& live_vars)
ctx.num_waves = 7 * wave_fac;
ctx.num_waves = std::max<uint16_t>(ctx.num_waves, program->min_waves);
ctx.num_waves = std::min<uint16_t>(ctx.num_waves, program->num_waves);
+ ctx.num_waves = max_suitable_waves(program, ctx.num_waves);
/* VMEM_MAX_MOVES and such assume pre-GFX10 wave count */
ctx.num_waves = std::max<uint16_t>(ctx.num_waves / wave_fac, 1);
@@ -1054,8 +1159,8 @@ schedule_program(Program* program, live& live_vars)
* Schedule less aggressively when early primitive export is used, and
* keep the position export at the very bottom when late primitive export is used.
*/
- if (program->info->has_ngg_culling && program->stage.num_sw_stages() == 1) {
- if (!program->info->has_ngg_early_prim_export)
+ if (program->info.has_ngg_culling && program->stage.num_sw_stages() == 1) {
+ if (!program->info.has_ngg_early_prim_export)
ctx.schedule_pos_exports = false;
else
ctx.schedule_pos_export_div = 4;