Diffstat (limited to 'src/amd/compiler/aco_scheduler.cpp')
-rw-r--r-- | src/amd/compiler/aco_scheduler.cpp | 223 |
1 file changed, 164 insertions, 59 deletions
diff --git a/src/amd/compiler/aco_scheduler.cpp b/src/amd/compiler/aco_scheduler.cpp
index 2f72ba0cc96..20a08ac7636 100644
--- a/src/amd/compiler/aco_scheduler.cpp
+++ b/src/amd/compiler/aco_scheduler.cpp
@@ -1,25 +1,7 @@
 /*
  * Copyright © 2018 Valve Corporation
  *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- *
+ * SPDX-License-Identifier: MIT
  */
 
 #include "aco_builder.h"
@@ -37,7 +19,7 @@
 #define SMEM_MAX_MOVES (64 - ctx.num_waves * 4)
 #define VMEM_MAX_MOVES (256 - ctx.num_waves * 16)
 /* creating clauses decreases def-use distances, so make it less aggressive the lower num_waves is */
-#define VMEM_CLAUSE_MAX_GRAB_DIST (ctx.num_waves * 8)
+#define VMEM_CLAUSE_MAX_GRAB_DIST (ctx.num_waves * 2)
 #define POS_EXP_MAX_MOVES 512
 
 namespace aco {
@@ -122,6 +104,7 @@ struct MoveState {
 };
 
 struct sched_ctx {
+   amd_gfx_level gfx_level;
    int16_t num_waves;
    int16_t last_SMEM_stall;
    int last_SMEM_dep_idx;
@@ -420,21 +403,21 @@ MoveState::upwards_skip(UpwardsCursor& cursor)
 }
 
 bool
-is_gs_or_done_sendmsg(const Instruction* instr)
+is_done_sendmsg(amd_gfx_level gfx_level, const Instruction* instr)
 {
-   if (instr->opcode == aco_opcode::s_sendmsg) {
-      uint16_t imm = instr->sopp().imm;
-      return (imm & sendmsg_id_mask) == _sendmsg_gs || (imm & sendmsg_id_mask) == _sendmsg_gs_done;
-   }
+   if (gfx_level <= GFX10_3 && instr->opcode == aco_opcode::s_sendmsg)
+      return (instr->salu().imm & sendmsg_id_mask) == sendmsg_gs_done;
    return false;
 }
 
 bool
-is_done_sendmsg(const Instruction* instr)
+is_pos_prim_export(amd_gfx_level gfx_level, const Instruction* instr)
 {
-   if (instr->opcode == aco_opcode::s_sendmsg)
-      return (instr->sopp().imm & sendmsg_id_mask) == _sendmsg_gs_done;
-   return false;
+   /* Because of NO_PC_EXPORT=1, a done=1 position or primitive export can launch PS waves before
+    * the NGG/VS wave finishes if there are no parameter exports.
+    */
+   return instr->opcode == aco_opcode::exp && instr->exp().dest >= V_008DFC_SQ_EXP_POS &&
+          instr->exp().dest <= V_008DFC_SQ_EXP_PRIM && gfx_level >= GFX10;
 }
 
 memory_sync_info
@@ -464,29 +447,35 @@ struct memory_event_set {
 };
 
 struct hazard_query {
+   amd_gfx_level gfx_level;
    bool contains_spill;
    bool contains_sendmsg;
    bool uses_exec;
+   bool writes_exec;
    memory_event_set mem_events;
    unsigned aliasing_storage;      /* storage classes which are accessed (non-SMEM) */
    unsigned aliasing_storage_smem; /* storage classes which are accessed (SMEM) */
 };
 
 void
-init_hazard_query(hazard_query* query)
+init_hazard_query(const sched_ctx& ctx, hazard_query* query)
 {
+   query->gfx_level = ctx.gfx_level;
    query->contains_spill = false;
    query->contains_sendmsg = false;
    query->uses_exec = false;
+   query->writes_exec = false;
    memset(&query->mem_events, 0, sizeof(query->mem_events));
    query->aliasing_storage = 0;
    query->aliasing_storage_smem = 0;
 }
 
 void
-add_memory_event(memory_event_set* set, Instruction* instr, memory_sync_info* sync)
+add_memory_event(amd_gfx_level gfx_level, memory_event_set* set, Instruction* instr,
+                 memory_sync_info* sync)
 {
-   set->has_control_barrier |= is_done_sendmsg(instr);
+   set->has_control_barrier |= is_done_sendmsg(gfx_level, instr);
+   set->has_control_barrier |= is_pos_prim_export(gfx_level, instr);
    if (instr->opcode == aco_opcode::p_barrier) {
       Pseudo_barrier_instruction& bar = instr->barrier();
       if (bar.sync.semantics & semantic_acquire)
@@ -521,10 +510,14 @@ add_to_hazard_query(hazard_query* query, Instruction* instr)
       query->contains_spill = true;
    query->contains_sendmsg |= instr->opcode == aco_opcode::s_sendmsg;
    query->uses_exec |= needs_exec_mask(instr);
+   for (const Definition& def : instr->definitions) {
+      if (def.isFixed() && def.physReg() == exec)
+         query->writes_exec = true;
+   }
 
    memory_sync_info sync = get_sync_info_with_hack(instr);
 
-   add_memory_event(&query->mem_events, instr, &sync);
+   add_memory_event(query->gfx_level, &query->mem_events, instr, &sync);
 
    if (!(sync.semantics & semantic_can_reorder)) {
       unsigned storage = sync.storage;
@@ -560,26 +553,56 @@ perform_hazard_query(hazard_query* query, Instruction* instr, bool upwards)
    if (!upwards && instr->opcode == aco_opcode::p_exit_early_if)
       return hazard_fail_unreorderable;
 
-   if (query->uses_exec) {
+   /* In Primitive Ordered Pixel Shading, await overlapped waves as late as possible, and notify
+    * overlapping waves that they can continue execution as early as possible.
+    */
+   if (upwards) {
+      if (instr->opcode == aco_opcode::p_pops_gfx9_add_exiting_wave_id ||
+          (instr->opcode == aco_opcode::s_wait_event &&
+           !(instr->salu().imm & wait_event_imm_dont_wait_export_ready))) {
+         return hazard_fail_unreorderable;
+      }
+   } else {
+      if (instr->opcode == aco_opcode::p_pops_gfx9_ordered_section_done) {
+         return hazard_fail_unreorderable;
+      }
+   }
+
+   if (query->uses_exec || query->writes_exec) {
       for (const Definition& def : instr->definitions) {
          if (def.isFixed() && def.physReg() == exec)
            return hazard_fail_exec;
      }
   }
-
-   /* don't move exports so that they stay closer together */
-   if (instr->isEXP())
+   if (query->writes_exec && needs_exec_mask(instr))
+      return hazard_fail_exec;
+
+   /* Don't move exports so that they stay closer together.
+    * Since GFX11, export order matters. MRTZ must come first,
+    * then color exports sorted from first to last.
+    * Also, with Primitive Ordered Pixel Shading on GFX11+, the `done` export must not be moved
+    * above the memory accesses before the queue family scope (more precisely, fragment interlock
+    * scope, but it's not available in ACO) release barrier that is expected to be inserted before
+    * the export, as well as before any `s_wait_event export_ready` which enters the ordered
+    * section, because the `done` export exits the ordered section.
+    */
+   if (instr->isEXP() || instr->opcode == aco_opcode::p_dual_src_export_gfx11)
       return hazard_fail_export;
 
    /* don't move non-reorderable instructions */
    if (instr->opcode == aco_opcode::s_memtime || instr->opcode == aco_opcode::s_memrealtime ||
-       instr->opcode == aco_opcode::s_setprio || instr->opcode == aco_opcode::s_getreg_b32)
+       instr->opcode == aco_opcode::s_setprio || instr->opcode == aco_opcode::s_getreg_b32 ||
+       instr->opcode == aco_opcode::p_init_scratch ||
+       instr->opcode == aco_opcode::p_jump_to_epilog ||
+       instr->opcode == aco_opcode::s_sendmsg_rtn_b32 ||
+       instr->opcode == aco_opcode::s_sendmsg_rtn_b64 ||
+       instr->opcode == aco_opcode::p_end_with_regs)
       return hazard_fail_unreorderable;
 
    memory_event_set instr_set;
    memset(&instr_set, 0, sizeof(instr_set));
    memory_sync_info sync = get_sync_info_with_hack(instr);
-   add_memory_event(&instr_set, instr, &sync);
+   add_memory_event(query->gfx_level, &instr_set, instr, &sync);
 
    memory_event_set* first = &instr_set;
    memory_event_set* second = &query->mem_events;
@@ -613,7 +636,7 @@
    /* Don't move memory accesses to before control barriers. I don't think
     * this is necessary for the Vulkan memory model, but it might be for GLSL450. */
    unsigned control_classes =
-      storage_buffer | storage_atomic_counter | storage_image | storage_shared;
+      storage_buffer | storage_image | storage_shared | storage_task_payload;
    if (first->has_control_barrier &&
        ((second->access_atomic | second->access_relaxed) & control_classes))
       return hazard_fail_barrier;
@@ -648,12 +671,14 @@ schedule_SMEM(sched_ctx& ctx, Block* block, std::vector<RegisterDemand>& registe
    int16_t k = 0;
 
    /* don't move s_memtime/s_memrealtime */
-   if (current->opcode == aco_opcode::s_memtime || current->opcode == aco_opcode::s_memrealtime)
+   if (current->opcode == aco_opcode::s_memtime || current->opcode == aco_opcode::s_memrealtime ||
+       current->opcode == aco_opcode::s_sendmsg_rtn_b32 ||
+       current->opcode == aco_opcode::s_sendmsg_rtn_b64)
       return;
 
    /* first, check if we have instructions before current to move down */
    hazard_query hq;
-   init_hazard_query(&hq);
+   init_hazard_query(ctx, &hq);
    add_to_hazard_query(&hq, current);
 
    DownwardsCursor cursor = ctx.mv.downwards_init(idx, false, false);
@@ -675,11 +700,12 @@ schedule_SMEM(sched_ctx& ctx, Block* block, std::vector<RegisterDemand>& registe
         break;
 
      /* only move VMEM instructions below descriptor loads. be more aggressive at higher num_waves
       * to help create more vmem clauses */
-      if (candidate->isVMEM() && (cursor.insert_idx - cursor.source_idx > (ctx.num_waves * 4) ||
-                                  current->operands[0].size() == 4))
+      if ((candidate->isVMEM() || candidate->isFlatLike()) &&
+          (cursor.insert_idx - cursor.source_idx > (ctx.num_waves * 4) ||
+           current->operands[0].size() == 4))
         break;
      /* don't move descriptor loads below buffer loads */
-      if (candidate->format == Format::SMEM && current->operands[0].size() == 4 &&
+      if (candidate->isSMEM() && !candidate->operands.empty() && current->operands[0].size() == 4 &&
          candidate->operands[0].size() == 2)
         break;
@@ -694,7 +720,7 @@
         break;
 
      /* don't use LDS/GDS instructions to hide latency since it can
-       * significanly worsen LDS scheduling */
+       * significantly worsen LDS scheduling */
      if (candidate->isDS() || !can_move_down) {
         add_to_hazard_query(&hq, candidate.get());
         ctx.mv.downwards_skip(cursor);
@@ -732,7 +758,7 @@
      /* check if candidate depends on current */
      bool is_dependency = !found_dependency && !ctx.mv.upwards_check_deps(up_cursor);
      /* no need to steal from following VMEM instructions */
-      if (is_dependency && candidate->isVMEM())
+      if (is_dependency && (candidate->isVMEM() || candidate->isFlatLike()))
         break;
 
      if (found_dependency) {
@@ -748,7 +774,7 @@
      if (is_dependency) {
         if (!found_dependency) {
            ctx.mv.upwards_update_insert_idx(up_cursor);
-            init_hazard_query(&hq);
+            init_hazard_query(ctx, &hq);
            found_dependency = true;
         }
      }
@@ -765,7 +791,7 @@
      MoveResult res = ctx.mv.upwards_move(up_cursor);
      if (res == move_fail_ssa || res == move_fail_rar) {
         /* no need to steal from following VMEM instructions */
-         if (res == move_fail_ssa && candidate->isVMEM())
+         if (res == move_fail_ssa && (candidate->isVMEM() || candidate->isFlatLike()))
            break;
         add_to_hazard_query(&hq, candidate.get());
         ctx.mv.upwards_skip(up_cursor);
@@ -788,13 +814,14 @@ schedule_VMEM(sched_ctx& ctx, Block* block, std::vector<RegisterDemand>& registe
    int window_size = VMEM_WINDOW_SIZE;
    int max_moves = VMEM_MAX_MOVES;
    int clause_max_grab_dist = VMEM_CLAUSE_MAX_GRAB_DIST;
+   bool only_clauses = false;
    int16_t k = 0;
 
    /* first, check if we have instructions before current to move down */
    hazard_query indep_hq;
    hazard_query clause_hq;
-   init_hazard_query(&indep_hq);
-   init_hazard_query(&clause_hq);
+   init_hazard_query(ctx, &indep_hq);
+   init_hazard_query(ctx, &clause_hq);
    add_to_hazard_query(&indep_hq, current);
 
    DownwardsCursor cursor = ctx.mv.downwards_init(idx, true, true);
@@ -822,12 +849,28 @@
         /* We can't easily tell how much this will decrease the def-to-use
          * distances, so just use how far it will be moved as a heuristic. */
         part_of_clause =
-            grab_dist < clause_max_grab_dist && should_form_clause(current, candidate.get());
+            grab_dist < clause_max_grab_dist + k && should_form_clause(current, candidate.get());
      }
 
      /* if current depends on candidate, add additional dependencies and continue */
      bool can_move_down = !is_vmem || part_of_clause || candidate->definitions.empty();
-
+      if (only_clauses) {
+         /* In case of high register pressure, only try to form clauses,
+          * and only if the previous clause is not larger
+          * than the current one will be.
+          */
+         if (part_of_clause) {
+            int clause_size = cursor.insert_idx - cursor.insert_idx_clause;
+            int prev_clause_size = 1;
+            while (should_form_clause(current,
+                                      block->instructions[candidate_idx - prev_clause_size].get()))
+               prev_clause_size++;
+            if (prev_clause_size > clause_size + 1)
+               break;
+         } else {
+            can_move_down = false;
+         }
+      }
      HazardResult haz =
         perform_hazard_query(part_of_clause ? &clause_hq : &indep_hq, candidate.get(), false);
      if (haz == hazard_fail_reorder_ds || haz == hazard_fail_spill ||
@@ -838,6 +881,8 @@
         break;
 
      if (!can_move_down) {
+         if (part_of_clause)
+            break;
         add_to_hazard_query(&indep_hq, candidate.get());
         add_to_hazard_query(&clause_hq, candidate.get());
         ctx.mv.downwards_skip(cursor);
@@ -847,12 +892,20 @@
      Instruction* candidate_ptr = candidate.get();
      MoveResult res = ctx.mv.downwards_move(cursor, part_of_clause);
      if (res == move_fail_ssa || res == move_fail_rar) {
+         if (part_of_clause)
+            break;
         add_to_hazard_query(&indep_hq, candidate.get());
         add_to_hazard_query(&clause_hq, candidate.get());
         ctx.mv.downwards_skip(cursor);
         continue;
      } else if (res == move_fail_pressure) {
-         break;
+         only_clauses = true;
+         if (part_of_clause)
+            break;
+         add_to_hazard_query(&indep_hq, candidate.get());
+         add_to_hazard_query(&clause_hq, candidate.get());
+         ctx.mv.downwards_skip(cursor);
+         continue;
      }
      if (part_of_clause)
         add_to_hazard_query(&indep_hq, candidate_ptr);
@@ -893,7 +946,7 @@
      if (is_dependency) {
         if (!found_dependency) {
            ctx.mv.upwards_update_insert_idx(up_cursor);
-            init_hazard_query(&indep_hq);
+            init_hazard_query(ctx, &indep_hq);
            found_dependency = true;
         }
      } else if (is_vmem) {
@@ -937,7 +990,7 @@ schedule_position_export(sched_ctx& ctx, Block* block, std::vector<RegisterDeman
    DownwardsCursor cursor = ctx.mv.downwards_init(idx, true, false);
 
    hazard_query hq;
-   init_hazard_query(&hq);
+   init_hazard_query(ctx, &hq);
    add_to_hazard_query(&hq, current);
 
    for (int candidate_idx = idx - 1; k < max_moves && candidate_idx > (int)idx - window_size;
@@ -972,6 +1025,37 @@
    }
 }
 
+unsigned
+schedule_VMEM_store(sched_ctx& ctx, Block* block, std::vector<RegisterDemand>& register_demand,
+                    Instruction* current, int idx)
+{
+   hazard_query hq;
+   init_hazard_query(ctx, &hq);
+
+   DownwardsCursor cursor = ctx.mv.downwards_init(idx, true, true);
+   int skip = 0;
+
+   for (int i = 0; (i - skip) < VMEM_CLAUSE_MAX_GRAB_DIST; i++) {
+      aco_ptr<Instruction>& candidate = block->instructions[cursor.source_idx];
+      if (candidate->opcode == aco_opcode::p_logical_start)
+         break;
+
+      if (!should_form_clause(current, candidate.get())) {
+         add_to_hazard_query(&hq, candidate.get());
+         ctx.mv.downwards_skip(cursor);
+         continue;
+      }
+
+      if (perform_hazard_query(&hq, candidate.get(), false) != hazard_success ||
+          ctx.mv.downwards_move(cursor, true) != move_success)
+         break;
+
+      skip++;
+   }
+
+   return skip;
+}
+
 void
 schedule_block(sched_ctx& ctx, Program* program, Block* block, live& live_vars)
 {
@@ -981,9 +1065,13 @@
    ctx.mv.register_demand = live_vars.register_demand[block->index].data();
 
    /* go through all instructions and find memory loads */
+   unsigned num_stores = 0;
    for (unsigned idx = 0; idx < block->instructions.size(); idx++) {
       Instruction* current = block->instructions[idx].get();
 
+      if (current->opcode == aco_opcode::p_logical_end)
+         break;
+
       if (block->kind & block_kind_export_end && current->isEXP() && ctx.schedule_pos_exports) {
          unsigned target = current->exp().dest;
          if (target >= V_008DFC_SQ_EXP_POS && target < V_008DFC_SQ_EXP_PRIM) {
@@ -993,8 +1081,10 @@
         }
      }
 
-      if (current->definitions.empty())
+      if (current->definitions.empty()) {
+         num_stores += current->isVMEM() || current->isFlatLike() ? 1 : 0;
         continue;
+      }
 
      if (current->isVMEM() || current->isFlatLike()) {
         ctx.mv.current = current;
@@ -1007,6 +1097,19 @@
      }
   }
 
+   /* GFX11 benefits from creating VMEM store clauses. */
+   if (num_stores > 1 && program->gfx_level >= GFX11) {
+      for (int idx = block->instructions.size() - 1; idx >= 0; idx--) {
+         Instruction* current = block->instructions[idx].get();
+         if (!current->definitions.empty() || !(current->isVMEM() || current->isFlatLike()))
+            continue;
+
+         ctx.mv.current = current;
+         idx -=
+            schedule_VMEM_store(ctx, block, live_vars.register_demand[block->index], current, idx);
+      }
+   }
+
    /* resummarize the block's register demand */
    block->register_demand = RegisterDemand();
    for (unsigned idx = 0; idx < block->instructions.size(); idx++) {
@@ -1024,6 +1127,7 @@
    demand.vgpr += program->config->num_shared_vgprs / 2;
 
    sched_ctx ctx;
+   ctx.gfx_level = program->gfx_level;
    ctx.mv.depends_on.resize(program->peekAllocationId());
    ctx.mv.RAR_dependencies.resize(program->peekAllocationId());
    ctx.mv.RAR_dependencies_clause.resize(program->peekAllocationId());
@@ -1042,6 +1146,7 @@
      ctx.num_waves = 7 * wave_fac;
   ctx.num_waves = std::max<uint16_t>(ctx.num_waves, program->min_waves);
   ctx.num_waves = std::min<uint16_t>(ctx.num_waves, program->num_waves);
+   ctx.num_waves = max_suitable_waves(program, ctx.num_waves);
 
   /* VMEM_MAX_MOVES and such assume pre-GFX10 wave count */
   ctx.num_waves = std::max<uint16_t>(ctx.num_waves / wave_fac, 1);
@@ -1054,8 +1159,8 @@
    * Schedule less aggressively when early primitive export is used, and
    * keep the position export at the very bottom when late primitive export is used.
    */
-   if (program->info->has_ngg_culling && program->stage.num_sw_stages() == 1) {
-      if (!program->info->has_ngg_early_prim_export)
+   if (program->info.has_ngg_culling && program->stage.num_sw_stages() == 1) {
+      if (!program->info.has_ngg_early_prim_export)
        ctx.schedule_pos_exports = false;
     else
        ctx.schedule_pos_export_div = 4;
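
Note on the patch: the subtlest new logic above is the only_clauses fallback in schedule_VMEM. The following is a self-contained toy model of its clause-size check, not ACO's real code -- Instr and the matching rule in should_form_clause are simplified stand-ins. Once a downwards move has failed because of register pressure, the scheduler stops trying to hide latency and only grabs a candidate if the clause it would be torn out of is no larger than the clause being built at the insertion point:

/* Toy model of the only_clauses heuristic; all types are simplified
 * stand-ins for ACO's real classes. */
#include <cstdio>
#include <vector>

struct Instr {
   int resource; /* toy rule: instructions clause together when this matches */
};

static bool
should_form_clause(const Instr& a, const Instr& b)
{
   return a.resource == b.resource;
}

int
main()
{
   /* a three-instruction clause {7,7,7}, an unrelated access {3}, and the
    * instruction currently being scheduled {7} */
   std::vector<Instr> block = {{7}, {7}, {7}, {3}, {7}};
   const Instr& current = block[4];
   int candidate_idx = 2; /* bottom of the existing {7,7,7} clause */
   int clause_size = 1;   /* size of the clause formed so far at the insert point */

   /* measure the clause the candidate belongs to by walking upwards */
   int prev_clause_size = 1;
   while (candidate_idx - prev_clause_size >= 0 &&
          should_form_clause(current, block[candidate_idx - prev_clause_size]))
      prev_clause_size++;

   /* mirrors `if (prev_clause_size > clause_size + 1) break;` in the patch:
    * don't split a larger clause to grow a smaller one */
   if (prev_clause_size > clause_size + 1)
      printf("stop: would split a clause of %d to grow one of %d\n",
             prev_clause_size, clause_size + 1);
   else
      printf("grab the candidate into the new clause\n");
}

In the real pass, clause_size is derived from the cursor (cursor.insert_idx - cursor.insert_idx_clause) and the upwards walk is bounded by the block's p_logical_start pseudo-instruction; the explicit bounds check above merely keeps the toy self-contained.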