aco/insert_waitcnt: Remove many unnecessary wait_imm.combine()

Reduces overall compile times by ~0.2%. Reviewed-by: Rhys Perry <pendingchaos02@gmail.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/11879>
author: Daniel Schürmann <daniel@schuermann.dev> 2021-07-14 13:49:20 +0200
committer: Daniel Schürmann <daniel@schuermann.dev> 2021-07-14 18:10:50 +0200
commit: 20eaa074ececa360fd6431caa9352e8d746f8f96 (patch)
tree: 0f93a4e16fe7d19a6994e92383d5cc1b17be3c19
parent: 114d38e57d78b4141a8cf4359437773f931877a5 (diff)
1 files changed, 23 insertions, 37 deletions
diff --git a/src/amd/compiler/aco_insert_waitcnt.cpp b/src/amd/compiler/aco_insert_waitcnt.cpp
index e4788270c98..d7fc87c126d 100644
--- a/src/amd/compiler/aco_insert_waitcnt.cpp
+++ b/src/amd/compiler/aco_insert_waitcnt.cpp
@@ -242,11 +242,9 @@ struct wait_ctx {
    }
 };
 
-wait_imm
-check_instr(Instruction* instr, wait_ctx& ctx)
+void
+check_instr(wait_ctx& ctx, wait_imm& wait, Instruction* instr)
 {
-   wait_imm wait;
-
    for (const Operand op : instr->operands) {
       if (op.isConstant() || op.isUndefined())
          continue;
@@ -287,28 +285,25 @@ check_instr(Instruction* instr, wait_ctx& ctx)
          wait.combine(it->second.imm);
       }
    }
-
-   return wait;
 }
 
-wait_imm
-parse_wait_instr(wait_ctx& ctx, Instruction* instr)
+bool
+parse_wait_instr(wait_ctx& ctx, wait_imm& imm, Instruction* instr)
 {
    if (instr->opcode == aco_opcode::s_waitcnt_vscnt &&
        instr->definitions[0].physReg() == sgpr_null) {
-      wait_imm imm;
       imm.vs = std::min<uint8_t>(imm.vs, instr->sopk().imm);
-      return imm;
+      return true;
    } else if (instr->opcode == aco_opcode::s_waitcnt) {
-      return wait_imm(ctx.chip_class, instr->sopp().imm);
+      imm.combine(wait_imm(ctx.chip_class, instr->sopp().imm));
+      return true;
    }
-   return wait_imm();
+   return false;
 }
 
-wait_imm
-perform_barrier(wait_ctx& ctx, memory_sync_info sync, unsigned semantics)
+void
+perform_barrier(wait_ctx& ctx, wait_imm& imm, memory_sync_info sync, unsigned semantics)
 {
-   wait_imm imm;
    sync_scope subgroup_scope =
       ctx.program->workgroup_size <= ctx.program->wave_size ? scope_workgroup : scope_subgroup;
    if ((sync.semantics & semantics) && sync.scope > subgroup_scope) {
@@ -332,8 +327,6 @@ perform_barrier(wait_ctx& ctx, memory_sync_info sync, unsigned semantics)
             imm.combine(ctx.barrier_imm[idx]);
       }
    }
-
-   return imm;
 }
 
 void
@@ -352,22 +345,18 @@ force_waitcnt(wait_ctx& ctx, wait_imm& imm)
    }
 }
 
-wait_imm
-kill(Instruction* instr, wait_ctx& ctx, memory_sync_info sync_info)
+void
+kill(wait_imm& imm, Instruction* instr, wait_ctx& ctx, memory_sync_info sync_info)
 {
-   wait_imm imm;
-
    if (debug_flags & DEBUG_FORCE_WAITCNT) {
       /* Force emitting waitcnt states right after the instruction if there is
        * something to wait for.
        */
-      force_waitcnt(ctx, imm);
+      return force_waitcnt(ctx, imm);
    }
 
    if (ctx.exp_cnt || ctx.vm_cnt || ctx.lgkm_cnt)
-      imm.combine(check_instr(instr, ctx));
-
-   imm.combine(parse_wait_instr(ctx, instr));
+      check_instr(ctx, imm, instr);
 
    /* It's required to wait for scalar stores before "writing back" data.
     * It shouldn't cost anything anyways since we're about to do s_endpgm.
@@ -406,9 +395,9 @@ kill(Instruction* instr, wait_ctx& ctx, memory_sync_info sync_info)
    }
 
    if (instr->opcode == aco_opcode::p_barrier)
-      imm.combine(perform_barrier(ctx, instr->barrier().sync, semantic_acqrel));
+      perform_barrier(ctx, imm, instr->barrier().sync, semantic_acqrel);
    else
-      imm.combine(perform_barrier(ctx, sync_info, semantic_release));
+      perform_barrier(ctx, imm, sync_info, semantic_release);
 
    if (!imm.empty()) {
       if (ctx.pending_flat_vm && imm.vm != wait_imm::unset_counter)
@@ -470,8 +459,6 @@ kill(Instruction* instr, wait_ctx& ctx, memory_sync_info sync_info)
       ctx.pending_flat_lgkm = false;
       ctx.pending_s_buffer_store = false;
    }
-
-   return imm;
 }
 
 void
@@ -719,7 +706,7 @@ gen(Instruction* instr, wait_ctx& ctx)
 }
 
 void
-emit_waitcnt(wait_ctx& ctx, std::vector<aco_ptr<Instruction>>& instructions, wait_imm imm)
+emit_waitcnt(wait_ctx& ctx, std::vector<aco_ptr<Instruction>>& instructions, wait_imm& imm)
 {
    if (imm.vs != wait_imm::unset_counter) {
       assert(ctx.chip_class >= GFX10);
@@ -737,6 +724,7 @@ emit_waitcnt(wait_ctx& ctx, std::vector<aco_ptr<Instruction>>& instructions, wai
       waitcnt->block = -1;
       instructions.emplace_back(waitcnt);
    }
+   imm = wait_imm();
 }
 
 void
@@ -747,21 +735,19 @@ handle_block(Program* program, Block& block, wait_ctx& ctx)
    wait_imm queued_imm;
 
    for (aco_ptr<Instruction>& instr : block.instructions) {
-      bool is_wait = !parse_wait_instr(ctx, instr.get()).empty();
+      bool is_wait = parse_wait_instr(ctx, queued_imm, instr.get());
 
       memory_sync_info sync_info = get_sync_info(instr.get());
-      queued_imm.combine(kill(instr.get(), ctx, sync_info));
+      kill(queued_imm, instr.get(), ctx, sync_info);
 
       gen(instr.get(), ctx);
 
       if (instr->format != Format::PSEUDO_BARRIER && !is_wait) {
-         if (!queued_imm.empty()) {
+         if (!queued_imm.empty())
             emit_waitcnt(ctx, new_instructions, queued_imm);
-            queued_imm = wait_imm();
-         }
-         new_instructions.emplace_back(std::move(instr));
 
-         queued_imm.combine(perform_barrier(ctx, sync_info, semantic_acquire));
+         new_instructions.emplace_back(std::move(instr));
+         perform_barrier(ctx, queued_imm, sync_info, semantic_acquire);
       }
    }
author	Daniel Schürmann <daniel@schuermann.dev>	2021-07-14 13:49:20 +0200
committer	Daniel Schürmann <daniel@schuermann.dev>	2021-07-14 18:10:50 +0200
commit	20eaa074ececa360fd6431caa9352e8d746f8f96 (patch)
tree	0f93a4e16fe7d19a6994e92383d5cc1b17be3c19
parent	114d38e57d78b4141a8cf4359437773f931877a5 (diff)