1 files changed, 155 insertions, 57 deletions
diff --git a/src/amd/compiler/aco_statistics.cpp b/src/amd/compiler/aco_statistics.cpp
index ce114e3f879..61d25af2c2f 100644
--- a/src/amd/compiler/aco_statistics.cpp
+++ b/src/amd/compiler/aco_statistics.cpp
@@ -1,25 +1,7 @@
 /*
  * Copyright © 2020 Valve Corporation
  *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- *
+ * SPDX-License-Identifier: MIT
  */
 
 #include "aco_ir.h"
@@ -40,8 +22,8 @@ collect_presched_stats(Program* program)
    RegisterDemand presched_demand;
    for (Block& block : program->blocks)
       presched_demand.update(block.register_demand);
-   program->statistics[statistic_sgpr_presched] = presched_demand.sgpr;
-   program->statistics[statistic_vgpr_presched] = presched_demand.vgpr;
+   program->statistics[aco_statistic_sgpr_presched] = presched_demand.sgpr;
+   program->statistics[aco_statistic_vgpr_presched] = presched_demand.vgpr;
 }
 
 class BlockCycleEstimator {
@@ -104,15 +86,99 @@ struct perf_info {
    unsigned cost1;
 };
 
+static bool
+is_dual_issue_capable(const Program& program, const Instruction& instr)
+{
+   if (program.gfx_level < GFX11 || !instr.isVALU() || instr.isDPP())
+      return false;
+
+   switch (instr.opcode) {
+   case aco_opcode::v_fma_f32:
+   case aco_opcode::v_fmac_f32:
+   case aco_opcode::v_fmaak_f32:
+   case aco_opcode::v_fmamk_f32:
+   case aco_opcode::v_mul_f32:
+   case aco_opcode::v_add_f32:
+   case aco_opcode::v_sub_f32:
+   case aco_opcode::v_subrev_f32:
+   case aco_opcode::v_mul_legacy_f32:
+   case aco_opcode::v_fma_legacy_f32:
+   case aco_opcode::v_fmac_legacy_f32:
+   case aco_opcode::v_fma_f16:
+   case aco_opcode::v_fmac_f16:
+   case aco_opcode::v_fmaak_f16:
+   case aco_opcode::v_fmamk_f16:
+   case aco_opcode::v_mul_f16:
+   case aco_opcode::v_add_f16:
+   case aco_opcode::v_sub_f16:
+   case aco_opcode::v_subrev_f16:
+   case aco_opcode::v_mov_b32:
+   case aco_opcode::v_movreld_b32:
+   case aco_opcode::v_movrels_b32:
+   case aco_opcode::v_movrelsd_b32:
+   case aco_opcode::v_movrelsd_2_b32:
+   case aco_opcode::v_cndmask_b32:
+   case aco_opcode::v_writelane_b32_e64:
+   case aco_opcode::v_mov_b16:
+   case aco_opcode::v_cndmask_b16:
+   case aco_opcode::v_max_f32:
+   case aco_opcode::v_min_f32:
+   case aco_opcode::v_max_f16:
+   case aco_opcode::v_min_f16:
+   case aco_opcode::v_max_i16_e64:
+   case aco_opcode::v_min_i16_e64:
+   case aco_opcode::v_max_u16_e64:
+   case aco_opcode::v_min_u16_e64:
+   case aco_opcode::v_add_i16:
+   case aco_opcode::v_sub_i16:
+   case aco_opcode::v_mad_i16:
+   case aco_opcode::v_add_u16_e64:
+   case aco_opcode::v_sub_u16_e64:
+   case aco_opcode::v_mad_u16:
+   case aco_opcode::v_mul_lo_u16_e64:
+   case aco_opcode::v_not_b16:
+   case aco_opcode::v_and_b16:
+   case aco_opcode::v_or_b16:
+   case aco_opcode::v_xor_b16:
+   case aco_opcode::v_lshrrev_b16_e64:
+   case aco_opcode::v_ashrrev_i16_e64:
+   case aco_opcode::v_lshlrev_b16_e64:
+   case aco_opcode::v_dot2_bf16_bf16:
+   case aco_opcode::v_dot2_f32_bf16:
+   case aco_opcode::v_dot2_f16_f16:
+   case aco_opcode::v_dot2_f32_f16:
+   case aco_opcode::v_dot2c_f32_f16: return true;
+   case aco_opcode::v_fma_mix_f32:
+   case aco_opcode::v_fma_mixlo_f16:
+   case aco_opcode::v_fma_mixhi_f16: {
+      /* dst and acc type must match */
+      if (instr.valu().opsel_hi[2] == (instr.opcode == aco_opcode::v_fma_mix_f32))
+         return false;
+
+      /* If all operands are vgprs, two must be the same. */
+      for (unsigned i = 0; i < 3; i++) {
+         if (instr.operands[i].isConstant() || instr.operands[i].isOfType(RegType::sgpr))
+            return true;
+         for (unsigned j = 0; j < i; j++) {
+            if (instr.operands[i].physReg() == instr.operands[j].physReg())
+               return true;
+         }
+      }
+      return false;
+   }
+   default: return false;
+   }
+}
+
 static perf_info
-get_perf_info(Program* program, aco_ptr<Instruction>& instr)
+get_perf_info(const Program& program, const Instruction& instr)
 {
-   instr_class cls = instr_info.classes[(int)instr->opcode];
+   instr_class cls = instr_info.classes[(int)instr.opcode];
 
 #define WAIT(res)          BlockCycleEstimator::res, 0
 #define WAIT_USE(res, cnt) BlockCycleEstimator::res, cnt
 
-   if (program->chip_class >= GFX10) {
+   if (program.gfx_level >= GFX10) {
       /* fp64 might be incorrect */
       switch (cls) {
       case instr_class::valu32:
@@ -135,10 +201,15 @@ get_perf_info(Program* program, aco_ptr<Instruction>& instr)
       case instr_class::branch:
       case instr_class::sendmsg: return {0, WAIT_USE(branch_sendmsg, 1)};
       case instr_class::ds:
-         return instr->ds().gds ? perf_info{0, WAIT_USE(export_gds, 1)}
-                                : perf_info{0, WAIT_USE(lds, 1)};
+         return instr.isDS() && instr.ds().gds ? perf_info{0, WAIT_USE(export_gds, 1)}
+                                               : perf_info{0, WAIT_USE(lds, 1)};
       case instr_class::exp: return {0, WAIT_USE(export_gds, 1)};
       case instr_class::vmem: return {0, WAIT_USE(vmem, 1)};
+      case instr_class::wmma: {
+         /* int8 and (b)f16 have the same performance. */
+         uint8_t cost = instr.opcode == aco_opcode::v_wmma_i32_16x16x16_iu4 ? 16 : 32;
+         return {cost, WAIT_USE(valu, cost)};
+      }
       case instr_class::barrier:
       case instr_class::waitcnt:
       case instr_class::other:
@@ -151,8 +222,8 @@ get_perf_info(Program* program, aco_ptr<Instruction>& instr)
       case instr_class::valu64: return {8, WAIT_USE(valu, 8)};
       case instr_class::valu_quarter_rate32: return {16, WAIT_USE(valu, 16)};
       case instr_class::valu_fma:
-         return program->dev.has_fast_fma32 ? perf_info{4, WAIT_USE(valu, 4)}
-                                            : perf_info{16, WAIT_USE(valu, 16)};
+         return program.dev.has_fast_fma32 ? perf_info{4, WAIT_USE(valu, 4)}
+                                           : perf_info{16, WAIT_USE(valu, 16)};
       case instr_class::valu_transcendental32: return {16, WAIT_USE(valu, 16)};
       case instr_class::valu_double: return {64, WAIT_USE(valu, 64)};
       case instr_class::valu_double_add: return {32, WAIT_USE(valu, 32)};
@@ -164,8 +235,8 @@ get_perf_info(Program* program, aco_ptr<Instruction>& instr)
          return {8, WAIT_USE(branch_sendmsg, 8)};
          return {4, WAIT_USE(branch_sendmsg, 4)};
       case instr_class::ds:
-         return instr->ds().gds ? perf_info{4, WAIT_USE(export_gds, 4)}
-                                : perf_info{4, WAIT_USE(lds, 4)};
+         return instr.isDS() && instr.ds().gds ? perf_info{4, WAIT_USE(export_gds, 4)}
+                                               : perf_info{4, WAIT_USE(lds, 4)};
       case instr_class::exp: return {16, WAIT_USE(export_gds, 16)};
       case instr_class::vmem: return {4, WAIT_USE(vmem, 4)};
       case instr_class::barrier:
@@ -182,7 +253,7 @@ get_perf_info(Program* program, aco_ptr<Instruction>& instr)
 void
 BlockCycleEstimator::use_resources(aco_ptr<Instruction>& instr)
 {
-   perf_info perf = get_perf_info(program, instr);
+   perf_info perf = get_perf_info(*program, *instr);
 
    if (perf.rsrc0 != resource_count) {
       res_available[(int)perf.rsrc0] = cur_cycle + perf.cost0;
@@ -198,7 +269,7 @@ BlockCycleEstimator::use_resources(aco_ptr<Instruction>& instr)
 int32_t
 BlockCycleEstimator::cycles_until_res_available(aco_ptr<Instruction>& instr)
 {
-   perf_info perf = get_perf_info(program, instr);
+   perf_info perf = get_perf_info(*program, *instr);
 
    int32_t cost = 0;
    if (perf.rsrc0 != resource_count)
@@ -221,9 +292,9 @@ get_wait_counter_info(aco_ptr<Instruction>& instr)
    if (instr->isFlatLike()) {
       unsigned lgkm = instr->isFlat() ? 20 : 0;
       if (!instr->definitions.empty())
-         return wait_counter_info(230, 0, lgkm, 0);
+         return wait_counter_info(320, 0, lgkm, 0);
       else
-         return wait_counter_info(0, 0, lgkm, 230);
+         return wait_counter_info(0, 0, lgkm, 320);
    }
 
    if (instr->isSMEM()) {
@@ -261,13 +332,13 @@ get_wait_imm(Program* program, aco_ptr<Instruction>& instr)
    if (instr->opcode == aco_opcode::s_endpgm) {
       return wait_imm(0, 0, 0, 0);
    } else if (instr->opcode == aco_opcode::s_waitcnt) {
-      return wait_imm(GFX10_3, instr->sopp().imm);
+      return wait_imm(GFX10_3, instr->salu().imm);
    } else if (instr->opcode == aco_opcode::s_waitcnt_vscnt) {
-      return wait_imm(0, 0, 0, instr->sopk().imm);
+      return wait_imm(0, 0, 0, instr->salu().imm);
    } else {
-      unsigned max_lgkm_cnt = program->chip_class >= GFX10 ? 62 : 14;
+      unsigned max_lgkm_cnt = program->gfx_level >= GFX10 ? 62 : 14;
       unsigned max_exp_cnt = 6;
-      unsigned max_vm_cnt = program->chip_class >= GFX9 ? 62 : 14;
+      unsigned max_vm_cnt = program->gfx_level >= GFX9 ? 62 : 14;
       unsigned max_vs_cnt = 62;
 
       wait_counter_info wait_info = get_wait_counter_info(instr);
@@ -306,7 +377,7 @@ BlockCycleEstimator::get_dependency_cost(aco_ptr<Instruction>& instr)
    if (instr->opcode == aco_opcode::s_endpgm) {
       for (unsigned i = 0; i < 512; i++)
          deps_available = MAX2(deps_available, reg_available[i]);
-   } else if (program->chip_class >= GFX10) {
+   } else if (program->gfx_level >= GFX10) {
       for (Operand& op : instr->operands) {
          if (op.isConstant() || op.isUndefined())
             continue;
@@ -315,7 +386,7 @@ BlockCycleEstimator::get_dependency_cost(aco_ptr<Instruction>& instr)
       }
    }
 
-   if (program->chip_class < GFX10)
+   if (program->gfx_level < GFX10)
       deps_available = align(deps_available, 4);
 
    return deps_available - cur_cycle;
@@ -352,13 +423,14 @@ is_vector(aco_opcode op)
 void
 BlockCycleEstimator::add(aco_ptr<Instruction>& instr)
 {
-   perf_info perf = get_perf_info(program, instr);
+   perf_info perf = get_perf_info(*program, *instr);
 
    cur_cycle += get_dependency_cost(instr);
 
    unsigned start;
-   bool dual_issue = program->chip_class >= GFX10 && program->wave_size == 64 &&
-                     is_vector(instr->opcode) && program->workgroup_size > 32;
+   bool dual_issue = program->gfx_level >= GFX10 && program->wave_size == 64 &&
+                     is_vector(instr->opcode) && !is_dual_issue_capable(*program, *instr) &&
+                     program->workgroup_size > 32;
    for (unsigned i = 0; i < (dual_issue ? 2 : 1); i++) {
       cur_cycle += cycles_until_res_available(instr);
 
@@ -366,7 +438,7 @@ BlockCycleEstimator::add(aco_ptr<Instruction>& instr)
       use_resources(instr);
 
       /* GCN is in-order and doesn't begin the next instruction until the current one finishes */
-      cur_cycle += program->chip_class >= GFX10 ? 1 : perf.latency;
+      cur_cycle += program->gfx_level >= GFX10 ? 1 : perf.latency;
    }
 
    wait_imm imm = get_wait_imm(program, instr);
@@ -438,21 +510,31 @@ collect_preasm_stats(Program* program)
       std::set<Instruction*> vmem_clause;
       std::set<Instruction*> smem_clause;
 
-      program->statistics[statistic_instructions] += block.instructions.size();
+      program->statistics[aco_statistic_instructions] += block.instructions.size();
 
       for (aco_ptr<Instruction>& instr : block.instructions) {
-         if (instr->isSOPP() && instr->sopp().block != -1)
-            program->statistics[statistic_branches]++;
-
-         if (instr->opcode == aco_opcode::p_constaddr)
-            program->statistics[statistic_instructions] += 2;
-
-         if (instr->isVMEM() && !instr->operands.empty()) {
+         const bool is_branch =
+            instr->isSOPP() && instr_info.classes[(int)instr->opcode] == instr_class::branch;
+         if (is_branch)
+            program->statistics[aco_statistic_branches]++;
+
+         if (instr->isVALU() || instr->isVINTRP())
+            program->statistics[aco_statistic_valu]++;
+         if (instr->isSALU() && !instr->isSOPP() &&
+             instr_info.classes[(int)instr->opcode] != instr_class::waitcnt)
+            program->statistics[aco_statistic_salu]++;
+         if (instr->isVOPD())
+            program->statistics[aco_statistic_vopd]++;
+
+         if ((instr->isVMEM() || instr->isScratch() || instr->isGlobal()) &&
+             !instr->operands.empty()) {
             if (std::none_of(vmem_clause.begin(), vmem_clause.end(),
                              [&](Instruction* other)
                              { return should_form_clause(instr.get(), other); }))
-               program->statistics[statistic_vmem_clauses]++;
+               program->statistics[aco_statistic_vmem_clauses]++;
             vmem_clause.insert(instr.get());
+
+            program->statistics[aco_statistic_vmem]++;
          } else {
             vmem_clause.clear();
          }
@@ -461,8 +543,10 @@ collect_preasm_stats(Program* program)
             if (std::none_of(smem_clause.begin(), smem_clause.end(),
                              [&](Instruction* other)
                              { return should_form_clause(instr.get(), other); }))
-               program->statistics[statistic_smem_clauses]++;
+               program->statistics[aco_statistic_smem_clauses]++;
             smem_clause.insert(instr.get());
+
+            program->statistics[aco_statistic_smem]++;
          } else {
             smem_clause.clear();
          }
@@ -473,6 +557,13 @@ collect_preasm_stats(Program* program)
    double usage[(int)BlockCycleEstimator::resource_count] = {0};
    std::vector<BlockCycleEstimator> blocks(program->blocks.size(), program);
 
+   constexpr const unsigned vmem_latency = 320;
+   for (const Definition def : program->args_pending_vmem) {
+      blocks[0].vm.push_back(vmem_latency);
+      for (unsigned i = 0; i < def.size(); i++)
+         blocks[0].reg_available[def.physReg().reg() + i] = vmem_latency;
+   }
+
    for (Block& block : program->blocks) {
       BlockCycleEstimator& block_est = blocks[block.index];
       for (unsigned pred : block.linear_preds)
@@ -535,8 +626,8 @@ collect_preasm_stats(Program* program)
          program->workgroup_size / (double)align(program->workgroup_size, program->wave_size);
    wave64_per_cycle *= max_utilization;
 
-   program->statistics[statistic_latency] = round(latency);
-   program->statistics[statistic_inv_throughput] = round(1.0 / wave64_per_cycle);
+   program->statistics[aco_statistic_latency] = round(latency);
+   program->statistics[aco_statistic_inv_throughput] = round(1.0 / wave64_per_cycle);
 
    if (debug_flags & DEBUG_PERF_INFO) {
       aco_print_program(program, stderr, print_no_ssa | print_perf_info);
@@ -561,7 +652,14 @@ collect_preasm_stats(Program* program)
 void
 collect_postasm_stats(Program* program, const std::vector<uint32_t>& code)
 {
-   program->statistics[aco::statistic_hash] = util_hash_crc32(code.data(), code.size() * 4);
+   program->statistics[aco_statistic_hash] = util_hash_crc32(code.data(), code.size() * 4);
+}
+
+Instruction_cycle_info
+get_cycle_info(const Program& program, const Instruction& instr)
+{
+   perf_info info = get_perf_info(program, instr);
+   return Instruction_cycle_info{(unsigned)info.latency, std::max(info.cost0, info.cost1)};
 }
 
 } // namespace aco