summaryrefslogtreecommitdiff
path: root/src/amd/compiler/aco_optimizer.cpp
diff options
context:
space:
mode:
author Rhys Perry <pendingchaos02@gmail.com> 2021-11-11 10:54:56 +0000
committer Marge Bot <emma+marge@anholt.net> 2021-12-17 22:14:36 +0000
commit dd18925f864dd4160502c7080e6f07aff0932345 (patch)
tree ae45faded684f08d0450fc056ea38aae9bb32ddf /src/amd/compiler/aco_optimizer.cpp
parent cf5fc4b973cd9193f3a1fcb684c75a0e15c2b25c (diff)
aco: skip &-4 before SMEM
The hardware ignores the low 2 bits. I'm not sure if they are ignored before or after the address is calculated, but this optimization should be cautious enough.

fossil-db (Sienna Cichlid):
Totals from 259 (0.19% of 134572) affected shaders:
SpillSGPRs: 1381 -> 1382 (+0.07%)
SpillVGPRs: 1783 -> 1782 (-0.06%); split: -0.67%, +0.62%
CodeSize: 1598612 -> 1596084 (-0.16%); split: -0.30%, +0.14%
Scratch: 180224 -> 179200 (-0.57%); split: -1.14%, +0.57%
Instrs: 284885 -> 284268 (-0.22%); split: -0.34%, +0.12%
Latency: 6585634 -> 6603388 (+0.27%); split: -0.48%, +0.75%
InvThroughput: 2638983 -> 2648474 (+0.36%); split: -0.58%, +0.94%
VClause: 6797 -> 6820 (+0.34%); split: -0.15%, +0.49%
SClause: 6569 -> 6574 (+0.08%); split: -1.11%, +1.19%
Copies: 50561 -> 50586 (+0.05%); split: -0.61%, +0.66%
Branches: 10058 -> 10062 (+0.04%); split: -0.01%, +0.05%

Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/13755>
Diffstat (limited to 'src/amd/compiler/aco_optimizer.cpp')
-rw-r--r-- src/amd/compiler/aco_optimizer.cpp 34
1 files changed, 34 insertions, 0 deletions
diff --git a/src/amd/compiler/aco_optimizer.cpp b/src/amd/compiler/aco_optimizer.cpp
index 8ea5dac94ec..d2efd5df968 100644
--- a/src/amd/compiler/aco_optimizer.cpp
+++ b/src/amd/compiler/aco_optimizer.cpp
@@ -757,6 +757,32 @@ parse_base_offset(opt_ctx& ctx, Instruction* instr, unsigned op_index, Temp* bas
return false;
}
+void
+skip_smem_offset_align(opt_ctx& ctx, SMEM_instruction* smem)
+{ /* Strips a redundant "& -4" mask from one operand of an SMEM instruction.
+ *
+ * If the examined operand is a temporary whose recorded instruction is an s_and_b32 with a
+ * constant -4 source, the operand is rewritten in place to use the other s_and_b32 source.
+ * Stated rationale for the change: the hardware ignores the low 2 bits, so the mask is not
+ * needed.
+ *
+ * ctx: optimizer context; ctx.info[] provides is_bitwise() and the recorded instruction.
+ * smem: instruction whose operand may be rewritten. Nothing is returned.
+ *
+ * soe is true when the operand count is >= 3 for an instruction with definitions, or >= 4 for
+ * one without (presumably "an extra trailing offset operand is present" -- TODO confirm). In
+ * that case operands[1] must be a constant, otherwise the function gives up.
+ */
+ bool soe = smem->operands.size() >= (!smem->definitions.empty() ? 3 : 4);
+ if (soe && !smem->operands[1].isConstant())
+ return;
+ /* We don't need to check the constant offset because the address seems to be calculated with
+ * (offset&-4 + const_offset&-4), not (offset+const_offset)&-4.
+ */
+ /* Examine the last operand when soe is set, otherwise operands[1]. */
+ Operand& op = smem->operands[soe ? smem->operands.size() - 1 : 1];
+ if (!op.isTemp() || !ctx.info[op.tempId()].is_bitwise())
+ return;
+ /* The temporary is flagged by is_bitwise(); only an s_and_b32 is handled below. */
+ Instruction* bitwise_instr = ctx.info[op.tempId()].instr;
+ if (bitwise_instr->opcode != aco_opcode::s_and_b32)
+ return;
+ /* Accept the -4 in either source; the other source must match op's register type. */
+ if (bitwise_instr->operands[0].constantEquals(-4) &&
+ bitwise_instr->operands[1].isOfType(op.regClass().type()))
+ op.setTemp(bitwise_instr->operands[1].getTemp());
+ else if (bitwise_instr->operands[1].constantEquals(-4) &&
+ bitwise_instr->operands[0].isOfType(op.regClass().type()))
+ op.setTemp(bitwise_instr->operands[0].getTemp());
+}
+
unsigned
get_operand_size(aco_ptr<Instruction>& instr, unsigned index)
{
@@ -976,6 +1002,10 @@ label_instruction(opt_ctx& ctx, aco_ptr<Instruction>& instr)
instr.get());
}
+ /* skip &-4 before offset additions: load((a + 16) & -4, 0) */
+ if (instr->isSMEM() && !instr->operands.empty())
+ skip_smem_offset_align(ctx, &instr->smem());
+
for (unsigned i = 0; i < instr->operands.size(); i++) {
if (!instr->operands[i].isTemp())
continue;
@@ -1230,6 +1260,10 @@ label_instruction(opt_ctx& ctx, aco_ptr<Instruction>& instr)
}
}
+ /* skip &-4 after offset additions: load(a & -4, 16) */
+ if (instr->isSMEM() && !instr->operands.empty())
+ skip_smem_offset_align(ctx, &instr->smem());
+
/* if this instruction doesn't define anything, return */
if (instr->definitions.empty()) {
check_sdwa_extract(ctx, instr);