summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorDaniel Schürmann <daniel@schuermann.dev>2020-09-02 15:19:21 +0100
committerMarge Bot <eric+marge@anholt.net>2021-01-13 17:46:56 +0000
commit01134b0bfe407f43d8089551301ffedaeeb459ff (patch)
treeb80e49bececeba62074887a525bc6a1e47a87bf2
parentfcd2ef23e5f1d50008166168e772815c0213e37c (diff)
aco: simplify multiply-add combining
When both operands of a v_sub (same apply for v_add) are mul and one already uses clamp/omod, pick the other operand to get a chance to combine to a MAD. No fossils-db changes. Co-authored-by: Samuel Pitoiset <samuel.pitoiset@gmail.com> Reviewed-by: Rhys Perry <pendingchaos02@gmail.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/6680>
-rw-r--r--src/amd/compiler/aco_optimizer.cpp84
1 files changed, 36 insertions, 48 deletions
diff --git a/src/amd/compiler/aco_optimizer.cpp b/src/amd/compiler/aco_optimizer.cpp
index c986c6f69e7..967656c3c72 100644
--- a/src/amd/compiler/aco_optimizer.cpp
+++ b/src/amd/compiler/aco_optimizer.cpp
@@ -2797,49 +2797,50 @@ void combine_instruction(opt_ctx &ctx, Block& block, aco_ptr<Instruction>& instr
if (need_fma && mad32 && !ctx.program->has_fast_fma32)
return;
- uint32_t uses_src0 = UINT32_MAX;
- uint32_t uses_src1 = UINT32_MAX;
Instruction* mul_instr = nullptr;
unsigned add_op_idx;
- /* check if any of the operands is a multiplication */
- ssa_info *op0_info = instr->operands[0].isTemp() ? &ctx.info[instr->operands[0].tempId()] : NULL;
- ssa_info *op1_info = instr->operands[1].isTemp() ? &ctx.info[instr->operands[1].tempId()] : NULL;
- if (op0_info && op0_info->is_mul() && (!need_fma || !op0_info->instr->definitions[0].isPrecise()))
- uses_src0 = ctx.uses[instr->operands[0].tempId()];
- if (op1_info && op1_info->is_mul() && (!need_fma || !op1_info->instr->definitions[0].isPrecise()))
- uses_src1 = ctx.uses[instr->operands[1].tempId()];
-
+ uint32_t uses = UINT32_MAX;
/* find the 'best' mul instruction to combine with the add */
- if (uses_src0 < uses_src1) {
- mul_instr = op0_info->instr;
- add_op_idx = 1;
- } else if (uses_src1 < uses_src0) {
- mul_instr = op1_info->instr;
- add_op_idx = 0;
- } else if (uses_src0 != UINT32_MAX) {
- /* tiebreaker: quite random what to pick */
- if (op0_info->instr->operands[0].isLiteral()) {
- mul_instr = op1_info->instr;
- add_op_idx = 0;
- } else {
- mul_instr = op0_info->instr;
- add_op_idx = 1;
- }
+ for (unsigned i = 0; i < 2; i++) {
+ if (!instr->operands[i].isTemp() || !ctx.info[instr->operands[i].tempId()].is_mul())
+ continue;
+ /* check precision requirements */
+ ssa_info& info = ctx.info[instr->operands[i].tempId()];
+ if (need_fma && info.instr->definitions[0].isPrecise())
+ continue;
+
+ /* no clamp/omod allowed between mul and add */
+ if (info.instr->isVOP3() &&
+ (static_cast<VOP3A_instruction*>(info.instr)->clamp ||
+ static_cast<VOP3A_instruction*>(info.instr)->omod))
+ continue;
+
+ Operand op[3] = {info.instr->operands[0], info.instr->operands[1], instr->operands[1 - i]};
+ if (info.instr->isSDWA() ||
+ !check_vop3_operands(ctx, 3, op) ||
+ ctx.uses[instr->operands[i].tempId()] >= uses)
+ continue;
+
+ mul_instr = info.instr;
+ add_op_idx = 1 - i;
+ uses = ctx.uses[instr->operands[i].tempId()];
}
+
if (mul_instr) {
- Operand op[3] = {Operand(v1), Operand(v1), Operand(v1)};
+ /* turn mul+add into v_mad/v_fma */
+ Operand op[3] = {mul_instr->operands[0], mul_instr->operands[1], instr->operands[add_op_idx]};
+ ctx.uses[mul_instr->definitions[0].tempId()]--;
+ if (ctx.uses[mul_instr->definitions[0].tempId()]) {
+ if (op[0].isTemp())
+ ctx.uses[op[0].tempId()]++;
+ if (op[1].isTemp())
+ ctx.uses[op[1].tempId()]++;
+ }
+
bool neg[3] = {false, false, false};
bool abs[3] = {false, false, false};
unsigned omod = 0;
bool clamp = false;
- op[0] = mul_instr->operands[0];
- op[1] = mul_instr->operands[1];
- op[2] = instr->operands[add_op_idx];
- // TODO: would be better to check this before selecting a mul instr?
- if (!check_vop3_operands(ctx, 3, op))
- return;
- if (mul_instr->isSDWA())
- return;
if (mul_instr->isVOP3()) {
VOP3A_instruction* vop3 = static_cast<VOP3A_instruction*> (mul_instr);
@@ -2847,18 +2848,6 @@ void combine_instruction(opt_ctx &ctx, Block& block, aco_ptr<Instruction>& instr
neg[1] = vop3->neg[1];
abs[0] = vop3->abs[0];
abs[1] = vop3->abs[1];
- /* we cannot use these modifiers between mul and add */
- if (vop3->clamp || vop3->omod)
- return;
- }
-
- /* convert to mad */
- ctx.uses[mul_instr->definitions[0].tempId()]--;
- if (ctx.uses[mul_instr->definitions[0].tempId()]) {
- if (op[0].isTemp())
- ctx.uses[op[0].tempId()]++;
- if (op[1].isTemp())
- ctx.uses[op[1].tempId()]++;
}
if (instr->isVOP3()) {
@@ -2888,8 +2877,7 @@ void combine_instruction(opt_ctx &ctx, Block& block, aco_ptr<Instruction>& instr
(ctx.program->chip_class == GFX8 ? aco_opcode::v_mad_legacy_f16 : aco_opcode::v_mad_f16);
aco_ptr<VOP3A_instruction> mad{create_instruction<VOP3A_instruction>(mad_op, Format::VOP3A, 3, 1)};
- for (unsigned i = 0; i < 3; i++)
- {
+ for (unsigned i = 0; i < 3; i++) {
mad->operands[i] = op[i];
mad->neg[i] = neg[i];
mad->abs[i] = abs[i];