summaryrefslogtreecommitdiff
path: root/src/gallium/drivers/nouveau
diff options
context:
space:
mode:
author: Rhys Perry <pendingchaos02@gmail.com> 2018-06-13 16:25:23 +0100
committer: Rhys Perry <pendingchaos02@gmail.com> 2018-08-27 13:56:44 +0100
commit: b60bc7a4ab0b252ce8f8a82076a45f3a16202d58 (patch)
tree: b7fbf15015e2195ae993d777f36de0638c57a277 /src/gallium/drivers/nouveau
parent: bcbcdf84485192c0f6b44d94a423d80ba204bb4b (diff)
nv50/ir: optimize imul/imad to xmads
This hurts the shader-db numbers a fair bit. However, a few XMADs are much faster than an IMUL or IMAD, and the cost is mitigated by the next commit, which optimizes many multiplications by immediates into instructions that are shorter and less register-heavy than the XMADs.

total instructions in shared programs : 5768871 -> 5820882 (0.90%)
total gprs used in shared programs    : 669919 -> 670595 (0.10%)
total shared used in shared programs  : 548832 -> 548832 (0.00%)
total local used in shared programs   : 21068 -> 21164 (0.46%)

                local     shared        gpr       inst      bytes
    helped          0          0         38          0          0
      hurt          1          0        365       3076       3076

Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Karol Herbst <kherbst@redhat.com>
Diffstat (limited to 'src/gallium/drivers/nouveau')
-rw-r--r-- src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp | 56
-rw-r--r-- src/gallium/drivers/nouveau/codegen/nv50_ir_target_gm107.cpp | 1
2 files changed, 56 insertions, 1 deletions
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
index 5b4a98d25cb..dc7bf24ba23 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
@@ -2301,13 +2301,18 @@ AlgebraicOpt::visit(BasicBlock *bb)
// =============================================================================
// ADD(SHL(a, b), c) -> SHLADD(a, b, c)
+// MUL(a, b) -> a few XMADs
+// MAD/FMA(a, b, c) -> a few XMADs
class LateAlgebraicOpt : public Pass
{
private:
virtual bool visit(Instruction *);
void handleADD(Instruction *);
+ void handleMULMAD(Instruction *); // rewrites eligible integer MUL/MAD/FMA into a sequence of XMADs
bool tryADDToSHLADD(Instruction *);
+
+ BuildUtil bld; // builder used by handleMULMAD to emit the additional XMAD instructions
};
void
@@ -2368,6 +2373,52 @@ LateAlgebraicOpt::tryADDToSHLADD(Instruction *add)
return true;
}
+// MUL(a, b) -> a few XMADs
+// MAD/FMA(a, b, c) -> a few XMADs (two new XMADs are emitted and i itself becomes the third)
+void
+LateAlgebraicOpt::handleMULMAD(Instruction *i)
+{
+ // TODO: handle NV50_IR_SUBOP_MUL_HIGH
+ if (!prog->getTarget()->isOpSupported(OP_XMAD, TYPE_U32)) // target cannot do XMAD on U32: leave i untouched
+ return;
+ if (isFloatType(i->dType) || typeSizeof(i->dType) != 4) // only non-float types whose typeSizeof is 4 are rewritten
+ return;
+ if (i->subOp || i->usesFlags() || i->flagsDef >= 0) // skip when any subOp is set or flags are read or written
+ return;
+
+ assert(!i->src(0).mod); // the rewrite does not carry source modifiers over
+ assert(!i->src(1).mod);
+ assert(i->op == OP_MUL ? 1 : !i->src(2).mod); // src(2) is only checked when i is not a MUL
+
+ bld.setPosition(i, false); // new instructions are placed relative to i (presumably before it -- confirm against BuildUtil::setPosition)
+
+ Value *a = i->getSrc(0);
+ Value *b = i->getSrc(1);
+ Value *c = i->op == OP_MUL ? bld.mkImm(0) : i->getSrc(2); // MUL has no addend, so an immediate 0 stands in for it
+
+ Value *tmp0 = bld.getSSA(); // result of the first XMAD
+ Value *tmp1 = bld.getSSA(); // result of the second XMAD
+
+ Instruction *insn = bld.mkOp3(OP_XMAD, TYPE_U32, tmp0, b, a, c); // first XMAD: sources (b, a, c), no subOp set here
+ insn->setPredicate(i->cc, i->getPredicate()); // runs under the same condition/predicate as i
+
+ insn = bld.mkOp3(OP_XMAD, TYPE_U32, tmp1, b, a, bld.mkImm(0)); // second XMAD: sources (b, a, 0)
+ insn->setPredicate(i->cc, i->getPredicate());
+ insn->subOp = NV50_IR_SUBOP_XMAD_MRG | NV50_IR_SUBOP_XMAD_H1(1); // NOTE(review): presumably selects the high half of source 1 and merges -- confirm against the XMAD subOp definitions
+
+ Value *pred = i->getPredicate(); // remember the predicate so it can be re-attached below
+ i->setPredicate(i->cc, NULL); // detach it while the sources are replaced (presumably because a MUL gains a third source -- confirm)
+
+ i->op = OP_XMAD; // i is rewritten in place and becomes the final XMAD
+ i->setSrc(0, b);
+ i->setSrc(1, tmp1);
+ i->setSrc(2, tmp0);
+ i->subOp = NV50_IR_SUBOP_XMAD_PSL | NV50_IR_SUBOP_XMAD_CBCC; // NOTE(review): PSL/CBCC semantics are not visible here -- see the XMAD subOp definitions
+ i->subOp |= NV50_IR_SUBOP_XMAD_H1(0) | NV50_IR_SUBOP_XMAD_H1(1);
+
+ i->setPredicate(i->cc, pred); // restore the original predicate on the rewritten instruction
+}
+
bool
LateAlgebraicOpt::visit(Instruction *i)
{
@@ -2375,6 +2426,11 @@ LateAlgebraicOpt::visit(Instruction *i)
case OP_ADD:
handleADD(i);
break;
+ case OP_MUL:
+ case OP_MAD:
+ case OP_FMA:
+ handleMULMAD(i);
+ break;
default:
break;
}
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_gm107.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_gm107.cpp
index de07ad1de89..2e2e40770e1 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_gm107.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_gm107.cpp
@@ -170,7 +170,6 @@ TargetGM107::isBarrierRequired(const Instruction *insn) const
}
break;
case OPCLASS_ARITH:
- // TODO: IMUL/IMAD require barriers too, use of XMAD instead!
if ((insn->op == OP_MUL || insn->op == OP_MAD) &&
!isFloatType(insn->dType))
return true;