summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorIlia Mirkin <imirkin@alum.mit.edu>2017-02-04 22:31:04 -0500
committerIlia Mirkin <imirkin@alum.mit.edu>2017-02-09 12:57:48 -0500
commit1aefd6159c07cd5b646ce99afd96d4500020418a (patch)
treec60b7c400f5a67b19c2ccb2cf5d96b8e13492ce6
parent009c54aa7af1cc1c0eeb03939ce616957dab67fc (diff)
nvc0/ir: add support for all the new int64 tgsi opcodes
A few thoughts:
- Some of that LegalizeSSA logic should really live much earlier and be subject to the likes of DCE and other useful passes.
- Some of the "lowering" done in from_tgsi should be done later so that proper optimization might be done.

However, this all works and the above can be improved upon later.

Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
-rw-r--r--src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp222
-rw-r--r--src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp67
-rw-r--r--src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.h2
-rw-r--r--src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp6
-rw-r--r--src/gallium/drivers/nouveau/codegen/nv50_ir_target.cpp3
-rw-r--r--src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.cpp7
6 files changed, 302 insertions, 5 deletions
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
index 6320e529980..80cc7fa01ac 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
@@ -354,6 +354,14 @@ unsigned int Instruction::srcMask(unsigned int s) const
case TGSI_OPCODE_DSGE:
case TGSI_OPCODE_DSEQ:
case TGSI_OPCODE_DSNE:
+ case TGSI_OPCODE_U64SEQ:
+ case TGSI_OPCODE_U64SNE:
+ case TGSI_OPCODE_I64SLT:
+ case TGSI_OPCODE_U64SLT:
+ case TGSI_OPCODE_I64SGE:
+ case TGSI_OPCODE_U64SGE:
+ case TGSI_OPCODE_I642F:
+ case TGSI_OPCODE_U642F:
switch (util_bitcount(mask)) {
case 1: return 0x3;
case 2: return 0xf;
@@ -557,6 +565,7 @@ nv50_ir::DataType Instruction::inferSrcType() const
case TGSI_OPCODE_SHL:
case TGSI_OPCODE_U2F:
case TGSI_OPCODE_U2D:
+ case TGSI_OPCODE_U2I64:
case TGSI_OPCODE_UADD:
case TGSI_OPCODE_UDIV:
case TGSI_OPCODE_UMOD:
@@ -587,6 +596,7 @@ nv50_ir::DataType Instruction::inferSrcType() const
return nv50_ir::TYPE_U32;
case TGSI_OPCODE_I2F:
case TGSI_OPCODE_I2D:
+ case TGSI_OPCODE_I2I64:
case TGSI_OPCODE_IDIV:
case TGSI_OPCODE_IMUL_HI:
case TGSI_OPCODE_IMAX:
@@ -608,6 +618,8 @@ nv50_ir::DataType Instruction::inferSrcType() const
case TGSI_OPCODE_D2F:
case TGSI_OPCODE_D2I:
case TGSI_OPCODE_D2U:
+ case TGSI_OPCODE_D2I64:
+ case TGSI_OPCODE_D2U64:
case TGSI_OPCODE_DABS:
case TGSI_OPCODE_DNEG:
case TGSI_OPCODE_DADD:
@@ -630,6 +642,34 @@ nv50_ir::DataType Instruction::inferSrcType() const
case TGSI_OPCODE_DFLR:
case TGSI_OPCODE_DROUND:
return nv50_ir::TYPE_F64;
+ case TGSI_OPCODE_U64SEQ:
+ case TGSI_OPCODE_U64SNE:
+ case TGSI_OPCODE_U64SLT:
+ case TGSI_OPCODE_U64SGE:
+ case TGSI_OPCODE_U64MIN:
+ case TGSI_OPCODE_U64MAX:
+ case TGSI_OPCODE_U64ADD:
+ case TGSI_OPCODE_U64MUL:
+ case TGSI_OPCODE_U64SHL:
+ case TGSI_OPCODE_U64SHR:
+ case TGSI_OPCODE_U64DIV:
+ case TGSI_OPCODE_U64MOD:
+ case TGSI_OPCODE_U642F:
+ case TGSI_OPCODE_U642D:
+ return nv50_ir::TYPE_U64;
+ case TGSI_OPCODE_I64ABS:
+ case TGSI_OPCODE_I64SSG:
+ case TGSI_OPCODE_I64NEG:
+ case TGSI_OPCODE_I64SLT:
+ case TGSI_OPCODE_I64SGE:
+ case TGSI_OPCODE_I64MIN:
+ case TGSI_OPCODE_I64MAX:
+ case TGSI_OPCODE_I64SHR:
+ case TGSI_OPCODE_I64DIV:
+ case TGSI_OPCODE_I64MOD:
+ case TGSI_OPCODE_I642F:
+ case TGSI_OPCODE_I642D:
+ return nv50_ir::TYPE_S64;
default:
return nv50_ir::TYPE_F32;
}
@@ -650,17 +690,35 @@ nv50_ir::DataType Instruction::inferDstType() const
case TGSI_OPCODE_DSGE:
case TGSI_OPCODE_DSLT:
case TGSI_OPCODE_DSNE:
+ case TGSI_OPCODE_I64SLT:
+ case TGSI_OPCODE_I64SGE:
+ case TGSI_OPCODE_U64SEQ:
+ case TGSI_OPCODE_U64SNE:
+ case TGSI_OPCODE_U64SLT:
+ case TGSI_OPCODE_U64SGE:
case TGSI_OPCODE_PK2H:
return nv50_ir::TYPE_U32;
case TGSI_OPCODE_I2F:
case TGSI_OPCODE_U2F:
case TGSI_OPCODE_D2F:
+ case TGSI_OPCODE_I642F:
+ case TGSI_OPCODE_U642F:
case TGSI_OPCODE_UP2H:
return nv50_ir::TYPE_F32;
case TGSI_OPCODE_I2D:
case TGSI_OPCODE_U2D:
case TGSI_OPCODE_F2D:
+ case TGSI_OPCODE_I642D:
+ case TGSI_OPCODE_U642D:
return nv50_ir::TYPE_F64;
+ case TGSI_OPCODE_I2I64:
+ case TGSI_OPCODE_U2I64:
+ case TGSI_OPCODE_F2I64:
+ case TGSI_OPCODE_D2I64:
+ return nv50_ir::TYPE_S64;
+ case TGSI_OPCODE_F2U64:
+ case TGSI_OPCODE_D2U64:
+ return nv50_ir::TYPE_U64;
default:
return inferSrcType();
}
@@ -676,6 +734,8 @@ nv50_ir::CondCode Instruction::getSetCond() const
case TGSI_OPCODE_USLT:
case TGSI_OPCODE_FSLT:
case TGSI_OPCODE_DSLT:
+ case TGSI_OPCODE_I64SLT:
+ case TGSI_OPCODE_U64SLT:
return CC_LT;
case TGSI_OPCODE_SLE:
return CC_LE;
@@ -684,6 +744,8 @@ nv50_ir::CondCode Instruction::getSetCond() const
case TGSI_OPCODE_USGE:
case TGSI_OPCODE_FSGE:
case TGSI_OPCODE_DSGE:
+ case TGSI_OPCODE_I64SGE:
+ case TGSI_OPCODE_U64SGE:
return CC_GE;
case TGSI_OPCODE_SGT:
return CC_GT;
@@ -691,10 +753,12 @@ nv50_ir::CondCode Instruction::getSetCond() const
case TGSI_OPCODE_USEQ:
case TGSI_OPCODE_FSEQ:
case TGSI_OPCODE_DSEQ:
+ case TGSI_OPCODE_U64SEQ:
return CC_EQ;
case TGSI_OPCODE_SNE:
case TGSI_OPCODE_FSNE:
case TGSI_OPCODE_DSNE:
+ case TGSI_OPCODE_U64SNE:
return CC_NEU;
case TGSI_OPCODE_USNE:
return CC_NE;
@@ -832,6 +896,35 @@ static nv50_ir::operation translateOpcode(uint opcode)
NV50_IR_OPCODE_CASE(DFLR, FLOOR);
NV50_IR_OPCODE_CASE(DROUND, CVT);
+ NV50_IR_OPCODE_CASE(U64SEQ, SET);
+ NV50_IR_OPCODE_CASE(U64SNE, SET);
+ NV50_IR_OPCODE_CASE(U64SLT, SET);
+ NV50_IR_OPCODE_CASE(U64SGE, SET);
+ NV50_IR_OPCODE_CASE(I64SLT, SET);
+ NV50_IR_OPCODE_CASE(I64SGE, SET);
+ NV50_IR_OPCODE_CASE(I2I64, CVT);
+ NV50_IR_OPCODE_CASE(U2I64, CVT);
+ NV50_IR_OPCODE_CASE(F2I64, CVT);
+ NV50_IR_OPCODE_CASE(F2U64, CVT);
+ NV50_IR_OPCODE_CASE(D2I64, CVT);
+ NV50_IR_OPCODE_CASE(D2U64, CVT);
+ NV50_IR_OPCODE_CASE(I642F, CVT);
+ NV50_IR_OPCODE_CASE(U642F, CVT);
+ NV50_IR_OPCODE_CASE(I642D, CVT);
+ NV50_IR_OPCODE_CASE(U642D, CVT);
+
+ NV50_IR_OPCODE_CASE(I64MIN, MIN);
+ NV50_IR_OPCODE_CASE(U64MIN, MIN);
+ NV50_IR_OPCODE_CASE(I64MAX, MAX);
+ NV50_IR_OPCODE_CASE(U64MAX, MAX);
+ NV50_IR_OPCODE_CASE(I64ABS, ABS);
+ NV50_IR_OPCODE_CASE(I64NEG, NEG);
+ NV50_IR_OPCODE_CASE(U64ADD, ADD);
+ NV50_IR_OPCODE_CASE(U64MUL, MUL);
+ NV50_IR_OPCODE_CASE(U64SHL, SHL);
+ NV50_IR_OPCODE_CASE(I64SHR, SHR);
+ NV50_IR_OPCODE_CASE(U64SHR, SHR);
+
NV50_IR_OPCODE_CASE(IMUL_HI, MUL);
NV50_IR_OPCODE_CASE(UMUL_HI, MUL);
@@ -3721,6 +3814,8 @@ Converter::handleInstruction(const struct tgsi_full_instruction *insn)
case TGSI_OPCODE_INTERP_OFFSET:
handleINTERP(dst0);
break;
+ case TGSI_OPCODE_I642F:
+ case TGSI_OPCODE_U642F:
case TGSI_OPCODE_D2I:
case TGSI_OPCODE_D2U:
case TGSI_OPCODE_D2F: {
@@ -3737,16 +3832,79 @@ Converter::handleInstruction(const struct tgsi_full_instruction *insn)
}
break;
}
+ case TGSI_OPCODE_I2I64:
+ FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi) {
+ dst0[c] = fetchSrc(0, c / 2);
+ mkOp2(OP_SHR, TYPE_S32, dst0[c + 1], dst0[c], loadImm(NULL, 31));
+ c++;
+ }
+ break;
+ case TGSI_OPCODE_U2I64:
+ FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi) {
+ dst0[c] = fetchSrc(0, c / 2);
+ dst0[c + 1] = zero;
+ c++;
+ }
+ break;
+ case TGSI_OPCODE_F2I64:
+ case TGSI_OPCODE_F2U64:
case TGSI_OPCODE_I2D:
case TGSI_OPCODE_U2D:
case TGSI_OPCODE_F2D:
FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi) {
Value *dreg = getSSA(8);
- mkCvt(OP_CVT, dstTy, dreg, srcTy, fetchSrc(0, c / 2));
+ Instruction *cvt = mkCvt(OP_CVT, dstTy, dreg, srcTy, fetchSrc(0, c / 2));
+ if (!isFloatType(dstTy))
+ cvt->rnd = ROUND_Z;
mkSplit(&dst0[c], 4, dreg);
c++;
}
break;
+ case TGSI_OPCODE_D2I64:
+ case TGSI_OPCODE_D2U64:
+ case TGSI_OPCODE_I642D:
+ case TGSI_OPCODE_U642D:
+ FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi) {
+ src0 = getSSA(8);
+ Value *dst = getSSA(8), *tmp[2];
+ tmp[0] = fetchSrc(0, c);
+ tmp[1] = fetchSrc(0, c + 1);
+ mkOp2(OP_MERGE, TYPE_U64, src0, tmp[0], tmp[1]);
+ Instruction *cvt = mkCvt(OP_CVT, dstTy, dst, srcTy, src0);
+ if (!isFloatType(dstTy))
+ cvt->rnd = ROUND_Z;
+ mkSplit(&dst0[c], 4, dst);
+ c++;
+ }
+ break;
+ case TGSI_OPCODE_I64NEG:
+ FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi) {
+ src0 = getSSA(8);
+ Value *dst = getSSA(8), *tmp[2];
+ tmp[0] = fetchSrc(0, c);
+ tmp[1] = fetchSrc(0, c + 1);
+ mkOp2(OP_MERGE, TYPE_U64, src0, tmp[0], tmp[1]);
+ mkOp2(OP_SUB, dstTy, dst, zero, src0);
+ mkSplit(&dst0[c], 4, dst);
+ c++;
+ }
+ break;
+ case TGSI_OPCODE_I64ABS:
+ FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi) {
+ src0 = getSSA(8);
+ Value *neg = getSSA(8), *srcComp[2], *negComp[2];
+ srcComp[0] = fetchSrc(0, c);
+ srcComp[1] = fetchSrc(0, c + 1);
+ mkOp2(OP_MERGE, TYPE_U64, src0, srcComp[0], srcComp[1]);
+ mkOp2(OP_SUB, dstTy, neg, zero, src0);
+ mkSplit(negComp, 4, neg);
+ mkCmp(OP_SLCT, CC_LT, TYPE_S32, dst0[c], TYPE_S32,
+ negComp[0], srcComp[0], srcComp[1]);
+ mkCmp(OP_SLCT, CC_LT, TYPE_S32, dst0[c + 1], TYPE_S32,
+ negComp[1], srcComp[1], srcComp[1]);
+ c++;
+ }
+ break;
case TGSI_OPCODE_DABS:
case TGSI_OPCODE_DNEG:
case TGSI_OPCODE_DRCP:
@@ -3779,6 +3937,12 @@ Converter::handleInstruction(const struct tgsi_full_instruction *insn)
c++;
}
break;
+ case TGSI_OPCODE_U64SEQ:
+ case TGSI_OPCODE_U64SNE:
+ case TGSI_OPCODE_U64SLT:
+ case TGSI_OPCODE_U64SGE:
+ case TGSI_OPCODE_I64SLT:
+ case TGSI_OPCODE_I64SGE:
case TGSI_OPCODE_DSLT:
case TGSI_OPCODE_DSGE:
case TGSI_OPCODE_DSEQ:
@@ -3800,6 +3964,46 @@ Converter::handleInstruction(const struct tgsi_full_instruction *insn)
}
break;
}
+ case TGSI_OPCODE_U64MIN:
+ case TGSI_OPCODE_U64MAX:
+ case TGSI_OPCODE_I64MIN:
+ case TGSI_OPCODE_I64MAX: {
+ dstTy = isSignedIntType(dstTy) ? TYPE_S32 : TYPE_U32;
+ FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi) {
+ Value *flag = getSSA(1, FILE_FLAGS);
+ src0 = fetchSrc(0, c + 1);
+ src1 = fetchSrc(1, c + 1);
+ geni = mkOp2(op, dstTy, dst0[c + 1], src0, src1);
+ geni->subOp = NV50_IR_SUBOP_MINMAX_HIGH;
+ geni->setFlagsDef(1, flag);
+
+ src0 = fetchSrc(0, c);
+ src1 = fetchSrc(1, c);
+ geni = mkOp2(op, TYPE_U32, dst0[c], src0, src1);
+ geni->subOp = NV50_IR_SUBOP_MINMAX_LOW;
+ geni->setFlagsSrc(2, flag);
+
+ c++;
+ }
+ break;
+ }
+ case TGSI_OPCODE_U64SHL:
+ case TGSI_OPCODE_I64SHR:
+ case TGSI_OPCODE_U64SHR:
+ FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi) {
+ src0 = getSSA(8);
+ Value *dst = getSSA(8), *tmp[2];
+ tmp[0] = fetchSrc(0, c);
+ tmp[1] = fetchSrc(0, c + 1);
+ mkOp2(OP_MERGE, TYPE_U64, src0, tmp[0], tmp[1]);
+ src1 = fetchSrc(1, c / 2);
+ mkOp2(op, dstTy, dst, src0, src1);
+ mkSplit(&dst0[c], 4, dst);
+ c++;
+ }
+ break;
+ case TGSI_OPCODE_U64ADD:
+ case TGSI_OPCODE_U64MUL:
case TGSI_OPCODE_DADD:
case TGSI_OPCODE_DMUL:
case TGSI_OPCODE_DDIV:
@@ -3873,6 +4077,22 @@ Converter::handleInstruction(const struct tgsi_full_instruction *insn)
mkSplit(&dst0[c], 4, dst);
c++;
}
+ case TGSI_OPCODE_I64SSG:
+ FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi) {
+ src0 = getSSA(8);
+ Value *tmp[2];
+ tmp[0] = fetchSrc(0, c);
+ tmp[1] = fetchSrc(0, c + 1);
+ mkOp2(OP_MERGE, TYPE_U64, src0, tmp[0], tmp[1]);
+
+ val0 = getScratch();
+ val1 = getScratch();
+ mkCmp(OP_SET, CC_GT, TYPE_U32, val0, TYPE_S64, src0, zero);
+ mkCmp(OP_SET, CC_LT, TYPE_U32, val1, TYPE_S64, src0, zero);
+ mkOp2(OP_SUB, TYPE_S32, dst0[c], val1, val0);
+ mkOp2(OP_SHR, TYPE_S32, dst0[c + 1], dst0[c], loadImm(0, 31));
+ c++;
+ }
break;
default:
ERROR("unhandled TGSI opcode: %u\n", tgsi.getOpcode());
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
index ec50578c32a..772ea616f16 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
@@ -147,6 +147,59 @@ NVC0LegalizeSSA::handleTEXLOD(TexInstruction *i)
i->moveSources(arg + 1, -1);
}
+void
+NVC0LegalizeSSA::handleShift(Instruction *lo) // Rewrites the shift `lo` into a pair of same-opcode instructions with 32-bit defs, then merges their results back into lo's original def.
+{
+ Instruction *hi = new_Instruction(func, lo->op, TYPE_U32); // second instruction, same opcode as lo
+ lo->bb->insertAfter(lo, hi); // placed directly after lo in lo's basic block
+ bld.setPosition(lo, false); // NOTE(review): presumably puts the builder before lo so the split below precedes it -- confirm
+
+ Value *src[2], *dst[2] = {bld.getSSA(), bld.getSSA()}; // fresh SSA values for the two result pieces
+ Value *dst64 = lo->getDef(0), *shift = lo->getSrc(1); // keep the original def and the shift amount before they are overwritten
+ bld.mkSplit(src, 4, lo->getSrc(0)); // break source 0 into 4-byte pieces src[0] / src[1] (presumably low / high word -- confirm)
+
+ hi->sType = lo->sType; // the new instruction inherits lo's source type
+ lo->dType = TYPE_U32; // lo now defines a 32-bit value
+
+ hi->setDef(0, dst[1]);
+ if (lo->op == OP_SHR)
+ hi->subOp |= NV50_IR_SUBOP_SHIFT_HIGH; // only the new instruction of a right shift gets this sub-op
+ lo->setDef(0, dst[0]);
+
+ bld.setPosition(hi, true); // NOTE(review): presumably moves the builder after the new instruction so the MERGE below follows both shifts -- confirm
+
+ if (lo->op == OP_SHL)
+ std::swap(hi, lo); // for SHL the names trade places: `hi` is now the original instruction (def dst[0]), `lo` the new one (def dst[1])
+
+ hi->setSrc(0, new_ImmediateValue(prog, 0u)); // source 0 is an immediate zero
+ hi->setSrc(1, shift);
+ hi->setSrc(2, lo->op == OP_SHL ? src[0] : src[1]); // both instructions share one opcode, so this test still tells SHL from SHR after the swap
+
+ lo->setSrc(0, src[0]);
+ lo->setSrc(1, shift);
+ lo->setSrc(2, src[1]);
+
+ bld.mkOp2(OP_MERGE, TYPE_U64, dst64, dst[0], dst[1]); // recombine the two pieces into the value the original shift defined
+}
+
+void
+NVC0LegalizeSSA::handleSET(CmpInstruction *cmp) // Rewrites `cmp` to compare piece 1 of each split source using a 32-bit source type, with an extra flags source produced by subtracting piece 0 of each source.
+{
+ DataType hTy = cmp->sType == TYPE_S64 ? TYPE_S32 : TYPE_U32; // S32 pieces for a TYPE_S64 compare, U32 for any other source type
+ Value *carry;
+ Value *src0[2], *src1[2];
+ bld.setPosition(cmp, false); // NOTE(review): presumably inserts the new instructions before cmp -- confirm
+
+ bld.mkSplit(src0, 4, cmp->getSrc(0)); // split both operands into 4-byte pieces
+ bld.mkSplit(src1, 4, cmp->getSrc(1));
+ bld.mkOp2(OP_SUB, hTy, NULL, src0[0], src1[0]) // subtract piece 0 of each operand; no value def (NULL), only the flags def below is kept
+ ->setFlagsDef(1, (carry = bld.getSSA(1, FILE_FLAGS)));
+ cmp->setFlagsSrc(cmp->srcCount(), carry); // flags value goes in the slot just past the existing sources
+ cmp->setSrc(0, src0[1]); // the compare itself now works on piece 1 of each operand
+ cmp->setSrc(1, src1[1]);
+ cmp->sType = hTy; // source type narrowed to match the 32-bit pieces
+}
+
bool
NVC0LegalizeSSA::visit(Function *fn)
{
@@ -179,6 +232,18 @@ NVC0LegalizeSSA::visit(BasicBlock *bb)
case OP_TXF:
handleTEXLOD(i->asTex());
break;
+ case OP_SHR:
+ case OP_SHL:
+ if (typeSizeof(i->sType) == 8)
+ handleShift(i);
+ break;
+ case OP_SET:
+ case OP_SET_AND:
+ case OP_SET_OR:
+ case OP_SET_XOR:
+ if (typeSizeof(i->sType) == 8 && i->sType != TYPE_F64)
+ handleSET(i->asCmp());
+ break;
default:
break;
}
@@ -612,7 +677,7 @@ NVC0LegalizePostRA::visit(BasicBlock *bb)
} else {
// TODO: Move this to before register allocation for operations that
// need the $c register !
- if (typeSizeof(i->dType) == 8) {
+ if (typeSizeof(i->sType) == 8 || typeSizeof(i->dType) == 8) {
Instruction *hi;
hi = BuildUtil::split64BitOpPostRA(func, i, rZero, carry);
if (hi)
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.h b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.h
index d91b6aa1c72..7fae7e24b99 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.h
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.h
@@ -35,7 +35,9 @@ private:
void handleDIV(Instruction *); // integer division, modulus
void handleRCPRSQ(Instruction *); // double precision float recip/rsqrt
void handleFTZ(Instruction *);
+ void handleSET(CmpInstruction *);
void handleTEXLOD(TexInstruction *);
+ void handleShift(Instruction *);
protected:
BuildUtil bld;
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
index d788b36e1df..d79e87dc9f6 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
@@ -1054,8 +1054,12 @@ ConstantFolding::opnd(Instruction *i, ImmediateValue &imm0, int s)
i->setSrc(1, new_ImmediateValue(prog, imm0.reg.data.u32));
}
break;
- case OP_ADD:
case OP_SUB:
+ if (imm0.isInteger(0) && s == 0 && typeSizeof(i->dType) == 8 &&
+ !isFloatType(i->dType))
+ break;
+ /* fallthrough */
+ case OP_ADD:
if (i->usesFlags())
break;
if (imm0.isInteger(0)) {
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_target.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_target.cpp
index 273ec34fd3b..298e7c6ef9b 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_target.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_target.cpp
@@ -392,7 +392,8 @@ Program::emitBinary(struct nv50_ir_prog_info *info)
for (Instruction *i = fn->bbArray[b]->getEntry(); i; i = i->next) {
emit->emitInstruction(i);
info->bin.instructions++;
- if (i->sType == TYPE_F64 || i->dType == TYPE_F64)
+ if ((typeSizeof(i->sType) == 8 || typeSizeof(i->dType) == 8) &&
+ (isFloatType(i->sType) || isFloatType(i->dType)))
info->io.fp64 = true;
}
}
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.cpp
index 8a67b0afbca..abdb3289499 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.cpp
@@ -329,6 +329,10 @@ TargetNVC0::insnCanLoad(const Instruction *i, int s,
// indirect loads can only be done by OP_LOAD/VFETCH/INTERP on nvc0
if (ld->src(0).isIndirect(0))
return false;
+ // these are implemented using shf.r and shf.l which can't load consts
+ if ((i->op == OP_SHL || i->op == OP_SHR) && typeSizeof(i->sType) == 8 &&
+ sf == FILE_MEMORY_CONST)
+ return false;
for (int k = 0; i->srcExists(k); ++k) {
if (i->src(k).getFile() == FILE_IMMEDIATE) {
@@ -340,7 +344,8 @@ TargetNVC0::insnCanLoad(const Instruction *i, int s,
return false;
} else
if (i->src(k).getFile() != FILE_GPR &&
- i->src(k).getFile() != FILE_PREDICATE) {
+ i->src(k).getFile() != FILE_PREDICATE &&
+ i->src(k).getFile() != FILE_FLAGS) {
return false;
}
}