summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorVincent Lejeune <vljn@ovi.com>2013-06-26 18:09:58 +0200
committerTom Stellard <thomas.stellard@amd.com>2013-06-28 18:23:01 -0400
commit4accdea1dc1d434349575b39b5670b747126260a (patch)
tree913777940a7a154a2cfc8915a5c0ab5717e4ad02
parentafb82e47485253dc1827acf5660381660e728ab6 (diff)
R600: Support schedule and packetization of trans-only instvliw5-rebase
-rw-r--r--lib/Target/R600/R600InstrInfo.cpp185
-rw-r--r--lib/Target/R600/R600InstrInfo.h32
-rw-r--r--lib/Target/R600/R600Instructions.td1
-rw-r--r--lib/Target/R600/R600MachineScheduler.cpp25
-rw-r--r--lib/Target/R600/R600MachineScheduler.h1
-rw-r--r--lib/Target/R600/R600Packetizer.cpp90
-rw-r--r--lib/Target/R600/R600RegisterInfo.td1
-rw-r--r--test/CodeGen/R600/fdiv.ll8
-rw-r--r--test/CodeGen/R600/fp_to_sint.ll8
-rw-r--r--test/CodeGen/R600/fp_to_uint.ll8
-rw-r--r--test/CodeGen/R600/llvm.cos.ll2
-rw-r--r--test/CodeGen/R600/llvm.pow.ll4
-rw-r--r--test/CodeGen/R600/llvm.sin.ll2
13 files changed, 261 insertions, 106 deletions
diff --git a/lib/Target/R600/R600InstrInfo.cpp b/lib/Target/R600/R600InstrInfo.cpp
index 354f039c1e8..974a5794505 100644
--- a/lib/Target/R600/R600InstrInfo.cpp
+++ b/lib/Target/R600/R600InstrInfo.cpp
@@ -250,8 +250,9 @@ R600InstrInfo::getSrcs(MachineInstr *MI) const {
std::vector<std::pair<int, unsigned> >
R600InstrInfo::ExtractSrcs(MachineInstr *MI,
- const DenseMap<unsigned, unsigned> &PV)
- const {
+ const DenseMap<unsigned, unsigned> &PV,
+ unsigned &ConstCount) const {
+ ConstCount = 0;
const SmallVector<std::pair<MachineOperand *, int64_t>, 3> Srcs = getSrcs(MI);
const std::pair<int, unsigned> DummyPair(-1, 0);
std::vector<std::pair<int, unsigned> > Result;
@@ -259,18 +260,20 @@ R600InstrInfo::ExtractSrcs(MachineInstr *MI,
for (unsigned n = Srcs.size(); i < n; ++i) {
unsigned Reg = Srcs[i].first->getReg();
unsigned Index = RI.getEncodingValue(Reg) & 0xff;
- unsigned Chan = RI.getHWRegChan(Reg);
if (Reg == AMDGPU::OQAP) {
Result.push_back(std::pair<int, unsigned>(Index, 0));
}
- if (Index > 127) {
- Result.push_back(DummyPair);
+ if (PV.find(Reg) != PV.end()) {
+ // 255 is used to tells its a PS/PV reg
+ Result.push_back(std::pair<int, unsigned>(255, 0));
continue;
}
- if (PV.find(Reg) != PV.end()) {
+ if (Index > 127) {
+ ConstCount++;
Result.push_back(DummyPair);
continue;
}
+ unsigned Chan = RI.getHWRegChan(Reg);
Result.push_back(std::pair<int, unsigned>(Index, Chan));
}
for (; i < 3; ++i)
@@ -305,23 +308,51 @@ Swizzle(std::vector<std::pair<int, unsigned> > Src,
return Src;
}
-bool
-R600InstrInfo::isLegal(
- const std::vector<std::vector<std::pair<int, unsigned> > > &IGSrcs,
- const std::vector<R600InstrInfo::BankSwizzle> &Swz,
- unsigned CheckedSize) const {
+static unsigned
+getTransSwizzle(R600InstrInfo::BankSwizzle Swz, unsigned Op) {
+ switch (Swz) {
+ case R600InstrInfo::ALU_VEC_012_SCL_210: {
+ unsigned Cycles[3] = { 2, 1, 0};
+ return Cycles[Op];
+ }
+ case R600InstrInfo::ALU_VEC_021_SCL_122: {
+ unsigned Cycles[3] = { 1, 2, 2};
+ return Cycles[Op];
+ }
+ case R600InstrInfo::ALU_VEC_120_SCL_212: {
+ unsigned Cycles[3] = { 2, 1, 2};
+ return Cycles[Op];
+ }
+ case R600InstrInfo::ALU_VEC_102_SCL_221: {
+ unsigned Cycles[3] = { 2, 2, 1};
+ return Cycles[Op];
+ }
+ default:
+ llvm_unreachable("Wrong Swizzle for Trans Slot");
+ return 0;
+ }
+}
+
+/// returns how many MIs (whose inputs are represented by IGSrcs) can be packed
+/// in the same Instruction Group while meeting read port limitations given a
+/// Swz swizzle sequence.
+unsigned R600InstrInfo::isLegalUpTo(
+ const std::vector<std::vector<std::pair<int, unsigned> > > &IGSrcs,
+ const std::vector<R600InstrInfo::BankSwizzle> &Swz,
+ const std::vector<std::pair<int, unsigned> > &TransSrcs,
+ R600InstrInfo::BankSwizzle TransSwz) const {
int Vector[4][3];
memset(Vector, -1, sizeof(Vector));
- for (unsigned i = 0; i < CheckedSize; i++) {
+ for (unsigned i = 0, e = IGSrcs.size(); i < e; i++) {
const std::vector<std::pair<int, unsigned> > &Srcs =
Swizzle(IGSrcs[i], Swz[i]);
for (unsigned j = 0; j < 3; j++) {
const std::pair<int, unsigned> &Src = Srcs[j];
- if (Src.first < 0)
+ if (Src.first < 0 || Src.first == 255)
continue;
if (Src.first == GET_REG_INDEX(RI.getEncodingValue(AMDGPU::OQAP))) {
- if (Swz[i] != R600InstrInfo::ALU_VEC_012 &&
- Swz[i] != R600InstrInfo::ALU_VEC_021) {
+ if (Swz[i] != R600InstrInfo::ALU_VEC_012_SCL_210 &&
+ Swz[i] != R600InstrInfo::ALU_VEC_021_SCL_122) {
// The value from output queue A (denoted by register OQAP) can
// only be fetched during the first cycle.
return false;
@@ -332,51 +363,126 @@ R600InstrInfo::isLegal(
if (Vector[Src.second][j] < 0)
Vector[Src.second][j] = Src.first;
if (Vector[Src.second][j] != Src.first)
- return false;
+ return i;
}
}
- return true;
+ // Now check Trans Alu
+ for (unsigned i = 0, e = TransSrcs.size(); i < e; ++i) {
+ const std::pair<int, unsigned> &Src = TransSrcs[i];
+ unsigned Cycle = getTransSwizzle(TransSwz, i);
+ if (Src.first < 0)
+ continue;
+ if (Src.first == 255)
+ continue;
+ if (Vector[Src.second][Cycle] < 0)
+ Vector[Src.second][Cycle] = Src.first;
+ if (Vector[Src.second][Cycle] != Src.first)
+ return IGSrcs.size() - 1;
+ }
+ return IGSrcs.size();
}
-bool
-R600InstrInfo::recursiveFitsFPLimitation(
- const std::vector<std::vector<std::pair<int, unsigned> > > &IGSrcs,
- std::vector<R600InstrInfo::BankSwizzle> &SwzCandidate,
- unsigned Depth) const {
- if (!isLegal(IGSrcs, SwzCandidate, Depth))
+/// Given a swizzle sequence SwzCandidate and an index Idx, returns the next
+/// (in lexicographic term) swizzle sequence assuming that all swizzles after
+/// Idx can be skipped
+static bool
+NextPossibleSolution(
+ std::vector<R600InstrInfo::BankSwizzle> &SwzCandidate,
+ unsigned Idx) {
+ assert(Idx < SwzCandidate.size());
+ int ResetIdx = Idx;
+ while (ResetIdx > -1 && SwzCandidate[ResetIdx] == R600InstrInfo::ALU_VEC_210)
+ ResetIdx --;
+ for (unsigned i = ResetIdx + 1, e = SwzCandidate.size(); i < e; i++) {
+ SwzCandidate[i] = R600InstrInfo::ALU_VEC_012_SCL_210;
+ }
+ if (ResetIdx == -1)
return false;
- if (IGSrcs.size() == Depth)
- return true;
- unsigned i = SwzCandidate[Depth];
- for (; i < 6; i++) {
- SwzCandidate[Depth] = (R600InstrInfo::BankSwizzle) i;
- if (recursiveFitsFPLimitation(IGSrcs, SwzCandidate, Depth + 1))
+ SwzCandidate[ResetIdx]++;
+ return true;
+}
+
+/// Enumerate all possible Swizzle sequence to find one that can meet all
+/// read port requirements.
+bool R600InstrInfo::FindSwizzleForVectorSlot(
+ const std::vector<std::vector<std::pair<int, unsigned> > > &IGSrcs,
+ std::vector<R600InstrInfo::BankSwizzle> &SwzCandidate,
+ const std::vector<std::pair<int, unsigned> > &TransSrcs,
+ R600InstrInfo::BankSwizzle TransSwz) const {
+ unsigned ValidUpTo = 0;
+ do {
+ ValidUpTo = isLegalUpTo(IGSrcs, SwzCandidate, TransSrcs, TransSwz);
+ if (ValidUpTo == IGSrcs.size())
return true;
- }
- SwzCandidate[Depth] = R600InstrInfo::ALU_VEC_012;
+ } while (NextPossibleSolution(SwzCandidate, ValidUpTo));
return false;
}
+/// Instructions in Trans slot can't read gpr at cycle 0 if they also read
+/// a const, and can't read a gpr at cycle 1 if they read 2 const.
+static bool
+isConstCompatible(R600InstrInfo::BankSwizzle TransSwz,
+ const std::vector<std::pair<int, unsigned> > &TransOps,
+ unsigned ConstCount) {
+ for (unsigned i = 0, e = TransOps.size(); i < e; ++i) {
+ const std::pair<int, unsigned> &Src = TransOps[i];
+ unsigned Cycle = getTransSwizzle(TransSwz, i);
+ if (Src.first < 0)
+ continue;
+ if (ConstCount > 0 && Cycle == 0)
+ return false;
+ if (ConstCount > 1 && Cycle == 1)
+ return false;
+ }
+ return true;
+}
+
bool
R600InstrInfo::fitsReadPortLimitations(const std::vector<MachineInstr *> &IG,
- const DenseMap<unsigned, unsigned> &PV,
- std::vector<BankSwizzle> &ValidSwizzle)
+ const DenseMap<unsigned, unsigned> &PV,
+ std::vector<BankSwizzle> &ValidSwizzle,
+ bool isLastAluTrans)
const {
//Todo : support shared src0 - src1 operand
std::vector<std::vector<std::pair<int, unsigned> > > IGSrcs;
ValidSwizzle.clear();
+ unsigned ConstCount;
+ BankSwizzle TransBS;
for (unsigned i = 0, e = IG.size(); i < e; ++i) {
- IGSrcs.push_back(ExtractSrcs(IG[i], PV));
+ IGSrcs.push_back(ExtractSrcs(IG[i], PV, ConstCount));
unsigned Op = getOperandIdx(IG[i]->getOpcode(),
AMDGPU::OpName::bank_swizzle);
ValidSwizzle.push_back( (R600InstrInfo::BankSwizzle)
IG[i]->getOperand(Op).getImm());
}
- bool Result = recursiveFitsFPLimitation(IGSrcs, ValidSwizzle);
- if (!Result)
- return false;
- return true;
+ std::vector<std::pair<int, unsigned> > TransOps;
+ if (!isLastAluTrans)
+ return FindSwizzleForVectorSlot(IGSrcs, ValidSwizzle, TransOps, TransBS);
+
+ TransOps = IGSrcs.back();
+ IGSrcs.pop_back();
+ ValidSwizzle.pop_back();
+
+ static const R600InstrInfo::BankSwizzle TransSwz[] = {
+ ALU_VEC_012_SCL_210,
+ ALU_VEC_021_SCL_122,
+ ALU_VEC_120_SCL_212,
+ ALU_VEC_102_SCL_221
+ };
+ for (unsigned i = 0; i < 4; i++) {
+ TransBS = TransSwz[i];
+ if (!isConstCompatible(TransBS, TransOps, ConstCount))
+ continue;
+ bool Result = FindSwizzleForVectorSlot(IGSrcs, ValidSwizzle, TransOps,
+ TransBS);
+ if (Result) {
+ ValidSwizzle.push_back(TransBS);
+ return true;
+ }
+ }
+
+ return false;
}
@@ -406,7 +512,8 @@ R600InstrInfo::fitsConstReadLimitations(const std::vector<unsigned> &Consts)
}
bool
-R600InstrInfo::canBundle(const std::vector<MachineInstr *> &MIs) const {
+R600InstrInfo::fitsConstReadLimitations(const std::vector<MachineInstr *> &MIs)
+ const {
std::vector<unsigned> Consts;
for (unsigned i = 0, n = MIs.size(); i < n; i++) {
MachineInstr *MI = MIs[i];
diff --git a/lib/Target/R600/R600InstrInfo.h b/lib/Target/R600/R600InstrInfo.h
index a6add8382cf..1ba4160747b 100644
--- a/lib/Target/R600/R600InstrInfo.h
+++ b/lib/Target/R600/R600InstrInfo.h
@@ -84,26 +84,38 @@ namespace llvm {
SmallVector<std::pair<MachineOperand *, int64_t>, 3>
getSrcs(MachineInstr *MI) const;
- bool isLegal(
- const std::vector<std::vector<std::pair<int, unsigned> > > &IGSrcs,
- const std::vector<R600InstrInfo::BankSwizzle> &Swz,
- unsigned CheckedSize) const;
- bool recursiveFitsFPLimitation(
- const std::vector<std::vector<std::pair<int, unsigned> > > &IGSrcs,
- std::vector<R600InstrInfo::BankSwizzle> &SwzCandidate,
- unsigned Depth = 0) const;
+ unsigned isLegalUpTo(
+ const std::vector<std::vector<std::pair<int, unsigned> > > &IGSrcs,
+ const std::vector<R600InstrInfo::BankSwizzle> &Swz,
+ const std::vector<std::pair<int, unsigned> > &TransSrcs,
+ R600InstrInfo::BankSwizzle TransSwz) const;
+
+ bool FindSwizzleForVectorSlot(
+ const std::vector<std::vector<std::pair<int, unsigned> > > &IGSrcs,
+ std::vector<R600InstrInfo::BankSwizzle> &SwzCandidate,
+ const std::vector<std::pair<int, unsigned> > &TransSrcs,
+ R600InstrInfo::BankSwizzle TransSwz) const;
/// Given the order VEC_012 < VEC_021 < VEC_120 < VEC_102 < VEC_201 < VEC_210
/// returns true and the first (in lexical order) BankSwizzle affectation
/// starting from the one already provided in the Instruction Group MIs that
/// fits Read Port limitations in BS if available. Otherwise returns false
/// and undefined content in BS.
+ /// isLastAluTrans should be set if the last Alu of MIs will be executed on
+ /// Trans ALU. In this case, ValidTSwizzle returns the BankSwizzle value to
+ /// apply to the last instruction.
/// PV holds GPR to PV registers in the Instruction Group MIs.
bool fitsReadPortLimitations(const std::vector<MachineInstr *> &MIs,
const DenseMap<unsigned, unsigned> &PV,
- std::vector<BankSwizzle> &BS) const;
+ std::vector<BankSwizzle> &BS,
+ bool isLastAluTrans) const;
+
+ /// An instruction group can only access 2 channel pair (either [XY] or [ZW])
+ /// from KCache bank on R700+. This function check if MI set in input meet
+ /// this limitations
+ bool fitsConstReadLimitations(const std::vector<MachineInstr *> &) const;
+ /// Same but using const index set instead of MI set.
bool fitsConstReadLimitations(const std::vector<unsigned>&) const;
- bool canBundle(const std::vector<MachineInstr *> &) const;
/// \breif Vector instructions are instructions that must fill all
/// instruction slots within an instruction group.
diff --git a/lib/Target/R600/R600Instructions.td b/lib/Target/R600/R600Instructions.td
index fd585f8b7e9..862080d9ca4 100644
--- a/lib/Target/R600/R600Instructions.td
+++ b/lib/Target/R600/R600Instructions.td
@@ -1489,6 +1489,7 @@ let hasSideEffects = 1 in {
def FLT_TO_INT_eg : FLT_TO_INT_Common<0x50> {
let Pattern = [];
+ let TransOnly = 0;
}
def INT_TO_FLT_eg : INT_TO_FLT_Common<0x9B>;
diff --git a/lib/Target/R600/R600MachineScheduler.cpp b/lib/Target/R600/R600MachineScheduler.cpp
index 7e28f9dde47..0dc0365926e 100644
--- a/lib/Target/R600/R600MachineScheduler.cpp
+++ b/lib/Target/R600/R600MachineScheduler.cpp
@@ -32,7 +32,7 @@ void R600SchedStrategy::initialize(ScheduleDAGMI *dag) {
MRI = &DAG->MRI;
CurInstKind = IDOther;
CurEmitted = 0;
- OccupedSlotsMask = 15;
+ OccupedSlotsMask = 31;
InstKindLimit[IDAlu] = TII->getMaxAlusPerClause();
InstKindLimit[IDOther] = 32;
@@ -160,7 +160,7 @@ void R600SchedStrategy::schedNode(SUnit *SU, bool IsTopNode) {
if (NextInstKind != CurInstKind) {
DEBUG(dbgs() << "Instruction Type Switch\n");
if (NextInstKind != IDAlu)
- OccupedSlotsMask = 15;
+ OccupedSlotsMask |= 31;
CurEmitted = 0;
CurInstKind = NextInstKind;
}
@@ -251,6 +251,9 @@ bool R600SchedStrategy::regBelongsToClass(unsigned Reg,
R600SchedStrategy::AluKind R600SchedStrategy::getAluKind(SUnit *SU) const {
MachineInstr *MI = SU->getInstr();
+ if (TII->isTransOnly(MI))
+ return AluTrans;
+
switch (MI->getOpcode()) {
case AMDGPU::PRED_X:
return AluPredX;
@@ -346,7 +349,7 @@ SUnit *R600SchedStrategy::PopInst(std::vector<SUnit *> &Q) {
It != E; ++It) {
SUnit *SU = *It;
InstructionsGroupCandidate.push_back(SU->getInstr());
- if (TII->canBundle(InstructionsGroupCandidate)) {
+ if (TII->fitsConstReadLimitations(InstructionsGroupCandidate)) {
InstructionsGroupCandidate.pop_back();
Q.erase((It + 1).base());
return SU;
@@ -421,7 +424,8 @@ unsigned R600SchedStrategy::AvailablesAluCount() const {
return AvailableAlus[AluAny].size() + AvailableAlus[AluT_XYZW].size() +
AvailableAlus[AluT_X].size() + AvailableAlus[AluT_Y].size() +
AvailableAlus[AluT_Z].size() + AvailableAlus[AluT_W].size() +
- AvailableAlus[AluDiscarded].size() + AvailableAlus[AluPredX].size();
+ AvailableAlus[AluTrans].size() + AvailableAlus[AluDiscarded].size() +
+ AvailableAlus[AluPredX].size();
}
SUnit* R600SchedStrategy::pickAlu() {
@@ -429,20 +433,27 @@ SUnit* R600SchedStrategy::pickAlu() {
if (!OccupedSlotsMask) {
// Bottom up scheduling : predX must comes first
if (!AvailableAlus[AluPredX].empty()) {
- OccupedSlotsMask = 15;
+ OccupedSlotsMask |= 31;
return PopInst(AvailableAlus[AluPredX]);
}
// Flush physical reg copies (RA will discard them)
if (!AvailableAlus[AluDiscarded].empty()) {
- OccupedSlotsMask = 15;
+ OccupedSlotsMask |= 31;
return PopInst(AvailableAlus[AluDiscarded]);
}
// If there is a T_XYZW alu available, use it
if (!AvailableAlus[AluT_XYZW].empty()) {
- OccupedSlotsMask = 15;
+ OccupedSlotsMask |= 15;
return PopInst(AvailableAlus[AluT_XYZW]);
}
}
+ bool TransSlotOccuped = OccupedSlotsMask & 16;
+ if (!TransSlotOccuped) {
+ if (!AvailableAlus[AluTrans].empty()) {
+ OccupedSlotsMask |= 16;
+ return PopInst(AvailableAlus[AluTrans]);
+ }
+ }
for (int Chan = 3; Chan > -1; --Chan) {
bool isOccupied = OccupedSlotsMask & (1 << Chan);
if (!isOccupied) {
diff --git a/lib/Target/R600/R600MachineScheduler.h b/lib/Target/R600/R600MachineScheduler.h
index aae8b3f4eb8..f8965d8998a 100644
--- a/lib/Target/R600/R600MachineScheduler.h
+++ b/lib/Target/R600/R600MachineScheduler.h
@@ -46,6 +46,7 @@ class R600SchedStrategy : public MachineSchedStrategy {
AluT_W,
AluT_XYZW,
AluPredX,
+ AluTrans,
AluDiscarded, // LLVM Instructions that are going to be eliminated
AluLast
};
diff --git a/lib/Target/R600/R600Packetizer.cpp b/lib/Target/R600/R600Packetizer.cpp
index 6fc15deb5c9..5ee51faea87 100644
--- a/lib/Target/R600/R600Packetizer.cpp
+++ b/lib/Target/R600/R600Packetizer.cpp
@@ -77,8 +77,6 @@ private:
do {
if (TII->isPredicated(BI))
continue;
- if (TII->isTransOnly(BI))
- continue;
int OperandIdx = TII->getOperandIdx(BI->getOpcode(), AMDGPU::OpName::write);
if (OperandIdx > -1 && BI->getOperand(OperandIdx).getImm() == 0)
continue;
@@ -87,6 +85,10 @@ private:
continue;
}
unsigned Dst = BI->getOperand(DstIdx).getReg();
+ if (TII->isTransOnly(BI)) {
+ Result[Dst] = AMDGPU::PS;
+ continue;
+ }
if (BI->getOpcode() == AMDGPU::DOT4_r600 ||
BI->getOpcode() == AMDGPU::DOT4_eg) {
Result[Dst] = AMDGPU::PV_X;
@@ -157,10 +159,6 @@ public:
return true;
if (!TII->isALUInstr(MI->getOpcode()))
return true;
- if (TII->get(MI->getOpcode()).TSFlags & R600_InstFlag::TRANS_ONLY)
- return true;
- if (TII->isTransOnly(MI))
- return true;
if (MI->getOpcode() == AMDGPU::GROUP_BARRIER)
return true;
return false;
@@ -170,7 +168,7 @@ public:
// together.
bool isLegalToPacketizeTogether(SUnit *SUI, SUnit *SUJ) {
MachineInstr *MII = SUI->getInstr(), *MIJ = SUJ->getInstr();
- if (getSlot(MII) <= getSlot(MIJ))
+ if (getSlot(MII) <= getSlot(MIJ) && !TII->isTransOnly(MII))
return false;
// Does MII and MIJ share the same pred_sel ?
int OpI = TII->getOperandIdx(MII->getOpcode(), AMDGPU::OpName::pred_sel),
@@ -204,11 +202,16 @@ public:
MI->getOperand(LastOp).setImm(Bit);
}
- MachineBasicBlock::iterator addToPacket(MachineInstr *MI) {
+ bool isBundlableWithCurrentPMI(MachineInstr *MI,
+ const DenseMap<unsigned, unsigned> &PV,
+ std::vector<R600InstrInfo::BankSwizzle> &BS,
+ bool &isTransSlot) {
+ isTransSlot = TII->isTransOnly(MI);
+
+ // Are the Constants limitations met ?
CurrentPacketMIs.push_back(MI);
- bool FitsConstLimits = TII->canBundle(CurrentPacketMIs);
- DEBUG(
- if (!FitsConstLimits) {
+ if (!TII->fitsConstReadLimitations(CurrentPacketMIs)) {
+ DEBUG(
dbgs() << "Couldn't pack :\n";
MI->dump();
dbgs() << "with the following packets :\n";
@@ -217,14 +220,15 @@ public:
dbgs() << "\n";
}
dbgs() << "because of Consts read limitations\n";
- });
- const DenseMap<unsigned, unsigned> &PV =
- getPreviousVector(CurrentPacketMIs.front());
- std::vector<R600InstrInfo::BankSwizzle> BS;
- bool FitsReadPortLimits =
- TII->fitsReadPortLimitations(CurrentPacketMIs, PV, BS);
- DEBUG(
- if (!FitsReadPortLimits) {
+ );
+ CurrentPacketMIs.pop_back();
+ return false;
+ }
+
+ // Is there a BankSwizzle set that meet Read Port limitations ?
+ if (!TII->fitsReadPortLimitations(CurrentPacketMIs,
+ PV, BS, isTransSlot)) {
+ DEBUG(
dbgs() << "Couldn't pack :\n";
MI->dump();
dbgs() << "with the following packets :\n";
@@ -233,25 +237,43 @@ public:
dbgs() << "\n";
}
dbgs() << "because of Read port limitations\n";
- });
- bool isBundlable = FitsConstLimits && FitsReadPortLimits;
- if (isBundlable) {
+ );
+ CurrentPacketMIs.pop_back();
+ return false;
+ }
+
+ CurrentPacketMIs.pop_back();
+ return true;
+ }
+
+ MachineBasicBlock::iterator addToPacket(MachineInstr *MI) {
+ MachineBasicBlock::iterator FirstInBundle =
+ CurrentPacketMIs.empty() ? MI : CurrentPacketMIs.front();
+ const DenseMap<unsigned, unsigned> &PV =
+ getPreviousVector(FirstInBundle);
+ std::vector<R600InstrInfo::BankSwizzle> BS;
+ bool isTransSlot;
+
+ if (isBundlableWithCurrentPMI(MI, PV, BS, isTransSlot)) {
for (unsigned i = 0, e = CurrentPacketMIs.size(); i < e; i++) {
MachineInstr *MI = CurrentPacketMIs[i];
- unsigned Op = TII->getOperandIdx(MI->getOpcode(),
- AMDGPU::OpName::bank_swizzle);
- MI->getOperand(Op).setImm(BS[i]);
+ unsigned Op = TII->getOperandIdx(MI->getOpcode(),
+ AMDGPU::OpName::bank_swizzle);
+ MI->getOperand(Op).setImm(BS[i]);
}
+ unsigned Op = TII->getOperandIdx(MI->getOpcode(),
+ AMDGPU::OpName::bank_swizzle);
+ MI->getOperand(Op).setImm(BS.back());
+ if (!CurrentPacketMIs.empty())
+ setIsLastBit(CurrentPacketMIs.back(), 0);
+ substitutePV(MI, PV);
+ MachineBasicBlock::iterator It = VLIWPacketizerList::addToPacket(MI);
+ if (isTransSlot) {
+ endPacket(llvm::next(It)->getParent(), llvm::next(It));
+ }
+ return It;
}
- CurrentPacketMIs.pop_back();
- if (!isBundlable) {
- endPacket(MI->getParent(), MI);
- substitutePV(MI, getPreviousVector(MI));
- return VLIWPacketizerList::addToPacket(MI);
- }
- if (!CurrentPacketMIs.empty())
- setIsLastBit(CurrentPacketMIs.back(), 0);
- substitutePV(MI, PV);
+ endPacket(MI->getParent(), MI);
return VLIWPacketizerList::addToPacket(MI);
}
};
diff --git a/lib/Target/R600/R600RegisterInfo.td b/lib/Target/R600/R600RegisterInfo.td
index 60a93e3f6c2..66aa91628e1 100644
--- a/lib/Target/R600/R600RegisterInfo.td
+++ b/lib/Target/R600/R600RegisterInfo.td
@@ -96,6 +96,7 @@ def PV_X : R600RegWithChan<"PV.X", 254, "X">;
def PV_Y : R600RegWithChan<"PV.Y", 254, "Y">;
def PV_Z : R600RegWithChan<"PV.Z", 254, "Z">;
def PV_W : R600RegWithChan<"PV.W", 254, "W">;
+def PS: R600Reg<"PS", 255>;
def PREDICATE_BIT : R600Reg<"PredicateBit", 0>;
def PRED_SEL_OFF: R600Reg<"Pred_sel_off", 0>;
def PRED_SEL_ZERO : R600Reg<"Pred_sel_zero", 2>;
diff --git a/test/CodeGen/R600/fdiv.ll b/test/CodeGen/R600/fdiv.ll
index 003590bb53d..f6eb6a66948 100644
--- a/test/CodeGen/R600/fdiv.ll
+++ b/test/CodeGen/R600/fdiv.ll
@@ -1,13 +1,13 @@
;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
;CHECK: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;CHECK: MUL_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;CHECK: MUL_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], PS}}
;CHECK: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;CHECK: MUL_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], PS}}
;CHECK: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;CHECK: MUL_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;CHECK: MUL_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;CHECK: MUL_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], PS}}
;CHECK: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;CHECK: MUL_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;CHECK: MUL_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], PS}}
define void @test(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
%b_ptr = getelementptr <4 x float> addrspace(1)* %in, i32 1
diff --git a/test/CodeGen/R600/fp_to_sint.ll b/test/CodeGen/R600/fp_to_sint.ll
index f5716e1d47e..5a608fdf4a7 100644
--- a/test/CodeGen/R600/fp_to_sint.ll
+++ b/test/CodeGen/R600/fp_to_sint.ll
@@ -1,10 +1,10 @@
; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
; CHECK: @fp_to_sint_v4i32
-; CHECK: FLT_TO_INT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-; CHECK: FLT_TO_INT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-; CHECK: FLT_TO_INT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-; CHECK: FLT_TO_INT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; CHECK: FLT_TO_INT T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
+; CHECK: FLT_TO_INT * T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
+; CHECK: FLT_TO_INT T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
+; CHECK: FLT_TO_INT * T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
define void @fp_to_sint_v4i32(<4 x i32> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
%value = load <4 x float> addrspace(1) * %in
diff --git a/test/CodeGen/R600/fp_to_uint.ll b/test/CodeGen/R600/fp_to_uint.ll
index 1c3c0c62cf5..b07e286f43e 100644
--- a/test/CodeGen/R600/fp_to_uint.ll
+++ b/test/CodeGen/R600/fp_to_uint.ll
@@ -1,10 +1,10 @@
; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
; CHECK: @fp_to_uint_v4i32
-; CHECK: FLT_TO_UINT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-; CHECK: FLT_TO_UINT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-; CHECK: FLT_TO_UINT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-; CHECK: FLT_TO_UINT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; CHECK: FLT_TO_UINT * T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
+; CHECK: FLT_TO_UINT * T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
+; CHECK: FLT_TO_UINT * T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
+; CHECK: FLT_TO_UINT * T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
define void @fp_to_uint_v4i32(<4 x i32> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
%value = load <4 x float> addrspace(1) * %in
diff --git a/test/CodeGen/R600/llvm.cos.ll b/test/CodeGen/R600/llvm.cos.ll
index 9b281670704..b444fa782be 100644
--- a/test/CodeGen/R600/llvm.cos.ll
+++ b/test/CodeGen/R600/llvm.cos.ll
@@ -1,6 +1,6 @@
;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
-;CHECK: COS * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;CHECK: COS * T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
define void @test() {
%r0 = call float @llvm.R600.load.input(i32 0)
diff --git a/test/CodeGen/R600/llvm.pow.ll b/test/CodeGen/R600/llvm.pow.ll
index 14220837048..0f51cf46f59 100644
--- a/test/CodeGen/R600/llvm.pow.ll
+++ b/test/CodeGen/R600/llvm.pow.ll
@@ -1,8 +1,8 @@
;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
;CHECK: LOG_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;CHECK: MUL NON-IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;CHECK-NEXT: EXP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;CHECK: MUL NON-IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], PS}}
+;CHECK-NEXT: EXP_IEEE * T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
define void @test() {
%r0 = call float @llvm.R600.load.input(i32 0)
diff --git a/test/CodeGen/R600/llvm.sin.ll b/test/CodeGen/R600/llvm.sin.ll
index 803dc2d6deb..09cc3d2c52d 100644
--- a/test/CodeGen/R600/llvm.sin.ll
+++ b/test/CodeGen/R600/llvm.sin.ll
@@ -1,6 +1,6 @@
;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
-;CHECK: SIN * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;CHECK: SIN * T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
define void @test() {
%r0 = call float @llvm.R600.load.input(i32 0)