From 4accdea1dc1d434349575b39b5670b747126260a Mon Sep 17 00:00:00 2001
From: Vincent Lejeune <vljn@ovi.com>
Date: Wed, 26 Jun 2013 18:09:58 +0200
Subject: R600: Support schedule and packetization of trans-only inst

---
 lib/Target/R600/R600InstrInfo.cpp        | 185 ++++++++++++++++++++++++-------
 lib/Target/R600/R600InstrInfo.h          |  32 ++++--
 lib/Target/R600/R600Instructions.td      |   1 +
 lib/Target/R600/R600MachineScheduler.cpp |  25 +++--
 lib/Target/R600/R600MachineScheduler.h   |   1 +
 lib/Target/R600/R600Packetizer.cpp       |  90 +++++++++------
 lib/Target/R600/R600RegisterInfo.td      |   1 +
 test/CodeGen/R600/fdiv.ll                |   8 +-
 test/CodeGen/R600/fp_to_sint.ll          |   8 +-
 test/CodeGen/R600/fp_to_uint.ll          |   8 +-
 test/CodeGen/R600/llvm.cos.ll            |   2 +-
 test/CodeGen/R600/llvm.pow.ll            |   4 +-
 test/CodeGen/R600/llvm.sin.ll            |   2 +-
 13 files changed, 261 insertions(+), 106 deletions(-)
diff --git a/lib/Target/R600/R600InstrInfo.cpp b/lib/Target/R600/R600InstrInfo.cpp
index 354f039c1e8..974a5794505 100644
--- a/lib/Target/R600/R600InstrInfo.cpp
+++ b/lib/Target/R600/R600InstrInfo.cpp
@@ -250,8 +250,9 @@ R600InstrInfo::getSrcs(MachineInstr *MI) const {
 
 std::vector<std::pair<int, unsigned> >
 R600InstrInfo::ExtractSrcs(MachineInstr *MI,
-                           const DenseMap<unsigned, unsigned> &PV)
-    const {
+                           const DenseMap<unsigned, unsigned> &PV,
+                           unsigned &ConstCount) const {
+  ConstCount = 0;
   const SmallVector<std::pair<MachineOperand *, int64_t>, 3> Srcs = getSrcs(MI);
   const std::pair<int, unsigned> DummyPair(-1, 0);
   std::vector<std::pair<int, unsigned> > Result;
@@ -259,18 +260,20 @@ R600InstrInfo::ExtractSrcs(MachineInstr *MI,
   for (unsigned n = Srcs.size(); i < n; ++i) {
     unsigned Reg = Srcs[i].first->getReg();
     unsigned Index = RI.getEncodingValue(Reg) & 0xff;
-    unsigned Chan = RI.getHWRegChan(Reg);
     if (Reg == AMDGPU::OQAP) {
       Result.push_back(std::pair<int, unsigned>(Index, 0));
     }
-    if (Index > 127) {
-      Result.push_back(DummyPair);
+    if (PV.find(Reg) != PV.end()) {
+      // 255 is used to tells its a PS/PV reg
+      Result.push_back(std::pair<int, unsigned>(255, 0));
       continue;
     }
-    if (PV.find(Reg) != PV.end()) {
+    if (Index > 127) {
+      ConstCount++;
       Result.push_back(DummyPair);
       continue;
     }
+    unsigned Chan = RI.getHWRegChan(Reg);
     Result.push_back(std::pair<int, unsigned>(Index, Chan));
   }
   for (; i < 3; ++i)
@@ -305,23 +308,51 @@ Swizzle(std::vector<std::pair<int, unsigned> > Src,
   return Src;
 }
 
-bool
-R600InstrInfo::isLegal(
-             const std::vector<std::vector<std::pair<int, unsigned> > > &IGSrcs,
-             const std::vector<R600InstrInfo::BankSwizzle> &Swz,
-             unsigned CheckedSize) const {
+static unsigned
+getTransSwizzle(R600InstrInfo::BankSwizzle Swz, unsigned Op) {
+  switch (Swz) {
+  case R600InstrInfo::ALU_VEC_012_SCL_210: {
+    unsigned Cycles[3] = { 2, 1, 0};
+    return Cycles[Op];
+  }
+  case R600InstrInfo::ALU_VEC_021_SCL_122: {
+    unsigned Cycles[3] = { 1, 2, 2};
+    return Cycles[Op];
+  }
+  case R600InstrInfo::ALU_VEC_120_SCL_212: {
+    unsigned Cycles[3] = { 2, 1, 2};
+    return Cycles[Op];
+  }
+  case R600InstrInfo::ALU_VEC_102_SCL_221: {
+    unsigned Cycles[3] = { 2, 2, 1};
+    return Cycles[Op];
+  }
+  default:
+    llvm_unreachable("Wrong Swizzle for Trans Slot");
+    return 0;
+  }
+}
+
+/// returns how many MIs (whose inputs are represented by IGSrcs) can be packed
+/// in the same Instruction Group while meeting read port limitations given a
+/// Swz swizzle sequence.
+unsigned  R600InstrInfo::isLegalUpTo(
+    const std::vector<std::vector<std::pair<int, unsigned> > > &IGSrcs,
+    const std::vector<R600InstrInfo::BankSwizzle> &Swz,
+    const std::vector<std::pair<int, unsigned> > &TransSrcs,
+    R600InstrInfo::BankSwizzle TransSwz) const {
   int Vector[4][3];
   memset(Vector, -1, sizeof(Vector));
-  for (unsigned i = 0; i < CheckedSize; i++) {
+  for (unsigned i = 0, e = IGSrcs.size(); i < e; i++) {
     const std::vector<std::pair<int, unsigned> > &Srcs =
         Swizzle(IGSrcs[i], Swz[i]);
     for (unsigned j = 0; j < 3; j++) {
       const std::pair<int, unsigned> &Src = Srcs[j];
-      if (Src.first < 0)
+      if (Src.first < 0 || Src.first == 255)
         continue;
       if (Src.first == GET_REG_INDEX(RI.getEncodingValue(AMDGPU::OQAP))) {
-        if (Swz[i] != R600InstrInfo::ALU_VEC_012 &&
-            Swz[i] != R600InstrInfo::ALU_VEC_021) {
+        if (Swz[i] != R600InstrInfo::ALU_VEC_012_SCL_210 &&
+            Swz[i] != R600InstrInfo::ALU_VEC_021_SCL_122) {
             // The value from output queue A (denoted by register OQAP) can
             // only be fetched during the first cycle.
             return false;
@@ -332,51 +363,126 @@ R600InstrInfo::isLegal(
       if (Vector[Src.second][j] < 0)
         Vector[Src.second][j] = Src.first;
       if (Vector[Src.second][j] != Src.first)
-        return false;
+        return i;
     }
   }
-  return true;
+  // Now check Trans Alu
+  for (unsigned i = 0, e = TransSrcs.size(); i < e; ++i) {
+    const std::pair<int, unsigned> &Src = TransSrcs[i];
+    unsigned Cycle = getTransSwizzle(TransSwz, i);
+    if (Src.first < 0)
+      continue;
+    if (Src.first == 255)
+      continue;
+    if (Vector[Src.second][Cycle] < 0)
+      Vector[Src.second][Cycle] = Src.first;
+    if (Vector[Src.second][Cycle] != Src.first)
+      return IGSrcs.size() - 1;
+  }
+  return IGSrcs.size();
 }
 
-bool
-R600InstrInfo::recursiveFitsFPLimitation(
-             const std::vector<std::vector<std::pair<int, unsigned> > > &IGSrcs,
-             std::vector<R600InstrInfo::BankSwizzle> &SwzCandidate,
-             unsigned Depth) const {
-  if (!isLegal(IGSrcs, SwzCandidate, Depth))
+/// Given a swizzle sequence SwzCandidate and an index Idx, returns the next
+/// (in lexicographic term) swizzle sequence assuming that all swizzles after
+/// Idx can be skipped
+static bool
+NextPossibleSolution(
+    std::vector<R600InstrInfo::BankSwizzle> &SwzCandidate,
+    unsigned Idx) {
+  assert(Idx < SwzCandidate.size());
+  int ResetIdx = Idx;
+  while (ResetIdx > -1 && SwzCandidate[ResetIdx] == R600InstrInfo::ALU_VEC_210)
+    ResetIdx --;
+  for (unsigned i = ResetIdx + 1, e = SwzCandidate.size(); i < e; i++) {
+    SwzCandidate[i] = R600InstrInfo::ALU_VEC_012_SCL_210;
+  }
+  if (ResetIdx == -1)
     return false;
-  if (IGSrcs.size() == Depth)
-    return true;
-  unsigned i = SwzCandidate[Depth];
-  for (; i < 6; i++) {
-    SwzCandidate[Depth] = (R600InstrInfo::BankSwizzle) i;
-    if (recursiveFitsFPLimitation(IGSrcs, SwzCandidate, Depth + 1))
+  SwzCandidate[ResetIdx]++;
+  return true;
+}
+
+/// Enumerate all possible Swizzle sequence to find one that can meet all
+/// read port requirements.
+bool R600InstrInfo::FindSwizzleForVectorSlot(
+    const std::vector<std::vector<std::pair<int, unsigned> > > &IGSrcs,
+    std::vector<R600InstrInfo::BankSwizzle> &SwzCandidate,
+    const std::vector<std::pair<int, unsigned> > &TransSrcs,
+    R600InstrInfo::BankSwizzle TransSwz) const {
+  unsigned ValidUpTo = 0;
+  do {
+    ValidUpTo = isLegalUpTo(IGSrcs, SwzCandidate, TransSrcs, TransSwz);
+    if (ValidUpTo == IGSrcs.size())
       return true;
-  }
-  SwzCandidate[Depth] = R600InstrInfo::ALU_VEC_012;
+  } while (NextPossibleSolution(SwzCandidate, ValidUpTo));
   return false;
 }
 
+/// Instructions in Trans slot can't read gpr at cycle 0 if they also read
+/// a const, and can't read a gpr at cycle 1 if they read 2 const.
+static bool
+isConstCompatible(R600InstrInfo::BankSwizzle TransSwz,
+                  const std::vector<std::pair<int, unsigned> > &TransOps,
+                  unsigned ConstCount) {
+  for (unsigned i = 0, e = TransOps.size(); i < e; ++i) {
+    const std::pair<int, unsigned> &Src = TransOps[i];
+    unsigned Cycle = getTransSwizzle(TransSwz, i);
+    if (Src.first < 0)
+      continue;
+    if (ConstCount > 0 && Cycle == 0)
+      return false;
+    if (ConstCount > 1 && Cycle == 1)
+      return false;
+  }
+  return true;
+}
+
 bool
 R600InstrInfo::fitsReadPortLimitations(const std::vector<MachineInstr *> &IG,
-                                      const DenseMap<unsigned, unsigned> &PV,
-                                      std::vector<BankSwizzle> &ValidSwizzle)
+                                       const DenseMap<unsigned, unsigned> &PV,
+                                       std::vector<BankSwizzle> &ValidSwizzle,
+                                       bool isLastAluTrans)
     const {
   //Todo : support shared src0 - src1 operand
 
   std::vector<std::vector<std::pair<int, unsigned> > > IGSrcs;
   ValidSwizzle.clear();
+  unsigned ConstCount;
+  BankSwizzle TransBS;
   for (unsigned i = 0, e = IG.size(); i < e; ++i) {
-    IGSrcs.push_back(ExtractSrcs(IG[i], PV));
+    IGSrcs.push_back(ExtractSrcs(IG[i], PV, ConstCount));
     unsigned Op = getOperandIdx(IG[i]->getOpcode(),
         AMDGPU::OpName::bank_swizzle);
     ValidSwizzle.push_back( (R600InstrInfo::BankSwizzle)
         IG[i]->getOperand(Op).getImm());
   }
-  bool Result = recursiveFitsFPLimitation(IGSrcs, ValidSwizzle);
-  if (!Result)
-    return false;
-  return true;
+  std::vector<std::pair<int, unsigned> > TransOps;
+  if (!isLastAluTrans)
+    return FindSwizzleForVectorSlot(IGSrcs, ValidSwizzle, TransOps, TransBS);
+
+  TransOps = IGSrcs.back();
+  IGSrcs.pop_back();
+  ValidSwizzle.pop_back();
+
+  static const R600InstrInfo::BankSwizzle TransSwz[] = {
+    ALU_VEC_012_SCL_210,
+    ALU_VEC_021_SCL_122,
+    ALU_VEC_120_SCL_212,
+    ALU_VEC_102_SCL_221
+  };
+  for (unsigned i = 0; i < 4; i++) {
+    TransBS = TransSwz[i];
+    if (!isConstCompatible(TransBS, TransOps, ConstCount))
+      continue;
+    bool Result = FindSwizzleForVectorSlot(IGSrcs, ValidSwizzle, TransOps,
+        TransBS);
+    if (Result) {
+      ValidSwizzle.push_back(TransBS);
+      return true;
+    }
+  }
+
+  return false;
 }
 
 
@@ -406,7 +512,8 @@ R600InstrInfo::fitsConstReadLimitations(const std::vector<unsigned> &Consts)
 }
 
 bool
-R600InstrInfo::canBundle(const std::vector<MachineInstr *> &MIs) const {
+R600InstrInfo::fitsConstReadLimitations(const std::vector<MachineInstr *> &MIs)
+    const {
   std::vector<unsigned> Consts;
   for (unsigned i = 0, n = MIs.size(); i < n; i++) {
     MachineInstr *MI = MIs[i];
diff --git a/lib/Target/R600/R600InstrInfo.h b/lib/Target/R600/R600InstrInfo.h
index a6add8382cf..1ba4160747b 100644
--- a/lib/Target/R600/R600InstrInfo.h
+++ b/lib/Target/R600/R600InstrInfo.h
@@ -84,26 +84,38 @@ namespace llvm {
   SmallVector<std::pair<MachineOperand *, int64_t>, 3>
       getSrcs(MachineInstr *MI) const;
 
-  bool isLegal(
-             const std::vector<std::vector<std::pair<int, unsigned> > > &IGSrcs,
-             const std::vector<R600InstrInfo::BankSwizzle> &Swz,
-             unsigned CheckedSize) const;
-  bool recursiveFitsFPLimitation(
-             const std::vector<std::vector<std::pair<int, unsigned> > > &IGSrcs,
-             std::vector<R600InstrInfo::BankSwizzle> &SwzCandidate,
-             unsigned Depth = 0) const;
+  unsigned  isLegalUpTo(
+    const std::vector<std::vector<std::pair<int, unsigned> > > &IGSrcs,
+    const std::vector<R600InstrInfo::BankSwizzle> &Swz,
+    const std::vector<std::pair<int, unsigned> > &TransSrcs,
+    R600InstrInfo::BankSwizzle TransSwz) const;
+
+  bool FindSwizzleForVectorSlot(
+    const std::vector<std::vector<std::pair<int, unsigned> > > &IGSrcs,
+    std::vector<R600InstrInfo::BankSwizzle> &SwzCandidate,
+    const std::vector<std::pair<int, unsigned> > &TransSrcs,
+    R600InstrInfo::BankSwizzle TransSwz) const;
 
   /// Given the order VEC_012 < VEC_021 < VEC_120 < VEC_102 < VEC_201 < VEC_210
   /// returns true and the first (in lexical order) BankSwizzle affectation
   /// starting from the one already provided in the Instruction Group MIs that
   /// fits Read Port limitations in BS if available. Otherwise returns false
   /// and undefined content in BS.
+  /// isLastAluTrans should be set if the last Alu of MIs will be executed on
+  /// Trans ALU. In this case, ValidTSwizzle returns the BankSwizzle value to
+  /// apply to the last instruction.
   /// PV holds GPR to PV registers in the Instruction Group MIs.
   bool fitsReadPortLimitations(const std::vector<MachineInstr *> &MIs,
                                const DenseMap<unsigned, unsigned> &PV,
-                               std::vector<BankSwizzle> &BS) const;
+                               std::vector<BankSwizzle> &BS,
+                               bool isLastAluTrans) const;
+
+  /// An instruction group can only access 2 channel pair (either [XY] or [ZW])
+  /// from KCache bank on R700+. This function check if MI set in input meet
+  /// this limitations
+  bool fitsConstReadLimitations(const std::vector<MachineInstr *> &) const;
+  /// Same but using const index set instead of MI set.
   bool fitsConstReadLimitations(const std::vector<unsigned>&) const;
-  bool canBundle(const std::vector<MachineInstr *> &) const;
 
   /// \breif Vector instructions are instructions that must fill all
   /// instruction slots within an instruction group.
diff --git a/lib/Target/R600/R600Instructions.td b/lib/Target/R600/R600Instructions.td
index fd585f8b7e9..862080d9ca4 100644
--- a/lib/Target/R600/R600Instructions.td
+++ b/lib/Target/R600/R600Instructions.td
@@ -1489,6 +1489,7 @@ let hasSideEffects = 1 in {
 
   def FLT_TO_INT_eg : FLT_TO_INT_Common<0x50> {
     let Pattern = [];
+    let TransOnly = 0;
   }
 
   def INT_TO_FLT_eg : INT_TO_FLT_Common<0x9B>;
diff --git a/lib/Target/R600/R600MachineScheduler.cpp b/lib/Target/R600/R600MachineScheduler.cpp
index 7e28f9dde47..0dc0365926e 100644
--- a/lib/Target/R600/R600MachineScheduler.cpp
+++ b/lib/Target/R600/R600MachineScheduler.cpp
@@ -32,7 +32,7 @@ void R600SchedStrategy::initialize(ScheduleDAGMI *dag) {
   MRI = &DAG->MRI;
   CurInstKind = IDOther;
   CurEmitted = 0;
-  OccupedSlotsMask = 15;
+  OccupedSlotsMask = 31;
   InstKindLimit[IDAlu] = TII->getMaxAlusPerClause();
   InstKindLimit[IDOther] = 32;
 
@@ -160,7 +160,7 @@ void R600SchedStrategy::schedNode(SUnit *SU, bool IsTopNode) {
   if (NextInstKind != CurInstKind) {
     DEBUG(dbgs() << "Instruction Type Switch\n");
     if (NextInstKind != IDAlu)
-      OccupedSlotsMask = 15;
+      OccupedSlotsMask |= 31;
     CurEmitted = 0;
     CurInstKind = NextInstKind;
   }
@@ -251,6 +251,9 @@ bool R600SchedStrategy::regBelongsToClass(unsigned Reg,
 R600SchedStrategy::AluKind R600SchedStrategy::getAluKind(SUnit *SU) const {
   MachineInstr *MI = SU->getInstr();
 
+  if (TII->isTransOnly(MI))
+    return AluTrans;
+
     switch (MI->getOpcode()) {
     case AMDGPU::PRED_X:
       return AluPredX;
@@ -346,7 +349,7 @@ SUnit *R600SchedStrategy::PopInst(std::vector<SUnit *> &Q) {
       It != E; ++It) {
     SUnit *SU = *It;
     InstructionsGroupCandidate.push_back(SU->getInstr());
-    if (TII->canBundle(InstructionsGroupCandidate)) {
+    if (TII->fitsConstReadLimitations(InstructionsGroupCandidate)) {
       InstructionsGroupCandidate.pop_back();
       Q.erase((It + 1).base());
       return SU;
@@ -421,7 +424,8 @@ unsigned R600SchedStrategy::AvailablesAluCount() const {
   return AvailableAlus[AluAny].size() + AvailableAlus[AluT_XYZW].size() +
       AvailableAlus[AluT_X].size() + AvailableAlus[AluT_Y].size() +
       AvailableAlus[AluT_Z].size() + AvailableAlus[AluT_W].size() +
-      AvailableAlus[AluDiscarded].size() + AvailableAlus[AluPredX].size();
+      AvailableAlus[AluTrans].size() + AvailableAlus[AluDiscarded].size() +
+      AvailableAlus[AluPredX].size();
 }
 
 SUnit* R600SchedStrategy::pickAlu() {
@@ -429,20 +433,27 @@ SUnit* R600SchedStrategy::pickAlu() {
     if (!OccupedSlotsMask) {
       // Bottom up scheduling : predX must comes first
       if (!AvailableAlus[AluPredX].empty()) {
-        OccupedSlotsMask = 15;
+        OccupedSlotsMask |= 31;
         return PopInst(AvailableAlus[AluPredX]);
       }
       // Flush physical reg copies (RA will discard them)
       if (!AvailableAlus[AluDiscarded].empty()) {
-        OccupedSlotsMask = 15;
+        OccupedSlotsMask |= 31;
         return PopInst(AvailableAlus[AluDiscarded]);
       }
       // If there is a T_XYZW alu available, use it
       if (!AvailableAlus[AluT_XYZW].empty()) {
-        OccupedSlotsMask = 15;
+        OccupedSlotsMask |= 15;
         return PopInst(AvailableAlus[AluT_XYZW]);
       }
     }
+    bool TransSlotOccuped = OccupedSlotsMask & 16;
+    if (!TransSlotOccuped) {
+      if (!AvailableAlus[AluTrans].empty()) {
+        OccupedSlotsMask |= 16;
+        return PopInst(AvailableAlus[AluTrans]);
+      }
+    }
     for (int Chan = 3; Chan > -1; --Chan) {
       bool isOccupied = OccupedSlotsMask & (1 << Chan);
       if (!isOccupied) {
diff --git a/lib/Target/R600/R600MachineScheduler.h b/lib/Target/R600/R600MachineScheduler.h
index aae8b3f4eb8..f8965d8998a 100644
--- a/lib/Target/R600/R600MachineScheduler.h
+++ b/lib/Target/R600/R600MachineScheduler.h
@@ -46,6 +46,7 @@ class R600SchedStrategy : public MachineSchedStrategy {
     AluT_W,
     AluT_XYZW,
     AluPredX,
+    AluTrans,
     AluDiscarded, // LLVM Instructions that are going to be eliminated
     AluLast
   };
diff --git a/lib/Target/R600/R600Packetizer.cpp b/lib/Target/R600/R600Packetizer.cpp
index 6fc15deb5c9..5ee51faea87 100644
--- a/lib/Target/R600/R600Packetizer.cpp
+++ b/lib/Target/R600/R600Packetizer.cpp
@@ -77,8 +77,6 @@ private:
     do {
       if (TII->isPredicated(BI))
         continue;
-      if (TII->isTransOnly(BI))
-        continue;
       int OperandIdx = TII->getOperandIdx(BI->getOpcode(), AMDGPU::OpName::write);
       if (OperandIdx > -1 && BI->getOperand(OperandIdx).getImm() == 0)
         continue;
@@ -87,6 +85,10 @@ private:
         continue;
       }
       unsigned Dst = BI->getOperand(DstIdx).getReg();
+      if (TII->isTransOnly(BI)) {
+        Result[Dst] = AMDGPU::PS;
+        continue;
+      }
       if (BI->getOpcode() == AMDGPU::DOT4_r600 ||
           BI->getOpcode() == AMDGPU::DOT4_eg) {
         Result[Dst] = AMDGPU::PV_X;
@@ -157,10 +159,6 @@ public:
       return true;
     if (!TII->isALUInstr(MI->getOpcode()))
       return true;
-    if (TII->get(MI->getOpcode()).TSFlags & R600_InstFlag::TRANS_ONLY)
-      return true;
-    if (TII->isTransOnly(MI))
-      return true;
     if (MI->getOpcode() == AMDGPU::GROUP_BARRIER)
       return true;
     return false;
@@ -170,7 +168,7 @@ public:
   // together.
   bool isLegalToPacketizeTogether(SUnit *SUI, SUnit *SUJ) {
     MachineInstr *MII = SUI->getInstr(), *MIJ = SUJ->getInstr();
-    if (getSlot(MII) <= getSlot(MIJ))
+    if (getSlot(MII) <= getSlot(MIJ) && !TII->isTransOnly(MII))
       return false;
     // Does MII and MIJ share the same pred_sel ?
     int OpI = TII->getOperandIdx(MII->getOpcode(), AMDGPU::OpName::pred_sel),
@@ -204,11 +202,16 @@ public:
     MI->getOperand(LastOp).setImm(Bit);
   }
 
-  MachineBasicBlock::iterator addToPacket(MachineInstr *MI) {
+  bool isBundlableWithCurrentPMI(MachineInstr *MI,
+                                 const DenseMap<unsigned, unsigned> &PV,
+                                 std::vector<R600InstrInfo::BankSwizzle> &BS,
+                                 bool &isTransSlot) {
+    isTransSlot = TII->isTransOnly(MI);
+
+    // Are the Constants limitations met ?
     CurrentPacketMIs.push_back(MI);
-    bool FitsConstLimits = TII->canBundle(CurrentPacketMIs);
-    DEBUG(
-      if (!FitsConstLimits) {
+    if (!TII->fitsConstReadLimitations(CurrentPacketMIs)) {
+      DEBUG(
         dbgs() << "Couldn't pack :\n";
         MI->dump();
         dbgs() << "with the following packets :\n";
@@ -217,14 +220,15 @@ public:
           dbgs() << "\n";
         }
         dbgs() << "because of Consts read limitations\n";
-      });
-    const DenseMap<unsigned, unsigned> &PV =
-        getPreviousVector(CurrentPacketMIs.front());
-    std::vector<R600InstrInfo::BankSwizzle> BS;
-    bool FitsReadPortLimits =
-        TII->fitsReadPortLimitations(CurrentPacketMIs, PV, BS);
-    DEBUG(
-      if (!FitsReadPortLimits) {
+      );
+      CurrentPacketMIs.pop_back();
+      return false;
+    }
+
+    // Is there a BankSwizzle set that meet Read Port limitations ?
+    if (!TII->fitsReadPortLimitations(CurrentPacketMIs,
+            PV, BS, isTransSlot)) {
+      DEBUG(
         dbgs() << "Couldn't pack :\n";
         MI->dump();
         dbgs() << "with the following packets :\n";
@@ -233,25 +237,43 @@ public:
           dbgs() << "\n";
         }
         dbgs() << "because of Read port limitations\n";
-      });
-    bool isBundlable = FitsConstLimits && FitsReadPortLimits;
-    if (isBundlable) {
+      );
+      CurrentPacketMIs.pop_back();
+      return false;
+    }
+
+    CurrentPacketMIs.pop_back();
+    return true;
+  }
+
+  MachineBasicBlock::iterator addToPacket(MachineInstr *MI) {
+    MachineBasicBlock::iterator FirstInBundle =
+        CurrentPacketMIs.empty() ? MI : CurrentPacketMIs.front();
+    const DenseMap<unsigned, unsigned> &PV =
+        getPreviousVector(FirstInBundle);
+    std::vector<R600InstrInfo::BankSwizzle> BS;
+    bool isTransSlot;
+
+    if (isBundlableWithCurrentPMI(MI, PV, BS, isTransSlot)) {
       for (unsigned i = 0, e = CurrentPacketMIs.size(); i < e; i++) {
         MachineInstr *MI = CurrentPacketMIs[i];
-            unsigned Op = TII->getOperandIdx(MI->getOpcode(),
-                AMDGPU::OpName::bank_swizzle);
-            MI->getOperand(Op).setImm(BS[i]);
+        unsigned Op = TII->getOperandIdx(MI->getOpcode(),
+            AMDGPU::OpName::bank_swizzle);
+        MI->getOperand(Op).setImm(BS[i]);
       }
+      unsigned Op = TII->getOperandIdx(MI->getOpcode(),
+          AMDGPU::OpName::bank_swizzle);
+      MI->getOperand(Op).setImm(BS.back());
+      if (!CurrentPacketMIs.empty())
+        setIsLastBit(CurrentPacketMIs.back(), 0);
+      substitutePV(MI, PV);
+      MachineBasicBlock::iterator It = VLIWPacketizerList::addToPacket(MI);
+      if (isTransSlot) {
+        endPacket(llvm::next(It)->getParent(), llvm::next(It));
+      }
+      return It;
     }
-    CurrentPacketMIs.pop_back();
-    if (!isBundlable) {
-      endPacket(MI->getParent(), MI);
-      substitutePV(MI, getPreviousVector(MI));
-      return VLIWPacketizerList::addToPacket(MI);
-    }
-    if (!CurrentPacketMIs.empty())
-      setIsLastBit(CurrentPacketMIs.back(), 0);
-    substitutePV(MI, PV);
+    endPacket(MI->getParent(), MI);
     return VLIWPacketizerList::addToPacket(MI);
   }
 };
diff --git a/lib/Target/R600/R600RegisterInfo.td b/lib/Target/R600/R600RegisterInfo.td
index 60a93e3f6c2..66aa91628e1 100644
--- a/lib/Target/R600/R600RegisterInfo.td
+++ b/lib/Target/R600/R600RegisterInfo.td
@@ -96,6 +96,7 @@ def PV_X : R600RegWithChan<"PV.X", 254, "X">;
 def PV_Y : R600RegWithChan<"PV.Y", 254, "Y">;
 def PV_Z : R600RegWithChan<"PV.Z", 254, "Z">;
 def PV_W : R600RegWithChan<"PV.W", 254, "W">;
+def PS: R600Reg<"PS", 255>;
 def PREDICATE_BIT : R600Reg<"PredicateBit", 0>;
 def PRED_SEL_OFF: R600Reg<"Pred_sel_off", 0>;
 def PRED_SEL_ZERO : R600Reg<"Pred_sel_zero", 2>;
diff --git a/test/CodeGen/R600/fdiv.ll b/test/CodeGen/R600/fdiv.ll
index 003590bb53d..f6eb6a66948 100644
--- a/test/CodeGen/R600/fdiv.ll
+++ b/test/CodeGen/R600/fdiv.ll
@@ -1,13 +1,13 @@
 ;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
 
 ;CHECK: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;CHECK: MUL_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;CHECK: MUL_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], PS}}
 ;CHECK: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;CHECK: MUL_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], PS}}
 ;CHECK: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;CHECK: MUL_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;CHECK: MUL_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;CHECK: MUL_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], PS}}
 ;CHECK: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;CHECK: MUL_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;CHECK: MUL_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], PS}}
 
 define void @test(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
   %b_ptr = getelementptr <4 x float> addrspace(1)* %in, i32 1
diff --git a/test/CodeGen/R600/fp_to_sint.ll b/test/CodeGen/R600/fp_to_sint.ll
index f5716e1d47e..5a608fdf4a7 100644
--- a/test/CodeGen/R600/fp_to_sint.ll
+++ b/test/CodeGen/R600/fp_to_sint.ll
@@ -1,10 +1,10 @@
 ; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
 
 ; CHECK: @fp_to_sint_v4i32
-; CHECK: FLT_TO_INT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-; CHECK: FLT_TO_INT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-; CHECK: FLT_TO_INT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-; CHECK: FLT_TO_INT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; CHECK: FLT_TO_INT T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
+; CHECK: FLT_TO_INT * T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
+; CHECK: FLT_TO_INT T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
+; CHECK: FLT_TO_INT * T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
 
 define void @fp_to_sint_v4i32(<4 x i32> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
   %value = load <4 x float> addrspace(1) * %in
diff --git a/test/CodeGen/R600/fp_to_uint.ll b/test/CodeGen/R600/fp_to_uint.ll
index 1c3c0c62cf5..b07e286f43e 100644
--- a/test/CodeGen/R600/fp_to_uint.ll
+++ b/test/CodeGen/R600/fp_to_uint.ll
@@ -1,10 +1,10 @@
 ; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
 
 ; CHECK: @fp_to_uint_v4i32
-; CHECK: FLT_TO_UINT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-; CHECK: FLT_TO_UINT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-; CHECK: FLT_TO_UINT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-; CHECK: FLT_TO_UINT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; CHECK: FLT_TO_UINT * T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
+; CHECK: FLT_TO_UINT * T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
+; CHECK: FLT_TO_UINT * T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
+; CHECK: FLT_TO_UINT * T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
 
 define void @fp_to_uint_v4i32(<4 x i32> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
   %value = load <4 x float> addrspace(1) * %in
diff --git a/test/CodeGen/R600/llvm.cos.ll b/test/CodeGen/R600/llvm.cos.ll
index 9b281670704..b444fa782be 100644
--- a/test/CodeGen/R600/llvm.cos.ll
+++ b/test/CodeGen/R600/llvm.cos.ll
@@ -1,6 +1,6 @@
 ;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
 
-;CHECK: COS * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;CHECK: COS * T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
 
 define void @test() {
    %r0 = call float @llvm.R600.load.input(i32 0)
diff --git a/test/CodeGen/R600/llvm.pow.ll b/test/CodeGen/R600/llvm.pow.ll
index 14220837048..0f51cf46f59 100644
--- a/test/CodeGen/R600/llvm.pow.ll
+++ b/test/CodeGen/R600/llvm.pow.ll
@@ -1,8 +1,8 @@
 ;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
 
 ;CHECK: LOG_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;CHECK: MUL NON-IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;CHECK-NEXT: EXP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;CHECK: MUL NON-IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], PS}}
+;CHECK-NEXT: EXP_IEEE * T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
 
 define void @test() {
    %r0 = call float @llvm.R600.load.input(i32 0)
diff --git a/test/CodeGen/R600/llvm.sin.ll b/test/CodeGen/R600/llvm.sin.ll
index 803dc2d6deb..09cc3d2c52d 100644
--- a/test/CodeGen/R600/llvm.sin.ll
+++ b/test/CodeGen/R600/llvm.sin.ll
@@ -1,6 +1,6 @@
 ;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
 
-;CHECK: SIN * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;CHECK: SIN * T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
 
 define void @test() {
    %r0 = call float @llvm.R600.load.input(i32 0)
-- 
cgit v1.2.3