-rw-r--r-- | include/llvm/CodeGen/ScheduleDAG.h             |  77
-rw-r--r-- | lib/CodeGen/PostRASchedulerList.cpp            | 439
-rw-r--r-- | lib/CodeGen/ScheduleDAG.cpp                    | 201
-rw-r--r-- | lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp | 264
-rw-r--r-- | test/CodeGen/X86/break-anti-dependencies.ll    |  33
5 files changed, 751 insertions(+), 263 deletions(-)
diff --git a/include/llvm/CodeGen/ScheduleDAG.h b/include/llvm/CodeGen/ScheduleDAG.h index 71f0e88ed1a..a89bfa14901 100644 --- a/include/llvm/CodeGen/ScheduleDAG.h +++ b/include/llvm/CodeGen/ScheduleDAG.h @@ -17,6 +17,7 @@ #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/BitVector.h" #include "llvm/ADT/GraphTraits.h" #include "llvm/ADT/SmallVector.h" @@ -148,7 +149,9 @@ namespace llvm { unsigned PhyReg = 0, int Cost = 1, bool isAntiDep = false) { for (unsigned i = 0, e = (unsigned)Preds.size(); i != e; ++i) if (Preds[i].Dep == N && - Preds[i].isCtrl == isCtrl && Preds[i].isArtificial == isArtificial) + Preds[i].isCtrl == isCtrl && + Preds[i].isArtificial == isArtificial && + Preds[i].isAntiDep == isAntiDep) return false; Preds.push_back(SDep(N, PhyReg, Cost, isCtrl, isArtificial, isAntiDep)); N->Succs.push_back(SDep(this, PhyReg, Cost, isCtrl, @@ -167,7 +170,10 @@ namespace llvm { bool removePred(SUnit *N, bool isCtrl, bool isArtificial, bool isAntiDep) { for (SmallVector<SDep, 4>::iterator I = Preds.begin(), E = Preds.end(); I != E; ++I) - if (I->Dep == N && I->isCtrl == isCtrl && I->isArtificial == isArtificial) { + if (I->Dep == N && + I->isCtrl == isCtrl && + I->isArtificial == isArtificial && + I->isAntiDep == isAntiDep) { bool FoundSucc = false; for (SmallVector<SDep, 4>::iterator II = N->Succs.begin(), EE = N->Succs.end(); II != EE; ++II) @@ -404,6 +410,73 @@ namespace llvm { return G->SUnits.end(); } }; + + /// ScheduleDAGTopologicalSort is a class that computes a topological + /// ordering for SUnits and provides methods for dynamically updating + /// the ordering as new edges are added. + /// + /// This allows a very fast implementation of IsReachable, for example. + /// + class ScheduleDAGTopologicalSort { + /// SUnits - A reference to the ScheduleDAG's SUnits. + std::vector<SUnit> &SUnits; + + /// Index2Node - Maps topological index to the node number. + std::vector<int> Index2Node; + /// Node2Index - Maps the node number to its topological index. + std::vector<int> Node2Index; + /// Visited - a set of nodes visited during a DFS traversal. + BitVector Visited; + + /// DFS - make a DFS traversal and mark all nodes affected by the + /// edge insertion. These nodes will later get new topological indexes + /// by means of the Shift method. + void DFS(const SUnit *SU, int UpperBound, bool& HasLoop); + + /// Shift - reassign topological indexes for the nodes in the DAG + /// to preserve the topological ordering. + void Shift(BitVector& Visited, int LowerBound, int UpperBound); + + /// Allocate - assign the topological index to the node n. + void Allocate(int n, int index); + + public: + explicit ScheduleDAGTopologicalSort(std::vector<SUnit> &SUnits); + + /// InitDAGTopologicalSorting - create the initial topological + /// ordering from the DAG to be scheduled. + void InitDAGTopologicalSorting(); + + /// IsReachable - Checks if SU is reachable from TargetSU. + bool IsReachable(const SUnit *SU, const SUnit *TargetSU); + + /// WillCreateCycle - Returns true if adding an edge from SU to TargetSU + /// will create a cycle. + bool WillCreateCycle(SUnit *SU, SUnit *TargetSU); + + /// AddPred - Updates the topological ordering to accomodate an edge + /// to be added from SUnit X to SUnit Y. + void AddPred(SUnit *Y, SUnit *X); + + /// RemovePred - Updates the topological ordering to accomodate an + /// an edge to be removed from the specified node N from the predecessors + /// of the current node M. 
+ void RemovePred(SUnit *M, SUnit *N); + + typedef std::vector<int>::iterator iterator; + typedef std::vector<int>::const_iterator const_iterator; + iterator begin() { return Index2Node.begin(); } + const_iterator begin() const { return Index2Node.begin(); } + iterator end() { return Index2Node.end(); } + const_iterator end() const { return Index2Node.end(); } + + typedef std::vector<int>::reverse_iterator reverse_iterator; + typedef std::vector<int>::const_reverse_iterator const_reverse_iterator; + reverse_iterator rbegin() { return Index2Node.rbegin(); } + const_reverse_iterator rbegin() const { return Index2Node.rbegin(); } + reverse_iterator rend() { return Index2Node.rend(); } + const_reverse_iterator rend() const { return Index2Node.rend(); } + }; } #endif diff --git a/lib/CodeGen/PostRASchedulerList.cpp b/lib/CodeGen/PostRASchedulerList.cpp index a2ad4e356a3..ed24b8fea80 100644 --- a/lib/CodeGen/PostRASchedulerList.cpp +++ b/lib/CodeGen/PostRASchedulerList.cpp @@ -24,24 +24,32 @@ #include "llvm/CodeGen/LatencyPriorityQueue.h" #include "llvm/CodeGen/SchedulerRegistry.h" #include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/Target/TargetInstrInfo.h" +#include "llvm/Target/TargetRegisterInfo.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" #include "llvm/ADT/Statistic.h" +#include "llvm/ADT/DenseSet.h" +#include <map> +#include <climits> using namespace llvm; STATISTIC(NumStalls, "Number of pipeline stalls"); +static cl::opt<bool> +EnableAntiDepBreaking("break-anti-dependencies", + cl::desc("Break scheduling anti-dependencies"), + cl::init(false)); + namespace { class VISIBILITY_HIDDEN PostRAScheduler : public MachineFunctionPass { public: static char ID; PostRAScheduler() : MachineFunctionPass(&ID) {} - private: - MachineFunction *MF; - const TargetMachine *TM; - public: + const char *getPassName() const { - return "Post RA top-down list latency scheduler (STUB)"; + return "Post RA top-down list latency scheduler"; } bool runOnMachineFunction(MachineFunction &Fn); @@ -49,13 +57,6 @@ namespace { char PostRAScheduler::ID = 0; class VISIBILITY_HIDDEN SchedulePostRATDList : public ScheduleDAGInstrs { - public: - SchedulePostRATDList(MachineBasicBlock *mbb, const TargetMachine &tm) - : ScheduleDAGInstrs(mbb, tm) {} - private: - MachineFunction *MF; - const TargetMachine *TM; - /// AvailableQueue - The priority queue to use for the available SUnits. /// LatencyPriorityQueue AvailableQueue; @@ -66,12 +67,12 @@ namespace { /// added to the AvailableQueue. std::vector<SUnit*> PendingQueue; - public: - const char *getPassName() const { - return "Post RA top-down list latency scheduler (STUB)"; - } + /// Topo - A topological ordering for SUnits. 
+ ScheduleDAGTopologicalSort Topo; - bool runOnMachineFunction(MachineFunction &Fn); + public: + SchedulePostRATDList(MachineBasicBlock *mbb, const TargetMachine &tm) + : ScheduleDAGInstrs(mbb, tm), Topo(SUnits) {} void Schedule(); @@ -79,19 +80,18 @@ namespace { void ReleaseSucc(SUnit *SU, SUnit *SuccSU, bool isChain); void ScheduleNodeTopDown(SUnit *SU, unsigned CurCycle); void ListScheduleTopDown(); + bool BreakAntiDependencies(); }; } bool PostRAScheduler::runOnMachineFunction(MachineFunction &Fn) { DOUT << "PostRAScheduler\n"; - MF = &Fn; - TM = &MF->getTarget(); // Loop over all of the basic blocks for (MachineFunction::iterator MBB = Fn.begin(), MBBe = Fn.end(); MBB != MBBe; ++MBB) { - SchedulePostRATDList Scheduler(MBB, *TM); + SchedulePostRATDList Scheduler(MBB, Fn.getTarget()); Scheduler.Run(); @@ -108,13 +108,407 @@ void SchedulePostRATDList::Schedule() { // Build scheduling units. BuildSchedUnits(); + if (EnableAntiDepBreaking) { + if (BreakAntiDependencies()) { + // We made changes. Update the dependency graph. + // Theoretically we could update the graph in place: + // When a live range is changed to use a different register, remove + // the def's anti-dependence *and* output-dependence edges due to + // that register, and add new anti-dependence and output-dependence + // edges based on the next live range of the register. + SUnits.clear(); + BuildSchedUnits(); + } + } + AvailableQueue.initNodes(SUnits); - + ListScheduleTopDown(); AvailableQueue.releaseState(); } +/// getInstrOperandRegClass - Return register class of the operand of an +/// instruction of the specified TargetInstrDesc. +static const TargetRegisterClass* +getInstrOperandRegClass(const TargetRegisterInfo *TRI, + const TargetInstrInfo *TII, const TargetInstrDesc &II, + unsigned Op) { + if (Op >= II.getNumOperands()) + return NULL; + if (II.OpInfo[Op].isLookupPtrRegClass()) + return TII->getPointerRegClass(); + return TRI->getRegClass(II.OpInfo[Op].RegClass); +} + +/// BreakAntiDependencies - Identifiy anti-dependencies along the critical path +/// of the ScheduleDAG and break them by renaming registers. +/// +bool SchedulePostRATDList::BreakAntiDependencies() { + // The code below assumes that there is at least one instruction, + // so just duck out immediately if the block is empty. + if (BB->empty()) return false; + + Topo.InitDAGTopologicalSorting(); + + // Compute a critical path for the DAG. + SUnit *Max = 0; + std::vector<SDep *> CriticalPath(SUnits.size()); + for (ScheduleDAGTopologicalSort::const_iterator I = Topo.begin(), + E = Topo.end(); I != E; ++I) { + SUnit *SU = &SUnits[*I]; + for (SUnit::pred_iterator P = SU->Preds.begin(), PE = SU->Preds.end(); + P != PE; ++P) { + SUnit *PredSU = P->Dep; + unsigned PredLatency = PredSU->CycleBound + PredSU->Latency; + if (SU->CycleBound < PredLatency) { + SU->CycleBound = PredLatency; + CriticalPath[*I] = &*P; + } + } + // Keep track of the node at the end of the critical path. + if (!Max || SU->CycleBound + SU->Latency > Max->CycleBound + Max->Latency) + Max = SU; + } + + DOUT << "Critical path has total latency " + << (Max ? Max->CycleBound + Max->Latency : 0) << "\n"; + + // Walk the critical path from the bottom up. Collect all anti-dependence + // edges on the critical path. Skip anti-dependencies between SUnits that + // are connected with other edges, since such units won't be able to be + // scheduled past each other anyway. + // + // The heuristic is that edges on the critical path are more important to + // break than other edges. 
And since there are a limited number of + // registers, we don't want to waste them breaking edges that aren't + // important. + // + // TODO: Instructions with multiple defs could have multiple + // anti-dependencies. The current code here only knows how to break one + // edge per instruction. Note that we'd have to be able to break all of + // the anti-dependencies in an instruction in order to be effective. + BitVector AllocatableSet = TRI->getAllocatableSet(*MF); + DenseMap<MachineInstr *, unsigned> CriticalAntiDeps; + for (SUnit *SU = Max; CriticalPath[SU->NodeNum]; + SU = CriticalPath[SU->NodeNum]->Dep) { + SDep *Edge = CriticalPath[SU->NodeNum]; + SUnit *NextSU = Edge->Dep; + unsigned AntiDepReg = Edge->Reg; + // Don't break anti-dependencies on non-allocatable registers. + if (!AllocatableSet.test(AntiDepReg)) + continue; + // If the SUnit has other dependencies on the SUnit that it + // anti-depends on, don't bother breaking the anti-dependency. + // Also, if there are dependencies on other SUnits with the + // same register as the anti-dependency, don't attempt to + // break it. + for (SUnit::pred_iterator P = SU->Preds.begin(), PE = SU->Preds.end(); + P != PE; ++P) + if (P->Dep == NextSU ? + (!P->isAntiDep || P->Reg != AntiDepReg) : + (!P->isCtrl && !P->isAntiDep && P->Reg == AntiDepReg)) { + AntiDepReg = 0; + break; + } + if (AntiDepReg != 0) + CriticalAntiDeps[SU->getInstr()] = AntiDepReg; + } + + // For live regs that are only used in one register class in a live range, + // the register class. If the register is not live or is referenced in + // multiple register classes, the corresponding value is null. If the + // register is used in multiple register classes, the corresponding value + // is -1 casted to a pointer. + const TargetRegisterClass * + Classes[TargetRegisterInfo::FirstVirtualRegister] = {}; + + // Map registers to all their references within a live range. + std::multimap<unsigned, MachineOperand *> RegRefs; + + // The index of the most recent kill (proceding bottom-up), or -1 if + // the register is not live. + unsigned KillIndices[TargetRegisterInfo::FirstVirtualRegister]; + std::fill(KillIndices, array_endof(KillIndices), -1); + // The index of the most recent def (proceding bottom up), or -1 if + // the register is live. + unsigned DefIndices[TargetRegisterInfo::FirstVirtualRegister]; + std::fill(DefIndices, array_endof(DefIndices), BB->size()); + + // Determine the live-out physregs for this block. + if (!BB->empty() && BB->back().getDesc().isReturn()) + // In a return block, examine the function live-out regs. + for (MachineRegisterInfo::liveout_iterator I = MRI.liveout_begin(), + E = MRI.liveout_end(); I != E; ++I) { + unsigned Reg = *I; + Classes[Reg] = reinterpret_cast<TargetRegisterClass *>(-1); + KillIndices[Reg] = BB->size(); + DefIndices[Reg] = -1; + // Repeat, for all aliases. + for (const unsigned *Alias = TRI->getAliasSet(Reg); *Alias; ++Alias) { + unsigned AliasReg = *Alias; + Classes[AliasReg] = reinterpret_cast<TargetRegisterClass *>(-1); + KillIndices[AliasReg] = BB->size(); + DefIndices[AliasReg] = -1; + } + } + else + // In a non-return block, examine the live-in regs of all successors. 
+ for (MachineBasicBlock::succ_iterator SI = BB->succ_begin(), + SE = BB->succ_end(); SI != SE; ++SI) + for (MachineBasicBlock::livein_iterator I = (*SI)->livein_begin(), + E = (*SI)->livein_end(); I != E; ++I) { + unsigned Reg = *I; + Classes[Reg] = reinterpret_cast<TargetRegisterClass *>(-1); + KillIndices[Reg] = BB->size(); + DefIndices[Reg] = -1; + // Repeat, for all aliases. + for (const unsigned *Alias = TRI->getAliasSet(Reg); *Alias; ++Alias) { + unsigned AliasReg = *Alias; + Classes[AliasReg] = reinterpret_cast<TargetRegisterClass *>(-1); + KillIndices[AliasReg] = BB->size(); + DefIndices[AliasReg] = -1; + } + } + + // Consider callee-saved registers as live-out, since we're running after + // prologue/epilogue insertion so there's no way to add additional + // saved registers. + // + // TODO: If the callee saves and restores these, then we can potentially + // use them between the save and the restore. To do that, we could scan + // the exit blocks to see which of these registers are defined. + for (const unsigned *I = TRI->getCalleeSavedRegs(); *I; ++I) { + unsigned Reg = *I; + Classes[Reg] = reinterpret_cast<TargetRegisterClass *>(-1); + KillIndices[Reg] = BB->size(); + DefIndices[Reg] = -1; + // Repeat, for all aliases. + for (const unsigned *Alias = TRI->getAliasSet(Reg); *Alias; ++Alias) { + unsigned AliasReg = *Alias; + Classes[AliasReg] = reinterpret_cast<TargetRegisterClass *>(-1); + KillIndices[AliasReg] = BB->size(); + DefIndices[AliasReg] = -1; + } + } + + // Consider this pattern: + // A = ... + // ... = A + // A = ... + // ... = A + // A = ... + // ... = A + // A = ... + // ... = A + // There are three anti-dependencies here, and without special care, + // we'd break all of them using the same register: + // A = ... + // ... = A + // B = ... + // ... = B + // B = ... + // ... = B + // B = ... + // ... = B + // because at each anti-dependence, B is the first register that + // isn't A which is free. This re-introduces anti-dependencies + // at all but one of the original anti-dependencies that we were + // trying to break. To avoid this, keep track of the most recent + // register that each register was replaced with, avoid avoid + // using it to repair an anti-dependence on the same register. + // This lets us produce this: + // A = ... + // ... = A + // B = ... + // ... = B + // C = ... + // ... = C + // B = ... + // ... = B + // This still has an anti-dependence on B, but at least it isn't on the + // original critical path. + // + // TODO: If we tracked more than one register here, we could potentially + // fix that remaining critical edge too. This is a little more involved, + // because unlike the most recent register, less recent registers should + // still be considered, though only if no other registers are available. + unsigned LastNewReg[TargetRegisterInfo::FirstVirtualRegister] = {}; + + // A registers defined and not used in an instruction. This is used for + // liveness tracking and is declared outside the loop only to avoid + // having it be re-allocated on each iteration. + DenseSet<unsigned> Defs; + + // Attempt to break anti-dependence edges on the critical path. Walk the + // instructions from the bottom up, tracking information about liveness + // as we go to help determine which registers are available. 
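// ---------------------------------------------------------------------------
// Editorial illustration, not part of the patch: a minimal sketch of the
// renaming this pass performs, using hypothetical x86-64 code in the spirit
// of the break-anti-dependencies.ll test below. Reusing one register for two
// independent live ranges creates an anti-dependence from the first range's
// last use to the second range's def:
//
//     %xmm0 = load (%rdi)        ; def, first live range
//     ...   = %xmm0              ; last use (kill) of first live range
//     %xmm0 = load (%rsi)        ; def, second live range: anti-dependence
//     ...   = %xmm0              ;   on the kill above
//
// Renaming the second live range to a free register removes the edge, so the
// two chains can be interleaved by the scheduler:
//
//     %xmm0 = load (%rdi)
//     %xmm1 = load (%rsi)
//     ...   = %xmm0
//     ...   = %xmm1
// ---------------------------------------------------------------------------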
+ bool Changed = false; + unsigned Count = BB->size() - 1; + for (MachineBasicBlock::reverse_iterator I = BB->rbegin(), E = BB->rend(); + I != E; ++I, --Count) { + MachineInstr *MI = &*I; + + // Check if this instruction has an anti-dependence that we're + // interested in. + DenseMap<MachineInstr *, unsigned>::iterator C = CriticalAntiDeps.find(MI); + unsigned AntiDepReg = C != CriticalAntiDeps.end() ? + C->second : 0; + + // Scan the register operands for this instruction and update + // Classes and RegRefs. + for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { + MachineOperand &MO = MI->getOperand(i); + if (!MO.isReg()) continue; + unsigned Reg = MO.getReg(); + if (Reg == 0) continue; + const TargetRegisterClass *NewRC = + getInstrOperandRegClass(TRI, TII, MI->getDesc(), i); + + // If this instruction has a use of AntiDepReg, breaking it + // is invalid. + if (MO.isUse() && AntiDepReg == Reg) + AntiDepReg = 0; + + // For now, only allow the register to be changed if its register + // class is consistent across all uses. + if (!Classes[Reg] && NewRC) + Classes[Reg] = NewRC; + else if (!NewRC || Classes[Reg] != NewRC) + Classes[Reg] = reinterpret_cast<TargetRegisterClass *>(-1); + + // Now check for aliases. + for (const unsigned *Alias = TRI->getAliasSet(Reg); *Alias; ++Alias) { + // If an alias of the reg is used during the live range, give up. + // Note that this allows us to skip checking if AntiDepReg + // overlaps with any of the aliases, among other things. + unsigned AliasReg = *Alias; + if (Classes[AliasReg]) { + Classes[AliasReg] = reinterpret_cast<TargetRegisterClass *>(-1); + Classes[Reg] = reinterpret_cast<TargetRegisterClass *>(-1); + } + } + + // If we're still willing to consider this register, note the reference. + if (Classes[Reg] != reinterpret_cast<TargetRegisterClass *>(-1)) + RegRefs.insert(std::make_pair(Reg, &MO)); + } + + // Determine AntiDepReg's register class, if it is live and is + // consistently used within a single class. + const TargetRegisterClass *RC = AntiDepReg != 0 ? Classes[AntiDepReg] : 0; + assert(AntiDepReg == 0 || RC != NULL && + "Register should be live if it's causing an anti-dependence!"); + if (RC == reinterpret_cast<TargetRegisterClass *>(-1)) + AntiDepReg = 0; + + // Look for a suitable register to use to break the anti-depenence. + // + // TODO: Instead of picking the first free register, consider which might + // be the best. + if (AntiDepReg != 0) { + for (TargetRegisterClass::iterator R = RC->allocation_order_begin(*MF), + RE = RC->allocation_order_end(*MF); R != RE; ++R) { + unsigned NewReg = *R; + // Don't replace a register with itself. + if (NewReg == AntiDepReg) continue; + // Don't replace a register with one that was recently used to repair + // an anti-dependence with this AntiDepReg, because that would + // re-introduce that anti-dependence. + if (NewReg == LastNewReg[AntiDepReg]) continue; + // If NewReg is dead and NewReg's most recent def is not before + // AntiDepReg's kill, it's safe to replace AntiDepReg with NewReg. 
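// ---------------------------------------------------------------------------
// Editorial illustration, not part of the patch: how the KillIndices /
// DefIndices test above decides that NewReg is safe, with hypothetical
// values. Indices count instructions from the top of the block, the scan
// runs bottom-up, and -1 means "no kill seen yet", i.e. the register is not
// live at the current instruction. Suppose the candidate def of AntiDepReg
// is at index 4 and:
//
//     KillIndices[AntiDepReg] = 7    // AntiDepReg's last use is at #7
//     KillIndices[NewReg]     = -1   // NewReg is not live here
//     DefIndices[NewReg]      = 9    // NewReg's next def below is at #9
//
// KillIndices[NewReg] == -1 and 7 <= 9, so the live range of AntiDepReg
// covering instructions 4..7 can be rewritten to NewReg without interfering
// with NewReg's later definition at #9.
// ---------------------------------------------------------------------------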
+ assert(((KillIndices[AntiDepReg] == -1) != (DefIndices[AntiDepReg] == -1)) && + "Kill and Def maps aren't consistent for AntiDepReg!"); + assert(((KillIndices[NewReg] == -1) != (DefIndices[NewReg] == -1)) && + "Kill and Def maps aren't consistent for NewReg!"); + if (KillIndices[NewReg] == -1 && + KillIndices[AntiDepReg] <= DefIndices[NewReg]) { + DOUT << "Breaking anti-dependence edge on reg " << AntiDepReg + << " with reg " << NewReg << "!\n"; + + // Update the references to the old register to refer to the new + // register. + std::pair<std::multimap<unsigned, MachineOperand *>::iterator, + std::multimap<unsigned, MachineOperand *>::iterator> + Range = RegRefs.equal_range(AntiDepReg); + for (std::multimap<unsigned, MachineOperand *>::iterator + Q = Range.first, QE = Range.second; Q != QE; ++Q) + Q->second->setReg(NewReg); + + // We just went back in time and modified history; the + // liveness information for the anti-depenence reg is now + // inconsistent. Set the state as if it were dead. + Classes[NewReg] = Classes[AntiDepReg]; + DefIndices[NewReg] = DefIndices[AntiDepReg]; + KillIndices[NewReg] = KillIndices[AntiDepReg]; + + Classes[AntiDepReg] = 0; + DefIndices[AntiDepReg] = KillIndices[AntiDepReg]; + KillIndices[AntiDepReg] = -1; + + RegRefs.erase(AntiDepReg); + Changed = true; + LastNewReg[AntiDepReg] = NewReg; + break; + } + } + } + + // Update liveness. + Defs.clear(); + for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { + MachineOperand &MO = MI->getOperand(i); + if (!MO.isReg()) continue; + unsigned Reg = MO.getReg(); + if (Reg == 0) continue; + if (MO.isDef()) + Defs.insert(Reg); + else { + // Treat a use in the same instruction as a def as an extension of + // a live range. + Defs.erase(Reg); + // It wasn't previously live but now it is, this is a kill. + if (KillIndices[Reg] == -1) { + KillIndices[Reg] = Count; + DefIndices[Reg] = -1; + } + // Repeat, for all aliases. + for (const unsigned *Alias = TRI->getAliasSet(Reg); *Alias; ++Alias) { + unsigned AliasReg = *Alias; + Defs.erase(AliasReg); + if (KillIndices[AliasReg] == -1) { + KillIndices[AliasReg] = Count; + DefIndices[AliasReg] = -1; + } + } + } + } + // Proceding upwards, registers that are defed but not used in this + // instruction are now dead. + for (DenseSet<unsigned>::iterator D = Defs.begin(), DE = Defs.end(); + D != DE; ++D) { + unsigned Reg = *D; + DefIndices[Reg] = Count; + KillIndices[Reg] = -1; + Classes[Reg] = 0; + RegRefs.erase(Reg); + // Repeat, for all subregs. + for (const unsigned *Subreg = TRI->getSubRegisters(Reg); + *Subreg; ++Subreg) { + unsigned SubregReg = *Subreg; + DefIndices[SubregReg] = Count; + KillIndices[SubregReg] = -1; + Classes[SubregReg] = 0; + RegRefs.erase(SubregReg); + } + } + } + assert(Count == -1u && "Count mismatch!"); + + return Changed; +} + //===----------------------------------------------------------------------===// // Top-Down Scheduling //===----------------------------------------------------------------------===// @@ -202,8 +596,7 @@ void SchedulePostRATDList::ListScheduleTopDown() { } } - // If there are no instructions available, don't try to issue anything, and - // don't advance the hazard recognizer. + // If there are no instructions available, don't try to issue anything. 
if (AvailableQueue.empty()) { ++CurCycle; continue; diff --git a/lib/CodeGen/ScheduleDAG.cpp b/lib/CodeGen/ScheduleDAG.cpp index 046d3379a64..7088313fcdc 100644 --- a/lib/CodeGen/ScheduleDAG.cpp +++ b/lib/CodeGen/ScheduleDAG.cpp @@ -263,3 +263,204 @@ void ScheduleDAG::VerifySchedule(bool isBottomUp) { "The number of nodes scheduled doesn't match the expected number!"); } #endif + +/// InitDAGTopologicalSorting - create the initial topological +/// ordering from the DAG to be scheduled. +/// +/// The idea of the algorithm is taken from +/// "Online algorithms for managing the topological order of +/// a directed acyclic graph" by David J. Pearce and Paul H.J. Kelly +/// This is the MNR algorithm, which was first introduced by +/// A. Marchetti-Spaccamela, U. Nanni and H. Rohnert in +/// "Maintaining a topological order under edge insertions". +/// +/// Short description of the algorithm: +/// +/// Topological ordering, ord, of a DAG maps each node to a topological +/// index so that for all edges X->Y it is the case that ord(X) < ord(Y). +/// +/// This means that if there is a path from the node X to the node Z, +/// then ord(X) < ord(Z). +/// +/// This property can be used to check for reachability of nodes: +/// if Z is reachable from X, then an insertion of the edge Z->X would +/// create a cycle. +/// +/// The algorithm first computes a topological ordering for the DAG by +/// initializing the Index2Node and Node2Index arrays and then tries to keep +/// the ordering up-to-date after edge insertions by reordering the DAG. +/// +/// On insertion of the edge X->Y, the algorithm first marks by calling DFS +/// the nodes reachable from Y, and then shifts them using Shift to lie +/// immediately after X in Index2Node. +void ScheduleDAGTopologicalSort::InitDAGTopologicalSorting() { + unsigned DAGSize = SUnits.size(); + std::vector<SUnit*> WorkList; + WorkList.reserve(DAGSize); + + Index2Node.resize(DAGSize); + Node2Index.resize(DAGSize); + + // Initialize the data structures. + for (unsigned i = 0, e = DAGSize; i != e; ++i) { + SUnit *SU = &SUnits[i]; + int NodeNum = SU->NodeNum; + unsigned Degree = SU->Succs.size(); + // Temporarily use the Node2Index array as scratch space for degree counts. + Node2Index[NodeNum] = Degree; + + // Is it a node without dependencies? + if (Degree == 0) { + assert(SU->Succs.empty() && "SUnit should have no successors"); + // Collect leaf nodes. + WorkList.push_back(SU); + } + } + + int Id = DAGSize; + while (!WorkList.empty()) { + SUnit *SU = WorkList.back(); + WorkList.pop_back(); + Allocate(SU->NodeNum, --Id); + for (SUnit::const_pred_iterator I = SU->Preds.begin(), E = SU->Preds.end(); + I != E; ++I) { + SUnit *SU = I->Dep; + if (!--Node2Index[SU->NodeNum]) + // If all dependencies of the node are processed already, + // then the node can be computed now. + WorkList.push_back(SU); + } + } + + Visited.resize(DAGSize); + +#ifndef NDEBUG + // Check correctness of the ordering + for (unsigned i = 0, e = DAGSize; i != e; ++i) { + SUnit *SU = &SUnits[i]; + for (SUnit::const_pred_iterator I = SU->Preds.begin(), E = SU->Preds.end(); + I != E; ++I) { + assert(Node2Index[SU->NodeNum] > Node2Index[I->Dep->NodeNum] && + "Wrong topological sorting"); + } + } +#endif +} + +/// AddPred - Updates the topological ordering to accomodate an edge +/// to be added from SUnit X to SUnit Y. 
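// ---------------------------------------------------------------------------
// Editorial illustration, not part of the patch: the loop above is a reverse
// Kahn's algorithm, handing out indexes from the bottom of the DAG upward.
// For a hypothetical four-node DAG with edges A->B, A->C, B->D, C->D
// (DAGSize = 4), D is the only node with no successors, so it is allocated
// index 3 first; B and C then drop to zero remaining successors and receive
// indexes 2 and 1 (in worklist pop order); A is allocated index 0 last. The
// result satisfies Node2Index[X] < Node2Index[Y] for every edge X->Y, which
// is exactly the invariant the #ifndef NDEBUG block verifies.
// ---------------------------------------------------------------------------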
+void ScheduleDAGTopologicalSort::AddPred(SUnit *Y, SUnit *X) { + int UpperBound, LowerBound; + LowerBound = Node2Index[Y->NodeNum]; + UpperBound = Node2Index[X->NodeNum]; + bool HasLoop = false; + // Is Ord(X) < Ord(Y) ? + if (LowerBound < UpperBound) { + // Update the topological order. + Visited.reset(); + DFS(Y, UpperBound, HasLoop); + assert(!HasLoop && "Inserted edge creates a loop!"); + // Recompute topological indexes. + Shift(Visited, LowerBound, UpperBound); + } +} + +/// RemovePred - Updates the topological ordering to accomodate an +/// an edge to be removed from the specified node N from the predecessors +/// of the current node M. +void ScheduleDAGTopologicalSort::RemovePred(SUnit *M, SUnit *N) { + // InitDAGTopologicalSorting(); +} + +/// DFS - Make a DFS traversal to mark all nodes reachable from SU and mark +/// all nodes affected by the edge insertion. These nodes will later get new +/// topological indexes by means of the Shift method. +void ScheduleDAGTopologicalSort::DFS(const SUnit *SU, int UpperBound, bool& HasLoop) { + std::vector<const SUnit*> WorkList; + WorkList.reserve(SUnits.size()); + + WorkList.push_back(SU); + while (!WorkList.empty()) { + SU = WorkList.back(); + WorkList.pop_back(); + Visited.set(SU->NodeNum); + for (int I = SU->Succs.size()-1; I >= 0; --I) { + int s = SU->Succs[I].Dep->NodeNum; + if (Node2Index[s] == UpperBound) { + HasLoop = true; + return; + } + // Visit successors if not already and in affected region. + if (!Visited.test(s) && Node2Index[s] < UpperBound) { + WorkList.push_back(SU->Succs[I].Dep); + } + } + } +} + +/// Shift - Renumber the nodes so that the topological ordering is +/// preserved. +void ScheduleDAGTopologicalSort::Shift(BitVector& Visited, int LowerBound, + int UpperBound) { + std::vector<int> L; + int shift = 0; + int i; + + for (i = LowerBound; i <= UpperBound; ++i) { + // w is node at topological index i. + int w = Index2Node[i]; + if (Visited.test(w)) { + // Unmark. + Visited.reset(w); + L.push_back(w); + shift = shift + 1; + } else { + Allocate(w, i - shift); + } + } + + for (unsigned j = 0; j < L.size(); ++j) { + Allocate(L[j], i - shift); + i = i + 1; + } +} + + +/// WillCreateCycle - Returns true if adding an edge from SU to TargetSU will +/// create a cycle. +bool ScheduleDAGTopologicalSort::WillCreateCycle(SUnit *SU, SUnit *TargetSU) { + if (IsReachable(TargetSU, SU)) + return true; + for (SUnit::pred_iterator I = SU->Preds.begin(), E = SU->Preds.end(); + I != E; ++I) + if (I->Cost < 0 && IsReachable(TargetSU, I->Dep)) + return true; + return false; +} + +/// IsReachable - Checks if SU is reachable from TargetSU. +bool ScheduleDAGTopologicalSort::IsReachable(const SUnit *SU, const SUnit *TargetSU) { + // If insertion of the edge SU->TargetSU would create a cycle + // then there is a path from TargetSU to SU. + int UpperBound, LowerBound; + LowerBound = Node2Index[TargetSU->NodeNum]; + UpperBound = Node2Index[SU->NodeNum]; + bool HasLoop = false; + // Is Ord(TargetSU) < Ord(SU) ? + if (LowerBound < UpperBound) { + Visited.reset(); + // There may be a path from TargetSU to SU. Check for it. + DFS(TargetSU, UpperBound, HasLoop); + } + return HasLoop; +} + +/// Allocate - assign the topological index to the node n. 
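// ---------------------------------------------------------------------------
// Editorial illustration, not part of the patch: a small worked example of
// the AddPred/Shift update above, with hypothetical nodes X, W, Y. Suppose
//
//     Index2Node:  index 0 -> Y,  index 1 -> W,  index 2 -> X
//
// with an existing edge Y->W, and the edge X->Y is now inserted via
// AddPred(Y, X). LowerBound = Node2Index[Y] = 0 and UpperBound =
// Node2Index[X] = 2, so the DFS from Y marks {Y, W}; Shift then compacts the
// unmarked node X down to index 0 and re-places the marked nodes after it:
//
//     Index2Node:  index 0 -> X,  index 1 -> Y,  index 2 -> W
//
// Every edge again runs from a lower index to a higher one, so subsequent
// IsReachable and WillCreateCycle queries only ever search the affected
// index window rather than the whole graph.
// ---------------------------------------------------------------------------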
+void ScheduleDAGTopologicalSort::Allocate(int n, int index) { + Node2Index[n] = index; + Index2Node[index] = n; +} + +ScheduleDAGTopologicalSort::ScheduleDAGTopologicalSort( + std::vector<SUnit> &sunits) + : SUnits(sunits) {} diff --git a/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp b/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp index 16f99502951..5cbffc7c26c 100644 --- a/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp +++ b/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp @@ -69,12 +69,16 @@ private: std::vector<SUnit*> LiveRegDefs; std::vector<unsigned> LiveRegCycles; + /// Topo - A topological ordering for SUnits which permits fast IsReachable + /// and similar queries. + ScheduleDAGTopologicalSort Topo; + public: ScheduleDAGRRList(SelectionDAG *dag, MachineBasicBlock *bb, const TargetMachine &tm, bool isbottomup, SchedulingPriorityQueue *availqueue) : ScheduleDAGSDNodes(dag, bb, tm), isBottomUp(isbottomup), - AvailableQueue(availqueue) { + AvailableQueue(availqueue), Topo(SUnits) { } ~ScheduleDAGRRList() { @@ -84,22 +88,32 @@ public: void Schedule(); /// IsReachable - Checks if SU is reachable from TargetSU. - bool IsReachable(const SUnit *SU, const SUnit *TargetSU); + bool IsReachable(const SUnit *SU, const SUnit *TargetSU) { + return Topo.IsReachable(SU, TargetSU); + } /// willCreateCycle - Returns true if adding an edge from SU to TargetSU will /// create a cycle. - bool WillCreateCycle(SUnit *SU, SUnit *TargetSU); + bool WillCreateCycle(SUnit *SU, SUnit *TargetSU) { + return Topo.WillCreateCycle(SU, TargetSU); + } /// AddPred - This adds the specified node X as a predecessor of /// the current node Y if not already. /// This returns true if this is a new predecessor. /// Updates the topological ordering if required. bool AddPred(SUnit *Y, SUnit *X, bool isCtrl, bool isArtificial, - unsigned PhyReg = 0, int Cost = 1); + unsigned PhyReg = 0, int Cost = 1) { + Topo.AddPred(Y, X); + return Y->addPred(X, isCtrl, isArtificial, PhyReg, Cost); + } /// RemovePred - This removes the specified node N from the predecessors of /// the current node M. Updates the topological ordering if required. - bool RemovePred(SUnit *M, SUnit *N, bool isCtrl, bool isArtificial); + bool RemovePred(SUnit *M, SUnit *N, bool isCtrl, bool isArtificial) { + Topo.RemovePred(M, N); + return M->removePred(N, isCtrl, isArtificial, false); + } private: void ReleasePred(SUnit *SU, SUnit *PredSU, bool isChain); @@ -123,49 +137,24 @@ private: /// CreateNewSUnit - Creates a new SUnit and returns a pointer to it. /// Updates the topological ordering if required. SUnit *CreateNewSUnit(SDNode *N) { + unsigned NumSUnits = SUnits.size(); SUnit *NewNode = NewSUnit(N); // Update the topological ordering. - if (NewNode->NodeNum >= Node2Index.size()) - InitDAGTopologicalSorting(); + if (NewNode->NodeNum >= NumSUnits) + Topo.InitDAGTopologicalSorting(); return NewNode; } /// CreateClone - Creates a new SUnit from an existing one. /// Updates the topological ordering if required. SUnit *CreateClone(SUnit *N) { + unsigned NumSUnits = SUnits.size(); SUnit *NewNode = Clone(N); // Update the topological ordering. - if (NewNode->NodeNum >= Node2Index.size()) - InitDAGTopologicalSorting(); + if (NewNode->NodeNum >= NumSUnits) + Topo.InitDAGTopologicalSorting(); return NewNode; } - - /// Functions for preserving the topological ordering - /// even after dynamic insertions of new edges. - /// This allows a very fast implementation of IsReachable. 
- - /// InitDAGTopologicalSorting - create the initial topological - /// ordering from the DAG to be scheduled. - void InitDAGTopologicalSorting(); - - /// DFS - make a DFS traversal and mark all nodes affected by the - /// edge insertion. These nodes will later get new topological indexes - /// by means of the Shift method. - void DFS(const SUnit *SU, int UpperBound, bool& HasLoop); - - /// Shift - reassign topological indexes for the nodes in the DAG - /// to preserve the topological ordering. - void Shift(BitVector& Visited, int LowerBound, int UpperBound); - - /// Allocate - assign the topological index to the node n. - void Allocate(int n, int index); - - /// Index2Node - Maps topological index to the node number. - std::vector<int> Index2Node; - /// Node2Index - Maps the node number to its topological index. - std::vector<int> Node2Index; - /// Visited - a set of nodes visited during a DFS traversal. - BitVector Visited; }; } // end anonymous namespace @@ -185,7 +174,7 @@ void ScheduleDAGRRList::Schedule() { SUnits[su].dumpAll(this)); CalculateDepths(); CalculateHeights(); - InitDAGTopologicalSorting(); + Topo.InitDAGTopologicalSorting(); AvailableQueue->initNodes(SUnits); @@ -374,207 +363,6 @@ void ScheduleDAGRRList::UnscheduleNodeBottomUp(SUnit *SU) { AvailableQueue->push(SU); } -/// IsReachable - Checks if SU is reachable from TargetSU. -bool ScheduleDAGRRList::IsReachable(const SUnit *SU, const SUnit *TargetSU) { - // If insertion of the edge SU->TargetSU would create a cycle - // then there is a path from TargetSU to SU. - int UpperBound, LowerBound; - LowerBound = Node2Index[TargetSU->NodeNum]; - UpperBound = Node2Index[SU->NodeNum]; - bool HasLoop = false; - // Is Ord(TargetSU) < Ord(SU) ? - if (LowerBound < UpperBound) { - Visited.reset(); - // There may be a path from TargetSU to SU. Check for it. - DFS(TargetSU, UpperBound, HasLoop); - } - return HasLoop; -} - -/// Allocate - assign the topological index to the node n. -inline void ScheduleDAGRRList::Allocate(int n, int index) { - Node2Index[n] = index; - Index2Node[index] = n; -} - -/// InitDAGTopologicalSorting - create the initial topological -/// ordering from the DAG to be scheduled. - -/// The idea of the algorithm is taken from -/// "Online algorithms for managing the topological order of -/// a directed acyclic graph" by David J. Pearce and Paul H.J. Kelly -/// This is the MNR algorithm, which was first introduced by -/// A. Marchetti-Spaccamela, U. Nanni and H. Rohnert in -/// "Maintaining a topological order under edge insertions". -/// -/// Short description of the algorithm: -/// -/// Topological ordering, ord, of a DAG maps each node to a topological -/// index so that for all edges X->Y it is the case that ord(X) < ord(Y). -/// -/// This means that if there is a path from the node X to the node Z, -/// then ord(X) < ord(Z). -/// -/// This property can be used to check for reachability of nodes: -/// if Z is reachable from X, then an insertion of the edge Z->X would -/// create a cycle. -/// -/// The algorithm first computes a topological ordering for the DAG by -/// initializing the Index2Node and Node2Index arrays and then tries to keep -/// the ordering up-to-date after edge insertions by reordering the DAG. -/// -/// On insertion of the edge X->Y, the algorithm first marks by calling DFS -/// the nodes reachable from Y, and then shifts them using Shift to lie -/// immediately after X in Index2Node. 
-void ScheduleDAGRRList::InitDAGTopologicalSorting() { - unsigned DAGSize = SUnits.size(); - std::vector<SUnit*> WorkList; - WorkList.reserve(DAGSize); - - Index2Node.resize(DAGSize); - Node2Index.resize(DAGSize); - - // Initialize the data structures. - for (unsigned i = 0, e = DAGSize; i != e; ++i) { - SUnit *SU = &SUnits[i]; - int NodeNum = SU->NodeNum; - unsigned Degree = SU->Succs.size(); - // Temporarily use the Node2Index array as scratch space for degree counts. - Node2Index[NodeNum] = Degree; - - // Is it a node without dependencies? - if (Degree == 0) { - assert(SU->Succs.empty() && "SUnit should have no successors"); - // Collect leaf nodes. - WorkList.push_back(SU); - } - } - - int Id = DAGSize; - while (!WorkList.empty()) { - SUnit *SU = WorkList.back(); - WorkList.pop_back(); - Allocate(SU->NodeNum, --Id); - for (SUnit::const_pred_iterator I = SU->Preds.begin(), E = SU->Preds.end(); - I != E; ++I) { - SUnit *SU = I->Dep; - if (!--Node2Index[SU->NodeNum]) - // If all dependencies of the node are processed already, - // then the node can be computed now. - WorkList.push_back(SU); - } - } - - Visited.resize(DAGSize); - -#ifndef NDEBUG - // Check correctness of the ordering - for (unsigned i = 0, e = DAGSize; i != e; ++i) { - SUnit *SU = &SUnits[i]; - for (SUnit::const_pred_iterator I = SU->Preds.begin(), E = SU->Preds.end(); - I != E; ++I) { - assert(Node2Index[SU->NodeNum] > Node2Index[I->Dep->NodeNum] && - "Wrong topological sorting"); - } - } -#endif -} - -/// AddPred - adds an edge from SUnit X to SUnit Y. -/// Updates the topological ordering if required. -bool ScheduleDAGRRList::AddPred(SUnit *Y, SUnit *X, bool isCtrl, - bool isArtificial, unsigned PhyReg, int Cost) { - int UpperBound, LowerBound; - LowerBound = Node2Index[Y->NodeNum]; - UpperBound = Node2Index[X->NodeNum]; - bool HasLoop = false; - // Is Ord(X) < Ord(Y) ? - if (LowerBound < UpperBound) { - // Update the topological order. - Visited.reset(); - DFS(Y, UpperBound, HasLoop); - assert(!HasLoop && "Inserted edge creates a loop!"); - // Recompute topological indexes. - Shift(Visited, LowerBound, UpperBound); - } - // Now really insert the edge. - return Y->addPred(X, isCtrl, isArtificial, PhyReg, Cost); -} - -/// RemovePred - This removes the specified node N from the predecessors of -/// the current node M. Updates the topological ordering if required. -bool ScheduleDAGRRList::RemovePred(SUnit *M, SUnit *N, - bool isCtrl, bool isArtificial) { - // InitDAGTopologicalSorting(); - return M->removePred(N, isCtrl, isArtificial, false); -} - -/// DFS - Make a DFS traversal to mark all nodes reachable from SU and mark -/// all nodes affected by the edge insertion. These nodes will later get new -/// topological indexes by means of the Shift method. -void ScheduleDAGRRList::DFS(const SUnit *SU, int UpperBound, bool& HasLoop) { - std::vector<const SUnit*> WorkList; - WorkList.reserve(SUnits.size()); - - WorkList.push_back(SU); - while (!WorkList.empty()) { - SU = WorkList.back(); - WorkList.pop_back(); - Visited.set(SU->NodeNum); - for (int I = SU->Succs.size()-1; I >= 0; --I) { - int s = SU->Succs[I].Dep->NodeNum; - if (Node2Index[s] == UpperBound) { - HasLoop = true; - return; - } - // Visit successors if not already and in affected region. - if (!Visited.test(s) && Node2Index[s] < UpperBound) { - WorkList.push_back(SU->Succs[I].Dep); - } - } - } -} - -/// Shift - Renumber the nodes so that the topological ordering is -/// preserved. 
-void ScheduleDAGRRList::Shift(BitVector& Visited, int LowerBound, - int UpperBound) { - std::vector<int> L; - int shift = 0; - int i; - - for (i = LowerBound; i <= UpperBound; ++i) { - // w is node at topological index i. - int w = Index2Node[i]; - if (Visited.test(w)) { - // Unmark. - Visited.reset(w); - L.push_back(w); - shift = shift + 1; - } else { - Allocate(w, i - shift); - } - } - - for (unsigned j = 0; j < L.size(); ++j) { - Allocate(L[j], i - shift); - i = i + 1; - } -} - - -/// WillCreateCycle - Returns true if adding an edge from SU to TargetSU will -/// create a cycle. -bool ScheduleDAGRRList::WillCreateCycle(SUnit *SU, SUnit *TargetSU) { - if (IsReachable(TargetSU, SU)) - return true; - for (SUnit::pred_iterator I = SU->Preds.begin(), E = SU->Preds.end(); - I != E; ++I) - if (I->Cost < 0 && IsReachable(TargetSU, I->Dep)) - return true; - return false; -} - /// BacktrackBottomUp - Backtrack scheduling to a previous cycle specified in /// BTCycle in order to schedule a specific node. Returns the last unscheduled /// SUnit. Also returns if a successor is unscheduled in the process. diff --git a/test/CodeGen/X86/break-anti-dependencies.ll b/test/CodeGen/X86/break-anti-dependencies.ll new file mode 100644 index 00000000000..8646e3ebe86 --- /dev/null +++ b/test/CodeGen/X86/break-anti-dependencies.ll @@ -0,0 +1,33 @@ +; RUN: llvm-as < %s | llc -march=x86-64 -disable-post-RA-scheduler=false > %t +; RUN: grep {%xmm0} %t | count 14 +; RUN: not grep {%xmm1} %t +; RUN: llvm-as < %s | llc -march=x86-64 -disable-post-RA-scheduler=false -break-anti-dependencies > %t +; RUN: grep {%xmm0} %t | count 7 +; RUN: grep {%xmm1} %t | count 7 + +define void @goo(double* %r, double* %p, double* %q) nounwind { +entry: + %0 = load double* %p, align 8 + %1 = add double %0, 1.100000e+00 + %2 = mul double %1, 1.200000e+00 + %3 = add double %2, 1.300000e+00 + %4 = mul double %3, 1.400000e+00 + %5 = add double %4, 1.500000e+00 + %6 = fptosi double %5 to i32 + %7 = load double* %r, align 8 + %8 = add double %7, 7.100000e+00 + %9 = mul double %8, 7.200000e+00 + %10 = add double %9, 7.300000e+00 + %11 = mul double %10, 7.400000e+00 + %12 = add double %11, 7.500000e+00 + %13 = fptosi double %12 to i32 + %14 = icmp slt i32 %6, %13 + br i1 %14, label %bb, label %return + +bb: + store double 9.300000e+00, double* %q, align 8 + ret void + +return: + ret void +} |
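For context, here is a minimal sketch of how a scheduler is expected to drive the
ScheduleDAGTopologicalSort interface introduced above. The surrounding variables
(SUnits, SU, TargetSU) are assumed to come from the caller; only the Topo and
addPred calls are taken from the header in this patch.

  // Build the initial ordering once the SUnits are final.
  ScheduleDAGTopologicalSort Topo(SUnits);
  Topo.InitDAGTopologicalSorting();

  // Before adding an artificial edge SU -> TargetSU, check that it is legal,
  // then keep the ordering up to date alongside the real edge.
  if (!Topo.WillCreateCycle(SU, TargetSU)) {
    Topo.AddPred(TargetSU, SU);
    TargetSU->addPred(SU, /*isCtrl=*/true, /*isArtificial=*/true);
  }

  // Cheap reachability query, e.g. when deciding whether cloning a node is
  // needed to avoid creating a cycle.
  bool Reachable = Topo.IsReachable(SU, TargetSU);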