summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorNadav Rotem <nrotem@apple.com>2012-10-18 05:29:12 +0000
committerNadav Rotem <nrotem@apple.com>2012-10-18 05:29:12 +0000
commit1953ace81d3121808967a7fa47ad6d631499933d (patch)
tree4b60c184edfb34fb457154aff1d24702a916b933
parent6ebddd2d6558211a5e4a76b5906633b431e12006 (diff)
Vectorizer: Add support for loops with an unknown count. For example:
for (i=0; i<n; i++){ a[i] = b[i+1] + c[i+3]; } git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@166165 91177308-0d34-0410-b5e6-96231b3b80d8
-rw-r--r--lib/Transforms/Vectorize/LoopVectorize.cpp227
-rw-r--r--test/Transforms/LoopVectorize/gcc-examples.ll6
-rw-r--r--test/Transforms/LoopVectorize/non-const-n.ll2
3 files changed, 162 insertions, 73 deletions
diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index f84e3925f1d..80fdad30165 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -28,6 +28,8 @@
#include "llvm/Analysis/LoopPass.h"
#include "llvm/Value.h"
#include "llvm/Function.h"
+#include "llvm/Analysis/Dominators.h"
+#include "llvm/Analysis/Verifier.h"
#include "llvm/Module.h"
#include "llvm/Type.h"
#include "llvm/ADT/SmallVector.h"
@@ -65,8 +67,8 @@ public:
/// Ctor.
SingleBlockLoopVectorizer(Loop *OrigLoop, ScalarEvolution *Se, LoopInfo *Li,
- unsigned VecWidth):
- Orig(OrigLoop), SE(Se), LI(Li), VF(VecWidth),
+ LPPassManager *Lpm, unsigned VecWidth):
+ Orig(OrigLoop), SE(Se), LI(Li), LPM(Lpm), VF(VecWidth),
Builder(0), Induction(0), OldInduction(0) { }
~SingleBlockLoopVectorizer() {
@@ -76,20 +78,20 @@ public:
// Perform the actual loop widening (vectorization).
void vectorize() {
///Create a new empty loop. Unlink the old loop and connect the new one.
- copyEmptyLoop();
+ createEmptyLoop();
/// Widen each instruction in the old loop to a new one in the new loop.
vectorizeLoop();
- // Delete the old loop.
- deleteOldLoop();
+ // register the new loop.
+ cleanup();
}
private:
/// Create an empty loop, based on the loop ranges of the old loop.
- void copyEmptyLoop();
+ void createEmptyLoop();
/// Copy and widen the instructions from the old loop.
void vectorizeLoop();
- /// Delete the old loop.
- void deleteOldLoop();
+ /// Insert the new loop to the loop hierarchy and pass manager.
+ void cleanup();
/// This instruction is un-vectorizable. Implement it as a sequence
/// of scalars.
@@ -123,6 +125,8 @@ private:
ScalarEvolution *SE;
// Loop Info.
LoopInfo *LI;
+ // Loop Pass Manager;
+ LPPassManager *LPM;
// The vectorization factor to use.
unsigned VF;
@@ -132,9 +136,9 @@ private:
// --- Vectorization state ---
/// The new Induction variable which was added to the new block.
- Instruction *Induction;
+ PHINode *Induction;
/// The induction variable of the old basic block.
- Instruction *OldInduction;
+ PHINode *OldInduction;
// Maps scalars to widened vectors.
DenseMap<Value*, Value*> WidenMap;
};
@@ -184,6 +188,7 @@ struct LoopVectorize : public LoopPass {
ScalarEvolution *SE;
DataLayout *DL;
LoopInfo *LI;
+ DominatorTree *DT;
virtual bool runOnLoop(Loop *L, LPPassManager &LPM) {
// Only vectorize innermost loops.
@@ -194,6 +199,7 @@ struct LoopVectorize : public LoopPass {
SE = &getAnalysis<ScalarEvolution>();
DL = getAnalysisIfAvailable<DataLayout>();
LI = &getAnalysis<LoopInfo>();
+ DT = &getAnalysis<DominatorTree>();
DEBUG(dbgs() << "LV: Checking a loop in \"" <<
L->getHeader()->getParent()->getName() << "\"\n");
@@ -203,8 +209,7 @@ struct LoopVectorize : public LoopPass {
unsigned MaxVF = LVL.getLoopMaxVF();
// Check that we can vectorize using the chosen vectorization width.
- if ((MaxVF < DefaultVectorizationFactor) ||
- (MaxVF % DefaultVectorizationFactor)) {
+ if (MaxVF < DefaultVectorizationFactor) {
DEBUG(dbgs() << "LV: non-vectorizable MaxVF ("<< MaxVF << ").\n");
return false;
}
@@ -212,11 +217,10 @@ struct LoopVectorize : public LoopPass {
DEBUG(dbgs() << "LV: Found a vectorizable loop ("<< MaxVF << ").\n");
// If we decided that is is *legal* to vectorizer the loop. Do it.
- SingleBlockLoopVectorizer LB(L, SE, LI, DefaultVectorizationFactor);
+ SingleBlockLoopVectorizer LB(L, SE, LI, &LPM, DefaultVectorizationFactor);
LB.vectorize();
- // The loop is now vectorized. Remove it from LMP.
- LPM.deleteLoopFromQueue(L);
+ DEBUG(verifyFunction(*L->getHeader()->getParent()));
return true;
}
@@ -226,6 +230,7 @@ struct LoopVectorize : public LoopPass {
AU.addRequired<AliasAnalysis>();
AU.addRequired<LoopInfo>();
AU.addRequired<ScalarEvolution>();
+ AU.addRequired<DominatorTree>();
}
};
@@ -327,7 +332,7 @@ void SingleBlockLoopVectorizer::scalarizeInstruction(Instruction *Instr) {
Instruction *SrcInst = dyn_cast<Instruction>(SrcOp);
// If the src is an instruction that appeared earlier in the basic block
- // then it should already be vectorized.
+ // then it should already be vectorized.
if (SrcInst && SrcInst->getParent() == Instr->getParent()) {
assert(WidenMap.count(SrcInst) && "Source operand is unavailable");
// The parameter is a vector value from earlier.
@@ -378,28 +383,71 @@ void SingleBlockLoopVectorizer::scalarizeInstruction(Instruction *Instr) {
WidenMap[Instr] = VecResults;
}
-void SingleBlockLoopVectorizer::copyEmptyLoop() {
- assert(Orig->getNumBlocks() == 1 && "Invalid loop");
- BasicBlock *PH = Orig->getLoopPreheader();
+void SingleBlockLoopVectorizer::createEmptyLoop() {
+ /*
+ In this function we generate a new loop. The new loop will contain
+ the vectorized instructions while the old loop will continue to run the
+ scalar remainder.
+
+ [ ] <-- vector loop bypass.
+ / |
+ / v
+| [ ] <-- vector pre header.
+| |
+| v
+| [ ] \
+| [ ]_| <-- vector loop.
+| |
+ \ v
+ >[ ] <--- middle-block.
+ / |
+ / v
+| [ ] <--- new preheader.
+| |
+| v
+| [ ] \
+| [ ]_| <-- old scalar loop to handle remainder. ()
+ \ |
+ \ v
+ >[ ] <-- exit block.
+ ...
+ */
+
+ // This is the original scalar-loop preheader.
+ BasicBlock *BypassBlock = Orig->getLoopPreheader();
BasicBlock *ExitBlock = Orig->getExitBlock();
- assert(ExitBlock && "Invalid loop exit");
+ assert(ExitBlock && "Must have an exit block");
+
+ BasicBlock *ScalarBody = Orig->getHeader();
+ assert(Orig->getNumBlocks() == 1 && "Invalid loop");
+ assert(ScalarBody && BypassBlock && "Invalid loop structure");
+
+ BasicBlock *VectorPH =
+ BypassBlock->splitBasicBlock(BypassBlock->getTerminator(), "vector.ph");
+ BasicBlock *VecBody = VectorPH->splitBasicBlock(VectorPH->getTerminator(),
+ "vector.body");
- // Create a new single-basic block loop.
- BasicBlock *BB = BasicBlock::Create(PH->getContext(), "vectorizedloop",
- PH->getParent(), ExitBlock);
+ BasicBlock *MiddleBlock = VecBody->splitBasicBlock(VecBody->getTerminator(),
+ "middle.block");
+
+
+ BasicBlock *ScalarPH =
+ MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(),
+ "scalar.preheader");
// Find the induction variable.
BasicBlock *OldBasicBlock = Orig->getHeader();
- PHINode *OldInd = dyn_cast<PHINode>(OldBasicBlock->begin());
- assert(OldInd && "We must have a single phi node.");
- Type *IdxTy = OldInd->getType();
+ OldInduction = dyn_cast<PHINode>(OldBasicBlock->begin());
+ assert(OldInduction && "We must have a single phi node.");
+ Type *IdxTy = OldInduction->getType();
// Use this IR builder to create the loop instructions (Phi, Br, Cmp)
// inside the loop.
- Builder = new IRBuilder<>(BB);
+ Builder = new IRBuilder<>(VecBody);
+ Builder->SetInsertPoint(VecBody->getFirstInsertionPt());
// Generate the induction variable.
- PHINode *Phi = Builder->CreatePHI(IdxTy, 2, "index");
+ Induction = Builder->CreatePHI(IdxTy, 2, "index");
Constant *Zero = ConstantInt::get(IdxTy, 0);
Constant *Step = ConstantInt::get(IdxTy, VF);
@@ -407,32 +455,78 @@ void SingleBlockLoopVectorizer::copyEmptyLoop() {
const SCEV *ExitCount = SE->getExitCount(Orig, Orig->getHeader());
assert(ExitCount != SE->getCouldNotCompute() && "Invalid loop count");
- // Get the trip count from the count by adding 1.
+ // Get the total trip count from the count by adding 1.
ExitCount = SE->getAddExpr(ExitCount,
SE->getConstant(ExitCount->getType(), 1));
// Expand the trip count and place the new instructions in the preheader.
// Notice that the pre-header does not change, only the loop body.
SCEVExpander Exp(*SE, "induction");
- Instruction *Loc = Orig->getLoopPreheader()->getTerminator();
- if (ExitCount->getType() != Phi->getType())
- ExitCount = SE->getSignExtendExpr(ExitCount, Phi->getType());
- Value *Count = Exp.expandCodeFor(ExitCount, Phi->getType(), Loc);
-
+ Instruction *Loc = BypassBlock->getTerminator();
+
+ // We may need to extend the index in case there is a type mismatch.
+ // We know that the count starts at zero and does not overflow.
+ // We are using Zext because it should be less expensive.
+ if (ExitCount->getType() != Induction->getType())
+ ExitCount = SE->getZeroExtendExpr(ExitCount, IdxTy);
+
+ // Count holds the overall loop count (N).
+ Value *Count = Exp.expandCodeFor(ExitCount, Induction->getType(), Loc);
+ // Now we need to generate the expression for N - (N % VF), which is
+ // the part that the vectorized body will execute.
+ Constant *CIVF = ConstantInt::get(IdxTy, VF);
+ Value *R = BinaryOperator::CreateURem(Count, CIVF, "n.mod.vf", Loc);
+ Value *CountRoundDown = BinaryOperator::CreateSub(Count, R, "n.vec", Loc);
+
+ // Now, compare the new count to zero. If it is zero, jump to the scalar part.
+ Value *Cmp = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ,
+ CountRoundDown, ConstantInt::getNullValue(IdxTy),
+ "cmp.zero", Loc);
+ BranchInst::Create(MiddleBlock, VectorPH, Cmp, Loc);
+ // Remove the old terminator.
+ Loc->eraseFromParent();
+
+ // Add a check in the middle block to see if we have completed
+ // all of the iterations in the first vector loop.
+ // If (N - N%VF) == N, then we *don't* need to run the remainder.
+ Value *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count,
+ CountRoundDown, "cmp.n",
+ MiddleBlock->getTerminator());
+
+ BranchInst::Create(ExitBlock, ScalarPH, CmpN, MiddleBlock->getTerminator());
+ // Remove the old terminator.
+ MiddleBlock->getTerminator()->eraseFromParent();
+
// Create i+1 and fill the PHINode.
- Value *Next = Builder->CreateAdd(Phi, Step, "index.next");
- Phi->addIncoming(Zero, PH);
- Phi->addIncoming(Next, BB);
+ Value *NextIdx = Builder->CreateAdd(Induction, Step, "index.next");
+ Induction->addIncoming(Zero, VectorPH);
+ Induction->addIncoming(NextIdx, VecBody);
// Create the compare.
- Value *ICmp = Builder->CreateICmpEQ(Next, Count);
- Builder->CreateCondBr(ICmp, ExitBlock, BB);
- // Fix preheader.
- PH->getTerminator()->setSuccessor(0, BB);
- Builder->SetInsertPoint(BB->getFirstInsertionPt());
-
- // Save the induction variables.
- Induction = Phi;
- OldInduction = OldInd;
+ Value *ICmp = Builder->CreateICmpEQ(NextIdx, CountRoundDown);
+ Builder->CreateCondBr(ICmp, MiddleBlock, VecBody);
+
+ // Now we have two terminators. Remove the old one from the block.
+ VecBody->getTerminator()->eraseFromParent();
+
+ // Fix the scalar body iteration count.
+ unsigned BlockIdx = OldInduction->getBasicBlockIndex(ScalarPH);
+ OldInduction->setIncomingValue(BlockIdx, CountRoundDown);
+
+ // Get ready to start creating new instructions into the vectorized body.
+ Builder->SetInsertPoint(VecBody->getFirstInsertionPt());
+
+ // Register the new loop.
+ Loop* Lp = new Loop();
+ LPM->insertLoop(Lp, Orig->getParentLoop());
+
+ Lp->addBasicBlockToLoop(VecBody, LI->getBase());
+
+ Loop *ParentLoop = Orig->getParentLoop();
+ if (ParentLoop) {
+ ParentLoop->addBasicBlockToLoop(ScalarPH, LI->getBase());
+ ParentLoop->addBasicBlockToLoop(VectorPH, LI->getBase());
+ ParentLoop->addBasicBlockToLoop(MiddleBlock, LI->getBase());
+ }
}
void SingleBlockLoopVectorizer::vectorizeLoop() {
@@ -575,16 +669,9 @@ void SingleBlockLoopVectorizer::vectorizeLoop() {
}// end of for_each instr.
}
-void SingleBlockLoopVectorizer::deleteOldLoop() {
+void SingleBlockLoopVectorizer::cleanup() {
// The original basic block.
- BasicBlock *BB = Orig->getHeader();
SE->forgetLoop(Orig);
-
- LI->removeBlock(BB);
- Orig->addBasicBlockToLoop(Induction->getParent(), LI->getBase());
-
- // Remove the old loop block.
- DeleteDeadBlock(BB);
}
unsigned LoopVectorizationLegality::getLoopMaxVF() {
@@ -605,26 +692,25 @@ unsigned LoopVectorizationLegality::getLoopMaxVF() {
BasicBlock *BB = TheLoop->getHeader();
DEBUG(dbgs() << "LV: Found a loop: " << BB->getName() << "\n");
- // Find the max vectorization factor.
- unsigned MaxVF = SE->getSmallConstantTripMultiple(TheLoop, BB);
-
-
- // Perform an early check. Do not scan the block if we did not find a loop.
- if (MaxVF < 2) {
- DEBUG(dbgs() << "LV: Can't find a vectorizable loop structure\n");
- return 1;
- }
-
// Go over each instruction and look at memory deps.
if (!canVectorizeBlock(*BB)) {
DEBUG(dbgs() << "LV: Can't vectorize this loop header\n");
return 1;
}
- DEBUG(dbgs() << "LV: We can vectorize this loop! VF="<<MaxVF<<"\n");
-
- // Okay! We can vectorize. Return the max trip multiple.
- return MaxVF;
+ // ScalarEvolution needs to be able to find the exit count.
+ const SCEV *ExitCount = SE->getExitCount(TheLoop, BB);
+ if (ExitCount == SE->getCouldNotCompute()) {
+ DEBUG(dbgs() << "LV: SCEV could not compute the loop exit count.\n");
+ return 1;
+ }
+
+ DEBUG(dbgs() << "LV: We can vectorize this loop!\n");
+
+ // Okay! We can vectorize. At this point we don't have any other mem analysis
+ // which may limit our maximum vectorization factor, so just return the
+ // maximum SIMD size.
+ return DefaultVectorizationFactor;
}
bool LoopVectorizationLegality::canVectorizeBlock(BasicBlock &BB) {
@@ -725,6 +811,11 @@ bool LoopVectorizationLegality::canVectorizeBlock(BasicBlock &BB) {
}
} // next instr.
+ if (NumPhis != 1) {
+ DEBUG(dbgs() << "LV: Did not find a Phi node.\n");
+ return false;
+ }
+
// Check that the underlying objects of the reads and writes are either
// disjoint memory locations, or that they are no-alias arguments.
ValueVector::iterator r, re, w, we;
diff --git a/test/Transforms/LoopVectorize/gcc-examples.ll b/test/Transforms/LoopVectorize/gcc-examples.ll
index 68eab9bae76..4e9e6f940ec 100644
--- a/test/Transforms/LoopVectorize/gcc-examples.ll
+++ b/test/Transforms/LoopVectorize/gcc-examples.ll
@@ -45,9 +45,8 @@ define void @example1() nounwind uwtable ssp {
ret void
}
-; We can't vectorize this loop because it has non constant loop bounds.
;CHECK: @example2
-;CHECK-NOT: <4 x i32>
+;CHECK: store <4 x i32>
;CHECK: ret void
define void @example2(i32 %n, i32 %x) nounwind uwtable ssp {
%1 = icmp sgt i32 %n, 0
@@ -114,9 +113,8 @@ define void @example3(i32 %n, i32* noalias nocapture %p, i32* noalias nocapture
ret void
}
-; We can't vectorize this loop because it has non constant loop bounds.
;CHECK: @example4
-;CHECK-NOT: <4 x i32>
+;CHECK: load <4 x i32>
;CHECK: ret void
define void @example4(i32 %n, i32* noalias nocapture %p, i32* noalias nocapture %q) nounwind uwtable ssp {
%1 = add nsw i32 %n, -1
diff --git a/test/Transforms/LoopVectorize/non-const-n.ll b/test/Transforms/LoopVectorize/non-const-n.ll
index 76835b76603..04c5c84a4f4 100644
--- a/test/Transforms/LoopVectorize/non-const-n.ll
+++ b/test/Transforms/LoopVectorize/non-const-n.ll
@@ -9,7 +9,7 @@ target triple = "x86_64-apple-macosx10.8.0"
;CHECK: @example1
;CHECK: shl i32
-;CHECK: sext i32
+;CHECK: zext i32
;CHECK: load <4 x i32>
;CHECK: add <4 x i32>
;CHECK: store <4 x i32>