summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorVincent Lejeune <vljn@ovi.com>2013-06-04 23:17:26 +0000
committerVincent Lejeune <vljn@ovi.com>2013-06-04 23:17:26 +0000
commitbbbdba891b8a90d676fd251f2a4bae3bef061550 (patch)
tree3c0e2f36a86cb38311e63c8a97efc14e61482684
parente67a4afb5da59c02338622eea68e096ba143113f (diff)
R600: Add a pass that merge Vector Register
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@183279 91177308-0d34-0410-b5e6-96231b3b80d8
-rw-r--r--lib/Target/R600/AMDGPU.h1
-rw-r--r--lib/Target/R600/AMDGPUTargetMachine.cpp5
-rw-r--r--lib/Target/R600/CMakeLists.txt1
-rw-r--r--lib/Target/R600/R600OptimizeVectorRegisters.cpp363
-rw-r--r--test/CodeGen/R600/texture-input-merge.ll30
5 files changed, 400 insertions, 0 deletions
diff --git a/lib/Target/R600/AMDGPU.h b/lib/Target/R600/AMDGPU.h
index f9d70c9c5a3..d3ef4260d65 100644
--- a/lib/Target/R600/AMDGPU.h
+++ b/lib/Target/R600/AMDGPU.h
@@ -23,6 +23,7 @@ class AMDGPUTargetMachine;
// R600 Passes
FunctionPass* createR600TextureIntrinsicsReplacer();
FunctionPass* createR600KernelParametersPass(const DataLayout *TD);
+FunctionPass *createR600VectorRegMerger(TargetMachine &tm);
FunctionPass *createR600ExpandSpecialInstrsPass(TargetMachine &tm);
FunctionPass *createR600EmitClauseMarkers(TargetMachine &tm);
FunctionPass *createR600Packetizer(TargetMachine &tm);
diff --git a/lib/Target/R600/AMDGPUTargetMachine.cpp b/lib/Target/R600/AMDGPUTargetMachine.cpp
index 88dc5832905..c52af56175e 100644
--- a/lib/Target/R600/AMDGPUTargetMachine.cpp
+++ b/lib/Target/R600/AMDGPUTargetMachine.cpp
@@ -130,6 +130,11 @@ bool AMDGPUPassConfig::addInstSelector() {
bool AMDGPUPassConfig::addPreRegAlloc() {
addPass(createAMDGPUConvertToISAPass(*TM));
+ const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>();
+
+ if (ST.device()->getGeneration() <= AMDGPUDeviceInfo::HD6XXX) {
+ addPass(createR600VectorRegMerger(*TM));
+ }
return false;
}
diff --git a/lib/Target/R600/CMakeLists.txt b/lib/Target/R600/CMakeLists.txt
index c5ce9dc789e..558d001af2e 100644
--- a/lib/Target/R600/CMakeLists.txt
+++ b/lib/Target/R600/CMakeLists.txt
@@ -41,6 +41,7 @@ add_llvm_target(R600CodeGen
R600ISelLowering.cpp
R600MachineFunctionInfo.cpp
R600MachineScheduler.cpp
+ R600OptimizeVectorRegisters.cpp
R600Packetizer.cpp
R600RegisterInfo.cpp
R600TextureIntrinsicsReplacer.cpp
diff --git a/lib/Target/R600/R600OptimizeVectorRegisters.cpp b/lib/Target/R600/R600OptimizeVectorRegisters.cpp
new file mode 100644
index 00000000000..6811dbc8790
--- /dev/null
+++ b/lib/Target/R600/R600OptimizeVectorRegisters.cpp
@@ -0,0 +1,363 @@
+//===--------------------- R600MergeVectorRegisters.cpp -------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This pass merges inputs of swizzeable instructions into vector sharing
+/// common data and/or have enough undef subreg using swizzle abilities.
+///
+/// For instance let's consider the following pseudo code :
+/// vreg5<def> = REG_SEQ vreg1, sub0, vreg2, sub1, vreg3, sub2, undef, sub3
+/// ...
+/// vreg7<def> = REG_SEQ vreg1, sub0, vreg3, sub1, undef, sub2, vreg4, sub3
+/// (swizzable Inst) vreg7, SwizzleMask : sub0, sub1, sub2, sub3
+///
+/// is turned into :
+/// vreg5<def> = REG_SEQ vreg1, sub0, vreg2, sub1, vreg3, sub2, undef, sub3
+/// ...
+/// vreg7<def> = INSERT_SUBREG vreg4, sub3
+/// (swizzable Inst) vreg7, SwizzleMask : sub0, sub2, sub1, sub3
+///
+/// This allow regalloc to reduce register pressure for vector registers and
+/// to reduce MOV count.
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "vec-merger"
+#include "llvm/Support/Debug.h"
+#include "AMDGPU.h"
+#include "R600InstrInfo.h"
+#include "llvm/CodeGen/DFAPacketizer.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+
+using namespace llvm;
+
+namespace {
+
+static bool
+isImplicitlyDef(MachineRegisterInfo &MRI, unsigned Reg) {
+ for (MachineRegisterInfo::def_iterator It = MRI.def_begin(Reg),
+ E = MRI.def_end(); It != E; ++It) {
+ return (*It).isImplicitDef();
+ }
+ llvm_unreachable("Reg without a def");
+ return false;
+}
+
+class RegSeqInfo {
+public:
+ MachineInstr *Instr;
+ DenseMap<unsigned, unsigned> RegToChan;
+ std::vector<unsigned> UndefReg;
+ RegSeqInfo(MachineRegisterInfo &MRI, MachineInstr *MI) : Instr(MI) {
+ assert (MI->getOpcode() == AMDGPU::REG_SEQUENCE);
+ for (unsigned i = 1, e = Instr->getNumOperands(); i < e; i+=2) {
+ MachineOperand &MO = Instr->getOperand(i);
+ unsigned Chan = Instr->getOperand(i + 1).getImm();
+ if (isImplicitlyDef(MRI, MO.getReg()))
+ UndefReg.push_back(Chan);
+ else
+ RegToChan[MO.getReg()] = Chan;
+ }
+ }
+ RegSeqInfo() {}
+
+ bool operator==(const RegSeqInfo &RSI) const {
+ return RSI.Instr == Instr;
+ }
+};
+
+class R600VectorRegMerger : public MachineFunctionPass {
+private:
+ MachineRegisterInfo *MRI;
+ const R600InstrInfo *TII;
+ bool canSwizzle(const MachineInstr &) const;
+ bool areAllUsesSwizzeable(unsigned Reg) const;
+ void SwizzleInput(MachineInstr &,
+ const std::vector<std::pair<unsigned, unsigned> > &) const;
+ bool tryMergeVector(const RegSeqInfo *, RegSeqInfo *,
+ std::vector<std::pair<unsigned, unsigned> > &Remap) const;
+ bool tryMergeUsingCommonSlot(RegSeqInfo &RSI, RegSeqInfo &CompatibleRSI,
+ std::vector<std::pair<unsigned, unsigned> > &RemapChan);
+ bool tryMergeUsingFreeSlot(RegSeqInfo &RSI, RegSeqInfo &CompatibleRSI,
+ std::vector<std::pair<unsigned, unsigned> > &RemapChan);
+ MachineInstr *RebuildVector(RegSeqInfo *MI,
+ const RegSeqInfo *BaseVec,
+ const std::vector<std::pair<unsigned, unsigned> > &RemapChan) const;
+ void RemoveMI(MachineInstr *);
+ void trackRSI(const RegSeqInfo &RSI);
+
+ typedef DenseMap<unsigned, std::vector<MachineInstr *> > InstructionSetMap;
+ DenseMap<MachineInstr *, RegSeqInfo> PreviousRegSeq;
+ InstructionSetMap PreviousRegSeqByReg;
+ InstructionSetMap PreviousRegSeqByUndefCount;
+public:
+ static char ID;
+ R600VectorRegMerger(TargetMachine &tm) : MachineFunctionPass(ID),
+ TII (static_cast<const R600InstrInfo *>(tm.getInstrInfo())) { }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesCFG();
+ AU.addRequired<MachineDominatorTree>();
+ AU.addPreserved<MachineDominatorTree>();
+ AU.addRequired<MachineLoopInfo>();
+ AU.addPreserved<MachineLoopInfo>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ const char *getPassName() const {
+ return "R600 Vector Registers Merge Pass";
+ }
+
+ bool runOnMachineFunction(MachineFunction &Fn);
+};
+
+char R600VectorRegMerger::ID = 0;
+
+bool R600VectorRegMerger::canSwizzle(const MachineInstr &MI)
+ const {
+ if (TII->get(MI.getOpcode()).TSFlags & R600_InstFlag::TEX_INST)
+ return true;
+ switch (MI.getOpcode()) {
+ case AMDGPU::R600_ExportSwz:
+ case AMDGPU::EG_ExportSwz:
+ return true;
+ default:
+ return false;
+ }
+}
+
+bool R600VectorRegMerger::tryMergeVector(const RegSeqInfo *Untouched,
+ RegSeqInfo *ToMerge, std::vector< std::pair<unsigned, unsigned> > &Remap)
+ const {
+ unsigned CurrentUndexIdx = 0;
+ for (DenseMap<unsigned, unsigned>::iterator It = ToMerge->RegToChan.begin(),
+ E = ToMerge->RegToChan.end(); It != E; ++It) {
+ DenseMap<unsigned, unsigned>::const_iterator PosInUntouched =
+ Untouched->RegToChan.find((*It).first);
+ if (PosInUntouched != Untouched->RegToChan.end()) {
+ Remap.push_back(std::pair<unsigned, unsigned>
+ ((*It).second, (*PosInUntouched).second));
+ continue;
+ }
+ if (CurrentUndexIdx >= Untouched->UndefReg.size())
+ return false;
+ Remap.push_back(std::pair<unsigned, unsigned>
+ ((*It).second, Untouched->UndefReg[CurrentUndexIdx++]));
+ }
+
+ return true;
+}
+
+MachineInstr *R600VectorRegMerger::RebuildVector(
+ RegSeqInfo *RSI, const RegSeqInfo *BaseRSI,
+ const std::vector<std::pair<unsigned, unsigned> > &RemapChan) const {
+ unsigned Reg = RSI->Instr->getOperand(0).getReg();
+ MachineBasicBlock::iterator Pos = RSI->Instr;
+ MachineBasicBlock &MBB = *Pos->getParent();
+ DebugLoc DL = Pos->getDebugLoc();
+
+ unsigned SrcVec = BaseRSI->Instr->getOperand(0).getReg();
+ DenseMap<unsigned, unsigned> UpdatedRegToChan = BaseRSI->RegToChan;
+ std::vector<unsigned> UpdatedUndef = BaseRSI->UndefReg;
+ for (DenseMap<unsigned, unsigned>::iterator It = RSI->RegToChan.begin(),
+ E = RSI->RegToChan.end(); It != E; ++It) {
+ if (BaseRSI->RegToChan.find((*It).first) != BaseRSI->RegToChan.end()) {
+ UpdatedRegToChan[(*It).first] = (*It).second;
+ continue;
+ }
+ unsigned DstReg = MRI->createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
+ unsigned SubReg = (*It).first;
+ unsigned Swizzle = (*It).second;
+ unsigned Chan;
+ for (unsigned j = 0, je = RemapChan.size(); j < je; j++) {
+ if (RemapChan[j].first == Swizzle) {
+ Chan = RemapChan[j].second;
+ break;
+ }
+ }
+ MachineInstr *Tmp = BuildMI(MBB, Pos, DL, TII->get(AMDGPU::INSERT_SUBREG),
+ DstReg)
+ .addReg(SrcVec)
+ .addReg(SubReg)
+ .addImm(Chan);
+ UpdatedRegToChan[SubReg] = Chan;
+ for (std::vector<unsigned>::iterator RemoveIt = UpdatedUndef.begin(),
+ RemoveE = UpdatedUndef.end(); RemoveIt != RemoveE; ++ RemoveIt) {
+ if (*RemoveIt == Chan)
+ UpdatedUndef.erase(RemoveIt);
+ }
+ DEBUG(dbgs() << " ->"; Tmp->dump(););
+ SrcVec = DstReg;
+ }
+ Pos = BuildMI(MBB, Pos, DL, TII->get(AMDGPU::COPY), Reg)
+ .addReg(SrcVec);
+ DEBUG(dbgs() << " ->"; Pos->dump(););
+
+ DEBUG(dbgs() << " Updating Swizzle:\n");
+ for (MachineRegisterInfo::use_iterator It = MRI->use_begin(Reg),
+ E = MRI->use_end(); It != E; ++It) {
+ DEBUG(dbgs() << " ";(*It).dump(); dbgs() << " ->");
+ SwizzleInput(*It, RemapChan);
+ DEBUG((*It).dump());
+ }
+ RSI->Instr->eraseFromParent();
+
+ // Update RSI
+ RSI->Instr = Pos;
+ RSI->RegToChan = UpdatedRegToChan;
+ RSI->UndefReg = UpdatedUndef;
+
+ return Pos;
+}
+
+void R600VectorRegMerger::RemoveMI(MachineInstr *MI) {
+ for (InstructionSetMap::iterator It = PreviousRegSeqByReg.begin(),
+ E = PreviousRegSeqByReg.end(); It != E; ++It) {
+ std::vector<MachineInstr *> &MIs = (*It).second;
+ MIs.erase(std::find(MIs.begin(), MIs.end(), MI), MIs.end());
+ }
+ for (InstructionSetMap::iterator It = PreviousRegSeqByUndefCount.begin(),
+ E = PreviousRegSeqByUndefCount.end(); It != E; ++It) {
+ std::vector<MachineInstr *> &MIs = (*It).second;
+ MIs.erase(std::find(MIs.begin(), MIs.end(), MI), MIs.end());
+ }
+}
+
+void R600VectorRegMerger::SwizzleInput(MachineInstr &MI,
+ const std::vector<std::pair<unsigned, unsigned> > &RemapChan) const {
+ unsigned Offset;
+ if (TII->get(MI.getOpcode()).TSFlags & R600_InstFlag::TEX_INST)
+ Offset = 2;
+ else
+ Offset = 3;
+ for (unsigned i = 0; i < 4; i++) {
+ unsigned Swizzle = MI.getOperand(i + Offset).getImm() + 1;
+ for (unsigned j = 0, e = RemapChan.size(); j < e; j++) {
+ if (RemapChan[j].first == Swizzle) {
+ MI.getOperand(i + Offset).setImm(RemapChan[j].second - 1);
+ break;
+ }
+ }
+ }
+}
+
+bool R600VectorRegMerger::areAllUsesSwizzeable(unsigned Reg) const {
+ for (MachineRegisterInfo::use_iterator It = MRI->use_begin(Reg),
+ E = MRI->use_end(); It != E; ++It) {
+ if (!canSwizzle(*It))
+ return false;
+ }
+ return true;
+}
+
+bool R600VectorRegMerger::tryMergeUsingCommonSlot(RegSeqInfo &RSI,
+ RegSeqInfo &CompatibleRSI,
+ std::vector<std::pair<unsigned, unsigned> > &RemapChan) {
+ for (MachineInstr::mop_iterator MOp = RSI.Instr->operands_begin(),
+ MOE = RSI.Instr->operands_end(); MOp != MOE; ++MOp) {
+ if (!MOp->isReg())
+ continue;
+ if (PreviousRegSeqByReg[MOp->getReg()].empty())
+ continue;
+ std::vector<MachineInstr *> MIs = PreviousRegSeqByReg[MOp->getReg()];
+ for (unsigned i = 0, e = MIs.size(); i < e; i++) {
+ CompatibleRSI = PreviousRegSeq[MIs[i]];
+ if (RSI == CompatibleRSI)
+ continue;
+ if (tryMergeVector(&CompatibleRSI, &RSI, RemapChan))
+ return true;
+ }
+ }
+ return false;
+}
+
+bool R600VectorRegMerger::tryMergeUsingFreeSlot(RegSeqInfo &RSI,
+ RegSeqInfo &CompatibleRSI,
+ std::vector<std::pair<unsigned, unsigned> > &RemapChan) {
+ unsigned NeededUndefs = 4 - RSI.UndefReg.size();
+ if (PreviousRegSeqByUndefCount[NeededUndefs].empty())
+ return false;
+ std::vector<MachineInstr *> &MIs =
+ PreviousRegSeqByUndefCount[NeededUndefs];
+ CompatibleRSI = PreviousRegSeq[MIs.back()];
+ tryMergeVector(&CompatibleRSI, &RSI, RemapChan);
+ return true;
+}
+
+void R600VectorRegMerger::trackRSI(const RegSeqInfo &RSI) {
+ for (DenseMap<unsigned, unsigned>::const_iterator
+ It = RSI.RegToChan.begin(), E = RSI.RegToChan.end(); It != E; ++It) {
+ PreviousRegSeqByReg[(*It).first].push_back(RSI.Instr);
+ }
+ PreviousRegSeqByUndefCount[RSI.UndefReg.size()].push_back(RSI.Instr);
+ PreviousRegSeq[RSI.Instr] = RSI;
+}
+
+bool R600VectorRegMerger::runOnMachineFunction(MachineFunction &Fn) {
+ MRI = &(Fn.getRegInfo());
+ for (MachineFunction::iterator MBB = Fn.begin(), MBBe = Fn.end();
+ MBB != MBBe; ++MBB) {
+ MachineBasicBlock *MB = MBB;
+ PreviousRegSeq.clear();
+ PreviousRegSeqByReg.clear();
+ PreviousRegSeqByUndefCount.clear();
+
+ for (MachineBasicBlock::iterator MII = MB->begin(), MIIE = MB->end();
+ MII != MIIE; ++MII) {
+ MachineInstr *MI = MII;
+ if (MI->getOpcode() != AMDGPU::REG_SEQUENCE)
+ continue;
+
+ RegSeqInfo RSI(*MRI, MI);
+
+ // All uses of MI are swizzeable ?
+ unsigned Reg = MI->getOperand(0).getReg();
+ if (!areAllUsesSwizzeable(Reg))
+ continue;
+
+ DEBUG (dbgs() << "Trying to optimize ";
+ MI->dump();
+ );
+
+ RegSeqInfo CandidateRSI;
+ std::vector<std::pair<unsigned, unsigned> > RemapChan;
+ DEBUG(dbgs() << "Using common slots...\n";);
+ if (tryMergeUsingCommonSlot(RSI, CandidateRSI, RemapChan)) {
+ // Remove CandidateRSI mapping
+ RemoveMI(CandidateRSI.Instr);
+ MII = RebuildVector(&RSI, &CandidateRSI, RemapChan);
+ trackRSI(RSI);
+ continue;
+ }
+ DEBUG(dbgs() << "Using free slots...\n";);
+ RemapChan.clear();
+ if (tryMergeUsingFreeSlot(RSI, CandidateRSI, RemapChan)) {
+ RemoveMI(CandidateRSI.Instr);
+ MII = RebuildVector(&RSI, &CandidateRSI, RemapChan);
+ trackRSI(RSI);
+ continue;
+ }
+ //Failed to merge
+ trackRSI(RSI);
+ }
+ }
+ return false;
+}
+
+}
+
+llvm::FunctionPass *llvm::createR600VectorRegMerger(TargetMachine &tm) {
+ return new R600VectorRegMerger(tm);
+}
+
diff --git a/test/CodeGen/R600/texture-input-merge.ll b/test/CodeGen/R600/texture-input-merge.ll
new file mode 100644
index 00000000000..5d0ecef3069
--- /dev/null
+++ b/test/CodeGen/R600/texture-input-merge.ll
@@ -0,0 +1,30 @@
+;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+
+;CHECK-NOT: MOV
+
+define void @test() {
+ %1 = call float @llvm.R600.load.input(i32 0)
+ %2 = call float @llvm.R600.load.input(i32 1)
+ %3 = call float @llvm.R600.load.input(i32 2)
+ %4 = call float @llvm.R600.load.input(i32 3)
+ %5 = fmul float %1, 3.0
+ %6 = fmul float %2, 3.0
+ %7 = fmul float %3, 3.0
+ %8 = fmul float %4, 3.0
+ %9 = insertelement <4 x float> undef, float %5, i32 0
+ %10 = insertelement <4 x float> %9, float %6, i32 1
+ %11 = insertelement <4 x float> undef, float %7, i32 0
+ %12 = insertelement <4 x float> %11, float %5, i32 1
+ %13 = insertelement <4 x float> undef, float %8, i32 0
+ %14 = call <4 x float> @llvm.R600.tex(<4 x float> %10, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ %15 = call <4 x float> @llvm.R600.tex(<4 x float> %12, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ %16 = call <4 x float> @llvm.R600.tex(<4 x float> %13, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ %17 = fadd <4 x float> %14, %15
+ %18 = fadd <4 x float> %17, %16
+ call void @llvm.R600.store.swizzle(<4 x float> %18, i32 0, i32 0)
+ ret void
+}
+
+declare float @llvm.R600.load.input(i32) readnone
+declare <4 x float> @llvm.R600.tex(<4 x float>, i32, i32, i32, i32, i32, i32, i32, i32, i32) readnone
+declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)