summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--lib/Target/R600/SIInstrInfo.cpp51
1 files changed, 41 insertions, 10 deletions
diff --git a/lib/Target/R600/SIInstrInfo.cpp b/lib/Target/R600/SIInstrInfo.cpp
index 1a0010c03dc..07d4f869bb2 100644
--- a/lib/Target/R600/SIInstrInfo.cpp
+++ b/lib/Target/R600/SIInstrInfo.cpp
@@ -272,20 +272,51 @@ bool SIInstrInfo::shouldClusterLoads(MachineInstr *FirstLdSt,
unsigned Opc0 = FirstLdSt->getOpcode();
unsigned Opc1 = SecondLdSt->getOpcode();
- // TODO: This needs finer tuning
- if (NumLoads > 4)
- return false;
+ const MachineOperand *FirstDst = nullptr;
+ const MachineOperand *SecondDst = nullptr;
- if (isDS(Opc0) && isDS(Opc1))
- return true;
+ if (isDS(Opc0) && isDS(Opc1)) {
+ FirstDst = getNamedOperand(*FirstLdSt, AMDGPU::OpName::vdst);
+ SecondDst = getNamedOperand(*SecondLdSt, AMDGPU::OpName::vdst);
+ }
- if (isSMRD(Opc0) && isSMRD(Opc1))
- return true;
+ if (isSMRD(Opc0) && isSMRD(Opc1)) {
+ FirstDst = getNamedOperand(*FirstLdSt, AMDGPU::OpName::dst);
+ SecondDst = getNamedOperand(*SecondLdSt, AMDGPU::OpName::dst);
+ }
- if ((isMUBUF(Opc0) || isMTBUF(Opc0)) && (isMUBUF(Opc1) || isMTBUF(Opc1)))
- return true;
+ if ((isMUBUF(Opc0) || isMTBUF(Opc0)) && (isMUBUF(Opc1) || isMTBUF(Opc1))) {
+ FirstDst = getNamedOperand(*FirstLdSt, AMDGPU::OpName::vdata);
+ SecondDst = getNamedOperand(*SecondLdSt, AMDGPU::OpName::vdata);
+ }
- return false;
+ if (!FirstDst || !SecondDst)
+ return false;
+
+ // TODO: This needs finer tuning
+ unsigned LoadClusterThreshold = 4;
+
+ // Check if these two loads are writing to different sub-registers of the
+ // same super-register.
+ if (FirstDst->getReg() == SecondDst->getReg() &&
+ FirstDst->getSubReg() != SecondDst->getSubReg()) {
+ // These loads share a super-register. We want to cluster them, because
+ // LLVM does not currently support sub-register liveness tracking, so
+ // if these two loads end up far apart, the live range of the super
+ // register could be very big. This can be especially bad when we have a
+ // kernel that does several large vector loads.
+ const MachineRegisterInfo &MRI =
+ FirstLdSt->getParent()->getParent()->getRegInfo();
+ const TargetRegisterClass *DstRC = MRI.getRegClass(FirstDst->getReg());
+ const TargetRegisterClass *SubRC = FirstDst->getSubReg() ?
+ RI.getSubClassWithSubReg(DstRC, FirstDst->getSubReg()) : DstRC;
+
+ LoadClusterThreshold =
+ std::max(LoadClusterThreshold, DstRC->getSize() / SubRC->getSize());
+
+ }
+
+ return NumLoads <= LoadClusterThreshold;
}
void