1 files changed, 41 insertions, 10 deletions
diff --git a/lib/Target/R600/SIInstrInfo.cpp b/lib/Target/R600/SIInstrInfo.cpp
index 1a0010c03dc..07d4f869bb2 100644
--- a/lib/Target/R600/SIInstrInfo.cpp
+++ b/lib/Target/R600/SIInstrInfo.cpp
@@ -272,20 +272,51 @@ bool SIInstrInfo::shouldClusterLoads(MachineInstr *FirstLdSt,
   unsigned Opc0 = FirstLdSt->getOpcode();
   unsigned Opc1 = SecondLdSt->getOpcode();
 
-  // TODO: This needs finer tuning
-  if (NumLoads > 4)
-    return false;
+	const MachineOperand *FirstDst = nullptr;
+	const MachineOperand *SecondDst = nullptr;
 
-  if (isDS(Opc0) && isDS(Opc1))
-    return true;
+  if (isDS(Opc0) && isDS(Opc1)) {
+    FirstDst = getNamedOperand(*FirstLdSt, AMDGPU::OpName::vdst);
+    SecondDst = getNamedOperand(*SecondLdSt, AMDGPU::OpName::vdst);
+  }
 
-  if (isSMRD(Opc0) && isSMRD(Opc1))
-    return true;
+  if (isSMRD(Opc0) && isSMRD(Opc1)) {
+    FirstDst = getNamedOperand(*FirstLdSt, AMDGPU::OpName::dst);
+    SecondDst = getNamedOperand(*SecondLdSt, AMDGPU::OpName::dst);
+  }
 
-  if ((isMUBUF(Opc0) || isMTBUF(Opc0)) && (isMUBUF(Opc1) || isMTBUF(Opc1)))
-    return true;
+  if ((isMUBUF(Opc0) || isMTBUF(Opc0)) && (isMUBUF(Opc1) || isMTBUF(Opc1))) {
+    FirstDst = getNamedOperand(*FirstLdSt, AMDGPU::OpName::vdata);
+    SecondDst = getNamedOperand(*SecondLdSt, AMDGPU::OpName::vdata);
+  }
 
-  return false;
+  if (!FirstDst || !SecondDst)
+    return false;
+
+  // TODO: This needs finer tuning
+  unsigned LoadClusterThreshold = 4;
+
+  // Check if these two loads are writing to different sub-registers of the
+  // same super-register.
+  if (FirstDst->getReg() == SecondDst->getReg() &&
+      FirstDst->getSubReg() != SecondDst->getSubReg()) {
+    // These loads share a super-register.  We want to cluster them, because
+    // LLVM does not currently support sub-register liveness tracking, so
+    // if these two loads end up far apart, the live range of the super
+    // register could be very big.  This can be especially bad when we have a
+    // kernel that does several large vector loads.
+    const MachineRegisterInfo &MRI =
+        FirstLdSt->getParent()->getParent()->getRegInfo();
+    const TargetRegisterClass *DstRC = MRI.getRegClass(FirstDst->getReg());
+    const TargetRegisterClass *SubRC = FirstDst->getSubReg() ?
+        RI.getSubClassWithSubReg(DstRC, FirstDst->getSubReg()) : DstRC;
+
+    LoadClusterThreshold =
+        std::max(LoadClusterThreshold, DstRC->getSize() / SubRC->getSize());
+
+  }
+
+  return NumLoads <= LoadClusterThreshold;
 }
 
 void