R600/SI: Define a schedule model and enable the generic machine scheduler

The schedule model is not complete yet, and could be improved.
author: Tom Stellard <thomas.stellard@amd.com> 2013-07-19 11:50:00 -0700
committer: Tom Stellard <thomas.stellard@amd.com> 2015-01-08 20:07:31 -0800
commit: f11db5d5151e41e445cc8965f0d370414297ba23 (patch)
tree: 3bc3b43b7cff8dc845d4610f4f6ebc844d2745fc
parent: 260163c3963afa094645f3fe3d1ff731a81a380a (diff)
27 files changed, 302 insertions, 83 deletions
diff --git a/lib/Target/R600/AMDGPUSubtarget.cpp b/lib/Target/R600/AMDGPUSubtarget.cpp
index a80518807eb..3b73a310e01 100644
--- a/lib/Target/R600/AMDGPUSubtarget.cpp
+++ b/lib/Target/R600/AMDGPUSubtarget.cpp
@@ -19,8 +19,7 @@
 #include "SIInstrInfo.h"
 #include "SIISelLowering.h"
 #include "llvm/ADT/SmallString.h"
-
-#include "llvm/ADT/SmallString.h"
+#include "llvm/CodeGen/MachineScheduler.h"
 
 using namespace llvm;
 
@@ -115,3 +114,21 @@ unsigned AMDGPUSubtarget::getAmdKernelCodeChipID() const {
   case SEA_ISLANDS: return 12;
   }
 }
+
+void AMDGPUSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
+                                          MachineInstr *begin,
+                                          MachineInstr *end,
+                                          unsigned NumRegionInstrs) const {
+  if (getGeneration() >= SOUTHERN_ISLANDS) {
+
+    // Track register pressure so the scheduler can try to decrease
+    // pressure once register usage is above the threshold defined by
+    // SIRegisterInfo::getRegPressureSetLimit()
+    Policy.ShouldTrackPressure = true;
+
+    // Enabling both top down and bottom up scheduling seems to give us less
+    // register spills than just using one of these approaches on its own.
+    Policy.OnlyTopDown = false;
+    Policy.OnlyBottomUp = false;
+  }
+}
diff --git a/lib/Target/R600/AMDGPUSubtarget.h b/lib/Target/R600/AMDGPUSubtarget.h
index 0642b4ad5c8..ff1b8b1df68 100644
--- a/lib/Target/R600/AMDGPUSubtarget.h
+++ b/lib/Target/R600/AMDGPUSubtarget.h
@@ -203,9 +203,13 @@ public:
   unsigned getAmdKernelCodeChipID() const;
 
   bool enableMachineScheduler() const override {
-    return getGeneration() <= NORTHERN_ISLANDS;
+    return true;
   }
 
+  void overrideSchedPolicy(MachineSchedPolicy &Policy,
+                           MachineInstr *begin, MachineInstr *end,
+                           unsigned NumRegionInstrs) const override;
+
   // Helper functions to simplify if statements
   bool isTargetELF() const {
     return false;
@@ -224,6 +228,13 @@ public:
   bool isAmdHsaOS() const {
     return TargetTriple.getOS() == Triple::AMDHSA;
   }
+  unsigned getMaxWavesPerCU() const {
+    if (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS)
+      return 10;
+
+    // FIXME: Not sure what this is for other subtagets.
+    llvm_unreachable("do not know max waves per CU for this subtarget.");
+  }
 };
 
 } // End namespace llvm
diff --git a/lib/Target/R600/Processors.td b/lib/Target/R600/Processors.td
index fa78ffcbde0..cff97cdb3be 100644
--- a/lib/Target/R600/Processors.td
+++ b/lib/Target/R600/Processors.td
@@ -83,34 +83,38 @@ def : Proc<"cayman",     R600_VLIW4_Itin,
 // Southern Islands
 //===----------------------------------------------------------------------===//
 
-def : Proc<"SI",         SI_Itin, [FeatureSouthernIslands]>;
+def : ProcessorModel<"SI",       SIFullSpeedModel, [FeatureSouthernIslands]>;
 
-def : Proc<"tahiti",     SI_Itin, [FeatureSouthernIslands]>;
+def : ProcessorModel<"tahiti",   SIFullSpeedModel, [FeatureSouthernIslands]>;
 
-def : Proc<"pitcairn",   SI_Itin, [FeatureSouthernIslands]>;
+def : ProcessorModel<"pitcairn", SIQuarterSpeedModel, [FeatureSouthernIslands]>;
 
-def : Proc<"verde",      SI_Itin, [FeatureSouthernIslands]>;
+def : ProcessorModel<"verde",    SIQuarterSpeedModel, [FeatureSouthernIslands]>;
 
-def : Proc<"oland",      SI_Itin, [FeatureSouthernIslands]>;
+def : ProcessorModel<"oland",    SIQuarterSpeedModel, [FeatureSouthernIslands]>;
 
-def : Proc<"hainan",     SI_Itin, [FeatureSouthernIslands]>;
+def : ProcessorModel<"hainan",   SIQuarterSpeedModel, [FeatureSouthernIslands]>;
 
 //===----------------------------------------------------------------------===//
 // Sea Islands
 //===----------------------------------------------------------------------===//
 
-def : Proc<"bonaire",    SI_Itin, [FeatureSeaIslands]>;
+def : ProcessorModel<"bonaire",    SIQuarterSpeedModel, [FeatureSeaIslands]>;
 
-def : Proc<"kabini",     SI_Itin, [FeatureSeaIslands]>;
+def : ProcessorModel<"kabini",     SIQuarterSpeedModel, [FeatureSeaIslands]>;
 
-def : Proc<"kaveri",     SI_Itin, [FeatureSeaIslands]>;
+def : ProcessorModel<"kaveri",     SIQuarterSpeedModel, [FeatureSeaIslands]>;
 
-def : Proc<"hawaii",     SI_Itin, [FeatureSeaIslands]>;
+def : ProcessorModel<"hawaii",     SIFullSpeedModel, [FeatureSeaIslands]>;
 
-def : Proc<"mullins",    SI_Itin, [FeatureSeaIslands]>;
+def : ProcessorModel<"mullins",    SIQuarterSpeedModel, [FeatureSeaIslands]>;
 
-def : Proc<"tonga",      SI_Itin, [FeatureVolcanicIslands]>;
+//===----------------------------------------------------------------------===//
+// Volcanic Islands
+//===----------------------------------------------------------------------===//
+
+def : ProcessorModel<"tonga",   SIFullSpeedModel, [FeatureVolcanicIslands]>;
 
-def : Proc<"iceland",      SI_Itin, [FeatureVolcanicIslands]>;
+def : ProcessorModel<"iceland", SIQuarterSpeedModel, [FeatureVolcanicIslands]>;
 
-def : Proc<"carrizo",      SI_Itin, [FeatureVolcanicIslands]>;
+def : ProcessorModel<"carrizo", SIQuarterSpeedModel, [FeatureVolcanicIslands]>;
diff --git a/lib/Target/R600/SIInstrFormats.td b/lib/Target/R600/SIInstrFormats.td
index ff8db67a619..99a1df36c1f 100644
--- a/lib/Target/R600/SIInstrFormats.td
+++ b/lib/Target/R600/SIInstrFormats.td
@@ -68,6 +68,7 @@ class InstSI <dag outs, dag ins, string asm, list<dag> pattern> :
   // Most instructions require adjustments after selection to satisfy
   // operand requirements.
   let hasPostISelHook = 1;
+  let SchedRW = [Write32Bit];
 }
 
 class Enc32 {
@@ -214,9 +215,9 @@ class SMRDe <bits<5> op, bits<1> imm> : Enc32 {
   let Inst{31-27} = 0x18; //encoding
 }
 
+let SchedRW = [WriteSALU] in {
 class SOP1 <dag outs, dag ins, string asm, list<dag> pattern> :
     InstSI<outs, ins, asm, pattern> {
-
   let mayLoad = 0;
   let mayStore = 0;
   let hasSideEffects = 0;
@@ -274,6 +275,8 @@ class SOPP <bits<7> op, dag ins, string asm, list<dag> pattern = []> :
   let UseNamedOperandTable = 1;
 }
 
+} // let SchedRW = [WriteSALU]
+
 class SMRD <dag outs, dag ins, string asm, list<dag> pattern> :
     InstSI<outs, ins, asm, pattern> {
 
@@ -283,6 +286,7 @@ class SMRD <dag outs, dag ins, string asm, list<dag> pattern> :
   let mayLoad = 1;
   let hasSideEffects = 0;
   let UseNamedOperandTable = 1;
+  let SchedRW = [WriteSMEM];
 }
 
 //===----------------------------------------------------------------------===//
@@ -588,6 +592,7 @@ class DS <dag outs, dag ins, string asm, list<dag> pattern> :
   let DS = 1;
   let UseNamedOperandTable = 1;
   let DisableEncoding = "$m0";
+  let SchedRW = [WriteLDS];
 }
 
 class DS_si <bits<8> op, dag outs, dag ins, string asm, list<dag> pattern> :
@@ -602,6 +607,7 @@ class MUBUF <dag outs, dag ins, string asm, list<dag> pattern> :
 
   let hasSideEffects = 0;
   let UseNamedOperandTable = 1;
+  let SchedRW = [WriteVMEM];
 }
 
 class MTBUF <dag outs, dag ins, string asm, list<dag> pattern> :
@@ -613,6 +619,7 @@ class MTBUF <dag outs, dag ins, string asm, list<dag> pattern> :
 
   let hasSideEffects = 0;
   let UseNamedOperandTable = 1;
+  let SchedRW = [WriteVMEM];
 }
 
 class FLAT <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> :
@@ -641,5 +648,4 @@ class MIMG <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> :
 }
 
 
-
 } // End Uses = [EXEC]
diff --git a/lib/Target/R600/SIInstructions.td b/lib/Target/R600/SIInstructions.td
index 4941d6491b1..0fe2b79a162 100644
--- a/lib/Target/R600/SIInstructions.td
+++ b/lib/Target/R600/SIInstructions.td
@@ -1191,6 +1191,8 @@ defm V_MOV_B32 : VOP1Inst <vop1<0x1>, "v_mov_b32", VOP_I32_I32>;
 
 let Uses = [EXEC] in {
 
+// FIXME: Specify SchedRW for READFIRSTLANE_B32
+
 def V_READFIRSTLANE_B32 : VOP1 <
   0x00000002,
   (outs SReg_32:$vdst),
@@ -1201,6 +1203,8 @@ def V_READFIRSTLANE_B32 : VOP1 <
 
 }
 
+let SchedRW = [WriteQuarterRate32] in {
+
 defm V_CVT_I32_F64 : VOP1Inst <vop1<0x3>, "v_cvt_i32_f64",
   VOP_I32_F64, fp_to_sint
 >;
@@ -1253,6 +1257,9 @@ defm V_CVT_U32_F64 : VOP1Inst <vop1<0x15>, "v_cvt_u32_f64",
 defm V_CVT_F64_U32 : VOP1Inst <vop1<0x16>, "v_cvt_f64_u32",
   VOP_F64_I32, uint_to_fp
 >;
+
+} // let SchedRW = [WriteQuarterRate32]
+
 defm V_FRACT_F32 : VOP1Inst <vop1<0x20, 0x1b>, "v_fract_f32",
   VOP_F32_F32, AMDGPUfract
 >;
@@ -1271,6 +1278,9 @@ defm V_FLOOR_F32 : VOP1Inst <vop1<0x24, 0x1f>, "v_floor_f32",
 defm V_EXP_F32 : VOP1Inst <vop1<0x25, 0x20>, "v_exp_f32",
   VOP_F32_F32, fexp2
 >;
+
+let SchedRW = [WriteQuarterRate32] in {
+
 defm V_LOG_F32 : VOP1Inst <vop1<0x27, 0x21>, "v_log_f32",
   VOP_F32_F32, flog2
 >;
@@ -1283,18 +1293,32 @@ defm V_RCP_IFLAG_F32 : VOP1Inst <vop1<0x2b, 0x23>, "v_rcp_iflag_f32",
 defm V_RSQ_F32 : VOP1Inst <vop1<0x2e, 0x24>, "v_rsq_f32",
   VOP_F32_F32, AMDGPUrsq
 >;
+
+} //let SchedRW = [WriteQuarterRate32]
+
+let SchedRW = [WriteDouble] in {
+
 defm V_RCP_F64 : VOP1Inst <vop1<0x2f, 0x25>, "v_rcp_f64",
   VOP_F64_F64, AMDGPUrcp
 >;
 defm V_RSQ_F64 : VOP1Inst <vop1<0x31, 0x26>, "v_rsq_f64",
   VOP_F64_F64, AMDGPUrsq
 >;
+
+} // let SchedRW = [WriteDouble];
+
 defm V_SQRT_F32 : VOP1Inst <vop1<0x33, 0x27>, "v_sqrt_f32",
   VOP_F32_F32, fsqrt
 >;
+
+let SchedRW = [WriteDouble] in {
+
 defm V_SQRT_F64 : VOP1Inst <vop1<0x34, 0x28>, "v_sqrt_f64",
   VOP_F64_F64, fsqrt
 >;
+
+} // let SchedRW = [WriteDouble]
+
 defm V_SIN_F32 : VOP1Inst <vop1<0x35, 0x29>, "v_sin_f32",
   VOP_F32_F32, AMDGPUsin
 >;
@@ -1323,6 +1347,8 @@ defm V_MOVRELSD_B32 : VOP1Inst <vop1<0x44, 0x38>, "v_movrelsd_b32", VOP_I32_I32>
 // These instruction only exist on SI and CI
 let SubtargetPredicate = isSICI in {
 
+let SchedRW = [WriteQuarterRate32] in {
+
 defm V_LOG_CLAMP_F32 : VOP1InstSI <vop1<0x26>, "v_log_clamp_f32", VOP_F32_F32>;
 defm V_RCP_CLAMP_F32 : VOP1InstSI <vop1<0x28>, "v_rcp_clamp_f32", VOP_F32_F32>;
 defm V_RCP_LEGACY_F32 : VOP1InstSI <vop1<0x29>, "v_rcp_legacy_f32", VOP_F32_F32>;
@@ -1332,17 +1358,25 @@ defm V_RSQ_CLAMP_F32 : VOP1InstSI <vop1<0x2c>, "v_rsq_clamp_f32",
 defm V_RSQ_LEGACY_F32 : VOP1InstSI <vop1<0x2d>, "v_rsq_legacy_f32",
   VOP_F32_F32, AMDGPUrsq_legacy
 >;
+
+} // End let SchedRW = [WriteQuarterRate32]
+
+let SchedRW = [WriteDouble] in {
+
 defm V_RCP_CLAMP_F64 : VOP1InstSI <vop1<0x30>, "v_rcp_clamp_f64", VOP_F64_F64>;
 defm V_RSQ_CLAMP_F64 : VOP1InstSI <vop1<0x32>, "v_rsq_clamp_f64",
   VOP_F64_F64, AMDGPUrsq_clamped
 >;
 
+} // End SchedRW = [WriteDouble]
+
 } // End SubtargetPredicate = isSICI
 
 //===----------------------------------------------------------------------===//
 // VINTRP Instructions
 //===----------------------------------------------------------------------===//
 
+// FIXME: Specify SchedRW for VINTRP insturctions.
 defm V_INTERP_P1_F32 : VINTRP_m <
   0x00000000, "v_interp_p1_f32",
   (outs VReg_32:$dst),
@@ -1656,11 +1690,15 @@ defm V_SAD_U32 : VOP3Inst <vop3<0x15d, 0x1dc>, "v_sad_u32",
 defm V_DIV_FIXUP_F32 : VOP3Inst <
   vop3<0x15f, 0x1de>, "v_div_fixup_f32", VOP_F32_F32_F32_F32, AMDGPUdiv_fixup
 >;
+
+let SchedRW = [WriteDouble] in {
+
 defm V_DIV_FIXUP_F64 : VOP3Inst <
   vop3<0x160, 0x1df>, "v_div_fixup_f64", VOP_F64_F64_F64_F64, AMDGPUdiv_fixup
 >;
 
-// Only on SI
+} // let SchedRW = [WriteDouble]
+
 defm V_LSHL_B64 : VOP3Inst <vop3<0x161>, "v_lshl_b64",
   VOP_I64_I64_I32, shl
 >;
@@ -1675,6 +1713,7 @@ defm V_ASHR_I64 : VOP3Inst <vop3<0x163>, "v_ashr_i64",
   VOP_I64_I64_I32, sra
 >;
 
+let SchedRW = [WriteDouble] in {
 let isCommutable = 1 in {
 
 defm V_ADD_F64 : VOP3Inst <vop3<0x164, 0x280>, "v_add_f64",
@@ -1697,7 +1736,9 @@ defm V_LDEXP_F64 : VOP3Inst <vop3<0x168, 0x284>, "v_ldexp_f64",
   VOP_F64_F64_I32, AMDGPUldexp
 >;
 
-let isCommutable = 1 in {
+} // let SchedRW = [WriteDouble]
+
+let isCommutable = 1, SchedRW = [WriteQuarterRate32] in {
 
 defm V_MUL_LO_U32 : VOP3Inst <vop3<0x169, 0x285>, "v_mul_lo_u32",
   VOP_I32_I32_I32
@@ -1713,30 +1754,37 @@ defm V_MUL_HI_I32 : VOP3Inst <vop3<0x16c, 0x287>, "v_mul_hi_i32",
   VOP_I32_I32_I32
 >;
 
-} // isCommutable = 1
+} // isCommutable = 1, SchedRW = [WriteQuarterRate32]
 
 defm V_DIV_SCALE_F32 : VOP3b_32 <vop3<0x16d, 0x1e0>, "v_div_scale_f32", []>;
 
+let SchedRW = [WriteDouble] in {
 // Double precision division pre-scale.
 defm V_DIV_SCALE_F64 : VOP3b_64 <vop3<0x16e, 0x1e1>, "v_div_scale_f64", []>;
+} // let SchedRW = [WriteDouble]
 
 let isCommutable = 1 in {
 defm V_DIV_FMAS_F32 : VOP3Inst <vop3<0x16f, 0x1e2>, "v_div_fmas_f32",
   VOP_F32_F32_F32_F32, AMDGPUdiv_fmas
 >;
+let SchedRW = [WriteDouble] in {
 defm V_DIV_FMAS_F64 : VOP3Inst <vop3<0x170, 0x1e3>, "v_div_fmas_f64",
   VOP_F64_F64_F64_F64, AMDGPUdiv_fmas
 >;
+} // End SchedRW = [WriteDouble]
 } // End isCommutable = 1
 
 //def V_MSAD_U8 : VOP3_U8 <0x00000171, "v_msad_u8", []>;
 //def V_QSAD_U8 : VOP3_U8 <0x00000172, "v_qsad_u8", []>;
 //def V_MQSAD_U8 : VOP3_U8 <0x00000173, "v_mqsad_u8", []>;
 
+let SchedRW = [WriteDouble] in {
 defm V_TRIG_PREOP_F64 : VOP3Inst <
   vop3<0x174, 0x292>, "v_trig_preop_f64", VOP_F64_F64_I32, AMDGPUtrig_preop
 >;
 
+} // let SchedRW = [WriteDouble]
+
 //===----------------------------------------------------------------------===//
 // Pseudo Instructions
 //===----------------------------------------------------------------------===//
diff --git a/lib/Target/R600/SIRegisterInfo.cpp b/lib/Target/R600/SIRegisterInfo.cpp
index 9dddd1ccd69..672f316f1f1 100644
--- a/lib/Target/R600/SIRegisterInfo.cpp
+++ b/lib/Target/R600/SIRegisterInfo.cpp
@@ -51,9 +51,32 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
   return Reserved;
 }
 
-unsigned SIRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
-                                             MachineFunction &MF) const {
-  return RC->getNumRegs();
+unsigned SIRegisterInfo::getRegPressureSetLimit(unsigned Idx) const {
+
+  // FIXME: We should adjust the max number of waves based on LDS size.
+  unsigned SGPRLimit = getNumSGPRsAllowed(ST.getMaxWavesPerCU());
+  unsigned VGPRLimit = getNumVGPRsAllowed(ST.getMaxWavesPerCU());
+
+  for (regclass_iterator I = regclass_begin(), E = regclass_end();
+       I != E; ++I) {
+
+    unsigned NumSubRegs = std::max((int)(*I)->getSize() / 4, 1);
+    unsigned Limit;
+
+    if (isSGPRClass(*I)) {
+      Limit = SGPRLimit / NumSubRegs;
+    } else {
+      Limit = VGPRLimit / NumSubRegs;
+    }
+
+    const int *Sets = getRegClassPressureSets(*I);
+    assert(Sets);
+    for (unsigned i = 0; Sets[i] != -1; ++i) {
+	    if (Sets[i] == (int)Idx)
+        return Limit;
+    }
+  }
+  return 256;
 }
 
 bool SIRegisterInfo::requiresRegisterScavenging(const MachineFunction &Fn) const {
@@ -481,3 +504,29 @@ unsigned SIRegisterInfo::findUnusedRegister(const MachineRegisterInfo &MRI,
   return AMDGPU::NoRegister;
 }
 
+unsigned SIRegisterInfo::getNumVGPRsAllowed(unsigned WaveCount) const {
+  switch(WaveCount) {
+    case 10: return 24;
+    case 9:  return 28;
+    case 8:  return 32;
+    case 7:  return 36;
+    case 6:  return 40;
+    case 5:  return 48;
+    case 4:  return 64;
+    case 3:  return 84;
+    case 2:  return 128;
+    default: return 256;
+  }
+}
+
+unsigned SIRegisterInfo::getNumSGPRsAllowed(unsigned WaveCount) const {
+  switch(WaveCount) {
+    case 10: return 48;
+    case 9:  return 56;
+    case 8:  return 64;
+    case 7:  return 72;
+    case 6:  return 80;
+    case 5:  return 96;
+    default: return 103;
+  }
+}
diff --git a/lib/Target/R600/SIRegisterInfo.h b/lib/Target/R600/SIRegisterInfo.h
index 7040fd0c3e9..ffd0d84c802 100644
--- a/lib/Target/R600/SIRegisterInfo.h
+++ b/lib/Target/R600/SIRegisterInfo.h
@@ -17,6 +17,7 @@
 #define LLVM_LIB_TARGET_R600_SIREGISTERINFO_H
 
 #include "AMDGPURegisterInfo.h"
+#include "llvm/Support/Debug.h"
 
 namespace llvm {
 
@@ -26,8 +27,7 @@ struct SIRegisterInfo : public AMDGPURegisterInfo {
 
   BitVector getReservedRegs(const MachineFunction &MF) const override;
 
-  unsigned getRegPressureLimit(const TargetRegisterClass *RC,
-                               MachineFunction &MF) const override;
+  unsigned getRegPressureSetLimit(unsigned Idx) const override;
 
   bool requiresRegisterScavenging(const MachineFunction &Fn) const override;
 
@@ -113,6 +113,14 @@ struct SIRegisterInfo : public AMDGPURegisterInfo {
   unsigned getPreloadedValue(const MachineFunction &MF,
                              enum PreloadedValue Value) const;
 
+  /// \brief Give the maximum number of VGPRs that can be used by \p WaveCount
+  ///        concurrent waves.
+  unsigned getNumVGPRsAllowed(unsigned WaveCount) const;
+
+  /// \brief Give the maximum number of SGPRs that can be used by \p WaveCount
+  ///        concurrent waves.
+  unsigned getNumSGPRsAllowed(unsigned WaveCount) const;
+
   unsigned findUnusedRegister(const MachineRegisterInfo &MRI,
                               const TargetRegisterClass *RC) const;
 
diff --git a/lib/Target/R600/SISchedule.td b/lib/Target/R600/SISchedule.td
index 28b65b82585..9b1f676020b 100644
--- a/lib/Target/R600/SISchedule.td
+++ b/lib/Target/R600/SISchedule.td
@@ -7,9 +7,85 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// TODO: This is just a place holder for now.
+// MachineModel definitions for Southern Islands (SI)
 //
 //===----------------------------------------------------------------------===//
 
+def WriteBranch : SchedWrite;
+def WriteExport : SchedWrite;
+def WriteLDS    : SchedWrite;
+def WriteSALU   : SchedWrite;
+def WriteSMEM   : SchedWrite;
+def WriteVMEM   : SchedWrite;
 
-def SI_Itin : ProcessorItineraries <[], [], []>;
+// Vector ALU instructions
+def Write32Bit         : SchedWrite;
+def WriteQuarterRate32 : SchedWrite;
+
+def WriteFloatFMA   : SchedWrite;
+
+def WriteDouble     : SchedWrite;
+def WriteDoubleAdd  : SchedWrite;
+
+def SIFullSpeedModel : SchedMachineModel;
+def SIQuarterSpeedModel : SchedMachineModel;
+
+// BufferSize = 0 means the processors are in-order.
+let BufferSize = 0 in {
+
+// XXX: Are the resource counts correct?
+def HWBranch : ProcResource<1>;
+def HWExport : ProcResource<7>;   // Taken from S_WAITCNT
+def HWLGKM   : ProcResource<31>;  // Taken from S_WAITCNT
+def HWSALU   : ProcResource<1>;
+def HWVMEM   : ProcResource<15>;  // Taken from S_WAITCNT
+def HWVALU   : ProcResource<1>;
+
+}
+
+class HWWriteRes<SchedWrite write, list<ProcResourceKind> resources,
+                 int latency> : WriteRes<write, resources> {
+  let Latency = latency;
+}
+
+class HWVALUWriteRes<SchedWrite write, int latency> :
+  HWWriteRes<write, [HWVALU], latency>;
+
+
+// The latency numbers are taken from AMD Accelerated Parallel Processing
+// guide.  They may not be acurate.
+
+// The latency values are 1 / (operations / cycle) / 4.
+multiclass SICommonWriteRes {
+
+  def : HWWriteRes<WriteBranch,  [HWBranch], 100>; // XXX: Guessed ???
+  def : HWWriteRes<WriteExport,  [HWExport], 100>; // XXX: Guessed ???
+  def : HWWriteRes<WriteLDS,     [HWLGKM],    32>; // 2 - 64
+  def : HWWriteRes<WriteSALU,    [HWSALU],     1>;
+  def : HWWriteRes<WriteSMEM,    [HWLGKM],    10>; // XXX: Guessed ???
+  def : HWWriteRes<WriteVMEM,    [HWVMEM],   450>; // 300 - 600
+
+  def : HWVALUWriteRes<Write32Bit,         1>;
+  def : HWVALUWriteRes<WriteQuarterRate32, 4>;
+}
+
+
+let SchedModel = SIFullSpeedModel in {
+
+defm : SICommonWriteRes;
+
+def : HWVALUWriteRes<WriteFloatFMA,   1>;
+def : HWVALUWriteRes<WriteDouble,     4>;
+def : HWVALUWriteRes<WriteDoubleAdd,  2>;
+
+} // End SchedModel = SIFullSpeedModel
+
+let SchedModel = SIQuarterSpeedModel in {
+
+defm : SICommonWriteRes;
+
+def : HWVALUWriteRes<WriteFloatFMA, 16>;
+def : HWVALUWriteRes<WriteDouble,   16>;
+def : HWVALUWriteRes<WriteDoubleAdd, 8>;
+
+}  // End SchedModel = SIQuarterSpeedModel
diff --git a/test/CodeGen/R600/atomic_cmp_swap_local.ll b/test/CodeGen/R600/atomic_cmp_swap_local.ll
index 8a5a3b64cf6..1e28fefbef5 100644
--- a/test/CodeGen/R600/atomic_cmp_swap_local.ll
+++ b/test/CodeGen/R600/atomic_cmp_swap_local.ll
@@ -2,9 +2,9 @@
 ; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=CI -check-prefix=FUNC %s
 
 ; FUNC-LABEL: {{^}}lds_atomic_cmpxchg_ret_i32_offset:
+; SI: v_mov_b32_e32 [[VCMP:v[0-9]+]], 7
 ; SI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
 ; SI: s_load_dword [[SWAP:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc
-; SI-DAG: v_mov_b32_e32 [[VCMP:v[0-9]+]], 7
 ; SI-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
 ; SI-DAG: v_mov_b32_e32 [[VSWAP:v[0-9]+]], [[SWAP]]
 ; SI: ds_cmpst_rtn_b32 [[RESULT:v[0-9]+]], [[VPTR]], [[VCMP]], [[VSWAP]] offset:16 [M0]
@@ -18,11 +18,11 @@ define void @lds_atomic_cmpxchg_ret_i32_offset(i32 addrspace(1)* %out, i32 addrs
 }
 
 ; FUNC-LABEL: {{^}}lds_atomic_cmpxchg_ret_i64_offset:
-; SI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
-; SI: s_load_dwordx2 s{{\[}}[[LOSWAP:[0-9]+]]:[[HISWAP:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xd
 ; SI: s_mov_b64  s{{\[}}[[LOSCMP:[0-9]+]]:[[HISCMP:[0-9]+]]{{\]}}, 7
 ; SI-DAG: v_mov_b32_e32 v[[LOVCMP:[0-9]+]], s[[LOSCMP]]
 ; SI-DAG: v_mov_b32_e32 v[[HIVCMP:[0-9]+]], s[[HISCMP]]
+; SI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
+; SI: s_load_dwordx2 s{{\[}}[[LOSWAP:[0-9]+]]:[[HISWAP:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xd
 ; SI-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
 ; SI-DAG: v_mov_b32_e32 v[[LOSWAPV:[0-9]+]], s[[LOSWAP]]
 ; SI-DAG: v_mov_b32_e32 v[[HISWAPV:[0-9]+]], s[[HISWAP]]
diff --git a/test/CodeGen/R600/ctpop.ll b/test/CodeGen/R600/ctpop.ll
index a47bc876cb9..9cf2fe94fbb 100644
--- a/test/CodeGen/R600/ctpop.ll
+++ b/test/CodeGen/R600/ctpop.ll
@@ -38,11 +38,11 @@ define void @v_ctpop_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noali
 }
 
 ; FUNC-LABEL: {{^}}v_ctpop_add_chain_i32:
-; SI: buffer_load_dword [[VAL0:v[0-9]+]],
 ; SI: buffer_load_dword [[VAL1:v[0-9]+]],
+; SI: buffer_load_dword [[VAL0:v[0-9]+]],
 ; SI: v_mov_b32_e32 [[VZERO:v[0-9]+]], 0
 ; SI: v_bcnt_u32_b32_e32 [[MIDRESULT:v[0-9]+]], [[VAL1]], [[VZERO]]
-; SI-NEXT: v_bcnt_u32_b32_e32 [[RESULT:v[0-9]+]], [[VAL0]], [[MIDRESULT]]
+; SI: v_bcnt_u32_b32_e32 [[RESULT:v[0-9]+]], [[VAL0]], [[MIDRESULT]]
 ; SI: buffer_store_dword [[RESULT]],
 ; SI: s_endpgm
 
diff --git a/test/CodeGen/R600/ds_read2st64.ll b/test/CodeGen/R600/ds_read2st64.ll
index 24834af2040..efd875e9317 100644
--- a/test/CodeGen/R600/ds_read2st64.ll
+++ b/test/CodeGen/R600/ds_read2st64.ll
@@ -65,8 +65,8 @@ define void @simple_read2st64_f32_max_offset(float addrspace(1)* %out, float add
 
 ; SI-LABEL: @simple_read2st64_f32_over_max_offset
 ; SI-NOT: ds_read2st64_b32
-; SI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:256
 ; SI: v_add_i32_e32 [[BIGADD:v[0-9]+]], 0x10000, {{v[0-9]+}}
+; SI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:256
 ; SI: ds_read_b32 {{v[0-9]+}}, [[BIGADD]]
 ; SI: s_endpgm
 define void @simple_read2st64_f32_over_max_offset(float addrspace(1)* %out, float addrspace(3)* %lds) #0 {
@@ -197,8 +197,8 @@ define void @simple_read2st64_f64_max_offset(double addrspace(1)* %out, double a
 
 ; SI-LABEL: @simple_read2st64_f64_over_max_offset
 ; SI-NOT: ds_read2st64_b64
-; SI: ds_read_b64 {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset:512
 ; SI: v_add_i32_e32 [[BIGADD:v[0-9]+]], 0x10000, {{v[0-9]+}}
+; SI: ds_read_b64 {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset:512
 ; SI: ds_read_b64 {{v\[[0-9]+:[0-9]+\]}}, [[BIGADD]]
 ; SI: s_endpgm
 define void @simple_read2st64_f64_over_max_offset(double addrspace(1)* %out, double addrspace(3)* %lds) #0 {
diff --git a/test/CodeGen/R600/fceil64.ll b/test/CodeGen/R600/fceil64.ll
index 77cd8eae402..107abe2fbbe 100644
--- a/test/CodeGen/R600/fceil64.ll
+++ b/test/CodeGen/R600/fceil64.ll
@@ -11,12 +11,12 @@ declare <16 x double> @llvm.ceil.v16f64(<16 x double>) nounwind readnone
 ; FUNC-LABEL: {{^}}fceil_f64:
 ; CI: v_ceil_f64_e32
 ; SI: s_bfe_u32 [[SEXP:s[0-9]+]], {{s[0-9]+}}, 0xb0014
+; SI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80000000
 ; SI: s_add_i32 s{{[0-9]+}}, [[SEXP]], 0xfffffc01
 ; SI: s_lshr_b64
 ; SI: s_not_b64
 ; SI: s_and_b64
-; SI-DAG: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80000000
-; SI-DAG: cmp_lt_i32
+; SI: cmp_lt_i32
 ; SI: cndmask_b32
 ; SI: cndmask_b32
 ; SI: cmp_gt_i32
diff --git a/test/CodeGen/R600/ffloor.ll b/test/CodeGen/R600/ffloor.ll
index 194d0aaa181..74491b85b1e 100644
--- a/test/CodeGen/R600/ffloor.ll
+++ b/test/CodeGen/R600/ffloor.ll
@@ -12,12 +12,12 @@ declare <16 x double> @llvm.floor.v16f64(<16 x double>) nounwind readnone
 ; CI: v_floor_f64_e32
 
 ; SI: s_bfe_u32 [[SEXP:s[0-9]+]], {{s[0-9]+}}, 0xb0014
+; SI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80000000
 ; SI: s_add_i32 s{{[0-9]+}}, [[SEXP]], 0xfffffc01
 ; SI: s_lshr_b64
 ; SI: s_not_b64
 ; SI: s_and_b64
-; SI-DAG: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80000000
-; SI-DAG: cmp_lt_i32
+; SI: cmp_lt_i32
 ; SI: cndmask_b32
 ; SI: cndmask_b32
 ; SI: cmp_gt_i32
diff --git a/test/CodeGen/R600/fmax3.ll b/test/CodeGen/R600/fmax3.ll
index 6f95bf20f73..78833d7606d 100644
--- a/test/CodeGen/R600/fmax3.ll
+++ b/test/CodeGen/R600/fmax3.ll
@@ -3,9 +3,9 @@
 declare float @llvm.maxnum.f32(float, float) nounwind readnone
 
 ; SI-LABEL: {{^}}test_fmax3_olt_0:
-; SI: buffer_load_dword [[REGA:v[0-9]+]]
-; SI: buffer_load_dword [[REGB:v[0-9]+]]
 ; SI: buffer_load_dword [[REGC:v[0-9]+]]
+; SI: buffer_load_dword [[REGB:v[0-9]+]]
+; SI: buffer_load_dword [[REGA:v[0-9]+]]
 ; SI: v_max3_f32 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]]
 ; SI: buffer_store_dword [[RESULT]],
 ; SI: s_endpgm
@@ -21,8 +21,8 @@ define void @test_fmax3_olt_0(float addrspace(1)* %out, float addrspace(1)* %apt
 
 ; Commute operand of second fmax
 ; SI-LABEL: {{^}}test_fmax3_olt_1:
-; SI: buffer_load_dword [[REGA:v[0-9]+]]
 ; SI: buffer_load_dword [[REGB:v[0-9]+]]
+; SI: buffer_load_dword [[REGA:v[0-9]+]]
 ; SI: buffer_load_dword [[REGC:v[0-9]+]]
 ; SI: v_max3_f32 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]]
 ; SI: buffer_store_dword [[RESULT]],
diff --git a/test/CodeGen/R600/fmin3.ll b/test/CodeGen/R600/fmin3.ll
index aeeed1c7dd3..94631a149c0 100644
--- a/test/CodeGen/R600/fmin3.ll
+++ b/test/CodeGen/R600/fmin3.ll
@@ -3,9 +3,9 @@
 declare float @llvm.minnum.f32(float, float) nounwind readnone
 
 ; SI-LABEL: {{^}}test_fmin3_olt_0:
-; SI: buffer_load_dword [[REGA:v[0-9]+]]
-; SI: buffer_load_dword [[REGB:v[0-9]+]]
 ; SI: buffer_load_dword [[REGC:v[0-9]+]]
+; SI: buffer_load_dword [[REGB:v[0-9]+]]
+; SI: buffer_load_dword [[REGA:v[0-9]+]]
 ; SI: v_min3_f32 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]]
 ; SI: buffer_store_dword [[RESULT]],
 ; SI: s_endpgm
@@ -21,8 +21,8 @@ define void @test_fmin3_olt_0(float addrspace(1)* %out, float addrspace(1)* %apt
 
 ; Commute operand of second fmin
 ; SI-LABEL: {{^}}test_fmin3_olt_1:
-; SI: buffer_load_dword [[REGA:v[0-9]+]]
 ; SI: buffer_load_dword [[REGB:v[0-9]+]]
+; SI: buffer_load_dword [[REGA:v[0-9]+]]
 ; SI: buffer_load_dword [[REGC:v[0-9]+]]
 ; SI: v_min3_f32 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]]
 ; SI: buffer_store_dword [[RESULT]],
diff --git a/test/CodeGen/R600/fneg-fabs.f64.ll b/test/CodeGen/R600/fneg-fabs.f64.ll
index 04a87e37785..c6e8608c79d 100644
--- a/test/CodeGen/R600/fneg-fabs.f64.ll
+++ b/test/CodeGen/R600/fneg-fabs.f64.ll
@@ -56,8 +56,8 @@ define void @fneg_fabs_fn_free_f64(double addrspace(1)* %out, i64 %in) {
 }
 
 ; FUNC-LABEL: {{^}}fneg_fabs_f64:
-; SI: s_load_dwordx2
 ; SI: s_load_dwordx2 s{{\[}}[[LO_X:[0-9]+]]:[[HI_X:[0-9]+]]{{\]}}
+; SI: s_load_dwordx2
 ; SI: v_mov_b32_e32 [[IMMREG:v[0-9]+]], 0x80000000
 ; SI-DAG: v_or_b32_e32 v[[HI_V:[0-9]+]], s[[HI_X]], [[IMMREG]]
 ; SI-DAG: v_mov_b32_e32 v[[LO_V:[0-9]+]], s[[LO_X]]
diff --git a/test/CodeGen/R600/ftrunc.f64.ll b/test/CodeGen/R600/ftrunc.f64.ll
index 2c7217ef056..faf9b401b32 100644
--- a/test/CodeGen/R600/ftrunc.f64.ll
+++ b/test/CodeGen/R600/ftrunc.f64.ll
@@ -23,12 +23,12 @@ define void @v_ftrunc_f64(double addrspace(1)* %out, double addrspace(1)* %in) {
 ; CI: v_trunc_f64_e32
 
 ; SI: s_bfe_u32 [[SEXP:s[0-9]+]], {{s[0-9]+}}, 0xb0014
+; SI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80000000
 ; SI: s_add_i32 s{{[0-9]+}}, [[SEXP]], 0xfffffc01
 ; SI: s_lshr_b64
+; SI: cmp_lt_i32
 ; SI: s_not_b64
 ; SI: s_and_b64
-; SI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80000000
-; SI: cmp_lt_i32
 ; SI: cndmask_b32
 ; SI: cndmask_b32
 ; SI: cmp_gt_i32
diff --git a/test/CodeGen/R600/llvm.memcpy.ll b/test/CodeGen/R600/llvm.memcpy.ll
index 9771062ed0d..23d392c61d5 100644
--- a/test/CodeGen/R600/llvm.memcpy.ll
+++ b/test/CodeGen/R600/llvm.memcpy.ll
@@ -6,39 +6,23 @@ declare void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* nocapture, i8 addrspace
 
 ; FUNC-LABEL: {{^}}test_small_memcpy_i64_lds_to_lds_align1:
 ; SI: ds_read_u8
-; SI: ds_write_b8
 ; SI: ds_read_u8
-; SI: ds_write_b8
 ; SI: ds_read_u8
-; SI: ds_write_b8
 ; SI: ds_read_u8
-; SI: ds_write_b8
 ; SI: ds_read_u8
-; SI: ds_write_b8
-
 ; SI: ds_read_u8
-; SI: ds_write_b8
 ; SI: ds_read_u8
-; SI: ds_write_b8
 ; SI: ds_read_u8
-; SI: ds_write_b8
+
 ; SI: ds_read_u8
-; SI: ds_write_b8
 ; SI: ds_read_u8
-; SI: ds_write_b8
-
 ; SI: ds_read_u8
-; SI: ds_write_b8
 ; SI: ds_read_u8
-; SI: ds_write_b8
 ; SI: ds_read_u8
-; SI: ds_write_b8
 ; SI: ds_read_u8
-; SI: ds_write_b8
 ; SI: ds_read_u8
 ; SI: ds_read_u8
 
-
 ; SI: ds_read_u8
 ; SI: ds_read_u8
 ; SI: ds_read_u8
@@ -65,6 +49,14 @@ declare void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* nocapture, i8 addrspace
 ; SI: ds_write_b8
 ; SI: ds_write_b8
 ; SI: ds_write_b8
+
+; SI: ds_write_b8
+; SI: ds_write_b8
+; SI: ds_write_b8
+; SI: ds_write_b8
+; SI: ds_write_b8
+; SI: ds_write_b8
+; SI: ds_write_b8
 ; SI: ds_write_b8
 
 ; SI: ds_write_b8
@@ -75,6 +67,14 @@ declare void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* nocapture, i8 addrspace
 ; SI: ds_write_b8
 ; SI: ds_write_b8
 ; SI: ds_write_b8
+
+; SI: ds_write_b8
+; SI: ds_write_b8
+; SI: ds_write_b8
+; SI: ds_write_b8
+; SI: ds_write_b8
+; SI: ds_write_b8
+; SI: ds_write_b8
 ; SI: ds_write_b8
 
 ; SI: s_endpgm
diff --git a/test/CodeGen/R600/local-atomics.ll b/test/CodeGen/R600/local-atomics.ll
index 16d3173f369..78d747d7e72 100644
--- a/test/CodeGen/R600/local-atomics.ll
+++ b/test/CodeGen/R600/local-atomics.ll
@@ -4,8 +4,8 @@
 
 ; FUNC-LABEL: {{^}}lds_atomic_xchg_ret_i32:
 ; EG: LDS_WRXCHG_RET *
-; SI: s_load_dword [[SPTR:s[0-9]+]],
 ; SI: v_mov_b32_e32 [[DATA:v[0-9]+]], 4
+; SI: s_load_dword [[SPTR:s[0-9]+]],
 ; SI: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]]
 ; SI: ds_wrxchg_rtn_b32 [[RESULT:v[0-9]+]], [[VPTR]], [[DATA]] [M0]
 ; SI: buffer_store_dword [[RESULT]],
@@ -30,8 +30,8 @@ define void @lds_atomic_xchg_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspac
 ; XXX - Is it really necessary to load 4 into VGPR?
 ; FUNC-LABEL: {{^}}lds_atomic_add_ret_i32:
 ; EG: LDS_ADD_RET *
-; SI: s_load_dword [[SPTR:s[0-9]+]],
 ; SI: v_mov_b32_e32 [[DATA:v[0-9]+]], 4
+; SI: s_load_dword [[SPTR:s[0-9]+]],
 ; SI: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]]
 ; SI: ds_add_rtn_u32 [[RESULT:v[0-9]+]], [[VPTR]], [[DATA]] [M0]
 ; SI: buffer_store_dword [[RESULT]],
diff --git a/test/CodeGen/R600/local-atomics64.ll b/test/CodeGen/R600/local-atomics64.ll
index 0e5a94f25cd..0b808571011 100644
--- a/test/CodeGen/R600/local-atomics64.ll
+++ b/test/CodeGen/R600/local-atomics64.ll
@@ -29,10 +29,10 @@ define void @lds_atomic_add_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %p
 }
 
 ; FUNC-LABEL: {{^}}lds_atomic_add_ret_i64_offset:
-; SI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
 ; SI: s_mov_b64 s{{\[}}[[LOSDATA:[0-9]+]]:[[HISDATA:[0-9]+]]{{\]}}, 9
 ; SI-DAG: v_mov_b32_e32 v[[LOVDATA:[0-9]+]], s[[LOSDATA]]
 ; SI-DAG: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], s[[HISDATA]]
+; SI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
 ; SI-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
 ; SI: ds_add_rtn_u64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[VPTR]], v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}} offset:32 [M0]
 ; SI: buffer_store_dwordx2 [[RESULT]],
diff --git a/test/CodeGen/R600/local-memory-two-objects.ll b/test/CodeGen/R600/local-memory-two-objects.ll
index 5c77ad5e6ca..86edfde4e34 100644
--- a/test/CodeGen/R600/local-memory-two-objects.ll
+++ b/test/CodeGen/R600/local-memory-two-objects.ll
@@ -31,8 +31,8 @@
 ; EG-CHECK-NOT: LDS_READ_RET {{[*]*}} OQAP, T[[ADDRR]]
 ; SI: v_add_i32_e32 [[SIPTR:v[0-9]+]], 16, v{{[0-9]+}}
 ; SI: ds_read_b32 {{v[0-9]+}}, [[SIPTR]] [M0]
-; CI: ds_read_b32 {{v[0-9]+}}, [[ADDRR:v[0-9]+]] offset:16 [M0]
-; CI: ds_read_b32 {{v[0-9]+}}, [[ADDRR]] [M0]
+; CI: ds_read_b32 {{v[0-9]+}}, [[ADDRR:v[0-9]+]] [M0]
+; CI: ds_read_b32 {{v[0-9]+}}, [[ADDRR]] offset:16 [M0]
 
 define void @local_memory_two_objects(i32 addrspace(1)* %out) {
 entry:
diff --git a/test/CodeGen/R600/si-triv-disjoint-mem-access.ll b/test/CodeGen/R600/si-triv-disjoint-mem-access.ll
index b2f4a9ff05e..f6dcb388248 100644
--- a/test/CodeGen/R600/si-triv-disjoint-mem-access.ll
+++ b/test/CodeGen/R600/si-triv-disjoint-mem-access.ll
@@ -51,8 +51,8 @@ define void @no_reorder_local_load_volatile_global_store_local_load(i32 addrspac
 
 ; FUNC-LABEL: @no_reorder_barrier_local_load_global_store_local_load
 ; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:4
-; CI: buffer_store_dword
 ; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:8
+; CI: buffer_store_dword
 define void @no_reorder_barrier_local_load_global_store_local_load(i32 addrspace(1)* %out, i32 addrspace(1)* %gptr) #0 {
   %ptr0 = load i32 addrspace(3)* addrspace(3)* @stored_lds_ptr, align 4
 
diff --git a/test/CodeGen/R600/smrd.ll b/test/CodeGen/R600/smrd.ll
index a66ad0201bf..de60554a779 100644
--- a/test/CodeGen/R600/smrd.ll
+++ b/test/CodeGen/R600/smrd.ll
@@ -37,10 +37,12 @@ entry:
 
 ; SMRD load with a 64-bit offset
 ; CHECK-LABEL: {{^}}smrd3:
-; CHECK-DAG: s_mov_b32 s[[SHI:[0-9]+]], 4
-; CHECK-DAG: s_mov_b32 s[[SLO:[0-9]+]], 0 ;
-; FIXME: We don't need to copy these values to VGPRs
-; CHECK-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], s[[SLO]]
+; FIXME: There are too many copies here because we don't fold immediates
+;        through REG_SEQUENCE
+; CHECK: s_mov_b32 s[[SLO:[0-9]+]], 0 ;
+; CHECK: s_mov_b32 s[[SHI:[0-9]+]], 4
+; CHECK: s_mov_b32 s[[SSLO:[0-9]+]], s[[SLO]]
+; CHECK-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], s[[SSLO]]
 ; CHECK-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[SHI]]
 ; FIXME: We should be able to use s_load_dword here
 ; CHECK: buffer_load_dword v{{[0-9]+}}, v{{\[}}[[VLO]]:[[VHI]]{{\]}}, s[{{[0-9]+:[0-9]+}}], 0 addr64
diff --git a/test/CodeGen/R600/valu-i1.ll b/test/CodeGen/R600/valu-i1.ll
index a4027178431..5a3c2ec9b03 100644
--- a/test/CodeGen/R600/valu-i1.ll
+++ b/test/CodeGen/R600/valu-i1.ll
@@ -81,7 +81,6 @@ exit:
 ; SI: buffer_store_dword
 ; SI: v_cmp_eq_i32_e32 vcc,
 ; SI: s_or_b64 [[OR_SREG:s\[[0-9]+:[0-9]+\]]]
-; SI: v_add_i32_e32 {{v[0-9]+}}, 1, {{v[0-9]+}}
 ; SI: s_andn2_b64 exec, exec, [[OR_SREG]]
 ; SI: s_cbranch_execnz BB2_3
 
@@ -124,8 +123,8 @@ exit:
 
 ; Clear exec bits for workitems that load -1s
 ; SI: BB3_3:
-; SI: buffer_load_dword [[A:v[0-9]+]]
 ; SI: buffer_load_dword [[B:v[0-9]+]]
+; SI: buffer_load_dword [[A:v[0-9]+]]
 ; SI-DAG: v_cmp_ne_i32_e64 [[NEG1_CHECK_0:s\[[0-9]+:[0-9]+\]]], [[A]], -1
 ; SI-DAG: v_cmp_ne_i32_e64 [[NEG1_CHECK_1:s\[[0-9]+:[0-9]+\]]], [[B]], -1
 ; SI: s_and_b64 [[ORNEG1:s\[[0-9]+:[0-9]+\]]], [[NEG1_CHECK_1]], [[NEG1_CHECK_0]]
diff --git a/test/CodeGen/R600/wait.ll b/test/CodeGen/R600/wait.ll
index b30f86ccd0d..c54b8dcbfaf 100644
--- a/test/CodeGen/R600/wait.ll
+++ b/test/CodeGen/R600/wait.ll
@@ -3,9 +3,8 @@
 ; CHECK-LABEL: {{^}}main:
 ; CHECK: s_load_dwordx4
 ; CHECK: s_load_dwordx4
-; CHECK: s_waitcnt lgkmcnt(0){{$}}
-; CHECK: s_waitcnt vmcnt(0){{$}}
-; CHECK: s_waitcnt expcnt(0) lgkmcnt(0){{$}}
+; CHECK: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; CHECK: s_endpgm
 define void @main(<16 x i8> addrspace(2)* inreg %arg, <16 x i8> addrspace(2)* inreg %arg1, <32 x i8> addrspace(2)* inreg %arg2, <16 x i8> addrspace(2)* inreg %arg3, <16 x i8> addrspace(2)* inreg %arg4, i32 inreg %arg5, i32 %arg6, i32 %arg7, i32 %arg8, i32 %arg9, float addrspace(2)* inreg %constptr) #0 {
 main_body:
   %tmp = getelementptr <16 x i8> addrspace(2)* %arg3, i32 0
diff --git a/test/CodeGen/R600/xor.ll b/test/CodeGen/R600/xor.ll
index b444b4ec08e..f7bc8757a2b 100644
--- a/test/CodeGen/R600/xor.ll
+++ b/test/CodeGen/R600/xor.ll
@@ -57,8 +57,8 @@ define void @xor_i1(float addrspace(1)* %out, float addrspace(1)* %in0, float ad
 }
 
 ; FUNC-LABEL: {{^}}v_xor_i1:
-; SI: buffer_load_ubyte [[A:v[0-9]+]]
 ; SI: buffer_load_ubyte [[B:v[0-9]+]]
+; SI: buffer_load_ubyte [[A:v[0-9]+]]
 ; SI: v_xor_b32_e32 [[XOR:v[0-9]+]], [[B]], [[A]]
 ; SI: v_and_b32_e32 [[RESULT:v[0-9]+]], 1, [[XOR]]
 ; SI: buffer_store_byte [[RESULT]]
diff --git a/test/CodeGen/R600/zero_extend.ll b/test/CodeGen/R600/zero_extend.ll
index 7df4b48c2b9..f22b8a29254 100644
--- a/test/CodeGen/R600/zero_extend.ll
+++ b/test/CodeGen/R600/zero_extend.ll
@@ -29,9 +29,9 @@ entry:
 }
 
 ; SI-CHECK-LABEL: {{^}}zext_i1_to_i64:
+; SI-CHECK: s_mov_b32 s{{[0-9]+}}, 0
 ; SI-CHECK: v_cmp_eq_i32
 ; SI-CHECK: v_cndmask_b32
-; SI-CHECK: s_mov_b32 s{{[0-9]+}}, 0
 define void @zext_i1_to_i64(i64 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
   %cmp = icmp eq i32 %a, %b
   %ext = zext i1 %cmp to i64
author	Tom Stellard <thomas.stellard@amd.com>	2013-07-19 11:50:00 -0700
committer	Tom Stellard <thomas.stellard@amd.com>	2015-01-08 20:07:31 -0800
commit	f11db5d5151e41e445cc8965f0d370414297ba23 (patch)
tree	3bc3b43b7cff8dc845d4610f4f6ebc844d2745fc
parent	260163c3963afa094645f3fe3d1ff731a81a380a (diff)