summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorTom Stellard <thomas.stellard@amd.com>2013-07-19 11:50:00 -0700
committerTom Stellard <thomas.stellard@amd.com>2015-01-08 20:07:31 -0800
commitf11db5d5151e41e445cc8965f0d370414297ba23 (patch)
tree3bc3b43b7cff8dc845d4610f4f6ebc844d2745fc
parent260163c3963afa094645f3fe3d1ff731a81a380a (diff)
R600/SI: Define a schedule model and enable the generic machine scheduler
The schedule model is not complete yet, and could be improved.
-rw-r--r--lib/Target/R600/AMDGPUSubtarget.cpp21
-rw-r--r--lib/Target/R600/AMDGPUSubtarget.h13
-rw-r--r--lib/Target/R600/Processors.td32
-rw-r--r--lib/Target/R600/SIInstrFormats.td10
-rw-r--r--lib/Target/R600/SIInstructions.td54
-rw-r--r--lib/Target/R600/SIRegisterInfo.cpp55
-rw-r--r--lib/Target/R600/SIRegisterInfo.h12
-rw-r--r--lib/Target/R600/SISchedule.td80
-rw-r--r--test/CodeGen/R600/atomic_cmp_swap_local.ll6
-rw-r--r--test/CodeGen/R600/ctpop.ll4
-rw-r--r--test/CodeGen/R600/ds_read2st64.ll4
-rw-r--r--test/CodeGen/R600/fceil64.ll4
-rw-r--r--test/CodeGen/R600/ffloor.ll4
-rw-r--r--test/CodeGen/R600/fmax3.ll6
-rw-r--r--test/CodeGen/R600/fmin3.ll6
-rw-r--r--test/CodeGen/R600/fneg-fabs.f64.ll2
-rw-r--r--test/CodeGen/R600/ftrunc.f64.ll4
-rw-r--r--test/CodeGen/R600/llvm.memcpy.ll34
-rw-r--r--test/CodeGen/R600/local-atomics.ll4
-rw-r--r--test/CodeGen/R600/local-atomics64.ll2
-rw-r--r--test/CodeGen/R600/local-memory-two-objects.ll4
-rw-r--r--test/CodeGen/R600/si-triv-disjoint-mem-access.ll2
-rw-r--r--test/CodeGen/R600/smrd.ll10
-rw-r--r--test/CodeGen/R600/valu-i1.ll3
-rw-r--r--test/CodeGen/R600/wait.ll5
-rw-r--r--test/CodeGen/R600/xor.ll2
-rw-r--r--test/CodeGen/R600/zero_extend.ll2
27 files changed, 302 insertions, 83 deletions
diff --git a/lib/Target/R600/AMDGPUSubtarget.cpp b/lib/Target/R600/AMDGPUSubtarget.cpp
index a80518807eb..3b73a310e01 100644
--- a/lib/Target/R600/AMDGPUSubtarget.cpp
+++ b/lib/Target/R600/AMDGPUSubtarget.cpp
@@ -19,8 +19,7 @@
#include "SIInstrInfo.h"
#include "SIISelLowering.h"
#include "llvm/ADT/SmallString.h"
-
-#include "llvm/ADT/SmallString.h"
+#include "llvm/CodeGen/MachineScheduler.h"
using namespace llvm;
@@ -115,3 +114,21 @@ unsigned AMDGPUSubtarget::getAmdKernelCodeChipID() const {
case SEA_ISLANDS: return 12;
}
}
+
+void AMDGPUSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
+ MachineInstr *begin,
+ MachineInstr *end,
+ unsigned NumRegionInstrs) const {
+ if (getGeneration() >= SOUTHERN_ISLANDS) {
+
+ // Track register pressure so the scheduler can try to decrease
+ // pressure once register usage is above the threshold defined by
+ // SIRegisterInfo::getRegPressureSetLimit()
+ Policy.ShouldTrackPressure = true;
+
+ // Enabling both top down and bottom up scheduling seems to give us less
+ // register spills than just using one of these approaches on its own.
+ Policy.OnlyTopDown = false;
+ Policy.OnlyBottomUp = false;
+ }
+}
diff --git a/lib/Target/R600/AMDGPUSubtarget.h b/lib/Target/R600/AMDGPUSubtarget.h
index 0642b4ad5c8..ff1b8b1df68 100644
--- a/lib/Target/R600/AMDGPUSubtarget.h
+++ b/lib/Target/R600/AMDGPUSubtarget.h
@@ -203,9 +203,13 @@ public:
unsigned getAmdKernelCodeChipID() const;
bool enableMachineScheduler() const override {
- return getGeneration() <= NORTHERN_ISLANDS;
+ return true;
}
+ void overrideSchedPolicy(MachineSchedPolicy &Policy,
+ MachineInstr *begin, MachineInstr *end,
+ unsigned NumRegionInstrs) const override;
+
// Helper functions to simplify if statements
bool isTargetELF() const {
return false;
@@ -224,6 +228,13 @@ public:
bool isAmdHsaOS() const {
return TargetTriple.getOS() == Triple::AMDHSA;
}
+ unsigned getMaxWavesPerCU() const {
+ if (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS)
+ return 10;
+
+ // FIXME: Not sure what this is for other subtagets.
+ llvm_unreachable("do not know max waves per CU for this subtarget.");
+ }
};
} // End namespace llvm
diff --git a/lib/Target/R600/Processors.td b/lib/Target/R600/Processors.td
index fa78ffcbde0..cff97cdb3be 100644
--- a/lib/Target/R600/Processors.td
+++ b/lib/Target/R600/Processors.td
@@ -83,34 +83,38 @@ def : Proc<"cayman", R600_VLIW4_Itin,
// Southern Islands
//===----------------------------------------------------------------------===//
-def : Proc<"SI", SI_Itin, [FeatureSouthernIslands]>;
+def : ProcessorModel<"SI", SIFullSpeedModel, [FeatureSouthernIslands]>;
-def : Proc<"tahiti", SI_Itin, [FeatureSouthernIslands]>;
+def : ProcessorModel<"tahiti", SIFullSpeedModel, [FeatureSouthernIslands]>;
-def : Proc<"pitcairn", SI_Itin, [FeatureSouthernIslands]>;
+def : ProcessorModel<"pitcairn", SIQuarterSpeedModel, [FeatureSouthernIslands]>;
-def : Proc<"verde", SI_Itin, [FeatureSouthernIslands]>;
+def : ProcessorModel<"verde", SIQuarterSpeedModel, [FeatureSouthernIslands]>;
-def : Proc<"oland", SI_Itin, [FeatureSouthernIslands]>;
+def : ProcessorModel<"oland", SIQuarterSpeedModel, [FeatureSouthernIslands]>;
-def : Proc<"hainan", SI_Itin, [FeatureSouthernIslands]>;
+def : ProcessorModel<"hainan", SIQuarterSpeedModel, [FeatureSouthernIslands]>;
//===----------------------------------------------------------------------===//
// Sea Islands
//===----------------------------------------------------------------------===//
-def : Proc<"bonaire", SI_Itin, [FeatureSeaIslands]>;
+def : ProcessorModel<"bonaire", SIQuarterSpeedModel, [FeatureSeaIslands]>;
-def : Proc<"kabini", SI_Itin, [FeatureSeaIslands]>;
+def : ProcessorModel<"kabini", SIQuarterSpeedModel, [FeatureSeaIslands]>;
-def : Proc<"kaveri", SI_Itin, [FeatureSeaIslands]>;
+def : ProcessorModel<"kaveri", SIQuarterSpeedModel, [FeatureSeaIslands]>;
-def : Proc<"hawaii", SI_Itin, [FeatureSeaIslands]>;
+def : ProcessorModel<"hawaii", SIFullSpeedModel, [FeatureSeaIslands]>;
-def : Proc<"mullins", SI_Itin, [FeatureSeaIslands]>;
+def : ProcessorModel<"mullins", SIQuarterSpeedModel, [FeatureSeaIslands]>;
-def : Proc<"tonga", SI_Itin, [FeatureVolcanicIslands]>;
+//===----------------------------------------------------------------------===//
+// Volcanic Islands
+//===----------------------------------------------------------------------===//
+
+def : ProcessorModel<"tonga", SIFullSpeedModel, [FeatureVolcanicIslands]>;
-def : Proc<"iceland", SI_Itin, [FeatureVolcanicIslands]>;
+def : ProcessorModel<"iceland", SIQuarterSpeedModel, [FeatureVolcanicIslands]>;
-def : Proc<"carrizo", SI_Itin, [FeatureVolcanicIslands]>;
+def : ProcessorModel<"carrizo", SIQuarterSpeedModel, [FeatureVolcanicIslands]>;
diff --git a/lib/Target/R600/SIInstrFormats.td b/lib/Target/R600/SIInstrFormats.td
index ff8db67a619..99a1df36c1f 100644
--- a/lib/Target/R600/SIInstrFormats.td
+++ b/lib/Target/R600/SIInstrFormats.td
@@ -68,6 +68,7 @@ class InstSI <dag outs, dag ins, string asm, list<dag> pattern> :
// Most instructions require adjustments after selection to satisfy
// operand requirements.
let hasPostISelHook = 1;
+ let SchedRW = [Write32Bit];
}
class Enc32 {
@@ -214,9 +215,9 @@ class SMRDe <bits<5> op, bits<1> imm> : Enc32 {
let Inst{31-27} = 0x18; //encoding
}
+let SchedRW = [WriteSALU] in {
class SOP1 <dag outs, dag ins, string asm, list<dag> pattern> :
InstSI<outs, ins, asm, pattern> {
-
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
@@ -274,6 +275,8 @@ class SOPP <bits<7> op, dag ins, string asm, list<dag> pattern = []> :
let UseNamedOperandTable = 1;
}
+} // let SchedRW = [WriteSALU]
+
class SMRD <dag outs, dag ins, string asm, list<dag> pattern> :
InstSI<outs, ins, asm, pattern> {
@@ -283,6 +286,7 @@ class SMRD <dag outs, dag ins, string asm, list<dag> pattern> :
let mayLoad = 1;
let hasSideEffects = 0;
let UseNamedOperandTable = 1;
+ let SchedRW = [WriteSMEM];
}
//===----------------------------------------------------------------------===//
@@ -588,6 +592,7 @@ class DS <dag outs, dag ins, string asm, list<dag> pattern> :
let DS = 1;
let UseNamedOperandTable = 1;
let DisableEncoding = "$m0";
+ let SchedRW = [WriteLDS];
}
class DS_si <bits<8> op, dag outs, dag ins, string asm, list<dag> pattern> :
@@ -602,6 +607,7 @@ class MUBUF <dag outs, dag ins, string asm, list<dag> pattern> :
let hasSideEffects = 0;
let UseNamedOperandTable = 1;
+ let SchedRW = [WriteVMEM];
}
class MTBUF <dag outs, dag ins, string asm, list<dag> pattern> :
@@ -613,6 +619,7 @@ class MTBUF <dag outs, dag ins, string asm, list<dag> pattern> :
let hasSideEffects = 0;
let UseNamedOperandTable = 1;
+ let SchedRW = [WriteVMEM];
}
class FLAT <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> :
@@ -641,5 +648,4 @@ class MIMG <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> :
}
-
} // End Uses = [EXEC]
diff --git a/lib/Target/R600/SIInstructions.td b/lib/Target/R600/SIInstructions.td
index 4941d6491b1..0fe2b79a162 100644
--- a/lib/Target/R600/SIInstructions.td
+++ b/lib/Target/R600/SIInstructions.td
@@ -1191,6 +1191,8 @@ defm V_MOV_B32 : VOP1Inst <vop1<0x1>, "v_mov_b32", VOP_I32_I32>;
let Uses = [EXEC] in {
+// FIXME: Specify SchedRW for READFIRSTLANE_B32
+
def V_READFIRSTLANE_B32 : VOP1 <
0x00000002,
(outs SReg_32:$vdst),
@@ -1201,6 +1203,8 @@ def V_READFIRSTLANE_B32 : VOP1 <
}
+let SchedRW = [WriteQuarterRate32] in {
+
defm V_CVT_I32_F64 : VOP1Inst <vop1<0x3>, "v_cvt_i32_f64",
VOP_I32_F64, fp_to_sint
>;
@@ -1253,6 +1257,9 @@ defm V_CVT_U32_F64 : VOP1Inst <vop1<0x15>, "v_cvt_u32_f64",
defm V_CVT_F64_U32 : VOP1Inst <vop1<0x16>, "v_cvt_f64_u32",
VOP_F64_I32, uint_to_fp
>;
+
+} // let SchedRW = [WriteQuarterRate32]
+
defm V_FRACT_F32 : VOP1Inst <vop1<0x20, 0x1b>, "v_fract_f32",
VOP_F32_F32, AMDGPUfract
>;
@@ -1271,6 +1278,9 @@ defm V_FLOOR_F32 : VOP1Inst <vop1<0x24, 0x1f>, "v_floor_f32",
defm V_EXP_F32 : VOP1Inst <vop1<0x25, 0x20>, "v_exp_f32",
VOP_F32_F32, fexp2
>;
+
+let SchedRW = [WriteQuarterRate32] in {
+
defm V_LOG_F32 : VOP1Inst <vop1<0x27, 0x21>, "v_log_f32",
VOP_F32_F32, flog2
>;
@@ -1283,18 +1293,32 @@ defm V_RCP_IFLAG_F32 : VOP1Inst <vop1<0x2b, 0x23>, "v_rcp_iflag_f32",
defm V_RSQ_F32 : VOP1Inst <vop1<0x2e, 0x24>, "v_rsq_f32",
VOP_F32_F32, AMDGPUrsq
>;
+
+} //let SchedRW = [WriteQuarterRate32]
+
+let SchedRW = [WriteDouble] in {
+
defm V_RCP_F64 : VOP1Inst <vop1<0x2f, 0x25>, "v_rcp_f64",
VOP_F64_F64, AMDGPUrcp
>;
defm V_RSQ_F64 : VOP1Inst <vop1<0x31, 0x26>, "v_rsq_f64",
VOP_F64_F64, AMDGPUrsq
>;
+
+} // let SchedRW = [WriteDouble];
+
defm V_SQRT_F32 : VOP1Inst <vop1<0x33, 0x27>, "v_sqrt_f32",
VOP_F32_F32, fsqrt
>;
+
+let SchedRW = [WriteDouble] in {
+
defm V_SQRT_F64 : VOP1Inst <vop1<0x34, 0x28>, "v_sqrt_f64",
VOP_F64_F64, fsqrt
>;
+
+} // let SchedRW = [WriteDouble]
+
defm V_SIN_F32 : VOP1Inst <vop1<0x35, 0x29>, "v_sin_f32",
VOP_F32_F32, AMDGPUsin
>;
@@ -1323,6 +1347,8 @@ defm V_MOVRELSD_B32 : VOP1Inst <vop1<0x44, 0x38>, "v_movrelsd_b32", VOP_I32_I32>
// These instruction only exist on SI and CI
let SubtargetPredicate = isSICI in {
+let SchedRW = [WriteQuarterRate32] in {
+
defm V_LOG_CLAMP_F32 : VOP1InstSI <vop1<0x26>, "v_log_clamp_f32", VOP_F32_F32>;
defm V_RCP_CLAMP_F32 : VOP1InstSI <vop1<0x28>, "v_rcp_clamp_f32", VOP_F32_F32>;
defm V_RCP_LEGACY_F32 : VOP1InstSI <vop1<0x29>, "v_rcp_legacy_f32", VOP_F32_F32>;
@@ -1332,17 +1358,25 @@ defm V_RSQ_CLAMP_F32 : VOP1InstSI <vop1<0x2c>, "v_rsq_clamp_f32",
defm V_RSQ_LEGACY_F32 : VOP1InstSI <vop1<0x2d>, "v_rsq_legacy_f32",
VOP_F32_F32, AMDGPUrsq_legacy
>;
+
+} // End let SchedRW = [WriteQuarterRate32]
+
+let SchedRW = [WriteDouble] in {
+
defm V_RCP_CLAMP_F64 : VOP1InstSI <vop1<0x30>, "v_rcp_clamp_f64", VOP_F64_F64>;
defm V_RSQ_CLAMP_F64 : VOP1InstSI <vop1<0x32>, "v_rsq_clamp_f64",
VOP_F64_F64, AMDGPUrsq_clamped
>;
+} // End SchedRW = [WriteDouble]
+
} // End SubtargetPredicate = isSICI
//===----------------------------------------------------------------------===//
// VINTRP Instructions
//===----------------------------------------------------------------------===//
+// FIXME: Specify SchedRW for VINTRP insturctions.
defm V_INTERP_P1_F32 : VINTRP_m <
0x00000000, "v_interp_p1_f32",
(outs VReg_32:$dst),
@@ -1656,11 +1690,15 @@ defm V_SAD_U32 : VOP3Inst <vop3<0x15d, 0x1dc>, "v_sad_u32",
defm V_DIV_FIXUP_F32 : VOP3Inst <
vop3<0x15f, 0x1de>, "v_div_fixup_f32", VOP_F32_F32_F32_F32, AMDGPUdiv_fixup
>;
+
+let SchedRW = [WriteDouble] in {
+
defm V_DIV_FIXUP_F64 : VOP3Inst <
vop3<0x160, 0x1df>, "v_div_fixup_f64", VOP_F64_F64_F64_F64, AMDGPUdiv_fixup
>;
-// Only on SI
+} // let SchedRW = [WriteDouble]
+
defm V_LSHL_B64 : VOP3Inst <vop3<0x161>, "v_lshl_b64",
VOP_I64_I64_I32, shl
>;
@@ -1675,6 +1713,7 @@ defm V_ASHR_I64 : VOP3Inst <vop3<0x163>, "v_ashr_i64",
VOP_I64_I64_I32, sra
>;
+let SchedRW = [WriteDouble] in {
let isCommutable = 1 in {
defm V_ADD_F64 : VOP3Inst <vop3<0x164, 0x280>, "v_add_f64",
@@ -1697,7 +1736,9 @@ defm V_LDEXP_F64 : VOP3Inst <vop3<0x168, 0x284>, "v_ldexp_f64",
VOP_F64_F64_I32, AMDGPUldexp
>;
-let isCommutable = 1 in {
+} // let SchedRW = [WriteDouble]
+
+let isCommutable = 1, SchedRW = [WriteQuarterRate32] in {
defm V_MUL_LO_U32 : VOP3Inst <vop3<0x169, 0x285>, "v_mul_lo_u32",
VOP_I32_I32_I32
@@ -1713,30 +1754,37 @@ defm V_MUL_HI_I32 : VOP3Inst <vop3<0x16c, 0x287>, "v_mul_hi_i32",
VOP_I32_I32_I32
>;
-} // isCommutable = 1
+} // isCommutable = 1, SchedRW = [WriteQuarterRate32]
defm V_DIV_SCALE_F32 : VOP3b_32 <vop3<0x16d, 0x1e0>, "v_div_scale_f32", []>;
+let SchedRW = [WriteDouble] in {
// Double precision division pre-scale.
defm V_DIV_SCALE_F64 : VOP3b_64 <vop3<0x16e, 0x1e1>, "v_div_scale_f64", []>;
+} // let SchedRW = [WriteDouble]
let isCommutable = 1 in {
defm V_DIV_FMAS_F32 : VOP3Inst <vop3<0x16f, 0x1e2>, "v_div_fmas_f32",
VOP_F32_F32_F32_F32, AMDGPUdiv_fmas
>;
+let SchedRW = [WriteDouble] in {
defm V_DIV_FMAS_F64 : VOP3Inst <vop3<0x170, 0x1e3>, "v_div_fmas_f64",
VOP_F64_F64_F64_F64, AMDGPUdiv_fmas
>;
+} // End SchedRW = [WriteDouble]
} // End isCommutable = 1
//def V_MSAD_U8 : VOP3_U8 <0x00000171, "v_msad_u8", []>;
//def V_QSAD_U8 : VOP3_U8 <0x00000172, "v_qsad_u8", []>;
//def V_MQSAD_U8 : VOP3_U8 <0x00000173, "v_mqsad_u8", []>;
+let SchedRW = [WriteDouble] in {
defm V_TRIG_PREOP_F64 : VOP3Inst <
vop3<0x174, 0x292>, "v_trig_preop_f64", VOP_F64_F64_I32, AMDGPUtrig_preop
>;
+} // let SchedRW = [WriteDouble]
+
//===----------------------------------------------------------------------===//
// Pseudo Instructions
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/R600/SIRegisterInfo.cpp b/lib/Target/R600/SIRegisterInfo.cpp
index 9dddd1ccd69..672f316f1f1 100644
--- a/lib/Target/R600/SIRegisterInfo.cpp
+++ b/lib/Target/R600/SIRegisterInfo.cpp
@@ -51,9 +51,32 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
return Reserved;
}
-unsigned SIRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
- MachineFunction &MF) const {
- return RC->getNumRegs();
+unsigned SIRegisterInfo::getRegPressureSetLimit(unsigned Idx) const {
+
+ // FIXME: We should adjust the max number of waves based on LDS size.
+ unsigned SGPRLimit = getNumSGPRsAllowed(ST.getMaxWavesPerCU());
+ unsigned VGPRLimit = getNumVGPRsAllowed(ST.getMaxWavesPerCU());
+
+ for (regclass_iterator I = regclass_begin(), E = regclass_end();
+ I != E; ++I) {
+
+ unsigned NumSubRegs = std::max((int)(*I)->getSize() / 4, 1);
+ unsigned Limit;
+
+ if (isSGPRClass(*I)) {
+ Limit = SGPRLimit / NumSubRegs;
+ } else {
+ Limit = VGPRLimit / NumSubRegs;
+ }
+
+ const int *Sets = getRegClassPressureSets(*I);
+ assert(Sets);
+ for (unsigned i = 0; Sets[i] != -1; ++i) {
+ if (Sets[i] == (int)Idx)
+ return Limit;
+ }
+ }
+ return 256;
}
bool SIRegisterInfo::requiresRegisterScavenging(const MachineFunction &Fn) const {
@@ -481,3 +504,29 @@ unsigned SIRegisterInfo::findUnusedRegister(const MachineRegisterInfo &MRI,
return AMDGPU::NoRegister;
}
+unsigned SIRegisterInfo::getNumVGPRsAllowed(unsigned WaveCount) const {
+ switch(WaveCount) {
+ case 10: return 24;
+ case 9: return 28;
+ case 8: return 32;
+ case 7: return 36;
+ case 6: return 40;
+ case 5: return 48;
+ case 4: return 64;
+ case 3: return 84;
+ case 2: return 128;
+ default: return 256;
+ }
+}
+
+unsigned SIRegisterInfo::getNumSGPRsAllowed(unsigned WaveCount) const {
+ switch(WaveCount) {
+ case 10: return 48;
+ case 9: return 56;
+ case 8: return 64;
+ case 7: return 72;
+ case 6: return 80;
+ case 5: return 96;
+ default: return 103;
+ }
+}
diff --git a/lib/Target/R600/SIRegisterInfo.h b/lib/Target/R600/SIRegisterInfo.h
index 7040fd0c3e9..ffd0d84c802 100644
--- a/lib/Target/R600/SIRegisterInfo.h
+++ b/lib/Target/R600/SIRegisterInfo.h
@@ -17,6 +17,7 @@
#define LLVM_LIB_TARGET_R600_SIREGISTERINFO_H
#include "AMDGPURegisterInfo.h"
+#include "llvm/Support/Debug.h"
namespace llvm {
@@ -26,8 +27,7 @@ struct SIRegisterInfo : public AMDGPURegisterInfo {
BitVector getReservedRegs(const MachineFunction &MF) const override;
- unsigned getRegPressureLimit(const TargetRegisterClass *RC,
- MachineFunction &MF) const override;
+ unsigned getRegPressureSetLimit(unsigned Idx) const override;
bool requiresRegisterScavenging(const MachineFunction &Fn) const override;
@@ -113,6 +113,14 @@ struct SIRegisterInfo : public AMDGPURegisterInfo {
unsigned getPreloadedValue(const MachineFunction &MF,
enum PreloadedValue Value) const;
+ /// \brief Give the maximum number of VGPRs that can be used by \p WaveCount
+ /// concurrent waves.
+ unsigned getNumVGPRsAllowed(unsigned WaveCount) const;
+
+ /// \brief Give the maximum number of SGPRs that can be used by \p WaveCount
+ /// concurrent waves.
+ unsigned getNumSGPRsAllowed(unsigned WaveCount) const;
+
unsigned findUnusedRegister(const MachineRegisterInfo &MRI,
const TargetRegisterClass *RC) const;
diff --git a/lib/Target/R600/SISchedule.td b/lib/Target/R600/SISchedule.td
index 28b65b82585..9b1f676020b 100644
--- a/lib/Target/R600/SISchedule.td
+++ b/lib/Target/R600/SISchedule.td
@@ -7,9 +7,85 @@
//
//===----------------------------------------------------------------------===//
//
-// TODO: This is just a place holder for now.
+// MachineModel definitions for Southern Islands (SI)
//
//===----------------------------------------------------------------------===//
+def WriteBranch : SchedWrite;
+def WriteExport : SchedWrite;
+def WriteLDS : SchedWrite;
+def WriteSALU : SchedWrite;
+def WriteSMEM : SchedWrite;
+def WriteVMEM : SchedWrite;
-def SI_Itin : ProcessorItineraries <[], [], []>;
+// Vector ALU instructions
+def Write32Bit : SchedWrite;
+def WriteQuarterRate32 : SchedWrite;
+
+def WriteFloatFMA : SchedWrite;
+
+def WriteDouble : SchedWrite;
+def WriteDoubleAdd : SchedWrite;
+
+def SIFullSpeedModel : SchedMachineModel;
+def SIQuarterSpeedModel : SchedMachineModel;
+
+// BufferSize = 0 means the processors are in-order.
+let BufferSize = 0 in {
+
+// XXX: Are the resource counts correct?
+def HWBranch : ProcResource<1>;
+def HWExport : ProcResource<7>; // Taken from S_WAITCNT
+def HWLGKM : ProcResource<31>; // Taken from S_WAITCNT
+def HWSALU : ProcResource<1>;
+def HWVMEM : ProcResource<15>; // Taken from S_WAITCNT
+def HWVALU : ProcResource<1>;
+
+}
+
+class HWWriteRes<SchedWrite write, list<ProcResourceKind> resources,
+ int latency> : WriteRes<write, resources> {
+ let Latency = latency;
+}
+
+class HWVALUWriteRes<SchedWrite write, int latency> :
+ HWWriteRes<write, [HWVALU], latency>;
+
+
+// The latency numbers are taken from AMD Accelerated Parallel Processing
+// guide. They may not be acurate.
+
+// The latency values are 1 / (operations / cycle) / 4.
+multiclass SICommonWriteRes {
+
+ def : HWWriteRes<WriteBranch, [HWBranch], 100>; // XXX: Guessed ???
+ def : HWWriteRes<WriteExport, [HWExport], 100>; // XXX: Guessed ???
+ def : HWWriteRes<WriteLDS, [HWLGKM], 32>; // 2 - 64
+ def : HWWriteRes<WriteSALU, [HWSALU], 1>;
+ def : HWWriteRes<WriteSMEM, [HWLGKM], 10>; // XXX: Guessed ???
+ def : HWWriteRes<WriteVMEM, [HWVMEM], 450>; // 300 - 600
+
+ def : HWVALUWriteRes<Write32Bit, 1>;
+ def : HWVALUWriteRes<WriteQuarterRate32, 4>;
+}
+
+
+let SchedModel = SIFullSpeedModel in {
+
+defm : SICommonWriteRes;
+
+def : HWVALUWriteRes<WriteFloatFMA, 1>;
+def : HWVALUWriteRes<WriteDouble, 4>;
+def : HWVALUWriteRes<WriteDoubleAdd, 2>;
+
+} // End SchedModel = SIFullSpeedModel
+
+let SchedModel = SIQuarterSpeedModel in {
+
+defm : SICommonWriteRes;
+
+def : HWVALUWriteRes<WriteFloatFMA, 16>;
+def : HWVALUWriteRes<WriteDouble, 16>;
+def : HWVALUWriteRes<WriteDoubleAdd, 8>;
+
+} // End SchedModel = SIQuarterSpeedModel
diff --git a/test/CodeGen/R600/atomic_cmp_swap_local.ll b/test/CodeGen/R600/atomic_cmp_swap_local.ll
index 8a5a3b64cf6..1e28fefbef5 100644
--- a/test/CodeGen/R600/atomic_cmp_swap_local.ll
+++ b/test/CodeGen/R600/atomic_cmp_swap_local.ll
@@ -2,9 +2,9 @@
; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=CI -check-prefix=FUNC %s
; FUNC-LABEL: {{^}}lds_atomic_cmpxchg_ret_i32_offset:
+; SI: v_mov_b32_e32 [[VCMP:v[0-9]+]], 7
; SI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
; SI: s_load_dword [[SWAP:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc
-; SI-DAG: v_mov_b32_e32 [[VCMP:v[0-9]+]], 7
; SI-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
; SI-DAG: v_mov_b32_e32 [[VSWAP:v[0-9]+]], [[SWAP]]
; SI: ds_cmpst_rtn_b32 [[RESULT:v[0-9]+]], [[VPTR]], [[VCMP]], [[VSWAP]] offset:16 [M0]
@@ -18,11 +18,11 @@ define void @lds_atomic_cmpxchg_ret_i32_offset(i32 addrspace(1)* %out, i32 addrs
}
; FUNC-LABEL: {{^}}lds_atomic_cmpxchg_ret_i64_offset:
-; SI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
-; SI: s_load_dwordx2 s{{\[}}[[LOSWAP:[0-9]+]]:[[HISWAP:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xd
; SI: s_mov_b64 s{{\[}}[[LOSCMP:[0-9]+]]:[[HISCMP:[0-9]+]]{{\]}}, 7
; SI-DAG: v_mov_b32_e32 v[[LOVCMP:[0-9]+]], s[[LOSCMP]]
; SI-DAG: v_mov_b32_e32 v[[HIVCMP:[0-9]+]], s[[HISCMP]]
+; SI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
+; SI: s_load_dwordx2 s{{\[}}[[LOSWAP:[0-9]+]]:[[HISWAP:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xd
; SI-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
; SI-DAG: v_mov_b32_e32 v[[LOSWAPV:[0-9]+]], s[[LOSWAP]]
; SI-DAG: v_mov_b32_e32 v[[HISWAPV:[0-9]+]], s[[HISWAP]]
diff --git a/test/CodeGen/R600/ctpop.ll b/test/CodeGen/R600/ctpop.ll
index a47bc876cb9..9cf2fe94fbb 100644
--- a/test/CodeGen/R600/ctpop.ll
+++ b/test/CodeGen/R600/ctpop.ll
@@ -38,11 +38,11 @@ define void @v_ctpop_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noali
}
; FUNC-LABEL: {{^}}v_ctpop_add_chain_i32:
-; SI: buffer_load_dword [[VAL0:v[0-9]+]],
; SI: buffer_load_dword [[VAL1:v[0-9]+]],
+; SI: buffer_load_dword [[VAL0:v[0-9]+]],
; SI: v_mov_b32_e32 [[VZERO:v[0-9]+]], 0
; SI: v_bcnt_u32_b32_e32 [[MIDRESULT:v[0-9]+]], [[VAL1]], [[VZERO]]
-; SI-NEXT: v_bcnt_u32_b32_e32 [[RESULT:v[0-9]+]], [[VAL0]], [[MIDRESULT]]
+; SI: v_bcnt_u32_b32_e32 [[RESULT:v[0-9]+]], [[VAL0]], [[MIDRESULT]]
; SI: buffer_store_dword [[RESULT]],
; SI: s_endpgm
diff --git a/test/CodeGen/R600/ds_read2st64.ll b/test/CodeGen/R600/ds_read2st64.ll
index 24834af2040..efd875e9317 100644
--- a/test/CodeGen/R600/ds_read2st64.ll
+++ b/test/CodeGen/R600/ds_read2st64.ll
@@ -65,8 +65,8 @@ define void @simple_read2st64_f32_max_offset(float addrspace(1)* %out, float add
; SI-LABEL: @simple_read2st64_f32_over_max_offset
; SI-NOT: ds_read2st64_b32
-; SI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:256
; SI: v_add_i32_e32 [[BIGADD:v[0-9]+]], 0x10000, {{v[0-9]+}}
+; SI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:256
; SI: ds_read_b32 {{v[0-9]+}}, [[BIGADD]]
; SI: s_endpgm
define void @simple_read2st64_f32_over_max_offset(float addrspace(1)* %out, float addrspace(3)* %lds) #0 {
@@ -197,8 +197,8 @@ define void @simple_read2st64_f64_max_offset(double addrspace(1)* %out, double a
; SI-LABEL: @simple_read2st64_f64_over_max_offset
; SI-NOT: ds_read2st64_b64
-; SI: ds_read_b64 {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset:512
; SI: v_add_i32_e32 [[BIGADD:v[0-9]+]], 0x10000, {{v[0-9]+}}
+; SI: ds_read_b64 {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset:512
; SI: ds_read_b64 {{v\[[0-9]+:[0-9]+\]}}, [[BIGADD]]
; SI: s_endpgm
define void @simple_read2st64_f64_over_max_offset(double addrspace(1)* %out, double addrspace(3)* %lds) #0 {
diff --git a/test/CodeGen/R600/fceil64.ll b/test/CodeGen/R600/fceil64.ll
index 77cd8eae402..107abe2fbbe 100644
--- a/test/CodeGen/R600/fceil64.ll
+++ b/test/CodeGen/R600/fceil64.ll
@@ -11,12 +11,12 @@ declare <16 x double> @llvm.ceil.v16f64(<16 x double>) nounwind readnone
; FUNC-LABEL: {{^}}fceil_f64:
; CI: v_ceil_f64_e32
; SI: s_bfe_u32 [[SEXP:s[0-9]+]], {{s[0-9]+}}, 0xb0014
+; SI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80000000
; SI: s_add_i32 s{{[0-9]+}}, [[SEXP]], 0xfffffc01
; SI: s_lshr_b64
; SI: s_not_b64
; SI: s_and_b64
-; SI-DAG: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80000000
-; SI-DAG: cmp_lt_i32
+; SI: cmp_lt_i32
; SI: cndmask_b32
; SI: cndmask_b32
; SI: cmp_gt_i32
diff --git a/test/CodeGen/R600/ffloor.ll b/test/CodeGen/R600/ffloor.ll
index 194d0aaa181..74491b85b1e 100644
--- a/test/CodeGen/R600/ffloor.ll
+++ b/test/CodeGen/R600/ffloor.ll
@@ -12,12 +12,12 @@ declare <16 x double> @llvm.floor.v16f64(<16 x double>) nounwind readnone
; CI: v_floor_f64_e32
; SI: s_bfe_u32 [[SEXP:s[0-9]+]], {{s[0-9]+}}, 0xb0014
+; SI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80000000
; SI: s_add_i32 s{{[0-9]+}}, [[SEXP]], 0xfffffc01
; SI: s_lshr_b64
; SI: s_not_b64
; SI: s_and_b64
-; SI-DAG: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80000000
-; SI-DAG: cmp_lt_i32
+; SI: cmp_lt_i32
; SI: cndmask_b32
; SI: cndmask_b32
; SI: cmp_gt_i32
diff --git a/test/CodeGen/R600/fmax3.ll b/test/CodeGen/R600/fmax3.ll
index 6f95bf20f73..78833d7606d 100644
--- a/test/CodeGen/R600/fmax3.ll
+++ b/test/CodeGen/R600/fmax3.ll
@@ -3,9 +3,9 @@
declare float @llvm.maxnum.f32(float, float) nounwind readnone
; SI-LABEL: {{^}}test_fmax3_olt_0:
-; SI: buffer_load_dword [[REGA:v[0-9]+]]
-; SI: buffer_load_dword [[REGB:v[0-9]+]]
; SI: buffer_load_dword [[REGC:v[0-9]+]]
+; SI: buffer_load_dword [[REGB:v[0-9]+]]
+; SI: buffer_load_dword [[REGA:v[0-9]+]]
; SI: v_max3_f32 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]]
; SI: buffer_store_dword [[RESULT]],
; SI: s_endpgm
@@ -21,8 +21,8 @@ define void @test_fmax3_olt_0(float addrspace(1)* %out, float addrspace(1)* %apt
; Commute operand of second fmax
; SI-LABEL: {{^}}test_fmax3_olt_1:
-; SI: buffer_load_dword [[REGA:v[0-9]+]]
; SI: buffer_load_dword [[REGB:v[0-9]+]]
+; SI: buffer_load_dword [[REGA:v[0-9]+]]
; SI: buffer_load_dword [[REGC:v[0-9]+]]
; SI: v_max3_f32 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]]
; SI: buffer_store_dword [[RESULT]],
diff --git a/test/CodeGen/R600/fmin3.ll b/test/CodeGen/R600/fmin3.ll
index aeeed1c7dd3..94631a149c0 100644
--- a/test/CodeGen/R600/fmin3.ll
+++ b/test/CodeGen/R600/fmin3.ll
@@ -3,9 +3,9 @@
declare float @llvm.minnum.f32(float, float) nounwind readnone
; SI-LABEL: {{^}}test_fmin3_olt_0:
-; SI: buffer_load_dword [[REGA:v[0-9]+]]
-; SI: buffer_load_dword [[REGB:v[0-9]+]]
; SI: buffer_load_dword [[REGC:v[0-9]+]]
+; SI: buffer_load_dword [[REGB:v[0-9]+]]
+; SI: buffer_load_dword [[REGA:v[0-9]+]]
; SI: v_min3_f32 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]]
; SI: buffer_store_dword [[RESULT]],
; SI: s_endpgm
@@ -21,8 +21,8 @@ define void @test_fmin3_olt_0(float addrspace(1)* %out, float addrspace(1)* %apt
; Commute operand of second fmin
; SI-LABEL: {{^}}test_fmin3_olt_1:
-; SI: buffer_load_dword [[REGA:v[0-9]+]]
; SI: buffer_load_dword [[REGB:v[0-9]+]]
+; SI: buffer_load_dword [[REGA:v[0-9]+]]
; SI: buffer_load_dword [[REGC:v[0-9]+]]
; SI: v_min3_f32 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]]
; SI: buffer_store_dword [[RESULT]],
diff --git a/test/CodeGen/R600/fneg-fabs.f64.ll b/test/CodeGen/R600/fneg-fabs.f64.ll
index 04a87e37785..c6e8608c79d 100644
--- a/test/CodeGen/R600/fneg-fabs.f64.ll
+++ b/test/CodeGen/R600/fneg-fabs.f64.ll
@@ -56,8 +56,8 @@ define void @fneg_fabs_fn_free_f64(double addrspace(1)* %out, i64 %in) {
}
; FUNC-LABEL: {{^}}fneg_fabs_f64:
-; SI: s_load_dwordx2
; SI: s_load_dwordx2 s{{\[}}[[LO_X:[0-9]+]]:[[HI_X:[0-9]+]]{{\]}}
+; SI: s_load_dwordx2
; SI: v_mov_b32_e32 [[IMMREG:v[0-9]+]], 0x80000000
; SI-DAG: v_or_b32_e32 v[[HI_V:[0-9]+]], s[[HI_X]], [[IMMREG]]
; SI-DAG: v_mov_b32_e32 v[[LO_V:[0-9]+]], s[[LO_X]]
diff --git a/test/CodeGen/R600/ftrunc.f64.ll b/test/CodeGen/R600/ftrunc.f64.ll
index 2c7217ef056..faf9b401b32 100644
--- a/test/CodeGen/R600/ftrunc.f64.ll
+++ b/test/CodeGen/R600/ftrunc.f64.ll
@@ -23,12 +23,12 @@ define void @v_ftrunc_f64(double addrspace(1)* %out, double addrspace(1)* %in) {
; CI: v_trunc_f64_e32
; SI: s_bfe_u32 [[SEXP:s[0-9]+]], {{s[0-9]+}}, 0xb0014
+; SI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80000000
; SI: s_add_i32 s{{[0-9]+}}, [[SEXP]], 0xfffffc01
; SI: s_lshr_b64
+; SI: cmp_lt_i32
; SI: s_not_b64
; SI: s_and_b64
-; SI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80000000
-; SI: cmp_lt_i32
; SI: cndmask_b32
; SI: cndmask_b32
; SI: cmp_gt_i32
diff --git a/test/CodeGen/R600/llvm.memcpy.ll b/test/CodeGen/R600/llvm.memcpy.ll
index 9771062ed0d..23d392c61d5 100644
--- a/test/CodeGen/R600/llvm.memcpy.ll
+++ b/test/CodeGen/R600/llvm.memcpy.ll
@@ -6,39 +6,23 @@ declare void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* nocapture, i8 addrspace
; FUNC-LABEL: {{^}}test_small_memcpy_i64_lds_to_lds_align1:
; SI: ds_read_u8
-; SI: ds_write_b8
; SI: ds_read_u8
-; SI: ds_write_b8
; SI: ds_read_u8
-; SI: ds_write_b8
; SI: ds_read_u8
-; SI: ds_write_b8
; SI: ds_read_u8
-; SI: ds_write_b8
-
; SI: ds_read_u8
-; SI: ds_write_b8
; SI: ds_read_u8
-; SI: ds_write_b8
; SI: ds_read_u8
-; SI: ds_write_b8
+
; SI: ds_read_u8
-; SI: ds_write_b8
; SI: ds_read_u8
-; SI: ds_write_b8
-
; SI: ds_read_u8
-; SI: ds_write_b8
; SI: ds_read_u8
-; SI: ds_write_b8
; SI: ds_read_u8
-; SI: ds_write_b8
; SI: ds_read_u8
-; SI: ds_write_b8
; SI: ds_read_u8
; SI: ds_read_u8
-
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8
@@ -65,6 +49,14 @@ declare void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* nocapture, i8 addrspace
; SI: ds_write_b8
; SI: ds_write_b8
; SI: ds_write_b8
+
+; SI: ds_write_b8
+; SI: ds_write_b8
+; SI: ds_write_b8
+; SI: ds_write_b8
+; SI: ds_write_b8
+; SI: ds_write_b8
+; SI: ds_write_b8
; SI: ds_write_b8
; SI: ds_write_b8
@@ -75,6 +67,14 @@ declare void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* nocapture, i8 addrspace
; SI: ds_write_b8
; SI: ds_write_b8
; SI: ds_write_b8
+
+; SI: ds_write_b8
+; SI: ds_write_b8
+; SI: ds_write_b8
+; SI: ds_write_b8
+; SI: ds_write_b8
+; SI: ds_write_b8
+; SI: ds_write_b8
; SI: ds_write_b8
; SI: s_endpgm
diff --git a/test/CodeGen/R600/local-atomics.ll b/test/CodeGen/R600/local-atomics.ll
index 16d3173f369..78d747d7e72 100644
--- a/test/CodeGen/R600/local-atomics.ll
+++ b/test/CodeGen/R600/local-atomics.ll
@@ -4,8 +4,8 @@
; FUNC-LABEL: {{^}}lds_atomic_xchg_ret_i32:
; EG: LDS_WRXCHG_RET *
-; SI: s_load_dword [[SPTR:s[0-9]+]],
; SI: v_mov_b32_e32 [[DATA:v[0-9]+]], 4
+; SI: s_load_dword [[SPTR:s[0-9]+]],
; SI: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]]
; SI: ds_wrxchg_rtn_b32 [[RESULT:v[0-9]+]], [[VPTR]], [[DATA]] [M0]
; SI: buffer_store_dword [[RESULT]],
@@ -30,8 +30,8 @@ define void @lds_atomic_xchg_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspac
; XXX - Is it really necessary to load 4 into VGPR?
; FUNC-LABEL: {{^}}lds_atomic_add_ret_i32:
; EG: LDS_ADD_RET *
-; SI: s_load_dword [[SPTR:s[0-9]+]],
; SI: v_mov_b32_e32 [[DATA:v[0-9]+]], 4
+; SI: s_load_dword [[SPTR:s[0-9]+]],
; SI: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]]
; SI: ds_add_rtn_u32 [[RESULT:v[0-9]+]], [[VPTR]], [[DATA]] [M0]
; SI: buffer_store_dword [[RESULT]],
diff --git a/test/CodeGen/R600/local-atomics64.ll b/test/CodeGen/R600/local-atomics64.ll
index 0e5a94f25cd..0b808571011 100644
--- a/test/CodeGen/R600/local-atomics64.ll
+++ b/test/CodeGen/R600/local-atomics64.ll
@@ -29,10 +29,10 @@ define void @lds_atomic_add_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %p
}
; FUNC-LABEL: {{^}}lds_atomic_add_ret_i64_offset:
-; SI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
; SI: s_mov_b64 s{{\[}}[[LOSDATA:[0-9]+]]:[[HISDATA:[0-9]+]]{{\]}}, 9
; SI-DAG: v_mov_b32_e32 v[[LOVDATA:[0-9]+]], s[[LOSDATA]]
; SI-DAG: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], s[[HISDATA]]
+; SI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
; SI-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
; SI: ds_add_rtn_u64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[VPTR]], v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}} offset:32 [M0]
; SI: buffer_store_dwordx2 [[RESULT]],
diff --git a/test/CodeGen/R600/local-memory-two-objects.ll b/test/CodeGen/R600/local-memory-two-objects.ll
index 5c77ad5e6ca..86edfde4e34 100644
--- a/test/CodeGen/R600/local-memory-two-objects.ll
+++ b/test/CodeGen/R600/local-memory-two-objects.ll
@@ -31,8 +31,8 @@
; EG-CHECK-NOT: LDS_READ_RET {{[*]*}} OQAP, T[[ADDRR]]
; SI: v_add_i32_e32 [[SIPTR:v[0-9]+]], 16, v{{[0-9]+}}
; SI: ds_read_b32 {{v[0-9]+}}, [[SIPTR]] [M0]
-; CI: ds_read_b32 {{v[0-9]+}}, [[ADDRR:v[0-9]+]] offset:16 [M0]
-; CI: ds_read_b32 {{v[0-9]+}}, [[ADDRR]] [M0]
+; CI: ds_read_b32 {{v[0-9]+}}, [[ADDRR:v[0-9]+]] [M0]
+; CI: ds_read_b32 {{v[0-9]+}}, [[ADDRR]] offset:16 [M0]
define void @local_memory_two_objects(i32 addrspace(1)* %out) {
entry:
diff --git a/test/CodeGen/R600/si-triv-disjoint-mem-access.ll b/test/CodeGen/R600/si-triv-disjoint-mem-access.ll
index b2f4a9ff05e..f6dcb388248 100644
--- a/test/CodeGen/R600/si-triv-disjoint-mem-access.ll
+++ b/test/CodeGen/R600/si-triv-disjoint-mem-access.ll
@@ -51,8 +51,8 @@ define void @no_reorder_local_load_volatile_global_store_local_load(i32 addrspac
; FUNC-LABEL: @no_reorder_barrier_local_load_global_store_local_load
; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:4
-; CI: buffer_store_dword
; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:8
+; CI: buffer_store_dword
define void @no_reorder_barrier_local_load_global_store_local_load(i32 addrspace(1)* %out, i32 addrspace(1)* %gptr) #0 {
%ptr0 = load i32 addrspace(3)* addrspace(3)* @stored_lds_ptr, align 4
diff --git a/test/CodeGen/R600/smrd.ll b/test/CodeGen/R600/smrd.ll
index a66ad0201bf..de60554a779 100644
--- a/test/CodeGen/R600/smrd.ll
+++ b/test/CodeGen/R600/smrd.ll
@@ -37,10 +37,12 @@ entry:
; SMRD load with a 64-bit offset
; CHECK-LABEL: {{^}}smrd3:
-; CHECK-DAG: s_mov_b32 s[[SHI:[0-9]+]], 4
-; CHECK-DAG: s_mov_b32 s[[SLO:[0-9]+]], 0 ;
-; FIXME: We don't need to copy these values to VGPRs
-; CHECK-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], s[[SLO]]
+; FIXME: There are too many copies here because we don't fold immediates
+; through REG_SEQUENCE
+; CHECK: s_mov_b32 s[[SLO:[0-9]+]], 0 ;
+; CHECK: s_mov_b32 s[[SHI:[0-9]+]], 4
+; CHECK: s_mov_b32 s[[SSLO:[0-9]+]], s[[SLO]]
+; CHECK-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], s[[SSLO]]
; CHECK-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[SHI]]
; FIXME: We should be able to use s_load_dword here
; CHECK: buffer_load_dword v{{[0-9]+}}, v{{\[}}[[VLO]]:[[VHI]]{{\]}}, s[{{[0-9]+:[0-9]+}}], 0 addr64
diff --git a/test/CodeGen/R600/valu-i1.ll b/test/CodeGen/R600/valu-i1.ll
index a4027178431..5a3c2ec9b03 100644
--- a/test/CodeGen/R600/valu-i1.ll
+++ b/test/CodeGen/R600/valu-i1.ll
@@ -81,7 +81,6 @@ exit:
; SI: buffer_store_dword
; SI: v_cmp_eq_i32_e32 vcc,
; SI: s_or_b64 [[OR_SREG:s\[[0-9]+:[0-9]+\]]]
-; SI: v_add_i32_e32 {{v[0-9]+}}, 1, {{v[0-9]+}}
; SI: s_andn2_b64 exec, exec, [[OR_SREG]]
; SI: s_cbranch_execnz BB2_3
@@ -124,8 +123,8 @@ exit:
; Clear exec bits for workitems that load -1s
; SI: BB3_3:
-; SI: buffer_load_dword [[A:v[0-9]+]]
; SI: buffer_load_dword [[B:v[0-9]+]]
+; SI: buffer_load_dword [[A:v[0-9]+]]
; SI-DAG: v_cmp_ne_i32_e64 [[NEG1_CHECK_0:s\[[0-9]+:[0-9]+\]]], [[A]], -1
; SI-DAG: v_cmp_ne_i32_e64 [[NEG1_CHECK_1:s\[[0-9]+:[0-9]+\]]], [[B]], -1
; SI: s_and_b64 [[ORNEG1:s\[[0-9]+:[0-9]+\]]], [[NEG1_CHECK_1]], [[NEG1_CHECK_0]]
diff --git a/test/CodeGen/R600/wait.ll b/test/CodeGen/R600/wait.ll
index b30f86ccd0d..c54b8dcbfaf 100644
--- a/test/CodeGen/R600/wait.ll
+++ b/test/CodeGen/R600/wait.ll
@@ -3,9 +3,8 @@
; CHECK-LABEL: {{^}}main:
; CHECK: s_load_dwordx4
; CHECK: s_load_dwordx4
-; CHECK: s_waitcnt lgkmcnt(0){{$}}
-; CHECK: s_waitcnt vmcnt(0){{$}}
-; CHECK: s_waitcnt expcnt(0) lgkmcnt(0){{$}}
+; CHECK: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; CHECK: s_endpgm
define void @main(<16 x i8> addrspace(2)* inreg %arg, <16 x i8> addrspace(2)* inreg %arg1, <32 x i8> addrspace(2)* inreg %arg2, <16 x i8> addrspace(2)* inreg %arg3, <16 x i8> addrspace(2)* inreg %arg4, i32 inreg %arg5, i32 %arg6, i32 %arg7, i32 %arg8, i32 %arg9, float addrspace(2)* inreg %constptr) #0 {
main_body:
%tmp = getelementptr <16 x i8> addrspace(2)* %arg3, i32 0
diff --git a/test/CodeGen/R600/xor.ll b/test/CodeGen/R600/xor.ll
index b444b4ec08e..f7bc8757a2b 100644
--- a/test/CodeGen/R600/xor.ll
+++ b/test/CodeGen/R600/xor.ll
@@ -57,8 +57,8 @@ define void @xor_i1(float addrspace(1)* %out, float addrspace(1)* %in0, float ad
}
; FUNC-LABEL: {{^}}v_xor_i1:
-; SI: buffer_load_ubyte [[A:v[0-9]+]]
; SI: buffer_load_ubyte [[B:v[0-9]+]]
+; SI: buffer_load_ubyte [[A:v[0-9]+]]
; SI: v_xor_b32_e32 [[XOR:v[0-9]+]], [[B]], [[A]]
; SI: v_and_b32_e32 [[RESULT:v[0-9]+]], 1, [[XOR]]
; SI: buffer_store_byte [[RESULT]]
diff --git a/test/CodeGen/R600/zero_extend.ll b/test/CodeGen/R600/zero_extend.ll
index 7df4b48c2b9..f22b8a29254 100644
--- a/test/CodeGen/R600/zero_extend.ll
+++ b/test/CodeGen/R600/zero_extend.ll
@@ -29,9 +29,9 @@ entry:
}
; SI-CHECK-LABEL: {{^}}zext_i1_to_i64:
+; SI-CHECK: s_mov_b32 s{{[0-9]+}}, 0
; SI-CHECK: v_cmp_eq_i32
; SI-CHECK: v_cndmask_b32
-; SI-CHECK: s_mov_b32 s{{[0-9]+}}, 0
define void @zext_i1_to_i64(i64 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
%cmp = icmp eq i32 %a, %b
%ext = zext i1 %cmp to i64