summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--lib/Target/R600/AMDGPUSubtarget.cpp12
-rw-r--r--lib/Target/R600/AMDGPUSubtarget.h6
-rw-r--r--lib/Target/R600/Processors.td24
-rw-r--r--lib/Target/R600/SIInstrFormats.td10
-rw-r--r--lib/Target/R600/SIInstructions.td48
-rw-r--r--lib/Target/R600/SIRegisterInfo.cpp54
-rw-r--r--lib/Target/R600/SIRegisterInfo.h12
-rw-r--r--lib/Target/R600/SISchedule.td76
8 files changed, 217 insertions, 25 deletions
diff --git a/lib/Target/R600/AMDGPUSubtarget.cpp b/lib/Target/R600/AMDGPUSubtarget.cpp
index 9d09a196370..e4674baecd7 100644
--- a/lib/Target/R600/AMDGPUSubtarget.cpp
+++ b/lib/Target/R600/AMDGPUSubtarget.cpp
@@ -19,8 +19,7 @@
#include "SIInstrInfo.h"
#include "SIISelLowering.h"
#include "llvm/ADT/SmallString.h"
-
-#include "llvm/ADT/SmallString.h"
+#include "llvm/CodeGen/MachineScheduler.h"
using namespace llvm;
@@ -107,3 +106,12 @@ unsigned AMDGPUSubtarget::getStackEntrySize() const {
llvm_unreachable("Illegal wavefront size.");
}
}
+
+void AMDGPUSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
+ MachineInstr *begin,
+ MachineInstr *end,
+ unsigned NumRegionInstrs) const {
+ if (getGeneration() >= SOUTHERN_ISLANDS) {
+ Policy.OnlyTopDown = true;
+ }
+}
diff --git a/lib/Target/R600/AMDGPUSubtarget.h b/lib/Target/R600/AMDGPUSubtarget.h
index f71d80a8ba0..3e44c6685d7 100644
--- a/lib/Target/R600/AMDGPUSubtarget.h
+++ b/lib/Target/R600/AMDGPUSubtarget.h
@@ -199,9 +199,13 @@ public:
}
bool enableMachineScheduler() const override {
- return getGeneration() <= NORTHERN_ISLANDS;
+ return true;
}
+ void overrideSchedPolicy(MachineSchedPolicy &Policy,
+ MachineInstr *begin, MachineInstr *end,
+ unsigned NumRegionInstrs) const override;
+
// Helper functions to simplify if statements
bool isTargetELF() const {
return false;
diff --git a/lib/Target/R600/Processors.td b/lib/Target/R600/Processors.td
index ce17d7cb7f1..17422f93ab8 100644
--- a/lib/Target/R600/Processors.td
+++ b/lib/Target/R600/Processors.td
@@ -83,28 +83,30 @@ def : Proc<"cayman", R600_VLIW4_Itin,
// Southern Islands
//===----------------------------------------------------------------------===//
-def : Proc<"SI", SI_Itin, [FeatureSouthernIslands]>;
+// FIXME: Which of these should use the half speed?
-def : Proc<"tahiti", SI_Itin, [FeatureSouthernIslands]>;
+def : ProcessorModel<"SI", SIFullSpeedModel, [FeatureSouthernIslands]>;
-def : Proc<"pitcairn", SI_Itin, [FeatureSouthernIslands]>;
+def : ProcessorModel<"tahiti", SIFullSpeedModel, [FeatureSouthernIslands]>;
-def : Proc<"verde", SI_Itin, [FeatureSouthernIslands]>;
+def : ProcessorModel<"pitcairn", SIFullSpeedModel, [FeatureSouthernIslands]>;
-def : Proc<"oland", SI_Itin, [FeatureSouthernIslands]>;
+def : ProcessorModel<"verde", SIFullSpeedModel, [FeatureSouthernIslands]>;
-def : Proc<"hainan", SI_Itin, [FeatureSouthernIslands]>;
+def : ProcessorModel<"oland", SIFullSpeedModel, [FeatureSouthernIslands]>;
+
+def : ProcessorModel<"hainan", SIFullSpeedModel, [FeatureSouthernIslands]>;
//===----------------------------------------------------------------------===//
// Sea Islands
//===----------------------------------------------------------------------===//
-def : Proc<"bonaire", SI_Itin, [FeatureSeaIslands]>;
+def : ProcessorModel<"bonaire", SIFullSpeedModel, [FeatureSeaIslands]>;
-def : Proc<"kabini", SI_Itin, [FeatureSeaIslands]>;
+def : ProcessorModel<"kabini", SIFullSpeedModel, [FeatureSeaIslands]>;
-def : Proc<"kaveri", SI_Itin, [FeatureSeaIslands]>;
+def : ProcessorModel<"kaveri", SIFullSpeedModel, [FeatureSeaIslands]>;
-def : Proc<"hawaii", SI_Itin, [FeatureSeaIslands]>;
+def : ProcessorModel<"hawaii", SIFullSpeedModel, [FeatureSeaIslands]>;
-def : Proc<"mullins", SI_Itin, [FeatureSeaIslands]>;
+def : ProcessorModel<"mullins", SIFullSpeedModel, [FeatureSeaIslands]>;
diff --git a/lib/Target/R600/SIInstrFormats.td b/lib/Target/R600/SIInstrFormats.td
index ee1a52b2f8f..4b688e0a97b 100644
--- a/lib/Target/R600/SIInstrFormats.td
+++ b/lib/Target/R600/SIInstrFormats.td
@@ -46,6 +46,7 @@ class InstSI <dag outs, dag ins, string asm, list<dag> pattern> :
// Most instructions require adjustments after selection to satisfy
// operand requirements.
let hasPostISelHook = 1;
+ let SchedRW = [Write32Bit];
}
class Enc32 {
@@ -161,6 +162,8 @@ class SMRDe <bits<5> op, bits<1> imm> : Enc32 {
let Inst{31-27} = 0x18; //encoding
}
+let SchedRW = [WriteSALU] in {
+
class SOP1 <bits<8> op, dag outs, dag ins, string asm, list<dag> pattern> :
InstSI<outs, ins, asm, pattern>, SOP1e <op> {
@@ -216,6 +219,8 @@ class SOPP <bits<7> op, dag ins, string asm, list<dag> pattern = []> :
let UseNamedOperandTable = 1;
}
+} // let SchedRW = [WriteSALU]
+
class SMRD <dag outs, dag ins, string asm, list<dag> pattern> :
InstSI<outs, ins, asm, pattern> {
@@ -225,6 +230,7 @@ class SMRD <dag outs, dag ins, string asm, list<dag> pattern> :
let mayLoad = 1;
let hasSideEffects = 0;
let UseNamedOperandTable = 1;
+ let SchedRW = [WriteSMEM];
}
//===----------------------------------------------------------------------===//
@@ -547,6 +553,7 @@ class DS <bits<8> op, dag outs, dag ins, string asm, list<dag> pattern> :
let LGKM_CNT = 1;
let UseNamedOperandTable = 1;
let DisableEncoding = "$m0";
+ let SchedRW = [WriteLDS];
}
class MUBUF <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> :
@@ -558,6 +565,7 @@ class MUBUF <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> :
let hasSideEffects = 0;
let UseNamedOperandTable = 1;
+ let SchedRW = [WriteVMEM];
}
class MTBUF <dag outs, dag ins, string asm, list<dag> pattern> :
@@ -569,6 +577,7 @@ class MTBUF <dag outs, dag ins, string asm, list<dag> pattern> :
let neverHasSideEffects = 1;
let UseNamedOperandTable = 1;
+ let SchedRW = [WriteVMEM];
}
class FLAT <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> :
@@ -597,5 +606,4 @@ class MIMG <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> :
}
-
} // End Uses = [EXEC]
diff --git a/lib/Target/R600/SIInstructions.td b/lib/Target/R600/SIInstructions.td
index 00ce9bfcc26..5346cf729d0 100644
--- a/lib/Target/R600/SIInstructions.td
+++ b/lib/Target/R600/SIInstructions.td
@@ -1160,6 +1160,8 @@ defm V_MOV_B32 : VOP1Inst <vop1<0x1>, "v_mov_b32", VOP_I32_I32>;
let Uses = [EXEC] in {
+// FIXME: Specify SchedRW for V_READFIRSTLANE_B32
+
def V_READFIRSTLANE_B32 : VOP1 <
0x00000002,
(outs SReg_32:$vdst),
@@ -1170,6 +1172,8 @@ def V_READFIRSTLANE_B32 : VOP1 <
}
+let SchedRW = [WriteConversion] in {
+
defm V_CVT_I32_F64 : VOP1Inst <vop1<0x3>, "v_cvt_i32_f64",
VOP_I32_F64, fp_to_sint
>;
@@ -1223,6 +1227,8 @@ defm V_CVT_F64_U32 : VOP1Inst <vop1<0x16>, "v_cvt_f64_u32",
VOP_F64_I32, uint_to_fp
>;
+} // let SchedRW = [WriteConversion]
+
defm V_FRACT_F32 : VOP1Inst <vop1<0x20>, "v_fract_f32",
VOP_F32_F32, AMDGPUfract
>;
@@ -1241,6 +1247,9 @@ defm V_FLOOR_F32 : VOP1Inst <vop1<0x24>, "v_floor_f32",
defm V_EXP_F32 : VOP1Inst <vop1<0x25>, "v_exp_f32",
VOP_F32_F32, fexp2
>;
+
+let SchedRW = [WriteFloatTrans] in {
+
defm V_LOG_CLAMP_F32 : VOP1Inst <vop1<0x26>, "v_log_clamp_f32", VOP_F32_F32>;
defm V_LOG_F32 : VOP1Inst <vop1<0x27>, "v_log_f32",
VOP_F32_F32, flog2
@@ -1261,6 +1270,11 @@ defm V_RSQ_LEGACY_F32 : VOP1Inst <vop1<0x2d>, "v_rsq_legacy_f32",
defm V_RSQ_F32 : VOP1Inst <vop1<0x2e>, "v_rsq_f32",
VOP_F32_F32, AMDGPUrsq
>;
+
+} //let SchedRW = [WriteFloatTrans]
+
+let SchedRW = [WriteDouble] in {
+
defm V_RCP_F64 : VOP1Inst <vop1<0x2f>, "v_rcp_f64",
VOP_F64_F64, AMDGPUrcp
>;
@@ -1271,12 +1285,21 @@ defm V_RSQ_F64 : VOP1Inst <vop1<0x31>, "v_rsq_f64",
defm V_RSQ_CLAMP_F64 : VOP1Inst <vop1<0x32>, "v_rsq_clamp_f64",
VOP_F64_F64, AMDGPUrsq_clamped
>;
+
+} // let SchedRW = [WriteDouble];
+
defm V_SQRT_F32 : VOP1Inst <vop1<0x33>, "v_sqrt_f32",
VOP_F32_F32, fsqrt
>;
+
+let SchedRW = [WriteDouble] in {
+
defm V_SQRT_F64 : VOP1Inst <vop1<0x34>, "v_sqrt_f64",
VOP_F64_F64, fsqrt
>;
+
+} // let SchedRW = [WriteDouble]
+
defm V_SIN_F32 : VOP1Inst <vop1<0x35>, "v_sin_f32",
VOP_F32_F32, AMDGPUsin
>;
@@ -1303,6 +1326,8 @@ defm V_MOVRELSD_B32 : VOP1Inst <vop1<0x44>, "v_movrelsd_b32", VOP_I32_I32>;
// VINTRP Instructions
//===----------------------------------------------------------------------===//
+// FIXME: Specify SchedRW for VINTRP instructions.
+
def V_INTERP_P1_F32 : VINTRP <
0x00000000,
(outs VReg_32:$dst),
@@ -1337,6 +1362,8 @@ def V_INTERP_MOV_F32 : VINTRP <
// VOP2 Instructions
//===----------------------------------------------------------------------===//
+// FIXME: Specify SchedRW for V_CNDMASK and V_*LANE_B32
+
def V_CNDMASK_B32_e32 : VOP2 <0x00000000, (outs VReg_32:$dst),
(ins VSrc_32:$src0, VReg_32:$src1, VCCReg:$vcc),
"v_cndmask_b32_e32 $dst, $src0, $src1, [$vcc]",
@@ -1405,7 +1432,6 @@ defm V_MUL_U32_U24 : VOP2Inst <vop2<0xb>, "v_mul_u32_u24",
>;
//defm V_MUL_HI_U32_U24 : VOP2_32 <0x0000000c, "v_mul_hi_u32_u24", []>;
-
defm V_MIN_LEGACY_F32 : VOP2Inst <vop2<0xd>, "v_min_legacy_f32",
VOP_F32_F32_F32, AMDGPUfmin_legacy
>;
@@ -1608,10 +1634,15 @@ defm V_SAD_U32 : VOP3Inst <vop3<0x15d>, "v_sad_u32",
defm V_DIV_FIXUP_F32 : VOP3Inst <
vop3<0x15f>, "v_div_fixup_f32", VOP_F32_F32_F32_F32, AMDGPUdiv_fixup
>;
+
+let SchedRW = [WriteDouble] in {
+
defm V_DIV_FIXUP_F64 : VOP3Inst <
vop3<0x160>, "v_div_fixup_f64", VOP_F64_F64_F64_F64, AMDGPUdiv_fixup
>;
+} // let SchedRW = [WriteDouble]
+
defm V_LSHL_B64 : VOP3Inst <vop3<0x161>, "v_lshl_b64",
VOP_I64_I64_I32, shl
>;
@@ -1622,6 +1653,7 @@ defm V_ASHR_I64 : VOP3Inst <vop3<0x163>, "v_ashr_i64",
VOP_I64_I64_I32, sra
>;
+let SchedRW = [WriteDouble] in {
let isCommutable = 1 in {
defm V_ADD_F64 : VOP3Inst <vop3<0x164>, "v_add_f64",
@@ -1644,7 +1676,9 @@ defm V_LDEXP_F64 : VOP3Inst <vop3<0x168>, "v_ldexp_f64",
VOP_F64_F64_I32, AMDGPUldexp
>;
-let isCommutable = 1 in {
+} // let SchedRW = [WriteDouble]
+
+let isCommutable = 1, SchedRW = [WriteIntMUL] in {
defm V_MUL_LO_U32 : VOP3Inst <vop3<0x169>, "v_mul_lo_u32",
VOP_I32_I32_I32
@@ -1659,30 +1693,38 @@ defm V_MUL_HI_I32 : VOP3Inst <vop3<0x16c>, "v_mul_hi_i32",
VOP_I32_I32_I32
>;
-} // isCommutable = 1
+} // isCommutable = 1, SchedRW = [WriteIntMUL]
defm V_DIV_SCALE_F32 : VOP3b_32 <vop3<0x16d>, "v_div_scale_f32", []>;
+let SchedRW = [WriteDouble] in {
// Double precision division pre-scale.
defm V_DIV_SCALE_F64 : VOP3b_64 <vop3<0x16e>, "v_div_scale_f64", []>;
+} // let SchedRW = [WriteDouble]
let isCommutable = 1 in {
defm V_DIV_FMAS_F32 : VOP3Inst <vop3<0x16f>, "v_div_fmas_f32",
VOP_F32_F32_F32_F32, AMDGPUdiv_fmas
>;
+
+let SchedRW = [WriteDouble] in {
defm V_DIV_FMAS_F64 : VOP3Inst <vop3<0x170>, "v_div_fmas_f64",
VOP_F64_F64_F64_F64, AMDGPUdiv_fmas
>;
+} // End SchedRW = [WriteDouble]
} // End isCommutable = 1
//def V_MSAD_U8 : VOP3_U8 <0x00000171, "v_msad_u8", []>;
//def V_QSAD_U8 : VOP3_U8 <0x00000172, "v_qsad_u8", []>;
//def V_MQSAD_U8 : VOP3_U8 <0x00000173, "v_mqsad_u8", []>;
+let SchedRW = [WriteDouble] in {
defm V_TRIG_PREOP_F64 : VOP3Inst <
vop3<0x174>, "v_trig_preop_f64", VOP_F64_F64_I32, AMDGPUtrig_preop
>;
+} // let SchedRW = [WriteDouble]
+
//===----------------------------------------------------------------------===//
// Pseudo Instructions
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/R600/SIRegisterInfo.cpp b/lib/Target/R600/SIRegisterInfo.cpp
index cffea129de9..9bc7f2deb5c 100644
--- a/lib/Target/R600/SIRegisterInfo.cpp
+++ b/lib/Target/R600/SIRegisterInfo.cpp
@@ -48,9 +48,31 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
return Reserved;
}
-unsigned SIRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
- MachineFunction &MF) const {
- return RC->getNumRegs();
+unsigned SIRegisterInfo::getRegPressureSetLimit(unsigned Idx) const {
+
+ unsigned SGPRLimit = getNumSGPRsAllowed(10);
+ unsigned VGPRLimit = getNumVGPRsAllowed(10);
+
+ for (regclass_iterator I = regclass_begin(), E = regclass_end();
+ I != E; ++I) {
+
+ unsigned NumSubRegs = std::max((int)(*I)->getSize() / 4, 1);
+ unsigned Limit;
+
+ if (isSGPRClass(*I)) {
+ Limit = SGPRLimit / NumSubRegs;
+ } else {
+ Limit = VGPRLimit / NumSubRegs;
+ }
+
+ const int *Sets = getRegClassPressureSets(*I);
+ assert(Sets);
+ for (unsigned i = 0; Sets[i] != -1; ++i) {
+ if (Sets[i] == (int)Idx)
+ return Limit;
+ }
+ }
+ return 256;
}
bool SIRegisterInfo::requiresRegisterScavenging(const MachineFunction &Fn) const {
@@ -464,3 +486,29 @@ unsigned SIRegisterInfo::findUnusedVGPR(const MachineRegisterInfo &MRI) const {
return AMDGPU::NoRegister;
}
+unsigned SIRegisterInfo::getNumVGPRsAllowed(unsigned WaveCount) const {
+ switch(WaveCount) {
+ case 10: return 24;
+ case 9: return 28;
+ case 8: return 32;
+ case 7: return 36;
+ case 6: return 40;
+ case 5: return 48;
+ case 4: return 64;
+ case 3: return 84;
+ case 2: return 128;
+ default: return 256;
+ }
+}
+
+unsigned SIRegisterInfo::getNumSGPRsAllowed(unsigned WaveCount) const {
+ switch(WaveCount) {
+ case 10: return 48;
+ case 9: return 56;
+ case 8: return 64;
+ case 7: return 72;
+ case 6: return 80;
+ case 5: return 96;
+ default: return 103;
+ }
+}
diff --git a/lib/Target/R600/SIRegisterInfo.h b/lib/Target/R600/SIRegisterInfo.h
index c7e54dbf0ec..a7c39002a8c 100644
--- a/lib/Target/R600/SIRegisterInfo.h
+++ b/lib/Target/R600/SIRegisterInfo.h
@@ -17,6 +17,7 @@
#define LLVM_LIB_TARGET_R600_SIREGISTERINFO_H
#include "AMDGPURegisterInfo.h"
+#include "llvm/Support/Debug.h"
namespace llvm {
@@ -26,8 +27,7 @@ struct SIRegisterInfo : public AMDGPURegisterInfo {
BitVector getReservedRegs(const MachineFunction &MF) const override;
- unsigned getRegPressureLimit(const TargetRegisterClass *RC,
- MachineFunction &MF) const override;
+ unsigned getRegPressureSetLimit(unsigned Idx) const override;
bool requiresRegisterScavenging(const MachineFunction &Fn) const override;
@@ -114,6 +114,14 @@ struct SIRegisterInfo : public AMDGPURegisterInfo {
enum PreloadedValue Value) const;
unsigned findUnusedVGPR(const MachineRegisterInfo &MRI) const;
+
+ /// \brief Give the maximum number of VGPRs that can be used by \p WaveCount
+ /// concurrent waves.
+ unsigned getNumVGPRsAllowed(unsigned WaveCount) const;
+
+ /// \brief Give the maximum number of SGPRs that can be used by \p WaveCount
+ /// concurrent waves.
+ unsigned getNumSGPRsAllowed(unsigned WaveCount) const;
};
} // End namespace llvm
diff --git a/lib/Target/R600/SISchedule.td b/lib/Target/R600/SISchedule.td
index 28b65b82585..5a1ae297cdc 100644
--- a/lib/Target/R600/SISchedule.td
+++ b/lib/Target/R600/SISchedule.td
@@ -7,9 +7,81 @@
//
//===----------------------------------------------------------------------===//
//
-// TODO: This is just a place holder for now.
+// MachineModel definitions for Southern Islands (SI)
//
//===----------------------------------------------------------------------===//
-
def SI_Itin : ProcessorItineraries <[], [], []>;
+
+
+def WriteBranch : SchedWrite;
+def WriteExport : SchedWrite;
+def WriteLDS : SchedWrite;
+def WriteSALU : SchedWrite;
+def WriteSMEM : SchedWrite;
+def WriteVMEM : SchedWrite;
+
+// Vector ALU instructions
+def Write32Bit : SchedWrite;
+def WriteIntMUL : SchedWrite;
+
+def WriteConversion : SchedWrite;
+
+def WriteFloatFMA : SchedWrite;
+def WriteFloatTrans : SchedWrite;
+
+def WriteDouble : SchedWrite;
+def WriteDoubleAdd : SchedWrite;
+
+def SIFullSpeedModel : SchedMachineModel;
+
+// BufferSize = 0 means the processors are in-order.
+let BufferSize = 0 in {
+
+// XXX: Are the resource counts correct?
+def HWBranch : ProcResource<1>;
+def HWExport : ProcResource<7>; // Taken from S_WAITCNT
+def HWLGKM : ProcResource<31>; // Taken from S_WAITCNT
+def HWSALU : ProcResource<1>;
+def HWVMEM : ProcResource<15>; // Taken from S_WAITCNT
+def HWVALU : ProcResource<1>;
+
+}
+
+let SchedModel = SIFullSpeedModel in {
+
+class HWWriteRes<SchedWrite write, list<ProcResourceKind> resources,
+ int latency> : WriteRes<write, resources> {
+ let Latency = latency;
+}
+
+class HWVALUWriteRes<SchedWrite write, int latency> :
+ HWWriteRes<write, [HWVALU], latency>;
+
+// The latency numbers are taken from AMD Accelerated Parallel Processing
+// guide. They may not be accurate.
+
+def : HWWriteRes<WriteBranch, [HWBranch], 100>; // XXX: Guessed ???
+def : HWWriteRes<WriteExport, [HWExport], 100>; // XXX: Guessed ???
+def : HWWriteRes<WriteLDS, [HWLGKM], 32>; // 2 - 64
+def : HWWriteRes<WriteSALU, [HWSALU], 1>;
+def : HWWriteRes<WriteSMEM, [HWLGKM], 10>; // XXX: Guessed ???
+def : HWWriteRes<WriteVMEM, [HWVMEM], 450>; // 300 - 600
+
+// XXX: These definitions assume full double-precision speed; some devices are
+// slower. These are also taken from the AMD Accelerated Parallel Processing
+// guide and may not be accurate.
+
+// The latency values are 1 / (operations / cycle) / 4.
+def : HWVALUWriteRes<Write32Bit, 1>;
+def : HWVALUWriteRes<WriteIntMUL, 4>;
+
+def : HWVALUWriteRes<WriteConversion, 4>;
+
+def : HWVALUWriteRes<WriteFloatFMA, 1>; // 16 For single speed
+def : HWVALUWriteRes<WriteFloatTrans, 4>;
+
+def : HWVALUWriteRes<WriteDouble, 4>; // 16 for single speed
+def : HWVALUWriteRes<WriteDoubleAdd, 2>; // 8 for single speed
+
+} // End SchedModel = SIFullSpeedModel