From bcca0271b17417d7226f7c31492b4167326d91b8 Mon Sep 17 00:00:00 2001 From: Tom Stellard Date: Fri, 19 Jul 2013 11:50:00 -0700 Subject: R600/SI: Define a schedule model and enable the generic machine scheduler The schedule model is not complete yet, and could be improved. --- lib/Target/R600/AMDGPUSubtarget.cpp | 12 +++++- lib/Target/R600/AMDGPUSubtarget.h | 6 ++- lib/Target/R600/Processors.td | 24 ++++++------ lib/Target/R600/SIInstrFormats.td | 10 ++++- lib/Target/R600/SIInstructions.td | 48 +++++++++++++++++++++-- lib/Target/R600/SIRegisterInfo.cpp | 54 ++++++++++++++++++++++++-- lib/Target/R600/SIRegisterInfo.h | 12 +++++- lib/Target/R600/SISchedule.td | 76 ++++++++++++++++++++++++++++++++++++- 8 files changed, 216 insertions(+), 26 deletions(-) diff --git a/lib/Target/R600/AMDGPUSubtarget.cpp b/lib/Target/R600/AMDGPUSubtarget.cpp index 9d09a196370..e4674baecd7 100644 --- a/lib/Target/R600/AMDGPUSubtarget.cpp +++ b/lib/Target/R600/AMDGPUSubtarget.cpp @@ -19,8 +19,7 @@ #include "SIInstrInfo.h" #include "SIISelLowering.h" #include "llvm/ADT/SmallString.h" - -#include "llvm/ADT/SmallString.h" +#include "llvm/CodeGen/MachineScheduler.h" using namespace llvm; @@ -107,3 +106,12 @@ unsigned AMDGPUSubtarget::getStackEntrySize() const { llvm_unreachable("Illegal wavefront size."); } } + +void AMDGPUSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy, + MachineInstr *begin, + MachineInstr *end, + unsigned NumRegionInstrs) const { + if (getGeneration() >= SOUTHERN_ISLANDS) { + Policy.OnlyTopDown = true; + } +} diff --git a/lib/Target/R600/AMDGPUSubtarget.h b/lib/Target/R600/AMDGPUSubtarget.h index 55a0c586d72..2430cd910c3 100644 --- a/lib/Target/R600/AMDGPUSubtarget.h +++ b/lib/Target/R600/AMDGPUSubtarget.h @@ -201,9 +201,13 @@ public: } bool enableMachineScheduler() const override { - return getGeneration() <= NORTHERN_ISLANDS; + return true; } + void overrideSchedPolicy(MachineSchedPolicy &Policy, + MachineInstr *begin, MachineInstr *end, + unsigned 
NumRegionInstrs) const override; + // Helper functions to simplify if statements bool isTargetELF() const { return false; diff --git a/lib/Target/R600/Processors.td b/lib/Target/R600/Processors.td index ce17d7cb7f1..17422f93ab8 100644 --- a/lib/Target/R600/Processors.td +++ b/lib/Target/R600/Processors.td @@ -83,28 +83,30 @@ def : Proc<"cayman", R600_VLIW4_Itin, // Southern Islands //===----------------------------------------------------------------------===// -def : Proc<"SI", SI_Itin, [FeatureSouthernIslands]>; +// FIXME: Which of these should use the half speed? -def : Proc<"tahiti", SI_Itin, [FeatureSouthernIslands]>; +def : ProcessorModel<"SI", SIFullSpeedModel, [FeatureSouthernIslands]>; -def : Proc<"pitcairn", SI_Itin, [FeatureSouthernIslands]>; +def : ProcessorModel<"tahiti", SIFullSpeedModel, [FeatureSouthernIslands]>; -def : Proc<"verde", SI_Itin, [FeatureSouthernIslands]>; +def : ProcessorModel<"pitcairn", SIFullSpeedModel, [FeatureSouthernIslands]>; -def : Proc<"oland", SI_Itin, [FeatureSouthernIslands]>; +def : ProcessorModel<"verde", SIFullSpeedModel, [FeatureSouthernIslands]>; -def : Proc<"hainan", SI_Itin, [FeatureSouthernIslands]>; +def : ProcessorModel<"oland", SIFullSpeedModel, [FeatureSouthernIslands]>; + +def : ProcessorModel<"hainan", SIFullSpeedModel, [FeatureSouthernIslands]>; //===----------------------------------------------------------------------===// // Sea Islands //===----------------------------------------------------------------------===// -def : Proc<"bonaire", SI_Itin, [FeatureSeaIslands]>; +def : ProcessorModel<"bonaire", SIFullSpeedModel, [FeatureSeaIslands]>; -def : Proc<"kabini", SI_Itin, [FeatureSeaIslands]>; +def : ProcessorModel<"kabini", SIFullSpeedModel, [FeatureSeaIslands]>; -def : Proc<"kaveri", SI_Itin, [FeatureSeaIslands]>; +def : ProcessorModel<"kaveri", SIFullSpeedModel, [FeatureSeaIslands]>; -def : Proc<"hawaii", SI_Itin, [FeatureSeaIslands]>; +def : ProcessorModel<"hawaii", SIFullSpeedModel, 
[FeatureSeaIslands]>; -def : Proc<"mullins", SI_Itin, [FeatureSeaIslands]>; +def : ProcessorModel<"mullins", SIFullSpeedModel, [FeatureSeaIslands]>; diff --git a/lib/Target/R600/SIInstrFormats.td b/lib/Target/R600/SIInstrFormats.td index d524bafc53c..84660581c89 100644 --- a/lib/Target/R600/SIInstrFormats.td +++ b/lib/Target/R600/SIInstrFormats.td @@ -46,6 +46,7 @@ class InstSI pattern> : // Most instructions require adjustments after selection to satisfy // operand requirements. let hasPostISelHook = 1; + let SchedRW = [Write32Bit]; } class Enc32 { @@ -161,6 +162,8 @@ class SMRDe op, bits<1> imm> : Enc32 { let Inst{31-27} = 0x18; //encoding } +let SchedRW = [WriteSALU] in { + class SOP1 op, dag outs, dag ins, string asm, list pattern> : InstSI, SOP1e { @@ -215,6 +218,8 @@ class SOPP op, dag ins, string asm, list pattern> : let UseNamedOperandTable = 1; } +} // let SchedRW = [WriteSALU] + class SMRD pattern> : InstSI { @@ -223,6 +228,7 @@ class SMRD pattern> : let mayStore = 0; let mayLoad = 1; let UseNamedOperandTable = 1; + let SchedRW = [WriteSMEM]; } //===----------------------------------------------------------------------===// @@ -546,6 +552,7 @@ class DS op, dag outs, dag ins, string asm, list pattern> : let LGKM_CNT = 1; let UseNamedOperandTable = 1; let DisableEncoding = "$m0"; + let SchedRW = [WriteLDS]; } class MUBUF op, dag outs, dag ins, string asm, list pattern> : @@ -557,6 +564,7 @@ class MUBUF op, dag outs, dag ins, string asm, list pattern> : let neverHasSideEffects = 1; let UseNamedOperandTable = 1; + let SchedRW = [WriteVMEM]; } class MTBUF pattern> : @@ -568,6 +576,7 @@ class MTBUF pattern> : let neverHasSideEffects = 1; let UseNamedOperandTable = 1; + let SchedRW = [WriteVMEM]; } class FLAT op, dag outs, dag ins, string asm, list pattern> : @@ -594,5 +603,4 @@ class MIMG op, dag outs, dag ins, string asm, list pattern> : } - } // End Uses = [EXEC] diff --git a/lib/Target/R600/SIInstructions.td b/lib/Target/R600/SIInstructions.td index 
3f1f4f193f3..9030032316e 100644 --- a/lib/Target/R600/SIInstructions.td +++ b/lib/Target/R600/SIInstructions.td @@ -1167,6 +1167,8 @@ defm V_MOV_B32 : VOP1Inst , "v_mov_b32", VOP_I32_I32>; let Uses = [EXEC] in { +// FIXME: Specify SchedRW for READFIRSTLANE_B32 + def V_READFIRSTLANE_B32 : VOP1 < 0x00000002, (outs SReg_32:$vdst), @@ -1177,6 +1179,8 @@ def V_READFIRSTLANE_B32 : VOP1 < } +let SchedRW = [WriteConversion] in { + defm V_CVT_I32_F64 : VOP1Inst , "v_cvt_i32_f64", VOP_I32_F64, fp_to_sint >; @@ -1230,6 +1234,8 @@ defm V_CVT_F64_U32 : VOP1Inst , "v_cvt_f64_u32", VOP_F64_I32, uint_to_fp >; +} // let SchedRW = [WriteConversion] + defm V_FRACT_F32 : VOP1Inst , "v_fract_f32", VOP_F32_F32, AMDGPUfract >; @@ -1248,6 +1254,9 @@ defm V_FLOOR_F32 : VOP1Inst , "v_floor_f32", defm V_EXP_F32 : VOP1Inst , "v_exp_f32", VOP_F32_F32, fexp2 >; + +let SchedRW = [WriteFloatTrans] in { + defm V_LOG_CLAMP_F32 : VOP1Inst , "v_log_clamp_f32", VOP_F32_F32>; defm V_LOG_F32 : VOP1Inst , "v_log_f32", VOP_F32_F32, flog2 @@ -1268,6 +1277,11 @@ defm V_RSQ_LEGACY_F32 : VOP1Inst , "v_rsq_legacy_f32", defm V_RSQ_F32 : VOP1Inst , "v_rsq_f32", VOP_F32_F32, AMDGPUrsq >; + +} //let SchedRW = [WriteFloatTrans] + +let SchedRW = [WriteDouble] in { + defm V_RCP_F64 : VOP1Inst , "v_rcp_f64", VOP_F64_F64, AMDGPUrcp >; @@ -1278,12 +1292,21 @@ defm V_RSQ_F64 : VOP1Inst , "v_rsq_f64", defm V_RSQ_CLAMP_F64 : VOP1Inst , "v_rsq_clamp_f64", VOP_F64_F64, AMDGPUrsq_clamped >; + +} // let SchedRW = [WriteDouble]; + defm V_SQRT_F32 : VOP1Inst , "v_sqrt_f32", VOP_F32_F32, fsqrt >; + +let SchedRW = [WriteDouble] in { + defm V_SQRT_F64 : VOP1Inst , "v_sqrt_f64", VOP_F64_F64, fsqrt >; + +} // let SchedRW = [WriteDouble] + defm V_SIN_F32 : VOP1Inst , "v_sin_f32", VOP_F32_F32, AMDGPUsin >; @@ -1310,6 +1333,8 @@ defm V_MOVRELSD_B32 : VOP1Inst , "v_movrelsd_b32", VOP_I32_I32>; // VINTRP Instructions //===----------------------------------------------------------------------===// +// FIXME: Specify SchedRW for VINTRP 
instructions. + def V_INTERP_P1_F32 : VINTRP < 0x00000000, (outs VReg_32:$dst), @@ -1344,6 +1369,8 @@ def V_INTERP_MOV_F32 : VINTRP < // VOP2 Instructions //===----------------------------------------------------------------------===// +// FIXME: Specify SchedRW for V_CNDMASK and V_*LANE_B32 + def V_CNDMASK_B32_e32 : VOP2 <0x00000000, (outs VReg_32:$dst), (ins VSrc_32:$src0, VReg_32:$src1, VCCReg:$vcc), "v_cndmask_b32_e32 $dst, $src0, $src1, [$vcc]", @@ -1403,7 +1430,6 @@ defm V_MUL_F32 : VOP2Inst , "v_mul_f32", VOP_F32_F32_F32, fmul >; - defm V_MUL_I32_I24 : VOP2Inst , "v_mul_i32_i24", VOP_I32_I32_I32, AMDGPUmul_i24 >; @@ -1413,7 +1439,6 @@ defm V_MUL_U32_U24 : VOP2Inst , "v_mul_u32_u24", >; //defm V_MUL_HI_U32_U24 : VOP2_32 <0x0000000c, "v_mul_hi_u32_u24", []>; - defm V_MIN_LEGACY_F32 : VOP2Inst , "v_min_legacy_f32", VOP_F32_F32_F32, AMDGPUfmin >; @@ -1586,10 +1611,15 @@ defm V_SAD_U32 : VOP3Inst , "v_sad_u32", defm V_DIV_FIXUP_F32 : VOP3Inst < vop3<0x15f>, "v_div_fixup_f32", VOP_F32_F32_F32_F32, AMDGPUdiv_fixup >; + +let SchedRW = [WriteDouble] in { + defm V_DIV_FIXUP_F64 : VOP3Inst < vop3<0x160>, "v_div_fixup_f64", VOP_F64_F64_F64_F64, AMDGPUdiv_fixup >; +} // let SchedRW = [WriteDouble] + defm V_LSHL_B64 : VOP3Inst , "v_lshl_b64", VOP_I64_I64_I32, shl >; @@ -1600,6 +1630,7 @@ defm V_ASHR_I64 : VOP3Inst , "v_ashr_i64", VOP_I64_I64_I32, sra >; +let SchedRW = [WriteDouble] in { let isCommutable = 1 in { defm V_ADD_F64 : VOP3Inst , "v_add_f64", @@ -1622,7 +1653,9 @@ defm V_LDEXP_F64 : VOP3Inst , "v_ldexp_f64", VOP_F64_F64_I32, AMDGPUldexp >; -let isCommutable = 1 in { +} // let SchedRW = [WriteDouble] + +let isCommutable = 1, SchedRW = [WriteIntMUL] in { defm V_MUL_LO_U32 : VOP3Inst , "v_mul_lo_u32", VOP_I32_I32_I32 @@ -1637,19 +1670,24 @@ defm V_MUL_HI_I32 : VOP3Inst , "v_mul_hi_i32", VOP_I32_I32_I32 >; -} // isCommutable = 1 +} // isCommutable = 1, SchedRW = [WriteIntMUL] defm V_DIV_SCALE_F32 : VOP3b_32 , "v_div_scale_f32", []>; +let SchedRW = [WriteDouble] in { 
// Double precision division pre-scale. defm V_DIV_SCALE_F64 : VOP3b_64 , "v_div_scale_f64", []>; +} // let SchedRW = [WriteDouble] defm V_DIV_FMAS_F32 : VOP3Inst , "v_div_fmas_f32", VOP_F32_F32_F32_F32, AMDGPUdiv_fmas >; + +let SchedRW = [WriteDouble] in { defm V_DIV_FMAS_F64 : VOP3Inst , "v_div_fmas_f64", VOP_F64_F64_F64_F64, AMDGPUdiv_fmas >; + //def V_MSAD_U8 : VOP3_U8 <0x00000171, "v_msad_u8", []>; //def V_QSAD_U8 : VOP3_U8 <0x00000172, "v_qsad_u8", []>; //def V_MQSAD_U8 : VOP3_U8 <0x00000173, "v_mqsad_u8", []>; @@ -1657,6 +1695,8 @@ defm V_TRIG_PREOP_F64 : VOP3Inst < vop3<0x174>, "v_trig_preop_f64", VOP_F64_F64_I32, AMDGPUtrig_preop >; +} // let SchedRW = [WriteDouble] + //===----------------------------------------------------------------------===// // Pseudo Instructions //===----------------------------------------------------------------------===// diff --git a/lib/Target/R600/SIRegisterInfo.cpp b/lib/Target/R600/SIRegisterInfo.cpp index 6f79a97c3dc..1f6c1b3e489 100644 --- a/lib/Target/R600/SIRegisterInfo.cpp +++ b/lib/Target/R600/SIRegisterInfo.cpp @@ -48,9 +48,31 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const { return Reserved; } -unsigned SIRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC, - MachineFunction &MF) const { - return RC->getNumRegs(); +unsigned SIRegisterInfo::getRegPressureSetLimit(unsigned Idx) const { + + unsigned SGPRLimit = getNumSGPRsAllowed(10); + unsigned VGPRLimit = getNumVGPRsAllowed(10); + + for (regclass_iterator I = regclass_begin(), E = regclass_end(); + I != E; ++I) { + + unsigned NumSubRegs = std::max((int)(*I)->getSize() / 4, 1); + unsigned Limit; + + if (isSGPRClass(*I)) { + Limit = SGPRLimit / NumSubRegs; + } else { + Limit = VGPRLimit / NumSubRegs; + } + + const int *Sets = getRegClassPressureSets(*I); + assert(Sets); + for (unsigned i = 0; Sets[i] != -1; ++i) { + if (Sets[i] == (int)Idx) + return Limit; + } + } + return 256; } bool 
SIRegisterInfo::requiresRegisterScavenging(const MachineFunction &Fn) const { @@ -464,3 +486,29 @@ unsigned SIRegisterInfo::findUnusedVGPR(const MachineRegisterInfo &MRI) const { return AMDGPU::NoRegister; } +unsigned SIRegisterInfo::getNumVGPRsAllowed(unsigned WaveCount) const { + switch(WaveCount) { + case 10: return 24; + case 9: return 28; + case 8: return 32; + case 7: return 36; + case 6: return 40; + case 5: return 48; + case 4: return 64; + case 3: return 84; + case 2: return 128; + default: return 256; + } +} + +unsigned SIRegisterInfo::getNumSGPRsAllowed(unsigned WaveCount) const { + switch(WaveCount) { + case 10: return 48; + case 9: return 56; + case 8: return 64; + case 7: return 72; + case 6: return 80; + case 5: return 96; + default: return 103; + } +} diff --git a/lib/Target/R600/SIRegisterInfo.h b/lib/Target/R600/SIRegisterInfo.h index c7e54dbf0ec..a7c39002a8c 100644 --- a/lib/Target/R600/SIRegisterInfo.h +++ b/lib/Target/R600/SIRegisterInfo.h @@ -17,6 +17,7 @@ #define LLVM_LIB_TARGET_R600_SIREGISTERINFO_H #include "AMDGPURegisterInfo.h" +#include "llvm/Support/Debug.h" namespace llvm { @@ -26,8 +27,7 @@ struct SIRegisterInfo : public AMDGPURegisterInfo { BitVector getReservedRegs(const MachineFunction &MF) const override; - unsigned getRegPressureLimit(const TargetRegisterClass *RC, - MachineFunction &MF) const override; + unsigned getRegPressureSetLimit(unsigned Idx) const override; bool requiresRegisterScavenging(const MachineFunction &Fn) const override; @@ -114,6 +114,14 @@ struct SIRegisterInfo : public AMDGPURegisterInfo { enum PreloadedValue Value) const; unsigned findUnusedVGPR(const MachineRegisterInfo &MRI) const; + + /// \brief Give the maximum number of VGPRs that can be used by \p WaveCount + /// concurrent waves. + unsigned getNumVGPRsAllowed(unsigned WaveCount) const; + + /// \brief Give the maximum number of SGPRs that can be used by \p WaveCount + /// concurrent waves. 
+ unsigned getNumSGPRsAllowed(unsigned WaveCount) const; }; } // End namespace llvm diff --git a/lib/Target/R600/SISchedule.td b/lib/Target/R600/SISchedule.td index 28b65b82585..5a1ae297cdc 100644 --- a/lib/Target/R600/SISchedule.td +++ b/lib/Target/R600/SISchedule.td @@ -7,9 +7,81 @@ // //===----------------------------------------------------------------------===// // -// TODO: This is just a place holder for now. +// MachineModel definitions for Southern Islands (SI) // //===----------------------------------------------------------------------===// - def SI_Itin : ProcessorItineraries <[], [], []>; + + +def WriteBranch : SchedWrite; +def WriteExport : SchedWrite; +def WriteLDS : SchedWrite; +def WriteSALU : SchedWrite; +def WriteSMEM : SchedWrite; +def WriteVMEM : SchedWrite; + +// Vector ALU instructions +def Write32Bit : SchedWrite; +def WriteIntMUL : SchedWrite; + +def WriteConversion : SchedWrite; + +def WriteFloatFMA : SchedWrite; +def WriteFloatTrans : SchedWrite; + +def WriteDouble : SchedWrite; +def WriteDoubleAdd : SchedWrite; + +def SIFullSpeedModel : SchedMachineModel; + +// BufferSize = 0 means the processors are in-order. +let BufferSize = 0 in { + +// XXX: Are the resource counts correct? +def HWBranch : ProcResource<1>; +def HWExport : ProcResource<7>; // Taken from S_WAITCNT +def HWLGKM : ProcResource<31>; // Taken from S_WAITCNT +def HWSALU : ProcResource<1>; +def HWVMEM : ProcResource<15>; // Taken from S_WAITCNT +def HWVALU : ProcResource<1>; + +} + +let SchedModel = SIFullSpeedModel in { + +class HWWriteRes resources, + int latency> : WriteRes { + let Latency = latency; +} + +class HWVALUWriteRes : + HWWriteRes; + +// The latency numbers are taken from AMD Accelerated Parallel Processing +// guide. They may not be accurate. + +def : HWWriteRes; // XXX: Guessed ??? +def : HWWriteRes; // XXX: Guessed ??? +def : HWWriteRes; // 2 - 64 +def : HWWriteRes; +def : HWWriteRes; // XXX: Guessed ??? 
+def : HWWriteRes; // 300 - 600 + +// XXX: These definitions assume full double-precision speed, some devices are +// slower. These are also taken from the AMD Accelerated Parallel Processing +// guide and may not be accurate. + +// The latency values are 1 / (operations / cycle) / 4. +def : HWVALUWriteRes; +def : HWVALUWriteRes; + +def : HWVALUWriteRes; + +def : HWVALUWriteRes; // 16 For single speed +def : HWVALUWriteRes; + +def : HWVALUWriteRes; // 16 for single speed +def : HWVALUWriteRes; // 8 for single speed + +} // End SchedModel = SIFullSpeedModel -- cgit v1.2.3