diff options
-rw-r--r-- | lib/Target/R600/AMDGPUSubtarget.cpp | 12 | ||||
-rw-r--r-- | lib/Target/R600/AMDGPUSubtarget.h | 6 | ||||
-rw-r--r-- | lib/Target/R600/Processors.td | 24 | ||||
-rw-r--r-- | lib/Target/R600/SIInstrFormats.td | 10 | ||||
-rw-r--r-- | lib/Target/R600/SIInstructions.td | 48 | ||||
-rw-r--r-- | lib/Target/R600/SIRegisterInfo.cpp | 54 | ||||
-rw-r--r-- | lib/Target/R600/SIRegisterInfo.h | 12 | ||||
-rw-r--r-- | lib/Target/R600/SISchedule.td | 76 |
8 files changed, 217 insertions, 25 deletions
diff --git a/lib/Target/R600/AMDGPUSubtarget.cpp b/lib/Target/R600/AMDGPUSubtarget.cpp index 9d09a196370..e4674baecd7 100644 --- a/lib/Target/R600/AMDGPUSubtarget.cpp +++ b/lib/Target/R600/AMDGPUSubtarget.cpp @@ -19,8 +19,7 @@ #include "SIInstrInfo.h" #include "SIISelLowering.h" #include "llvm/ADT/SmallString.h" - -#include "llvm/ADT/SmallString.h" +#include "llvm/CodeGen/MachineScheduler.h" using namespace llvm; @@ -107,3 +106,12 @@ unsigned AMDGPUSubtarget::getStackEntrySize() const { llvm_unreachable("Illegal wavefront size."); } } + +void AMDGPUSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy, + MachineInstr *begin, + MachineInstr *end, + unsigned NumRegionInstrs) const { + if (getGeneration() >= SOUTHERN_ISLANDS) { + Policy.OnlyTopDown = true; + } +} diff --git a/lib/Target/R600/AMDGPUSubtarget.h b/lib/Target/R600/AMDGPUSubtarget.h index f71d80a8ba0..3e44c6685d7 100644 --- a/lib/Target/R600/AMDGPUSubtarget.h +++ b/lib/Target/R600/AMDGPUSubtarget.h @@ -199,9 +199,13 @@ public: } bool enableMachineScheduler() const override { - return getGeneration() <= NORTHERN_ISLANDS; + return true; } + void overrideSchedPolicy(MachineSchedPolicy &Policy, + MachineInstr *begin, MachineInstr *end, + unsigned NumRegionInstrs) const override; + // Helper functions to simplify if statements bool isTargetELF() const { return false; diff --git a/lib/Target/R600/Processors.td b/lib/Target/R600/Processors.td index ce17d7cb7f1..17422f93ab8 100644 --- a/lib/Target/R600/Processors.td +++ b/lib/Target/R600/Processors.td @@ -83,28 +83,30 @@ def : Proc<"cayman", R600_VLIW4_Itin, // Southern Islands //===----------------------------------------------------------------------===// -def : Proc<"SI", SI_Itin, [FeatureSouthernIslands]>; +// FIXME: Which of these should use the half speed? 
-def : Proc<"tahiti", SI_Itin, [FeatureSouthernIslands]>; +def : ProcessorModel<"SI", SIFullSpeedModel, [FeatureSouthernIslands]>; -def : Proc<"pitcairn", SI_Itin, [FeatureSouthernIslands]>; +def : ProcessorModel<"tahiti", SIFullSpeedModel, [FeatureSouthernIslands]>; -def : Proc<"verde", SI_Itin, [FeatureSouthernIslands]>; +def : ProcessorModel<"pitcairn", SIFullSpeedModel, [FeatureSouthernIslands]>; -def : Proc<"oland", SI_Itin, [FeatureSouthernIslands]>; +def : ProcessorModel<"verde", SIFullSpeedModel, [FeatureSouthernIslands]>; -def : Proc<"hainan", SI_Itin, [FeatureSouthernIslands]>; +def : ProcessorModel<"oland", SIFullSpeedModel, [FeatureSouthernIslands]>; + +def : ProcessorModel<"hainan", SIFullSpeedModel, [FeatureSouthernIslands]>; //===----------------------------------------------------------------------===// // Sea Islands //===----------------------------------------------------------------------===// -def : Proc<"bonaire", SI_Itin, [FeatureSeaIslands]>; +def : ProcessorModel<"bonaire", SIFullSpeedModel, [FeatureSeaIslands]>; -def : Proc<"kabini", SI_Itin, [FeatureSeaIslands]>; +def : ProcessorModel<"kabini", SIFullSpeedModel, [FeatureSeaIslands]>; -def : Proc<"kaveri", SI_Itin, [FeatureSeaIslands]>; +def : ProcessorModel<"kaveri", SIFullSpeedModel, [FeatureSeaIslands]>; -def : Proc<"hawaii", SI_Itin, [FeatureSeaIslands]>; +def : ProcessorModel<"hawaii", SIFullSpeedModel, [FeatureSeaIslands]>; -def : Proc<"mullins", SI_Itin, [FeatureSeaIslands]>; +def : ProcessorModel<"mullins", SIFullSpeedModel, [FeatureSeaIslands]>; diff --git a/lib/Target/R600/SIInstrFormats.td b/lib/Target/R600/SIInstrFormats.td index ee1a52b2f8f..4b688e0a97b 100644 --- a/lib/Target/R600/SIInstrFormats.td +++ b/lib/Target/R600/SIInstrFormats.td @@ -46,6 +46,7 @@ class InstSI <dag outs, dag ins, string asm, list<dag> pattern> : // Most instructions require adjustments after selection to satisfy // operand requirements. 
let hasPostISelHook = 1; + let SchedRW = [Write32Bit]; } class Enc32 { @@ -161,6 +162,8 @@ class SMRDe <bits<5> op, bits<1> imm> : Enc32 { let Inst{31-27} = 0x18; //encoding } +let SchedRW = [WriteSALU] in { + class SOP1 <bits<8> op, dag outs, dag ins, string asm, list<dag> pattern> : InstSI<outs, ins, asm, pattern>, SOP1e <op> { @@ -216,6 +219,8 @@ class SOPP <bits<7> op, dag ins, string asm, list<dag> pattern = []> : let UseNamedOperandTable = 1; } +} // let SchedRW = [WriteSALU] + class SMRD <dag outs, dag ins, string asm, list<dag> pattern> : InstSI<outs, ins, asm, pattern> { @@ -225,6 +230,7 @@ class SMRD <dag outs, dag ins, string asm, list<dag> pattern> : let mayLoad = 1; let hasSideEffects = 0; let UseNamedOperandTable = 1; + let SchedRW = [WriteSMEM]; } //===----------------------------------------------------------------------===// @@ -547,6 +553,7 @@ class DS <bits<8> op, dag outs, dag ins, string asm, list<dag> pattern> : let LGKM_CNT = 1; let UseNamedOperandTable = 1; let DisableEncoding = "$m0"; + let SchedRW = [WriteLDS]; } class MUBUF <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> : @@ -558,6 +565,7 @@ class MUBUF <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> : let hasSideEffects = 0; let UseNamedOperandTable = 1; + let SchedRW = [WriteVMEM]; } class MTBUF <dag outs, dag ins, string asm, list<dag> pattern> : @@ -569,6 +577,7 @@ class MTBUF <dag outs, dag ins, string asm, list<dag> pattern> : let neverHasSideEffects = 1; let UseNamedOperandTable = 1; + let SchedRW = [WriteVMEM]; } class FLAT <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> : @@ -597,5 +606,4 @@ class MIMG <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> : } - } // End Uses = [EXEC] diff --git a/lib/Target/R600/SIInstructions.td b/lib/Target/R600/SIInstructions.td index 00ce9bfcc26..5346cf729d0 100644 --- a/lib/Target/R600/SIInstructions.td +++ b/lib/Target/R600/SIInstructions.td @@ -1160,6 +1160,8 @@ defm V_MOV_B32 : 
VOP1Inst <vop1<0x1>, "v_mov_b32", VOP_I32_I32>; let Uses = [EXEC] in { +// FIXME: Specify SchedRW for READFIRSTLANE+B32 + def V_READFIRSTLANE_B32 : VOP1 < 0x00000002, (outs SReg_32:$vdst), @@ -1170,6 +1172,8 @@ def V_READFIRSTLANE_B32 : VOP1 < } +let SchedRW = [WriteConversion] in { + defm V_CVT_I32_F64 : VOP1Inst <vop1<0x3>, "v_cvt_i32_f64", VOP_I32_F64, fp_to_sint >; @@ -1223,6 +1227,8 @@ defm V_CVT_F64_U32 : VOP1Inst <vop1<0x16>, "v_cvt_f64_u32", VOP_F64_I32, uint_to_fp >; +} // let SchedRW = [WriteConversion] + defm V_FRACT_F32 : VOP1Inst <vop1<0x20>, "v_fract_f32", VOP_F32_F32, AMDGPUfract >; @@ -1241,6 +1247,9 @@ defm V_FLOOR_F32 : VOP1Inst <vop1<0x24>, "v_floor_f32", defm V_EXP_F32 : VOP1Inst <vop1<0x25>, "v_exp_f32", VOP_F32_F32, fexp2 >; + +let SchedRW = [WriteFloatTrans] in { + defm V_LOG_CLAMP_F32 : VOP1Inst <vop1<0x26>, "v_log_clamp_f32", VOP_F32_F32>; defm V_LOG_F32 : VOP1Inst <vop1<0x27>, "v_log_f32", VOP_F32_F32, flog2 @@ -1261,6 +1270,11 @@ defm V_RSQ_LEGACY_F32 : VOP1Inst <vop1<0x2d>, "v_rsq_legacy_f32", defm V_RSQ_F32 : VOP1Inst <vop1<0x2e>, "v_rsq_f32", VOP_F32_F32, AMDGPUrsq >; + +} //let SchedRW = [WriteFloatTrans] + +let SchedRW = [WriteDouble] in { + defm V_RCP_F64 : VOP1Inst <vop1<0x2f>, "v_rcp_f64", VOP_F64_F64, AMDGPUrcp >; @@ -1271,12 +1285,21 @@ defm V_RSQ_F64 : VOP1Inst <vop1<0x31>, "v_rsq_f64", defm V_RSQ_CLAMP_F64 : VOP1Inst <vop1<0x32>, "v_rsq_clamp_f64", VOP_F64_F64, AMDGPUrsq_clamped >; + +} // let SchedRW = [WriteDouble]; + defm V_SQRT_F32 : VOP1Inst <vop1<0x33>, "v_sqrt_f32", VOP_F32_F32, fsqrt >; + +let SchedRW = [WriteDouble] in { + defm V_SQRT_F64 : VOP1Inst <vop1<0x34>, "v_sqrt_f64", VOP_F64_F64, fsqrt >; + +} // let SchedRW = [WriteDouble] + defm V_SIN_F32 : VOP1Inst <vop1<0x35>, "v_sin_f32", VOP_F32_F32, AMDGPUsin >; @@ -1303,6 +1326,8 @@ defm V_MOVRELSD_B32 : VOP1Inst <vop1<0x44>, "v_movrelsd_b32", VOP_I32_I32>; // VINTRP Instructions //===----------------------------------------------------------------------===// +// 
FIXME: Specify SchedRW for VINTRP instructions. + def V_INTERP_P1_F32 : VINTRP < 0x00000000, (outs VReg_32:$dst), @@ -1337,6 +1362,8 @@ def V_INTERP_MOV_F32 : VINTRP < // VOP2 Instructions //===----------------------------------------------------------------------===// +// FIXME: Specify SchedRW for V_CNDMASK and V_*LANE_B32 + def V_CNDMASK_B32_e32 : VOP2 <0x00000000, (outs VReg_32:$dst), (ins VSrc_32:$src0, VReg_32:$src1, VCCReg:$vcc), "v_cndmask_b32_e32 $dst, $src0, $src1, [$vcc]", @@ -1405,7 +1432,6 @@ defm V_MUL_U32_U24 : VOP2Inst <vop2<0xb>, "v_mul_u32_u24", >; //defm V_MUL_HI_U32_U24 : VOP2_32 <0x0000000c, "v_mul_hi_u32_u24", []>; - defm V_MIN_LEGACY_F32 : VOP2Inst <vop2<0xd>, "v_min_legacy_f32", VOP_F32_F32_F32, AMDGPUfmin_legacy >; @@ -1608,10 +1634,15 @@ defm V_SAD_U32 : VOP3Inst <vop3<0x15d>, "v_sad_u32", defm V_DIV_FIXUP_F32 : VOP3Inst < vop3<0x15f>, "v_div_fixup_f32", VOP_F32_F32_F32_F32, AMDGPUdiv_fixup >; + +let SchedRW = [WriteDouble] in { + defm V_DIV_FIXUP_F64 : VOP3Inst < vop3<0x160>, "v_div_fixup_f64", VOP_F64_F64_F64_F64, AMDGPUdiv_fixup >; +} // let SchedRW = [WriteDouble] + defm V_LSHL_B64 : VOP3Inst <vop3<0x161>, "v_lshl_b64", VOP_I64_I64_I32, shl >; @@ -1622,6 +1653,7 @@ defm V_ASHR_I64 : VOP3Inst <vop3<0x163>, "v_ashr_i64", VOP_I64_I64_I32, sra >; +let SchedRW = [WriteDouble] in { let isCommutable = 1 in { defm V_ADD_F64 : VOP3Inst <vop3<0x164>, "v_add_f64", @@ -1644,7 +1676,9 @@ defm V_LDEXP_F64 : VOP3Inst <vop3<0x168>, "v_ldexp_f64", VOP_F64_F64_I32, AMDGPUldexp >; -let isCommutable = 1 in { +} // let SchedRW = [WriteDouble] + +let isCommutable = 1, SchedRW = [WriteIntMUL] in { defm V_MUL_LO_U32 : VOP3Inst <vop3<0x169>, "v_mul_lo_u32", VOP_I32_I32_I32 @@ -1659,30 +1693,38 @@ defm V_MUL_HI_I32 : VOP3Inst <vop3<0x16c>, "v_mul_hi_i32", VOP_I32_I32_I32 >; -} // isCommutable = 1 +} // isCommutable = 1, SchedRW = [WriteIntMUL] defm V_DIV_SCALE_F32 : VOP3b_32 <vop3<0x16d>, "v_div_scale_f32", []>; +let SchedRW = [WriteDouble] in { // Double 
precision division pre-scale. defm V_DIV_SCALE_F64 : VOP3b_64 <vop3<0x16e>, "v_div_scale_f64", []>; +} // let SchedRW = [WriteDouble] let isCommutable = 1 in { defm V_DIV_FMAS_F32 : VOP3Inst <vop3<0x16f>, "v_div_fmas_f32", VOP_F32_F32_F32_F32, AMDGPUdiv_fmas >; + +let SchedRW = [WriteDouble] in { defm V_DIV_FMAS_F64 : VOP3Inst <vop3<0x170>, "v_div_fmas_f64", VOP_F64_F64_F64_F64, AMDGPUdiv_fmas >; +} // End SchedRW = [WriteDouble] } // End isCommutable = 1 //def V_MSAD_U8 : VOP3_U8 <0x00000171, "v_msad_u8", []>; //def V_QSAD_U8 : VOP3_U8 <0x00000172, "v_qsad_u8", []>; //def V_MQSAD_U8 : VOP3_U8 <0x00000173, "v_mqsad_u8", []>; +let SchedRW = [WriteDouble] in { defm V_TRIG_PREOP_F64 : VOP3Inst < vop3<0x174>, "v_trig_preop_f64", VOP_F64_F64_I32, AMDGPUtrig_preop >; +} // let SchedRW = [WriteDouble] + //===----------------------------------------------------------------------===// // Pseudo Instructions //===----------------------------------------------------------------------===// diff --git a/lib/Target/R600/SIRegisterInfo.cpp b/lib/Target/R600/SIRegisterInfo.cpp index cffea129de9..9bc7f2deb5c 100644 --- a/lib/Target/R600/SIRegisterInfo.cpp +++ b/lib/Target/R600/SIRegisterInfo.cpp @@ -48,9 +48,31 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const { return Reserved; } -unsigned SIRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC, - MachineFunction &MF) const { - return RC->getNumRegs(); +unsigned SIRegisterInfo::getRegPressureSetLimit(unsigned Idx) const { + + unsigned SGPRLimit = getNumSGPRsAllowed(10); + unsigned VGPRLimit = getNumVGPRsAllowed(10); + + for (regclass_iterator I = regclass_begin(), E = regclass_end(); + I != E; ++I) { + + unsigned NumSubRegs = std::max((int)(*I)->getSize() / 4, 1); + unsigned Limit; + + if (isSGPRClass(*I)) { + Limit = SGPRLimit / NumSubRegs; + } else { + Limit = VGPRLimit / NumSubRegs; + } + + const int *Sets = getRegClassPressureSets(*I); + assert(Sets); + for (unsigned i = 0; Sets[i] != -1; 
++i) { + if (Sets[i] == (int)Idx) + return Limit; + } + } + return 256; } bool SIRegisterInfo::requiresRegisterScavenging(const MachineFunction &Fn) const { @@ -464,3 +486,29 @@ unsigned SIRegisterInfo::findUnusedVGPR(const MachineRegisterInfo &MRI) const { return AMDGPU::NoRegister; } +unsigned SIRegisterInfo::getNumVGPRsAllowed(unsigned WaveCount) const { + switch(WaveCount) { + case 10: return 24; + case 9: return 28; + case 8: return 32; + case 7: return 36; + case 6: return 40; + case 5: return 48; + case 4: return 64; + case 3: return 84; + case 2: return 128; + default: return 256; + } +} + +unsigned SIRegisterInfo::getNumSGPRsAllowed(unsigned WaveCount) const { + switch(WaveCount) { + case 10: return 48; + case 9: return 56; + case 8: return 64; + case 7: return 72; + case 6: return 80; + case 5: return 96; + default: return 103; + } +} diff --git a/lib/Target/R600/SIRegisterInfo.h b/lib/Target/R600/SIRegisterInfo.h index c7e54dbf0ec..a7c39002a8c 100644 --- a/lib/Target/R600/SIRegisterInfo.h +++ b/lib/Target/R600/SIRegisterInfo.h @@ -17,6 +17,7 @@ #define LLVM_LIB_TARGET_R600_SIREGISTERINFO_H #include "AMDGPURegisterInfo.h" +#include "llvm/Support/Debug.h" namespace llvm { @@ -26,8 +27,7 @@ struct SIRegisterInfo : public AMDGPURegisterInfo { BitVector getReservedRegs(const MachineFunction &MF) const override; - unsigned getRegPressureLimit(const TargetRegisterClass *RC, - MachineFunction &MF) const override; + unsigned getRegPressureSetLimit(unsigned Idx) const override; bool requiresRegisterScavenging(const MachineFunction &Fn) const override; @@ -114,6 +114,14 @@ struct SIRegisterInfo : public AMDGPURegisterInfo { enum PreloadedValue Value) const; unsigned findUnusedVGPR(const MachineRegisterInfo &MRI) const; + + /// \brief Give the maximum number of VGPRs that can be used by \p WaveCount + /// concurrent waves. 
+ unsigned getNumVGPRsAllowed(unsigned WaveCount) const; + + /// \brief Give the maximum number of SGPRs that can be used by \p WaveCount + /// concurrent waves. + unsigned getNumSGPRsAllowed(unsigned WaveCount) const; }; } // End namespace llvm diff --git a/lib/Target/R600/SISchedule.td b/lib/Target/R600/SISchedule.td index 28b65b82585..5a1ae297cdc 100644 --- a/lib/Target/R600/SISchedule.td +++ b/lib/Target/R600/SISchedule.td @@ -7,9 +7,81 @@ // //===----------------------------------------------------------------------===// // -// TODO: This is just a place holder for now. +// MachineModel definitions for Southern Islands (SI) // //===----------------------------------------------------------------------===// - def SI_Itin : ProcessorItineraries <[], [], []>; + + +def WriteBranch : SchedWrite; +def WriteExport : SchedWrite; +def WriteLDS : SchedWrite; +def WriteSALU : SchedWrite; +def WriteSMEM : SchedWrite; +def WriteVMEM : SchedWrite; + +// Vector ALU instructions +def Write32Bit : SchedWrite; +def WriteIntMUL : SchedWrite; + +def WriteConversion : SchedWrite; + +def WriteFloatFMA : SchedWrite; +def WriteFloatTrans : SchedWrite; + +def WriteDouble : SchedWrite; +def WriteDoubleAdd : SchedWrite; + +def SIFullSpeedModel : SchedMachineModel; + +// BufferSize = 0 means the processors are in-order. +let BufferSize = 0 in { + +// XXX: Are the resource counts correct? 
+def HWBranch : ProcResource<1>; +def HWExport : ProcResource<7>; // Taken from S_WAITCNT +def HWLGKM : ProcResource<31>; // Taken from S_WAITCNT +def HWSALU : ProcResource<1>; +def HWVMEM : ProcResource<15>; // Taken from S_WAITCNT +def HWVALU : ProcResource<1>; + +} + +let SchedModel = SIFullSpeedModel in { + +class HWWriteRes<SchedWrite write, list<ProcResourceKind> resources, + int latency> : WriteRes<write, resources> { + let Latency = latency; +} + +class HWVALUWriteRes<SchedWrite write, int latency> : + HWWriteRes<write, [HWVALU], latency>; + +// The latency numbers are taken from AMD Accelerated Parallel Processing +// guide. They may not be accurate. + +def : HWWriteRes<WriteBranch, [HWBranch], 100>; // XXX: Guessed ??? +def : HWWriteRes<WriteExport, [HWExport], 100>; // XXX: Guessed ??? +def : HWWriteRes<WriteLDS, [HWLGKM], 32>; // 2 - 64 +def : HWWriteRes<WriteSALU, [HWSALU], 1>; +def : HWWriteRes<WriteSMEM, [HWLGKM], 10>; // XXX: Guessed ??? +def : HWWriteRes<WriteVMEM, [HWVMEM], 450>; // 300 - 600 + +// XXX: These definitions assume full double-precision speed, some devices are +// slower. These are also taken from the AMD Accelerated Parallel Processing +// guide and may not be accurate. + +// The latency values are 1 / (operations / cycle) / 4. +def : HWVALUWriteRes<Write32Bit, 1>; +def : HWVALUWriteRes<WriteIntMUL, 4>; + +def : HWVALUWriteRes<WriteConversion, 4>; + +def : HWVALUWriteRes<WriteFloatFMA, 1>; // 16 For single speed +def : HWVALUWriteRes<WriteFloatTrans, 4>; + +def : HWVALUWriteRes<WriteDouble, 4>; // 16 for single speed +def : HWVALUWriteRes<WriteDoubleAdd, 2>; // 8 for single speed + +} // End SchedModel = SIFullSpeedModel |