-rw-r--r--  lib/Target/R600/AMDGPUAsmPrinter.cpp      |   3
-rw-r--r--  lib/Target/R600/SIInstrInfo.cpp           | 163
-rw-r--r--  lib/Target/R600/SIInstrInfo.h             |   8
-rw-r--r--  lib/Target/R600/SIInstructions.td         |  23
-rw-r--r--  lib/Target/R600/SIMachineFunctionInfo.cpp |  51
-rw-r--r--  lib/Target/R600/SIMachineFunctionInfo.h   |  10
6 files changed, 245 insertions(+), 13 deletions(-)
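The patch below spills VGPRs to LDS: each spilled 32-bit subregister gets one dword of LDS per lane, so a frame index that is N bytes per lane reserves N * 64 bytes per wave (hence the MFI->LDSWaveSpillSize * 64 term added in AMDGPUAsmPrinter.cpp). As a rough standalone sketch, the per-lane address that calculateLDSSpillAddress() materializes in VGPRs works out to the following (names here are illustrative, not from the patch):

    // Sketch only -- the patch computes this with V_MBCNT_*/V_LSHLREV/V_ADD.
    unsigned ldsSpillAddress(unsigned LDSSize,     // LDS the kernel already uses
                             unsigned FrameOffset, // per-lane byte offset of the slot
                             unsigned LaneId) {    // 0..63, lane index in the wave
      const unsigned ThreadsInWave = 64;
      return LDSSize + (FrameOffset * ThreadsInWave) + (LaneId * 4);
    }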
diff --git a/lib/Target/R600/AMDGPUAsmPrinter.cpp b/lib/Target/R600/AMDGPUAsmPrinter.cpp
index 170f4790574..4ac702ec4e7 100644
--- a/lib/Target/R600/AMDGPUAsmPrinter.cpp
+++ b/lib/Target/R600/AMDGPUAsmPrinter.cpp
@@ -311,7 +311,8 @@ void AMDGPUAsmPrinter::EmitProgramInfoSI(MachineFunction &MF,
     LDSAlignShift = 9;
   }
   unsigned LDSBlocks =
-    RoundUpToAlignment(MFI->LDSSize, 1 << LDSAlignShift) >> LDSAlignShift;
+    RoundUpToAlignment(MFI->LDSSize + (MFI->LDSWaveSpillSize * 64),
+                       1 << LDSAlignShift) >> LDSAlignShift;
 
   if (MFI->ShaderType == ShaderType::COMPUTE) {
     OutStreamer.EmitIntValue(R_00B84C_COMPUTE_PGM_RSRC2, 4);
diff --git a/lib/Target/R600/SIInstrInfo.cpp b/lib/Target/R600/SIInstrInfo.cpp
index 4a9e3467c5a..c7ec889bff0 100644
--- a/lib/Target/R600/SIInstrInfo.cpp
+++ b/lib/Target/R600/SIInstrInfo.cpp
@@ -224,8 +224,25 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
     BuildMI(MBB, MI, DL, get(Opcode), MFI->SpillTracker.LaneVGPR)
             .addReg(SrcReg)
             .addImm(FrameIndex);
+  } else if (RI.hasVGPRs(RC)) {
+    unsigned Opcode;
+    switch (RC->getSize() * 8) {
+    case 32: Opcode = AMDGPU::SI_SPILL_V32_SAVE; break;
+    case 64: Opcode = AMDGPU::SI_SPILL_V64_SAVE; break;
+    case 96: Opcode = AMDGPU::SI_SPILL_V96_SAVE; break;
+    case 128: Opcode = AMDGPU::SI_SPILL_V128_SAVE; break;
+    case 256: Opcode = AMDGPU::SI_SPILL_V256_SAVE; break;
+    case 512: Opcode = AMDGPU::SI_SPILL_V512_SAVE; break;
+    default: llvm_unreachable("Cannot spill register class");
+    }
+    MFI->allocateLDSSpaceForSpill(FrameIndex, RC->getSize());
+    unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+    BuildMI(MBB, MI, DL, get(Opcode), TmpReg)
+            .addReg(MFI->getSpillTIDVirtualReg(MRI, MBB.getParent()))
+            .addReg(SrcReg)
+            .addImm(FrameIndex);
   } else {
-    llvm_unreachable("VGPR spilling not supported");
+    llvm_unreachable("Don't know how to spill register class");
   }
 }
 
@@ -235,6 +252,7 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
                                        const TargetRegisterClass *RC,
                                        const TargetRegisterInfo *TRI) const {
   SIMachineFunctionInfo *MFI = MBB.getParent()->getInfo<SIMachineFunctionInfo>();
+  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
   DebugLoc DL = MBB.findDebugLoc(MI);
   if (TRI->getCommonSubClass(RC, &AMDGPU::SReg_32RegClass)) {
     SIMachineFunctionInfo::SpilledReg Spill =
@@ -261,9 +279,69 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
             .addReg(Spill.VGPR)
             .addImm(FrameIndex);
     insertNOPs(MI, 3);
-  } else {
-    llvm_unreachable("VGPR spilling not supported");
+  } else if (RI.hasVGPRs(RC)) {
+    unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+    unsigned Opcode;
+    switch (RC->getSize() * 8) {
+    case 32: Opcode = AMDGPU::SI_SPILL_V32_RESTORE; break;
+    case 64: Opcode = AMDGPU::SI_SPILL_V64_RESTORE; break;
+    case 96: Opcode = AMDGPU::SI_SPILL_V96_RESTORE; break;
+    case 128: Opcode = AMDGPU::SI_SPILL_V128_RESTORE; break;
+    case 256: Opcode = AMDGPU::SI_SPILL_V256_RESTORE; break;
+    case 512: Opcode = AMDGPU::SI_SPILL_V512_RESTORE; break;
+    default: llvm_unreachable("Cannot spill register class");
+    }
+    BuildMI(MBB, MI, DL, get(Opcode), DestReg)
+            .addReg(TmpReg, RegState::Define)
+            .addReg(MFI->getSpillTIDVirtualReg(MRI, MBB.getParent()))
+            .addImm(FrameIndex);
+  }
+}
+
+/// \param FrameOffset Offset in bytes of the FrameIndex being spilled
+unsigned SIInstrInfo::calculateLDSSpillAddress(MachineBasicBlock &MBB,
+                                               MachineBasicBlock::iterator MI,
+                                               unsigned TmpReg,
+                                               unsigned TIDOffsetReg,
+                                               unsigned FrameOffset,
+                                               unsigned Size) const {
+  SIMachineFunctionInfo *MFI =
+      MBB.getParent()->getInfo<SIMachineFunctionInfo>();
+  DebugLoc DL = MBB.findDebugLoc(MI);
+  unsigned ThreadsInWave = 64;
+  unsigned LDSOffset = MFI->LDSSize + (FrameOffset * ThreadsInWave);
+
+  if (!MFI->HasCalculatedTIDOffset) {
+    MachineBasicBlock &Entry = MBB.getParent()->front();
+    MachineBasicBlock::iterator Insert = Entry.front();
+    DebugLoc DL = Insert->getDebugLoc();
+    // Get the lane id within the wave
+    BuildMI(Entry, Insert, DL, get(AMDGPU::V_MBCNT_LO_U32_B32_e64),
+            TIDOffsetReg)
+            .addImm(-1)
+            .addImm(0)
+            .addImm(0)
+            .addImm(0)
+            .addImm(0)
+            .addImm(0);
+
+    BuildMI(Entry, Insert, DL, get(AMDGPU::V_MBCNT_HI_U32_B32_e32),
+            TIDOffsetReg)
+            .addImm(-1)
+            .addReg(TIDOffsetReg);
+
+    BuildMI(Entry, Insert, DL, get(AMDGPU::V_LSHLREV_B32_e32),
+            TIDOffsetReg)
+            .addImm(2)
+            .addReg(TIDOffsetReg);
+    MFI->HasCalculatedTIDOffset = true;
   }
+
+  // Add FrameIndex to LDS offset
+  BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_I32_e32), TmpReg)
+          .addImm(LDSOffset)
+          .addReg(TIDOffsetReg);
+
+  return TmpReg;
 }
 
 static unsigned getNumSubRegsForSpillOp(unsigned Op) {
@@ -271,16 +349,30 @@ static unsigned getNumSubRegsForSpillOp(unsigned Op) {
   switch (Op) {
   case AMDGPU::SI_SPILL_S512_SAVE:
   case AMDGPU::SI_SPILL_S512_RESTORE:
+  case AMDGPU::SI_SPILL_V512_SAVE:
+  case AMDGPU::SI_SPILL_V512_RESTORE:
     return 16;
   case AMDGPU::SI_SPILL_S256_SAVE:
   case AMDGPU::SI_SPILL_S256_RESTORE:
+  case AMDGPU::SI_SPILL_V256_SAVE:
+  case AMDGPU::SI_SPILL_V256_RESTORE:
     return 8;
   case AMDGPU::SI_SPILL_S128_SAVE:
   case AMDGPU::SI_SPILL_S128_RESTORE:
+  case AMDGPU::SI_SPILL_V128_SAVE:
+  case AMDGPU::SI_SPILL_V128_RESTORE:
     return 4;
+  case AMDGPU::SI_SPILL_V96_SAVE:
+  case AMDGPU::SI_SPILL_V96_RESTORE:
+    return 3;
   case AMDGPU::SI_SPILL_S64_SAVE:
   case AMDGPU::SI_SPILL_S64_RESTORE:
+  case AMDGPU::SI_SPILL_V64_SAVE:
+  case AMDGPU::SI_SPILL_V64_RESTORE:
     return 2;
+  case AMDGPU::SI_SPILL_V32_SAVE:
+  case AMDGPU::SI_SPILL_V32_RESTORE:
+    return 1;
   default: llvm_unreachable("Invalid spill opcode");
   }
 }
@@ -351,7 +443,72 @@ bool SIInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const {
     MI->eraseFromParent();
     break;
   }
+
+  // VGPR register spill to LDS
+  case AMDGPU::SI_SPILL_V512_SAVE:
+  case AMDGPU::SI_SPILL_V256_SAVE:
+  case AMDGPU::SI_SPILL_V128_SAVE:
+  case AMDGPU::SI_SPILL_V64_SAVE:
+  case AMDGPU::SI_SPILL_V32_SAVE: {
+    unsigned NumSubRegs = getNumSubRegsForSpillOp(MI->getOpcode());
+    unsigned TmpReg = MI->getOperand(0).getReg();
+    unsigned TIDOffsetReg = MI->getOperand(1).getReg();
+    unsigned SrcReg = MI->getOperand(2).getReg();
+    unsigned FrameIndex = MI->getOperand(3).getImm();
+    unsigned Offset = MFI->LDSSpillOffsets[FrameIndex];
+    unsigned Size = NumSubRegs * 4;
+
+    for (unsigned i = 0, e = NumSubRegs; i != e; ++i) {
+      unsigned SubReg = NumSubRegs > 1 ?
+          RI.getPhysRegSubReg(SrcReg, &AMDGPU::VGPR_32RegClass, i) :
+          SrcReg;
+      unsigned SubOffset = Offset + (i * 4);
+      unsigned AddrReg = calculateLDSSpillAddress(MBB, MI, TmpReg, TIDOffsetReg,
+                                                  SubOffset, Size);
+
+      // Store the value in LDS
+      BuildMI(MBB, MI, DL, get(AMDGPU::DS_WRITE_B32))
+              .addImm(0)       // gds
+              .addReg(AddrReg) // addr
+              .addReg(SubReg)  // data0
+              .addImm(0);      // offset
+    }
+
+    MI->eraseFromParent();
+    break;
+  }
+  case AMDGPU::SI_SPILL_V32_RESTORE:
+  case AMDGPU::SI_SPILL_V64_RESTORE:
+  case AMDGPU::SI_SPILL_V128_RESTORE:
+  case AMDGPU::SI_SPILL_V256_RESTORE:
+  case AMDGPU::SI_SPILL_V512_RESTORE: {
+    unsigned NumSubRegs = getNumSubRegsForSpillOp(MI->getOpcode());
+    unsigned DstReg = MI->getOperand(0).getReg();
+    unsigned TempReg = MI->getOperand(1).getReg();
+    unsigned TIDOffsetReg = MI->getOperand(2).getReg();
+    unsigned FrameIndex = MI->getOperand(3).getImm();
+    unsigned Offset = MFI->LDSSpillOffsets[FrameIndex];
+    unsigned Size = NumSubRegs * 4;
+
+    // FIXME: We could use DS_READ_B64 here to optimize for larger registers.
+    for (unsigned i = 0, e = NumSubRegs; i != e; ++i) {
+      unsigned SubReg = NumSubRegs > 1 ?
+          RI.getPhysRegSubReg(DstReg, &AMDGPU::VGPR_32RegClass, i) :
+          DstReg;
+
+      unsigned SubOffset = Offset + (i * 4);
+      unsigned AddrReg = calculateLDSSpillAddress(MBB, MI, TempReg, TIDOffsetReg,
+                                                  SubOffset, Size);
+      BuildMI(MBB, MI, DL, get(AMDGPU::DS_READ_B32), SubReg)
+              .addImm(0)       // gds
+              .addReg(AddrReg) // addr
+              .addImm(0);      // offset
+    }
+    MI->eraseFromParent();
+    break;
   }
+  }
+
   return true;
 }
diff --git a/lib/Target/R600/SIInstrInfo.h b/lib/Target/R600/SIInstrInfo.h
index d7992742216..1e131787c03 100644
--- a/lib/Target/R600/SIInstrInfo.h
+++ b/lib/Target/R600/SIInstrInfo.h
@@ -48,6 +48,14 @@ private:
                             MachineInstr *Inst, unsigned Opcode) const;
 
   void addDescImplicitUseDef(const MCInstrDesc &Desc, MachineInstr *MI) const;
+
+  unsigned calculateLDSSpillAddress(MachineBasicBlock &MBB,
+                                    MachineBasicBlock::iterator MI,
+                                    unsigned TmpReg,
+                                    unsigned TIDOffsetReg,
+                                    unsigned Offset,
+                                    unsigned Size) const;
+
 public:
 
   explicit SIInstrInfo(AMDGPUTargetMachine &tm);
diff --git a/lib/Target/R600/SIInstructions.td b/lib/Target/R600/SIInstructions.td
index 0d23141aa5d..c7cf1ffe020 100644
--- a/lib/Target/R600/SIInstructions.td
+++ b/lib/Target/R600/SIInstructions.td
@@ -1592,6 +1592,29 @@ defm SI_SPILL_S128 : SI_SPILL_SGPR <SReg_128>;
 defm SI_SPILL_S256 : SI_SPILL_SGPR <SReg_256>;
 defm SI_SPILL_S512 : SI_SPILL_SGPR <SReg_512>;
 
+multiclass SI_SPILL_VGPR <RegisterClass vgpr_class> {
+  def _SAVE : InstSI <
+    (outs VGPR_32:$temp),
+    (ins VGPR_32:$tid, vgpr_class:$src, i32imm:$frame_idx),
+    "", []
+  > {
+    let Constraints = "@earlyclobber $temp";
+  }
+
+  def _RESTORE : InstSI <
+    (outs vgpr_class:$dst, VGPR_32:$temp),
+    (ins VGPR_32:$tid, i32imm:$frame_idx),
+    "", []
+  >;
+}
+
+defm SI_SPILL_V32 : SI_SPILL_VGPR <VReg_32>;
+defm SI_SPILL_V64 : SI_SPILL_VGPR <VReg_64>;
+defm SI_SPILL_V96 : SI_SPILL_VGPR <VReg_96>;
+defm SI_SPILL_V128 : SI_SPILL_VGPR <VReg_128>;
+defm SI_SPILL_V256 : SI_SPILL_VGPR <VReg_256>;
+defm SI_SPILL_V512 : SI_SPILL_VGPR <VReg_512>;
+
 } // end IsCodeGenOnly, isPseudo
 
 def : Pat<
diff --git a/lib/Target/R600/SIMachineFunctionInfo.cpp b/lib/Target/R600/SIMachineFunctionInfo.cpp
index af609958129..e4a39f65674 100644
--- a/lib/Target/R600/SIMachineFunctionInfo.cpp
+++ b/lib/Target/R600/SIMachineFunctionInfo.cpp
@@ -12,6 +12,7 @@
 #include "SIMachineFunctionInfo.h"
 #include "SIInstrInfo.h"
 #include "SIRegisterInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/IR/Function.h" #include "llvm/IR/LLVMContext.h" @@ -26,8 +27,22 @@ void SIMachineFunctionInfo::anchor() {} SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) : AMDGPUMachineFunction(MF), + SpillTIDVirtualReg(AMDGPU::NoRegister), PSInputAddr(0), - SpillTracker() { } + SpillTracker(), + LDSWaveSpillSize(0), + HasCalculatedTIDOffset(false) { } + +static void addFunctionLiveOut(unsigned Reg, MachineFunction *MF) { + for (MachineBasicBlock &MBB : *MF) { + if (MBB.back().getOpcode() == AMDGPU::S_ENDPGM) { + MBB.back().addOperand(*MF, MachineOperand::CreateReg(Reg, false, true)); + return; + } + } + MF->getFunction()->getContext().emitError( + "Could not found S_ENGPGM instrtuction."); +} static unsigned createLaneVGPR(MachineRegisterInfo &MRI, MachineFunction *MF) { unsigned VGPR = MRI.createVirtualRegister(&AMDGPU::VReg_32RegClass); @@ -55,15 +70,8 @@ static unsigned createLaneVGPR(MachineRegisterInfo &MRI, MachineFunction *MF) { // // To work around this, we add Lane VGPRs to the functions live out list, // so that we can guarantee its live range will cover all of its uses. + addFunctionLiveOut(VGPR, MF); - for (MachineBasicBlock &MBB : *MF) { - if (MBB.back().getOpcode() == AMDGPU::S_ENDPGM) { - MBB.back().addOperand(*MF, MachineOperand::CreateReg(VGPR, false, true)); - return VGPR; - } - } - MF->getFunction()->getContext().emitError( - "Could not found S_ENGPGM instrtuction."); return VGPR; } @@ -92,3 +100,28 @@ const SIMachineFunctionInfo::SpilledReg& SIMachineFunctionInfo::RegSpillTracker::getSpilledReg(unsigned FrameIndex) { return SpilledRegisters[FrameIndex]; } + +unsigned SIMachineFunctionInfo::getSpillTIDVirtualReg( + MachineRegisterInfo &MRI, + MachineFunction *MF) { + if (SpillTIDVirtualReg == AMDGPU::NoRegister) { + SpillTIDVirtualReg = MRI.createVirtualRegister(&AMDGPU::VReg_32RegClass); + MachineBasicBlock &Entry = MF->front(); + MachineBasicBlock::iterator Insert = Entry.front(); + Insert->addOperand(MachineOperand::CreateReg(SpillTIDVirtualReg, + true, true)); + addFunctionLiveOut(SpillTIDVirtualReg, MF); + } + + return SpillTIDVirtualReg; +} + +unsigned SIMachineFunctionInfo::allocateLDSSpaceForSpill(unsigned FrameIndex, + unsigned NumBytes) { + if (!LDSSpillOffsets.count(FrameIndex)) { + LDSSpillOffsets[FrameIndex] = LDSWaveSpillSize; + LDSWaveSpillSize += NumBytes; + } + + return LDSSpillOffsets[FrameIndex]; +} diff --git a/lib/Target/R600/SIMachineFunctionInfo.h b/lib/Target/R600/SIMachineFunctionInfo.h index 96e619bde8d..1b004fd0238 100644 --- a/lib/Target/R600/SIMachineFunctionInfo.h +++ b/lib/Target/R600/SIMachineFunctionInfo.h @@ -26,6 +26,9 @@ class MachineRegisterInfo; /// tells the hardware which interpolation parameters to load. class SIMachineFunctionInfo : public AMDGPUMachineFunction { void anchor() override; + + unsigned SpillTIDVirtualReg; + public: struct SpilledReg { @@ -59,6 +62,13 @@ public: SIMachineFunctionInfo(const MachineFunction &MF); unsigned PSInputAddr; struct RegSpillTracker SpillTracker; + unsigned LDSWaveSpillSize; + bool HasCalculatedTIDOffset; + /// Key is FrameIndex, value is byte offset + std::map<unsigned, unsigned> LDSSpillOffsets; + unsigned getSpillTIDVirtualReg(MachineRegisterInfo &MRI, MachineFunction *MF); + /// Returns the wave local offset for this \p FrameIndex + unsigned allocateLDSSpaceForSpill(unsigned FrameIndex, unsigned NumBytes); }; } // End namespace llvm |