diff options
author | Tom Stellard <thomas.stellard@amd.com> | 2014-03-03 14:27:38 -0500 |
---|---|---|
committer | Tom Stellard <thomas.stellard@amd.com> | 2014-03-03 14:50:17 -0500 |
commit | 14e6c5b9e0a0bf05afcaf4f1154f0cf82c4b5fd2 (patch) | |
tree | daa753b8f27c1ca7fc80ed44e6e9ddb599cd5b08 | |
parent | 8ec6153697c262ad554fd28486ec0a3824561fc5 (diff) |
R600/SI: Implement VGPR register spilling (branch ref: si-spill-fixes)
VGPRs are spilled to LDS.
-rw-r--r-- | lib/Target/R600/AMDGPUAsmPrinter.cpp | 3 | ||||
-rw-r--r-- | lib/Target/R600/SIInstrInfo.cpp | 162 | ||||
-rw-r--r-- | lib/Target/R600/SIInstrInfo.h | 8 | ||||
-rw-r--r-- | lib/Target/R600/SIInstructions.td | 21 | ||||
-rw-r--r-- | lib/Target/R600/SIMachineFunctionInfo.cpp | 22 | ||||
-rw-r--r-- | lib/Target/R600/SIMachineFunctionInfo.h | 9 |
6 files changed, 221 insertions, 4 deletions
diff --git a/lib/Target/R600/AMDGPUAsmPrinter.cpp b/lib/Target/R600/AMDGPUAsmPrinter.cpp index ccc3d135876..10d81d91433 100644 --- a/lib/Target/R600/AMDGPUAsmPrinter.cpp +++ b/lib/Target/R600/AMDGPUAsmPrinter.cpp @@ -309,7 +309,8 @@ void AMDGPUAsmPrinter::EmitProgramInfoSI(MachineFunction &MF, LDSAlignShift = 9; } unsigned LDSBlocks = - RoundUpToAlignment(MFI->LDSSize, 1 << LDSAlignShift) >> LDSAlignShift; + RoundUpToAlignment(MFI->LDSSize + (MFI->LDSWaveSpillSize * 64), + 1 << LDSAlignShift) >> LDSAlignShift; if (MFI->ShaderType == ShaderType::COMPUTE) { OutStreamer.EmitIntValue(R_00B84C_COMPUTE_PGM_RSRC2, 4); diff --git a/lib/Target/R600/SIInstrInfo.cpp b/lib/Target/R600/SIInstrInfo.cpp index c9239f58f27..a2b0451afe1 100644 --- a/lib/Target/R600/SIInstrInfo.cpp +++ b/lib/Target/R600/SIInstrInfo.cpp @@ -204,7 +204,7 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, .addReg(SrcReg, KillFlag) .addImm(Lane); MFI->SpillTracker.addSpilledReg(FrameIndex, MFI->SpillTracker.LaneVGPR, Lane); - } else { + } else if (RI.isSGPRClass(RC)){ // We are only allowed to create one new instruction when spilling // registers, so we need to use pseudo instruction for vector // registers. 
@@ -228,6 +228,24 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, BuildMI(MBB, MI, DL, get(Opcode), MFI->SpillTracker.LaneVGPR) .addReg(SrcReg) .addImm(FrameIndex); + } else if(RI.hasVGPRs(RC)) { + unsigned Opcode; + switch(RC->getSize() * 8) { + case 32: Opcode = AMDGPU::SI_SPILL_V32_SAVE; break; + case 64: Opcode = AMDGPU::SI_SPILL_V64_SAVE; break; + case 96: Opcode = AMDGPU::SI_SPILL_V96_SAVE; break; + case 128: Opcode = AMDGPU::SI_SPILL_V128_SAVE; break; + case 256: Opcode = AMDGPU::SI_SPILL_V256_SAVE; break; + case 512: Opcode = AMDGPU::SI_SPILL_V512_SAVE; break; + default: llvm_unreachable("Cannot spill register class"); + } + MFI->allocateLDSSpaceForSpill(FrameIndex, RC->getSize()); + unsigned TmpReg = MFI->getSpillTIDVirtualReg(MRI); + BuildMI(MBB, MI, DL, get(Opcode), TmpReg) + .addReg(SrcReg) + .addImm(FrameIndex); + } else { + llvm_unreachable("Don't know how to spill register class"); } } @@ -246,7 +264,7 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, BuildMI(MBB, MI, DL, get(AMDGPU::V_READLANE_B32), DestReg) .addReg(Spill.VGPR) .addImm(Spill.Lane); - } else { + } else if (RI.isSGPRClass(RC)){ unsigned Opcode; switch(RC->getSize() * 8) { case 64: Opcode = AMDGPU::SI_SPILL_S64_RESTORE; break; @@ -262,24 +280,86 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, BuildMI(MBB, MI, DL, get(Opcode), DestReg) .addReg(Spill.VGPR) .addImm(FrameIndex); + } else if(RI.hasVGPRs(RC)) { + unsigned TmpReg = MFI->getSpillTIDVirtualReg(MRI); + unsigned Opcode; + switch(RC->getSize() * 8) { + case 32: Opcode = AMDGPU::SI_SPILL_V32_RESTORE; break; + case 64: Opcode = AMDGPU::SI_SPILL_V64_RESTORE; break; + case 96: Opcode = AMDGPU::SI_SPILL_V96_RESTORE; break; + case 128: Opcode = AMDGPU::SI_SPILL_V128_RESTORE; break; + case 256: Opcode = AMDGPU::SI_SPILL_V256_RESTORE; break; + case 512: Opcode = AMDGPU::SI_SPILL_V512_RESTORE; break; + default: llvm_unreachable("Cannot spill register class"); + } + BuildMI(MBB, MI, DL, 
get(Opcode), DestReg) + .addReg(TmpReg) + .addImm(FrameIndex); } } +unsigned SIInstrInfo::calculateLDSSpillAddress(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + unsigned TmpReg, + unsigned Offset, + unsigned Size) const { + SIMachineFunctionInfo *MFI = MBB.getParent()->getInfo<SIMachineFunctionInfo>(); + DebugLoc DL = MBB.findDebugLoc(MI); + unsigned ThreadsInWave = 64; + unsigned LDSOffset = MFI->LDSSize + (Offset * ThreadsInWave); + + // Get the wave id + BuildMI(MBB, MI, DL, get(AMDGPU::V_MBCNT_LO_U32_B32_e64), TmpReg) + .addImm(-1) + .addImm(0) + .addImm(0) + .addImm(0) + .addImm(0) + .addImm(0); + + BuildMI(MBB, MI, DL, get(AMDGPU::V_MBCNT_HI_U32_B32_e32), TmpReg) + .addImm(-1) + .addReg(TmpReg); + + // Multiply wave id by frame size to get this waves offset in LDS. + BuildMI(MBB, MI, DL, get(AMDGPU::V_LSHLREV_B32_e32), TmpReg) + .addImm(llvm::countTrailingZeros(Size)) + .addReg(TmpReg); + + // Add FrameIndex to LDS offset + BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_I32_e32), TmpReg) + .addImm(LDSOffset) + .addReg(TmpReg); + + return TmpReg; +} + static unsigned getNumSubRegsForSpillOp(unsigned Op) { switch (Op) { case AMDGPU::SI_SPILL_S512_SAVE: case AMDGPU::SI_SPILL_S512_RESTORE: + case AMDGPU::SI_SPILL_V512_SAVE: + case AMDGPU::SI_SPILL_V512_RESTORE: return 16; case AMDGPU::SI_SPILL_S256_SAVE: case AMDGPU::SI_SPILL_S256_RESTORE: + case AMDGPU::SI_SPILL_V256_SAVE: + case AMDGPU::SI_SPILL_V256_RESTORE: return 8; case AMDGPU::SI_SPILL_S128_SAVE: case AMDGPU::SI_SPILL_S128_RESTORE: + case AMDGPU::SI_SPILL_V128_SAVE: + case AMDGPU::SI_SPILL_V128_RESTORE: return 4; case AMDGPU::SI_SPILL_S64_SAVE: case AMDGPU::SI_SPILL_S64_RESTORE: + case AMDGPU::SI_SPILL_V64_SAVE: + case AMDGPU::SI_SPILL_V64_RESTORE: return 2; + case AMDGPU::SI_SPILL_V32_SAVE: + case AMDGPU::SI_SPILL_V32_RESTORE: + return 1; default: llvm_unreachable("Invalid spill opcode"); } } @@ -338,6 +418,84 @@ bool SIInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const { break; } + 
// VGPR register spill to LDS + case AMDGPU::SI_SPILL_V512_SAVE: + case AMDGPU::SI_SPILL_V256_SAVE: + case AMDGPU::SI_SPILL_V128_SAVE: + case AMDGPU::SI_SPILL_V64_SAVE: + case AMDGPU::SI_SPILL_V32_SAVE: { + unsigned NumSubRegs = getNumSubRegsForSpillOp(MI->getOpcode()); + unsigned FrameIndex = MI->getOperand(2).getImm(); + unsigned Offset = MFI->LDSSpillOffsets[FrameIndex]; + unsigned Size = NumSubRegs * 4; + unsigned SrcReg = MI->getOperand(1).getReg(); + unsigned TmpReg = MI->getOperand(0).getReg(); + unsigned AddrReg = calculateLDSSpillAddress(MBB, MI, TmpReg, + Offset, Size); + + // FIXME: We can use LDS_WRITE_B64 here to optimize for larger registers. + for (unsigned i = 0, e = NumSubRegs; i != e; ++i) { + unsigned SubReg = NumSubRegs > 1 ? + RI.getPhysRegSubReg(SrcReg, &AMDGPU::VGPR_32RegClass, i) : + SrcReg; + + if (i > 0) { + BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_I32_e32), AddrReg) + .addImm(4) + .addReg(AddrReg); + } + + // Store the value in LDS + BuildMI(MBB, MI, DL, get(AMDGPU::DS_WRITE_B32)) + .addImm(0) // gds + .addReg(AddrReg) // addr + .addReg(SubReg) // data0 + .addReg(SubReg) // data0 + .addImm(0) // offset0 + .addImm(0); //offset1 + } + + MI->eraseFromParent(); + break; + } + case AMDGPU::SI_SPILL_V32_RESTORE: + case AMDGPU::SI_SPILL_V64_RESTORE: + case AMDGPU::SI_SPILL_V128_RESTORE: + case AMDGPU::SI_SPILL_V256_RESTORE: + case AMDGPU::SI_SPILL_V512_RESTORE: { + unsigned NumSubRegs = getNumSubRegsForSpillOp(MI->getOpcode()); + unsigned FrameIndex = MI->getOperand(2).getImm(); + unsigned DstReg = MI->getOperand(0).getReg(); + unsigned Offset = MFI->LDSSpillOffsets[FrameIndex]; + unsigned Size = NumSubRegs * 4; + unsigned AddrReg = calculateLDSSpillAddress(MBB, MI, + MI->getOperand(1).getReg(), + Offset, Size); + + // FIXME: We could use DS_READ_B64 here to optimize for larger registers. + for (unsigned i = 0, e = NumSubRegs; i != e; ++i) { + unsigned SubReg = NumSubRegs > 1 ? 
+ RI.getPhysRegSubReg(DstReg, &AMDGPU::VGPR_32RegClass, i) : + DstReg; + + if (i > 0) { + BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_I32_e32), AddrReg) + .addImm(4) + .addReg(AddrReg); + } + BuildMI(MBB, MI, DL, get(AMDGPU::DS_READ_B32), SubReg) + .addImm(0) // gds + .addReg(AddrReg) // addr + .addReg(AddrReg) // data0 + .addReg(AddrReg) // data0 + .addImm(0) // offset0 + .addImm(0); //offset1 + } + MI->eraseFromParent(); + break; + } + } + return true; } diff --git a/lib/Target/R600/SIInstrInfo.h b/lib/Target/R600/SIInstrInfo.h index c83d090d965..ee1fd7253a7 100644 --- a/lib/Target/R600/SIInstrInfo.h +++ b/lib/Target/R600/SIInstrInfo.h @@ -31,6 +31,14 @@ private: unsigned MovRelOp, unsigned Dst, unsigned Src0) const; + + unsigned calculateLDSSpillAddress(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + unsigned TmpReg, + unsigned Offset, + unsigned Size) const; + + // If you add or remove instructions from this function, you will public: diff --git a/lib/Target/R600/SIInstructions.td b/lib/Target/R600/SIInstructions.td index 32dbfba70e9..2be06951f73 100644 --- a/lib/Target/R600/SIInstructions.td +++ b/lib/Target/R600/SIInstructions.td @@ -1437,6 +1437,27 @@ defm SI_SPILL_S128 : SI_SPILL_SGPR <SReg_128>; defm SI_SPILL_S256 : SI_SPILL_SGPR <SReg_256>; defm SI_SPILL_S512 : SI_SPILL_SGPR <SReg_512>; +multiclass SI_SPILL_VGPR <RegisterClass vgpr_class> { + def _SAVE : InstSI < + (outs VGPR_32:$temp), + (ins vgpr_class:$src, i32imm:$frame_idx), + "", [] + >; + + def _RESTORE : InstSI < + (outs vgpr_class:$dst, VGPR_32:$temp), + (ins i32imm:$frame_idx), + "", [] + >; +} + +defm SI_SPILL_V32 : SI_SPILL_VGPR <VReg_32>; +defm SI_SPILL_V64 : SI_SPILL_VGPR <VReg_64>; +defm SI_SPILL_V96 : SI_SPILL_VGPR <VReg_96>; +defm SI_SPILL_V128 : SI_SPILL_VGPR <VReg_128>; +defm SI_SPILL_V256 : SI_SPILL_VGPR <VReg_256>; +defm SI_SPILL_V512 : SI_SPILL_VGPR <VReg_512>; + } // end IsCodeGenOnly, isPseudo def : Pat< diff --git a/lib/Target/R600/SIMachineFunctionInfo.cpp 
b/lib/Target/R600/SIMachineFunctionInfo.cpp index 4f9fed1d474..8d01ad7c25a 100644 --- a/lib/Target/R600/SIMachineFunctionInfo.cpp +++ b/lib/Target/R600/SIMachineFunctionInfo.cpp @@ -23,8 +23,10 @@ void SIMachineFunctionInfo::anchor() {} SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) : AMDGPUMachineFunction(MF), + SpillTIDVirtualReg(AMDGPU::NoRegister), PSInputAddr(0), - SpillTracker() { } + SpillTracker(), + LDSWaveSpillSize(0) { } static unsigned createLaneVGPR(MachineRegisterInfo &MRI) { return MRI.createVirtualRegister(&AMDGPU::VReg_32RegClass); @@ -57,3 +59,21 @@ const SIMachineFunctionInfo::SpilledReg& SIMachineFunctionInfo::RegSpillTracker::getSpilledReg(unsigned FrameIndex) { return SpilledRegisters[FrameIndex]; } + +unsigned SIMachineFunctionInfo::getSpillTIDVirtualReg( + MachineRegisterInfo &MRI) { + if (SpillTIDVirtualReg == AMDGPU::NoRegister) + SpillTIDVirtualReg = MRI.createVirtualRegister(&AMDGPU::VReg_32RegClass); + + return SpillTIDVirtualReg; +} + +unsigned SIMachineFunctionInfo::allocateLDSSpaceForSpill(unsigned FrameIndex, + unsigned NumBytes) { + if (!LDSSpillOffsets.count(FrameIndex)) { + LDSSpillOffsets[FrameIndex] = LDSWaveSpillSize; + LDSWaveSpillSize += NumBytes; + } + + return LDSSpillOffsets[FrameIndex]; +} diff --git a/lib/Target/R600/SIMachineFunctionInfo.h b/lib/Target/R600/SIMachineFunctionInfo.h index f78b1f36c7e..b4470de04f5 100644 --- a/lib/Target/R600/SIMachineFunctionInfo.h +++ b/lib/Target/R600/SIMachineFunctionInfo.h @@ -26,6 +26,9 @@ class MachineRegisterInfo; /// tells the hardware which interpolation parameters to load. 
class SIMachineFunctionInfo : public AMDGPUMachineFunction { virtual void anchor(); + + unsigned SpillTIDVirtualReg; + public: struct SpilledReg { @@ -58,6 +61,12 @@ public: SIMachineFunctionInfo(const MachineFunction &MF); unsigned PSInputAddr; struct RegSpillTracker SpillTracker; + unsigned LDSWaveSpillSize; + /// Key is FrameIndex, value is byte offset + std::map<unsigned, unsigned> LDSSpillOffsets; + unsigned getSpillTIDVirtualReg(MachineRegisterInfo &MRI); + /// Returns the wave local offset for this \p FrameIndex + unsigned allocateLDSSpaceForSpill(unsigned FrameIndex, unsigned NumBytes); }; } // End namespace llvm |