summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorTom Stellard <thomas.stellard@amd.com>2014-03-03 14:27:38 -0500
committerTom Stellard <thomas.stellard@amd.com>2014-03-03 14:50:17 -0500
commit14e6c5b9e0a0bf05afcaf4f1154f0cf82c4b5fd2 (patch)
treedaa753b8f27c1ca7fc80ed44e6e9ddb599cd5b08
parent8ec6153697c262ad554fd28486ec0a3824561fc5 (diff)
R600/SI: Implement VGPR register spillingsi-spill-fixes
VGPRs are spilled to LDS.
-rw-r--r--lib/Target/R600/AMDGPUAsmPrinter.cpp3
-rw-r--r--lib/Target/R600/SIInstrInfo.cpp162
-rw-r--r--lib/Target/R600/SIInstrInfo.h8
-rw-r--r--lib/Target/R600/SIInstructions.td21
-rw-r--r--lib/Target/R600/SIMachineFunctionInfo.cpp22
-rw-r--r--lib/Target/R600/SIMachineFunctionInfo.h9
6 files changed, 221 insertions, 4 deletions
diff --git a/lib/Target/R600/AMDGPUAsmPrinter.cpp b/lib/Target/R600/AMDGPUAsmPrinter.cpp
index ccc3d135876..10d81d91433 100644
--- a/lib/Target/R600/AMDGPUAsmPrinter.cpp
+++ b/lib/Target/R600/AMDGPUAsmPrinter.cpp
@@ -309,7 +309,8 @@ void AMDGPUAsmPrinter::EmitProgramInfoSI(MachineFunction &MF,
LDSAlignShift = 9;
}
unsigned LDSBlocks =
- RoundUpToAlignment(MFI->LDSSize, 1 << LDSAlignShift) >> LDSAlignShift;
+ RoundUpToAlignment(MFI->LDSSize + (MFI->LDSWaveSpillSize * 64),
+ 1 << LDSAlignShift) >> LDSAlignShift;
if (MFI->ShaderType == ShaderType::COMPUTE) {
OutStreamer.EmitIntValue(R_00B84C_COMPUTE_PGM_RSRC2, 4);
diff --git a/lib/Target/R600/SIInstrInfo.cpp b/lib/Target/R600/SIInstrInfo.cpp
index c9239f58f27..a2b0451afe1 100644
--- a/lib/Target/R600/SIInstrInfo.cpp
+++ b/lib/Target/R600/SIInstrInfo.cpp
@@ -204,7 +204,7 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
.addReg(SrcReg, KillFlag)
.addImm(Lane);
MFI->SpillTracker.addSpilledReg(FrameIndex, MFI->SpillTracker.LaneVGPR, Lane);
- } else {
+ } else if (RI.isSGPRClass(RC)){
// We are only allowed to create one new instruction when spilling
// registers, so we need to use pseudo instruction for vector
// registers.
@@ -228,6 +228,24 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
BuildMI(MBB, MI, DL, get(Opcode), MFI->SpillTracker.LaneVGPR)
.addReg(SrcReg)
.addImm(FrameIndex);
+ } else if(RI.hasVGPRs(RC)) {
+ unsigned Opcode;
+ switch(RC->getSize() * 8) {
+ case 32: Opcode = AMDGPU::SI_SPILL_V32_SAVE; break;
+ case 64: Opcode = AMDGPU::SI_SPILL_V64_SAVE; break;
+ case 96: Opcode = AMDGPU::SI_SPILL_V96_SAVE; break;
+ case 128: Opcode = AMDGPU::SI_SPILL_V128_SAVE; break;
+ case 256: Opcode = AMDGPU::SI_SPILL_V256_SAVE; break;
+ case 512: Opcode = AMDGPU::SI_SPILL_V512_SAVE; break;
+ default: llvm_unreachable("Cannot spill register class");
+ }
+ MFI->allocateLDSSpaceForSpill(FrameIndex, RC->getSize());
+ unsigned TmpReg = MFI->getSpillTIDVirtualReg(MRI);
+ BuildMI(MBB, MI, DL, get(Opcode), TmpReg)
+ .addReg(SrcReg)
+ .addImm(FrameIndex);
+ } else {
+ llvm_unreachable("Don't know how to spill register class");
}
}
@@ -246,7 +264,7 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
BuildMI(MBB, MI, DL, get(AMDGPU::V_READLANE_B32), DestReg)
.addReg(Spill.VGPR)
.addImm(Spill.Lane);
- } else {
+ } else if (RI.isSGPRClass(RC)){
unsigned Opcode;
switch(RC->getSize() * 8) {
case 64: Opcode = AMDGPU::SI_SPILL_S64_RESTORE; break;
@@ -262,24 +280,86 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
BuildMI(MBB, MI, DL, get(Opcode), DestReg)
.addReg(Spill.VGPR)
.addImm(FrameIndex);
+ } else if(RI.hasVGPRs(RC)) {
+ unsigned TmpReg = MFI->getSpillTIDVirtualReg(MRI);
+ unsigned Opcode;
+ switch(RC->getSize() * 8) {
+ case 32: Opcode = AMDGPU::SI_SPILL_V32_RESTORE; break;
+ case 64: Opcode = AMDGPU::SI_SPILL_V64_RESTORE; break;
+ case 96: Opcode = AMDGPU::SI_SPILL_V96_RESTORE; break;
+ case 128: Opcode = AMDGPU::SI_SPILL_V128_RESTORE; break;
+ case 256: Opcode = AMDGPU::SI_SPILL_V256_RESTORE; break;
+ case 512: Opcode = AMDGPU::SI_SPILL_V512_RESTORE; break;
+ default: llvm_unreachable("Cannot spill register class");
+ }
+ BuildMI(MBB, MI, DL, get(Opcode), DestReg)
+ .addReg(TmpReg)
+ .addImm(FrameIndex);
}
}
+unsigned SIInstrInfo::calculateLDSSpillAddress(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ unsigned TmpReg,
+ unsigned Offset,
+ unsigned Size) const {
+ SIMachineFunctionInfo *MFI = MBB.getParent()->getInfo<SIMachineFunctionInfo>();
+ DebugLoc DL = MBB.findDebugLoc(MI);
+ unsigned ThreadsInWave = 64;
+ unsigned LDSOffset = MFI->LDSSize + (Offset * ThreadsInWave);
+
+ // Get the wave id
+ BuildMI(MBB, MI, DL, get(AMDGPU::V_MBCNT_LO_U32_B32_e64), TmpReg)
+ .addImm(-1)
+ .addImm(0)
+ .addImm(0)
+ .addImm(0)
+ .addImm(0)
+ .addImm(0);
+
+ BuildMI(MBB, MI, DL, get(AMDGPU::V_MBCNT_HI_U32_B32_e32), TmpReg)
+ .addImm(-1)
+ .addReg(TmpReg);
+
+ // Multiply wave id by frame size to get this waves offset in LDS.
+ BuildMI(MBB, MI, DL, get(AMDGPU::V_LSHLREV_B32_e32), TmpReg)
+ .addImm(llvm::countTrailingZeros(Size))
+ .addReg(TmpReg);
+
+ // Add FrameIndex to LDS offset
+ BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_I32_e32), TmpReg)
+ .addImm(LDSOffset)
+ .addReg(TmpReg);
+
+ return TmpReg;
+}
+
static unsigned getNumSubRegsForSpillOp(unsigned Op) {
switch (Op) {
case AMDGPU::SI_SPILL_S512_SAVE:
case AMDGPU::SI_SPILL_S512_RESTORE:
+ case AMDGPU::SI_SPILL_V512_SAVE:
+ case AMDGPU::SI_SPILL_V512_RESTORE:
return 16;
case AMDGPU::SI_SPILL_S256_SAVE:
case AMDGPU::SI_SPILL_S256_RESTORE:
+ case AMDGPU::SI_SPILL_V256_SAVE:
+ case AMDGPU::SI_SPILL_V256_RESTORE:
return 8;
case AMDGPU::SI_SPILL_S128_SAVE:
case AMDGPU::SI_SPILL_S128_RESTORE:
+ case AMDGPU::SI_SPILL_V128_SAVE:
+ case AMDGPU::SI_SPILL_V128_RESTORE:
return 4;
case AMDGPU::SI_SPILL_S64_SAVE:
case AMDGPU::SI_SPILL_S64_RESTORE:
+ case AMDGPU::SI_SPILL_V64_SAVE:
+ case AMDGPU::SI_SPILL_V64_RESTORE:
return 2;
+ case AMDGPU::SI_SPILL_V32_SAVE:
+ case AMDGPU::SI_SPILL_V32_RESTORE:
+ return 1;
default: llvm_unreachable("Invalid spill opcode");
}
}
@@ -338,6 +418,84 @@ bool SIInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const {
break;
}
+ // VGPR register spill to LDS
+ case AMDGPU::SI_SPILL_V512_SAVE:
+ case AMDGPU::SI_SPILL_V256_SAVE:
+ case AMDGPU::SI_SPILL_V128_SAVE:
+ case AMDGPU::SI_SPILL_V64_SAVE:
+ case AMDGPU::SI_SPILL_V32_SAVE: {
+ unsigned NumSubRegs = getNumSubRegsForSpillOp(MI->getOpcode());
+ unsigned FrameIndex = MI->getOperand(2).getImm();
+ unsigned Offset = MFI->LDSSpillOffsets[FrameIndex];
+ unsigned Size = NumSubRegs * 4;
+ unsigned SrcReg = MI->getOperand(1).getReg();
+ unsigned TmpReg = MI->getOperand(0).getReg();
+ unsigned AddrReg = calculateLDSSpillAddress(MBB, MI, TmpReg,
+ Offset, Size);
+
+ // FIXME: We can use DS_WRITE_B64 here to optimize for larger registers.
+ for (unsigned i = 0, e = NumSubRegs; i != e; ++i) {
+ unsigned SubReg = NumSubRegs > 1 ?
+ RI.getPhysRegSubReg(SrcReg, &AMDGPU::VGPR_32RegClass, i) :
+ SrcReg;
+
+ if (i > 0) {
+ BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_I32_e32), AddrReg)
+ .addImm(4)
+ .addReg(AddrReg);
+ }
+
+ // Store the value in LDS
+ BuildMI(MBB, MI, DL, get(AMDGPU::DS_WRITE_B32))
+ .addImm(0) // gds
+ .addReg(AddrReg) // addr
+ .addReg(SubReg) // data0
+ .addReg(SubReg) // data1
+ .addImm(0) // offset0
+ .addImm(0); // offset1
+ }
+
+ MI->eraseFromParent();
+ break;
+ }
+ case AMDGPU::SI_SPILL_V32_RESTORE:
+ case AMDGPU::SI_SPILL_V64_RESTORE:
+ case AMDGPU::SI_SPILL_V128_RESTORE:
+ case AMDGPU::SI_SPILL_V256_RESTORE:
+ case AMDGPU::SI_SPILL_V512_RESTORE: {
+ unsigned NumSubRegs = getNumSubRegsForSpillOp(MI->getOpcode());
+ unsigned FrameIndex = MI->getOperand(2).getImm();
+ unsigned DstReg = MI->getOperand(0).getReg();
+ unsigned Offset = MFI->LDSSpillOffsets[FrameIndex];
+ unsigned Size = NumSubRegs * 4;
+ unsigned AddrReg = calculateLDSSpillAddress(MBB, MI,
+ MI->getOperand(1).getReg(),
+ Offset, Size);
+
+ // FIXME: We could use DS_READ_B64 here to optimize for larger registers.
+ for (unsigned i = 0, e = NumSubRegs; i != e; ++i) {
+ unsigned SubReg = NumSubRegs > 1 ?
+ RI.getPhysRegSubReg(DstReg, &AMDGPU::VGPR_32RegClass, i) :
+ DstReg;
+
+ if (i > 0) {
+ BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_I32_e32), AddrReg)
+ .addImm(4)
+ .addReg(AddrReg);
+ }
+ BuildMI(MBB, MI, DL, get(AMDGPU::DS_READ_B32), SubReg)
+ .addImm(0) // gds
+ .addReg(AddrReg) // addr
+ .addReg(AddrReg) // data0
+ .addReg(AddrReg) // data1
+ .addImm(0) // offset0
+ .addImm(0); // offset1
+ }
+ MI->eraseFromParent();
+ break;
+ }
+ }
+
return true;
}
diff --git a/lib/Target/R600/SIInstrInfo.h b/lib/Target/R600/SIInstrInfo.h
index c83d090d965..ee1fd7253a7 100644
--- a/lib/Target/R600/SIInstrInfo.h
+++ b/lib/Target/R600/SIInstrInfo.h
@@ -31,6 +31,14 @@ private:
unsigned MovRelOp,
unsigned Dst,
unsigned Src0) const;
+
+ unsigned calculateLDSSpillAddress(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ unsigned TmpReg,
+ unsigned Offset,
+ unsigned Size) const;
+
+
// If you add or remove instructions from this function, you will
public:
diff --git a/lib/Target/R600/SIInstructions.td b/lib/Target/R600/SIInstructions.td
index 32dbfba70e9..2be06951f73 100644
--- a/lib/Target/R600/SIInstructions.td
+++ b/lib/Target/R600/SIInstructions.td
@@ -1437,6 +1437,27 @@ defm SI_SPILL_S128 : SI_SPILL_SGPR <SReg_128>;
defm SI_SPILL_S256 : SI_SPILL_SGPR <SReg_256>;
defm SI_SPILL_S512 : SI_SPILL_SGPR <SReg_512>;
+multiclass SI_SPILL_VGPR <RegisterClass vgpr_class> {
+ def _SAVE : InstSI <
+ (outs VGPR_32:$temp),
+ (ins vgpr_class:$src, i32imm:$frame_idx),
+ "", []
+ >;
+
+ def _RESTORE : InstSI <
+ (outs vgpr_class:$dst, VGPR_32:$temp),
+ (ins i32imm:$frame_idx),
+ "", []
+ >;
+}
+
+defm SI_SPILL_V32 : SI_SPILL_VGPR <VReg_32>;
+defm SI_SPILL_V64 : SI_SPILL_VGPR <VReg_64>;
+defm SI_SPILL_V96 : SI_SPILL_VGPR <VReg_96>;
+defm SI_SPILL_V128 : SI_SPILL_VGPR <VReg_128>;
+defm SI_SPILL_V256 : SI_SPILL_VGPR <VReg_256>;
+defm SI_SPILL_V512 : SI_SPILL_VGPR <VReg_512>;
+
} // end IsCodeGenOnly, isPseudo
def : Pat<
diff --git a/lib/Target/R600/SIMachineFunctionInfo.cpp b/lib/Target/R600/SIMachineFunctionInfo.cpp
index 4f9fed1d474..8d01ad7c25a 100644
--- a/lib/Target/R600/SIMachineFunctionInfo.cpp
+++ b/lib/Target/R600/SIMachineFunctionInfo.cpp
@@ -23,8 +23,10 @@ void SIMachineFunctionInfo::anchor() {}
SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
: AMDGPUMachineFunction(MF),
+ SpillTIDVirtualReg(AMDGPU::NoRegister),
PSInputAddr(0),
- SpillTracker() { }
+ SpillTracker(),
+ LDSWaveSpillSize(0) { }
static unsigned createLaneVGPR(MachineRegisterInfo &MRI) {
return MRI.createVirtualRegister(&AMDGPU::VReg_32RegClass);
@@ -57,3 +59,21 @@ const SIMachineFunctionInfo::SpilledReg&
SIMachineFunctionInfo::RegSpillTracker::getSpilledReg(unsigned FrameIndex) {
return SpilledRegisters[FrameIndex];
}
+
+unsigned SIMachineFunctionInfo::getSpillTIDVirtualReg(
+ MachineRegisterInfo &MRI) {
+ if (SpillTIDVirtualReg == AMDGPU::NoRegister)
+ SpillTIDVirtualReg = MRI.createVirtualRegister(&AMDGPU::VReg_32RegClass);
+
+ return SpillTIDVirtualReg;
+}
+
+unsigned SIMachineFunctionInfo::allocateLDSSpaceForSpill(unsigned FrameIndex,
+ unsigned NumBytes) {
+ if (!LDSSpillOffsets.count(FrameIndex)) {
+ LDSSpillOffsets[FrameIndex] = LDSWaveSpillSize;
+ LDSWaveSpillSize += NumBytes;
+ }
+
+ return LDSSpillOffsets[FrameIndex];
+}
diff --git a/lib/Target/R600/SIMachineFunctionInfo.h b/lib/Target/R600/SIMachineFunctionInfo.h
index f78b1f36c7e..b4470de04f5 100644
--- a/lib/Target/R600/SIMachineFunctionInfo.h
+++ b/lib/Target/R600/SIMachineFunctionInfo.h
@@ -26,6 +26,9 @@ class MachineRegisterInfo;
/// tells the hardware which interpolation parameters to load.
class SIMachineFunctionInfo : public AMDGPUMachineFunction {
virtual void anchor();
+
+ unsigned SpillTIDVirtualReg;
+
public:
struct SpilledReg {
@@ -58,6 +61,12 @@ public:
SIMachineFunctionInfo(const MachineFunction &MF);
unsigned PSInputAddr;
struct RegSpillTracker SpillTracker;
+ unsigned LDSWaveSpillSize;
+ /// Key is FrameIndex, value is byte offset
+ std::map<unsigned, unsigned> LDSSpillOffsets;
+ unsigned getSpillTIDVirtualReg(MachineRegisterInfo &MRI);
+ /// Returns the wave local offset for this \p FrameIndex
+ unsigned allocateLDSSpaceForSpill(unsigned FrameIndex, unsigned NumBytes);
};
} // End namespace llvm