Diffstat (limited to 'lib/Target/AMDGPU/R600ISelLowering.cpp')
-rw-r--r--  lib/Target/AMDGPU/R600ISelLowering.cpp | 265
1 file changed, 263 insertions(+), 2 deletions(-)
diff --git a/lib/Target/AMDGPU/R600ISelLowering.cpp b/lib/Target/AMDGPU/R600ISelLowering.cpp
index eaeff4ec217..eb71905340d 100644
--- a/lib/Target/AMDGPU/R600ISelLowering.cpp
+++ b/lib/Target/AMDGPU/R600ISelLowering.cpp
@@ -18,6 +18,7 @@
#include "R600MachineFunctionInfo.h"
#include "llvm/Argument.h"
#include "llvm/Function.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/SelectionDAG.h"
@@ -67,9 +68,20 @@ R600TargetLowering::R600TargetLowering(TargetMachine &TM) :
setOperationAction(ISD::SELECT, MVT::i32, Custom);
setOperationAction(ISD::SELECT, MVT::f32, Custom);
+ // Legalize loads and stores to the private address space.
+ setOperationAction(ISD::LOAD, MVT::i32, Custom);
+ setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
+ setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
+ setLoadExtAction(ISD::EXTLOAD, MVT::v4i8, Custom);
+ setLoadExtAction(ISD::EXTLOAD, MVT::i8, Custom);
+ setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i8, Custom);
+ setOperationAction(ISD::STORE, MVT::i8, Custom);
setOperationAction(ISD::STORE, MVT::i32, Custom);
+ setOperationAction(ISD::STORE, MVT::v2i32, Custom);
setOperationAction(ISD::STORE, MVT::v4i32, Custom);
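+ // Custom lower frame indices so they become constant offsets into the
+ // indirect-addressing register stack (see LowerFrameIndex()).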
+ setOperationAction(ISD::FrameIndex, MVT::i32, Custom);
+
setTargetDAGCombine(ISD::FP_ROUND);
setSchedulingPreference(Sched::VLIW);
@@ -150,6 +162,64 @@ MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter(
break;
}
+ case AMDGPU::RegisterLoad_i32: {
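+ // Lower the RegisterLoad pseudo instruction. For a dynamic index, load
+ // the index into AR_X with MOVA and emit a relative MOV over a
+ // REG_SEQUENCE of the indirect registers; for a constant (ZERO) index,
+ // read the selected indirect register directly.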
+ unsigned Channel = MI->getOperand(2).getImm();
+ unsigned IndirectRegOffset = TII->getIndirectIndexBegin(*MF);
+ unsigned Offset = (MI->getOperand(2).getImm() * 4) + Channel +
+ (IndirectRegOffset * 4);
+ R600MachineFunctionInfo *MFI = MF->getInfo<R600MachineFunctionInfo>();
+ unsigned IndirectReg = MRI.createVirtualRegister(&AMDGPU::IndirectRegRegClass);
+ if (MI->getOperand(1).getReg() != AMDGPU::ZERO) {
+
+ MachineInstrBuilder Sequence = BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::REG_SEQUENCE), IndirectReg);
+
+ MachineInstr *MOVA = TII->buildDefaultInstruction(*BB, I,
+ AMDGPU::MOVA_INT_eg,
+ AMDGPU::AR_X,
+ MI->getOperand(1).getReg());
+ TII->setImmOperand(MOVA, R600Operands::WRITE, 0);
+ unsigned OffsetReg = AMDGPU::R600_AddrRegClass.getRegister(Offset);
+ MachineInstrBuilder MIBuilder = TII->buildDefaultInstruction(*BB, I,
+ AMDGPU::MOV, MI->getOperand(0).getReg(),
+ OffsetReg);
+ for (unsigned i = Channel; i < MFI->IndirectRegs.size(); i+=4) {
+ unsigned Reg = MFI->IndirectRegs[i];
+ assert(Reg);
+ Sequence.addReg(Reg);
+ Sequence.addImm(TII->getRegisterInfo().getIndirectSubReg(i));
+ }
+ MIBuilder.addReg(IndirectReg, RegState::Implicit | RegState::Kill);
+ MachineInstr *NewMI = MIBuilder.addReg(AMDGPU::AR_X, RegState::Implicit);
+ TII->setImmOperand(NewMI, R600Operands::SRC0_REL, 1);
+ } else {
+ BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::REG_SEQUENCE),
+ IndirectReg)
+ .addReg(MFI->IndirectRegs[Channel + (MI->getOperand(2).getImm() * 4)])
+ .addImm(TII->getRegisterInfo().getIndirectSubReg(Offset));
+ TII->buildDefaultInstruction(*BB, I, AMDGPU::MOV,
+ MI->getOperand(0).getReg(),
+ IndirectReg);
+ }
+
+ break;
+ }
+
+ case AMDGPU::RegisterStore_i32: {
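+ // Lower the RegisterStore pseudo instruction. Only constant (ZERO-indexed)
+ // stores are handled here; dynamically indexed stores are left untouched.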
+ if (MI->getOperand(1).getReg() != AMDGPU::ZERO) {
+ return BB;
+ }
+ unsigned Offset = MI->getOperand(2).getImm() + MI->getOperand(3).getImm();
+ R600MachineFunctionInfo *MFI = MF->getInfo<R600MachineFunctionInfo>();
+ unsigned DstReg = MFI->IndirectRegs[Offset];
+ assert(DstReg);
+ BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::COPY), DstReg)
+ .addOperand(MI->getOperand(0));
+ break;
+ }
+
case AMDGPU::RESERVE_REG: {
R600MachineFunctionInfo * MFI = MF->getInfo<R600MachineFunctionInfo>();
int64_t ReservedIndex = MI->getOperand(0).getImm();
@@ -359,8 +429,10 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const
case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
case ISD::SELECT: return LowerSELECT(Op, DAG);
case ISD::SETCC: return LowerSETCC(Op, DAG);
+ case ISD::LOAD: return LowerLOAD(Op, DAG);
case ISD::STORE: return LowerSTORE(Op, DAG);
case ISD::FPOW: return LowerFPOW(Op, DAG);
+ case ISD::FrameIndex: return LowerFrameIndex(Op, DAG);
case ISD::INTRINSIC_VOID: {
SDValue Chain = Op.getOperand(0);
unsigned IntrinsicID =
@@ -523,6 +595,20 @@ void R600TargetLowering::ReplaceNodeResults(SDNode *N,
switch (N->getOpcode()) {
default: return;
case ISD::FP_TO_UINT: Results.push_back(LowerFPTOUINT(N->getOperand(0), DAG));
+ return;
+ case ISD::LOAD: {
+ SDNode *Node = LowerLOAD(SDValue(N, 0), DAG).getNode();
+ Results.push_back(SDValue(Node, 0));
+ Results.push_back(SDValue(Node, 1));
+ // XXX: LLVM does not seem to replace the chain value inside
+ // CustomWidenLowerNode, so do it explicitly here.
+ DAG.ReplaceAllUsesOfValueWith(SDValue(N,1), SDValue(Node, 1));
+ return;
+ }
+ case ISD::STORE: {
+ SDNode *Node = LowerSTORE(SDValue(N, 0), DAG).getNode();
+ Results.push_back(SDValue(Node, 0));
+ return;
+ }
}
}
@@ -590,6 +676,21 @@ SDValue R600TargetLowering::LowerImplicitParameter(SelectionDAG &DAG, EVT VT,
false, false, false, 0);
}
+SDValue R600TargetLowering::LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const {
+ MachineFunction &MF = DAG.getMachineFunction();
+ const AMDGPUFrameLowering *TFL =
+ static_cast<const AMDGPUFrameLowering*>(getTargetMachine().getFrameLowering());
+
+ FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Op);
+ assert(FIN);
+
+ unsigned FrameIndex = FIN->getIndex();
+ unsigned Offset = TFL->getFrameIndexOffset(MF, FrameIndex);
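+ // Scale the frame-index offset to bytes: each stack slot spans StackWidth
+ // channels of 4 bytes each. (This assumes getFrameIndexOffset() returns
+ // the offset in stack slots.)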
+ return DAG.getConstant(Offset * 4 * TFL->getStackWidth(MF), MVT::i32);
+}
+
SDValue R600TargetLowering::LowerROTL(SDValue Op, SelectionDAG &DAG) const {
DebugLoc DL = Op.getDebugLoc();
EVT VT = Op.getValueType();
@@ -804,6 +905,118 @@ SDValue R600TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
return Cond;
}
+/// LLVM generates byte-addressed pointers. For indirect addressing, we need
+/// to convert these pointers to a register index. Each register holds
+/// 16 bytes (4 x 32-bit sub-registers), but we need to take into account the
+/// \p StackWidth, which tells us how many of the 4 sub-registers will be used
+/// for indirect addressing.
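+/// For example, with a StackWidth of 1 only the X channel is used, so byte
+/// address 8 refers to the third 32-bit slot and becomes register index 2
+/// (8 >> 2).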
+SDValue R600TargetLowering::stackPtrToRegIndex(SDValue Ptr,
+ unsigned StackWidth,
+ SelectionDAG &DAG) const {
+ unsigned SRLPad;
+ switch(StackWidth) {
+ case 1:
+ SRLPad = 2;
+ break;
+ case 2:
+ SRLPad = 3;
+ break;
+ case 4:
+ SRLPad = 4;
+ break;
+ default: llvm_unreachable("Invalid stack width");
+ }
+
+ return DAG.getNode(ISD::SRL, Ptr.getDebugLoc(), Ptr.getValueType(), Ptr,
+ DAG.getConstant(SRLPad, MVT::i32));
+}
+
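+/// Compute which channel of a register (\p Channel) and how far to advance
+/// the register index (\p PtrIncr) for element \p ElemIdx of a vector, given
+/// that \p StackWidth channels of each register are in use. For example,
+/// with a StackWidth of 2, element 2 of a v4 value lands in channel 0 of the
+/// next register row (Channel = 0, PtrIncr = 1).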
+void R600TargetLowering::getStackAddress(unsigned StackWidth,
+ unsigned ElemIdx,
+ unsigned &Channel,
+ unsigned &PtrIncr) const {
+ switch (StackWidth) {
+ default:
+ case 1:
+ Channel = 0;
+ if (ElemIdx > 0) {
+ PtrIncr = 1;
+ } else {
+ PtrIncr = 0;
+ }
+ break;
+ case 2:
+ Channel = ElemIdx % 2;
+ if (ElemIdx == 2) {
+ PtrIncr = 1;
+ } else {
+ PtrIncr = 0;
+ }
+ break;
+ case 4:
+ Channel = ElemIdx;
+ PtrIncr = 0;
+ break;
+ }
+}
+
+SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
+ EVT VT = Op.getValueType();
+ DebugLoc DL = Op.getDebugLoc();
+ LoadSDNode *LoadNode = cast<LoadSDNode>(Op);
+ SDValue Chain = Op.getOperand(0);
+ SDValue Ptr = Op.getOperand(1);
+ SDValue LoweredLoad;
+
+ if (LoadNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
+ return SDValue();
+ }
+
+ // Lowering for indirect addressing
+ const MachineFunction &MF = DAG.getMachineFunction();
+ const AMDGPUFrameLowering *TFL = static_cast<const AMDGPUFrameLowering*>(
+ getTargetMachine().getFrameLowering());
+ unsigned StackWidth = TFL->getStackWidth(MF);
+
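+ // Convert the byte-addressed private pointer into an indirect register
+ // index.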
+ Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG);
+
+ if (VT.isVector()) {
+ unsigned NumElemVT = VT.getVectorNumElements();
+ EVT ElemVT = VT.getVectorElementType();
+ SDValue Loads[4];
+
+ assert(NumElemVT >= StackWidth && "Stack width cannot be greater than "
+ "vector width in load");
+
+ for (unsigned i = 0; i < NumElemVT; ++i) {
+ unsigned Channel, PtrIncr;
+ getStackAddress(StackWidth, i, Channel, PtrIncr);
+ Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr,
+ DAG.getConstant(PtrIncr, MVT::i32));
+ Loads[i] = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, ElemVT,
+ Chain, Ptr,
+ DAG.getTargetConstant(Channel, MVT::i32),
+ Op.getOperand(2));
+ }
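+ // Pad the remaining lanes with undef so we always build a full 4-element
+ // vector.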
+ for (unsigned i = NumElemVT; i < 4; ++i) {
+ Loads[i] = DAG.getUNDEF(ElemVT);
+ }
+ EVT TargetVT = EVT::getVectorVT(*DAG.getContext(), ElemVT, 4);
+ LoweredLoad = DAG.getNode(ISD::BUILD_VECTOR, DL, TargetVT, Loads, 4);
+ } else {
+ LoweredLoad = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, VT,
+ Chain, Ptr,
+ DAG.getTargetConstant(0, MVT::i32), // Channel
+ Op.getOperand(2));
+ }
+
+ SDValue Ops[2];
+ Ops[0] = LoweredLoad;
+ Ops[1] = Chain;
+
+ return DAG.getMergeValues(Ops, 2, DL);
+}
+
SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
DebugLoc DL = Op.getDebugLoc();
StoreSDNode *StoreNode = cast<StoreSDNode>(Op);
@@ -825,9 +1038,57 @@ SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
}
return Chain;
}
- return SDValue();
-}
+ EVT VT = Value.getValueType();
+ R600MachineFunctionInfo *MFI =
+ DAG.getMachineFunction().getInfo<R600MachineFunctionInfo>();
+
+ if (StoreNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
+ return SDValue();
+ }
+
+ // Lowering for indirect addressing
+ const MachineFunction &MF = DAG.getMachineFunction();
+ const AMDGPUFrameLowering *TFL = static_cast<const AMDGPUFrameLowering*>(
+ getTargetMachine().getFrameLowering());
+ unsigned StackWidth = TFL->getStackWidth(MF);
+
+ Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG);
+
+ if (VT.isVector()) {
+ unsigned NumElemVT = VT.getVectorNumElements();
+ EVT ElemVT = VT.getVectorElementType();
+ SDValue Stores[4];
+
+ assert(NumElemVT >= StackWidth && "Stack width cannot be greater than "
+ "vector width in store");
+
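+ // Emit one scalar REGISTER_STORE per element, advancing the register
+ // index between rows as computed by getStackAddress().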
+ for (unsigned i = 0; i < NumElemVT; ++i) {
+ unsigned Channel, PtrIncr;
+ getStackAddress(StackWidth, i, Channel, PtrIncr);
+ Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr,
+ DAG.getConstant(PtrIncr, MVT::i32));
+ SDValue Elem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ElemVT,
+ Value, DAG.getConstant(i, MVT::i32));
+
+ Stores[i] = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other,
+ Chain, Elem, Ptr,
+ DAG.getTargetConstant(Channel, MVT::i32));
+ MFI->IndirectChannels.set(i);
+ }
+ Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores, NumElemVT);
+ } else {
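+ // Scalar path: i8 values are zero-extended to 32 bits first, since each
+ // channel is a full 32-bit register.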
+ if (VT == MVT::i8) {
+ Value = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Value);
+ }
+ Chain = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other, Chain, Value, Ptr,
+ DAG.getTargetConstant(0, MVT::i32)); // Channel
+ MFI->IndirectChannels.set(0);
+ }
+
+ return Chain;
+}
SDValue R600TargetLowering::LowerFPOW(SDValue Op,
SelectionDAG &DAG) const {