Diffstat (limited to 'lib/Target/AMDGPU/R600ISelLowering.cpp')
-rw-r--r--   lib/Target/AMDGPU/R600ISelLowering.cpp   265
1 file changed, 263 insertions, 2 deletions
diff --git a/lib/Target/AMDGPU/R600ISelLowering.cpp b/lib/Target/AMDGPU/R600ISelLowering.cpp
index eaeff4ec217..eb71905340d 100644
--- a/lib/Target/AMDGPU/R600ISelLowering.cpp
+++ b/lib/Target/AMDGPU/R600ISelLowering.cpp
@@ -18,6 +18,7 @@
 #include "R600MachineFunctionInfo.h"
 #include "llvm/Argument.h"
 #include "llvm/Function.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/SelectionDAG.h"
@@ -67,9 +68,20 @@ R600TargetLowering::R600TargetLowering(TargetMachine &TM) :
   setOperationAction(ISD::SELECT, MVT::i32, Custom);
   setOperationAction(ISD::SELECT, MVT::f32, Custom);
 
+  // Legalize loads and stores to the private address space.
+  setOperationAction(ISD::LOAD, MVT::i32, Custom);
+  setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
+  setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
+  setLoadExtAction(ISD::EXTLOAD, MVT::v4i8, Custom);
+  setLoadExtAction(ISD::EXTLOAD, MVT::i8, Custom);
+  setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i8, Custom);
+  setOperationAction(ISD::STORE, MVT::i8, Custom);
   setOperationAction(ISD::STORE, MVT::i32, Custom);
+  setOperationAction(ISD::STORE, MVT::v2i32, Custom);
   setOperationAction(ISD::STORE, MVT::v4i32, Custom);
 
+  setOperationAction(ISD::FrameIndex, MVT::i32, Custom);
+
   setTargetDAGCombine(ISD::FP_ROUND);
 
   setSchedulingPreference(Sched::VLIW);
@@ -150,6 +162,64 @@ MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter(
     break;
   }
 
+  case AMDGPU::RegisterLoad_i32: {
+    unsigned Channel = MI->getOperand(2).getImm();
+    unsigned IndirectRegOffset = TII->getIndirectIndexBegin(*MF);
+    unsigned Offset = (MI->getOperand(2).getImm() * 4) + Channel +
+                      (IndirectRegOffset * 4);
+    R600MachineFunctionInfo *MFI = MF->getInfo<R600MachineFunctionInfo>();
+    unsigned IndirectReg = MRI.createVirtualRegister(&AMDGPU::IndirectRegRegClass);
+    if (MI->getOperand(1).getReg() != AMDGPU::ZERO) {
+
+      MachineInstrBuilder Sequence = BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::REG_SEQUENCE), IndirectReg);
+
+      MachineInstr *MOVA = TII->buildDefaultInstruction(*BB, I,
+                                                        AMDGPU::MOVA_INT_eg,
+                                                        AMDGPU::AR_X,
+                                                        MI->getOperand(1).getReg());
+      TII->setImmOperand(MOVA, R600Operands::WRITE, 0);
+      unsigned OffsetReg = AMDGPU::R600_AddrRegClass.getRegister(Offset);
+      MachineInstrBuilder MIBuilder = TII->buildDefaultInstruction(*BB, I,
+                                          AMDGPU::MOV, MI->getOperand(0).getReg(),
+                                          OffsetReg);
+      for (unsigned i = Channel; i < MFI->IndirectRegs.size(); i+=4) {
+        unsigned Reg = MFI->IndirectRegs[i];
+        assert(Reg);
+        Sequence.addReg(MFI->IndirectRegs[i]);
+        Sequence.addImm(TII->getRegisterInfo().getIndirectSubReg(i));
+      }
+      MIBuilder.addReg(IndirectReg, RegState::Implicit | RegState::Kill);
+      MachineInstr *NewMI = MIBuilder.addReg(AMDGPU::AR_X, RegState::Implicit);
+      TII->setImmOperand(NewMI, R600Operands::SRC0_REL, 1);
+    } else {
+      BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::REG_SEQUENCE),
+              IndirectReg)
+              .addReg(MFI->IndirectRegs[Channel + (MI->getOperand(2).getImm() * 4)])
+              .addImm(TII->getRegisterInfo().getIndirectSubReg(Offset));
+      TII->buildDefaultInstruction(*BB, I, AMDGPU::MOV,
+                                   MI->getOperand(0).getReg(),
+                                   IndirectReg);
+    }
+
+    break;
+  }
+
+  case AMDGPU::RegisterStore_i32: {
+    if (MI->getOperand(1).getReg() != AMDGPU::ZERO) {
+      return BB;
+    }
+    unsigned CopyReg = MRI.createVirtualRegister(&AMDGPU::R600_TReg32RegClass);
+    unsigned Offset = MI->getOperand(2).getImm() + MI->getOperand(3).getImm();
+    R600MachineFunctionInfo *MFI = MF->getInfo<R600MachineFunctionInfo>();
+    unsigned DstReg = MFI->IndirectRegs[Offset];
+    unsigned IndirectReg = MFI->IndirectRegSequenceRegs[Offset];
+//    MRI.addLiveIn(IndirectReg);
+    assert(DstReg);
+    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::COPY), DstReg)
+            .addOperand(MI->getOperand(0));
+    break;
+  }
+
   case AMDGPU::RESERVE_REG: {
     R600MachineFunctionInfo * MFI = MF->getInfo<R600MachineFunctionInfo>();
     int64_t ReservedIndex = MI->getOperand(0).getImm();
@@ -359,8 +429,10 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const
   case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
   case ISD::SELECT: return LowerSELECT(Op, DAG);
   case ISD::SETCC: return LowerSETCC(Op, DAG);
+  case ISD::LOAD: return LowerLOAD(Op, DAG);
   case ISD::STORE: return LowerSTORE(Op, DAG);
   case ISD::FPOW: return LowerFPOW(Op, DAG);
+  case ISD::FrameIndex: return LowerFrameIndex(Op, DAG);
   case ISD::INTRINSIC_VOID: {
     SDValue Chain = Op.getOperand(0);
     unsigned IntrinsicID =
@@ -523,6 +595,20 @@ void R600TargetLowering::ReplaceNodeResults(SDNode *N,
   switch (N->getOpcode()) {
   default: return;
   case ISD::FP_TO_UINT: Results.push_back(LowerFPTOUINT(N->getOperand(0), DAG));
+    return;
+  case ISD::LOAD: {
+    SDNode *Node = LowerLOAD(SDValue(N, 0), DAG).getNode();
+    Results.push_back(SDValue(Node, 0));
+    Results.push_back(SDValue(Node, 1));
+    // XXX: LLVM seems not to replace Chain Value inside CustomWidenLowerNode
+    // function
+    DAG.ReplaceAllUsesOfValueWith(SDValue(N,1), SDValue(Node, 1));
+    return;
+  }
+  case ISD::STORE:
+    SDNode *Node = LowerSTORE(SDValue(N, 0), DAG).getNode();
+    Results.push_back(SDValue(Node, 0));
+    return;
   }
 }
 
@@ -590,6 +676,21 @@ SDValue R600TargetLowering::LowerImplicitParameter(SelectionDAG &DAG, EVT VT,
                      false, false, false, 0);
 }
 
+SDValue R600TargetLowering::LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const {
+
+  MachineFunction &MF = DAG.getMachineFunction();
+  const MachineFrameInfo *MFI = MF.getFrameInfo();
+  const AMDGPUFrameLowering *TFL =
+   static_cast<const AMDGPUFrameLowering*>(getTargetMachine().getFrameLowering());
+
+  FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Op);
+  assert(FIN);
+
+  unsigned FrameIndex = FIN->getIndex();
+  unsigned Offset = TFL->getFrameIndexOffset(MF, FrameIndex);
+  return DAG.getConstant(Offset * 4 * TFL->getStackWidth(MF), MVT::i32);
+}
+
 SDValue R600TargetLowering::LowerROTL(SDValue Op, SelectionDAG &DAG) const {
   DebugLoc DL = Op.getDebugLoc();
   EVT VT = Op.getValueType();
@@ -804,6 +905,118 @@ SDValue R600TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
   return Cond;
 }
 
+/// LLVM generates byte-addresed pointers.  For indirect addressing, we need to
+/// convert these pointers to a register index.  Each register holds
+/// 16 bytes, (4 x 32bit sub-register), but we need to take into account the
+/// \p StackWidth, which tells us how many of the 4 sub-registrers will be used
+/// for indirect addressing.
+SDValue R600TargetLowering::stackPtrToRegIndex(SDValue Ptr,
+                                               unsigned StackWidth,
+                                               SelectionDAG &DAG) const {
+  unsigned SRLPad;
+  switch(StackWidth) {
+  case 1:
+    SRLPad = 2;
+    break;
+  case 2:
+    SRLPad = 3;
+    break;
+  case 4:
+    SRLPad = 4;
+    break;
+  default: llvm_unreachable("Invalid stack width");
+  }
+
+  return DAG.getNode(ISD::SRL, Ptr.getDebugLoc(), Ptr.getValueType(), Ptr,
+                     DAG.getConstant(SRLPad, MVT::i32));
+}
+
+void R600TargetLowering::getStackAddress(unsigned StackWidth,
+                                         unsigned ElemIdx,
+                                         unsigned &Channel,
+                                         unsigned &PtrIncr) const {
+  switch (StackWidth) {
+  default:
+  case 1:
+    Channel = 0;
+    if (ElemIdx > 0) {
+      PtrIncr = 1;
+    } else {
+      PtrIncr = 0;
+    }
+    break;
+  case 2:
+    Channel = ElemIdx % 2;
+    if (ElemIdx == 2) {
+      PtrIncr = 1;
+    } else {
+      PtrIncr = 0;
+    }
+    break;
+  case 4:
+    Channel = ElemIdx;
+    PtrIncr = 0;
+    break;
+  }
+}
+
+SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
+  EVT VT = Op.getValueType();
+  DebugLoc DL = Op.getDebugLoc();
+  LoadSDNode *LoadNode = cast<LoadSDNode>(Op);
+  SDValue Chain = Op.getOperand(0);
+  SDValue Ptr = Op.getOperand(1);
+  SDValue LoweredLoad;
+
+  if (LoadNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
+    return SDValue();
+  }
+
+  // Lowering for indirect addressing
+  const MachineFunction &MF = DAG.getMachineFunction();
+  const AMDGPUFrameLowering *TFL = static_cast<const AMDGPUFrameLowering*>(
+                                         getTargetMachine().getFrameLowering());
+  unsigned StackWidth = TFL->getStackWidth(MF);
+
+  Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG);
+
+  if (VT.isVector()) {
+    unsigned NumElemVT = VT.getVectorNumElements();
+    EVT ElemVT = VT.getVectorElementType();
+    SDValue Loads[4];
+
+    assert(NumElemVT >= StackWidth && "Stack width cannot be greater than "
+                                      "vector width in load");
+
+    for (unsigned i = 0; i < NumElemVT; ++i) {
+      unsigned Channel, PtrIncr;
+      getStackAddress(StackWidth, i, Channel, PtrIncr);
+      Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr,
+                        DAG.getConstant(PtrIncr, MVT::i32));
+      Loads[i] = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, ElemVT,
+                             Chain, Ptr,
+                             DAG.getTargetConstant(Channel, MVT::i32),
+                             Op.getOperand(2));
+    }
+    for (unsigned i = NumElemVT; i < 4; ++i) {
+      Loads[i] = DAG.getUNDEF(ElemVT);
+    }
+    EVT TargetVT = EVT::getVectorVT(*DAG.getContext(), ElemVT, 4);
+    LoweredLoad = DAG.getNode(ISD::BUILD_VECTOR, DL, TargetVT, Loads, 4);
+  } else {
+    LoweredLoad = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, VT,
+                              Chain, Ptr,
+                              DAG.getTargetConstant(0, MVT::i32), // Channel
+                              Op.getOperand(2));
+  }
+
+  SDValue Ops[2];
+  Ops[0] = LoweredLoad;
+  Ops[1] = Chain;
+
+  return DAG.getMergeValues(Ops, 2, DL);
+}
+
 SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
   DebugLoc DL = Op.getDebugLoc();
   StoreSDNode *StoreNode = cast<StoreSDNode>(Op);
@@ -825,9 +1038,57 @@ SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
     }
     return Chain;
   }
-  return SDValue();
-}
+  EVT VT = Value.getValueType();
+  R600MachineFunctionInfo *MFI =
+                   DAG.getMachineFunction().getInfo<R600MachineFunctionInfo>();
+
+  if (StoreNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
+    return SDValue();
+  }
+
+  // Lowering for indirect addressing
+
+  const MachineFunction &MF = DAG.getMachineFunction();
+  const AMDGPUFrameLowering *TFL = static_cast<const AMDGPUFrameLowering*>(
+                                         getTargetMachine().getFrameLowering());
+  unsigned StackWidth = TFL->getStackWidth(MF);
+
+  Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG);
+
+  if (VT.isVector()) {
+    unsigned NumElemVT = VT.getVectorNumElements();
+    EVT ElemVT = VT.getVectorElementType();
+    SDValue Stores[4];
+
+    assert(NumElemVT >= StackWidth && "Stack width cannot be greater than "
+                                      "vector width in load");
+
+    for (unsigned i = 0; i < NumElemVT; ++i) {
+      unsigned Channel, PtrIncr;
+      getStackAddress(StackWidth, i, Channel, PtrIncr);
+      Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr,
+                        DAG.getConstant(PtrIncr, MVT::i32));
+      SDValue Elem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ElemVT,
+                                 Value, DAG.getConstant(i, MVT::i32));
+
+      Stores[i] = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other,
+                              Chain, Elem, Ptr,
+                              DAG.getTargetConstant(Channel, MVT::i32));
+      MFI->IndirectChannels.set(i);
+    }
+    Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores, NumElemVT);
+  } else {
+    if (VT == MVT::i8) {
+      Value = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Value);
+    }
+    Chain = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other, Chain, Value, Ptr,
+                        DAG.getTargetConstant(0, MVT::i32)); // Channel
+    MFI->IndirectChannels.set(0);
+  }
+
+  return Chain;
+}
 
 SDValue R600TargetLowering::LowerFPOW(SDValue Op, SelectionDAG &DAG) const {
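
The address arithmetic in this patch can be hard to follow from the DAG nodes alone. The standalone C++ sketch below is not part of the commit: byteAddrToRegIndex and stackAddress are illustrative stand-ins for the patch's stackPtrToRegIndex and getStackAddress, and the sample offset and StackWidth are made up. Assuming the same meaning of StackWidth (how many of a register's four 32-bit channels the private stack uses), it shows how a byte offset is turned into a register-slot index plus a per-element channel.

// Standalone sketch (not part of the patch): mirrors the patch's
// stackPtrToRegIndex/getStackAddress arithmetic with plain integers so the
// byte-address -> (slot, channel) mapping can be checked by hand.
#include <cassert>
#include <cstdio>

// Each private-stack slot is StackWidth 32-bit channels wide, so a byte
// address becomes a slot index by shifting right by log2(StackWidth * 4).
static unsigned byteAddrToRegIndex(unsigned ByteAddr, unsigned StackWidth) {
  unsigned SRLPad = 0;
  switch (StackWidth) {
  case 1: SRLPad = 2; break;  // 4-byte slots
  case 2: SRLPad = 3; break;  // 8-byte slots
  case 4: SRLPad = 4; break;  // 16-byte slots
  default: assert(false && "Invalid stack width"); break;
  }
  return ByteAddr >> SRLPad;
}

// Picks the channel for vector element ElemIdx and reports whether the slot
// index must be bumped before accessing it (same cases as getStackAddress).
static void stackAddress(unsigned StackWidth, unsigned ElemIdx,
                         unsigned &Channel, unsigned &PtrIncr) {
  switch (StackWidth) {
  default:
  case 1: Channel = 0;           PtrIncr = (ElemIdx > 0);  break;
  case 2: Channel = ElemIdx % 2; PtrIncr = (ElemIdx == 2); break;
  case 4: Channel = ElemIdx;     PtrIncr = 0;              break;
  }
}

int main() {
  // A v4i32 access at byte offset 16 with StackWidth == 2 spans two slots,
  // two channels each.
  unsigned Slot = byteAddrToRegIndex(16, 2); // 16 >> 3 == 2
  for (unsigned i = 0; i < 4; ++i) {
    unsigned Channel, PtrIncr;
    stackAddress(2, i, Channel, PtrIncr);
    Slot += PtrIncr;
    std::printf("elem %u -> slot %u, channel %u\n", i, Slot, Channel);
  }
  return 0;
}

With StackWidth == 2, the four elements land in two consecutive slots with two channels each, which is the same (PtrIncr, Channel) sequence the LowerLOAD and LowerSTORE loops above feed into the REGISTER_LOAD and REGISTER_STORE nodes.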