diff options
author | Tom Stellard <thomas.stellard@amd.com> | 2014-08-07 09:34:53 -0400 |
---|---|---|
committer | Tom Stellard <thomas.stellard@amd.com> | 2014-11-06 12:02:42 -0500 |
commit | 777e1ca23d35be2019e5810fd62434ed5a13548d (patch) | |
tree | 98fb8d1d56a6a8f11057b1011d17a1fcf071daf5 | |
parent | 1735e87b6a72e84c406dc52b89a162864525d457 (diff) |
SelectionDAG: Fix incorrect lowering of CONCAT_VECTORS
SelectionDAGLegalize::ExpandVectorBuildThroughStack() was being
used to expand CONCAT_VECTORS, but it was assuming that its inputs
would only be BUILD_VECTOR nodes, so it was using the element type
of the input node to calculate the offsets for storing values to
the stack. For example:
v16f32 = concat_vectors v8f32, v8f32
With this node, it would compute the offset as sizeof(f32),
so it would store the first operand at FrameIndex + 0 and the
second operand at FrameIndex + 4. What it should be doing
for concat_vectors is using the value type of the operands
to compute the offset. The second operand should be stored
at FrameIndex + sizeof(v8f32).
R600 currently hits this code path, however it should really be
custom lowering CONCAT_VECTORS instead of expanding them since
this will produce much better code. There are no test cases, because
no other in-tree targets seem to hit this path, and I don't think
there is much value in adding a test to R600, since I'm going
to modify it to custom lower CONCAT_VECTORS real soon.
-rw-r--r-- | lib/CodeGen/SelectionDAG/LegalizeDAG.cpp | 11 | ||||
-rw-r--r-- | lib/Target/R600/AMDGPUISelDAGToDAG.cpp | 47 | ||||
-rw-r--r-- | lib/Target/R600/SIInstrInfo.cpp | 21 |
3 files changed, 63 insertions, 16 deletions
diff --git a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp index d050e07a1ec..362cab84e5f 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp +++ b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp @@ -1501,6 +1501,7 @@ SDValue SelectionDAGLegalize::ExpandVectorBuildThroughStack(SDNode* Node) { // Create the stack frame object. EVT VT = Node->getValueType(0); EVT EltVT = VT.getVectorElementType(); + EVT SrcVT = Node->getOperand(0).getValueType(); SDLoc dl(Node); SDValue FIPtr = DAG.CreateStackTemporary(VT); int FI = cast<FrameIndexSDNode>(FIPtr.getNode())->getIndex(); @@ -1508,9 +1509,11 @@ SDValue SelectionDAGLegalize::ExpandVectorBuildThroughStack(SDNode* Node) { // Emit a store of each element to the stack slot. SmallVector<SDValue, 8> Stores; - unsigned TypeByteSize = EltVT.getSizeInBits() / 8; + unsigned TypeByteSize = !SrcVT.isVector() ? EltVT.getStoreSize() : + SrcVT.getStoreSize(); // Store (in the right endianness) the elements to memory. for (unsigned i = 0, e = Node->getNumOperands(); i != e; ++i) { + assert(Node->getOperand(i).getValueType() == SrcVT); // Ignore undef elements. if (Node->getOperand(i).getOpcode() == ISD::UNDEF) continue; @@ -1520,8 +1523,10 @@ SDValue SelectionDAGLegalize::ExpandVectorBuildThroughStack(SDNode* Node) { Idx = DAG.getNode(ISD::ADD, dl, FIPtr.getValueType(), FIPtr, Idx); // If the destination vector element type is narrower than the source - // element type, only store the bits necessary. - if (EltVT.bitsLT(Node->getOperand(i).getValueType().getScalarType())) { + // element type, only store the bits necessary. This is for the case + // where a build vector has operand types larger than the destination + // element type. e.g. 
v4i16 = BUILD_VECTOR i32, i32, i32, i32 + if (!SrcVT.isVector() && EltVT.bitsLT(SrcVT)) { Stores.push_back(DAG.getTruncStore(DAG.getEntryNode(), dl, Node->getOperand(i), Idx, PtrInfo.getWithOffset(Offset), diff --git a/lib/Target/R600/AMDGPUISelDAGToDAG.cpp b/lib/Target/R600/AMDGPUISelDAGToDAG.cpp index 59f395bda61..b1937b90897 100644 --- a/lib/Target/R600/AMDGPUISelDAGToDAG.cpp +++ b/lib/Target/R600/AMDGPUISelDAGToDAG.cpp @@ -944,6 +944,53 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc, return SelectMUBUFAddr64(Addr, SRsrc, VAddr, Offset); } +static SDValue buildSMovImm32(SelectionDAG *DAG, SDLoc DL, uint64_t Val) { + SDValue K = DAG->getTargetConstant(Val, MVT::i32); + return SDValue(DAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, K), 0); +} + +static SDValue buildRSRC(SelectionDAG *DAG, SDLoc DL, SDValue Ptr, + uint32_t RsrcDword1, uint64_t RsrcDword2And3) { + + SDValue PtrLo = DAG->getTargetExtractSubreg(AMDGPU::sub0, DL, MVT::i32, Ptr); + SDValue PtrHi = DAG->getTargetExtractSubreg(AMDGPU::sub1, DL, MVT::i32, Ptr); + if (RsrcDword1) { + PtrHi = SDValue(DAG->getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32, PtrHi, + DAG->getConstant(RsrcDword1, MVT::i32)), 0); + } + + SDValue DataLo = buildSMovImm32(DAG, DL, + RsrcDword2And3 & UINT64_C(0xFFFFFFFF)); + SDValue DataHi = buildSMovImm32(DAG, DL, RsrcDword2And3 >> 32); + + const SDValue Ops[] = { + DAG->getTargetConstant(AMDGPU::SReg_128RegClassID, MVT::i32), + PtrLo, + DAG->getTargetConstant(AMDGPU::sub0, MVT::i32), + PtrHi, + DAG->getTargetConstant(AMDGPU::sub1, MVT::i32), + DataLo, + DAG->getTargetConstant(AMDGPU::sub2, MVT::i32), + DataHi, + DAG->getTargetConstant(AMDGPU::sub3, MVT::i32) + }; + + return SDValue(DAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL, + MVT::v4i32, Ops), 0); +} + +/// \brief Return a resource descriptor with the 'Add TID' bit enabled +/// The TID (Thread ID) is multipled by the stride value (bits [61:48] +/// of the resource descriptor) to create an offset, 
which is added to the +/// resource pointer. +static SDValue buildScratchRSRC(SelectionDAG *DAG, SDLoc DL, SDValue Ptr) { + + uint64_t Rsrc = AMDGPU::RSRC_DATA_FORMAT | AMDGPU::RSRC_TID_ENABLE | + 0xffffffff; // Size + + return buildRSRC(DAG, DL, Ptr, 0, Rsrc); +} + bool AMDGPUDAGToDAGISel::SelectMUBUFScratch(SDValue Addr, SDValue &Rsrc, SDValue &VAddr, SDValue &SOffset, SDValue &ImmOffset) const { diff --git a/lib/Target/R600/SIInstrInfo.cpp b/lib/Target/R600/SIInstrInfo.cpp index 4de885c23d1..327b70786c7 100644 --- a/lib/Target/R600/SIInstrInfo.cpp +++ b/lib/Target/R600/SIInstrInfo.cpp @@ -49,7 +49,7 @@ static SDValue findChainOperand(SDNode *Load) { /// \brief Returns true if both nodes have the same value for the given /// operand \p Op, or if both nodes do not have this operand. -static bool nodesHaveSameOperandValue(SDNode *N0, SDNode* N1, unsigned OpName) { +static bool nodesHaveSameOperandValue(SDNode *N0, SDNode *N1, unsigned OpName) { unsigned Opc0 = N0->getMachineOpcode(); unsigned Opc1 = N1->getMachineOpcode(); @@ -1563,25 +1563,22 @@ void SIInstrInfo::legalizeOperands(MachineInstr *MI) const { unsigned SRsrcFormatLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); unsigned SRsrcFormatHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); unsigned NewSRsrc = MRI.createVirtualRegister(&AMDGPU::SReg_128RegClass); + DebugLoc DL = MI->getDebugLoc(); // Zero64 = 0 - BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B64), - Zero64) + BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), Zero64) .addImm(0); // SRsrcFormatLo = RSRC_DATA_FORMAT{31-0} - BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32), - SRsrcFormatLo) + BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), SRsrcFormatLo) .addImm(AMDGPU::RSRC_DATA_FORMAT & 0xFFFFFFFF); // SRsrcFormatHi = RSRC_DATA_FORMAT{63-32} - BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32), - SRsrcFormatHi) + BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), SRsrcFormatHi) .addImm(AMDGPU::RSRC_DATA_FORMAT >> 32); 
// NewSRsrc = {Zero64, SRsrcFormat} - BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::REG_SEQUENCE), - NewSRsrc) + BuildMI(MBB, MI, DL, get(AMDGPU::REG_SEQUENCE), NewSRsrc) .addReg(Zero64) .addImm(AMDGPU::sub0_sub1) .addReg(SRsrcFormatLo) @@ -1600,15 +1597,13 @@ void SIInstrInfo::legalizeOperands(MachineInstr *MI) const { NewVAddrHi = MRI.createVirtualRegister(&AMDGPU::VReg_32RegClass); // NewVaddrLo = SRsrcPtrLo + VAddr:sub0 - BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::V_ADD_I32_e32), - NewVAddrLo) + BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_I32_e32), NewVAddrLo) .addReg(SRsrcPtrLo) .addReg(VAddr->getReg(), 0, AMDGPU::sub0) .addReg(AMDGPU::VCC, RegState::ImplicitDefine); // NewVaddrHi = SRsrcPtrHi + VAddr:sub1 - BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::V_ADDC_U32_e32), - NewVAddrHi) + BuildMI(MBB, MI, DL, get(AMDGPU::V_ADDC_U32_e32), NewVAddrHi) .addReg(SRsrcPtrHi) .addReg(VAddr->getReg(), 0, AMDGPU::sub1) .addReg(AMDGPU::VCC, RegState::ImplicitDefine) |