diff options
author | Tom Stellard <thomas.stellard@amd.com> | 2014-08-07 09:34:53 -0400 |
---|---|---|
committer | Tom Stellard <thomas.stellard@amd.com> | 2014-11-06 12:02:42 -0500 |
commit | 777e1ca23d35be2019e5810fd62434ed5a13548d (patch) | |
tree | 98fb8d1d56a6a8f11057b1011d17a1fcf071daf5 | |
parent | 1735e87b6a72e84c406dc52b89a162864525d457 (diff) |
SelectionDAG: Fix incorrect lowering of CONCAT_VECTORS
SelectionDAGLegalize::ExpandVectorBuildThroughStack() was being
used to expand CONCAT_VECTORS, but it was assuming that its inputs
would only be BUILD_VECTOR nodes, so it was using the element type
of the input node to calculate the offsets for storing values to
the stack. For example:
v16f32 = concat_vectors v8f32, v8f32
With this node, it would compute the offset as sizeof(f32),
so it would store the first operand at FrameIndex + 0 and the
second operand at FrameIndex + 4. What it should be doing
for concat_vectors is using the value type of the operands
to compute the offset. The second operand should be stored
at FrameIndex + sizeof(v8f32).
R600 currently hits this code path, however it should really be
custom lowering CONCAT_VECTORS instead of expanding them since
this will produce much better code. There are no test cases, because
no other in-tree targets seem to hit this path, and I don't think
there is much value in adding a test to R600, since I'm going
to modify it to custom lower CONCAT_VECTORS real soon.
-rw-r--r-- | lib/CodeGen/SelectionDAG/LegalizeDAG.cpp | 11 | ||||
-rw-r--r-- | lib/Target/R600/AMDGPUISelDAGToDAG.cpp | 47 | ||||
-rw-r--r-- | lib/Target/R600/SIInstrInfo.cpp | 21 |
3 files changed, 63 insertions, 16 deletions
diff --git a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp index d050e07a1ec..362cab84e5f 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp +++ b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp @@ -1501,6 +1501,7 @@ SDValue SelectionDAGLegalize::ExpandVectorBuildThroughStack(SDNode* Node) { // Create the stack frame object. EVT VT = Node->getValueType(0); EVT EltVT = VT.getVectorElementType(); + EVT SrcVT = Node->getOperand(0).getValueType(); SDLoc dl(Node); SDValue FIPtr = DAG.CreateStackTemporary(VT); int FI = cast<FrameIndexSDNode>(FIPtr.getNode())->getIndex(); @@ -1508,9 +1509,11 @@ SDValue SelectionDAGLegalize::ExpandVectorBuildThroughStack(SDNode* Node) { // Emit a store of each element to the stack slot. SmallVector<SDValue, 8> Stores; - unsigned TypeByteSize = EltVT.getSizeInBits() / 8; + unsigned TypeByteSize = !SrcVT.isVector() ? EltVT.getStoreSize() : + SrcVT.getStoreSize(); // Store (in the right endianness) the elements to memory. for (unsigned i = 0, e = Node->getNumOperands(); i != e; ++i) { + assert(Node->getOperand(i).getValueType() == SrcVT); // Ignore undef elements. if (Node->getOperand(i).getOpcode() == ISD::UNDEF) continue; @@ -1520,8 +1523,10 @@ SDValue SelectionDAGLegalize::ExpandVectorBuildThroughStack(SDNode* Node) { Idx = DAG.getNode(ISD::ADD, dl, FIPtr.getValueType(), FIPtr, Idx); // If the destination vector element type is narrower than the source - // element type, only store the bits necessary. - if (EltVT.bitsLT(Node->getOperand(i).getValueType().getScalarType())) { + // element type, only store the bits necessary. This is for the case + // where a build vector has operand types larger than the destination + // element type. e.g. 
v4i16 = BUILD_VECTOR i32, i32, i32, i32 + if (!SrcVT.isVector() && EltVT.bitsLT(SrcVT)) { Stores.push_back(DAG.getTruncStore(DAG.getEntryNode(), dl, Node->getOperand(i), Idx, PtrInfo.getWithOffset(Offset), diff --git a/lib/Target/R600/AMDGPUISelDAGToDAG.cpp b/lib/Target/R600/AMDGPUISelDAGToDAG.cpp index 59f395bda61..b1937b90897 100644 --- a/lib/Target/R600/AMDGPUISelDAGToDAG.cpp +++ b/lib/Target/R600/AMDGPUISelDAGToDAG.cpp @@ -944,6 +944,53 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc, return SelectMUBUFAddr64(Addr, SRsrc, VAddr, Offset); } +static SDValue buildSMovImm32(SelectionDAG *DAG, SDLoc DL, uint64_t Val) { + SDValue K = DAG->getTargetConstant(Val, MVT::i32); + return SDValue(DAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, K), 0); +} + +static SDValue buildRSRC(SelectionDAG *DAG, SDLoc DL, SDValue Ptr, + uint32_t RsrcDword1, uint64_t RsrcDword2And3) { + + SDValue PtrLo = DAG->getTargetExtractSubreg(AMDGPU::sub0, DL, MVT::i32, Ptr); + SDValue PtrHi = DAG->getTargetExtractSubreg(AMDGPU::sub1, DL, MVT::i32, Ptr); + if (RsrcDword1) { + PtrHi = SDValue(DAG->getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32, PtrHi, + DAG->getConstant(RsrcDword1, MVT::i32)), 0); + } + + SDValue DataLo = buildSMovImm32(DAG, DL, + RsrcDword2And3 & UINT64_C(0xFFFFFFFF)); + SDValue DataHi = buildSMovImm32(DAG, DL, RsrcDword2And3 >> 32); + + const SDValue Ops[] = { + DAG->getTargetConstant(AMDGPU::SReg_128RegClassID, MVT::i32), + PtrLo, + DAG->getTargetConstant(AMDGPU::sub0, MVT::i32), + PtrHi, + DAG->getTargetConstant(AMDGPU::sub1, MVT::i32), + DataLo, + DAG->getTargetConstant(AMDGPU::sub2, MVT::i32), + DataHi, + DAG->getTargetConstant(AMDGPU::sub3, MVT::i32) + }; + + return SDValue(DAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL, + MVT::v4i32, Ops), 0); +} + +/// \brief Return a resource descriptor with the 'Add TID' bit enabled +/// The TID (Thread ID) is multipled by the stride value (bits [61:48] +/// of the resource descriptor) to create an offset, 
which is added to the +/// resource pointer. +static SDValue buildScratchRSRC(SelectionDAG *DAG, SDLoc DL, SDValue Ptr) { + + uint64_t Rsrc = AMDGPU::RSRC_DATA_FORMAT | AMDGPU::RSRC_TID_ENABLE | + 0xffffffff; // Size + + return buildRSRC(DAG, DL, Ptr, 0, Rsrc); +} + bool AMDGPUDAGToDAGISel::SelectMUBUFScratch(SDValue Addr, SDValue &Rsrc, SDValue &VAddr, SDValue &SOffset, SDValue &ImmOffset) const { diff --git a/lib/Target/R600/SIInstrInfo.cpp b/lib/Target/R600/SIInstrInfo.cpp index 4de885c23d1..327b70786c7 100644 --- a/lib/Target/R600/SIInstrInfo.cpp +++ b/lib/Target/R600/SIInstrInfo.cpp @@ -49,7 +49,7 @@ static SDValue findChainOperand(SDNode *Load) { /// \brief Returns true if both nodes have the same value for the given /// operand \p Op, or if both nodes do not have this operand. -static bool nodesHaveSameOperandValue(SDNode *N0, SDNode* N1, unsigned OpName) { +static bool nodesHaveSameOperandValue(SDNode *N0, SDNode *N1, unsigned OpName) { unsigned Opc0 = N0->getMachineOpcode(); unsigned Opc1 = N1->getMachineOpcode(); @@ -1563,25 +1563,22 @@ void SIInstrInfo::legalizeOperands(MachineInstr *MI) const { unsigned SRsrcFormatLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); unsigned SRsrcFormatHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); unsigned NewSRsrc = MRI.createVirtualRegister(&AMDGPU::SReg_128RegClass); + DebugLoc DL = MI->getDebugLoc(); // Zero64 = 0 - BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B64), - Zero64) + BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), Zero64) .addImm(0); // SRsrcFormatLo = RSRC_DATA_FORMAT{31-0} - BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32), - SRsrcFormatLo) + BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), SRsrcFormatLo) .addImm(AMDGPU::RSRC_DATA_FORMAT & 0xFFFFFFFF); // SRsrcFormatHi = RSRC_DATA_FORMAT{63-32} - BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32), - SRsrcFormatHi) + BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), SRsrcFormatHi) .addImm(AMDGPU::RSRC_DATA_FORMAT >> 32); 
// NewSRsrc = {Zero64, SRsrcFormat} - BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::REG_SEQUENCE), - NewSRsrc) + BuildMI(MBB, MI, DL, get(AMDGPU::REG_SEQUENCE), NewSRsrc) .addReg(Zero64) .addImm(AMDGPU::sub0_sub1) .addReg(SRsrcFormatLo) @@ -1600,15 +1597,13 @@ void SIInstrInfo::legalizeOperands(MachineInstr *MI) const { NewVAddrHi = MRI.createVirtualRegister(&AMDGPU::VReg_32RegClass); // NewVaddrLo = SRsrcPtrLo + VAddr:sub0 - BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::V_ADD_I32_e32), - NewVAddrLo) + BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_I32_e32), NewVAddrLo) .addReg(SRsrcPtrLo) .addReg(VAddr->getReg(), 0, AMDGPU::sub0) .addReg(AMDGPU::VCC, RegState::ImplicitDefine); // NewVaddrHi = SRsrcPtrHi + VAddr:sub1 - BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::V_ADDC_U32_e32), - NewVAddrHi) + BuildMI(MBB, MI, DL, get(AMDGPU::V_ADDC_U32_e32), NewVAddrHi) .addReg(SRsrcPtrHi) .addReg(VAddr->getReg(), 0, AMDGPU::sub1) .addReg(AMDGPU::VCC, RegState::ImplicitDefine) |