summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorTom Stellard <thomas.stellard@amd.com>2014-08-07 09:34:53 -0400
committerTom Stellard <thomas.stellard@amd.com>2014-11-06 12:02:42 -0500
commit777e1ca23d35be2019e5810fd62434ed5a13548d (patch)
tree98fb8d1d56a6a8f11057b1011d17a1fcf071daf5
parent1735e87b6a72e84c406dc52b89a162864525d457 (diff)
SelectionDAG: Fix incorrect lowering of CONCAT_VECTOR
SelectionDAGLegalize::ExpandVectorBuildThroughStack() was being used to expand CONCAT_VECTORS, but it was assuming that its inputs would only be BUILD_VECTOR nodes, so it was using the element type of the input node to calculate the offsets for storing values to the stack. For example: v16f32 = concat_vectors v8f32, v8f32 With this node, it would compute the offset as sizeof(f32), so it would store the first operand at FrameIndex + 0 and the second operand at FrameIndex + 4. What it should be doing for concat_vectors is using the value type of the operands to compute the offset. The second operand should be stored at FrameIndex + sizeof(v8f32). R600 currently hits this code path, however it should really be custom lowering CONCAT_VECTORS instead of expanding them since this will produce much better code. There are no testcases, because no other in-tree targets seem to hit this path, and I don't think there is much value in adding a test to R600, since I'm going to modify it to custom lower CONCAT_VECTORS real soon.
-rw-r--r--lib/CodeGen/SelectionDAG/LegalizeDAG.cpp11
-rw-r--r--lib/Target/R600/AMDGPUISelDAGToDAG.cpp47
-rw-r--r--lib/Target/R600/SIInstrInfo.cpp21
3 files changed, 63 insertions, 16 deletions
diff --git a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index d050e07a1ec..362cab84e5f 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -1501,6 +1501,7 @@ SDValue SelectionDAGLegalize::ExpandVectorBuildThroughStack(SDNode* Node) {
// Create the stack frame object.
EVT VT = Node->getValueType(0);
EVT EltVT = VT.getVectorElementType();
+ EVT SrcVT = Node->getOperand(0).getValueType();
SDLoc dl(Node);
SDValue FIPtr = DAG.CreateStackTemporary(VT);
int FI = cast<FrameIndexSDNode>(FIPtr.getNode())->getIndex();
@@ -1508,9 +1509,11 @@ SDValue SelectionDAGLegalize::ExpandVectorBuildThroughStack(SDNode* Node) {
// Emit a store of each element to the stack slot.
SmallVector<SDValue, 8> Stores;
- unsigned TypeByteSize = EltVT.getSizeInBits() / 8;
+ unsigned TypeByteSize = !SrcVT.isVector() ? EltVT.getStoreSize() :
+ SrcVT.getStoreSize();
// Store (in the right endianness) the elements to memory.
for (unsigned i = 0, e = Node->getNumOperands(); i != e; ++i) {
+ assert(Node->getOperand(i).getValueType() == SrcVT);
// Ignore undef elements.
if (Node->getOperand(i).getOpcode() == ISD::UNDEF) continue;
@@ -1520,8 +1523,10 @@ SDValue SelectionDAGLegalize::ExpandVectorBuildThroughStack(SDNode* Node) {
Idx = DAG.getNode(ISD::ADD, dl, FIPtr.getValueType(), FIPtr, Idx);
// If the destination vector element type is narrower than the source
- // element type, only store the bits necessary.
- if (EltVT.bitsLT(Node->getOperand(i).getValueType().getScalarType())) {
+ // element type, only store the bits necessary. This is for the case
+ // where a build vector has operand types larger than the destination
+ // element type. e.g. v4i16 = BUILD_VECTOR i32, i32, i32, i32
+ if (!SrcVT.isVector() && EltVT.bitsLT(SrcVT)) {
Stores.push_back(DAG.getTruncStore(DAG.getEntryNode(), dl,
Node->getOperand(i), Idx,
PtrInfo.getWithOffset(Offset),
diff --git a/lib/Target/R600/AMDGPUISelDAGToDAG.cpp b/lib/Target/R600/AMDGPUISelDAGToDAG.cpp
index 59f395bda61..b1937b90897 100644
--- a/lib/Target/R600/AMDGPUISelDAGToDAG.cpp
+++ b/lib/Target/R600/AMDGPUISelDAGToDAG.cpp
@@ -944,6 +944,53 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc,
return SelectMUBUFAddr64(Addr, SRsrc, VAddr, Offset);
}
+static SDValue buildSMovImm32(SelectionDAG *DAG, SDLoc DL, uint64_t Val) {
+ SDValue K = DAG->getTargetConstant(Val, MVT::i32);
+ return SDValue(DAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, K), 0);
+}
+
+static SDValue buildRSRC(SelectionDAG *DAG, SDLoc DL, SDValue Ptr,
+ uint32_t RsrcDword1, uint64_t RsrcDword2And3) {
+
+ SDValue PtrLo = DAG->getTargetExtractSubreg(AMDGPU::sub0, DL, MVT::i32, Ptr);
+ SDValue PtrHi = DAG->getTargetExtractSubreg(AMDGPU::sub1, DL, MVT::i32, Ptr);
+ if (RsrcDword1) {
+ PtrHi = SDValue(DAG->getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32, PtrHi,
+ DAG->getConstant(RsrcDword1, MVT::i32)), 0);
+ }
+
+ SDValue DataLo = buildSMovImm32(DAG, DL,
+ RsrcDword2And3 & UINT64_C(0xFFFFFFFF));
+ SDValue DataHi = buildSMovImm32(DAG, DL, RsrcDword2And3 >> 32);
+
+ const SDValue Ops[] = {
+ DAG->getTargetConstant(AMDGPU::SReg_128RegClassID, MVT::i32),
+ PtrLo,
+ DAG->getTargetConstant(AMDGPU::sub0, MVT::i32),
+ PtrHi,
+ DAG->getTargetConstant(AMDGPU::sub1, MVT::i32),
+ DataLo,
+ DAG->getTargetConstant(AMDGPU::sub2, MVT::i32),
+ DataHi,
+ DAG->getTargetConstant(AMDGPU::sub3, MVT::i32)
+ };
+
+ return SDValue(DAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL,
+ MVT::v4i32, Ops), 0);
+}
+
+/// \brief Return a resource descriptor with the 'Add TID' bit enabled
+/// The TID (Thread ID) is multiplied by the stride value (bits [61:48]
+/// of the resource descriptor) to create an offset, which is added to the
+/// resource pointer.
+static SDValue buildScratchRSRC(SelectionDAG *DAG, SDLoc DL, SDValue Ptr) {
+
+ uint64_t Rsrc = AMDGPU::RSRC_DATA_FORMAT | AMDGPU::RSRC_TID_ENABLE |
+ 0xffffffff; // Size
+
+ return buildRSRC(DAG, DL, Ptr, 0, Rsrc);
+}
+
bool AMDGPUDAGToDAGISel::SelectMUBUFScratch(SDValue Addr, SDValue &Rsrc,
SDValue &VAddr, SDValue &SOffset,
SDValue &ImmOffset) const {
diff --git a/lib/Target/R600/SIInstrInfo.cpp b/lib/Target/R600/SIInstrInfo.cpp
index 4de885c23d1..327b70786c7 100644
--- a/lib/Target/R600/SIInstrInfo.cpp
+++ b/lib/Target/R600/SIInstrInfo.cpp
@@ -49,7 +49,7 @@ static SDValue findChainOperand(SDNode *Load) {
/// \brief Returns true if both nodes have the same value for the given
/// operand \p Op, or if both nodes do not have this operand.
-static bool nodesHaveSameOperandValue(SDNode *N0, SDNode* N1, unsigned OpName) {
+static bool nodesHaveSameOperandValue(SDNode *N0, SDNode *N1, unsigned OpName) {
unsigned Opc0 = N0->getMachineOpcode();
unsigned Opc1 = N1->getMachineOpcode();
@@ -1563,25 +1563,22 @@ void SIInstrInfo::legalizeOperands(MachineInstr *MI) const {
unsigned SRsrcFormatLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
unsigned SRsrcFormatHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
unsigned NewSRsrc = MRI.createVirtualRegister(&AMDGPU::SReg_128RegClass);
+ DebugLoc DL = MI->getDebugLoc();
// Zero64 = 0
- BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B64),
- Zero64)
+ BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), Zero64)
.addImm(0);
// SRsrcFormatLo = RSRC_DATA_FORMAT{31-0}
- BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32),
- SRsrcFormatLo)
+ BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), SRsrcFormatLo)
.addImm(AMDGPU::RSRC_DATA_FORMAT & 0xFFFFFFFF);
// SRsrcFormatHi = RSRC_DATA_FORMAT{63-32}
- BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32),
- SRsrcFormatHi)
+ BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), SRsrcFormatHi)
.addImm(AMDGPU::RSRC_DATA_FORMAT >> 32);
// NewSRsrc = {Zero64, SRsrcFormat}
- BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::REG_SEQUENCE),
- NewSRsrc)
+ BuildMI(MBB, MI, DL, get(AMDGPU::REG_SEQUENCE), NewSRsrc)
.addReg(Zero64)
.addImm(AMDGPU::sub0_sub1)
.addReg(SRsrcFormatLo)
@@ -1600,15 +1597,13 @@ void SIInstrInfo::legalizeOperands(MachineInstr *MI) const {
NewVAddrHi = MRI.createVirtualRegister(&AMDGPU::VReg_32RegClass);
// NewVaddrLo = SRsrcPtrLo + VAddr:sub0
- BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::V_ADD_I32_e32),
- NewVAddrLo)
+ BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_I32_e32), NewVAddrLo)
.addReg(SRsrcPtrLo)
.addReg(VAddr->getReg(), 0, AMDGPU::sub0)
.addReg(AMDGPU::VCC, RegState::ImplicitDefine);
// NewVaddrHi = SRsrcPtrHi + VAddr:sub1
- BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::V_ADDC_U32_e32),
- NewVAddrHi)
+ BuildMI(MBB, MI, DL, get(AMDGPU::V_ADDC_U32_e32), NewVAddrHi)
.addReg(SRsrcPtrHi)
.addReg(VAddr->getReg(), 0, AMDGPU::sub1)
.addReg(AMDGPU::VCC, RegState::ImplicitDefine)