summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorTom Stellard <thomas.stellard@amd.com>2012-11-15 21:32:31 +0000
committerTom Stellard <thomas.stellard@amd.com>2012-11-16 17:53:28 -0500
commit1c4628988f1cb7bdd93321a86fd08101df843343 (patch)
tree5fee20eb33b0b1e6b97b12ee32a4ce0ab6d4fbd6
parent2cfbd43a3917a2608c80b16c890ef052051d93d1 (diff)
XXX: First try at LDS.indirect-wip-3
What's missing:
+ LDS uses dword offsets, but the current code uses byte offsets.
+ I think we need to allocate LDS somehow. There are config registers for this for Pixel Shaders, but there may be a way to allocate from within the shader.

This patch comments out (with #if 0) the setOperationAction calls for lowering loads/stores to indirect addressing, only to make it easier to test.
-rw-r--r--lib/Target/AMDGPU/AMDGPUInstructions.td9
-rw-r--r--lib/Target/AMDGPU/R600ISelLowering.cpp30
-rw-r--r--lib/Target/AMDGPU/R600Instructions.td97
-rw-r--r--lib/Target/AMDGPU/R600RegisterInfo.td3
-rw-r--r--lib/Target/AMDGPU/R600Schedule.td2
5 files changed, 135 insertions, 6 deletions
diff --git a/lib/Target/AMDGPU/AMDGPUInstructions.td b/lib/Target/AMDGPU/AMDGPUInstructions.td
index 443c6d4bc5b..4b0eb26d959 100644
--- a/lib/Target/AMDGPU/AMDGPUInstructions.td
+++ b/lib/Target/AMDGPU/AMDGPUInstructions.td
@@ -84,6 +84,15 @@ def zextloadi8_global : PatFrag<(ops node:$ptr), (zextloadi8 node:$ptr), [{
return isGlobalLoad(dyn_cast<LoadSDNode>(N));
}]>;
+def private_load : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+ return isPrivateLoad(dyn_cast<LoadSDNode>(N)); // Match a plain `load` only when isPrivateLoad accepts the node (presumably a private-address-space check; the predicate is defined elsewhere).
+}]>;
+
+def private_store : PatFrag<(ops node:$val, node:$ptr),
+ (store node:$val, node:$ptr), [{
+ return isPrivateStore(dyn_cast<StoreSDNode>(N)); // Match a plain `store` only when isPrivateStore accepts the node (presumably a private-address-space check; the predicate is defined elsewhere).
+}]>;
+
class Constants {
int TWO_PI = 0x40c90fdb;
int PI = 0x40490fdb;
diff --git a/lib/Target/AMDGPU/R600ISelLowering.cpp b/lib/Target/AMDGPU/R600ISelLowering.cpp
index 0eb5eed8fb8..eee1bc49c20 100644
--- a/lib/Target/AMDGPU/R600ISelLowering.cpp
+++ b/lib/Target/AMDGPU/R600ISelLowering.cpp
@@ -69,6 +69,7 @@ R600TargetLowering::R600TargetLowering(TargetMachine &TM) :
setOperationAction(ISD::SELECT, MVT::f32, Custom);
// Legalize loads and stores to the private address space.
+#if 0
setOperationAction(ISD::LOAD, MVT::f32, Custom);
setOperationAction(ISD::LOAD, MVT::i32, Custom);
setOperationAction(ISD::LOAD, MVT::v2f32, Custom);
@@ -85,6 +86,7 @@ R600TargetLowering::R600TargetLowering(TargetMachine &TM) :
setOperationAction(ISD::STORE, MVT::v2i32, Custom);
setOperationAction(ISD::STORE, MVT::v4f32, Custom);
setOperationAction(ISD::STORE, MVT::v4i32, Custom);
+#endif
setOperationAction(ISD::FrameIndex, MVT::i32, Custom);
@@ -141,6 +143,34 @@ MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter(
break;
}
+ case AMDGPU::LDS_READ_RET:
+ {
+ // Convert to DWORD address: NewAddr = LSHR_eg(operand 1, ShiftValue), with ShiftValue set to 2.
+ unsigned NewAddr = MRI.createVirtualRegister(
+ &AMDGPU::R600_TReg32_XRegClass);
+ unsigned ShiftValue = MRI.createVirtualRegister(
+ &AMDGPU::R600_TReg32RegClass);
+ unsigned EOP = (llvm::next(I)->getOpcode() == AMDGPU::RETURN) ? 1 : 0;
+ // NOTE(review): EOP is not read again in this case, and llvm::next(I) is dereferenced with no end-of-block check -- confirm.
+ // XXX In theory ShiftValue could be handed straight to LSHR_eg as an
+ // inline literal, but that did not produce the correct results, so the
+ // shift amount is first materialized in a register via buildMovImm.
+ TII->buildMovImm(*BB, I, ShiftValue, 2);
+ TII->buildDefaultInstruction(*BB, I, AMDGPU::LSHR_eg, NewAddr,
+ MI->getOperand(1).getReg(),
+ ShiftValue);
+ MachineInstrBuilder NewMI = BuildMI(*BB, I, BB->findDebugLoc(I),
+ TII->get(MI->getOpcode()),
+ AMDGPU::OQA); // Re-emit the same opcode with OQA as its def.
+ for (unsigned i = 1, e = MI->getNumOperands(); i < e; ++i) {
+ NewMI.addOperand(MI->getOperand(i)); // NOTE(review): operand 1 is copied unchanged, so NewAddr is not consumed here -- confirm intent.
+ }
+ TII->buildDefaultInstruction(*BB, I, AMDGPU::MOV,
+ MI->getOperand(0).getReg(),
+ AMDGPU::OQA); // Copy OQA into the original destination register (operand 0 of MI).
+ break;
+ }
+
case AMDGPU::MASK_WRITE:
{
unsigned maskedRegister = MI->getOperand(0).getReg();
diff --git a/lib/Target/AMDGPU/R600Instructions.td b/lib/Target/AMDGPU/R600Instructions.td
index ed9164c695d..cbbe4b38c24 100644
--- a/lib/Target/AMDGPU/R600Instructions.td
+++ b/lib/Target/AMDGPU/R600Instructions.td
@@ -95,15 +95,13 @@ def ADDRDWord : ComplexPattern<i32, 1, "SelectADDRDWord", [], []>;
def ADDRVTX_READ : ComplexPattern<i32, 2, "SelectADDRVTX_READ", [], []>;
def ADDRIndirect : ComplexPattern<iPTR, 2, "SelectADDRIndirect", [], []>;
-class R600ALU_Word0 {
+class R600_ALU_LDS_Word0 {
field bits<32> Word0;
bits<11> src0;
- bits<1> src0_neg;
bits<1> src0_rel;
bits<11> src1;
bits<1> src1_rel;
- bits<1> src1_neg;
bits<3> index_mode = 0;
bits<2> pred_sel;
bits<1> last;
@@ -116,16 +114,23 @@ class R600ALU_Word0 {
let Word0{8-0} = src0_sel;
let Word0{9} = src0_rel;
let Word0{11-10} = src0_chan;
- let Word0{12} = src0_neg;
let Word0{21-13} = src1_sel;
let Word0{22} = src1_rel;
let Word0{24-23} = src1_chan;
- let Word0{25} = src1_neg;
let Word0{28-26} = index_mode;
let Word0{30-29} = pred_sel;
let Word0{31} = last;
}
+class R600ALU_Word0 : R600_ALU_LDS_Word0 {
+ // Adds the src0_neg / src1_neg bits (Word0{12}, Word0{25}) on top of the Word0 layout that R600_LDS also inherits.
+ bits<1> src0_neg;
+ bits<1> src1_neg;
+
+ let Word0{12} = src0_neg;
+ let Word0{25} = src1_neg;
+}
+
class R600ALU_Word1 {
field bits<32> Word1;
@@ -1377,8 +1382,88 @@ def CONSTANT_LOAD_eg : VTX_READ_32_eg <1,
[(set (f32 R600_TReg32_X:$dst), (constant_load ADDRVTX_READ:$ptr))]
>;
+//===----------------------------------------------------------------------===//
+// LDS Instructions
+//===----------------------------------------------------------------------===//
+class R600_LDS <bits<6> lds_op, dag outs, dag ins, string asm,
+ list<dag> pattern = []> :
+ // Common encoding for LDS instructions: an InstR600 on the XALU itinerary, using R600_ALU_LDS_Word0 plus the Word1 defined here.
+ InstR600 <0, outs, ins, asm, pattern, XALU>,
+ R600_ALU_LDS_Word0 {
+
+ field bits<32> Word1;
+
+ bits<11> src2;
+ bits<1> src2_rel;
+ // offset is the stride offset, in dwords, to the second set of data to be
+ // read from. Its six bits are scattered across Word0 and Word1 below.
+ bits<6> offset = 0;
+ bits<5> alu_inst = 17; // OP3_INST_LDS_IDX_OP
+ bits<3> bank_swizzle = 0;
+ bits<6> lds_op;
+ bits<2> dst_chan;
+
+ bits<9> src2_sel = src2{8-0};
+ bits<2> src2_chan = src2{10-9};
+ // Word0{12} and Word0{25} hold offset bits here; R600ALU_Word0 places src0_neg/src1_neg in those positions instead.
+ let Word0{12} = offset{4};
+ let Word0{25} = offset{5};
+ // Word1: src2 fields, alu_inst, bank_swizzle, lds_op, dst_chan and offset bits 0-3.
+ let Word1{8-0} = src2_sel;
+ let Word1{9} = src2_rel;
+ let Word1{11-10} = src2_chan;
+ let Word1{12} = offset{1};
+ let Word1{17-13} = alu_inst;
+ let Word1{20-18} = bank_swizzle;
+ let Word1{26-21} = lds_op;
+ let Word1{27} = offset{0};
+ let Word1{28} = offset{2};
+ let Word1{30-29} = dst_chan;
+ let Word1{31} = offset{3};
+}
+
+class R600_LDS_1A <bits<6> lds_op, string name, list<dag> pattern> : R600_LDS <
+ lds_op,
+ (outs R600_Reg32:$dst),
+ (ins R600_Reg32:$src0, REL:$src0_rel,
+ LAST:$last, R600_Pred:$pred_sel, LITERAL:$literal),
+ !strconcat(name, " OQA, [$src0$src0_rel], $literal $pred_sel$last"),
+ pattern
+ > {
+ // LDS form with one register input ($src0) and a $dst result; src1 and src2 (and their rel bits) are fixed to 0.
+ let src1 = 0;
+ let src1_rel = 0;
+ let src2 = 0;
+ let src2_rel = 0;
+ // The instruction is declared to define OQA and is expanded through the target's custom inserter.
+ let Defs = [OQA];
+ let usesCustomInserter = 1;
+}
+
+class R600_LDS_1A1D <bits<6> lds_op, string name, list<dag> pattern> :
+ R600_LDS <
+ lds_op,
+ (outs),
+ (ins R600_Reg32:$src0, REL:$src0_rel,
+ R600_Reg32:$src1, REL:$src1_rel,
+ LAST:$last, R600_Pred:$pred_sel, LITERAL:$literal),
+ !strconcat(name, " $src0$src0_rel, [$src1$src1_rel], $literal "
+ "$pred_sel$last"),
+ pattern
+ > {
+ // LDS form with two register inputs ($src0, $src1) and no result; src2 and src2_rel are fixed to 0.
+ let src2 = 0;
+ let src2_rel = 0;
}
+def LDS_READ_RET : R600_LDS_1A <0x32, "LDS_READ_RET", // lds_op 0x32; selected for an i32 private_load whose pointer is in $src0.
+ [(set (i32 R600_Reg32:$dst), (private_load R600_Reg32:$src0))]
+>;
+
+def LDS_WRITE : R600_LDS_1A1D <0xD, "LDS_WRITE", // lds_op 0xD; selected for an i32 private_store.
+ [(private_store (i32 R600_Reg32:$src0), R600_Reg32:$src1)] // NOTE(review): the stored value is $src0 and the pointer is $src1, whereas LDS_READ_RET takes its pointer in $src0 -- confirm operand order against the ISA.
+>;
+
//===----------------------------------------------------------------------===//
// Register loads and stores - for indirect addressing
//===----------------------------------------------------------------------===//
@@ -1415,6 +1500,8 @@ def StackMOV : R600_1OP <0x19, "MOV", []>;
} // End hasSideEffects = 1
+} // End Predicates = [isEGorCayman]
+
let Predicates = [isCayman] in {
let isVector = 1 in {
diff --git a/lib/Target/AMDGPU/R600RegisterInfo.td b/lib/Target/AMDGPU/R600RegisterInfo.td
index 3b218253659..f1151448151 100644
--- a/lib/Target/AMDGPU/R600RegisterInfo.td
+++ b/lib/Target/AMDGPU/R600RegisterInfo.td
@@ -65,6 +65,7 @@ def PRED_SEL_OFF: R600Reg<"Pred_sel_off", 0>;
def PRED_SEL_ZERO : R600Reg<"Pred_sel_zero", 2>;
def PRED_SEL_ONE : R600Reg<"Pred_sel_one", 3>;
def AR_X : R600Reg<"AR.x", 0>;
+def OQA : R600Reg<"OQA", 219>; // Register listed in Defs of R600_LDS_1A; the LDS_READ_RET custom inserter copies it into the real destination.
def R600_ArrayBase : RegisterClass <"AMDGPU", [f32, i32], 32,
(add (sequence "ArrayBase%u", 448, 464))>;
@@ -109,7 +110,7 @@ def R600_Reg32 : RegisterClass <"AMDGPU", [f32, i32], 32, (add
R600_CReg32,
R600_ArrayBase,
R600_Addr,
- ZERO, HALF, ONE, ONE_INT, PV_X, ALU_LITERAL_X, NEG_ONE, NEG_HALF)>;
+ ZERO, HALF, ONE, ONE_INT, PV_X, ALU_LITERAL_X, NEG_ONE, NEG_HALF, OQA)>;
def R600_Predicate : RegisterClass <"AMDGPU", [i32], 32, (add
PRED_SEL_OFF, PRED_SEL_ZERO, PRED_SEL_ONE)>;
diff --git a/lib/Target/AMDGPU/R600Schedule.td b/lib/Target/AMDGPU/R600Schedule.td
index 7ede181c51d..7e03c558f00 100644
--- a/lib/Target/AMDGPU/R600Schedule.td
+++ b/lib/Target/AMDGPU/R600Schedule.td
@@ -23,6 +23,7 @@ def TRANS : FuncUnit;
def AnyALU : InstrItinClass;
def VecALU : InstrItinClass;
def TransALU : InstrItinClass;
+def XALU : InstrItinClass; // Itinerary class whose InstrItinData entry in R600_EG_Itin lists only ALU_X; used by R600_LDS.
def R600_EG_Itin : ProcessorItineraries <
[ALU_X, ALU_Y, ALU_Z, ALU_W, TRANS, ALU_NULL],
@@ -31,6 +32,7 @@ def R600_EG_Itin : ProcessorItineraries <
InstrItinData<AnyALU, [InstrStage<1, [ALU_X, ALU_Y, ALU_Z, ALU_W, TRANS]>]>,
InstrItinData<VecALU, [InstrStage<1, [ALU_X, ALU_Y, ALU_X, ALU_W]>]>,
InstrItinData<TransALU, [InstrStage<1, [TRANS]>]>,
+ InstrItinData<XALU, [InstrStage<1, [ALU_X]>]>,
InstrItinData<NullALU, [InstrStage<1, [ALU_NULL]>]>
]
>;