diff options
author | Tom Stellard <thomas.stellard@amd.com> | 2012-11-15 21:32:31 +0000 |
---|---|---|
committer | Tom Stellard <thomas.stellard@amd.com> | 2012-11-16 17:53:28 -0500 |
commit | 1c4628988f1cb7bdd93321a86fd08101df843343 (patch) | |
tree | 5fee20eb33b0b1e6b97b12ee32a4ce0ab6d4fbd6 | |
parent | 2cfbd43a3917a2608c80b16c890ef052051d93d1 (diff) |
XXX: First try at LDS. (branch: indirect-wip-3)
What's missing:
+ LDS uses dword offsets, but the current code uses byte offsets.
+ I think we need to allocate LDS somehow. There are config registers
for this for Pixel Shaders, but there may be a way to allocate from
within the shader.
This patch disables (with #if 0) the setOperationAction calls that lower
loads/stores to indirect addressing, solely to make it easier to test.
-rw-r--r-- | lib/Target/AMDGPU/AMDGPUInstructions.td | 9 | ||||
-rw-r--r-- | lib/Target/AMDGPU/R600ISelLowering.cpp | 30 | ||||
-rw-r--r-- | lib/Target/AMDGPU/R600Instructions.td | 97 | ||||
-rw-r--r-- | lib/Target/AMDGPU/R600RegisterInfo.td | 3 | ||||
-rw-r--r-- | lib/Target/AMDGPU/R600Schedule.td | 2 |
5 files changed, 135 insertions, 6 deletions
diff --git a/lib/Target/AMDGPU/AMDGPUInstructions.td b/lib/Target/AMDGPU/AMDGPUInstructions.td index 443c6d4bc5b..4b0eb26d959 100644 --- a/lib/Target/AMDGPU/AMDGPUInstructions.td +++ b/lib/Target/AMDGPU/AMDGPUInstructions.td @@ -84,6 +84,15 @@ def zextloadi8_global : PatFrag<(ops node:$ptr), (zextloadi8 node:$ptr), [{ return isGlobalLoad(dyn_cast<LoadSDNode>(N)); }]>; +def private_load : PatFrag<(ops node:$ptr), (load node:$ptr), [{ + return isPrivateLoad(dyn_cast<LoadSDNode>(N)); +}]>; + +def private_store : PatFrag<(ops node:$val, node:$ptr), + (store node:$val, node:$ptr), [{ + return isPrivateStore(dyn_cast<StoreSDNode>(N)); +}]>; + class Constants { int TWO_PI = 0x40c90fdb; int PI = 0x40490fdb; diff --git a/lib/Target/AMDGPU/R600ISelLowering.cpp b/lib/Target/AMDGPU/R600ISelLowering.cpp index 0eb5eed8fb8..eee1bc49c20 100644 --- a/lib/Target/AMDGPU/R600ISelLowering.cpp +++ b/lib/Target/AMDGPU/R600ISelLowering.cpp @@ -69,6 +69,7 @@ R600TargetLowering::R600TargetLowering(TargetMachine &TM) : setOperationAction(ISD::SELECT, MVT::f32, Custom); // Legalize loads and stores to the private address space. +#if 0 setOperationAction(ISD::LOAD, MVT::f32, Custom); setOperationAction(ISD::LOAD, MVT::i32, Custom); setOperationAction(ISD::LOAD, MVT::v2f32, Custom); @@ -85,6 +86,7 @@ R600TargetLowering::R600TargetLowering(TargetMachine &TM) : setOperationAction(ISD::STORE, MVT::v2i32, Custom); setOperationAction(ISD::STORE, MVT::v4f32, Custom); setOperationAction(ISD::STORE, MVT::v4i32, Custom); +#endif setOperationAction(ISD::FrameIndex, MVT::i32, Custom); @@ -141,6 +143,34 @@ MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter( break; } + case AMDGPU::LDS_READ_RET: + { + // Convert to DWORD address + unsigned NewAddr = MRI.createVirtualRegister( + &AMDGPU::R600_TReg32_XRegClass); + unsigned ShiftValue = MRI.createVirtualRegister( + &AMDGPU::R600_TReg32RegClass); + unsigned EOP = (llvm::next(I)->getOpcode() == AMDGPU::RETURN) ? 
1 : 0; + + // XXX In theory, we should be able to pass ShiftValue directly to + // the LSHR_eg instruction as an inline literal, but I tried doing it + // this way and it didn't produce the correct results. + TII->buildMovImm(*BB, I, ShiftValue, 2); + TII->buildDefaultInstruction(*BB, I, AMDGPU::LSHR_eg, NewAddr, + MI->getOperand(1).getReg(), + ShiftValue); + MachineInstrBuilder NewMI = BuildMI(*BB, I, BB->findDebugLoc(I), + TII->get(MI->getOpcode()), + AMDGPU::OQA); + for (unsigned i = 1, e = MI->getNumOperands(); i < e; ++i) { + NewMI.addOperand(MI->getOperand(i)); + } + TII->buildDefaultInstruction(*BB, I, AMDGPU::MOV, + MI->getOperand(0).getReg(), + AMDGPU::OQA); + break; + } + case AMDGPU::MASK_WRITE: { unsigned maskedRegister = MI->getOperand(0).getReg(); diff --git a/lib/Target/AMDGPU/R600Instructions.td b/lib/Target/AMDGPU/R600Instructions.td index ed9164c695d..cbbe4b38c24 100644 --- a/lib/Target/AMDGPU/R600Instructions.td +++ b/lib/Target/AMDGPU/R600Instructions.td @@ -95,15 +95,13 @@ def ADDRDWord : ComplexPattern<i32, 1, "SelectADDRDWord", [], []>; def ADDRVTX_READ : ComplexPattern<i32, 2, "SelectADDRVTX_READ", [], []>; def ADDRIndirect : ComplexPattern<iPTR, 2, "SelectADDRIndirect", [], []>; -class R600ALU_Word0 { +class R600_ALU_LDS_Word0 { field bits<32> Word0; bits<11> src0; - bits<1> src0_neg; bits<1> src0_rel; bits<11> src1; bits<1> src1_rel; - bits<1> src1_neg; bits<3> index_mode = 0; bits<2> pred_sel; bits<1> last; @@ -116,16 +114,23 @@ class R600ALU_Word0 { let Word0{8-0} = src0_sel; let Word0{9} = src0_rel; let Word0{11-10} = src0_chan; - let Word0{12} = src0_neg; let Word0{21-13} = src1_sel; let Word0{22} = src1_rel; let Word0{24-23} = src1_chan; - let Word0{25} = src1_neg; let Word0{28-26} = index_mode; let Word0{30-29} = pred_sel; let Word0{31} = last; } +class R600ALU_Word0 : R600_ALU_LDS_Word0 { + + bits<1> src0_neg; + bits<1> src1_neg; + + let Word0{12} = src0_neg; + let Word0{25} = src1_neg; +} + class R600ALU_Word1 { field bits<32> 
Word1; @@ -1377,8 +1382,88 @@ def CONSTANT_LOAD_eg : VTX_READ_32_eg <1, [(set (f32 R600_TReg32_X:$dst), (constant_load ADDRVTX_READ:$ptr))] >; +//===----------------------------------------------------------------------===// +// LDS Instructions +//===----------------------------------------------------------------------===// +class R600_LDS <bits<6> lds_op, dag outs, dag ins, string asm, + list<dag> pattern = []> : + + InstR600 <0, outs, ins, asm, pattern, XALU>, + R600_ALU_LDS_Word0 { + + field bits<32> Word1; + + bits<11> src2; + bits<1> src2_rel; + // offset specifies the stride offset to the second set of data to be read + // from. This is a dword offset. + bits<6> offset = 0; + bits<5> alu_inst = 17; // OP3_INST_LDS_IDX_OP + bits<3> bank_swizzle = 0; + bits<6> lds_op; + bits<2> dst_chan; + + bits<9> src2_sel = src2{8-0}; + bits<2> src2_chan = src2{10-9}; + + let Word0{12} = offset{4}; + let Word0{25} = offset{5}; + + let Word1{8-0} = src2_sel; + let Word1{9} = src2_rel; + let Word1{11-10} = src2_chan; + let Word1{12} = offset{1}; + let Word1{17-13} = alu_inst; + let Word1{20-18} = bank_swizzle; + let Word1{26-21} = lds_op; + let Word1{27} = offset{0}; + let Word1{28} = offset{2}; + let Word1{30-29} = dst_chan; + let Word1{31} = offset{3}; +} + +class R600_LDS_1A <bits<6> lds_op, string name, list<dag> pattern> : R600_LDS < + lds_op, + (outs R600_Reg32:$dst), + (ins R600_Reg32:$src0, REL:$src0_rel, + LAST:$last, R600_Pred:$pred_sel, LITERAL:$literal), + !strconcat(name, " OQA, [$src0$src0_rel], $literal $pred_sel$last"), + pattern + > { + + let src1 = 0; + let src1_rel = 0; + let src2 = 0; + let src2_rel = 0; + + let Defs = [OQA]; + let usesCustomInserter = 1; +} + +class R600_LDS_1A1D <bits<6> lds_op, string name, list<dag> pattern> : + R600_LDS < + lds_op, + (outs), + (ins R600_Reg32:$src0, REL:$src0_rel, + R600_Reg32:$src1, REL:$src1_rel, + LAST:$last, R600_Pred:$pred_sel, LITERAL:$literal), + !strconcat(name, " $src0$src0_rel, [$src1$src1_rel], $literal " 
+ "$pred_sel$last"), + pattern + > { + + let src2 = 0; + let src2_rel = 0; } +def LDS_READ_RET : R600_LDS_1A <0x32, "LDS_READ_RET", + [(set (i32 R600_Reg32:$dst), (private_load R600_Reg32:$src0))] +>; + +def LDS_WRITE : R600_LDS_1A1D <0xD, "LDS_WRITE", + [(private_store (i32 R600_Reg32:$src0), R600_Reg32:$src1)] +>; + //===----------------------------------------------------------------------===// // Regist loads and stores - for indirect addressing //===----------------------------------------------------------------------===// @@ -1415,6 +1500,8 @@ def StackMOV : R600_1OP <0x19, "MOV", []>; } // End hasSideEffects = 1 +} // End Predicates = [isEGorCayman] + let Predicates = [isCayman] in { let isVector = 1 in { diff --git a/lib/Target/AMDGPU/R600RegisterInfo.td b/lib/Target/AMDGPU/R600RegisterInfo.td index 3b218253659..f1151448151 100644 --- a/lib/Target/AMDGPU/R600RegisterInfo.td +++ b/lib/Target/AMDGPU/R600RegisterInfo.td @@ -65,6 +65,7 @@ def PRED_SEL_OFF: R600Reg<"Pred_sel_off", 0>; def PRED_SEL_ZERO : R600Reg<"Pred_sel_zero", 2>; def PRED_SEL_ONE : R600Reg<"Pred_sel_one", 3>; def AR_X : R600Reg<"AR.x", 0>; +def OQA : R600Reg<"OQA", 219>; def R600_ArrayBase : RegisterClass <"AMDGPU", [f32, i32], 32, (add (sequence "ArrayBase%u", 448, 464))>; @@ -109,7 +110,7 @@ def R600_Reg32 : RegisterClass <"AMDGPU", [f32, i32], 32, (add R600_CReg32, R600_ArrayBase, R600_Addr, - ZERO, HALF, ONE, ONE_INT, PV_X, ALU_LITERAL_X, NEG_ONE, NEG_HALF)>; + ZERO, HALF, ONE, ONE_INT, PV_X, ALU_LITERAL_X, NEG_ONE, NEG_HALF, OQA)>; def R600_Predicate : RegisterClass <"AMDGPU", [i32], 32, (add PRED_SEL_OFF, PRED_SEL_ZERO, PRED_SEL_ONE)>; diff --git a/lib/Target/AMDGPU/R600Schedule.td b/lib/Target/AMDGPU/R600Schedule.td index 7ede181c51d..7e03c558f00 100644 --- a/lib/Target/AMDGPU/R600Schedule.td +++ b/lib/Target/AMDGPU/R600Schedule.td @@ -23,6 +23,7 @@ def TRANS : FuncUnit; def AnyALU : InstrItinClass; def VecALU : InstrItinClass; def TransALU : InstrItinClass; +def XALU : 
InstrItinClass; def R600_EG_Itin : ProcessorItineraries < [ALU_X, ALU_Y, ALU_Z, ALU_W, TRANS, ALU_NULL], @@ -31,6 +32,7 @@ def R600_EG_Itin : ProcessorItineraries < InstrItinData<AnyALU, [InstrStage<1, [ALU_X, ALU_Y, ALU_Z, ALU_W, TRANS]>]>, InstrItinData<VecALU, [InstrStage<1, [ALU_X, ALU_Y, ALU_X, ALU_W]>]>, InstrItinData<TransALU, [InstrStage<1, [TRANS]>]>, + InstrItinData<XALU, [InstrStage<1, [ALU_X]>]>, InstrItinData<NullALU, [InstrStage<1, [ALU_NULL]>]> ] >; |