author | Tom Stellard <thomas.stellard@amd.com> | 2012-11-08 19:29:25 +0000
committer | Tom Stellard <thomas.stellard@amd.com> | 2012-12-21 19:49:32 +0000
commit | cab1c8584d4413be0720bb34e9f1e610c52884ea (patch)
tree | 5f242943579eab7c9d6be902e3abdab72405fe13
parent | 6929a40a5def73068f9ebe6ac72313ae56214264 (diff)
R600: Support for indirect addressing (indirect-wip-5)
26 files changed, 1236 insertions, 62 deletions
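Before the patch body, here is a compact, self-contained sketch of the private-stack arithmetic the patch implements, since it is spread across AMDGPUFrameLowering.cpp and R600ISelLowering.cpp. This is illustrative only, not part of the commit, and all names in it are hypothetical: frameIndexOffset mirrors AMDGPUFrameLowering::getFrameIndexOffset (offsets are counted in registers, each alloca occupying (VectorElements / StackWidth) * ArrayElements of them), and byteToRegIndex mirrors R600TargetLowering::stackPtrToRegIndex (byte-addressed pointers become register indices via a right shift chosen from the stack width).

#include <cassert>
#include <cstdio>

// Hypothetical stand-in for an alloca on the R600 private stack.
struct StackObject { unsigned ArrayElements; unsigned VectorElements; };

// Mirrors AMDGPUFrameLowering::getFrameIndexOffset: the offset of frame
// index FI is the number of registers used by the objects before it.
unsigned frameIndexOffset(const StackObject *Objs, unsigned FI,
                          unsigned StackWidth) {
  unsigned Offset = 0;
  for (unsigned i = 0; i < FI; ++i)
    Offset += (Objs[i].VectorElements / StackWidth) * Objs[i].ArrayElements;
  return Offset;
}

// Mirrors R600TargetLowering::stackPtrToRegIndex: shift a byte address
// right by 2, 3, or 4 (i.e. 4, 8, or 16 bytes per register index)
// depending on the stack width.
unsigned byteToRegIndex(unsigned ByteAddr, unsigned StackWidth) {
  unsigned SRLPad;
  switch (StackWidth) {
  case 1: SRLPad = 2; break;
  case 2: SRLPad = 3; break;
  case 4: SRLPad = 4; break;
  default: assert(0 && "Invalid stack width"); return 0;
  }
  return ByteAddr >> SRLPad;
}

int main() {
  // Two objects, e.g. "int4 a[2]; int b;", with StackWidth == 1.
  StackObject Objs[] = {{2, 4}, {1, 1}};
  printf("FI 1 starts at register %u\n", frameIndexOffset(Objs, 1, 1));
  printf("byte address 16 is register index %u\n", byteToRegIndex(16, 1));
  return 0;
}

With StackWidth = 1 this prints an offset of 8 registers for the second object and maps byte 16 to register index 4, matching the T4.X = stack[1].x row in the layout comment of getStackWidth below; note that LowerFrameIndex converts back to bytes as Offset * 4 * StackWidth, so both directions agree on 4 * StackWidth bytes per register index.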
diff --git a/lib/Target/AMDGPU/AMDGPU.h b/lib/Target/AMDGPU/AMDGPU.h index 40864b09dda..d8e48bfa6c6 100644 --- a/lib/Target/AMDGPU/AMDGPU.h +++ b/lib/Target/AMDGPU/AMDGPU.h @@ -21,8 +21,10 @@ class FunctionPass; class AMDGPUTargetMachine; // R600 Passes +FunctionPass* createR600IndirectAddressingPass(TargetMachine &tm); FunctionPass* createR600KernelParametersPass(const DataLayout *TD); FunctionPass *createR600ExpandSpecialInstrsPass(TargetMachine &tm); +FunctionPass *createR600VolatileLoadSetter(); // SI Passes FunctionPass *createSIAssignInterpRegsPass(TargetMachine &tm); diff --git a/lib/Target/AMDGPU/AMDGPUFrameLowering.cpp b/lib/Target/AMDGPU/AMDGPUFrameLowering.cpp new file mode 100644 index 00000000000..e9e1f1e92a2 --- /dev/null +++ b/lib/Target/AMDGPU/AMDGPUFrameLowering.cpp @@ -0,0 +1,128 @@ +//===----------------------- AMDGPUFrameLowering.cpp ----------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//==-----------------------------------------------------------------------===// +// +// Interface to describe the layout of a stack frame on an AMDGPU target machine +// +//===----------------------------------------------------------------------===// +#include "AMDGPUFrameLowering.h" +#include "AMDGPURegisterInfo.h" +#include "R600MachineFunctionInfo.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/Instructions.h" + +using namespace llvm; +AMDGPUFrameLowering::AMDGPUFrameLowering(StackDirection D, unsigned StackAl, + int LAO, unsigned TransAl) + : TargetFrameLowering(D, StackAl, LAO, TransAl) +{ +} + +AMDGPUFrameLowering::~AMDGPUFrameLowering() +{ +} + +unsigned AMDGPUFrameLowering::getStackWidth(const MachineFunction &MF) const { + + // XXX: Hardcoding to 1 for now. + // + // I think the StackWidth should be stored as metadata associated with the + // MachineFunction. This metadata can either be added by a frontend, or + // calculated by an R600-specific LLVM IR pass. + // + // The StackWidth determines how stack objects are laid out in memory. + // For a vector stack variable, like: int4 stack[2], the data will be stored + // in the following ways depending on the StackWidth. + // + // StackWidth = 1: + // + // T0.X = stack[0].x + // T1.X = stack[0].y + // T2.X = stack[0].z + // T3.X = stack[0].w + // T4.X = stack[1].x + // T5.X = stack[1].y + // T6.X = stack[1].z + // T7.X = stack[1].w + // + // StackWidth = 2: + // + // T0.X = stack[0].x + // T0.Y = stack[0].y + // T1.X = stack[0].z + // T1.Y = stack[0].w + // T2.X = stack[1].x + // T2.Y = stack[1].y + // T3.X = stack[1].z + // T3.Y = stack[1].w + // + // StackWidth = 4: + // T0.X = stack[0].x + // T0.Y = stack[0].y + // T0.Z = stack[0].z + // T0.W = stack[0].w + // T1.X = stack[1].x + // T1.Y = stack[1].y + // T1.Z = stack[1].z + // T1.W = stack[1].w + return 1; +} + +/// \returns The number of registers allocated for \p FI. +int AMDGPUFrameLowering::getFrameIndexOffset(const MachineFunction &MF, + int FI) const { + const MachineFrameInfo *MFI = MF.getFrameInfo(); + unsigned Offset = 0; + int UpperBound = FI == -1 ? 
MFI->getNumObjects() : FI; + + for (int i = MFI->getObjectIndexBegin(); i < UpperBound; ++i) { + const AllocaInst *Alloca = MFI->getObjectAllocation(i); + unsigned ArrayElements; + const Type *AllocaType = Alloca->getAllocatedType(); + const Type *ElementType; + + if (AllocaType->isArrayTy()) { + ArrayElements = AllocaType->getArrayNumElements(); + ElementType = AllocaType->getArrayElementType(); + } else { + ArrayElements = 1; + ElementType = AllocaType; + } + + unsigned VectorElements; + if (ElementType->isVectorTy()) { + VectorElements = ElementType->getVectorNumElements(); + } else { + VectorElements = 1; + } + + Offset += (VectorElements / getStackWidth(MF)) * ArrayElements; + } + return Offset; +} + +const TargetFrameLowering::SpillSlot * +AMDGPUFrameLowering::getCalleeSavedSpillSlots(unsigned &NumEntries) const +{ + NumEntries = 0; + return 0; +} +void +AMDGPUFrameLowering::emitPrologue(MachineFunction &MF) const +{ +} +void +AMDGPUFrameLowering::emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const +{ +} +bool +AMDGPUFrameLowering::hasFP(const MachineFunction &MF) const +{ + return false; +} diff --git a/lib/Target/AMDGPU/AMDILFrameLowering.h b/lib/Target/AMDGPU/AMDGPUFrameLowering.h index 51337c3dd2e..c7be7f88e63 100644 --- a/lib/Target/AMDGPU/AMDILFrameLowering.h +++ b/lib/Target/AMDGPU/AMDGPUFrameLowering.h @@ -30,6 +30,10 @@ public: AMDGPUFrameLowering(StackDirection D, unsigned StackAl, int LAO, unsigned TransAl = 1); virtual ~AMDGPUFrameLowering(); + + /// \returns The number of 32-bit sub-registers that are used when storing + /// values to the stack. + virtual unsigned getStackWidth(const MachineFunction &MF) const; virtual int getFrameIndexOffset(const MachineFunction &MF, int FI) const; virtual const SpillSlot *getCalleeSavedSpillSlots(unsigned &NumEntries) const; virtual void emitPrologue(MachineFunction &MF) const; diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index 473dac4ddcb..0d6cd4e3662 100644 --- a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -413,5 +413,7 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(INTERP) NODE_NAME_CASE(INTERP_P0) NODE_NAME_CASE(EXPORT) + NODE_NAME_CASE(REGISTER_LOAD) + NODE_NAME_CASE(REGISTER_STORE) } } diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.h b/lib/Target/AMDGPU/AMDGPUISelLowering.h index c7abaf69b4e..92c00b218aa 100644 --- a/lib/Target/AMDGPU/AMDGPUISelLowering.h +++ b/lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -123,6 +123,8 @@ enum { INTERP, INTERP_P0, EXPORT, + REGISTER_LOAD, + REGISTER_STORE, LAST_AMDGPU_ISD_NUMBER }; diff --git a/lib/Target/AMDGPU/AMDGPUInstrInfo.h b/lib/Target/AMDGPU/AMDGPUInstrInfo.h index 32ac691fe03..eaf0d3c6c63 100644 --- a/lib/Target/AMDGPU/AMDGPUInstrInfo.h +++ b/lib/Target/AMDGPU/AMDGPUInstrInfo.h @@ -41,9 +41,10 @@ class MachineInstrBuilder; class AMDGPUInstrInfo : public AMDGPUGenInstrInfo { private: const AMDGPURegisterInfo RI; - TargetMachine &TM; bool getNextBranchInstr(MachineBasicBlock::iterator &iter, MachineBasicBlock &MBB) const; +protected: + TargetMachine &TM; public: explicit AMDGPUInstrInfo(TargetMachine &tm); diff --git a/lib/Target/AMDGPU/AMDGPURegisterInfo.cpp b/lib/Target/AMDGPU/AMDGPURegisterInfo.cpp index eeafec898d0..b3f242c930f 100644 --- a/lib/Target/AMDGPU/AMDGPURegisterInfo.cpp +++ b/lib/Target/AMDGPU/AMDGPURegisterInfo.cpp @@ -47,5 +47,143 @@ unsigned AMDGPURegisterInfo::getFrameRegister(const MachineFunction &MF) const { 
return 0; } +unsigned AMDGPURegisterInfo::getIndirectSubReg(unsigned IndirectIndex) const { + + switch(IndirectIndex) { + case 0: return AMDGPU::indirect_0; + case 1: return AMDGPU::indirect_1; + case 2: return AMDGPU::indirect_2; + case 3: return AMDGPU::indirect_3; + case 4: return AMDGPU::indirect_4; + case 5: return AMDGPU::indirect_5; + case 6: return AMDGPU::indirect_6; + case 7: return AMDGPU::indirect_7; + case 8: return AMDGPU::indirect_8; + case 9: return AMDGPU::indirect_9; + case 10: return AMDGPU::indirect_10; + case 11: return AMDGPU::indirect_11; + case 12: return AMDGPU::indirect_12; + case 13: return AMDGPU::indirect_13; + case 14: return AMDGPU::indirect_14; + case 15: return AMDGPU::indirect_15; + case 16: return AMDGPU::indirect_16; + case 17: return AMDGPU::indirect_17; + case 18: return AMDGPU::indirect_18; + case 19: return AMDGPU::indirect_19; + case 20: return AMDGPU::indirect_20; + case 21: return AMDGPU::indirect_21; + case 22: return AMDGPU::indirect_22; + case 23: return AMDGPU::indirect_23; + case 24: return AMDGPU::indirect_24; + case 25: return AMDGPU::indirect_25; + case 26: return AMDGPU::indirect_26; + case 27: return AMDGPU::indirect_27; + case 28: return AMDGPU::indirect_28; + case 29: return AMDGPU::indirect_29; + case 30: return AMDGPU::indirect_30; + case 31: return AMDGPU::indirect_31; + case 32: return AMDGPU::indirect_32; + case 33: return AMDGPU::indirect_33; + case 34: return AMDGPU::indirect_34; + case 35: return AMDGPU::indirect_35; + case 36: return AMDGPU::indirect_36; + case 37: return AMDGPU::indirect_37; + case 38: return AMDGPU::indirect_38; + case 39: return AMDGPU::indirect_39; + case 40: return AMDGPU::indirect_40; + case 41: return AMDGPU::indirect_41; + case 42: return AMDGPU::indirect_42; + case 43: return AMDGPU::indirect_43; + case 44: return AMDGPU::indirect_44; + case 45: return AMDGPU::indirect_45; + case 46: return AMDGPU::indirect_46; + case 47: return AMDGPU::indirect_47; + case 48: return AMDGPU::indirect_48; + case 49: return AMDGPU::indirect_49; + case 50: return AMDGPU::indirect_50; + case 51: return AMDGPU::indirect_51; + case 52: return AMDGPU::indirect_52; + case 53: return AMDGPU::indirect_53; + case 54: return AMDGPU::indirect_54; + case 55: return AMDGPU::indirect_55; + case 56: return AMDGPU::indirect_56; + case 57: return AMDGPU::indirect_57; + case 58: return AMDGPU::indirect_58; + case 59: return AMDGPU::indirect_59; + case 60: return AMDGPU::indirect_60; + case 61: return AMDGPU::indirect_61; + case 62: return AMDGPU::indirect_62; + case 63: return AMDGPU::indirect_63; +#if 0 +case 64: return AMDGPU::indirect_64; + case 65: return AMDGPU::indirect_65; + case 66: return AMDGPU::indirect_66; + case 67: return AMDGPU::indirect_67; + case 68: return AMDGPU::indirect_68; + case 69: return AMDGPU::indirect_69; + case 70: return AMDGPU::indirect_70; + case 71: return AMDGPU::indirect_71; + case 72: return AMDGPU::indirect_72; + case 73: return AMDGPU::indirect_73; + case 74: return AMDGPU::indirect_74; + case 75: return AMDGPU::indirect_75; + case 76: return AMDGPU::indirect_76; + case 77: return AMDGPU::indirect_77; + case 78: return AMDGPU::indirect_78; + case 79: return AMDGPU::indirect_79; + case 80: return AMDGPU::indirect_80; + case 81: return AMDGPU::indirect_81; + case 82: return AMDGPU::indirect_82; + case 83: return AMDGPU::indirect_83; + case 84: return AMDGPU::indirect_84; + case 85: return AMDGPU::indirect_85; + case 86: return AMDGPU::indirect_86; + case 87: return AMDGPU::indirect_87; + case 88: 
return AMDGPU::indirect_88; + case 89: return AMDGPU::indirect_89; + case 90: return AMDGPU::indirect_90; + case 91: return AMDGPU::indirect_91; + case 92: return AMDGPU::indirect_92; + case 93: return AMDGPU::indirect_93; + case 94: return AMDGPU::indirect_94; + case 95: return AMDGPU::indirect_95; + case 96: return AMDGPU::indirect_96; + case 97: return AMDGPU::indirect_97; + case 98: return AMDGPU::indirect_98; + case 99: return AMDGPU::indirect_99; + case 100: return AMDGPU::indirect_100; + case 101: return AMDGPU::indirect_101; + case 102: return AMDGPU::indirect_102; + case 103: return AMDGPU::indirect_103; + case 104: return AMDGPU::indirect_104; + case 105: return AMDGPU::indirect_105; + case 106: return AMDGPU::indirect_106; + case 107: return AMDGPU::indirect_107; + case 108: return AMDGPU::indirect_108; + case 109: return AMDGPU::indirect_109; + case 110: return AMDGPU::indirect_110; + case 111: return AMDGPU::indirect_111; + case 112: return AMDGPU::indirect_112; + case 113: return AMDGPU::indirect_113; + case 114: return AMDGPU::indirect_114; + case 115: return AMDGPU::indirect_115; + case 116: return AMDGPU::indirect_116; + case 117: return AMDGPU::indirect_117; + case 118: return AMDGPU::indirect_118; + case 119: return AMDGPU::indirect_119; + case 120: return AMDGPU::indirect_120; + case 121: return AMDGPU::indirect_121; + case 122: return AMDGPU::indirect_122; + case 123: return AMDGPU::indirect_123; + case 124: return AMDGPU::indirect_124; + case 125: return AMDGPU::indirect_125; + case 126: return AMDGPU::indirect_126; + case 127: return AMDGPU::indirect_127; +#endif +default: llvm_unreachable("indirect index out of range"); + } +} + + #define GET_REGINFO_TARGET_DESC #include "AMDGPUGenRegisterInfo.inc" diff --git a/lib/Target/AMDGPU/AMDGPURegisterInfo.h b/lib/Target/AMDGPU/AMDGPURegisterInfo.h index 76ee7ae06a5..5007ff5feb2 100644 --- a/lib/Target/AMDGPU/AMDGPURegisterInfo.h +++ b/lib/Target/AMDGPU/AMDGPURegisterInfo.h @@ -56,6 +56,8 @@ struct AMDGPURegisterInfo : public AMDGPUGenRegisterInfo { RegScavenger *RS) const; unsigned getFrameRegister(const MachineFunction &MF) const; + unsigned getIndirectSubReg(unsigned IndirectIndex) const; + }; } // End namespace llvm diff --git a/lib/Target/AMDGPU/AMDGPURegisterInfo.td b/lib/Target/AMDGPU/AMDGPURegisterInfo.td index 8181e023aa3..676c158aa95 100644 --- a/lib/Target/AMDGPU/AMDGPURegisterInfo.td +++ b/lib/Target/AMDGPU/AMDGPURegisterInfo.td @@ -16,6 +16,12 @@ let Namespace = "AMDGPU" in { def sel_y : SubRegIndex; def sel_z : SubRegIndex; def sel_w : SubRegIndex; + + +foreach Index = 0-63 in { + def indirect_#Index : SubRegIndex; +} + } include "R600RegisterInfo.td" diff --git a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index e42fa8abb2b..1edb5d0b42e 100644 --- a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -49,8 +49,8 @@ AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, StringRef TT, LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OptLevel), Subtarget(TT, CPU, FS), Layout(Subtarget.getDataLayout()), - FrameLowering(TargetFrameLowering::StackGrowsUp, - Subtarget.device()->getStackAlignment(), 0), + FrameLowering(TargetFrameLowering::StackGrowsUp, 128, 0), +// Subtarget.device()->getStackAlignment(), 0), IntrinsicInfo(this), InstrItins(&Subtarget.getInstrItineraryData()) { // TLInfo uses InstrInfo so it must be initialized after. 
@@ -91,12 +91,14 @@ TargetPassConfig *AMDGPUTargetMachine::createPassConfig(PassManagerBase &PM) { bool AMDGPUPassConfig::addPreISel() { +// addPass(createR600VolatileLoadSetter()); return false; } bool AMDGPUPassConfig::addInstSelector() { addPass(createAMDGPUPeepholeOpt(*TM)); addPass(createAMDGPUISelDag(getAMDGPUTargetMachine())); + addPass(createR600IndirectAddressingPass(*TM)); return false; } @@ -114,12 +116,16 @@ bool AMDGPUPassConfig::addPreRegAlloc() { } bool AMDGPUPassConfig::addPostRegAlloc() { + const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(); + if (ST.device()->getGeneration() <= AMDGPUDeviceInfo::HD6XXX) { +// addPass(createR600AllocateMemoryRegsPass(*TM)); + } return false; } bool AMDGPUPassConfig::addPreSched2() { - addPass(&IfConverterID); +// addPass(&IfConverterID); return false; } diff --git a/lib/Target/AMDGPU/AMDGPUTargetMachine.h b/lib/Target/AMDGPU/AMDGPUTargetMachine.h index 399e55c8ad7..5a1dcf43fe9 100644 --- a/lib/Target/AMDGPU/AMDGPUTargetMachine.h +++ b/lib/Target/AMDGPU/AMDGPUTargetMachine.h @@ -15,9 +15,9 @@ #ifndef AMDGPU_TARGET_MACHINE_H #define AMDGPU_TARGET_MACHINE_H +#include "AMDGPUFrameLowering.h" #include "AMDGPUInstrInfo.h" #include "AMDGPUSubtarget.h" -#include "AMDILFrameLowering.h" #include "AMDILIntrinsicInfo.h" #include "R600ISelLowering.h" #include "llvm/ADT/OwningPtr.h" diff --git a/lib/Target/AMDGPU/AMDILFrameLowering.cpp b/lib/Target/AMDGPU/AMDILFrameLowering.cpp deleted file mode 100644 index 9ad495ab48b..00000000000 --- a/lib/Target/AMDGPU/AMDILFrameLowering.cpp +++ /dev/null @@ -1,47 +0,0 @@ -//===----------------------- AMDILFrameLowering.cpp -----------------*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//==-----------------------------------------------------------------------===// -// -/// \file -/// \brief Interface to describe a layout of a stack frame on a AMDGPU target -/// machine. 
-// -//===----------------------------------------------------------------------===// -#include "AMDILFrameLowering.h" -#include "llvm/CodeGen/MachineFrameInfo.h" - -using namespace llvm; -AMDGPUFrameLowering::AMDGPUFrameLowering(StackDirection D, unsigned StackAl, - int LAO, unsigned TransAl) - : TargetFrameLowering(D, StackAl, LAO, TransAl) { -} - -AMDGPUFrameLowering::~AMDGPUFrameLowering() { -} - -int AMDGPUFrameLowering::getFrameIndexOffset(const MachineFunction &MF, - int FI) const { - const MachineFrameInfo *MFI = MF.getFrameInfo(); - return MFI->getObjectOffset(FI); -} - -const TargetFrameLowering::SpillSlot * -AMDGPUFrameLowering::getCalleeSavedSpillSlots(unsigned &NumEntries) const { - NumEntries = 0; - return 0; -} -void -AMDGPUFrameLowering::emitPrologue(MachineFunction &MF) const { -} -void -AMDGPUFrameLowering::emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const { -} -bool -AMDGPUFrameLowering::hasFP(const MachineFunction &MF) const { - return false; -} diff --git a/lib/Target/AMDGPU/AMDILISelDAGToDAG.cpp b/lib/Target/AMDGPU/AMDILISelDAGToDAG.cpp index d15ed393c13..d4d1746b951 100644 --- a/lib/Target/AMDGPU/AMDILISelDAGToDAG.cpp +++ b/lib/Target/AMDGPU/AMDILISelDAGToDAG.cpp @@ -70,6 +70,7 @@ private: bool SelectADDR8BitOffset(SDValue Addr, SDValue& Base, SDValue& Offset); bool SelectADDRReg(SDValue Addr, SDValue& Base, SDValue& Offset); bool SelectADDRVTX_READ(SDValue Addr, SDValue &Base, SDValue &Offset); + bool SelectADDRIndirect(SDValue Addr, SDValue &Base, SDValue &Offset); // Include the pieces autogenerated from the target description. #include "AMDGPUGenDAGISel.inc" @@ -95,6 +96,8 @@ SDValue AMDGPUDAGToDAGISel::getSmallIPtrImm(unsigned int Imm) { return CurDAG->getTargetConstant(Imm, MVT::i32); } + + bool AMDGPUDAGToDAGISel::SelectADDRParam( SDValue Addr, SDValue& R1, SDValue& R2) { @@ -161,7 +164,7 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) { unsigned int FI = FIN->getIndex(); EVT OpVT = N->getValueType(0); unsigned int NewOpc = AMDGPU::COPY; - SDValue TFI = CurDAG->getTargetFrameIndex(FI, MVT::i32); + SDValue TFI = CurDAG->getRegister(AMDGPU::T0_X, MVT::i32); return CurDAG->SelectNodeTo(N, NewOpc, OpVT, TFI); } break; @@ -483,3 +486,22 @@ bool AMDGPUDAGToDAGISel::SelectADDRReg(SDValue Addr, SDValue& Base, return true; } + +bool AMDGPUDAGToDAGISel::SelectADDRIndirect(SDValue Addr, SDValue &Base, + SDValue &Offset) { + ConstantSDNode *C; + + if ((C = dyn_cast<ConstantSDNode>(Addr))) { + Base = CurDAG->getRegister(AMDGPU::ZERO, MVT::i32); + Offset = CurDAG->getTargetConstant(C->getZExtValue(), MVT::i32); + } else if ((Addr.getOpcode() == ISD::ADD || Addr.getOpcode() == ISD::OR) && + (C = dyn_cast<ConstantSDNode>(Addr.getOperand(1)))) { + Base = Addr.getOperand(0); + Offset = CurDAG->getTargetConstant(C->getZExtValue(), MVT::i32); + } else { + Base = Addr; + Offset = CurDAG->getTargetConstant(0, MVT::i32); + } + + return true; +} diff --git a/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp b/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp index e6c550b5ac4..fd1b3c0933f 100644 --- a/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp +++ b/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp @@ -105,10 +105,7 @@ void AMDGPUInstPrinter::printOMOD(const MCInst *MI, unsigned OpNo, void AMDGPUInstPrinter::printRel(const MCInst *MI, unsigned OpNo, raw_ostream &O) { - const MCOperand &Op = MI->getOperand(OpNo); - if (Op.getImm() != 0) { - O << " + " << Op.getImm(); - } + printIfSet(MI, OpNo, O, "+"); } void AMDGPUInstPrinter::printUpdateExecMask(const 
MCInst *MI, unsigned OpNo, diff --git a/lib/Target/AMDGPU/R600Defines.h b/lib/Target/AMDGPU/R600Defines.h index 7dea8e44ea2..ea4cd9bb1ec 100644 --- a/lib/Target/AMDGPU/R600Defines.h +++ b/lib/Target/AMDGPU/R600Defines.h @@ -49,6 +49,9 @@ namespace R600_InstFlag { #define HW_REG_MASK 0x1ff #define HW_CHAN_SHIFT 9 +#define GET_REG_CHAN(reg) ((reg) >> HW_CHAN_SHIFT) +#define GET_REG_INDEX(reg) ((reg) & HW_REG_MASK) + namespace R600Operands { enum Ops { DST, diff --git a/lib/Target/AMDGPU/R600ISelLowering.cpp b/lib/Target/AMDGPU/R600ISelLowering.cpp index eaeff4ec217..eb71905340d 100644 --- a/lib/Target/AMDGPU/R600ISelLowering.cpp +++ b/lib/Target/AMDGPU/R600ISelLowering.cpp @@ -18,6 +18,7 @@ #include "R600MachineFunctionInfo.h" #include "llvm/Argument.h" #include "llvm/Function.h" +#include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/SelectionDAG.h" @@ -67,9 +68,20 @@ R600TargetLowering::R600TargetLowering(TargetMachine &TM) : setOperationAction(ISD::SELECT, MVT::i32, Custom); setOperationAction(ISD::SELECT, MVT::f32, Custom); + // Legalize loads and stores to the private address space. + setOperationAction(ISD::LOAD, MVT::i32, Custom); + setOperationAction(ISD::LOAD, MVT::v2i32, Custom); + setOperationAction(ISD::LOAD, MVT::v4i32, Custom); + setLoadExtAction(ISD::EXTLOAD, MVT::v4i8, Custom); + setLoadExtAction(ISD::EXTLOAD, MVT::i8, Custom); + setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i8, Custom); + setOperationAction(ISD::STORE, MVT::i8, Custom); setOperationAction(ISD::STORE, MVT::i32, Custom); + setOperationAction(ISD::STORE, MVT::v2i32, Custom); setOperationAction(ISD::STORE, MVT::v4i32, Custom); + setOperationAction(ISD::FrameIndex, MVT::i32, Custom); + setTargetDAGCombine(ISD::FP_ROUND); setSchedulingPreference(Sched::VLIW); @@ -150,6 +162,64 @@ MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter( break; } + case AMDGPU::RegisterLoad_i32: { + unsigned Channel = MI->getOperand(3).getImm(); + unsigned IndirectRegOffset = TII->getIndirectIndexBegin(*MF); + unsigned Offset = (MI->getOperand(2).getImm() * 4) + Channel + + (IndirectRegOffset * 4); + R600MachineFunctionInfo *MFI = MF->getInfo<R600MachineFunctionInfo>(); + unsigned IndirectReg = MRI.createVirtualRegister(&AMDGPU::IndirectRegRegClass); + if (MI->getOperand(1).getReg() != AMDGPU::ZERO) { + + MachineInstrBuilder Sequence = BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::REG_SEQUENCE), IndirectReg); + + MachineInstr *MOVA = TII->buildDefaultInstruction(*BB, I, + AMDGPU::MOVA_INT_eg, + AMDGPU::AR_X, + MI->getOperand(1).getReg()); + TII->setImmOperand(MOVA, R600Operands::WRITE, 0); + unsigned OffsetReg = AMDGPU::R600_AddrRegClass.getRegister(Offset); + MachineInstrBuilder MIBuilder = TII->buildDefaultInstruction(*BB, I, + AMDGPU::MOV, MI->getOperand(0).getReg(), + OffsetReg); + for (unsigned i = Channel; i < MFI->IndirectRegs.size(); i+=4) { + unsigned Reg = MFI->IndirectRegs[i]; + assert(Reg); + Sequence.addReg(MFI->IndirectRegs[i]); + Sequence.addImm(TII->getRegisterInfo().getIndirectSubReg(i)); + } + MIBuilder.addReg(IndirectReg, RegState::Implicit | RegState::Kill); + MachineInstr *NewMI = MIBuilder.addReg(AMDGPU::AR_X, RegState::Implicit); + TII->setImmOperand(NewMI, R600Operands::SRC0_REL, 1); + } else { + BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::REG_SEQUENCE), + IndirectReg) + .addReg(MFI->IndirectRegs[Channel + (MI->getOperand(2).getImm() * 4)]) + 
.addImm(TII->getRegisterInfo().getIndirectSubReg(Offset)); + TII->buildDefaultInstruction(*BB, I, AMDGPU::MOV, + MI->getOperand(0).getReg(), + IndirectReg); + } + + break; + } + + case AMDGPU::RegisterStore_i32: { + if (MI->getOperand(1).getReg() != AMDGPU::ZERO) { + return BB; + } + unsigned CopyReg = MRI.createVirtualRegister(&AMDGPU::R600_TReg32RegClass); + unsigned Offset = (MI->getOperand(2).getImm() * 4) + MI->getOperand(3).getImm(); + R600MachineFunctionInfo *MFI = MF->getInfo<R600MachineFunctionInfo>(); + unsigned DstReg = MFI->IndirectRegs[Offset]; + unsigned IndirectReg = MFI->IndirectRegSequenceRegs[Offset]; +// MRI.addLiveIn(IndirectReg); + assert(DstReg); + BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::COPY), DstReg) + .addOperand(MI->getOperand(0)); + break; + } + case AMDGPU::RESERVE_REG: { R600MachineFunctionInfo * MFI = MF->getInfo<R600MachineFunctionInfo>(); int64_t ReservedIndex = MI->getOperand(0).getImm(); @@ -359,8 +429,10 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG); case ISD::SELECT: return LowerSELECT(Op, DAG); case ISD::SETCC: return LowerSETCC(Op, DAG); + case ISD::LOAD: return LowerLOAD(Op, DAG); case ISD::STORE: return LowerSTORE(Op, DAG); case ISD::FPOW: return LowerFPOW(Op, DAG); + case ISD::FrameIndex: return LowerFrameIndex(Op, DAG); case ISD::INTRINSIC_VOID: { SDValue Chain = Op.getOperand(0); unsigned IntrinsicID = @@ -523,6 +595,20 @@ void R600TargetLowering::ReplaceNodeResults(SDNode *N, switch (N->getOpcode()) { default: return; case ISD::FP_TO_UINT: Results.push_back(LowerFPTOUINT(N->getOperand(0), DAG)); + return; + case ISD::LOAD: { + SDNode *Node = LowerLOAD(SDValue(N, 0), DAG).getNode(); + Results.push_back(SDValue(Node, 0)); + Results.push_back(SDValue(Node, 1)); + // XXX: LLVM seems not to replace the Chain value inside CustomWidenLowerNode + // function + DAG.ReplaceAllUsesOfValueWith(SDValue(N,1), SDValue(Node, 1)); + return; + } + case ISD::STORE: + SDNode *Node = LowerSTORE(SDValue(N, 0), DAG).getNode(); + Results.push_back(SDValue(Node, 0)); + return; } } @@ -590,6 +676,21 @@ SDValue R600TargetLowering::LowerImplicitParameter(SelectionDAG &DAG, EVT VT, false, false, false, 0); } +SDValue R600TargetLowering::LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const { + + MachineFunction &MF = DAG.getMachineFunction(); + const MachineFrameInfo *MFI = MF.getFrameInfo(); + const AMDGPUFrameLowering *TFL = + static_cast<const AMDGPUFrameLowering*>(getTargetMachine().getFrameLowering()); + + FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Op); + assert(FIN); + + unsigned FrameIndex = FIN->getIndex(); + unsigned Offset = TFL->getFrameIndexOffset(MF, FrameIndex); + return DAG.getConstant(Offset * 4 * TFL->getStackWidth(MF), MVT::i32); +} + SDValue R600TargetLowering::LowerROTL(SDValue Op, SelectionDAG &DAG) const { DebugLoc DL = Op.getDebugLoc(); EVT VT = Op.getValueType(); @@ -804,6 +905,118 @@ SDValue R600TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { return Cond; } +/// LLVM generates byte-addressed pointers. For indirect addressing, we need to +/// convert these pointers to a register index. Each register holds +/// 16 bytes (4 x 32-bit sub-registers), but we need to take into account the +/// \p StackWidth, which tells us how many of the 4 sub-registers will be used +/// for indirect addressing. 
+SDValue R600TargetLowering::stackPtrToRegIndex(SDValue Ptr, + unsigned StackWidth, + SelectionDAG &DAG) const { + unsigned SRLPad; + switch(StackWidth) { + case 1: + SRLPad = 2; + break; + case 2: + SRLPad = 3; + break; + case 4: + SRLPad = 4; + break; + default: llvm_unreachable("Invalid stack width"); + } + + return DAG.getNode(ISD::SRL, Ptr.getDebugLoc(), Ptr.getValueType(), Ptr, + DAG.getConstant(SRLPad, MVT::i32)); +} + +void R600TargetLowering::getStackAddress(unsigned StackWidth, + unsigned ElemIdx, + unsigned &Channel, + unsigned &PtrIncr) const { + switch (StackWidth) { + default: + case 1: + Channel = 0; + if (ElemIdx > 0) { + PtrIncr = 1; + } else { + PtrIncr = 0; + } + break; + case 2: + Channel = ElemIdx % 2; + if (ElemIdx == 2) { + PtrIncr = 1; + } else { + PtrIncr = 0; + } + break; + case 4: + Channel = ElemIdx; + PtrIncr = 0; + break; + } +} + +SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { + EVT VT = Op.getValueType(); + DebugLoc DL = Op.getDebugLoc(); + LoadSDNode *LoadNode = cast<LoadSDNode>(Op); + SDValue Chain = Op.getOperand(0); + SDValue Ptr = Op.getOperand(1); + SDValue LoweredLoad; + + if (LoadNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) { + return SDValue(); + } + + // Lowering for indirect addressing + const MachineFunction &MF = DAG.getMachineFunction(); + const AMDGPUFrameLowering *TFL = static_cast<const AMDGPUFrameLowering*>( + getTargetMachine().getFrameLowering()); + unsigned StackWidth = TFL->getStackWidth(MF); + + Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG); + + if (VT.isVector()) { + unsigned NumElemVT = VT.getVectorNumElements(); + EVT ElemVT = VT.getVectorElementType(); + SDValue Loads[4]; + + assert(NumElemVT >= StackWidth && "Stack width cannot be greater than " + "vector width in load"); + + for (unsigned i = 0; i < NumElemVT; ++i) { + unsigned Channel, PtrIncr; + getStackAddress(StackWidth, i, Channel, PtrIncr); + Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr, + DAG.getConstant(PtrIncr, MVT::i32)); + Loads[i] = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, ElemVT, + Chain, Ptr, + DAG.getTargetConstant(Channel, MVT::i32), + Op.getOperand(2)); + } + for (unsigned i = NumElemVT; i < 4; ++i) { + Loads[i] = DAG.getUNDEF(ElemVT); + } + EVT TargetVT = EVT::getVectorVT(*DAG.getContext(), ElemVT, 4); + LoweredLoad = DAG.getNode(ISD::BUILD_VECTOR, DL, TargetVT, Loads, 4); + } else { + LoweredLoad = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, VT, + Chain, Ptr, + DAG.getTargetConstant(0, MVT::i32), // Channel + Op.getOperand(2)); + } + + SDValue Ops[2]; + Ops[0] = LoweredLoad; + Ops[1] = Chain; + + return DAG.getMergeValues(Ops, 2, DL); +} + SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { DebugLoc DL = Op.getDebugLoc(); StoreSDNode *StoreNode = cast<StoreSDNode>(Op); @@ -825,9 +1038,57 @@ SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { } return Chain; } - return SDValue(); -} + EVT VT = Value.getValueType(); + R600MachineFunctionInfo *MFI = + DAG.getMachineFunction().getInfo<R600MachineFunctionInfo>(); + + if (StoreNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) { + return SDValue(); + } + + // Lowering for indirect addressing + + const MachineFunction &MF = DAG.getMachineFunction(); + const AMDGPUFrameLowering *TFL = static_cast<const AMDGPUFrameLowering*>( + getTargetMachine().getFrameLowering()); + unsigned StackWidth = TFL->getStackWidth(MF); + + Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG); + + if (VT.isVector()) { + unsigned NumElemVT = 
VT.getVectorNumElements(); + EVT ElemVT = VT.getVectorElementType(); + SDValue Stores[4]; + + assert(NumElemVT >= StackWidth && "Stack width cannot be greater than " + "vector width in store"); + + for (unsigned i = 0; i < NumElemVT; ++i) { + unsigned Channel, PtrIncr; + getStackAddress(StackWidth, i, Channel, PtrIncr); + Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr, + DAG.getConstant(PtrIncr, MVT::i32)); + SDValue Elem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ElemVT, + Value, DAG.getConstant(i, MVT::i32)); + + Stores[i] = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other, + Chain, Elem, Ptr, + DAG.getTargetConstant(Channel, MVT::i32)); + MFI->IndirectChannels.set(i); + } + Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores, NumElemVT); + } else { + if (VT == MVT::i8) { + Value = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Value); + } + Chain = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other, Chain, Value, Ptr, + DAG.getTargetConstant(0, MVT::i32)); // Channel + MFI->IndirectChannels.set(0); + } + + return Chain; +} SDValue R600TargetLowering::LowerFPOW(SDValue Op, SelectionDAG &DAG) const { diff --git a/lib/Target/AMDGPU/R600ISelLowering.h b/lib/Target/AMDGPU/R600ISelLowering.h index 2b954dab558..6cfc788139c 100644 --- a/lib/Target/AMDGPU/R600ISelLowering.h +++ b/lib/Target/AMDGPU/R600ISelLowering.h @@ -60,10 +60,16 @@ private: SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFPTOUINT(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFPOW(SDValue Op, SelectionDAG &DAG) const; - + SDValue LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const; + + SDValue stackPtrToRegIndex(SDValue Ptr, unsigned StackWidth, + SelectionDAG &DAG) const; + void getStackAddress(unsigned StackWidth, unsigned ElemIdx, + unsigned &Channel, unsigned &PtrIncr) const; bool isZero(SDValue Op) const; }; diff --git a/lib/Target/AMDGPU/R600IndirectAddressing.cpp b/lib/Target/AMDGPU/R600IndirectAddressing.cpp new file mode 100644 index 00000000000..ef336e72957 --- /dev/null +++ b/lib/Target/AMDGPU/R600IndirectAddressing.cpp @@ -0,0 +1,333 @@ +//===-- R600IndirectAddressing.cpp - Indirect Addressing Support ----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// +/// Instructions can use indirect addressing to index the register file as if it +/// were memory. This pass lowers RegisterLoad and RegisterStore instructions +/// to either a COPY or a MOV that uses indirect addressing. 
+// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "r600-indirect" + +#include "AMDGPU.h" +#include "R600InstrInfo.h" +#include "R600MachineFunctionInfo.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/Support/Debug.h" + +using namespace llvm; + +namespace { + +class R600IndirectAddressingPass : public MachineFunctionPass { + +private: + static char ID; + const R600InstrInfo *TII; + + bool regHasExplicitDef(MachineRegisterInfo &MRI, unsigned Reg) const; + +public: + R600IndirectAddressingPass(TargetMachine &tm) : + MachineFunctionPass(ID), + TII(static_cast<const R600InstrInfo*>(tm.getInstrInfo())) + { } + + virtual bool runOnMachineFunction(MachineFunction &MF); + + const char *getPassName() const { return "R600 Handle indirect addressing"; } + +}; + +} // End anonymous namespace + +char R600IndirectAddressingPass::ID = 0; + +FunctionPass *llvm::createR600IndirectAddressingPass(TargetMachine &tm) { + return new R600IndirectAddressingPass(tm); +} + +bool R600IndirectAddressingPass::runOnMachineFunction(MachineFunction &MF) { + + MachineRegisterInfo &MRI = MF.getRegInfo(); + R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>(); + + unsigned IndirectBegin = TII->getIndirectIndexBegin(MF); + unsigned IndirectEnd = TII->getIndirectIndexEnd(MF); + + // The map keeps track of the indirect address that is represented by + // each virtual register. The key is the register and the value is the + // indirect address it uses. + std::map<unsigned, unsigned> RegisterAddressMap; + + // First pass - Lower all of the RegisterStore instructions and track which + // registers are live. + for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end(); + BB != BB_E; ++BB) { + // This map keeps track of the current live indirect registers. + // The key is the address and the value is the register + std::map<unsigned, unsigned> LiveAddressRegisterMap; + MachineBasicBlock &MBB = *BB; + + for (MachineBasicBlock::iterator I = MBB.begin(), Next = llvm::next(I); + I != MBB.end(); I = Next) { + Next = llvm::next(I); + MachineInstr &MI = *I; + + if (MI.getOpcode() != AMDGPU::RegisterStore_i32) { + continue; + } + + // Lower RegisterStore + + unsigned RegIndex = MI.getOperand(2).getImm(); + unsigned Channel = MI.getOperand(3).getImm(); + unsigned Address = (4 * RegIndex) + Channel; + + if (MI.getOperand(1).getReg() == AMDGPU::ZERO) { + // Direct register access. + unsigned DstReg = MRI.createVirtualRegister( + &AMDGPU::R600_TReg32RegClass); + + BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDGPU::COPY), DstReg) + .addOperand(MI.getOperand(0)); + + RegisterAddressMap[DstReg] = Address; + LiveAddressRegisterMap[Address] = DstReg; + } else { + // Indirect register access. 
+ unsigned AddrReg = AMDGPU::R600_AddrRegClass.getRegister(Address); + + MachineInstr *MOVA = TII->buildDefaultInstruction(*BB, I, + AMDGPU::MOVA_INT_eg, + AMDGPU::AR_X, + MI.getOperand(1).getReg()); + TII->setImmOperand(MOVA, R600Operands::WRITE, 0); + MachineInstrBuilder MIBuilder = TII->buildDefaultInstruction(*BB, I, + AMDGPU::MOV, + AddrReg, + MI.getOperand(0).getReg()) + .addReg(AMDGPU::AR_X, RegState::Implicit); + for (unsigned i = IndirectBegin; i <= IndirectEnd; ++i) { + unsigned Addr = (4 * i) + Channel; + unsigned DstReg = MRI.createVirtualRegister( + &AMDGPU::R600_TReg32RegClass); + MIBuilder.addReg(DstReg, RegState::Define | RegState::Implicit); + RegisterAddressMap[DstReg] = Addr; + LiveAddressRegisterMap[Addr] = DstReg; + } + TII->setImmOperand(MIBuilder, R600Operands::DST_REL, 1); + } + MI.eraseFromParent(); + } + + // Update the live-ins of the successor blocks + for (MachineBasicBlock::succ_iterator Succ = MBB.succ_begin(), + SuccEnd = MBB.succ_end(); + SuccEnd != Succ; ++Succ) { + std::map<unsigned, unsigned>::const_iterator Key, KeyEnd; + for (Key = LiveAddressRegisterMap.begin(), + KeyEnd = LiveAddressRegisterMap.end(); KeyEnd != Key; ++Key) { + (*Succ)->addLiveIn(Key->second); + } + } + } + + // Second pass - Lower the RegisterLoad instructions + for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end(); + BB != BB_E; ++BB) { + // Key is the address and the value is the register + std::map<unsigned, unsigned> LiveAddressRegisterMap; + MachineBasicBlock &MBB = *BB; + + MachineBasicBlock::livein_iterator LI = MBB.livein_begin(); + while (LI != MBB.livein_end()) { + std::vector<unsigned> PhiRegisters; + + // Make sure this live-in is used for indirect addressing + if (RegisterAddressMap.find(*LI) == RegisterAddressMap.end()) { + ++LI; + continue; + } + + unsigned Address = RegisterAddressMap[*LI]; + LiveAddressRegisterMap[Address] = *LI; + PhiRegisters.push_back(*LI); + + // Check if there are other live-in registers which map to the same + // indirect address. + for (MachineBasicBlock::livein_iterator LJ = llvm::next(LI), + LE = MBB.livein_end(); + LJ != LE; ++LJ) { + unsigned Reg = *LJ; + if (RegisterAddressMap.find(Reg) == RegisterAddressMap.end()) { + continue; + } + + if (RegisterAddressMap[Reg] == Address) { + if (!regHasExplicitDef(MRI, Reg)) { + continue; + } + PhiRegisters.push_back(Reg); + } + } + + if (PhiRegisters.size() == 1) { + // We don't need to insert a Phi instruction, so we can just add the + // registers to the live list for the block. + LiveAddressRegisterMap[Address] = *LI; + MBB.removeLiveIn(*LI); + } else { + // We need to insert a PHI, because we have the same address being + // written in multiple predecessor blocks. + unsigned PhiDstReg = MRI.createVirtualRegister( + &AMDGPU::R600_TReg32RegClass); + MachineInstrBuilder Phi = BuildMI(MBB, MBB.begin(), + MBB.findDebugLoc(MBB.begin()), + TII->get(AMDGPU::PHI), PhiDstReg); + + for (std::vector<unsigned>::const_iterator RI = PhiRegisters.begin(), + RE = PhiRegisters.end(); + RI != RE; ++RI) { + unsigned Reg = *RI; + MachineInstr *DefInst = MRI.getVRegDef(Reg); + assert(DefInst); + MachineBasicBlock *RegBlock = DefInst->getParent(); + Phi.addReg(Reg); + Phi.addMBB(RegBlock); + MBB.removeLiveIn(Reg); + } + RegisterAddressMap[PhiDstReg] = Address; + LiveAddressRegisterMap[Address] = PhiDstReg; + } + // XXX !!! XXX !!! 
+ LI = MBB.livein_begin(); + } + + for (MachineBasicBlock::iterator I = MBB.begin(), Next = llvm::next(I); + I != MBB.end(); I = Next) { + Next = llvm::next(I); + MachineInstr &MI = *I; + + if (MI.getOpcode() != AMDGPU::RegisterLoad_i32) { + if (MI.getOpcode() == AMDGPU::PHI) { + continue; + } + // Check for indirect register defs + for (unsigned OpIdx = 0, NumOperands = MI.getNumOperands(); + OpIdx < NumOperands; ++OpIdx) { + MachineOperand &MO = MI.getOperand(OpIdx); + if (MO.isReg() && MO.isDef() && + RegisterAddressMap.find(MO.getReg()) != RegisterAddressMap.end()) { + unsigned Reg = MO.getReg(); + unsigned LiveAddress = RegisterAddressMap[Reg]; + // Chain the live-ins + if (LiveAddressRegisterMap.find(LiveAddress) != LiveAddressRegisterMap.end()) { + MI.addOperand(MachineOperand::CreateReg( + LiveAddressRegisterMap[LiveAddress], + false, // isDef + true, // isImp + true)); // isKill + } + LiveAddressRegisterMap[LiveAddress] = Reg; + } + } + continue; + } + + unsigned IndirectReg = MRI.createVirtualRegister( + &AMDGPU::IndirectRegRegClass); + + unsigned RegIndex = MI.getOperand(2).getImm(); + unsigned Channel = MI.getOperand(3).getImm(); + unsigned Address = (4 * RegIndex) + Channel; + + // XXX: REG_SEQUENCE: You can't actually use the defined register unless + // you have an instruction that takes the register class as an operand. + + if (MI.getOperand(1).getReg() == AMDGPU::ZERO) { + // Direct register access + unsigned Reg = LiveAddressRegisterMap[Address]; + unsigned CopyReg = MRI.createVirtualRegister( + &AMDGPU::TRegMemRegClass); + unsigned AddrReg = AMDGPU::TRegMemRegClass.getRegister(Address); + + if (regHasExplicitDef(MRI, Reg)) { + // If the register we are reading from has an explicit def, then that + // means it was written via a direct register access (i.e. COPY + // or other instruction that doesn't use indirect addressing). In + // this case we know where the value has been stored, so we can just + // issue a copy. + BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDGPU::COPY), + MI.getOperand(0).getReg()) + .addReg(Reg); + } else { + // If the register we are reading has an implicit def, then that + // means it was written by an indirect register access (i.e. an + // instruction that uses indirect addressing). + TII->buildDefaultInstruction(*BB, I, AMDGPU::MOV, + MI.getOperand(0).getReg(), + AddrReg); + } + } else { + // Indirect register access + MachineInstrBuilder Sequence = BuildMI(MBB, I, MBB.findDebugLoc(I), + TII->get(AMDGPU::REG_SEQUENCE), + IndirectReg); + for (unsigned i = IndirectBegin; i <= IndirectEnd; ++i) { + unsigned Addr = (4 * i) + Channel; + if (LiveAddressRegisterMap.find(Addr) == LiveAddressRegisterMap.end()) { + continue; + } + unsigned Reg = LiveAddressRegisterMap[Addr]; + + // We only need to use REG_SEQUENCE for explicit defs, since the + // register coalescer won't do anything with the implicit defs. + MachineInstr *DefInstr = MRI.getVRegDef(Reg); + if (!DefInstr->getOperand(0).isReg() || + DefInstr->getOperand(0).getReg() != Reg) { + continue; + } + + // Insert a REG_SEQUENCE instruction to force the register allocator + // to allocate the virtual register to the correct physical register. 
+ Sequence.addReg(LiveAddressRegisterMap[Addr]); + Sequence.addImm(TII->getRegisterInfo().getIndirectSubReg(Addr)); + } + MachineInstr *MOVA = TII->buildDefaultInstruction(*BB, I, + AMDGPU::MOVA_INT_eg, + AMDGPU::AR_X, + MI.getOperand(1).getReg()); + TII->setImmOperand(MOVA, R600Operands::WRITE, 0); + unsigned AddrReg = AMDGPU::R600_AddrRegClass.getRegister(Address); + MachineInstrBuilder MIBuilder = TII->buildDefaultInstruction(MBB, I, + AMDGPU::MOV, MI.getOperand(0).getReg(), + AddrReg); + MIBuilder.addReg(IndirectReg, RegState::Implicit | RegState::Kill); + MachineInstr *NewMI = MIBuilder.addReg(AMDGPU::AR_X, RegState::Implicit); + TII->setImmOperand(NewMI, R600Operands::SRC0_REL, 1); + + } + MI.eraseFromParent(); + } + } + return false; +} + +bool R600IndirectAddressingPass::regHasExplicitDef(MachineRegisterInfo &MRI, + unsigned Reg) const { + MachineInstr *DefInstr = MRI.getVRegDef(Reg); + return DefInstr && DefInstr->getOperand(0).isReg() && + DefInstr->getOperand(0).getReg() == Reg; +} diff --git a/lib/Target/AMDGPU/R600InstrInfo.cpp b/lib/Target/AMDGPU/R600InstrInfo.cpp index a60a1802d48..09db5c10021 100644 --- a/lib/Target/AMDGPU/R600InstrInfo.cpp +++ b/lib/Target/AMDGPU/R600InstrInfo.cpp @@ -16,8 +16,12 @@ #include "AMDGPUTargetMachine.h" #include "AMDGPUSubtarget.h" #include "R600Defines.h" +#include "R600MachineFunctionInfo.h" #include "R600RegisterInfo.h" #include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/Instructions.h" #define GET_INSTRINFO_CTOR #include "AMDGPUGenDFAPacketizer.inc" @@ -464,6 +468,75 @@ unsigned int R600InstrInfo::getInstrLatency(const InstrItineraryData *ItinData, return 2; } +unsigned R600InstrInfo::getIndirectIndexBegin(const MachineFunction &MF) const +{ + const MachineRegisterInfo &MRI = MF.getRegInfo(); + const R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>(); + unsigned Offset = 0; + + if (MRI.livein_empty() && MFI->ReservedRegs.empty()) { + return 0; + } + + for (MachineRegisterInfo::livein_iterator LI = MRI.livein_begin(), + LE = MRI.livein_end(); + LI != LE; ++LI) { + Offset = std::max(Offset, + (unsigned)GET_REG_INDEX(RI.getEncodingValue(LI->first))); + } + + for (std::vector<unsigned>::const_iterator RRI = MFI->ReservedRegs.begin(), + RRE = MFI->ReservedRegs.end(); + RRI != RRE; ++RRI) { + Offset = std::max(Offset, + (unsigned)GET_REG_INDEX(RI.getEncodingValue(*RRI))); + } + + return Offset + 1; +} + +unsigned R600InstrInfo::getIndirectIndexEnd(const MachineFunction &MF) const +{ + unsigned Offset = 0; + const MachineFrameInfo *MFI = MF.getFrameInfo(); + + // Variable sized objects are not supported + assert(!MFI->hasVarSizedObjects()); + + // Only one stack object is supported at the moment +// assert(MFI->getNumObjects() <= 1); + + if (MFI->getNumObjects() == 0) { + return 0; + } + + Offset = TM.getFrameLowering()->getFrameIndexOffset(MF, -1); + + return getIndirectIndexBegin(MF) + Offset; +} + +std::vector<unsigned> R600InstrInfo::getIndirectReservedRegs( + const MachineFunction &MF) const +{ + const AMDGPUFrameLowering *TFL = + static_cast<const AMDGPUFrameLowering*>(TM.getFrameLowering()); + const R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>(); + unsigned StackWidth = TFL->getStackWidth(MF); + unsigned End = getIndirectIndexEnd(MF); + + std::vector<unsigned> Regs; + + for (unsigned Index = getIndirectIndexBegin(MF); Index <= End; ++Index) { + unsigned SuperReg = 
AMDGPU::R600_Reg128RegClass.getRegister(Index); + Regs.push_back(SuperReg); + for (unsigned Chan = 0; Chan < StackWidth; ++Chan) { + unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister((4 * Index) + Chan); + Regs.push_back(Reg); + } + } + return Regs; +} + MachineInstrBuilder R600InstrInfo::buildDefaultInstruction(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, unsigned Opcode, diff --git a/lib/Target/AMDGPU/R600InstrInfo.h b/lib/Target/AMDGPU/R600InstrInfo.h index 6bb0ca92e42..8c3be94cd33 100644 --- a/lib/Target/AMDGPU/R600InstrInfo.h +++ b/lib/Target/AMDGPU/R600InstrInfo.h @@ -114,6 +114,20 @@ namespace llvm { virtual int getInstrLatency(const InstrItineraryData *ItinData, SDNode *Node) const { return 1;} + /// getIndirectIndexBegin - return the smallest register index that will + /// be accessed by an indirect read or write. + unsigned getIndirectIndexBegin(const MachineFunction &MF) const; + + /// getIndirectIndexEnd - return the largest register index that will be + /// accessed by an indirect read or write. + unsigned getIndirectIndexEnd(const MachineFunction &MF) const; + + /// getIndirectReservedRegs - return a list of all the registers that may be + /// accessed using indirect addressing. + std::vector<unsigned> getIndirectReservedRegs(const MachineFunction &MF) const; + + /// buildDefaultInstruction - This function returns a MachineInstr with + /// all the instruction modifiers initialized to their default values. /// You can use this function to avoid manually specifying each instruction /// modifier operand when building a new instruction. /// diff --git a/lib/Target/AMDGPU/R600Instructions.td b/lib/Target/AMDGPU/R600Instructions.td index 105822066c2..bdc6e40cf17 100644 --- a/lib/Target/AMDGPU/R600Instructions.td +++ b/lib/Target/AMDGPU/R600Instructions.td @@ -86,9 +86,14 @@ def UP : InstFlag <"printUpdatePred">; // default to 0. 
def LAST : InstFlag<"printLast", 1>; +def FRAMEri : Operand<iPTR> { + let MIOperandInfo = (ops R600_Reg32:$ptr, i32imm:$index); +} + def ADDRParam : ComplexPattern<i32, 2, "SelectADDRParam", [], []>; def ADDRDWord : ComplexPattern<i32, 1, "SelectADDRDWord", [], []>; def ADDRVTX_READ : ComplexPattern<i32, 2, "SelectADDRVTX_READ", [], []>; +def ADDRIndirect : ComplexPattern<iPTR, 2, "SelectADDRIndirect", [], []>; class R600ALU_Word0 { field bits<32> Word0; @@ -414,9 +419,17 @@ def isR600toCayman : Predicate< "Subtarget.device()->getGeneration() <= AMDGPUDeviceInfo::HD6XXX">; //===----------------------------------------------------------------------===// -// Interpolation Instructions +// R600 SDNodes //===----------------------------------------------------------------------===// +def REGISTER_LOAD : SDNode<"AMDGPUISD::REGISTER_LOAD", + SDTypeProfile<1, 2, [SDTCisPtrTy<1>, SDTCisInt<2>]>, + [SDNPHasChain, SDNPMayLoad]>; + +def REGISTER_STORE : SDNode<"AMDGPUISD::REGISTER_STORE", + SDTypeProfile<0, 3, [SDTCisPtrTy<1>, SDTCisInt<2>]>, + [SDNPHasChain, SDNPMayStore]>; + def INTERP: SDNode<"AMDGPUISD::INTERP", SDTypeProfile<1, 2, [SDTCisFP<0>, SDTCisInt<1>, SDTCisInt<2>]> >; @@ -425,6 +438,10 @@ def INTERP_P0: SDNode<"AMDGPUISD::INTERP_P0", SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisInt<1>]> >; +//===----------------------------------------------------------------------===// +// Interpolation Instructions +//===----------------------------------------------------------------------===// + let usesCustomInserter = 1 in { def input_perspective : AMDGPUShaderInst < (outs R600_Reg128:$dst), @@ -1138,6 +1155,10 @@ let Predicates = [isEGorCayman] in { defm DOT4_eg : DOT4_Common<0xBE>; defm CUBE_eg : CUBE_Common<0xC0>; +let hasSideEffects = 1 in { + def MOVA_INT_eg : R600_1OP <0xCC, "MOVA_INT", []>; +} + def TGSI_LIT_Z_eg : TGSI_LIT_Z_Common<MUL_LIT_eg, LOG_CLAMPED_eg, EXP_IEEE_eg>; def FLT_TO_INT_eg : FLT_TO_INT_Common<0x50> { @@ -1417,6 +1438,44 @@ def CONSTANT_LOAD_eg : VTX_READ_32_eg <1, } +//===----------------------------------------------------------------------===// +// Register loads and stores - for indirect addressing +//===----------------------------------------------------------------------===// + +let isPseudo = 1, isCodeGenOnly = 1 in { + +class RegisterLoad <ValueType vt> : InstR600 <0x0, + (outs R600_Reg32:$dst), (ins FRAMEri:$addr, i32imm:$chan), + "RegisterLoad $dst, $addr", + [(set (vt R600_Reg32:$dst), (REGISTER_LOAD ADDRIndirect:$addr, + (i32 timm:$chan)))], + NullALU +>; + +class RegisterStore <ValueType vt> : InstR600 <0x0, + (outs), (ins R600_Reg32:$val, FRAMEri:$addr, i32imm:$chan), + "RegisterStore $val, $addr", + [(REGISTER_STORE (vt R600_Reg32:$val), ADDRIndirect:$addr, (i32 timm:$chan))], + NullALU +>; + + +} // isPseudo = 1, isCodeGenOnly = 1 + +def RegisterLoad_i32 : RegisterLoad<i32>; +def RegisterLoad_f32 : RegisterLoad<f32>; + +def RegisterStore_i32 : RegisterStore<i32>; +def RegisterStore_f32 : RegisterStore<f32>; + +let hasSideEffects = 1 in { + +def StackMOV : R600_1OP <0x19, "MOV", []> { + let OutOperandList = (outs R600_Reg32:$dst, variable_ops); +} + +} // End hasSideEffects = 1 + let Predicates = [isCayman] in { let isVector = 1 in { diff --git a/lib/Target/AMDGPU/R600MachineFunctionInfo.cpp b/lib/Target/AMDGPU/R600MachineFunctionInfo.cpp index 4eb5efa19f0..93b4608a60d 100644 --- a/lib/Target/AMDGPU/R600MachineFunctionInfo.cpp +++ b/lib/Target/AMDGPU/R600MachineFunctionInfo.cpp @@ -14,6 +14,7 @@ using namespace llvm; 
R600MachineFunctionInfo::R600MachineFunctionInfo(const MachineFunction &MF) : MachineFunctionInfo(), + IndirectChannels(4), HasLinearInterpolation(false), HasPerspectiveInterpolation(false) { memset(Outputs, 0, sizeof(Outputs)); diff --git a/lib/Target/AMDGPU/R600MachineFunctionInfo.h b/lib/Target/AMDGPU/R600MachineFunctionInfo.h index e97fb5be62e..9cc7b4ee13d 100644 --- a/lib/Target/AMDGPU/R600MachineFunctionInfo.h +++ b/lib/Target/AMDGPU/R600MachineFunctionInfo.h @@ -13,6 +13,7 @@ #ifndef R600MACHINEFUNCTIONINFO_H #define R600MACHINEFUNCTIONINFO_H +#include "llvm/ADT/BitVector.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/SelectionDAG.h" #include <vector> @@ -24,8 +25,11 @@ class R600MachineFunctionInfo : public MachineFunctionInfo { public: R600MachineFunctionInfo(const MachineFunction &MF); std::vector<unsigned> ReservedRegs; + std::vector<unsigned> IndirectRegs; + std::vector<unsigned> IndirectRegSequenceRegs; SDNode *Outputs[16]; SDNode *StreamOutputs[64][4]; + BitVector IndirectChannels; bool HasLinearInterpolation; bool HasPerspectiveInterpolation; diff --git a/lib/Target/AMDGPU/R600RegisterInfo.cpp b/lib/Target/AMDGPU/R600RegisterInfo.cpp index a39f83dbaca..b97d767ab4e 100644 --- a/lib/Target/AMDGPU/R600RegisterInfo.cpp +++ b/lib/Target/AMDGPU/R600RegisterInfo.cpp @@ -15,6 +15,7 @@ #include "R600RegisterInfo.h" #include "AMDGPUTargetMachine.h" #include "R600Defines.h" +#include "R600InstrInfo.h" #include "R600MachineFunctionInfo.h" using namespace llvm; @@ -48,11 +49,23 @@ BitVector R600RegisterInfo::getReservedRegs(const MachineFunction &MF) const { Reserved.set(*I); } + for (TargetRegisterClass::iterator I = AMDGPU::R600_AddrRegClass.begin(), + E = AMDGPU::R600_AddrRegClass.end(); I != E; ++I) { + Reserved.set(*I); + } + for (std::vector<unsigned>::const_iterator I = MFI->ReservedRegs.begin(), E = MFI->ReservedRegs.end(); I != E; ++I) { Reserved.set(*I); } + const R600InstrInfo *RII = static_cast<const R600InstrInfo*>(&TII); + std::vector<unsigned> IndirectRegs = RII->getIndirectReservedRegs(MF); + for (std::vector<unsigned>::iterator I = IndirectRegs.begin(), + E = IndirectRegs.end(); + I != E; ++I) { + Reserved.set(*I); + } return Reserved; } diff --git a/lib/Target/AMDGPU/R600RegisterInfo.td b/lib/Target/AMDGPU/R600RegisterInfo.td index d3d6d25d292..cc5c1972417 100644 --- a/lib/Target/AMDGPU/R600RegisterInfo.td +++ b/lib/Target/AMDGPU/R600RegisterInfo.td @@ -31,6 +31,12 @@ foreach Index = 0-127 in { // 32-bit Constant Registers (There are more than 128, this is the number // that is currently supported). 
def C#Index#_#Chan : R600RegWithChan <"C"#Index#"."#Chan, Index, Chan>; + + // Indirect addressing offset registers + def Addr#Index#_#Chan : R600RegWithChan <"T("#Index#" + AR.x)."#Chan, + Index, Chan>; + def TRegMem#Index#_#Chan : R600RegWithChan <"T"#Index#"."#Chan, Index, + Chan>; } // 128-bit Temporary Registers def T#Index#_XYZW : R600Reg_128 <"T"#Index#".XYZW", @@ -46,7 +52,6 @@ foreach Index = 448-464 in { def ArrayBase#Index : R600Reg<"ARRAY_BASE", Index>; } - // Special Registers def ZERO : R600Reg<"0.0", 248>; @@ -61,10 +66,22 @@ def PREDICATE_BIT : R600Reg<"PredicateBit", 0>; def PRED_SEL_OFF: R600Reg<"Pred_sel_off", 0>; def PRED_SEL_ZERO : R600Reg<"Pred_sel_zero", 2>; def PRED_SEL_ONE : R600Reg<"Pred_sel_one", 3>; +def AR_X : R600Reg<"AR.x", 0>; def R600_ArrayBase : RegisterClass <"AMDGPU", [f32, i32], 32, (add (sequence "ArrayBase%u", 448, 464))>; +let isAllocatable = 0 in { + +def R600_Addr : RegisterClass <"AMDGPU", [i32], 32, + (add (interleave + (interleave (sequence "Addr%u_X", 0, 127), + (sequence "Addr%u_Z", 0, 127)), + (interleave (sequence "Addr%u_Y", 0, 127), + (sequence "Addr%u_W", 0, 127))))>; + +} // End isAllocatable = 0 + def R600_CReg32 : RegisterClass <"AMDGPU", [f32, i32], 32, (add (interleave (interleave (sequence "C%u_X", 0, 127), @@ -93,6 +110,7 @@ def R600_Reg32 : RegisterClass <"AMDGPU", [f32, i32], 32, (add R600_TReg32, R600_CReg32, R600_ArrayBase, + R600_Addr, ZERO, HALF, ONE, ONE_INT, PV_X, ALU_LITERAL_X, NEG_ONE, NEG_HALF)>; def R600_Predicate : RegisterClass <"AMDGPU", [i32], 32, (add @@ -105,3 +123,53 @@ def R600_Reg128 : RegisterClass<"AMDGPU", [v4f32, v4i32], 128, (add (sequence "T%u_XYZW", 0, 127))> { let CopyCost = -1; } + +class IndirectSuper<string n, list<Register> subregs> : RegisterWithSubRegs<n, subregs> { + let Namespace = "AMDGPU"; + let SubRegIndices = + [indirect_0,indirect_1,indirect_2,indirect_3,indirect_4,indirect_5,indirect_6, + indirect_7,indirect_8,indirect_9,indirect_10,indirect_11,indirect_12, + indirect_13,indirect_14,indirect_15,indirect_16,indirect_17,indirect_18, + indirect_19,indirect_20,indirect_21,indirect_22,indirect_23,indirect_24, + indirect_25,indirect_26,indirect_27,indirect_28,indirect_29,indirect_30, + indirect_31,indirect_32,indirect_33,indirect_34,indirect_35,indirect_36, + indirect_37,indirect_38,indirect_39,indirect_40,indirect_41,indirect_42, + indirect_43,indirect_44,indirect_45,indirect_46,indirect_47,indirect_48, + indirect_49,indirect_50,indirect_51,indirect_52,indirect_53,indirect_54, + indirect_55,indirect_56,indirect_57,indirect_58,indirect_59,indirect_60, + indirect_61,indirect_62,indirect_63]; +//indirect_64,indirect_65,indirect_66, +// indirect_67,indirect_68,indirect_69,indirect_70,indirect_71,indirect_72, +// indirect_73,indirect_74,indirect_75,indirect_76,indirect_77,indirect_78, +// indirect_79,indirect_80,indirect_81,indirect_82,indirect_83,indirect_84, +// indirect_85,indirect_86,indirect_87,indirect_88,indirect_89,indirect_90, +// indirect_91,indirect_92,indirect_93,indirect_94,indirect_95,indirect_96, +// indirect_97,indirect_98,indirect_99,indirect_100,indirect_101,indirect_102, +// indirect_103,indirect_104,indirect_105,indirect_106,indirect_107,indirect_108, +// indirect_109,indirect_110,indirect_111,indirect_112,indirect_113,indirect_114, +// indirect_115,indirect_116,indirect_117,indirect_118,indirect_119,indirect_120, +// indirect_121,indirect_122,indirect_123,indirect_124,indirect_125,indirect_126, +// indirect_127]; + +} + +def IndirectSuperReg : IndirectSuper<"Indirect", + 
[TRegMem0_X,TRegMem0_Y,TRegMem0_Z,TRegMem0_W,TRegMem1_X,TRegMem1_Y,TRegMem1_Z,TRegMem1_W,TRegMem2_X,TRegMem2_Y,TRegMem2_Z,TRegMem2_W,TRegMem3_X,TRegMem3_Y,TRegMem3_Z,TRegMem3_W,TRegMem4_X,TRegMem4_Y,TRegMem4_Z,TRegMem4_W, + TRegMem5_X,TRegMem5_Y,TRegMem5_Z,TRegMem5_W,TRegMem6_X,TRegMem6_Y,TRegMem6_Z,TRegMem6_W,TRegMem7_X,TRegMem7_Y,TRegMem7_Z,TRegMem7_W,TRegMem8_X,TRegMem8_Y,TRegMem8_Z,TRegMem8_W,TRegMem9_X,TRegMem9_Y,TRegMem9_Z,TRegMem9_W, + TRegMem10_X,TRegMem10_Y,TRegMem10_Z,TRegMem10_W,TRegMem11_X,TRegMem11_Y,TRegMem11_Z,TRegMem11_W,TRegMem12_X,TRegMem12_Y,TRegMem12_Z,TRegMem12_W,TRegMem13_X,TRegMem13_Y,TRegMem13_Z,TRegMem13_W, + TRegMem14_X,TRegMem14_Y,TRegMem14_Z,TRegMem14_W,TRegMem15_X,TRegMem15_Y,TRegMem15_Z,TRegMem15_W] +//,TRegMem16_X,TRegMem16_Y,TRegMem16_Z,TRegMem16_W +//,TRegMem17_X,TRegMem17_Y,TRegMem17_Z,TRegMem17_W, +// TRegMem18_X,TRegMem18_Y,TRegMem18_Z,TRegMem18_W,TRegMem19_X,TRegMem19_Y,TRegMem19_Z,TRegMem19_W,TRegMem20_X,TRegMem20_Y,TRegMem20_Z,TRegMem20_W,TRegMem21_X,TRegMem21_Y,TRegMem21_Z,TRegMem21_W, +// TRegMem22_X,TRegMem22_Y,TRegMem22_Z,TRegMem22_W,TRegMem23_X,TRegMem23_Y,TRegMem23_Z,TRegMem23_W,TRegMem24_X,TRegMem24_Y,TRegMem24_Z,TRegMem24_W,TRegMem25_X,TRegMem25_Y,TRegMem25_Z,TRegMem25_W, +// TRegMem26_X,TRegMem26_Y,TRegMem26_Z,TRegMem26_W,TRegMem27_X,TRegMem27_Y,TRegMem27_Z,TRegMem27_W,TRegMem28_X,TRegMem28_Y,TRegMem28_Z,TRegMem28_W,TRegMem29_X,TRegMem29_Y,TRegMem29_Z,TRegMem29_W, +// TRegMem30_X,TRegMem30_Y,TRegMem30_Z,TRegMem30_W,TRegMem31_X,TRegMem31_Y,TRegMem31_Z,TRegMem31_W] +>; + +def IndirectReg : RegisterClass<"AMDGPU", [f32, i32], 32, (add IndirectSuperReg)>; + +def TRegMem : RegisterClass<"AMDGPU", [f32, i32], 32, + (add (interleave + (interleave (sequence "TRegMem%u_X", 0, 16), (sequence "TRegMem%u_Z", 0, 16)), + (interleave (sequence "TRegMem%u_Y", 0, 16), (sequence "TRegMem%u_W", 0, 16)))) +>; diff --git a/lib/Target/AMDGPU/R600VolatileLoadSetter.cpp b/lib/Target/AMDGPU/R600VolatileLoadSetter.cpp new file mode 100644 index 00000000000..16d0d76ad58 --- /dev/null +++ b/lib/Target/AMDGPU/R600VolatileLoadSetter.cpp @@ -0,0 +1,76 @@ +//===-- R600VolatileLoadSetter.cpp - Force Loads to be Volatile -----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This pass forces every load instruction to be marked volatile. +// For non-power-of-2 vectors, LLVM always uses narrower vector store instructions +// to match the exact size of the original vector; on the other hand, it +// can load a vector using wider vector load instructions if the load is not marked +// as volatile, which can lead to inconsistent store/load operations +// in the AMDGPU backend. This pass forces all load instructions to be volatile. 
+// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "llvm/Pass.h" +#include "llvm/Support/InstVisitor.h" + +namespace llvm { + +class LLVM_LIBRARY_VISIBILITY R600VolatileLoadSetter : + public FunctionPass, public InstVisitor<R600VolatileLoadSetter> +{ +protected: + bool TypeContainsVec3(Type *LoadedType) const { + if (VectorType *VectorLoadedType = dyn_cast<VectorType>(LoadedType)) { + return VectorLoadedType->getNumElements() == 3; + } + if (CompositeType *CompLoadedType = dyn_cast<CompositeType>(LoadedType)) { + for (CompositeType::subtype_iterator I = CompLoadedType->subtype_begin(), + E = CompLoadedType->subtype_end(); I != E; ++I) + { + if (TypeContainsVec3(*I)) + return true; + } + return false; + } + return false; + } + +public: + static char ID; + R600VolatileLoadSetter() : FunctionPass(ID) { } + ~R600VolatileLoadSetter() { } + bool runOnFunction(Function &F) { + visit(F); + return true; + } + + const char *getPassName() const { return "R600 Set Volatile to Load Inst"; } + + void visitLoadInst(LoadInst &I) { + if (I.getPointerAddressSpace() != 0) + return; + I.setVolatile(true); I.setAlignment(16); // mark volatile, per the header comment + } + + void visitStoreInst(StoreInst &I) { + if (I.getPointerAddressSpace() != 0) + return; + I.setAlignment(16); + } +}; + +char R600VolatileLoadSetter::ID = 0; + + +FunctionPass *createR600VolatileLoadSetter() { + return new R600VolatileLoadSetter(); +} + +} // End namespace llvm
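As a footnote to the LowerLOAD/LowerSTORE code above: the per-element walk driven by R600TargetLowering::getStackAddress is easy to misread, because PtrIncr is added to a running pointer rather than to the base, so for StackWidth = 2 the single increment at element 2 is what moves the access to the next register. The following standalone sketch (hypothetical names, not part of the commit) reproduces that walk under those assumptions:

#include <cstdio>

// Mirrors R600TargetLowering::getStackAddress: map a vector element
// index to the channel it occupies and the amount to add to the
// running register pointer before emitting the access.
void getStackAddress(unsigned StackWidth, unsigned ElemIdx,
                     unsigned &Channel, unsigned &PtrIncr) {
  switch (StackWidth) {
  default:
  case 1:
    Channel = 0;
    PtrIncr = (ElemIdx > 0) ? 1 : 0;
    break;
  case 2:
    Channel = ElemIdx % 2;
    PtrIncr = (ElemIdx == 2) ? 1 : 0;
    break;
  case 4:
    Channel = ElemIdx;
    PtrIncr = 0;
    break;
  }
}

int main() {
  const unsigned Widths[] = {1, 2, 4};
  for (unsigned w = 0; w < 3; ++w) {
    unsigned Ptr = 0; // register index produced by stackPtrToRegIndex
    printf("StackWidth %u:\n", Widths[w]);
    for (unsigned i = 0; i < 4; ++i) {
      unsigned Channel, PtrIncr;
      getStackAddress(Widths[w], i, Channel, PtrIncr);
      Ptr += PtrIncr; // LowerLOAD/LowerSTORE fold this into Ptr each step
      printf("  element %u -> register base+%u, channel %u\n",
             i, Ptr, Channel);
    }
  }
  return 0;
}

For StackWidth 2 this prints elements 0 through 3 at (base+0, channel 0), (base+0, channel 1), (base+1, channel 0), and (base+1, channel 1), which is exactly the T0.X/T0.Y/T1.X/T1.Y layout documented in getStackWidth.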