summaryrefslogtreecommitdiff
path: root/src/gallium/drivers/radeon/AMDIL789IOExpansion.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'src/gallium/drivers/radeon/AMDIL789IOExpansion.cpp')
-rw-r--r--src/gallium/drivers/radeon/AMDIL789IOExpansion.cpp723
1 files changed, 723 insertions, 0 deletions
diff --git a/src/gallium/drivers/radeon/AMDIL789IOExpansion.cpp b/src/gallium/drivers/radeon/AMDIL789IOExpansion.cpp
new file mode 100644
index 00000000000..cf5afb9d195
--- /dev/null
+++ b/src/gallium/drivers/radeon/AMDIL789IOExpansion.cpp
@@ -0,0 +1,723 @@
+//===-- AMDIL789IOExpansion.cpp - TODO: Add brief description -------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//==-----------------------------------------------------------------------===//
+//
+// @file AMDIL789IOExpansion.cpp
+// @details Implementation of the IO expansion class for 789 devices.
+//
+#include "AMDILCompilerErrors.h"
+#include "AMDILCompilerWarnings.h"
+#include "AMDILDevices.h"
+#include "AMDILGlobalManager.h"
+#include "AMDILIOExpansion.h"
+#include "AMDILKernelManager.h"
+#include "AMDILMachineFunctionInfo.h"
+#include "AMDILTargetMachine.h"
+#include "AMDILUtilityFunctions.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Support/DebugLoc.h"
+#include "llvm/Value.h"
+
+using namespace llvm;
+AMDIL789IOExpansion::AMDIL789IOExpansion(TargetMachine &tm
+ AMDIL_OPT_LEVEL_DECL)
+: AMDILIOExpansion(tm AMDIL_OPT_LEVEL_VAR)
+{
+}
+
+AMDIL789IOExpansion::~AMDIL789IOExpansion() {
+}
+
+const char *AMDIL789IOExpansion::getPassName() const
+{
+ return "AMDIL 789 IO Expansion Pass";
+}
+// This code produces the following pseudo-IL:
+// mov r1007, $src.y000
+// cmov_logical r1007.x___, $flag.yyyy, r1007.xxxx, $src.xxxx
+// mov r1006, $src.z000
+// cmov_logical r1007.x___, $flag.zzzz, r1006.xxxx, r1007.xxxx
+// mov r1006, $src.w000
+// cmov_logical $dst.x___, $flag.wwww, r1006.xxxx, r1007.xxxx
+void
+AMDIL789IOExpansion::emitComponentExtract(MachineInstr *MI,
+ unsigned flag, unsigned src, unsigned dst, bool before)
+{
+ MachineBasicBlock::iterator I = *MI;
+ DebugLoc DL = MI->getDebugLoc();
+ BuildMI(*mBB, I, DL, mTII->get(AMDIL::VEXTRACT_v4i32), AMDIL::R1007)
+ .addReg(src)
+ .addImm(2);
+ BuildMI(*mBB, I, DL, mTII->get(AMDIL::CMOVLOG_Y_i32), AMDIL::R1007)
+ .addReg(flag)
+ .addReg(AMDIL::R1007)
+ .addReg(src);
+ BuildMI(*mBB, I, DL, mTII->get(AMDIL::VEXTRACT_v4i32), AMDIL::R1006)
+ .addReg(src)
+ .addImm(3);
+ BuildMI(*mBB, I, DL, mTII->get(AMDIL::CMOVLOG_Z_i32), AMDIL::R1007)
+ .addReg(flag)
+ .addReg(AMDIL::R1006)
+ .addReg(AMDIL::R1007);
+ BuildMI(*mBB, I, DL, mTII->get(AMDIL::VEXTRACT_v4i32), AMDIL::R1006)
+ .addReg(src)
+ .addImm(4);
+ BuildMI(*mBB, I, DL, mTII->get(AMDIL::CMOVLOG_W_i32), dst)
+ .addReg(flag)
+ .addReg(AMDIL::R1006)
+ .addReg(AMDIL::R1007);
+
+}
+// We have a 128 bit load but a 8/16/32bit value, so we need to
+// select the correct component and make sure that the correct
+// bits are selected. For the 8 and 16 bit cases we need to
+// extract from the component the correct bits and for 32 bits
+// we just need to select the correct component.
+ void
+AMDIL789IOExpansion::emitDataLoadSelect(MachineInstr *MI)
+{
+ MachineBasicBlock::iterator I = *MI;
+ DebugLoc DL = MI->getDebugLoc();
+ emitComponentExtract(MI, AMDIL::R1008, AMDIL::R1011, AMDIL::R1011, false);
+ if (getMemorySize(MI) == 1) {
+ // This produces the following pseudo-IL:
+ // iand r1006.x___, r1010.xxxx, l14.xxxx
+ // mov r1006, r1006.xxxx
+ // iadd r1006, r1006, {0, -1, 2, 3}
+ // ieq r1008, r1006, 0
+ // mov r1011, r1011.xxxx
+ // ishr r1011, r1011, {0, 8, 16, 24}
+ // mov r1007, r1011.y000
+ // cmov_logical r1007.x___, r1008.yyyy, r1007.xxxx, r1011.xxxx
+ // mov r1006, r1011.z000
+ // cmov_logical r1007.x___, r1008.zzzz, r1006.xxxx, r1007.xxxx
+ // mov r1006, r1011.w000
+ // cmov_logical r1011.x___, r1008.wwww, r1006.xxxx, r1007.xxxx
+ BuildMI(*mBB, I, DL, mTII->get(AMDIL::BINARY_AND_i32), AMDIL::R1006)
+ .addReg(AMDIL::R1010)
+ .addImm(mMFI->addi32Literal(3));
+ BuildMI(*mBB, I, DL, mTII->get(AMDIL::VCREATE_v4i32), AMDIL::R1006)
+ .addReg(AMDIL::R1006);
+ BuildMI(*mBB, I, DL, mTII->get(AMDIL::ADD_v4i32), AMDIL::R1006)
+ .addReg(AMDIL::R1006)
+ .addImm(mMFI->addi128Literal(0xFFFFFFFFULL << 32,
+ (0xFFFFFFFEULL | (0xFFFFFFFDULL << 32))));
+ BuildMI(*mBB, I, DL, mTII->get(AMDIL::IEQ_v4i32), AMDIL::R1008)
+ .addReg(AMDIL::R1006)
+ .addImm(mMFI->addi32Literal(0));
+ BuildMI(*mBB, I, DL, mTII->get(AMDIL::VCREATE_v4i32), AMDIL::R1011)
+ .addReg(AMDIL::R1011);
+ BuildMI(*mBB, I, DL, mTII->get(AMDIL::SHRVEC_v4i32), AMDIL::R1011)
+ .addReg(AMDIL::R1011)
+ .addImm(mMFI->addi128Literal(8ULL << 32, 16ULL | (24ULL << 32)));
+ emitComponentExtract(MI, AMDIL::R1008, AMDIL::R1011, AMDIL::R1011, false);
+ } else if (getMemorySize(MI) == 2) {
+ // This produces the following pseudo-IL:
+ // ishr r1007.x___, r1010.xxxx, 1
+ // iand r1008.x___, r1007.xxxx, 1
+ // ishr r1007.x___, r1011.xxxx, 16
+ // cmov_logical r1011.x___, r1008.xxxx, r1007.xxxx, r1011.xxxx
+ BuildMI(*mBB, I, DL, mTII->get(AMDIL::SHR_i32), AMDIL::R1007)
+ .addReg(AMDIL::R1010)
+ .addImm(mMFI->addi32Literal(1));
+ BuildMI(*mBB, I, DL, mTII->get(AMDIL::BINARY_AND_i32), AMDIL::R1008)
+ .addReg(AMDIL::R1007)
+ .addImm(mMFI->addi32Literal(1));
+ BuildMI(*mBB, I, DL, mTII->get(AMDIL::SHR_i32), AMDIL::R1007)
+ .addReg(AMDIL::R1011)
+ .addImm(mMFI->addi32Literal(16));
+ BuildMI(*mBB, I, DL, mTII->get(AMDIL::CMOVLOG_i32), AMDIL::R1011)
+ .addReg(AMDIL::R1008)
+ .addReg(AMDIL::R1007)
+ .addReg(AMDIL::R1011);
+ }
+}
+// This function does address calculations modifications to load from a vector
+// register type instead of a dword addressed load.
+ void
+AMDIL789IOExpansion::emitVectorAddressCalc(MachineInstr *MI, bool is32bit, bool needsSelect)
+{
+ MachineBasicBlock::iterator I = *MI;
+ DebugLoc DL = MI->getDebugLoc();
+ // This produces the following pseudo-IL:
+ // ishr r1007.x___, r1010.xxxx, (is32bit) ? 2 : 3
+ // iand r1008.x___, r1007.xxxx, (is32bit) ? 3 : 1
+ // ishr r1007.x___, r1007.xxxx, (is32bit) ? 2 : 1
+ BuildMI(*mBB, I, DL, mTII->get(AMDIL::SHR_i32), AMDIL::R1007)
+ .addReg(AMDIL::R1010)
+ .addImm(mMFI->addi32Literal((is32bit) ? 0x2 : 3));
+ BuildMI(*mBB, I, DL, mTII->get(AMDIL::BINARY_AND_i32), AMDIL::R1008)
+ .addReg(AMDIL::R1007)
+ .addImm(mMFI->addi32Literal((is32bit) ? 3 : 1));
+ BuildMI(*mBB, I, DL, mTII->get(AMDIL::SHR_i32), AMDIL::R1007)
+ .addReg(AMDIL::R1007)
+ .addImm(mMFI->addi32Literal((is32bit) ? 2 : 1));
+ if (needsSelect) {
+ // If the component selection is required, the following
+ // pseudo-IL is produced.
+ // mov r1008, r1008.xxxx
+ // iadd r1008, r1008, (is32bit) ? {0, -1, -2, -3} : {0, 0, -1, -1}
+ // ieq r1008, r1008, 0
+ BuildMI(*mBB, I, DL, mTII->get(AMDIL::VCREATE_v4i32), AMDIL::R1008)
+ .addReg(AMDIL::R1008);
+ BuildMI(*mBB, I, DL, mTII->get(AMDIL::ADD_v4i32), AMDIL::R1008)
+ .addReg(AMDIL::R1008)
+ .addImm(mMFI->addi128Literal((is32bit) ? 0xFFFFFFFFULL << 32 : 0ULL,
+ (is32bit) ? 0xFFFFFFFEULL | (0xFFFFFFFDULL << 32) :
+ -1ULL));
+ BuildMI(*mBB, I, DL, mTII->get(AMDIL::IEQ_v4i32), AMDIL::R1008)
+ .addReg(AMDIL::R1008)
+ .addImm(mMFI->addi32Literal(0));
+ }
+}
+// This function emits a switch statement and writes 32bit/64bit
+// value to a 128bit vector register type.
+ void
+AMDIL789IOExpansion::emitVectorSwitchWrite(MachineInstr *MI, bool is32bit)
+{
+ MachineBasicBlock::iterator I = *MI;
+ uint32_t xID = getPointerID(MI);
+ assert(xID && "Found a scratch store that was incorrectly marked as zero ID!\n");
+ // This section generates the following pseudo-IL:
+ // switch r1008.x
+ // default
+ // mov x1[r1007.x].(is32bit) ? x___ : xy__, r1011.x{y}
+ // break
+ // case 1
+ // mov x1[r1007.x].(is32bit) ? _y__ : __zw, r1011.x{yxy}
+ // break
+ // if is32bit is true, case 2 and 3 are emitted.
+ // case 2
+ // mov x1[r1007.x].__z_, r1011.x
+ // break
+ // case 3
+ // mov x1[r1007.x].___w, r1011.x
+ // break
+ // endswitch
+ DebugLoc DL;
+ BuildMI(*mBB, I, MI->getDebugLoc(), mTII->get(AMDIL::SWITCH))
+ .addReg(AMDIL::R1008);
+ BuildMI(*mBB, I, DL, mTII->get(AMDIL::DEFAULT));
+ BuildMI(*mBB, I, DL,
+ mTII->get((is32bit) ? AMDIL::SCRATCHSTORE_X : AMDIL::SCRATCHSTORE_XY)
+ , AMDIL::R1007)
+ .addReg(AMDIL::R1011)
+ .addImm(xID);
+ BuildMI(*mBB, I, DL, mTII->get(AMDIL::BREAK));
+ BuildMI(*mBB, I, DL, mTII->get(AMDIL::CASE)).addImm(1);
+ BuildMI(*mBB, I, DL,
+ mTII->get((is32bit) ? AMDIL::SCRATCHSTORE_Y : AMDIL::SCRATCHSTORE_ZW), AMDIL::R1007)
+ .addReg(AMDIL::R1011)
+ .addImm(xID);
+ BuildMI(*mBB, I, DL, mTII->get(AMDIL::BREAK));
+ if (is32bit) {
+ BuildMI(*mBB, I, DL, mTII->get(AMDIL::CASE)).addImm(2);
+ BuildMI(*mBB, I, DL,
+ mTII->get(AMDIL::SCRATCHSTORE_Z), AMDIL::R1007)
+ .addReg(AMDIL::R1011)
+ .addImm(xID);
+ BuildMI(*mBB, I, DL, mTII->get(AMDIL::BREAK));
+ BuildMI(*mBB, I, DL, mTII->get(AMDIL::CASE)).addImm(3);
+ BuildMI(*mBB, I, DL,
+ mTII->get(AMDIL::SCRATCHSTORE_W), AMDIL::R1007)
+ .addReg(AMDIL::R1011)
+ .addImm(xID);
+ BuildMI(*mBB, I, DL, mTII->get(AMDIL::BREAK));
+ }
+ BuildMI(*mBB, I, DL, mTII->get(AMDIL::ENDSWITCH));
+
+}
+ void
+AMDIL789IOExpansion::expandPrivateLoad(MachineInstr *MI)
+{
+ MachineBasicBlock::iterator I = *MI;
+ bool HWPrivate = mSTM->device()->usesHardware(AMDILDeviceInfo::PrivateMem);
+ if (!HWPrivate || mSTM->device()->isSupported(AMDILDeviceInfo::PrivateUAV)) {
+ return expandGlobalLoad(MI);
+ }
+ if (!mMFI->usesMem(AMDILDevice::SCRATCH_ID)
+ && mKM->isKernel()) {
+ mMFI->addErrorMsg(amd::CompilerErrorMessage[MEMOP_NO_ALLOCATION]);
+ }
+ uint32_t xID = getPointerID(MI);
+ assert(xID && "Found a scratch load that was incorrectly marked as zero ID!\n");
+ if (!xID) {
+ xID = mSTM->device()->getResourceID(AMDILDevice::SCRATCH_ID);
+ mMFI->addErrorMsg(amd::CompilerWarningMessage[RECOVERABLE_ERROR]);
+ }
+ DebugLoc DL;
+ // These instructions go before the current MI.
+ expandLoadStartCode(MI);
+ switch (getMemorySize(MI)) {
+ default:
+ // Since the private register is a 128 bit aligned, we have to align the address
+ // first, since our source address is 32bit aligned and then load the data.
+ // This produces the following pseudo-IL:
+ // ishr r1010.x___, r1010.xxxx, 4
+ // mov r1011, x1[r1010.x]
+ BuildMI(*mBB, I, DL,
+ mTII->get(AMDIL::SHR_i32), AMDIL::R1010)
+ .addReg(AMDIL::R1010)
+ .addImm(mMFI->addi32Literal(4));
+ BuildMI(*mBB, I, DL,
+ mTII->get(AMDIL::SCRATCHLOAD), AMDIL::R1011)
+ .addReg(AMDIL::R1010)
+ .addImm(xID);
+ break;
+ case 1:
+ case 2:
+ case 4:
+ emitVectorAddressCalc(MI, true, true);
+ // This produces the following pseudo-IL:
+ // mov r1011, x1[r1007.x]
+ BuildMI(*mBB, I, DL,
+ mTII->get(AMDIL::SCRATCHLOAD), AMDIL::R1011)
+ .addReg(AMDIL::R1007)
+ .addImm(xID);
+ // These instructions go after the current MI.
+ emitDataLoadSelect(MI);
+ break;
+ case 8:
+ emitVectorAddressCalc(MI, false, true);
+ // This produces the following pseudo-IL:
+ // mov r1011, x1[r1007.x]
+ // mov r1007, r1011.zw00
+ // cmov_logical r1011.xy__, r1008.xxxx, r1011.xy, r1007.zw
+ BuildMI(*mBB, I, DL,
+ mTII->get(AMDIL::SCRATCHLOAD), AMDIL::R1011)
+ .addReg(AMDIL::R1007)
+ .addImm(xID);
+ // These instructions go after the current MI.
+ BuildMI(*mBB, I, DL,
+ mTII->get(AMDIL::VEXTRACT_v2i64), AMDIL::R1007)
+ .addReg(AMDIL::R1011)
+ .addImm(2);
+ BuildMI(*mBB, I, DL,
+ mTII->get(AMDIL::CMOVLOG_i64), AMDIL::R1011)
+ .addReg(AMDIL::R1008)
+ .addReg(AMDIL::R1011)
+ .addReg(AMDIL::R1007);
+ break;
+ }
+ expandPackedData(MI);
+ expandExtendLoad(MI);
+ BuildMI(*mBB, I, MI->getDebugLoc(),
+ mTII->get(getMoveInstFromID(
+ MI->getDesc().OpInfo[0].RegClass)),
+ MI->getOperand(0).getReg())
+ .addReg(AMDIL::R1011);
+}
+
+
+ void
+AMDIL789IOExpansion::expandConstantLoad(MachineInstr *MI)
+{
+ MachineBasicBlock::iterator I = *MI;
+ if (!isHardwareInst(MI) || MI->memoperands_empty()) {
+ return expandGlobalLoad(MI);
+ }
+ uint32_t cID = getPointerID(MI);
+ if (cID < 2) {
+ return expandGlobalLoad(MI);
+ }
+ if (!mMFI->usesMem(AMDILDevice::CONSTANT_ID)
+ && mKM->isKernel()) {
+ mMFI->addErrorMsg(amd::CompilerErrorMessage[MEMOP_NO_ALLOCATION]);
+ }
+
+ DebugLoc DL;
+ // These instructions go before the current MI.
+ expandLoadStartCode(MI);
+ switch (getMemorySize(MI)) {
+ default:
+ BuildMI(*mBB, I, DL,
+ mTII->get(AMDIL::SHR_i32), AMDIL::R1010)
+ .addReg(AMDIL::R1010)
+ .addImm(mMFI->addi32Literal(4));
+ BuildMI(*mBB, I, DL,
+ mTII->get(AMDIL::CBLOAD), AMDIL::R1011)
+ .addReg(AMDIL::R1010)
+ .addImm(cID);
+ break;
+ case 1:
+ case 2:
+ case 4:
+ emitVectorAddressCalc(MI, true, true);
+ BuildMI(*mBB, I, DL,
+ mTII->get(AMDIL::CBLOAD), AMDIL::R1011)
+ .addReg(AMDIL::R1007)
+ .addImm(cID);
+ // These instructions go after the current MI.
+ emitDataLoadSelect(MI);
+ break;
+ case 8:
+ emitVectorAddressCalc(MI, false, true);
+ BuildMI(*mBB, I, DL,
+ mTII->get(AMDIL::CBLOAD), AMDIL::R1011)
+ .addReg(AMDIL::R1007)
+ .addImm(cID);
+ // These instructions go after the current MI.
+ BuildMI(*mBB, I, DL,
+ mTII->get(AMDIL::VEXTRACT_v2i64), AMDIL::R1007)
+ .addReg(AMDIL::R1011)
+ .addImm(2);
+ BuildMI(*mBB, I, DL,
+ mTII->get(AMDIL::VCREATE_v2i32), AMDIL::R1008)
+ .addReg(AMDIL::R1008);
+ BuildMI(*mBB, I, DL,
+ mTII->get(AMDIL::CMOVLOG_i64), AMDIL::R1011)
+ .addReg(AMDIL::R1008)
+ .addReg(AMDIL::R1011)
+ .addReg(AMDIL::R1007);
+ break;
+ }
+ expandPackedData(MI);
+ expandExtendLoad(MI);
+ BuildMI(*mBB, I, MI->getDebugLoc(),
+ mTII->get(getMoveInstFromID(
+ MI->getDesc().OpInfo[0].RegClass)),
+ MI->getOperand(0).getReg())
+ .addReg(AMDIL::R1011);
+ MI->getOperand(0).setReg(AMDIL::R1011);
+}
+
+ void
+AMDIL789IOExpansion::expandConstantPoolLoad(MachineInstr *MI)
+{
+ if (!isStaticCPLoad(MI)) {
+ return expandConstantLoad(MI);
+ } else {
+ uint32_t idx = MI->getOperand(1).getIndex();
+ const MachineConstantPool *MCP = MI->getParent()->getParent()
+ ->getConstantPool();
+ const std::vector<MachineConstantPoolEntry> &consts
+ = MCP->getConstants();
+ const Constant *C = consts[idx].Val.ConstVal;
+ emitCPInst(MI, C, mKM, 0, isExtendLoad(MI));
+ }
+}
+
+ void
+AMDIL789IOExpansion::expandPrivateStore(MachineInstr *MI)
+{
+ MachineBasicBlock::iterator I = *MI;
+ bool HWPrivate = mSTM->device()->usesHardware(AMDILDeviceInfo::PrivateMem);
+ if (!HWPrivate || mSTM->device()->isSupported(AMDILDeviceInfo::PrivateUAV)) {
+ return expandGlobalStore(MI);
+ }
+ if (!mMFI->usesMem(AMDILDevice::SCRATCH_ID)
+ && mKM->isKernel()) {
+ mMFI->addErrorMsg(amd::CompilerErrorMessage[MEMOP_NO_ALLOCATION]);
+ }
+ uint32_t xID = getPointerID(MI);
+ assert(xID && "Found a scratch store that was incorrectly marked as zero ID!\n");
+ if (!xID) {
+ xID = mSTM->device()->getResourceID(AMDILDevice::SCRATCH_ID);
+ mMFI->addErrorMsg(amd::CompilerWarningMessage[RECOVERABLE_ERROR]);
+ }
+ DebugLoc DL;
+ // These instructions go before the current MI.
+ expandStoreSetupCode(MI);
+ switch (getMemorySize(MI)) {
+ default:
+ // This section generates the following pseudo-IL:
+ // ishr r1010.x___, r1010.xxxx, 4
+ // mov x1[r1010.x], r1011
+ BuildMI(*mBB, I, DL,
+ mTII->get(AMDIL::SHR_i32), AMDIL::R1010)
+ .addReg(AMDIL::R1010)
+ .addImm(mMFI->addi32Literal(4));
+ BuildMI(*mBB, I, MI->getDebugLoc(),
+ mTII->get(AMDIL::SCRATCHSTORE), AMDIL::R1010)
+ .addReg(AMDIL::R1011)
+ .addImm(xID);
+ break;
+ case 1:
+ emitVectorAddressCalc(MI, true, true);
+ // This section generates the following pseudo-IL:
+ // mov r1002, x1[r1007.x]
+ BuildMI(*mBB, I, DL,
+ mTII->get(AMDIL::SCRATCHLOAD), AMDIL::R1002)
+ .addReg(AMDIL::R1007)
+ .addImm(xID);
+ emitComponentExtract(MI, AMDIL::R1008, AMDIL::R1002, AMDIL::R1002, true);
+ // This section generates the following pseudo-IL:
+ // iand r1003.x, r1010.x, 3
+ // mov r1003, r1003.xxxx
+ // iadd r1000, r1003, {0, -1, -2, -3}
+ // ieq r1000, r1000, 0
+ // mov r1002, r1002.xxxx
+ // ishr r1002, r1002, {0, 8, 16, 24}
+ // mov r1011, r1011.xxxx
+ // cmov_logical r1002, r1000, r1011, r1002
+ BuildMI(*mBB, I, DL, mTII->get(AMDIL::BINARY_AND_i32), AMDIL::R1003)
+ .addReg(AMDIL::R1010)
+ .addImm(mMFI->addi32Literal(3));
+ BuildMI(*mBB, I, DL, mTII->get(AMDIL::VCREATE_v4i32), AMDIL::R1003)
+ .addReg(AMDIL::R1003);
+ BuildMI(*mBB, I, DL, mTII->get(AMDIL::ADD_v4i32), AMDIL::R1001)
+ .addReg(AMDIL::R1003)
+ .addImm(mMFI->addi128Literal(0xFFFFFFFFULL << 32,
+ (0xFFFFFFFEULL | (0xFFFFFFFDULL << 32))));
+ BuildMI(*mBB, I, DL, mTII->get(AMDIL::IEQ_v4i32), AMDIL::R1001)
+ .addReg(AMDIL::R1001)
+ .addImm(mMFI->addi32Literal(0));
+ BuildMI(*mBB, I, DL, mTII->get(AMDIL::VCREATE_v4i32), AMDIL::R1002)
+ .addReg(AMDIL::R1002);
+ BuildMI(*mBB, I, DL, mTII->get(AMDIL::SHRVEC_v4i32), AMDIL::R1002)
+ .addReg(AMDIL::R1002)
+ .addImm(mMFI->addi128Literal(8ULL << 32, 16ULL | (24ULL << 32)));
+ BuildMI(*mBB, I, DL, mTII->get(AMDIL::VCREATE_v4i32), AMDIL::R1011)
+ .addReg(AMDIL::R1011);
+ BuildMI(*mBB, I, DL, mTII->get(AMDIL::CMOVLOG_v4i32), AMDIL::R1002)
+ .addReg(AMDIL::R1001)
+ .addReg(AMDIL::R1011)
+ .addReg(AMDIL::R1002);
+ if (mSTM->device()->getGeneration() == AMDILDeviceInfo::HD4XXX) {
+ // This section generates the following pseudo-IL:
+ // iand r1002, r1002, 0xFF
+ // ishl r1002, r1002, {0, 8, 16, 24}
+ // ior r1002.xy, r1002.xy, r1002.zw
+ // ior r1011.x, r1002.x, r1002.y
+ BuildMI(*mBB, I, DL, mTII->get(AMDIL::BINARY_AND_v4i32), AMDIL::R1002)
+ .addReg(AMDIL::R1002)
+ .addImm(mMFI->addi32Literal(0xFF));
+ BuildMI(*mBB, I, DL, mTII->get(AMDIL::SHL_v4i32), AMDIL::R1002)
+ .addReg(AMDIL::R1002)
+ .addImm(mMFI->addi128Literal(8ULL << 32, 16ULL | (24ULL << 32)));
+ BuildMI(*mBB, I, DL, mTII->get(AMDIL::HILO_BITOR_v2i64), AMDIL::R1002)
+ .addReg(AMDIL::R1002).addReg(AMDIL::R1002);
+ BuildMI(*mBB, I, DL, mTII->get(AMDIL::HILO_BITOR_v2i32), AMDIL::R1011)
+ .addReg(AMDIL::R1002).addReg(AMDIL::R1002);
+ } else {
+ // This section generates the following pseudo-IL:
+ // mov r1001.xy, r1002.yw
+ // mov r1002.xy, r1002.xz
+ // ubit_insert r1002.xy, 8, 8, r1001.xy, r1002.xy
+ // mov r1001.x, r1002.y
+ // ubit_insert r1011.x, 16, 16, r1002.y, r1002.x
+ BuildMI(*mBB, I, DL, mTII->get(AMDIL::LHI_v2i64), AMDIL::R1001)
+ .addReg(AMDIL::R1002);
+ BuildMI(*mBB, I, DL, mTII->get(AMDIL::LLO_v2i64), AMDIL::R1002)
+ .addReg(AMDIL::R1002);
+ BuildMI(*mBB, I, DL, mTII->get(AMDIL::UBIT_INSERT_v2i32), AMDIL::R1002)
+ .addImm(mMFI->addi32Literal(8))
+ .addImm(mMFI->addi32Literal(8))
+ .addReg(AMDIL::R1001)
+ .addReg(AMDIL::R1002);
+ BuildMI(*mBB, I, DL, mTII->get(AMDIL::LHI), AMDIL::R1001)
+ .addReg(AMDIL::R1002);
+ BuildMI(*mBB, I, DL, mTII->get(AMDIL::UBIT_INSERT_i32), AMDIL::R1011)
+ .addImm(mMFI->addi32Literal(16))
+ .addImm(mMFI->addi32Literal(16))
+ .addReg(AMDIL::R1001)
+ .addReg(AMDIL::R1002);
+ }
+ emitVectorAddressCalc(MI, true, false);
+ emitVectorSwitchWrite(MI, true);
+ break;
+ case 2:
+ emitVectorAddressCalc(MI, true, true);
+ // This section generates the following pseudo-IL:
+ // mov r1002, x1[r1007.x]
+ BuildMI(*mBB, I, DL,
+ mTII->get(AMDIL::SCRATCHLOAD), AMDIL::R1002)
+ .addReg(AMDIL::R1007)
+ .addImm(xID);
+ emitComponentExtract(MI, AMDIL::R1008, AMDIL::R1002, AMDIL::R1002, true);
+ // This section generates the following pseudo-IL:
+ // ishr r1003.x, r1010.x, 1
+ // iand r1003.x, r1003.x, 1
+ // ishr r1001.x, r1002.x, 16
+ // cmov_logical r1002.x, r1003.x, r1002.x, r1011.x
+ // cmov_logical r1001.x, r1003.x, r1011.x, r1001.x
+ BuildMI(*mBB, I, DL, mTII->get(AMDIL::SHR_i32), AMDIL::R1003)
+ .addReg(AMDIL::R1010)
+ .addImm(mMFI->addi32Literal(1));
+ BuildMI(*mBB, I, DL, mTII->get(AMDIL::BINARY_AND_i32), AMDIL::R1003)
+ .addReg(AMDIL::R1003)
+ .addImm(mMFI->addi32Literal(1));
+ BuildMI(*mBB, I, DL, mTII->get(AMDIL::SHR_i32), AMDIL::R1001)
+ .addReg(AMDIL::R1002)
+ .addImm(mMFI->addi32Literal(16));
+ BuildMI(*mBB, I, DL, mTII->get(AMDIL::CMOVLOG_i32), AMDIL::R1002)
+ .addReg(AMDIL::R1003)
+ .addReg(AMDIL::R1002)
+ .addReg(AMDIL::R1011);
+ BuildMI(*mBB, I, DL, mTII->get(AMDIL::CMOVLOG_i32), AMDIL::R1001)
+ .addReg(AMDIL::R1003)
+ .addReg(AMDIL::R1011)
+ .addReg(AMDIL::R1001);
+ if (mSTM->device()->getGeneration() == AMDILDeviceInfo::HD4XXX) {
+ // This section generates the following pseudo-IL:
+ // iand r1002.x, r1002.x, 0xFFFF
+ // iand r1001.x, r1001.x, 0xFFFF
+ // ishl r1001.x, r1002.x, 16
+ // ior r1011.x, r1002.x, r1001.x
+ BuildMI(*mBB, I, DL, mTII->get(AMDIL::BINARY_AND_i32), AMDIL::R1002)
+ .addReg(AMDIL::R1002)
+ .addImm(mMFI->addi32Literal(0xFFFF));
+ BuildMI(*mBB, I, DL, mTII->get(AMDIL::BINARY_AND_i32), AMDIL::R1001)
+ .addReg(AMDIL::R1001)
+ .addImm(mMFI->addi32Literal(0xFFFF));
+ BuildMI(*mBB, I, DL, mTII->get(AMDIL::SHL_i32), AMDIL::R1001)
+ .addReg(AMDIL::R1001)
+ .addImm(mMFI->addi32Literal(16));
+ BuildMI(*mBB, I, DL, mTII->get(AMDIL::BINARY_OR_i32), AMDIL::R1011)
+ .addReg(AMDIL::R1002).addReg(AMDIL::R1001);
+ } else {
+ // This section generates the following pseudo-IL:
+ // ubit_insert r1011.x, 16, 16, r1001.y, r1002.x
+ BuildMI(*mBB, I, DL, mTII->get(AMDIL::UBIT_INSERT_i32), AMDIL::R1011)
+ .addImm(mMFI->addi32Literal(16))
+ .addImm(mMFI->addi32Literal(16))
+ .addReg(AMDIL::R1001)
+ .addReg(AMDIL::R1002);
+ }
+ emitVectorAddressCalc(MI, true, false);
+ emitVectorSwitchWrite(MI, true);
+ break;
+ case 4:
+ emitVectorAddressCalc(MI, true, false);
+ emitVectorSwitchWrite(MI, true);
+ break;
+ case 8:
+ emitVectorAddressCalc(MI, false, false);
+ emitVectorSwitchWrite(MI, false);
+ break;
+ };
+}
+ void
+AMDIL789IOExpansion::expandStoreSetupCode(MachineInstr *MI)
+{
+ MachineBasicBlock::iterator I = *MI;
+ DebugLoc DL;
+ if (MI->getOperand(0).isUndef()) {
+ BuildMI(*mBB, I, DL, mTII->get(getMoveInstFromID(
+ MI->getDesc().OpInfo[0].RegClass)), AMDIL::R1011)
+ .addImm(mMFI->addi32Literal(0));
+ } else {
+ BuildMI(*mBB, I, DL, mTII->get(getMoveInstFromID(
+ MI->getDesc().OpInfo[0].RegClass)), AMDIL::R1011)
+ .addReg(MI->getOperand(0).getReg());
+ }
+ expandTruncData(MI);
+ if (MI->getOperand(2).isReg()) {
+ BuildMI(*mBB, I, DL, mTII->get(AMDIL::ADD_i32), AMDIL::R1010)
+ .addReg(MI->getOperand(1).getReg())
+ .addReg(MI->getOperand(2).getReg());
+ } else {
+ BuildMI(*mBB, I, DL, mTII->get(AMDIL::MOVE_i32), AMDIL::R1010)
+ .addReg(MI->getOperand(1).getReg());
+ }
+ expandAddressCalc(MI);
+ expandPackedData(MI);
+}
+
+
+void
+AMDIL789IOExpansion::expandPackedData(MachineInstr *MI)
+{
+ MachineBasicBlock::iterator I = *MI;
+ if (!isPackedData(MI)) {
+ return;
+ }
+ DebugLoc DL;
+ // If we have packed data, then the shift size is no longer
+ // the same as the load size and we need to adjust accordingly
+ switch(getPackedID(MI)) {
+ default:
+ break;
+ case PACK_V2I8:
+ {
+ BuildMI(*mBB, I, DL, mTII->get(AMDIL::BINARY_AND_v2i32), AMDIL::R1011)
+ .addReg(AMDIL::R1011)
+ .addImm(mMFI->addi64Literal(0xFFULL | (0xFFULL << 32)));
+ BuildMI(*mBB, I, DL, mTII->get(AMDIL::SHL_v2i32), AMDIL::R1011)
+ .addReg(AMDIL::R1011).addImm(mMFI->addi64Literal(8ULL << 32));
+ BuildMI(*mBB, I, DL, mTII->get(AMDIL::HILO_BITOR_v2i32), AMDIL::R1011)
+ .addReg(AMDIL::R1011).addReg(AMDIL::R1011);
+ }
+ break;
+ case PACK_V4I8:
+ {
+ BuildMI(*mBB, I, DL, mTII->get(AMDIL::BINARY_AND_v4i32), AMDIL::R1011)
+ .addReg(AMDIL::R1011)
+ .addImm(mMFI->addi32Literal(0xFF));
+ BuildMI(*mBB, I, DL, mTII->get(AMDIL::SHL_v4i32), AMDIL::R1011)
+ .addReg(AMDIL::R1011)
+ .addImm(mMFI->addi128Literal(8ULL << 32, (16ULL | (24ULL << 32))));
+ BuildMI(*mBB, I, DL, mTII->get(AMDIL::HILO_BITOR_v2i64), AMDIL::R1011)
+ .addReg(AMDIL::R1011).addReg(AMDIL::R1011);
+ BuildMI(*mBB, I, DL, mTII->get(AMDIL::HILO_BITOR_v2i32), AMDIL::R1011)
+ .addReg(AMDIL::R1011).addReg(AMDIL::R1011);
+ }
+ break;
+ case PACK_V2I16:
+ {
+ BuildMI(*mBB, I, DL, mTII->get(AMDIL::BINARY_AND_v2i32), AMDIL::R1011)
+ .addReg(AMDIL::R1011)
+ .addImm(mMFI->addi32Literal(0xFFFF));
+ BuildMI(*mBB, I, DL, mTII->get(AMDIL::SHL_v2i32), AMDIL::R1011)
+ .addReg(AMDIL::R1011)
+ .addImm(mMFI->addi64Literal(16ULL << 32));
+ BuildMI(*mBB, I, DL, mTII->get(AMDIL::HILO_BITOR_v2i32), AMDIL::R1011)
+ .addReg(AMDIL::R1011).addReg(AMDIL::R1011);
+ }
+ break;
+ case PACK_V4I16:
+ {
+ BuildMI(*mBB, I, DL, mTII->get(AMDIL::BINARY_AND_v4i32), AMDIL::R1011)
+ .addReg(AMDIL::R1011)
+ .addImm(mMFI->addi32Literal(0xFFFF));
+ BuildMI(*mBB, I, DL, mTII->get(AMDIL::SHL_v4i32), AMDIL::R1011)
+ .addReg(AMDIL::R1011)
+ .addImm(mMFI->addi64Literal(16ULL << 32));
+ BuildMI(*mBB, I, DL, mTII->get(AMDIL::HILO_BITOR_v4i16), AMDIL::R1011)
+ .addReg(AMDIL::R1011).addReg(AMDIL::R1011);
+ }
+ break;
+ case UNPACK_V2I8:
+ BuildMI(*mBB, I, DL, mTII->get(AMDIL::USHRVEC_i32), AMDIL::R1012)
+ .addReg(AMDIL::R1011)
+ .addImm(mMFI->addi32Literal(8));
+ BuildMI(*mBB, I, DL, mTII->get(AMDIL::LCREATE), AMDIL::R1011)
+ .addReg(AMDIL::R1011).addReg(AMDIL::R1012);
+ break;
+ case UNPACK_V4I8:
+ {
+ BuildMI(*mBB, I, DL, mTII->get(AMDIL::VCREATE_v4i8), AMDIL::R1011)
+ .addReg(AMDIL::R1011);
+ BuildMI(*mBB, I, DL, mTII->get(AMDIL::USHRVEC_v4i8), AMDIL::R1011)
+ .addReg(AMDIL::R1011)
+ .addImm(mMFI->addi128Literal(8ULL << 32, (16ULL | (24ULL << 32))));
+ }
+ break;
+ case UNPACK_V2I16:
+ {
+ BuildMI(*mBB, I, DL, mTII->get(AMDIL::USHRVEC_i32), AMDIL::R1012)
+ .addReg(AMDIL::R1011)
+ .addImm(mMFI->addi32Literal(16));
+ BuildMI(*mBB, I, DL, mTII->get(AMDIL::LCREATE), AMDIL::R1011)
+ .addReg(AMDIL::R1011).addReg(AMDIL::R1012);
+ }
+ break;
+ case UNPACK_V4I16:
+ {
+ BuildMI(*mBB, I, DL, mTII->get(AMDIL::USHRVEC_v2i32), AMDIL::R1012)
+ .addReg(AMDIL::R1011)
+ .addImm(mMFI->addi32Literal(16));
+ BuildMI(*mBB, I, DL, mTII->get(AMDIL::LCREATE_v2i64), AMDIL::R1011)
+ .addReg(AMDIL::R1011).addReg(AMDIL::R1012);
+ }
+ break;
+ };
+}