summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJiangning Liu <jiangning.liu@arm.com>2014-05-15 01:33:17 +0000
committerJiangning Liu <jiangning.liu@arm.com>2014-05-15 01:33:17 +0000
commit66b123f0d889ed5bd6d4b66f209617d596da9a0c (patch)
treed7b439d9698e083d58228754bfce72e77e2d3656
parentcea72fe7634afc8d0cdb860924e6f51d2a64b842 (diff)
[ARM64] Support aggressive fastcc/tailcallopt breaking ABI by popping out argument stack from callee.
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@208837 91177308-0d34-0410-b5e6-96231b3b80d8
-rw-r--r--lib/Target/ARM64/ARM64FrameLowering.cpp105
-rw-r--r--lib/Target/ARM64/ARM64ISelLowering.cpp312
-rw-r--r--lib/Target/ARM64/ARM64ISelLowering.h10
-rw-r--r--lib/Target/ARM64/ARM64InstrInfo.td15
-rw-r--r--lib/Target/ARM64/ARM64MachineFunctionInfo.h36
-rw-r--r--test/CodeGen/AArch64/fastcc-reserved.ll16
-rw-r--r--test/CodeGen/AArch64/fastcc.ll105
-rw-r--r--test/CodeGen/AArch64/tail-call.ll59
8 files changed, 553 insertions, 105 deletions
diff --git a/lib/Target/ARM64/ARM64FrameLowering.cpp b/lib/Target/ARM64/ARM64FrameLowering.cpp
index 471ec2092d9..3b14649c487 100644
--- a/lib/Target/ARM64/ARM64FrameLowering.cpp
+++ b/lib/Target/ARM64/ARM64FrameLowering.cpp
@@ -107,32 +107,44 @@ bool ARM64FrameLowering::hasReservedCallFrame(const MachineFunction &MF) const {
void ARM64FrameLowering::eliminateCallFramePseudoInstr(
MachineFunction &MF, MachineBasicBlock &MBB,
MachineBasicBlock::iterator I) const {
- const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
const ARM64InstrInfo *TII =
static_cast<const ARM64InstrInfo *>(MF.getTarget().getInstrInfo());
+ DebugLoc DL = I->getDebugLoc();
+ int Opc = I->getOpcode();
+ bool IsDestroy = Opc == TII->getCallFrameDestroyOpcode();
+ uint64_t CalleePopAmount = IsDestroy ? I->getOperand(1).getImm() : 0;
+
+ const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
if (!TFI->hasReservedCallFrame(MF)) {
- // If we have alloca, convert as follows:
- // ADJCALLSTACKDOWN -> sub, sp, sp, amount
- // ADJCALLSTACKUP -> add, sp, sp, amount
- MachineInstr *Old = I;
- DebugLoc DL = Old->getDebugLoc();
- unsigned Amount = Old->getOperand(0).getImm();
- if (Amount != 0) {
- // We need to keep the stack aligned properly. To do this, we round the
- // amount of space needed for the outgoing arguments up to the next
- // alignment boundary.
- unsigned Align = TFI->getStackAlignment();
- Amount = (Amount + Align - 1) / Align * Align;
-
- // Replace the pseudo instruction with a new instruction...
- unsigned Opc = Old->getOpcode();
- if (Opc == ARM64::ADJCALLSTACKDOWN) {
- emitFrameOffset(MBB, I, DL, ARM64::SP, ARM64::SP, -Amount, TII);
- } else {
- assert(Opc == ARM64::ADJCALLSTACKUP && "expected ADJCALLSTACKUP");
- emitFrameOffset(MBB, I, DL, ARM64::SP, ARM64::SP, Amount, TII);
- }
+ unsigned Align = getStackAlignment();
+
+ int64_t Amount = I->getOperand(0).getImm();
+ Amount = RoundUpToAlignment(Amount, Align);
+ if (!IsDestroy)
+ Amount = -Amount;
+
+ // N.b. if CalleePopAmount is valid but zero (i.e. callee would pop, but it
+ // doesn't have to pop anything), then the first operand will be zero too so
+ // this adjustment is a no-op.
+ if (CalleePopAmount == 0) {
+ // FIXME: in-function stack adjustment for calls is limited to 24-bits
+ // because there's no guaranteed temporary register available.
+ //
+ // ADD/SUB (immediate) has only LSL #0 and LSL #12 available.
+ // 1) For offset <= 12-bit, we use LSL #0
+ // 2) For 12-bit <= offset <= 24-bit, we use two instructions. One uses
+ // LSL #0, and the other uses LSL #12.
+ //
+ // Mostly call frames will be allocated at the start of a function so
+ // this is OK, but it is a limitation that needs dealing with.
+ assert(Amount > -0xffffff && Amount < 0xffffff && "call frame too large");
+ emitFrameOffset(MBB, I, DL, ARM64::SP, ARM64::SP, Amount, TII);
}
+ } else if (CalleePopAmount != 0) {
+ // If the calling convention demands that the callee pops arguments from the
+ // stack, we want to add it back if we have a reserved call frame.
+ assert(CalleePopAmount < 0xffffff && "call frame too large");
+ emitFrameOffset(MBB, I, DL, ARM64::SP, ARM64::SP, -CalleePopAmount, TII);
}
MBB.erase(I);
}
@@ -420,8 +432,57 @@ void ARM64FrameLowering::emitEpilogue(MachineFunction &MF,
const ARM64RegisterInfo *RegInfo =
static_cast<const ARM64RegisterInfo *>(MF.getTarget().getRegisterInfo());
DebugLoc DL = MBBI->getDebugLoc();
+ unsigned RetOpcode = MBBI->getOpcode();
int NumBytes = MFI->getStackSize();
+ const ARM64FunctionInfo *AFI = MF.getInfo<ARM64FunctionInfo>();
+
+ // Initial and residual are named for consistency with the prologue. Note that
+ // in the epilogue, the residual adjustment is executed first.
+ uint64_t ArgumentPopSize = 0;
+ if (RetOpcode == ARM64::TCRETURNdi || RetOpcode == ARM64::TCRETURNri) {
+ MachineOperand &StackAdjust = MBBI->getOperand(1);
+
+ // For a tail-call in a callee-pops-arguments environment, some or all of
+ // the stack may actually be in use for the call's arguments, this is
+ // calculated during LowerCall and consumed here...
+ ArgumentPopSize = StackAdjust.getImm();
+ } else {
+ // ... otherwise the amount to pop is *all* of the argument space,
+ // conveniently stored in the MachineFunctionInfo by
+ // LowerFormalArguments. This will, of course, be zero for the C calling
+ // convention.
+ ArgumentPopSize = AFI->getArgumentStackToRestore();
+ }
+
+ // The stack frame should be like below,
+ //
+ // ---------------------- ---
+ // | | |
+ // | BytesInStackArgArea| CalleeArgStackSize
+ // | (NumReusableBytes) | (of tail call)
+ // | | ---
+ // | | |
+ // ---------------------| --- |
+ // | | | |
+ // | CalleeSavedReg | | |
+ // | (NumRestores * 16) | | |
+ // | | | |
+ // ---------------------| | NumBytes
+ // | | StackSize (StackAdjustUp)
+ // | LocalStackSize | | |
+ // | (covering callee | | |
+ // | args) | | |
+ // | | | |
+ // ---------------------- --- ---
+ //
+ // So NumBytes = StackSize + BytesInStackArgArea - CalleeArgStackSize
+ // = StackSize + ArgumentPopSize
+ //
+ // ARM64TargetLowering::LowerCall figures out ArgumentPopSize and keeps
+ // it as the 2nd argument of ARM64ISD::TC_RETURN.
+ NumBytes += ArgumentPopSize;
+
unsigned NumRestores = 0;
// Move past the restores of the callee-saved registers.
MachineBasicBlock::iterator LastPopI = MBBI;
diff --git a/lib/Target/ARM64/ARM64ISelLowering.cpp b/lib/Target/ARM64/ARM64ISelLowering.cpp
index f00e829070a..118007fa70c 100644
--- a/lib/Target/ARM64/ARM64ISelLowering.cpp
+++ b/lib/Target/ARM64/ARM64ISelLowering.cpp
@@ -39,12 +39,6 @@ using namespace llvm;
STATISTIC(NumTailCalls, "Number of tail calls");
STATISTIC(NumShiftInserts, "Number of vector shift inserts");
-// This option should go away when tail calls fully work.
-static cl::opt<bool>
-EnableARM64TailCalls("arm64-tail-calls", cl::Hidden,
- cl::desc("Generate ARM64 tail calls (TEMPORARY OPTION)."),
- cl::init(true));
-
enum AlignMode {
StrictAlign,
NoStrictAlign
@@ -1811,6 +1805,27 @@ SDValue ARM64TargetLowering::LowerFormalArguments(
AFI->setVarArgsStackIndex(MFI->CreateFixedObject(4, StackOffset, true));
}
+ ARM64FunctionInfo *FuncInfo = MF.getInfo<ARM64FunctionInfo>();
+ unsigned StackArgSize = CCInfo.getNextStackOffset();
+ bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
+ if (DoesCalleeRestoreStack(CallConv, TailCallOpt)) {
+ // This is a non-standard ABI so by fiat I say we're allowed to make full
+ // use of the stack area to be popped, which must be aligned to 16 bytes in
+ // any case:
+ StackArgSize = RoundUpToAlignment(StackArgSize, 16);
+
+ // If we're expected to restore the stack (e.g. fastcc) then we'll be adding
+ // a multiple of 16.
+ FuncInfo->setArgumentStackToRestore(StackArgSize);
+
+ // This realignment carries over to the available bytes below. Our own
+ // callers will guarantee the space is free by giving an aligned value to
+ // CALLSEQ_START.
+ }
+ // Even if we're not expected to free up the space, it's useful to know how
+ // much is there while considering tail calls (because we can reuse it).
+ FuncInfo->setBytesInStackArgArea(StackArgSize);
+
return Chain;
}
@@ -1942,57 +1957,147 @@ bool ARM64TargetLowering::isEligibleForTailCallOptimization(
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals,
const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
- // Look for obvious safe cases to perform tail call optimization that do not
- // require ABI changes. This is what gcc calls sibcall.
-
- // Do not sibcall optimize vararg calls unless the call site is not passing
- // any arguments.
- if (isVarArg && !Outs.empty())
+ // For CallingConv::C this function knows whether the ABI needs
+ // changing. That's not true for other conventions so they will have to opt in
+ // manually.
+ if (!IsTailCallConvention(CalleeCC) && CalleeCC != CallingConv::C)
return false;
- // Also avoid sibcall optimization if either caller or callee uses struct
- // return semantics.
- if (isCalleeStructRet || isCallerStructRet)
+ const MachineFunction &MF = DAG.getMachineFunction();
+ const Function *CallerF = MF.getFunction();
+ CallingConv::ID CallerCC = CallerF->getCallingConv();
+ bool CCMatch = CallerCC == CalleeCC;
+
+ // Byval parameters hand the function a pointer directly into the stack area
+ // we want to reuse during a tail call. Working around this *is* possible (see
+ // X86) but less efficient and uglier in LowerCall.
+ for (Function::const_arg_iterator i = CallerF->arg_begin(),
+ e = CallerF->arg_end();
+ i != e; ++i)
+ if (i->hasByValAttr())
+ return false;
+
+ if (getTargetMachine().Options.GuaranteedTailCallOpt) {
+ if (IsTailCallConvention(CalleeCC) && CCMatch)
+ return true;
return false;
+ }
- // Note that currently ARM64 "C" calling convention and "Fast" calling
- // convention are compatible. If/when that ever changes, we'll need to
- // add checks here to make sure any interactions are OK.
+ // Now we search for cases where we can use a tail call without changing the
+ // ABI. Sibcall is used in some places (particularly gcc) to refer to this
+ // concept.
- // If the callee takes no arguments then go on to check the results of the
- // call.
- if (!Outs.empty()) {
- // Check if stack adjustment is needed. For now, do not do this if any
- // argument is passed on the stack.
+ // I want anyone implementing a new calling convention to think long and hard
+ // about this assert.
+ assert((!isVarArg || CalleeCC == CallingConv::C) &&
+ "Unexpected variadic calling convention");
+
+ if (isVarArg && !Outs.empty()) {
+ // At least two cases here: if caller is fastcc then we can't have any
+ // memory arguments (we'd be expected to clean up the stack afterwards). If
+ // caller is C then we could potentially use its argument area.
+
+ // FIXME: for now we take the most conservative of these in both cases:
+ // disallow all variadic memory operands.
SmallVector<CCValAssign, 16> ArgLocs;
CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(),
getTargetMachine(), ArgLocs, *DAG.getContext());
- CCAssignFn *AssignFn = CCAssignFnForCall(CalleeCC, /*IsVarArg=*/false);
- CCInfo.AnalyzeCallOperands(Outs, AssignFn);
- if (CCInfo.getNextStackOffset()) {
- // Check if the arguments are already laid out in the right way as
- // the caller's fixed stack objects.
- for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size(); i != e;
- ++i, ++realArgIdx) {
- CCValAssign &VA = ArgLocs[i];
- if (VA.getLocInfo() == CCValAssign::Indirect)
- return false;
- if (VA.needsCustom()) {
- // Just don't handle anything that needs custom adjustments for now.
- // If need be, we can revisit later, but we shouldn't ever end up
- // here.
+
+ CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, true));
+ for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i)
+ if (!ArgLocs[i].isRegLoc())
+ return false;
+ }
+
+ // If the calling conventions do not match, then we'd better make sure the
+ // results are returned in the same way as what the caller expects.
+ if (!CCMatch) {
+ SmallVector<CCValAssign, 16> RVLocs1;
+ CCState CCInfo1(CalleeCC, false, DAG.getMachineFunction(),
+ getTargetMachine(), RVLocs1, *DAG.getContext());
+ CCInfo1.AnalyzeCallResult(Ins, CCAssignFnForCall(CalleeCC, isVarArg));
+
+ SmallVector<CCValAssign, 16> RVLocs2;
+ CCState CCInfo2(CallerCC, false, DAG.getMachineFunction(),
+ getTargetMachine(), RVLocs2, *DAG.getContext());
+ CCInfo2.AnalyzeCallResult(Ins, CCAssignFnForCall(CallerCC, isVarArg));
+
+ if (RVLocs1.size() != RVLocs2.size())
+ return false;
+ for (unsigned i = 0, e = RVLocs1.size(); i != e; ++i) {
+ if (RVLocs1[i].isRegLoc() != RVLocs2[i].isRegLoc())
+ return false;
+ if (RVLocs1[i].getLocInfo() != RVLocs2[i].getLocInfo())
+ return false;
+ if (RVLocs1[i].isRegLoc()) {
+ if (RVLocs1[i].getLocReg() != RVLocs2[i].getLocReg())
return false;
- } else if (!VA.isRegLoc()) {
- // Likewise, don't try to handle stack based arguments for the
- // time being.
+ } else {
+ if (RVLocs1[i].getLocMemOffset() != RVLocs2[i].getLocMemOffset())
return false;
- }
}
}
}
- return true;
+ // Nothing more to check if the callee is taking no arguments
+ if (Outs.empty())
+ return true;
+
+ SmallVector<CCValAssign, 16> ArgLocs;
+ CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(),
+ getTargetMachine(), ArgLocs, *DAG.getContext());
+
+ CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, isVarArg));
+
+ const ARM64FunctionInfo *FuncInfo = MF.getInfo<ARM64FunctionInfo>();
+
+ // If the stack arguments for this call would fit into our own save area then
+ // the call can be made tail.
+ return CCInfo.getNextStackOffset() <= FuncInfo->getBytesInStackArgArea();
+}
+
+SDValue ARM64TargetLowering::addTokenForArgument(SDValue Chain,
+ SelectionDAG &DAG,
+ MachineFrameInfo *MFI,
+ int ClobberedFI) const {
+ SmallVector<SDValue, 8> ArgChains;
+ int64_t FirstByte = MFI->getObjectOffset(ClobberedFI);
+ int64_t LastByte = FirstByte + MFI->getObjectSize(ClobberedFI) - 1;
+
+ // Include the original chain at the beginning of the list. When this is
+ // used by target LowerCall hooks, this helps the legalizer find the
+ // CALLSEQ_START node.
+ ArgChains.push_back(Chain);
+
+ // Add a chain value for each incoming stack argument overlapping ClobberedFI.
+ for (SDNode::use_iterator U = DAG.getEntryNode().getNode()->use_begin(),
+ UE = DAG.getEntryNode().getNode()->use_end();
+ U != UE; ++U)
+ if (LoadSDNode *L = dyn_cast<LoadSDNode>(*U))
+ if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr()))
+ if (FI->getIndex() < 0) {
+ int64_t InFirstByte = MFI->getObjectOffset(FI->getIndex());
+ int64_t InLastByte = InFirstByte;
+ InLastByte += MFI->getObjectSize(FI->getIndex()) - 1;
+
+ if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
+ (FirstByte <= InFirstByte && InFirstByte <= LastByte))
+ ArgChains.push_back(SDValue(L, 1));
+ }
+
+ // Build a tokenfactor for all the chains.
+ return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains);
+}
+
+bool ARM64TargetLowering::DoesCalleeRestoreStack(CallingConv::ID CallCC,
+ bool TailCallOpt) const {
+ return CallCC == CallingConv::Fast && TailCallOpt;
}
+
+bool ARM64TargetLowering::IsTailCallConvention(CallingConv::ID CallCC) const {
+ return CallCC == CallingConv::Fast;
+}
+
/// LowerCall - Lower a call to a callseq_start + CALL + callseq_end chain,
/// and add input and output parameter nodes.
SDValue ARM64TargetLowering::LowerCall(CallLoweringInfo &CLI,
@@ -2012,9 +2117,9 @@ SDValue ARM64TargetLowering::LowerCall(CallLoweringInfo &CLI,
bool IsStructRet = (Outs.empty()) ? false : Outs[0].Flags.isSRet();
bool IsThisReturn = false;
- // If tail calls are explicitly disabled, make sure not to use them.
- if (!EnableARM64TailCalls)
- IsTailCall = false;
+ ARM64FunctionInfo *FuncInfo = MF.getInfo<ARM64FunctionInfo>();
+ bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
+ bool IsSibCall = false;
if (IsTailCall) {
// Check if it's really possible to do a tail call.
@@ -2024,9 +2129,12 @@ SDValue ARM64TargetLowering::LowerCall(CallLoweringInfo &CLI,
if (!IsTailCall && CLI.CS && CLI.CS->isMustTailCall())
report_fatal_error("failed to perform tail call elimination on a call "
"site marked musttail");
- // We don't support GuaranteedTailCallOpt, only automatically
- // detected sibcalls.
- // FIXME: Re-evaluate. Is this true? Should it be true?
+
+ // A sibling call is one where we're under the usual C ABI and not planning
+ // to change that but can still do a tail call:
+ if (!TailCallOpt && IsTailCall)
+ IsSibCall = true;
+
if (IsTailCall)
++NumTailCalls;
}
@@ -2082,9 +2190,42 @@ SDValue ARM64TargetLowering::LowerCall(CallLoweringInfo &CLI,
// Get a count of how many bytes are to be pushed on the stack.
unsigned NumBytes = CCInfo.getNextStackOffset();
+ if (IsSibCall) {
+ // Since we're not changing the ABI to make this a tail call, the memory
+ // operands are already available in the caller's incoming argument space.
+ NumBytes = 0;
+ }
+
+ // FPDiff is the byte offset of the call's argument area from the callee's.
+ // Stores to callee stack arguments will be placed in FixedStackSlots offset
+ // by this amount for a tail call. In a sibling call it must be 0 because the
+ // caller will deallocate the entire stack and the callee still expects its
+ // arguments to begin at SP+0. Completely unused for non-tail calls.
+ int FPDiff = 0;
+
+ if (IsTailCall && !IsSibCall) {
+ unsigned NumReusableBytes = FuncInfo->getBytesInStackArgArea();
+
+ // Since callee will pop argument stack as a tail call, we must keep the
+ // popped size 16-byte aligned.
+ NumBytes = RoundUpToAlignment(NumBytes, 16);
+
+ // FPDiff will be negative if this tail call requires more space than we
+ // would automatically have in our incoming argument space. Positive if we
+ // can actually shrink the stack.
+ FPDiff = NumReusableBytes - NumBytes;
+
+ // The stack pointer must be 16-byte aligned at all times it's used for a
+ // memory operation, which in practice means at *all* times and in
+ // particular across call boundaries. Therefore our own arguments started at
+ // a 16-byte aligned SP and the delta applied for the tail call should
+ // satisfy the same constraint.
+ assert(FPDiff % 16 == 0 && "unaligned stack on tail call");
+ }
+
// Adjust the stack pointer for the new arguments...
// These operations are automatically eliminated by the prolog/epilog pass
- if (!IsTailCall)
+ if (!IsSibCall)
Chain =
DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true), DL);
@@ -2134,31 +2275,50 @@ SDValue ARM64TargetLowering::LowerCall(CallLoweringInfo &CLI,
RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
} else {
assert(VA.isMemLoc());
- // There's no reason we can't support stack args w/ tailcall, but
- // we currently don't, so assert if we see one.
- assert(!IsTailCall && "stack argument with tail call!?");
- // FIXME: This works on big-endian for composite byvals, which are the common
- // case. It should also work for fundamental types too.
+ SDValue DstAddr;
+ MachinePointerInfo DstInfo;
+
+ // FIXME: This works on big-endian for composite byvals, which are the
+ // common case. It should also work for fundamental types too.
uint32_t BEAlign = 0;
+ unsigned OpSize = Flags.isByVal() ? Flags.getByValSize() * 8
+ : VA.getLocVT().getSizeInBits();
+ OpSize = (OpSize + 7) / 8;
if (!Subtarget->isLittleEndian() && !Flags.isByVal()) {
- unsigned OpSize = (VA.getLocVT().getSizeInBits() + 7) / 8;
if (OpSize < 8)
BEAlign = 8 - OpSize;
}
-
unsigned LocMemOffset = VA.getLocMemOffset();
- SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset + BEAlign);
+ int32_t Offset = LocMemOffset + BEAlign;
+ SDValue PtrOff = DAG.getIntPtrConstant(Offset);
PtrOff = DAG.getNode(ISD::ADD, DL, getPointerTy(), StackPtr, PtrOff);
+ if (IsTailCall) {
+ Offset = Offset + FPDiff;
+ int FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true);
+
+ DstAddr = DAG.getFrameIndex(FI, getPointerTy());
+ DstInfo = MachinePointerInfo::getFixedStack(FI);
+
+ // Make sure any stack arguments overlapping with where we're storing
+ // are loaded before this eventual operation. Otherwise they'll be
+ // clobbered.
+ Chain = addTokenForArgument(Chain, DAG, MF.getFrameInfo(), FI);
+ } else {
+ SDValue PtrOff = DAG.getIntPtrConstant(Offset);
+
+ DstAddr = DAG.getNode(ISD::ADD, DL, getPointerTy(), StackPtr, PtrOff);
+ DstInfo = MachinePointerInfo::getStack(LocMemOffset);
+ }
+
if (Outs[i].Flags.isByVal()) {
SDValue SizeNode =
DAG.getConstant(Outs[i].Flags.getByValSize(), MVT::i64);
SDValue Cpy = DAG.getMemcpy(
- Chain, DL, PtrOff, Arg, SizeNode, Outs[i].Flags.getByValAlign(),
+ Chain, DL, DstAddr, Arg, SizeNode, Outs[i].Flags.getByValAlign(),
/*isVolatile = */ false,
- /*alwaysInline = */ false,
- MachinePointerInfo::getStack(LocMemOffset), MachinePointerInfo());
+ /*alwaysInline = */ false, DstInfo, MachinePointerInfo());
MemOpChains.push_back(Cpy);
} else {
@@ -2171,9 +2331,8 @@ SDValue ARM64TargetLowering::LowerCall(CallLoweringInfo &CLI,
VA.getLocVT() == MVT::i16))
Arg = DAG.getNode(ISD::TRUNCATE, DL, VA.getLocVT(), Arg);
- SDValue Store = DAG.getStore(Chain, DL, Arg, PtrOff,
- MachinePointerInfo::getStack(LocMemOffset),
- false, false, 0);
+ SDValue Store =
+ DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo, false, false, 0);
MemOpChains.push_back(Store);
}
}
@@ -2221,10 +2380,27 @@ SDValue ARM64TargetLowering::LowerCall(CallLoweringInfo &CLI,
Callee = DAG.getTargetExternalSymbol(Sym, getPointerTy(), 0);
}
+ // We don't usually want to end the call-sequence here because we would tidy
+ // the frame up *after* the call, however in the ABI-changing tail-call case
+ // we've carefully laid out the parameters so that when sp is reset they'll be
+ // in the correct location.
+ if (IsTailCall && !IsSibCall) {
+ Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true),
+ DAG.getIntPtrConstant(0, true), InFlag, DL);
+ InFlag = Chain.getValue(1);
+ }
+
std::vector<SDValue> Ops;
Ops.push_back(Chain);
Ops.push_back(Callee);
+ if (IsTailCall) {
+ // Each tail call may have to adjust the stack by a different amount, so
+ // this information must travel along with the operation for eventual
+ // consumption by emitEpilogue.
+ Ops.push_back(DAG.getTargetConstant(FPDiff, MVT::i32));
+ }
+
// Add argument registers to the end of the list so that they are known live
// into the call.
for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
@@ -2262,8 +2438,13 @@ SDValue ARM64TargetLowering::LowerCall(CallLoweringInfo &CLI,
Chain = DAG.getNode(ARM64ISD::CALL, DL, NodeTys, Ops);
InFlag = Chain.getValue(1);
+ uint64_t CalleePopBytes = DoesCalleeRestoreStack(CallConv, TailCallOpt)
+ ? RoundUpToAlignment(NumBytes, 16)
+ : 0;
+
Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true),
- DAG.getIntPtrConstant(0, true), InFlag, DL);
+ DAG.getIntPtrConstant(CalleePopBytes, true),
+ InFlag, DL);
if (!Ins.empty())
InFlag = Chain.getValue(1);
@@ -7441,9 +7622,6 @@ bool ARM64TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
// return instructions to help enable tail call optimizations for this
// instruction.
bool ARM64TargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const {
- if (!EnableARM64TailCalls)
- return false;
-
if (!CI->isTailCall())
return false;
diff --git a/lib/Target/ARM64/ARM64ISelLowering.h b/lib/Target/ARM64/ARM64ISelLowering.h
index fd29a51b40e..55792317dba 100644
--- a/lib/Target/ARM64/ARM64ISelLowering.h
+++ b/lib/Target/ARM64/ARM64ISelLowering.h
@@ -353,6 +353,16 @@ private:
const SmallVectorImpl<SDValue> &OutVals,
const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const;
+ /// Finds the incoming stack arguments which overlap the given fixed stack
+ /// object and incorporates their load into the current chain. This prevents
+ /// an upcoming store from clobbering the stack argument before it's used.
+ SDValue addTokenForArgument(SDValue Chain, SelectionDAG &DAG,
+ MachineFrameInfo *MFI, int ClobberedFI) const;
+
+ bool DoesCalleeRestoreStack(CallingConv::ID CallCC, bool TailCallOpt) const;
+
+ bool IsTailCallConvention(CallingConv::ID CallCC) const;
+
void saveVarArgRegisters(CCState &CCInfo, SelectionDAG &DAG, SDLoc DL,
SDValue &Chain) const;
diff --git a/lib/Target/ARM64/ARM64InstrInfo.td b/lib/Target/ARM64/ARM64InstrInfo.td
index bc39fe97b6f..4d5714b8d0c 100644
--- a/lib/Target/ARM64/ARM64InstrInfo.td
+++ b/lib/Target/ARM64/ARM64InstrInfo.td
@@ -88,7 +88,7 @@ def SDT_ARM64binvec : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>,
def SDT_ARM64trivec : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>,
SDTCisSameAs<0,2>,
SDTCisSameAs<0,3>]>;
-def SDT_ARM64TCRET : SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>;
+def SDT_ARM64TCRET : SDTypeProfile<0, 2, [SDTCisPtrTy<0>]>;
def SDT_ARM64PREFETCH : SDTypeProfile<0, 2, [SDTCisVT<0, i32>, SDTCisPtrTy<1>]>;
def SDT_ARM64ITOF : SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisSameAs<0,1>]>;
@@ -5163,12 +5163,15 @@ def : Pat<(i64 (int_arm64_neon_urshl (i64 FPR64:$Rn), (i64 FPR64:$Rm))),
// Tail call return handling. These are all compiler pseudo-instructions,
// so no encoding information or anything like that.
let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [SP] in {
- def TCRETURNdi : Pseudo<(outs), (ins i64imm:$dst), []>;
- def TCRETURNri : Pseudo<(outs), (ins tcGPR64:$dst), []>;
+ def TCRETURNdi : Pseudo<(outs), (ins i64imm:$dst, i32imm:$FPDiff),[]>;
+ def TCRETURNri : Pseudo<(outs), (ins tcGPR64:$dst, i32imm:$FPDiff), []>;
}
-def : Pat<(ARM64tcret tcGPR64:$dst), (TCRETURNri tcGPR64:$dst)>;
-def : Pat<(ARM64tcret (i64 tglobaladdr:$dst)), (TCRETURNdi texternalsym:$dst)>;
-def : Pat<(ARM64tcret (i64 texternalsym:$dst)), (TCRETURNdi texternalsym:$dst)>;
+def : Pat<(ARM64tcret tcGPR64:$dst, (i32 timm:$FPDiff)),
+ (TCRETURNri tcGPR64:$dst, imm:$FPDiff)>;
+def : Pat<(ARM64tcret tglobaladdr:$dst, (i32 timm:$FPDiff)),
+ (TCRETURNdi texternalsym:$dst, imm:$FPDiff)>;
+def : Pat<(ARM64tcret texternalsym:$dst, (i32 timm:$FPDiff)),
+ (TCRETURNdi texternalsym:$dst, imm:$FPDiff)>;
include "ARM64InstrAtomics.td"
diff --git a/lib/Target/ARM64/ARM64MachineFunctionInfo.h b/lib/Target/ARM64/ARM64MachineFunctionInfo.h
index 02bf7cf6fd0..0b6f4f1ec64 100644
--- a/lib/Target/ARM64/ARM64MachineFunctionInfo.h
+++ b/lib/Target/ARM64/ARM64MachineFunctionInfo.h
@@ -25,6 +25,22 @@ namespace llvm {
/// contains private ARM64-specific information for each MachineFunction.
class ARM64FunctionInfo : public MachineFunctionInfo {
+ /// Number of bytes of arguments this function has on the stack. If the callee
+ /// is expected to restore the argument stack this should be a multiple of 16,
+ /// all usable during a tail call.
+ ///
+ /// The alternative would forbid tail call optimisation in some cases: if we
+ /// want to transfer control from a function with 8-bytes of stack-argument
+ /// space to a function with 16-bytes then misalignment of this value would
+ /// make a stack adjustment necessary, which could not be undone by the
+ /// callee.
+ unsigned BytesInStackArgArea;
+
+ /// The number of bytes to restore to deallocate space for incoming
+ /// arguments. Canonically 0 in the C calling convention, but non-zero when
+ /// callee is expected to pop the args.
+ unsigned ArgumentStackToRestore;
+
/// HasStackFrame - True if this function has a stack frame. Set by
/// processFunctionBeforeCalleeSavedScan().
bool HasStackFrame;
@@ -58,17 +74,25 @@ class ARM64FunctionInfo : public MachineFunctionInfo {
public:
ARM64FunctionInfo()
- : HasStackFrame(false), NumLocalDynamicTLSAccesses(0),
- VarArgsStackIndex(0), VarArgsGPRIndex(0), VarArgsGPRSize(0),
- VarArgsFPRIndex(0), VarArgsFPRSize(0) {}
+ : BytesInStackArgArea(0), ArgumentStackToRestore(0), HasStackFrame(false),
+ NumLocalDynamicTLSAccesses(0), VarArgsStackIndex(0), VarArgsGPRIndex(0),
+ VarArgsGPRSize(0), VarArgsFPRIndex(0), VarArgsFPRSize(0) {}
explicit ARM64FunctionInfo(MachineFunction &MF)
- : HasStackFrame(false), NumLocalDynamicTLSAccesses(0),
- VarArgsStackIndex(0), VarArgsGPRIndex(0), VarArgsGPRSize(0),
- VarArgsFPRIndex(0), VarArgsFPRSize(0) {
+ : BytesInStackArgArea(0), ArgumentStackToRestore(0), HasStackFrame(false),
+ NumLocalDynamicTLSAccesses(0), VarArgsStackIndex(0), VarArgsGPRIndex(0),
+ VarArgsGPRSize(0), VarArgsFPRIndex(0), VarArgsFPRSize(0) {
(void)MF;
}
+ unsigned getBytesInStackArgArea() const { return BytesInStackArgArea; }
+ void setBytesInStackArgArea(unsigned bytes) { BytesInStackArgArea = bytes; }
+
+ unsigned getArgumentStackToRestore() const { return ArgumentStackToRestore; }
+ void setArgumentStackToRestore(unsigned bytes) {
+ ArgumentStackToRestore = bytes;
+ }
+
bool hasStackFrame() const { return HasStackFrame; }
void setHasStackFrame(bool s) { HasStackFrame = s; }
diff --git a/test/CodeGen/AArch64/fastcc-reserved.ll b/test/CodeGen/AArch64/fastcc-reserved.ll
index c6c050570dd..97410aa5026 100644
--- a/test/CodeGen/AArch64/fastcc-reserved.ll
+++ b/test/CodeGen/AArch64/fastcc-reserved.ll
@@ -1,4 +1,5 @@
; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -tailcallopt | FileCheck %s
+; RUN: llc -verify-machineinstrs < %s -mtriple=arm64-none-linux-gnu -tailcallopt | FileCheck %s --check-prefix=CHECK-ARM64
; This test is designed to be run in the situation where the
; call-frame is not reserved (hence disable-fp-elim), but where
@@ -14,20 +15,28 @@ define fastcc void @foo(i32 %in) {
; Normal frame setup stuff:
; CHECK: sub sp, sp,
; CHECK: stp x29, x30
+; CHECK-ARM64: stp x29, x30, [sp, #-16]!
+; CHECK-ARM64: mov x29, sp
; Reserve space for call-frame:
; CHECK: sub sp, sp, #16
+; CHECK-ARM64: sub sp, sp, #16
call fastcc void @will_pop([8 x i32] undef, i32 42)
; CHECK: bl will_pop
+; CHECK-ARM64: bl will_pop
; Since @will_pop is fastcc with tailcallopt, it will put the stack
; back where it needs to be, we shouldn't duplicate that
; CHECK-NOT: sub sp, sp, #16
; CHECK-NOT: add sp, sp,
+; CHECK-ARM64-NOT: sub sp, sp, #16
+; CHECK-ARM64-NOT: add sp, sp,
; CHECK: ldp x29, x30
; CHECK: add sp, sp,
+; CHECK-ARM64: mov sp, x29
+; CHECK-ARM64: ldp x29, x30, [sp], #16
ret void
}
@@ -40,19 +49,26 @@ define void @foo1(i32 %in) {
; Normal frame setup again
; CHECK: sub sp, sp,
; CHECK: stp x29, x30
+; CHECK-ARM64: stp x29, x30, [sp, #-16]!
+; CHECK-ARM64: mov x29, sp
; Reserve space for call-frame
; CHECK: sub sp, sp, #16
+; CHECK-ARM64: sub sp, sp, #16
call void @wont_pop([8 x i32] undef, i32 42)
; CHECK: bl wont_pop
+; CHECK-ARM64: bl wont_pop
; This time we *do* need to unreserve the call-frame
; CHECK: add sp, sp, #16
+; CHECK-ARM64: add sp, sp, #16
; Check for epilogue (primarily to make sure sp spotted above wasn't
; part of it).
; CHECK: ldp x29, x30
; CHECK: add sp, sp,
+; CHECK-ARM64: mov sp, x29
+; CHECK-ARM64: ldp x29, x30, [sp], #16
ret void
}
diff --git a/test/CodeGen/AArch64/fastcc.ll b/test/CodeGen/AArch64/fastcc.ll
index a4cd37858ee..fb9b4ac57ed 100644
--- a/test/CodeGen/AArch64/fastcc.ll
+++ b/test/CodeGen/AArch64/fastcc.ll
@@ -1,5 +1,7 @@
; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -tailcallopt | FileCheck %s -check-prefix CHECK-TAIL
+; RUN: llc -verify-machineinstrs < %s -mtriple=arm64-none-linux-gnu -tailcallopt | FileCheck %s -check-prefix CHECK-ARM64-TAIL
; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s
+; RUN: llc -verify-machineinstrs < %s -mtriple=arm64-none-linux-gnu | FileCheck --check-prefix=CHECK-ARM64 %s
; Without tailcallopt fastcc still means the caller cleans up the
; stack, so try to make sure this is respected.
@@ -8,116 +10,219 @@ define fastcc void @func_stack0() {
; CHECK-LABEL: func_stack0:
; CHECK: sub sp, sp, #48
+; CHECK-ARM64-LABEL: func_stack0:
+; CHECK-ARM64: stp x29, x30, [sp, #-16]!
+; CHECK-ARM64-NEXT: mov x29, sp
+; CHECK-ARM64-NEXT: sub sp, sp, #32
+
; CHECK-TAIL-LABEL: func_stack0:
; CHECK-TAIL: sub sp, sp, #48
+; CHECK-ARM64-TAIL-LABEL: func_stack0:
+; CHECK-ARM64-TAIL: stp x29, x30, [sp, #-16]!
+; CHECK-ARM64-TAIL-NEXT: mov x29, sp
+; CHECK-ARM64-TAIL-NEXT: sub sp, sp, #32
+
call fastcc void @func_stack8([8 x i32] undef, i32 42)
; CHECK: bl func_stack8
; CHECK-NOT: sub sp, sp,
+; CHECK-ARM64: bl func_stack8
+; CHECK-ARM64-NOT: sub sp, sp,
+
; CHECK-TAIL: bl func_stack8
; CHECK-TAIL: sub sp, sp, #16
+; CHECK-ARM64-TAIL: bl func_stack8
+; CHECK-ARM64-TAIL: sub sp, sp, #16
+
call fastcc void @func_stack32([8 x i32] undef, i128 0, i128 9)
; CHECK: bl func_stack32
; CHECK-NOT: sub sp, sp,
+; CHECK-ARM64: bl func_stack32
+; CHECK-ARM64-NOT: sub sp, sp,
+
; CHECK-TAIL: bl func_stack32
; CHECK-TAIL: sub sp, sp, #32
+; CHECK-ARM64-TAIL: bl func_stack32
+; CHECK-ARM64-TAIL: sub sp, sp, #32
+
call fastcc void @func_stack0()
; CHECK: bl func_stack0
; CHECK-NOT: sub sp, sp
+; CHECK-ARM64: bl func_stack0
+; CHECK-ARM64-NOT: sub sp, sp
+
; CHECK-TAIL: bl func_stack0
; CHECK-TAIL-NOT: sub sp, sp
+; CHECK-ARM64-TAIL: bl func_stack0
+; CHECK-ARM64-TAIL-NOT: sub sp, sp
+
ret void
; CHECK: add sp, sp, #48
; CHECK-NEXT: ret
+; CHECK-ARM64: mov sp, x29
+; CHECK-ARM64-NEXT: ldp x29, x30, [sp], #16
+; CHECK-ARM64-NEXT: ret
+
; CHECK-TAIL: add sp, sp, #48
; CHECK-TAIL-NEXT: ret
+; CHECK-ARM64-TAIL: mov sp, x29
+; CHECK-ARM64-TAIL-NEXT: ldp x29, x30, [sp], #16
+; CHECK-ARM64-TAIL-NEXT: ret
}
define fastcc void @func_stack8([8 x i32], i32 %stacked) {
; CHECK-LABEL: func_stack8:
; CHECK: sub sp, sp, #48
+; CHECK-ARM64-LABEL: func_stack8:
+; CHECK-ARM64: stp x29, x30, [sp, #-16]!
+; CHECK-ARM64: mov x29, sp
+; CHECK-ARM64: sub sp, sp, #32
+
; CHECK-TAIL-LABEL: func_stack8:
; CHECK-TAIL: sub sp, sp, #48
+; CHECK-ARM64-TAIL-LABEL: func_stack8:
+; CHECK-ARM64-TAIL: stp x29, x30, [sp, #-16]!
+; CHECK-ARM64-TAIL: mov x29, sp
+; CHECK-ARM64-TAIL: sub sp, sp, #32
+
call fastcc void @func_stack8([8 x i32] undef, i32 42)
; CHECK: bl func_stack8
; CHECK-NOT: sub sp, sp,
+; CHECK-ARM64: bl func_stack8
+; CHECK-ARM64-NOT: sub sp, sp,
+
; CHECK-TAIL: bl func_stack8
; CHECK-TAIL: sub sp, sp, #16
+; CHECK-ARM64-TAIL: bl func_stack8
+; CHECK-ARM64-TAIL: sub sp, sp, #16
+
call fastcc void @func_stack32([8 x i32] undef, i128 0, i128 9)
; CHECK: bl func_stack32
; CHECK-NOT: sub sp, sp,
+; CHECK-ARM64: bl func_stack32
+; CHECK-ARM64-NOT: sub sp, sp,
+
; CHECK-TAIL: bl func_stack32
; CHECK-TAIL: sub sp, sp, #32
+; CHECK-ARM64-TAIL: bl func_stack32
+; CHECK-ARM64-TAIL: sub sp, sp, #32
+
call fastcc void @func_stack0()
; CHECK: bl func_stack0
; CHECK-NOT: sub sp, sp
+; CHECK-ARM64: bl func_stack0
+; CHECK-ARM64-NOT: sub sp, sp
+
; CHECK-TAIL: bl func_stack0
; CHECK-TAIL-NOT: sub sp, sp
+; CHECK-ARM64-TAIL: bl func_stack0
+; CHECK-ARM64-TAIL-NOT: sub sp, sp
+
ret void
; CHECK: add sp, sp, #48
; CHECK-NEXT: ret
+; CHECK-ARM64: mov sp, x29
+; CHECK-ARM64-NEXT: ldp x29, x30, [sp], #16
+; CHECK-ARM64-NEXT: ret
+
; CHECK-TAIL: add sp, sp, #64
; CHECK-TAIL-NEXT: ret
+
+; CHECK-ARM64-TAIL: mov sp, x29
+; CHECK-ARM64-TAIL-NEXT: ldp x29, x30, [sp], #16
+; CHECK-ARM64-TAIL-NEXT: ret
}
define fastcc void @func_stack32([8 x i32], i128 %stacked0, i128 %stacked1) {
; CHECK-LABEL: func_stack32:
; CHECK: sub sp, sp, #48
+; CHECK-ARM64-LABEL: func_stack32:
+; CHECK-ARM64: mov x29, sp
+
; CHECK-TAIL-LABEL: func_stack32:
; CHECK-TAIL: sub sp, sp, #48
+; CHECK-ARM64-TAIL-LABEL: func_stack32:
+; CHECK-ARM64-TAIL: mov x29, sp
+
call fastcc void @func_stack8([8 x i32] undef, i32 42)
; CHECK: bl func_stack8
; CHECK-NOT: sub sp, sp,
+; CHECK-ARM64: bl func_stack8
+; CHECK-ARM64-NOT: sub sp, sp,
+
; CHECK-TAIL: bl func_stack8
; CHECK-TAIL: sub sp, sp, #16
+; CHECK-ARM64-TAIL: bl func_stack8
+; CHECK-ARM64-TAIL: sub sp, sp, #16
+
call fastcc void @func_stack32([8 x i32] undef, i128 0, i128 9)
; CHECK: bl func_stack32
; CHECK-NOT: sub sp, sp,
+; CHECK-ARM64: bl func_stack32
+; CHECK-ARM64-NOT: sub sp, sp,
+
; CHECK-TAIL: bl func_stack32
; CHECK-TAIL: sub sp, sp, #32
+; CHECK-ARM64-TAIL: bl func_stack32
+; CHECK-ARM64-TAIL: sub sp, sp, #32
+
call fastcc void @func_stack0()
; CHECK: bl func_stack0
; CHECK-NOT: sub sp, sp
+; CHECK-ARM64: bl func_stack0
+; CHECK-ARM64-NOT: sub sp, sp
+
; CHECK-TAIL: bl func_stack0
; CHECK-TAIL-NOT: sub sp, sp
+; CHECK-ARM64-TAIL: bl func_stack0
+; CHECK-ARM64-TAIL-NOT: sub sp, sp
+
ret void
; CHECK: add sp, sp, #48
; CHECK-NEXT: ret
+; CHECK-ARM64: mov sp, x29
+; CHECK-ARM64-NEXT: ldp x29, x30, [sp], #16
+; CHECK-ARM64-NEXT: ret
+
; CHECK-TAIL: add sp, sp, #80
; CHECK-TAIL-NEXT: ret
+
+; CHECK-ARM64-TAIL: mov sp, x29
+; CHECK-ARM64-TAIL-NEXT: ldp x29, x30, [sp], #16
+; CHECK-ARM64-TAIL-NEXT: ret
}
diff --git a/test/CodeGen/AArch64/tail-call.ll b/test/CodeGen/AArch64/tail-call.ll
index 81885f10851..da05848dcc5 100644
--- a/test/CodeGen/AArch64/tail-call.ll
+++ b/test/CodeGen/AArch64/tail-call.ll
@@ -1,4 +1,5 @@
; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -tailcallopt | FileCheck %s
+; RUN: llc -verify-machineinstrs < %s -mtriple=arm64-none-linux-gnu -tailcallopt | FileCheck --check-prefix=CHECK-ARM64 %s
declare fastcc void @callee_stack0()
declare fastcc void @callee_stack8([8 x i32], i64)
@@ -7,57 +8,92 @@ declare fastcc void @callee_stack16([8 x i32], i64, i64)
define fastcc void @caller_to0_from0() nounwind {
; CHECK-LABEL: caller_to0_from0:
; CHECK-NEXT: // BB
+
+; CHECK-ARM64-LABEL: caller_to0_from0:
+; CHECK-ARM64-NEXT: // BB
+
tail call fastcc void @callee_stack0()
ret void
+
; CHECK-NEXT: b callee_stack0
+
+; CHECK-ARM64-NEXT: b callee_stack0
}
define fastcc void @caller_to0_from8([8 x i32], i64) {
; CHECK-LABEL: caller_to0_from8:
+; CHECK-ARM64-LABEL: caller_to0_from8:
+
tail call fastcc void @callee_stack0()
ret void
+
; CHECK: add sp, sp, #16
; CHECK-NEXT: b callee_stack0
+
+; CHECK-ARM64: add sp, sp, #16
+; CHECK-ARM64-NEXT: b callee_stack0
}
define fastcc void @caller_to8_from0() {
; CHECK-LABEL: caller_to8_from0:
; CHECK: sub sp, sp, #32
+; CHECK-ARM64-LABEL: caller_to8_from0:
+; CHECK-ARM64: sub sp, sp, #32
+
; Key point is that the "42" should go #16 below incoming stack
; pointer (we didn't have arg space to reuse).
tail call fastcc void @callee_stack8([8 x i32] undef, i64 42)
ret void
+
; CHECK: str {{x[0-9]+}}, [sp, #16]
; CHECK-NEXT: add sp, sp, #16
; CHECK-NEXT: b callee_stack8
+
+; CHECK-ARM64: str {{x[0-9]+}}, [sp, #16]!
+; CHECK-ARM64-NEXT: b callee_stack8
}
define fastcc void @caller_to8_from8([8 x i32], i64 %a) {
; CHECK-LABEL: caller_to8_from8:
; CHECK: sub sp, sp, #16
+; CHECK-ARM64-LABEL: caller_to8_from8:
+; CHECK-ARM64: sub sp, sp, #16
+
; Key point is that the outgoing argument should go where "%a" was: at SP on entry.
tail call fastcc void @callee_stack8([8 x i32] undef, i64 42)
ret void
+
; CHECK: str {{x[0-9]+}}, [sp, #16]
; CHECK-NEXT: add sp, sp, #16
; CHECK-NEXT: b callee_stack8
+
+; CHECK-ARM64: str {{x[0-9]+}}, [sp, #16]!
+; CHECK-ARM64-NEXT: b callee_stack8
}
define fastcc void @caller_to16_from8([8 x i32], i64 %a) {
; CHECK-LABEL: caller_to16_from8:
; CHECK: sub sp, sp, #16
+; CHECK-ARM64-LABEL: caller_to16_from8:
+; CHECK-ARM64: sub sp, sp, #16
+
; Important point is that the call reuses the "dead" argument space
; above %a on the stack. If it tries to go below incoming-SP then the
; callee will not deallocate the space, even in fastcc.
tail call fastcc void @callee_stack16([8 x i32] undef, i64 42, i64 2)
+
; CHECK: str {{x[0-9]+}}, [sp, #24]
; CHECK: str {{x[0-9]+}}, [sp, #16]
-; CHECK: add sp, sp, #16
-; CHECK: b callee_stack16
+; CHECK-NEXT: add sp, sp, #16
+; CHECK-NEXT: b callee_stack16
+
+; CHECK-ARM64: stp {{x[0-9]+}}, {{x[0-9]+}}, [sp, #16]
+; CHECK-ARM64-NEXT: add sp, sp, #16
+; CHECK-ARM64-NEXT: b callee_stack16
ret void
}
@@ -66,12 +102,19 @@ define fastcc void @caller_to8_from24([8 x i32], i64 %a, i64 %b, i64 %c) {
; CHECK-LABEL: caller_to8_from24:
; CHECK: sub sp, sp, #16
+; CHECK-ARM64-LABEL: caller_to8_from24:
+; CHECK-ARM64: sub sp, sp, #16
+
; Key point is that the outgoing argument should go #16 above SP on entry.
tail call fastcc void @callee_stack8([8 x i32] undef, i64 42)
ret void
+
; CHECK: str {{x[0-9]+}}, [sp, #32]
; CHECK-NEXT: add sp, sp, #32
; CHECK-NEXT: b callee_stack8
+
+; CHECK-ARM64: str {{x[0-9]+}}, [sp, #32]!
+; CHECK-ARM64-NEXT: b callee_stack8
}
@@ -79,6 +122,9 @@ define fastcc void @caller_to16_from16([8 x i32], i64 %a, i64 %b) {
; CHECK-LABEL: caller_to16_from16:
; CHECK: sub sp, sp, #16
+; CHECK-ARM64-LABEL: caller_to16_from16:
+; CHECK-ARM64: sub sp, sp, #16
+
; Here we want to make sure that both loads happen before the stores:
; otherwise either %a or %b will be wrongly clobbered.
tail call fastcc void @callee_stack16([8 x i32] undef, i64 %b, i64 %a)
@@ -89,6 +135,11 @@ define fastcc void @caller_to16_from16([8 x i32], i64 %a, i64 %b) {
; CHECK: str x1,
; CHECK: str x0,
-; CHECK: add sp, sp, #16
-; CHECK: b callee_stack16
+; CHECK-NEXT: add sp, sp, #16
+; CHECK-NEXT: b callee_stack16
+
+; CHECK-ARM64: ldp {{x[0-9]+}}, {{x[0-9]+}}, [sp, #16]
+; CHECK-ARM64: stp {{x[0-9]+}}, {{x[0-9]+}}, [sp, #16]
+; CHECK-ARM64-NEXT: add sp, sp, #16
+; CHECK-ARM64-NEXT: b callee_stack16
}