-rw-r--r--  lib/Target/ARM64/ARM64FrameLowering.cpp      105
-rw-r--r--  lib/Target/ARM64/ARM64ISelLowering.cpp       312
-rw-r--r--  lib/Target/ARM64/ARM64ISelLowering.h          10
-rw-r--r--  lib/Target/ARM64/ARM64InstrInfo.td            15
-rw-r--r--  lib/Target/ARM64/ARM64MachineFunctionInfo.h   36
-rw-r--r--  test/CodeGen/AArch64/fastcc-reserved.ll       16
-rw-r--r--  test/CodeGen/AArch64/fastcc.ll               105
-rw-r--r--  test/CodeGen/AArch64/tail-call.ll             59
8 files changed, 553 insertions, 105 deletions
diff --git a/lib/Target/ARM64/ARM64FrameLowering.cpp b/lib/Target/ARM64/ARM64FrameLowering.cpp
index 471ec2092d9..3b14649c487 100644
--- a/lib/Target/ARM64/ARM64FrameLowering.cpp
+++ b/lib/Target/ARM64/ARM64FrameLowering.cpp
@@ -107,32 +107,44 @@ bool ARM64FrameLowering::hasReservedCallFrame(const MachineFunction &MF) const {
void ARM64FrameLowering::eliminateCallFramePseudoInstr(
MachineFunction &MF, MachineBasicBlock &MBB,
MachineBasicBlock::iterator I) const {
- const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
const ARM64InstrInfo *TII =
static_cast<const ARM64InstrInfo *>(MF.getTarget().getInstrInfo());
+ DebugLoc DL = I->getDebugLoc();
+ int Opc = I->getOpcode();
+ bool IsDestroy = Opc == TII->getCallFrameDestroyOpcode();
+ uint64_t CalleePopAmount = IsDestroy ? I->getOperand(1).getImm() : 0;
+
+ const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
if (!TFI->hasReservedCallFrame(MF)) {
- // If we have alloca, convert as follows:
- // ADJCALLSTACKDOWN -> sub, sp, sp, amount
- // ADJCALLSTACKUP -> add, sp, sp, amount
- MachineInstr *Old = I;
- DebugLoc DL = Old->getDebugLoc();
- unsigned Amount = Old->getOperand(0).getImm();
- if (Amount != 0) {
- // We need to keep the stack aligned properly. To do this, we round the
- // amount of space needed for the outgoing arguments up to the next
- // alignment boundary.
- unsigned Align = TFI->getStackAlignment();
- Amount = (Amount + Align - 1) / Align * Align;
-
- // Replace the pseudo instruction with a new instruction...
- unsigned Opc = Old->getOpcode();
- if (Opc == ARM64::ADJCALLSTACKDOWN) {
- emitFrameOffset(MBB, I, DL, ARM64::SP, ARM64::SP, -Amount, TII);
- } else {
- assert(Opc == ARM64::ADJCALLSTACKUP && "expected ADJCALLSTACKUP");
- emitFrameOffset(MBB, I, DL, ARM64::SP, ARM64::SP, Amount, TII);
- }
+ unsigned Align = getStackAlignment();
+
+ int64_t Amount = I->getOperand(0).getImm();
+ Amount = RoundUpToAlignment(Amount, Align);
+ if (!IsDestroy)
+ Amount = -Amount;
+
+ // N.b. if CalleePopAmount is valid but zero (i.e. callee would pop, but it
+ // doesn't have to pop anything), then the first operand will be zero too so
+ // this adjustment is a no-op.
+ if (CalleePopAmount == 0) {
+ // FIXME: in-function stack adjustment for calls is limited to 24-bits
+ // because there's no guaranteed temporary register available.
+ //
+ // ADD/SUB (immediate) has only LSL #0 and LSL #12 available.
+ // 1) For offset <= 12-bit, we use LSL #0
+ // 2) For 12-bit <= offset <= 24-bit, we use two instructions. One uses
+ // LSL #0, and the other uses LSL #12.
+ //
+ // Most call frames will be allocated at the start of a function, so
+ // this is OK, but it is a limitation that needs dealing with.
+ assert(Amount > -0xffffff && Amount < 0xffffff && "call frame too large");
+ emitFrameOffset(MBB, I, DL, ARM64::SP, ARM64::SP, Amount, TII);
}
+ } else if (CalleePopAmount != 0) {
+ // If the calling convention demands that the callee pops arguments from the
+ // stack, we want to add it back if we have a reserved call frame.
+ assert(CalleePopAmount < 0xffffff && "call frame too large");
+ emitFrameOffset(MBB, I, DL, ARM64::SP, ARM64::SP, -CalleePopAmount, TII);
}
MBB.erase(I);
}
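
For illustration, the non-reserved-call-frame path above boils down to a small amount computation. The following is a sketch only, with illustrative names, assuming the 16-byte ARM64 stack alignment and the usual round-up-to-alignment behaviour; it is not code from the patch.

    // Sketch of the SP adjustment chosen for ADJCALLSTACKDOWN/ADJCALLSTACKUP
    // when no call frame is reserved. Returns the signed offset applied to SP.
    static int64_t callFrameAdjustment(bool IsDestroy, int64_t ArgBytes,
                                       uint64_t CalleePopAmount,
                                       unsigned Align = 16) {
      int64_t Amount = (ArgBytes + Align - 1) / Align * Align; // round up
      if (!IsDestroy)
        Amount = -Amount;              // ADJCALLSTACKDOWN moves SP down
      // If the callee pops its own arguments (fastcc with -tailcallopt), the
      // matching ADJCALLSTACKUP needs no adjustment: SP is already restored.
      return CalleePopAmount == 0 ? Amount : 0;
    }

The 24-bit assert above exists because, per the FIXME, a larger immediate would need a scratch register that is not guaranteed to be free at this point.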
@@ -420,8 +432,57 @@ void ARM64FrameLowering::emitEpilogue(MachineFunction &MF,
const ARM64RegisterInfo *RegInfo =
static_cast<const ARM64RegisterInfo *>(MF.getTarget().getRegisterInfo());
DebugLoc DL = MBBI->getDebugLoc();
+ unsigned RetOpcode = MBBI->getOpcode();
int NumBytes = MFI->getStackSize();
+ const ARM64FunctionInfo *AFI = MF.getInfo<ARM64FunctionInfo>();
+
+ // Initial and residual are named for consistency with the prologue. Note that
+ // in the epilogue, the residual adjustment is executed first.
+ uint64_t ArgumentPopSize = 0;
+ if (RetOpcode == ARM64::TCRETURNdi || RetOpcode == ARM64::TCRETURNri) {
+ MachineOperand &StackAdjust = MBBI->getOperand(1);
+
+ // For a tail-call in a callee-pops-arguments environment, some or all of
+ // the stack may actually be in use for the call's arguments; this is
+ // calculated during LowerCall and consumed here...
+ ArgumentPopSize = StackAdjust.getImm();
+ } else {
+ // ... otherwise the amount to pop is *all* of the argument space,
+ // conveniently stored in the MachineFunctionInfo by
+ // LowerFormalArguments. This will, of course, be zero for the C calling
+ // convention.
+ ArgumentPopSize = AFI->getArgumentStackToRestore();
+ }
+
+ // The stack frame should be like below,
+ //
+ //      ----------------------                     ---
+ //      |                    |                      |
+ //      | BytesInStackArgArea|              CalleeArgStackSize
+ //      | (NumReusableBytes) |                (of tail call)
+ //      |                    |                     ---
+ //      |                    |                      |
+ //      ---------------------|        ---           |
+ //      |                    |         |            |
+ //      |   CalleeSavedReg   |         |            |
+ //      | (NumRestores * 16) |         |            |
+ //      |                    |         |            |
+ //      ---------------------|         |         NumBytes
+ //      |                    |     StackSize  (StackAdjustUp)
+ //      |   LocalStackSize   |         |            |
+ //      | (covering callee   |         |            |
+ //      |       args)        |         |            |
+ //      |                    |         |            |
+ //      ----------------------        ---          ---
+ //
+ // So NumBytes = StackSize + BytesInStackArgArea - CalleeArgStackSize
+ // = StackSize + ArgumentPopSize
+ //
+ // ARM64TargetLowering::LowerCall figures out ArgumentPopSize and keeps
+ // it as the 2nd argument of ARM64ISD::TC_RETURN.
+ NumBytes += ArgumentPopSize;
+
unsigned NumRestores = 0;
// Move past the restores of the callee-saved registers.
MachineBasicBlock::iterator LastPopI = MBBI;
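
A quick worked example of the arithmetic above: a fastcc function compiled with -tailcallopt that takes 8 bytes of stack arguments records ArgumentStackToRestore = 16 (8 rounded up to the 16-byte stack alignment), so with a 48-byte local frame a plain return restores 48 + 16 = 64 bytes rather than 48. A tail-call return instead takes its pop size from the FPDiff-derived immediate that LowerCall attaches to the TCRETURN node. The CHECK-TAIL lines in the fastcc.ll test below ("add sp, sp, #64" after a "sub sp, sp, #48" prologue) exercise exactly this arithmetic on the existing AArch64 backend.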
diff --git a/lib/Target/ARM64/ARM64ISelLowering.cpp b/lib/Target/ARM64/ARM64ISelLowering.cpp
index f00e829070a..118007fa70c 100644
--- a/lib/Target/ARM64/ARM64ISelLowering.cpp
+++ b/lib/Target/ARM64/ARM64ISelLowering.cpp
@@ -39,12 +39,6 @@ using namespace llvm;
STATISTIC(NumTailCalls, "Number of tail calls");
STATISTIC(NumShiftInserts, "Number of vector shift inserts");
-// This option should go away when tail calls fully work.
-static cl::opt<bool>
-EnableARM64TailCalls("arm64-tail-calls", cl::Hidden,
- cl::desc("Generate ARM64 tail calls (TEMPORARY OPTION)."),
- cl::init(true));
-
enum AlignMode {
StrictAlign,
NoStrictAlign
@@ -1811,6 +1805,27 @@ SDValue ARM64TargetLowering::LowerFormalArguments(
AFI->setVarArgsStackIndex(MFI->CreateFixedObject(4, StackOffset, true));
}
+ ARM64FunctionInfo *FuncInfo = MF.getInfo<ARM64FunctionInfo>();
+ unsigned StackArgSize = CCInfo.getNextStackOffset();
+ bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
+ if (DoesCalleeRestoreStack(CallConv, TailCallOpt)) {
+ // This is a non-standard ABI so by fiat I say we're allowed to make full
+ // use of the stack area to be popped, which must be aligned to 16 bytes in
+ // any case:
+ StackArgSize = RoundUpToAlignment(StackArgSize, 16);
+
+ // If we're expected to restore the stack (e.g. fastcc) then we'll be adding
+ // a multiple of 16.
+ FuncInfo->setArgumentStackToRestore(StackArgSize);
+
+ // This realignment carries over to the available bytes below. Our own
+ // callers will guarantee the space is free by giving an aligned value to
+ // CALLSEQ_START.
+ }
+ // Even if we're not expected to free up the space, it's useful to know how
+ // much is there while considering tail calls (because we can reuse it).
+ FuncInfo->setBytesInStackArgArea(StackArgSize);
+
return Chain;
}
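
A minimal sketch of what the rounding above computes (the helper name is illustrative), together with the values the two new ARM64FunctionInfo fields would hold for a callee taking 8 bytes of stack arguments:

    // Equivalent of RoundUpToAlignment(Bytes, 16) for the power-of-two case.
    static uint64_t roundUpTo16(uint64_t Bytes) {
      return (Bytes + 15) & ~uint64_t(15);   // 8 -> 16, 16 -> 16, 24 -> 32
    }
    // C calling convention:      BytesInStackArgArea = 8,  ArgumentStackToRestore = 0
    // fastcc with -tailcallopt:  BytesInStackArgArea = 16, ArgumentStackToRestore = 16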
@@ -1942,57 +1957,147 @@ bool ARM64TargetLowering::isEligibleForTailCallOptimization(
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals,
const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
- // Look for obvious safe cases to perform tail call optimization that do not
- // require ABI changes. This is what gcc calls sibcall.
-
- // Do not sibcall optimize vararg calls unless the call site is not passing
- // any arguments.
- if (isVarArg && !Outs.empty())
+ // For CallingConv::C this function knows whether the ABI needs
+ // changing. That's not true for other conventions so they will have to opt in
+ // manually.
+ if (!IsTailCallConvention(CalleeCC) && CalleeCC != CallingConv::C)
return false;
- // Also avoid sibcall optimization if either caller or callee uses struct
- // return semantics.
- if (isCalleeStructRet || isCallerStructRet)
+ const MachineFunction &MF = DAG.getMachineFunction();
+ const Function *CallerF = MF.getFunction();
+ CallingConv::ID CallerCC = CallerF->getCallingConv();
+ bool CCMatch = CallerCC == CalleeCC;
+
+ // Byval parameters hand the function a pointer directly into the stack area
+ // we want to reuse during a tail call. Working around this *is* possible (see
+ // X86) but less efficient and uglier in LowerCall.
+ for (Function::const_arg_iterator i = CallerF->arg_begin(),
+ e = CallerF->arg_end();
+ i != e; ++i)
+ if (i->hasByValAttr())
+ return false;
+
+ if (getTargetMachine().Options.GuaranteedTailCallOpt) {
+ if (IsTailCallConvention(CalleeCC) && CCMatch)
+ return true;
return false;
+ }
- // Note that currently ARM64 "C" calling convention and "Fast" calling
- // convention are compatible. If/when that ever changes, we'll need to
- // add checks here to make sure any interactions are OK.
+ // Now we search for cases where we can use a tail call without changing the
+ // ABI. Sibcall is used in some places (particularly gcc) to refer to this
+ // concept.
- // If the callee takes no arguments then go on to check the results of the
- // call.
- if (!Outs.empty()) {
- // Check if stack adjustment is needed. For now, do not do this if any
- // argument is passed on the stack.
+ // I want anyone implementing a new calling convention to think long and hard
+ // about this assert.
+ assert((!isVarArg || CalleeCC == CallingConv::C) &&
+ "Unexpected variadic calling convention");
+
+ if (isVarArg && !Outs.empty()) {
+ // At least two cases here: if caller is fastcc then we can't have any
+ // memory arguments (we'd be expected to clean up the stack afterwards). If
+ // caller is C then we could potentially use its argument area.
+
+ // FIXME: for now we take the most conservative of these in both cases:
+ // disallow all variadic memory operands.
SmallVector<CCValAssign, 16> ArgLocs;
CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(),
getTargetMachine(), ArgLocs, *DAG.getContext());
- CCAssignFn *AssignFn = CCAssignFnForCall(CalleeCC, /*IsVarArg=*/false);
- CCInfo.AnalyzeCallOperands(Outs, AssignFn);
- if (CCInfo.getNextStackOffset()) {
- // Check if the arguments are already laid out in the right way as
- // the caller's fixed stack objects.
- for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size(); i != e;
- ++i, ++realArgIdx) {
- CCValAssign &VA = ArgLocs[i];
- if (VA.getLocInfo() == CCValAssign::Indirect)
- return false;
- if (VA.needsCustom()) {
- // Just don't handle anything that needs custom adjustments for now.
- // If need be, we can revisit later, but we shouldn't ever end up
- // here.
+
+ CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, true));
+ for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i)
+ if (!ArgLocs[i].isRegLoc())
+ return false;
+ }
+
+ // If the calling conventions do not match, then we'd better make sure the
+ // results are returned in the same way as what the caller expects.
+ if (!CCMatch) {
+ SmallVector<CCValAssign, 16> RVLocs1;
+ CCState CCInfo1(CalleeCC, false, DAG.getMachineFunction(),
+ getTargetMachine(), RVLocs1, *DAG.getContext());
+ CCInfo1.AnalyzeCallResult(Ins, CCAssignFnForCall(CalleeCC, isVarArg));
+
+ SmallVector<CCValAssign, 16> RVLocs2;
+ CCState CCInfo2(CallerCC, false, DAG.getMachineFunction(),
+ getTargetMachine(), RVLocs2, *DAG.getContext());
+ CCInfo2.AnalyzeCallResult(Ins, CCAssignFnForCall(CallerCC, isVarArg));
+
+ if (RVLocs1.size() != RVLocs2.size())
+ return false;
+ for (unsigned i = 0, e = RVLocs1.size(); i != e; ++i) {
+ if (RVLocs1[i].isRegLoc() != RVLocs2[i].isRegLoc())
+ return false;
+ if (RVLocs1[i].getLocInfo() != RVLocs2[i].getLocInfo())
+ return false;
+ if (RVLocs1[i].isRegLoc()) {
+ if (RVLocs1[i].getLocReg() != RVLocs2[i].getLocReg())
return false;
- } else if (!VA.isRegLoc()) {
- // Likewise, don't try to handle stack based arguments for the
- // time being.
+ } else {
+ if (RVLocs1[i].getLocMemOffset() != RVLocs2[i].getLocMemOffset())
return false;
- }
}
}
}
- return true;
+ // Nothing more to check if the callee is taking no arguments
+ if (Outs.empty())
+ return true;
+
+ SmallVector<CCValAssign, 16> ArgLocs;
+ CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(),
+ getTargetMachine(), ArgLocs, *DAG.getContext());
+
+ CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, isVarArg));
+
+ const ARM64FunctionInfo *FuncInfo = MF.getInfo<ARM64FunctionInfo>();
+
+ // If the stack arguments for this call would fit into our own save area then
+ // the call can be made tail.
+ return CCInfo.getNextStackOffset() <= FuncInfo->getBytesInStackArgArea();
+}
+
+SDValue ARM64TargetLowering::addTokenForArgument(SDValue Chain,
+ SelectionDAG &DAG,
+ MachineFrameInfo *MFI,
+ int ClobberedFI) const {
+ SmallVector<SDValue, 8> ArgChains;
+ int64_t FirstByte = MFI->getObjectOffset(ClobberedFI);
+ int64_t LastByte = FirstByte + MFI->getObjectSize(ClobberedFI) - 1;
+
+ // Include the original chain at the beginning of the list. When this is
+ // used by target LowerCall hooks, this helps legalize find the
+ // CALLSEQ_BEGIN node.
+ ArgChains.push_back(Chain);
+
+ // Add a chain value for each stack-argument load that overlaps the clobbered object.
+ for (SDNode::use_iterator U = DAG.getEntryNode().getNode()->use_begin(),
+ UE = DAG.getEntryNode().getNode()->use_end();
+ U != UE; ++U)
+ if (LoadSDNode *L = dyn_cast<LoadSDNode>(*U))
+ if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr()))
+ if (FI->getIndex() < 0) {
+ int64_t InFirstByte = MFI->getObjectOffset(FI->getIndex());
+ int64_t InLastByte = InFirstByte;
+ InLastByte += MFI->getObjectSize(FI->getIndex()) - 1;
+
+ if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
+ (FirstByte <= InFirstByte && InFirstByte <= LastByte))
+ ArgChains.push_back(SDValue(L, 1));
+ }
+
+ // Build a tokenfactor for all the chains.
+ return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains);
+}
+
+bool ARM64TargetLowering::DoesCalleeRestoreStack(CallingConv::ID CallCC,
+ bool TailCallOpt) const {
+ return CallCC == CallingConv::Fast && TailCallOpt;
}
+
+bool ARM64TargetLowering::IsTailCallConvention(CallingConv::ID CallCC) const {
+ return CallCC == CallingConv::Fast;
+}
+
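
The interval test inside addTokenForArgument is a plain closed-interval overlap check. A standalone sketch of the condition it evaluates, with illustrative names:

    // True when the inclusive byte ranges [FirstA, LastA] and [FirstB, LastB]
    // share at least one byte -- the case in which an incoming stack-argument
    // load must be chained in before the outgoing tail-call store clobbers it.
    static bool bytesOverlap(int64_t FirstA, int64_t LastA,
                             int64_t FirstB, int64_t LastB) {
      return (FirstB <= FirstA && FirstA <= LastB) ||
             (FirstA <= FirstB && FirstB <= LastA);
    }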
/// LowerCall - Lower a call to a callseq_start + CALL + callseq_end chain,
/// and add input and output parameter nodes.
SDValue ARM64TargetLowering::LowerCall(CallLoweringInfo &CLI,
@@ -2012,9 +2117,9 @@ SDValue ARM64TargetLowering::LowerCall(CallLoweringInfo &CLI,
bool IsStructRet = (Outs.empty()) ? false : Outs[0].Flags.isSRet();
bool IsThisReturn = false;
- // If tail calls are explicitly disabled, make sure not to use them.
- if (!EnableARM64TailCalls)
- IsTailCall = false;
+ ARM64FunctionInfo *FuncInfo = MF.getInfo<ARM64FunctionInfo>();
+ bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
+ bool IsSibCall = false;
if (IsTailCall) {
// Check if it's really possible to do a tail call.
@@ -2024,9 +2129,12 @@ SDValue ARM64TargetLowering::LowerCall(CallLoweringInfo &CLI,
if (!IsTailCall && CLI.CS && CLI.CS->isMustTailCall())
report_fatal_error("failed to perform tail call elimination on a call "
"site marked musttail");
- // We don't support GuaranteedTailCallOpt, only automatically
- // detected sibcalls.
- // FIXME: Re-evaluate. Is this true? Should it be true?
+
+ // A sibling call is one where we're under the usual C ABI and not planning
+ // to change that but can still do a tail call:
+ if (!TailCallOpt && IsTailCall)
+ IsSibCall = true;
+
if (IsTailCall)
++NumTailCalls;
}
@@ -2082,9 +2190,42 @@ SDValue ARM64TargetLowering::LowerCall(CallLoweringInfo &CLI,
// Get a count of how many bytes are to be pushed on the stack.
unsigned NumBytes = CCInfo.getNextStackOffset();
+ if (IsSibCall) {
+ // Since we're not changing the ABI to make this a tail call, the memory
+ // operands are already available in the caller's incoming argument space.
+ NumBytes = 0;
+ }
+
+ // FPDiff is the byte offset of the call's argument area from the callee's.
+ // Stores to callee stack arguments will be placed in FixedStackSlots offset
+ // by this amount for a tail call. In a sibling call it must be 0 because the
+ // caller will deallocate the entire stack and the callee still expects its
+ // arguments to begin at SP+0. Completely unused for non-tail calls.
+ int FPDiff = 0;
+
+ if (IsTailCall && !IsSibCall) {
+ unsigned NumReusableBytes = FuncInfo->getBytesInStackArgArea();
+
+ // Since callee will pop argument stack as a tail call, we must keep the
+ // popped size 16-byte aligned.
+ NumBytes = RoundUpToAlignment(NumBytes, 16);
+
+ // FPDiff will be negative if this tail call requires more space than we
+ // would automatically have in our incoming argument space. Positive if we
+ // can actually shrink the stack.
+ FPDiff = NumReusableBytes - NumBytes;
+
+ // The stack pointer must be 16-byte aligned at all times it's used for a
+ // memory operation, which in practice means at *all* times and in
+ // particular across call boundaries. Therefore our own arguments started at
+ // a 16-byte aligned SP and the delta applied for the tail call should
+ // satisfy the same constraint.
+ assert(FPDiff % 16 == 0 && "unaligned stack on tail call");
+ }
+
// Adjust the stack pointer for the new arguments...
// These operations are automatically eliminated by the prolog/epilog pass
- if (!IsTailCall)
+ if (!IsSibCall)
Chain =
DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true), DL);
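
Two concrete data points for FPDiff, taken from the tail-call.ll test below (fastcc, -tailcallopt): in @caller_to16_from8 the caller's reusable argument area is 16 bytes and callee_stack16 needs 16, so FPDiff = 16 - 16 = 0 and the outgoing values simply overwrite the incoming slots; in @caller_to8_from24 the reusable area is 32 bytes (24 rounded up to 16-byte alignment) while callee_stack8 needs only 16, so FPDiff = 32 - 16 = 16 and the lone stack argument is stored 16 bytes above the incoming SP before the branch.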
@@ -2134,31 +2275,50 @@ SDValue ARM64TargetLowering::LowerCall(CallLoweringInfo &CLI,
RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
} else {
assert(VA.isMemLoc());
- // There's no reason we can't support stack args w/ tailcall, but
- // we currently don't, so assert if we see one.
- assert(!IsTailCall && "stack argument with tail call!?");
- // FIXME: This works on big-endian for composite byvals, which are the common
- // case. It should also work for fundamental types too.
+ SDValue DstAddr;
+ MachinePointerInfo DstInfo;
+
+ // FIXME: This works on big-endian for composite byvals, which are the
+ // common case. It should also work for fundamental types too.
uint32_t BEAlign = 0;
+ unsigned OpSize = Flags.isByVal() ? Flags.getByValSize() * 8
+ : VA.getLocVT().getSizeInBits();
+ OpSize = (OpSize + 7) / 8;
if (!Subtarget->isLittleEndian() && !Flags.isByVal()) {
- unsigned OpSize = (VA.getLocVT().getSizeInBits() + 7) / 8;
if (OpSize < 8)
BEAlign = 8 - OpSize;
}
-
unsigned LocMemOffset = VA.getLocMemOffset();
- SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset + BEAlign);
+ int32_t Offset = LocMemOffset + BEAlign;
+ SDValue PtrOff = DAG.getIntPtrConstant(Offset);
PtrOff = DAG.getNode(ISD::ADD, DL, getPointerTy(), StackPtr, PtrOff);
+ if (IsTailCall) {
+ Offset = Offset + FPDiff;
+ int FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true);
+
+ DstAddr = DAG.getFrameIndex(FI, getPointerTy());
+ DstInfo = MachinePointerInfo::getFixedStack(FI);
+
+ // Make sure any stack arguments overlapping with where we're storing
+ // are loaded before this eventual operation. Otherwise they'll be
+ // clobbered.
+ Chain = addTokenForArgument(Chain, DAG, MF.getFrameInfo(), FI);
+ } else {
+ SDValue PtrOff = DAG.getIntPtrConstant(Offset);
+
+ DstAddr = DAG.getNode(ISD::ADD, DL, getPointerTy(), StackPtr, PtrOff);
+ DstInfo = MachinePointerInfo::getStack(LocMemOffset);
+ }
+
if (Outs[i].Flags.isByVal()) {
SDValue SizeNode =
DAG.getConstant(Outs[i].Flags.getByValSize(), MVT::i64);
SDValue Cpy = DAG.getMemcpy(
- Chain, DL, PtrOff, Arg, SizeNode, Outs[i].Flags.getByValAlign(),
+ Chain, DL, DstAddr, Arg, SizeNode, Outs[i].Flags.getByValAlign(),
/*isVolatile = */ false,
- /*alwaysInline = */ false,
- MachinePointerInfo::getStack(LocMemOffset), MachinePointerInfo());
+ /*alwaysInline = */ false, DstInfo, MachinePointerInfo());
MemOpChains.push_back(Cpy);
} else {
@@ -2171,9 +2331,8 @@ SDValue ARM64TargetLowering::LowerCall(CallLoweringInfo &CLI,
VA.getLocVT() == MVT::i16))
Arg = DAG.getNode(ISD::TRUNCATE, DL, VA.getLocVT(), Arg);
- SDValue Store = DAG.getStore(Chain, DL, Arg, PtrOff,
- MachinePointerInfo::getStack(LocMemOffset),
- false, false, 0);
+ SDValue Store =
+ DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo, false, false, 0);
MemOpChains.push_back(Store);
}
}
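
The BEAlign handling above is easiest to see with a number attached. A minimal sketch of the padding it computes, assuming the 8-byte stack slots used by the code (the function name is illustrative):

    // Big-endian padding for a small, non-byval stack operand: e.g. an i32
    // (4 bytes) gets a padding of 4, so it lands in the most-significant half
    // of its 8-byte slot. Little-endian targets and byval operands use 0.
    static unsigned bigEndianPadding(unsigned OpSizeInBytes) {
      return OpSizeInBytes < 8 ? 8 - OpSizeInBytes : 0;
    }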
@@ -2221,10 +2380,27 @@ SDValue ARM64TargetLowering::LowerCall(CallLoweringInfo &CLI,
Callee = DAG.getTargetExternalSymbol(Sym, getPointerTy(), 0);
}
+ // We don't usually want to end the call-sequence here because we would tidy
+ // the frame up *after* the call; however, in the ABI-changing tail-call case
+ // we've carefully laid out the parameters so that when sp is reset they'll be
+ // in the correct location.
+ if (IsTailCall && !IsSibCall) {
+ Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true),
+ DAG.getIntPtrConstant(0, true), InFlag, DL);
+ InFlag = Chain.getValue(1);
+ }
+
std::vector<SDValue> Ops;
Ops.push_back(Chain);
Ops.push_back(Callee);
+ if (IsTailCall) {
+ // Each tail call may have to adjust the stack by a different amount, so
+ // this information must travel along with the operation for eventual
+ // consumption by emitEpilogue.
+ Ops.push_back(DAG.getTargetConstant(FPDiff, MVT::i32));
+ }
+
// Add argument registers to the end of the list so that they are known live
// into the call.
for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
@@ -2262,8 +2438,13 @@ SDValue ARM64TargetLowering::LowerCall(CallLoweringInfo &CLI,
Chain = DAG.getNode(ARM64ISD::CALL, DL, NodeTys, Ops);
InFlag = Chain.getValue(1);
+ uint64_t CalleePopBytes = DoesCalleeRestoreStack(CallConv, TailCallOpt)
+ ? RoundUpToAlignment(NumBytes, 16)
+ : 0;
+
Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true),
- DAG.getIntPtrConstant(0, true), InFlag, DL);
+ DAG.getIntPtrConstant(CalleePopBytes, true),
+ InFlag, DL);
if (!Ins.empty())
InFlag = Chain.getValue(1);
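
To trace CalleePopBytes end to end with the fastcc-reserved.ll test below: @foo calls fastcc @will_pop, which takes 8 bytes of stack arguments, so under -tailcallopt CalleePopBytes = RoundUpToAlignment(8, 16) = 16. That 16 rides on the CALLSEQ_END node and reaches eliminateCallFramePseudoInstr as CalleePopAmount; with no reserved call frame the ADJCALLSTACKUP then emits nothing, because the callee has already popped those 16 bytes (the CHECK-NOT lines in that test verify exactly this), while with a reserved call frame the same value instead re-extends the frame by 16 bytes so the reserved area stays allocated.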
@@ -7441,9 +7622,6 @@ bool ARM64TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
// return instructions to help enable tail call optimizations for this
// instruction.
bool ARM64TargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const {
- if (!EnableARM64TailCalls)
- return false;
-
if (!CI->isTailCall())
return false;
diff --git a/lib/Target/ARM64/ARM64ISelLowering.h b/lib/Target/ARM64/ARM64ISelLowering.h
index fd29a51b40e..55792317dba 100644
--- a/lib/Target/ARM64/ARM64ISelLowering.h
+++ b/lib/Target/ARM64/ARM64ISelLowering.h
@@ -353,6 +353,16 @@ private:
const SmallVectorImpl<SDValue> &OutVals,
const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const;
+ /// Finds the incoming stack arguments which overlap the given fixed stack
+ /// object and incorporates their load into the current chain. This prevents
+ /// an upcoming store from clobbering the stack argument before it's used.
+ SDValue addTokenForArgument(SDValue Chain, SelectionDAG &DAG,
+ MachineFrameInfo *MFI, int ClobberedFI) const;
+
+ bool DoesCalleeRestoreStack(CallingConv::ID CallCC, bool TailCallOpt) const;
+
+ bool IsTailCallConvention(CallingConv::ID CallCC) const;
+
void saveVarArgRegisters(CCState &CCInfo, SelectionDAG &DAG, SDLoc DL,
SDValue &Chain) const;
diff --git a/lib/Target/ARM64/ARM64InstrInfo.td b/lib/Target/ARM64/ARM64InstrInfo.td
index bc39fe97b6f..4d5714b8d0c 100644
--- a/lib/Target/ARM64/ARM64InstrInfo.td
+++ b/lib/Target/ARM64/ARM64InstrInfo.td
@@ -88,7 +88,7 @@ def SDT_ARM64binvec : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>,
def SDT_ARM64trivec : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>,
SDTCisSameAs<0,2>,
SDTCisSameAs<0,3>]>;
-def SDT_ARM64TCRET : SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>;
+def SDT_ARM64TCRET : SDTypeProfile<0, 2, [SDTCisPtrTy<0>]>;
def SDT_ARM64PREFETCH : SDTypeProfile<0, 2, [SDTCisVT<0, i32>, SDTCisPtrTy<1>]>;
def SDT_ARM64ITOF : SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisSameAs<0,1>]>;
@@ -5163,12 +5163,15 @@ def : Pat<(i64 (int_arm64_neon_urshl (i64 FPR64:$Rn), (i64 FPR64:$Rm))),
// Tail call return handling. These are all compiler pseudo-instructions,
// so no encoding information or anything like that.
let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [SP] in {
- def TCRETURNdi : Pseudo<(outs), (ins i64imm:$dst), []>;
- def TCRETURNri : Pseudo<(outs), (ins tcGPR64:$dst), []>;
+ def TCRETURNdi : Pseudo<(outs), (ins i64imm:$dst, i32imm:$FPDiff),[]>;
+ def TCRETURNri : Pseudo<(outs), (ins tcGPR64:$dst, i32imm:$FPDiff), []>;
}
-def : Pat<(ARM64tcret tcGPR64:$dst), (TCRETURNri tcGPR64:$dst)>;
-def : Pat<(ARM64tcret (i64 tglobaladdr:$dst)), (TCRETURNdi texternalsym:$dst)>;
-def : Pat<(ARM64tcret (i64 texternalsym:$dst)), (TCRETURNdi texternalsym:$dst)>;
+def : Pat<(ARM64tcret tcGPR64:$dst, (i32 timm:$FPDiff)),
+ (TCRETURNri tcGPR64:$dst, imm:$FPDiff)>;
+def : Pat<(ARM64tcret tglobaladdr:$dst, (i32 timm:$FPDiff)),
+ (TCRETURNdi texternalsym:$dst, imm:$FPDiff)>;
+def : Pat<(ARM64tcret texternalsym:$dst, (i32 timm:$FPDiff)),
+ (TCRETURNdi texternalsym:$dst, imm:$FPDiff)>;
include "ARM64InstrAtomics.td"
diff --git a/lib/Target/ARM64/ARM64MachineFunctionInfo.h b/lib/Target/ARM64/ARM64MachineFunctionInfo.h
index 02bf7cf6fd0..0b6f4f1ec64 100644
--- a/lib/Target/ARM64/ARM64MachineFunctionInfo.h
+++ b/lib/Target/ARM64/ARM64MachineFunctionInfo.h
@@ -25,6 +25,22 @@ namespace llvm {
/// contains private ARM64-specific information for each MachineFunction.
class ARM64FunctionInfo : public MachineFunctionInfo {
+ /// Number of bytes of arguments this function has on the stack. If the callee
+ /// is expected to restore the argument stack this should be a multiple of 16,
+ /// all usable during a tail call.
+ ///
+ /// The alternative would forbid tail call optimisation in some cases: if we
+ /// want to transfer control from a function with 8 bytes of stack-argument
+ /// space to a function with 16 bytes, then misalignment of this value would
+ /// make a stack adjustment necessary, which could not be undone by the
+ /// callee.
+ unsigned BytesInStackArgArea;
+
+ /// The number of bytes to restore to deallocate space for incoming
+ /// arguments. Canonically 0 in the C calling convention, but non-zero when
+ /// callee is expected to pop the args.
+ unsigned ArgumentStackToRestore;
+
/// HasStackFrame - True if this function has a stack frame. Set by
/// processFunctionBeforeCalleeSavedScan().
bool HasStackFrame;
@@ -58,17 +74,25 @@ class ARM64FunctionInfo : public MachineFunctionInfo {
public:
ARM64FunctionInfo()
- : HasStackFrame(false), NumLocalDynamicTLSAccesses(0),
- VarArgsStackIndex(0), VarArgsGPRIndex(0), VarArgsGPRSize(0),
- VarArgsFPRIndex(0), VarArgsFPRSize(0) {}
+ : BytesInStackArgArea(0), ArgumentStackToRestore(0), HasStackFrame(false),
+ NumLocalDynamicTLSAccesses(0), VarArgsStackIndex(0), VarArgsGPRIndex(0),
+ VarArgsGPRSize(0), VarArgsFPRIndex(0), VarArgsFPRSize(0) {}
explicit ARM64FunctionInfo(MachineFunction &MF)
- : HasStackFrame(false), NumLocalDynamicTLSAccesses(0),
- VarArgsStackIndex(0), VarArgsGPRIndex(0), VarArgsGPRSize(0),
- VarArgsFPRIndex(0), VarArgsFPRSize(0) {
+ : BytesInStackArgArea(0), ArgumentStackToRestore(0), HasStackFrame(false),
+ NumLocalDynamicTLSAccesses(0), VarArgsStackIndex(0), VarArgsGPRIndex(0),
+ VarArgsGPRSize(0), VarArgsFPRIndex(0), VarArgsFPRSize(0) {
(void)MF;
}
+ unsigned getBytesInStackArgArea() const { return BytesInStackArgArea; }
+ void setBytesInStackArgArea(unsigned bytes) { BytesInStackArgArea = bytes; }
+
+ unsigned getArgumentStackToRestore() const { return ArgumentStackToRestore; }
+ void setArgumentStackToRestore(unsigned bytes) {
+ ArgumentStackToRestore = bytes;
+ }
+
bool hasStackFrame() const { return HasStackFrame; }
void setHasStackFrame(bool s) { HasStackFrame = s; }
diff --git a/test/CodeGen/AArch64/fastcc-reserved.ll b/test/CodeGen/AArch64/fastcc-reserved.ll
index c6c050570dd..97410aa5026 100644
--- a/test/CodeGen/AArch64/fastcc-reserved.ll
+++ b/test/CodeGen/AArch64/fastcc-reserved.ll
@@ -1,4 +1,5 @@
; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -tailcallopt | FileCheck %s
+; RUN: llc -verify-machineinstrs < %s -mtriple=arm64-none-linux-gnu -tailcallopt | FileCheck %s --check-prefix=CHECK-ARM64
; This test is designed to be run in the situation where the
; call-frame is not reserved (hence disable-fp-elim), but where
@@ -14,20 +15,28 @@ define fastcc void @foo(i32 %in) {
; Normal frame setup stuff:
; CHECK: sub sp, sp,
; CHECK: stp x29, x30
+; CHECK-ARM64: stp x29, x30, [sp, #-16]!
+; CHECK-ARM64: mov x29, sp
; Reserve space for call-frame:
; CHECK: sub sp, sp, #16
+; CHECK-ARM64: sub sp, sp, #16
call fastcc void @will_pop([8 x i32] undef, i32 42)
; CHECK: bl will_pop
+; CHECK-ARM64: bl will_pop
; Since @will_pop is fastcc with tailcallopt, it will put the stack
; back where it needs to be, so we shouldn't duplicate that
; CHECK-NOT: sub sp, sp, #16
; CHECK-NOT: add sp, sp,
+; CHECK-ARM64-NOT: sub sp, sp, #16
+; CHECK-ARM64-NOT: add sp, sp,
; CHECK: ldp x29, x30
; CHECK: add sp, sp,
+; CHECK-ARM64: mov sp, x29
+; CHECK-ARM64: ldp x29, x30, [sp], #16
ret void
}
@@ -40,19 +49,26 @@ define void @foo1(i32 %in) {
; Normal frame setup again
; CHECK: sub sp, sp,
; CHECK: stp x29, x30
+; CHECK-ARM64: stp x29, x30, [sp, #-16]!
+; CHECK-ARM64: mov x29, sp
; Reserve space for call-frame
; CHECK: sub sp, sp, #16
+; CHECK-ARM64: sub sp, sp, #16
call void @wont_pop([8 x i32] undef, i32 42)
; CHECK: bl wont_pop
+; CHECK-ARM64: bl wont_pop
; This time we *do* need to unreserve the call-frame
; CHECK: add sp, sp, #16
+; CHECK-ARM64: add sp, sp, #16
; Check for epilogue (primarily to make sure sp spotted above wasn't
; part of it).
; CHECK: ldp x29, x30
; CHECK: add sp, sp,
+; CHECK-ARM64: mov sp, x29
+; CHECK-ARM64: ldp x29, x30, [sp], #16
ret void
}
diff --git a/test/CodeGen/AArch64/fastcc.ll b/test/CodeGen/AArch64/fastcc.ll
index a4cd37858ee..fb9b4ac57ed 100644
--- a/test/CodeGen/AArch64/fastcc.ll
+++ b/test/CodeGen/AArch64/fastcc.ll
@@ -1,5 +1,7 @@
; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -tailcallopt | FileCheck %s -check-prefix CHECK-TAIL
+; RUN: llc -verify-machineinstrs < %s -mtriple=arm64-none-linux-gnu -tailcallopt | FileCheck %s -check-prefix CHECK-ARM64-TAIL
; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s
+; RUN: llc -verify-machineinstrs < %s -mtriple=arm64-none-linux-gnu | FileCheck --check-prefix=CHECK-ARM64 %s
; Without tailcallopt fastcc still means the caller cleans up the
; stack, so try to make sure this is respected.
@@ -8,116 +10,219 @@ define fastcc void @func_stack0() {
; CHECK-LABEL: func_stack0:
; CHECK: sub sp, sp, #48
+; CHECK-ARM64-LABEL: func_stack0:
+; CHECK-ARM64: stp x29, x30, [sp, #-16]!
+; CHECK-ARM64-NEXT: mov x29, sp
+; CHECK-ARM64-NEXT: sub sp, sp, #32
+
; CHECK-TAIL-LABEL: func_stack0:
; CHECK-TAIL: sub sp, sp, #48
+; CHECK-ARM64-TAIL-LABEL: func_stack0:
+; CHECK-ARM64-TAIL: stp x29, x30, [sp, #-16]!
+; CHECK-ARM64-TAIL-NEXT: mov x29, sp
+; CHECK-ARM64-TAIL-NEXT: sub sp, sp, #32
+
call fastcc void @func_stack8([8 x i32] undef, i32 42)
; CHECK: bl func_stack8
; CHECK-NOT: sub sp, sp,
+; CHECK-ARM64: bl func_stack8
+; CHECK-ARM64-NOT: sub sp, sp,
+
; CHECK-TAIL: bl func_stack8
; CHECK-TAIL: sub sp, sp, #16
+; CHECK-ARM64-TAIL: bl func_stack8
+; CHECK-ARM64-TAIL: sub sp, sp, #16
+
call fastcc void @func_stack32([8 x i32] undef, i128 0, i128 9)
; CHECK: bl func_stack32
; CHECK-NOT: sub sp, sp,
+; CHECK-ARM64: bl func_stack32
+; CHECK-ARM64-NOT: sub sp, sp,
+
; CHECK-TAIL: bl func_stack32
; CHECK-TAIL: sub sp, sp, #32
+; CHECK-ARM64-TAIL: bl func_stack32
+; CHECK-ARM64-TAIL: sub sp, sp, #32
+
call fastcc void @func_stack0()
; CHECK: bl func_stack0
; CHECK-NOT: sub sp, sp
+; CHECK-ARM64: bl func_stack0
+; CHECK-ARM64-NOT: sub sp, sp
+
; CHECK-TAIL: bl func_stack0
; CHECK-TAIL-NOT: sub sp, sp
+; CHECK-ARM64-TAIL: bl func_stack0
+; CHECK-ARM64-TAIL-NOT: sub sp, sp
+
ret void
; CHECK: add sp, sp, #48
; CHECK-NEXT: ret
+; CHECK-ARM64: mov sp, x29
+; CHECK-ARM64-NEXT: ldp x29, x30, [sp], #16
+; CHECK-ARM64-NEXT: ret
+
; CHECK-TAIL: add sp, sp, #48
; CHECK-TAIL-NEXT: ret
+; CHECK-ARM64-TAIL: mov sp, x29
+; CHECK-ARM64-TAIL-NEXT: ldp x29, x30, [sp], #16
+; CHECK-ARM64-TAIL-NEXT: ret
}
define fastcc void @func_stack8([8 x i32], i32 %stacked) {
; CHECK-LABEL: func_stack8:
; CHECK: sub sp, sp, #48
+; CHECK-ARM64-LABEL: func_stack8:
+; CHECK-ARM64: stp x29, x30, [sp, #-16]!
+; CHECK-ARM64: mov x29, sp
+; CHECK-ARM64: sub sp, sp, #32
+
; CHECK-TAIL-LABEL: func_stack8:
; CHECK-TAIL: sub sp, sp, #48
+; CHECK-ARM64-TAIL-LABEL: func_stack8:
+; CHECK-ARM64-TAIL: stp x29, x30, [sp, #-16]!
+; CHECK-ARM64-TAIL: mov x29, sp
+; CHECK-ARM64-TAIL: sub sp, sp, #32
+
call fastcc void @func_stack8([8 x i32] undef, i32 42)
; CHECK: bl func_stack8
; CHECK-NOT: sub sp, sp,
+; CHECK-ARM64: bl func_stack8
+; CHECK-ARM64-NOT: sub sp, sp,
+
; CHECK-TAIL: bl func_stack8
; CHECK-TAIL: sub sp, sp, #16
+; CHECK-ARM64-TAIL: bl func_stack8
+; CHECK-ARM64-TAIL: sub sp, sp, #16
+
call fastcc void @func_stack32([8 x i32] undef, i128 0, i128 9)
; CHECK: bl func_stack32
; CHECK-NOT: sub sp, sp,
+; CHECK-ARM64: bl func_stack32
+; CHECK-ARM64-NOT: sub sp, sp,
+
; CHECK-TAIL: bl func_stack32
; CHECK-TAIL: sub sp, sp, #32
+; CHECK-ARM64-TAIL: bl func_stack32
+; CHECK-ARM64-TAIL: sub sp, sp, #32
+
call fastcc void @func_stack0()
; CHECK: bl func_stack0
; CHECK-NOT: sub sp, sp
+; CHECK-ARM64: bl func_stack0
+; CHECK-ARM64-NOT: sub sp, sp
+
; CHECK-TAIL: bl func_stack0
; CHECK-TAIL-NOT: sub sp, sp
+; CHECK-ARM64-TAIL: bl func_stack0
+; CHECK-ARM64-TAIL-NOT: sub sp, sp
+
ret void
; CHECK: add sp, sp, #48
; CHECK-NEXT: ret
+; CHECK-ARM64: mov sp, x29
+; CHECK-ARM64-NEXT: ldp x29, x30, [sp], #16
+; CHECK-ARM64-NEXT: ret
+
; CHECK-TAIL: add sp, sp, #64
; CHECK-TAIL-NEXT: ret
+
+; CHECK-ARM64-TAIL: mov sp, x29
+; CHECK-ARM64-TAIL-NEXT: ldp x29, x30, [sp], #16
+; CHECK-ARM64-TAIL-NEXT: ret
}
define fastcc void @func_stack32([8 x i32], i128 %stacked0, i128 %stacked1) {
; CHECK-LABEL: func_stack32:
; CHECK: sub sp, sp, #48
+; CHECK-ARM64-LABEL: func_stack32:
+; CHECK-ARM64: mov x29, sp
+
; CHECK-TAIL-LABEL: func_stack32:
; CHECK-TAIL: sub sp, sp, #48
+; CHECK-ARM64-TAIL-LABEL: func_stack32:
+; CHECK-ARM64-TAIL: mov x29, sp
+
call fastcc void @func_stack8([8 x i32] undef, i32 42)
; CHECK: bl func_stack8
; CHECK-NOT: sub sp, sp,
+; CHECK-ARM64: bl func_stack8
+; CHECK-ARM64-NOT: sub sp, sp,
+
; CHECK-TAIL: bl func_stack8
; CHECK-TAIL: sub sp, sp, #16
+; CHECK-ARM64-TAIL: bl func_stack8
+; CHECK-ARM64-TAIL: sub sp, sp, #16
+
call fastcc void @func_stack32([8 x i32] undef, i128 0, i128 9)
; CHECK: bl func_stack32
; CHECK-NOT: sub sp, sp,
+; CHECK-ARM64: bl func_stack32
+; CHECK-ARM64-NOT: sub sp, sp,
+
; CHECK-TAIL: bl func_stack32
; CHECK-TAIL: sub sp, sp, #32
+; CHECK-ARM64-TAIL: bl func_stack32
+; CHECK-ARM64-TAIL: sub sp, sp, #32
+
call fastcc void @func_stack0()
; CHECK: bl func_stack0
; CHECK-NOT: sub sp, sp
+; CHECK-ARM64: bl func_stack0
+; CHECK-ARM64-NOT: sub sp, sp
+
; CHECK-TAIL: bl func_stack0
; CHECK-TAIL-NOT: sub sp, sp
+; CHECK-ARM64-TAIL: bl func_stack0
+; CHECK-ARM64-TAIL-NOT: sub sp, sp
+
ret void
; CHECK: add sp, sp, #48
; CHECK-NEXT: ret
+; CHECK-ARM64: mov sp, x29
+; CHECK-ARM64-NEXT: ldp x29, x30, [sp], #16
+; CHECK-ARM64-NEXT: ret
+
; CHECK-TAIL: add sp, sp, #80
; CHECK-TAIL-NEXT: ret
+
+; CHECK-ARM64-TAIL: mov sp, x29
+; CHECK-ARM64-TAIL-NEXT: ldp x29, x30, [sp], #16
+; CHECK-ARM64-TAIL-NEXT: ret
}
diff --git a/test/CodeGen/AArch64/tail-call.ll b/test/CodeGen/AArch64/tail-call.ll
index 81885f10851..da05848dcc5 100644
--- a/test/CodeGen/AArch64/tail-call.ll
+++ b/test/CodeGen/AArch64/tail-call.ll
@@ -1,4 +1,5 @@
; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -tailcallopt | FileCheck %s
+; RUN: llc -verify-machineinstrs < %s -mtriple=arm64-none-linux-gnu -tailcallopt | FileCheck --check-prefix=CHECK-ARM64 %s
declare fastcc void @callee_stack0()
declare fastcc void @callee_stack8([8 x i32], i64)
@@ -7,57 +8,92 @@ declare fastcc void @callee_stack16([8 x i32], i64, i64)
define fastcc void @caller_to0_from0() nounwind {
; CHECK-LABEL: caller_to0_from0:
; CHECK-NEXT: // BB
+
+; CHECK-ARM64-LABEL: caller_to0_from0:
+; CHECK-ARM64-NEXT: // BB
+
tail call fastcc void @callee_stack0()
ret void
+
; CHECK-NEXT: b callee_stack0
+
+; CHECK-ARM64-NEXT: b callee_stack0
}
define fastcc void @caller_to0_from8([8 x i32], i64) {
; CHECK-LABEL: caller_to0_from8:
+; CHECK-ARM64-LABEL: caller_to0_from8:
+
tail call fastcc void @callee_stack0()
ret void
+
; CHECK: add sp, sp, #16
; CHECK-NEXT: b callee_stack0
+
+; CHECK-ARM64: add sp, sp, #16
+; CHECK-ARM64-NEXT: b callee_stack0
}
define fastcc void @caller_to8_from0() {
; CHECK-LABEL: caller_to8_from0:
; CHECK: sub sp, sp, #32
+; CHECK-ARM64-LABEL: caller_to8_from0:
+; CHECK-ARM64: sub sp, sp, #32
+
; Key point is that the "42" should go #16 below incoming stack
; pointer (we didn't have arg space to reuse).
tail call fastcc void @callee_stack8([8 x i32] undef, i64 42)
ret void
+
; CHECK: str {{x[0-9]+}}, [sp, #16]
; CHECK-NEXT: add sp, sp, #16
; CHECK-NEXT: b callee_stack8
+
+; CHECK-ARM64: str {{x[0-9]+}}, [sp, #16]!
+; CHECK-ARM64-NEXT: b callee_stack8
}
define fastcc void @caller_to8_from8([8 x i32], i64 %a) {
; CHECK-LABEL: caller_to8_from8:
; CHECK: sub sp, sp, #16
+; CHECK-ARM64-LABEL: caller_to8_from8:
+; CHECK-ARM64: sub sp, sp, #16
+
; Key point is that the "%a" should go where at SP on entry.
tail call fastcc void @callee_stack8([8 x i32] undef, i64 42)
ret void
+
; CHECK: str {{x[0-9]+}}, [sp, #16]
; CHECK-NEXT: add sp, sp, #16
; CHECK-NEXT: b callee_stack8
+
+; CHECK-ARM64: str {{x[0-9]+}}, [sp, #16]!
+; CHECK-ARM64-NEXT: b callee_stack8
}
define fastcc void @caller_to16_from8([8 x i32], i64 %a) {
; CHECK-LABEL: caller_to16_from8:
; CHECK: sub sp, sp, #16
+; CHECK-ARM64-LABEL: caller_to16_from8:
+; CHECK-ARM64: sub sp, sp, #16
+
; Important point is that the call reuses the "dead" argument space
; above %a on the stack. If it tries to go below incoming-SP then the
; callee will not deallocate the space, even in fastcc.
tail call fastcc void @callee_stack16([8 x i32] undef, i64 42, i64 2)
+
; CHECK: str {{x[0-9]+}}, [sp, #24]
; CHECK: str {{x[0-9]+}}, [sp, #16]
-; CHECK: add sp, sp, #16
-; CHECK: b callee_stack16
+; CHECK-NEXT: add sp, sp, #16
+; CHECK-NEXT: b callee_stack16
+
+; CHECK-ARM64: stp {{x[0-9]+}}, {{x[0-9]+}}, [sp, #16]
+; CHECK-ARM64-NEXT: add sp, sp, #16
+; CHECK-ARM64-NEXT: b callee_stack16
ret void
}
@@ -66,12 +102,19 @@ define fastcc void @caller_to8_from24([8 x i32], i64 %a, i64 %b, i64 %c) {
; CHECK-LABEL: caller_to8_from24:
; CHECK: sub sp, sp, #16
+; CHECK-ARM64-LABEL: caller_to8_from24:
+; CHECK-ARM64: sub sp, sp, #16
+
; Key point is that the "%a" should go where at #16 above SP on entry.
tail call fastcc void @callee_stack8([8 x i32] undef, i64 42)
ret void
+
; CHECK: str {{x[0-9]+}}, [sp, #32]
; CHECK-NEXT: add sp, sp, #32
; CHECK-NEXT: b callee_stack8
+
+; CHECK-ARM64: str {{x[0-9]+}}, [sp, #32]!
+; CHECK-ARM64-NEXT: b callee_stack8
}
@@ -79,6 +122,9 @@ define fastcc void @caller_to16_from16([8 x i32], i64 %a, i64 %b) {
; CHECK-LABEL: caller_to16_from16:
; CHECK: sub sp, sp, #16
+; CHECK-ARM64-LABEL: caller_to16_from16:
+; CHECK-ARM64: sub sp, sp, #16
+
; Here we want to make sure that both loads happen before the stores:
; otherwise either %a or %b will be wrongly clobbered.
tail call fastcc void @callee_stack16([8 x i32] undef, i64 %b, i64 %a)
@@ -89,6 +135,11 @@ define fastcc void @caller_to16_from16([8 x i32], i64 %a, i64 %b) {
; CHECK: str x1,
; CHECK: str x0,
-; CHECK: add sp, sp, #16
-; CHECK: b callee_stack16
+; CHECK-NEXT: add sp, sp, #16
+; CHECK-NEXT: b callee_stack16
+
+; CHECK-ARM64: ldp {{x[0-9]+}}, {{x[0-9]+}}, [sp, #16]
+; CHECK-ARM64: stp {{x[0-9]+}}, {{x[0-9]+}}, [sp, #16]
+; CHECK-ARM64-NEXT: add sp, sp, #16
+; CHECK-ARM64-NEXT: b callee_stack16
}