summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--lib/Target/X86/X86ISelLowering.cpp95
-rw-r--r--test/CodeGen/X86/musttail-indirect.ll124
-rw-r--r--test/CodeGen/X86/musttail-thiscall.ll31
-rw-r--r--test/CodeGen/X86/musttail.ll75
4 files changed, 276 insertions, 49 deletions
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index fb941b8d183..0808eff7147 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -2498,10 +2498,10 @@ X86TargetLowering::EmitTailCallLoadRetAddr(SelectionDAG &DAG,
/// EmitTailCallStoreRetAddr - Emit a store of the return address if tail call
/// optimization is performed and it is required (FPDiff!=0).
-static SDValue
-EmitTailCallStoreRetAddr(SelectionDAG & DAG, MachineFunction &MF,
- SDValue Chain, SDValue RetAddrFrIdx, EVT PtrVT,
- unsigned SlotSize, int FPDiff, SDLoc dl) {
+static SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF,
+ SDValue Chain, SDValue RetAddrFrIdx,
+ EVT PtrVT, unsigned SlotSize,
+ int FPDiff, SDLoc dl) {
// Store the return address to the appropriate stack slot.
if (!FPDiff) return Chain;
// Calculate the new stack slot for the return address.
@@ -2538,17 +2538,19 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
if (MF.getTarget().Options.DisableTailCalls)
isTailCall = false;
- if (isTailCall) {
+ bool IsMustTail = CLI.CS && CLI.CS->isMustTailCall();
+ if (IsMustTail) {
+ // Force this to be a tail call. The verifier rules are enough to ensure
+ // that we can lower this successfully without moving the return address
+ // around.
+ isTailCall = true;
+ } else if (isTailCall) {
// Check if it's really possible to do a tail call.
isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
isVarArg, SR != NotStructReturn,
MF.getFunction()->hasStructRetAttr(), CLI.RetTy,
Outs, OutVals, Ins, DAG);
- if (!isTailCall && CLI.CS && CLI.CS->isMustTailCall())
- report_fatal_error("failed to perform tail call elimination on a call "
- "site marked musttail");
-
// Sibcalls are automatically detected tailcalls which do not require
// ABI changes.
if (!MF.getTarget().Options.GuaranteedTailCallOpt && isTailCall)
@@ -2583,7 +2585,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
int FPDiff = 0;
- if (isTailCall && !IsSibcall) {
+ if (isTailCall && !IsSibcall && !IsMustTail) {
// Lower arguments at fp - stackoffset + fpdiff.
X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>();
unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn();
@@ -2746,8 +2748,10 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
DAG.getConstant(NumXMMRegs, MVT::i8)));
}
- // For tail calls lower the arguments to the 'real' stack slot.
- if (isTailCall) {
+ // For tail calls lower the arguments to the 'real' stack slots. Sibcalls
+ // don't need this because the eligibility check rejects calls that require
+ // shuffling arguments passed in memory.
+ if (!IsSibcall && isTailCall) {
// Force all the incoming stack arguments to be loaded from the stack
// before any new outgoing arguments are stored to the stack, because the
// outgoing stack slots may alias the incoming argument stack slots, and
@@ -2759,39 +2763,40 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
SmallVector<SDValue, 8> MemOpChains2;
SDValue FIN;
int FI = 0;
- if (getTargetMachine().Options.GuaranteedTailCallOpt) {
- for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
- CCValAssign &VA = ArgLocs[i];
- if (VA.isRegLoc())
- continue;
- assert(VA.isMemLoc());
- SDValue Arg = OutVals[i];
- ISD::ArgFlagsTy Flags = Outs[i].Flags;
- // Create frame index.
- int32_t Offset = VA.getLocMemOffset()+FPDiff;
- uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
- FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true);
- FIN = DAG.getFrameIndex(FI, getPointerTy());
-
- if (Flags.isByVal()) {
- // Copy relative to framepointer.
- SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset());
- if (!StackPtr.getNode())
- StackPtr = DAG.getCopyFromReg(Chain, dl,
- RegInfo->getStackRegister(),
- getPointerTy());
- Source = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, Source);
-
- MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
- ArgChain,
- Flags, DAG, dl));
- } else {
- // Store relative to framepointer.
- MemOpChains2.push_back(
- DAG.getStore(ArgChain, dl, Arg, FIN,
- MachinePointerInfo::getFixedStack(FI),
- false, false, 0));
- }
+ for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+ CCValAssign &VA = ArgLocs[i];
+ if (VA.isRegLoc())
+ continue;
+ assert(VA.isMemLoc());
+ SDValue Arg = OutVals[i];
+ ISD::ArgFlagsTy Flags = Outs[i].Flags;
+ // Skip inalloca arguments. They don't require any work.
+ if (Flags.isInAlloca())
+ continue;
+ // Create frame index.
+ int32_t Offset = VA.getLocMemOffset()+FPDiff;
+ uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
+ FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true);
+ FIN = DAG.getFrameIndex(FI, getPointerTy());
+
+ if (Flags.isByVal()) {
+ // Copy relative to framepointer.
+ SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset());
+ if (!StackPtr.getNode())
+ StackPtr = DAG.getCopyFromReg(Chain, dl,
+ RegInfo->getStackRegister(),
+ getPointerTy());
+ Source = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, Source);
+
+ MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
+ ArgChain,
+ Flags, DAG, dl));
+ } else {
+ // Store relative to framepointer.
+ MemOpChains2.push_back(
+ DAG.getStore(ArgChain, dl, Arg, FIN,
+ MachinePointerInfo::getFixedStack(FI),
+ false, false, 0));
}
}
diff --git a/test/CodeGen/X86/musttail-indirect.ll b/test/CodeGen/X86/musttail-indirect.ll
new file mode 100644
index 00000000000..9d21b5ea5d5
--- /dev/null
+++ b/test/CodeGen/X86/musttail-indirect.ll
@@ -0,0 +1,124 @@
+; RUN: llc < %s -mtriple=i686-win32 | FileCheck %s
+; RUN: llc < %s -mtriple=i686-win32 -O0 | FileCheck %s
+
+; IR simplified from the following C++ snippet compiled for i686-windows-msvc:
+
+; struct A { A(); ~A(); int a; };
+;
+; struct B {
+; virtual int f(int);
+; virtual int g(A, int, A);
+; virtual void h(A, int, A);
+; virtual A i(A, int, A);
+; virtual A j(int);
+; };
+;
+; int (B::*mp_f)(int) = &B::f;
+; int (B::*mp_g)(A, int, A) = &B::g;
+; void (B::*mp_h)(A, int, A) = &B::h;
+; A (B::*mp_i)(A, int, A) = &B::i;
+; A (B::*mp_j)(int) = &B::j;
+
+; Each member pointer creates a thunk. The ones with inalloca are required to
+; be tail calls by the ABI, even at O0.
+
+%struct.B = type { i32 (...)** }
+%struct.A = type { i32 }
+
+; CHECK-LABEL: f_thunk:
+; CHECK: jmpl
+; CHECK-NOT: ret
+define x86_thiscallcc i32 @f_thunk(%struct.B* %this, i32) {
+entry:
+ %1 = bitcast %struct.B* %this to i32 (%struct.B*, i32)***
+ %vtable = load i32 (%struct.B*, i32)*** %1
+ %2 = load i32 (%struct.B*, i32)** %vtable
+ %3 = musttail call x86_thiscallcc i32 %2(%struct.B* %this, i32 %0)
+ ret i32 %3
+}
+
+; Inalloca thunks shouldn't require any stores to the stack.
+; CHECK-LABEL: g_thunk:
+; CHECK-NOT: mov %{{.*}}, {{.*(.*esp.*)}}
+; CHECK: jmpl
+; CHECK-NOT: ret
+define x86_thiscallcc i32 @g_thunk(%struct.B* %this, <{ %struct.A, i32, %struct.A }>* inalloca) {
+entry:
+ %1 = bitcast %struct.B* %this to i32 (%struct.B*, <{ %struct.A, i32, %struct.A }>*)***
+ %vtable = load i32 (%struct.B*, <{ %struct.A, i32, %struct.A }>*)*** %1
+ %vfn = getelementptr inbounds i32 (%struct.B*, <{ %struct.A, i32, %struct.A }>*)** %vtable, i32 1
+ %2 = load i32 (%struct.B*, <{ %struct.A, i32, %struct.A }>*)** %vfn
+ %3 = musttail call x86_thiscallcc i32 %2(%struct.B* %this, <{ %struct.A, i32, %struct.A }>* inalloca %0)
+ ret i32 %3
+}
+
+; CHECK-LABEL: h_thunk:
+; CHECK: jmpl
+; CHECK-NOT: mov %{{.*}}, {{.*(.*esp.*)}}
+; CHECK-NOT: ret
+define x86_thiscallcc void @h_thunk(%struct.B* %this, <{ %struct.A, i32, %struct.A }>* inalloca) {
+entry:
+ %1 = bitcast %struct.B* %this to void (%struct.B*, <{ %struct.A, i32, %struct.A }>*)***
+ %vtable = load void (%struct.B*, <{ %struct.A, i32, %struct.A }>*)*** %1
+ %vfn = getelementptr inbounds void (%struct.B*, <{ %struct.A, i32, %struct.A }>*)** %vtable, i32 2
+ %2 = load void (%struct.B*, <{ %struct.A, i32, %struct.A }>*)** %vfn
+ musttail call x86_thiscallcc void %2(%struct.B* %this, <{ %struct.A, i32, %struct.A }>* inalloca %0)
+ ret void
+}
+
+; CHECK-LABEL: i_thunk:
+; CHECK-NOT: mov %{{.*}}, {{.*(.*esp.*)}}
+; CHECK: jmpl
+; CHECK-NOT: ret
+define x86_thiscallcc %struct.A* @i_thunk(%struct.B* %this, <{ %struct.A*, %struct.A, i32, %struct.A }>* inalloca) {
+entry:
+ %1 = bitcast %struct.B* %this to %struct.A* (%struct.B*, <{ %struct.A*, %struct.A, i32, %struct.A }>*)***
+ %vtable = load %struct.A* (%struct.B*, <{ %struct.A*, %struct.A, i32, %struct.A }>*)*** %1
+ %vfn = getelementptr inbounds %struct.A* (%struct.B*, <{ %struct.A*, %struct.A, i32, %struct.A }>*)** %vtable, i32 3
+ %2 = load %struct.A* (%struct.B*, <{ %struct.A*, %struct.A, i32, %struct.A }>*)** %vfn
+ %3 = musttail call x86_thiscallcc %struct.A* %2(%struct.B* %this, <{ %struct.A*, %struct.A, i32, %struct.A }>* inalloca %0)
+ ret %struct.A* %3
+}
+
+; CHECK-LABEL: j_thunk:
+; CHECK: jmpl
+; CHECK-NOT: ret
+define x86_thiscallcc void @j_thunk(%struct.A* noalias sret %agg.result, %struct.B* %this, i32) {
+entry:
+ %1 = bitcast %struct.B* %this to void (%struct.A*, %struct.B*, i32)***
+ %vtable = load void (%struct.A*, %struct.B*, i32)*** %1
+ %vfn = getelementptr inbounds void (%struct.A*, %struct.B*, i32)** %vtable, i32 4
+ %2 = load void (%struct.A*, %struct.B*, i32)** %vfn
+ musttail call x86_thiscallcc void %2(%struct.A* sret %agg.result, %struct.B* %this, i32 %0)
+ ret void
+}
+
+; CHECK-LABEL: _stdcall_thunk@8:
+; CHECK-NOT: mov %{{.*}}, {{.*(.*esp.*)}}
+; CHECK: jmpl
+; CHECK-NOT: ret
+define x86_stdcallcc i32 @stdcall_thunk(<{ %struct.B*, %struct.A }>* inalloca) {
+entry:
+ %this_ptr = getelementptr inbounds <{ %struct.B*, %struct.A }>* %0, i32 0, i32 0
+ %this = load %struct.B** %this_ptr
+ %1 = bitcast %struct.B* %this to i32 (<{ %struct.B*, %struct.A }>*)***
+ %vtable = load i32 (<{ %struct.B*, %struct.A }>*)*** %1
+ %vfn = getelementptr inbounds i32 (<{ %struct.B*, %struct.A }>*)** %vtable, i32 1
+ %2 = load i32 (<{ %struct.B*, %struct.A }>*)** %vfn
+ %3 = musttail call x86_stdcallcc i32 %2(<{ %struct.B*, %struct.A }>* inalloca %0)
+ ret i32 %3
+}
+
+; CHECK-LABEL: @fastcall_thunk@8:
+; CHECK-NOT: mov %{{.*}}, {{.*(.*esp.*)}}
+; CHECK: jmpl
+; CHECK-NOT: ret
+define x86_fastcallcc i32 @fastcall_thunk(%struct.B* inreg %this, <{ %struct.A }>* inalloca) {
+entry:
+ %1 = bitcast %struct.B* %this to i32 (%struct.B*, <{ %struct.A }>*)***
+ %vtable = load i32 (%struct.B*, <{ %struct.A }>*)*** %1
+ %vfn = getelementptr inbounds i32 (%struct.B*, <{ %struct.A }>*)** %vtable, i32 1
+ %2 = load i32 (%struct.B*, <{ %struct.A }>*)** %vfn
+ %3 = musttail call x86_fastcallcc i32 %2(%struct.B* inreg %this, <{ %struct.A }>* inalloca %0)
+ ret i32 %3
+}
diff --git a/test/CodeGen/X86/musttail-thiscall.ll b/test/CodeGen/X86/musttail-thiscall.ll
new file mode 100644
index 00000000000..8ea12482e50
--- /dev/null
+++ b/test/CodeGen/X86/musttail-thiscall.ll
@@ -0,0 +1,31 @@
+; RUN: llc -march=x86 < %s | FileCheck %s
+; RUN: llc -march=x86 -O0 < %s | FileCheck %s
+
+; CHECK-LABEL: t1:
+; CHECK: jmp {{_?}}t1_callee
+define x86_thiscallcc void @t1(i8* %this) {
+ %adj = getelementptr i8* %this, i32 4
+ musttail call x86_thiscallcc void @t1_callee(i8* %adj)
+ ret void
+}
+declare x86_thiscallcc void @t1_callee(i8* %this)
+
+; CHECK-LABEL: t2:
+; CHECK: jmp {{_?}}t2_callee
+define x86_thiscallcc i32 @t2(i8* %this, i32 %a) {
+ %adj = getelementptr i8* %this, i32 4
+ %rv = musttail call x86_thiscallcc i32 @t2_callee(i8* %adj, i32 %a)
+ ret i32 %rv
+}
+declare x86_thiscallcc i32 @t2_callee(i8* %this, i32 %a)
+
+; CHECK-LABEL: t3:
+; CHECK: jmp {{_?}}t3_callee
+define x86_thiscallcc i8* @t3(i8* %this, <{ i8*, i32 }>* inalloca %args) {
+ %adj = getelementptr i8* %this, i32 4
+ %a_ptr = getelementptr <{ i8*, i32 }>* %args, i32 0, i32 1
+ store i32 0, i32* %a_ptr
+ %rv = musttail call x86_thiscallcc i8* @t3_callee(i8* %adj, <{ i8*, i32 }>* inalloca %args)
+ ret i8* %rv
+}
+declare x86_thiscallcc i8* @t3_callee(i8* %this, <{ i8*, i32 }>* inalloca %args);
diff --git a/test/CodeGen/X86/musttail.ll b/test/CodeGen/X86/musttail.ll
index 75b217f9947..ca5d3119cf1 100644
--- a/test/CodeGen/X86/musttail.ll
+++ b/test/CodeGen/X86/musttail.ll
@@ -1,8 +1,6 @@
; RUN: llc -march=x86 < %s | FileCheck %s
-
-; FIXME: Eliminate this tail call at -O0, since musttail is a correctness
-; requirement.
-; RUN: not llc -march=x86 -O0 < %s
+; RUN: llc -march=x86 -O0 < %s | FileCheck %s
+; RUN: llc -march=x86 -disable-tail-calls < %s | FileCheck %s
declare void @t1_callee(i8*)
define void @t1(i32* %a) {
@@ -21,3 +19,72 @@ define i32* @t2() {
%w = bitcast i8* %v to i32*
ret i32* %w
}
+
+; Complex frame layout: stack realignment with dynamic alloca.
+define void @t3(i32 %n) alignstack(32) nounwind {
+entry:
+; CHECK: t3:
+; CHECK: pushl %ebp
+; CHECK: pushl %esi
+; CHECK: andl $-32, %esp
+; CHECK: movl %esp, %esi
+; CHECK: popl %esi
+; CHECK: popl %ebp
+; CHECK-NEXT: jmp {{_?}}t3_callee
+ %a = alloca i8, i32 %n
+ call void @capture(i8* %a)
+ musttail call void @t3_callee(i32 %n) nounwind
+ ret void
+}
+
+declare void @capture(i8*)
+declare void @t3_callee(i32)
+
+; Test that we actually copy in and out stack arguments that aren't forwarded
+; without modification.
+define i32 @t4({}* %fn, i32 %n, i32 %r) {
+; CHECK-LABEL: t4:
+; CHECK: incl %[[r:.*]]
+; CHECK: decl %[[n:.*]]
+; CHECK: movl %[[r]], {{[0-9]+}}(%esp)
+; CHECK: movl %[[n]], {{[0-9]+}}(%esp)
+; CHECK: jmpl *%{{.*}}
+
+entry:
+ %r1 = add i32 %r, 1
+ %n1 = sub i32 %n, 1
+ %fn_cast = bitcast {}* %fn to i32 ({}*, i32, i32)*
+ %r2 = musttail call i32 %fn_cast({}* %fn, i32 %n1, i32 %r1)
+ ret i32 %r2
+}
+
+; Combine the complex stack frame with the parameter modification.
+define i32 @t5({}* %fn, i32 %n, i32 %r) alignstack(32) {
+; CHECK-LABEL: t5:
+; CHECK: pushl %ebp
+; CHECK: movl %esp, %ebp
+; CHECK: pushl %esi
+; Align the stack.
+; CHECK: andl $-32, %esp
+; CHECK: movl %esp, %esi
+; Modify the args.
+; CHECK: incl %[[r:.*]]
+; CHECK: decl %[[n:.*]]
+; Store them through ebp, since that's the only stable arg pointer.
+; CHECK: movl %[[r]], {{[0-9]+}}(%ebp)
+; CHECK: movl %[[n]], {{[0-9]+}}(%ebp)
+; Epilogue.
+; CHECK: leal {{[-0-9]+}}(%ebp), %esp
+; CHECK: popl %esi
+; CHECK: popl %ebp
+; CHECK: jmpl *%{{.*}}
+
+entry:
+ %a = alloca i8, i32 %n
+ call void @capture(i8* %a)
+ %r1 = add i32 %r, 1
+ %n1 = sub i32 %n, 1
+ %fn_cast = bitcast {}* %fn to i32 ({}*, i32, i32)*
+ %r2 = musttail call i32 %fn_cast({}* %fn, i32 %n1, i32 %r1)
+ ret i32 %r2
+}