author    | tstellar <tstellar@91177308-0d34-0410-b5e6-96231b3b80d8> | 2012-10-22 14:55:30 +0000
committer | tstellar <tstellar@91177308-0d34-0410-b5e6-96231b3b80d8> | 2012-10-22 14:55:30 +0000
commit    | 81d3998812b5977a220d62076cc446822747af21 (patch)
tree      | b073cb1be9c65dbdd6cd312c322efd3c04abb0af
parent    | 63a079bf76d139b91e170557cf16b17bdbecd58d (diff)
Merge master branch
git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/R600/@166411 91177308-0d34-0410-b5e6-96231b3b80d8
164 files changed, 5586 insertions, 783 deletions
diff --git a/bindings/ocaml/executionengine/llvm_executionengine.ml b/bindings/ocaml/executionengine/llvm_executionengine.ml
index b6178dc055e..ddb53bbb5af 100644
--- a/bindings/ocaml/executionengine/llvm_executionengine.ml
+++ b/bindings/ocaml/executionengine/llvm_executionengine.ml
@@ -84,7 +84,7 @@ module ExecutionEngine = struct
     = "llvm_ee_free_machine_code"

   external target_data: t -> Llvm_target.DataLayout.t
-    = "LLVMGetExecutionEngineDataLayout"
+    = "LLVMGetExecutionEngineTargetData"

   (* The following are not bound. Patches are welcome.
diff --git a/docs/CodingStandards.rst b/docs/CodingStandards.rst
index de50e6eeaf5..418e3f05a36 100644
--- a/docs/CodingStandards.rst
+++ b/docs/CodingStandards.rst
@@ -146,6 +146,132 @@ useful to use C style (``/* */``) comments however:
 To comment out a large block of code, use ``#if 0`` and ``#endif``. These nest
 properly and are better behaved in general than C style comments.

+Doxygen Use in Documentation Comments
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Use the ``\file`` command to turn the standard file header into a file-level
+comment.
+
+Include descriptive ``\brief`` paragraphs for all public interfaces (public
+classes, member and non-member functions).  Explain API use and purpose in
+``\brief`` paragraphs, don't just restate the information that can be inferred
+from the API name.  Put detailed discussion into separate paragraphs.
+
+To refer to parameter names inside a paragraph, use the ``\p name`` command.
+Don't use the ``\arg name`` command since it starts a new paragraph that
+contains documentation for the parameter.
+
+Wrap non-inline code examples in ``\code ... \endcode``.
+
+To document a function parameter, start a new paragraph with the
+``\param name`` command.  If the parameter is used as an out or an in/out
+parameter, use the ``\param [out] name`` or ``\param [in,out] name`` command,
+respectively.
+
+To describe function return value, start a new paragraph with the ``\returns``
+command.
+
+A minimal documentation comment:
+
+.. code-block:: c++
+
+  /// \brief Does foo and bar.
+  void fooBar(bool Baz);
+
+A documentation comment that uses all Doxygen features in a preferred way:
+
+.. code-block:: c++
+
+  /// \brief Does foo and bar.
+  ///
+  /// Does not do foo the usual way if \p Baz is true.
+  ///
+  /// Typical usage:
+  /// \code
+  ///   fooBar(false, "quux", Res);
+  /// \endcode
+  ///
+  /// \param Quux kind of foo to do.
+  /// \param [out] Result filled with bar sequence on foo success.
+  ///
+  /// \returns true on success.
+  bool fooBar(bool Baz, StringRef Quux, std::vector<int> &Result);
+
+Don't duplicate the documentation comment in the header file and in the
+implementation file.  Put the documentation comments for public APIs into the
+header file.  Documentation comments for private APIs can go to the
+implementation file.  In any case, implementation files can include additional
+comments (not necessarily in Doxygen markup) to explain implementation details
+as needed.
+
+Don't duplicate function or class name at the beginning of the comment.
+For humans it is obvious which function or class is being documented;
+automatic documentation processing tools are smart enough to bind the comment
+to the correct declaration.
+
+Wrong:
+
+.. code-block:: c++
+
+  // In Something.h:
+
+  /// Something - An abstraction for some complicated thing.
+  class Something {
+  public:
+    /// fooBar - Does foo and bar.
+    void fooBar();
+  };
+
+  // In Something.cpp:
+
+  /// fooBar - Does foo and bar.
+  void Something::fooBar() { ... }
+
+Correct:
+
+.. code-block:: c++
+
+  // In Something.h:
+
+  /// \brief An abstraction for some complicated thing.
+  class Something {
+  public:
+    /// \brief Does foo and bar.
+    void fooBar();
+  };
+
+  // In Something.cpp:
+
+  // Builds a B-tree in order to do foo.  See paper by...
+  void Something::fooBar() { ... }
+
+It is not required to use additional Doxygen features, but sometimes it might
+be a good idea to do so.
+
+Consider:
+
+* adding comments to any narrow namespace containing a collection of
+  related functions or types;
+
+* using top-level groups to organize a collection of related functions at
+  namespace scope where the grouping is smaller than the namespace;
+
+* using member groups and additional comments attached to member
+  groups to organize within a class.
+
+For example:
+
+.. code-block:: c++
+
+  class Something {
+    /// \name Functions that do Foo.
+    /// @{
+    void fooBar();
+    void fooBaz();
+    /// @}
+    ...
+  };
+
 ``#include`` Style
 ^^^^^^^^^^^^^^^^^^

@@ -604,8 +730,7 @@ code to be structured like this:

 .. code-block:: c++

-  /// containsFoo - Return true if the specified list has an element that is
-  /// a foo.
+  /// \returns true if the specified list has an element that is a foo.
   static bool containsFoo(const std::vector<Bar*> &List) {
     for (unsigned i = 0, e = List.size(); i != e; ++i)
       if (List[i]->isFoo())
@@ -1051,21 +1176,21 @@ If a namespace definition is small and *easily* fits on a screen (say, less than

   namespace llvm {
   namespace X86 {
-    /// RelocationType - An enum for the x86 relocation codes. Note that
+    /// \brief An enum for the x86 relocation codes.  Note that
     /// the terminology here doesn't follow x86 convention - word means
     /// 32-bit and dword means 64-bit.
     enum RelocationType {
-      /// reloc_pcrel_word - PC relative relocation, add the relocated value to
+      /// \brief PC relative relocation, add the relocated value to
       /// the value already in memory, after we adjust it for where the PC is.
       reloc_pcrel_word = 0,

-      /// reloc_picrel_word - PIC base relative relocation, add the relocated
-      /// value to the value already in memory, after we adjust it for where the
+      /// \brief PIC base relative relocation, add the relocated value to
+      /// the value already in memory, after we adjust it for where the
       /// PIC base is.
       reloc_picrel_word = 1,

-      /// reloc_absolute_word, reloc_absolute_dword - Absolute relocation, just
-      /// add the relocated value to the value already in memory.
+      /// \brief Absolute relocation, just add the relocated value to the
+      /// value already in memory.
       reloc_absolute_word = 2,
       reloc_absolute_dword = 3
     };
@@ -1084,7 +1209,7 @@ closed.  For example:

   namespace llvm {
   namespace knowledge {

-    /// Grokable - This class represents things that Smith can have an intimate
+    /// This class represents things that Smith can have an intimate
     /// understanding of and contains the data associated with it.
     class Grokable {
     ...
diff --git a/docs/CommandGuide/lit.rst b/docs/CommandGuide/lit.rst
index 3eb0be91f13..9e96cd2a4bf 100644
--- a/docs/CommandGuide/lit.rst
+++ b/docs/CommandGuide/lit.rst
@@ -125,6 +125,10 @@ EXECUTION OPTIONS
  *--error-exitcode* argument for valgrind is used so that valgrind failures will
  cause the program to exit with a non-zero status.

+ When this option is enabled, **lit** will also automatically provide a
+ "valgrind" feature that can be used to conditionally disable (or expect
+ failure in) certain tests.
+
 **--vg-arg**\ =\ *ARG*

@@ -133,6 +137,15 @@ EXECUTION OPTIONS

+**--vg-leak**
+
+ When *--vg* is used, enable memory leak checks.  When this option is enabled,
+ **lit** will also automatically provide a "vg_leak" feature that can be
+ used to conditionally disable (or expect failure in) certain tests.
+
+
+
 **--time-tests**

  Track the wall time individual tests take to execute and includes the results in
diff --git a/docs/LangRef.html b/docs/LangRef.html
index 167397ff53d..874e12fa44a 100644
--- a/docs/LangRef.html
+++ b/docs/LangRef.html
@@ -1364,11 +1364,13 @@ target datalayout = "<i>layout specification</i>"
       8-bits.  If omitted, the natural stack alignment defaults to "unspecified",
       which does not prevent any alignment promotions.</dd>

-  <dt><tt>p:<i>size</i>:<i>abi</i>:<i>pref</i></tt></dt>
+  <dt><tt>p[n]:<i>size</i>:<i>abi</i>:<i>pref</i></tt></dt>
   <dd>This specifies the <i>size</i> of a pointer and its <i>abi</i> and
-      <i>preferred</i> alignments. All sizes are in bits. Specifying
-      the <i>pref</i> alignment is optional. If omitted, the
-      preceding <tt>:</tt> should be omitted too.</dd>
+      <i>preferred</i> alignments for address space <i>n</i>. All sizes are in
+      bits. Specifying the <i>pref</i> alignment is optional. If omitted, the
+      preceding <tt>:</tt> should be omitted too. The address space,
+      <i>n</i> is optional, and if not specified, denotes the default address
+      space 0. The value of <i>n</i> must be in the range [1,2^23).</dd>

   <dt><tt>i<i>size</i>:<i>abi</i>:<i>pref</i></tt></dt>
   <dd>This specifies the alignment for an integer type of a given bit
@@ -1409,6 +1411,10 @@ target datalayout = "<i>layout specification</i>"
 <ul>
   <li><tt>E</tt> - big endian</li>
   <li><tt>p:64:64:64</tt> - 64-bit pointers with 64-bit alignment</li>
+  <li><tt>p1:32:32:32</tt> - 32-bit pointers with 32-bit alignment for
+  address space 1</li>
+  <li><tt>p2:16:32:32</tt> - 16-bit pointers with 32-bit alignment for
+  address space 2</li>
   <li><tt>i1:8:8</tt> - i1 is 8-bit (byte) aligned</li>
   <li><tt>i8:8:8</tt> - i8 is 8-bit (byte) aligned</li>
   <li><tt>i16:16:16</tt> - i16 is 16-bit aligned</li>
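A quick illustration of the new per-address-space pointer syntax, as a C++ sketch rather than anything from this patch (the module name and the exact layout string are assumptions; the `p1:` component follows the grammar documented above):

    #include "llvm/LLVMContext.h"
    #include "llvm/Module.h"

    // Hypothetical target description: default (address space 0) pointers are
    // 64-bit, while address space 1 uses 32-bit pointers with 32-bit alignment.
    int main() {
      llvm::LLVMContext Context;
      llvm::Module M("example", Context);
      M.setDataLayout("e-p:64:64:64-p1:32:32:32-i32:32:32");
      return 0;
    }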
diff --git a/docs/README.txt b/docs/README.txt
index 2fbbf987405..5ddd599d8a7 100644
--- a/docs/README.txt
+++ b/docs/README.txt
@@ -6,7 +6,7 @@ The LLVM documentation is currently written in two formats:

  * Plain HTML documentation.

  * reStructured Text documentation using the Sphinx documentation generator. It
-   is currently tested with Sphinx 1.1.3.
+   is currently tested with Sphinx 1.1.3. For more information, see the "Sphinx Introduction for LLVM Developers" document.
diff --git a/docs/ReleaseNotes.html b/docs/ReleaseNotes.html
index 75a6fd1ca10..0ef8f3d1f37 100644
--- a/docs/ReleaseNotes.html
+++ b/docs/ReleaseNotes.html
@@ -466,6 +466,20 @@ Release Notes</a>.</h1>
 <p>In addition to many minor performance tweaks and bug fixes, this release
    includes a few major enhancements and additions to the optimizers:</p>

+<p> Loop Vectorizer - We've added a loop vectorizer and we are now able to
+    vectorize small loops. The loop vectorizer is disabled by default and
+    can be enabled using the <b>-mllvm -vectorize</b> flag. <br/>
+    We can now vectorize this code:
+
+    <pre class="doc_code">
+    for (i=0; i<n; i++) {
+      a[i] = b[i+1] + c[i+3] + i;
+      sum += d[i];
+    }
+    </pre>
+
+  </p>
+
 <ul>
   <li>...</li>
 </ul>
@@ -506,6 +520,8 @@ Release Notes</a>.</h1>
    We use the lifetime markers to tell the codegen that a certain alloca
    is used within a region.</p>

+<p> We now merge consecutive loads and stores. </p>
+
 <p>We have put a significant amount of work into the code generator
    infrastructure, which allows us to implement more aggressive algorithms and
    make it run faster:</p>
@@ -645,6 +661,11 @@ Release Notes</a>.</h1>
 <p>In addition, many APIs have changed in this release.  Some of the major
    LLVM API changes are:</p>

+<p> We've added a new interface for allowing IR-level passes to access
+    target-specific information. A new IR-level pass, called
+    "TargetTransformInfo" provides a number of low-level interfaces.
+    LSR and LowerInvoke already use the new interface. </p>
+
 <ul>
   <li>...</li>
 </ul>
diff --git a/docs/TestingGuide.html b/docs/TestingGuide.html
index 1f9c9157306..ae2643fe4e8 100644
--- a/docs/TestingGuide.html
+++ b/docs/TestingGuide.html
@@ -798,14 +798,15 @@ define two separate CHECK lines that match on the same line.
 <p>Sometimes it is necessary to mark a test case as "expected fail" or XFAIL.
    You can easily mark a test as XFAIL just by including <tt>XFAIL: </tt> on a
    line near the top of the file. This signals that the test case should succeed
-   if the test fails. Such test cases are counted separately by the testing tool. To
-   specify an expected fail, use the XFAIL keyword in the comments of the test
-   program followed by a colon and one or more regular expressions (separated by
-   a comma). The regular expressions allow you to XFAIL the test conditionally by
-   host platform. The regular expressions following the : are matched against the
-   target triplet for the host machine. If there is a match, the test is expected
-   to fail. If not, the test is expected to succeed. To XFAIL everywhere just
-   specify <tt>XFAIL: *</tt>. Here is an example of an <tt>XFAIL</tt> line:</p>
+   if the test fails. Such test cases are counted separately by the testing
+   tool. To specify an expected fail, use the XFAIL keyword in the comments of
+   the test program followed by a colon and one or more failure patterns. Each
+   failure pattern can be either '*' (to specify fail everywhere), or a part of a
+   target triple (indicating the test should fail on that platform), or the name
+   of a configurable feature (for example, "loadable_module"). If there is a
+   match, the test is expected to fail. If not, the test is expected to
+   succeed. To XFAIL everywhere just specify <tt>XFAIL: *</tt>. Here is an
+   example of an <tt>XFAIL</tt> line:</p>

 <div class="doc_code">
 <pre>
diff --git a/docs/subsystems.rst b/docs/subsystems.rst
index 8c3cdf2417b..6f77b79fbe2 100644
--- a/docs/subsystems.rst
+++ b/docs/subsystems.rst
@@ -91,3 +91,10 @@ Subsystem Documentation
 * :ref:`segmented_stacks`

    This document describes segmented stacks and how they are used in LLVM.
+
+* `Howto: Implementing LLVM Integrated Assembler`_
+
+   A simple guide for how to implement an LLVM integrated assembler for an
+   architecture.
+
+.. _`Howto: Implementing LLVM Integrated Assembler`: http://www.embecosm.com/download/ean10.html
diff --git a/include/llvm-c/Transforms/Vectorize.h b/include/llvm-c/Transforms/Vectorize.h
index 9e7c7540d76..68a9bdd3885 100644
--- a/include/llvm-c/Transforms/Vectorize.h
+++ b/include/llvm-c/Transforms/Vectorize.h
@@ -36,6 +36,9 @@ extern "C" {
 /** See llvm::createBBVectorizePass function. */
 void LLVMAddBBVectorizePass(LLVMPassManagerRef PM);

+/** See llvm::createLoopVectorizePass function. */
+void LLVMAddLoopVectorizePass(LLVMPassManagerRef PM);
+
 /**
  * @}
  */
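A minimal sketch of driving the new pass from the C API: only LLVMAddLoopVectorizePass comes from this commit; the surrounding pass-manager setup is ordinary llvm-c usage and the helper name is illustrative.

    #include "llvm-c/Core.h"
    #include "llvm-c/Transforms/Vectorize.h"

    // Run the new loop vectorizer over a module via the C API.
    // 'M' is assumed to be a previously created or parsed LLVMModuleRef.
    void runLoopVectorizer(LLVMModuleRef M) {
      LLVMPassManagerRef PM = LLVMCreatePassManager();
      LLVMAddLoopVectorizePass(PM);   /* added in this commit */
      LLVMRunPassManager(PM, M);      /* nonzero if the module was modified */
      LLVMDisposePassManager(PM);
    }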
diff --git a/include/llvm/ADT/Optional.h b/include/llvm/ADT/Optional.h
index ee8b69f3d12..f43aeb1bc4d 100644
--- a/include/llvm/ADT/Optional.h
+++ b/include/llvm/ADT/Optional.h
@@ -16,8 +16,13 @@
 #ifndef LLVM_ADT_OPTIONAL
 #define LLVM_ADT_OPTIONAL

+#include "llvm/Support/Compiler.h"
 #include <cassert>

+#if LLVM_USE_RVALUE_REFERENCES
+#include <utility>
+#endif
+
 namespace llvm {

 template<typename T>
@@ -28,6 +33,10 @@ public:
   explicit Optional() : x(), hasVal(false) {}
   Optional(const T &y) : x(y), hasVal(true) {}

+#if LLVM_USE_RVALUE_REFERENCES
+  Optional(T &&y) : x(std::forward<T>(y)), hasVal(true) {}
+#endif
+
   static inline Optional create(const T* y) {
     return y ? Optional(*y) : Optional();
   }
diff --git a/include/llvm/ADT/OwningPtr.h b/include/llvm/ADT/OwningPtr.h
index ea9495d3869..05bcd40d086 100644
--- a/include/llvm/ADT/OwningPtr.h
+++ b/include/llvm/ADT/OwningPtr.h
@@ -32,6 +32,15 @@ class OwningPtr {
 public:
   explicit OwningPtr(T *P = 0) : Ptr(P) {}

+#if LLVM_USE_RVALUE_REFERENCES
+  OwningPtr(OwningPtr &&Other) : Ptr(Other.take()) {}
+
+  OwningPtr &operator=(OwningPtr &&Other) {
+    reset(Other.take());
+    return *this;
+  }
+#endif
+
   ~OwningPtr() {
     delete Ptr;
   }
@@ -86,6 +95,15 @@ class OwningArrayPtr {
 public:
   explicit OwningArrayPtr(T *P = 0) : Ptr(P) {}

+#if LLVM_USE_RVALUE_REFERENCES
+  OwningArrayPtr(OwningArrayPtr &&Other) : Ptr(Other.take()) {}
+
+  OwningArrayPtr &operator=(OwningArrayPtr &&Other) {
+    reset(Other.take());
+    return *this;
+  }
+#endif
+
   ~OwningArrayPtr() {
     delete [] Ptr;
   }
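A small usage sketch for the move support added above. It assumes a C++11 build where LLVM_USE_RVALUE_REFERENCES is set; the Widget type and helper function are illustrative only.

    #include "llvm/ADT/OwningPtr.h"
    #include <utility>

    struct Widget { int Id; };

    // Ownership is transferred via the move assignment added in this patch;
    // the raw pointer changes hands without any copy or double delete.
    void transferExample() {
      llvm::OwningPtr<Widget> A(new Widget());
      llvm::OwningPtr<Widget> B;
      B = std::move(A);   // uses the new OwningPtr::operator=(OwningPtr&&)
      // A is now empty; B owns the Widget and deletes it on destruction.
    }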
diff --git a/include/llvm/ADT/SmallBitVector.h b/include/llvm/ADT/SmallBitVector.h
index fba1d12542a..a9cd54e13b3 100644
--- a/include/llvm/ADT/SmallBitVector.h
+++ b/include/llvm/ADT/SmallBitVector.h
@@ -306,8 +306,8 @@ public:
     assert(E <= size() && "Attempted to set out-of-bounds range!");
     if (I == E) return *this;
     if (isSmall()) {
-      uintptr_t EMask = 1 << E;
-      uintptr_t IMask = 1 << I;
+      uintptr_t EMask = ((uintptr_t)1) << E;
+      uintptr_t IMask = ((uintptr_t)1) << I;
       uintptr_t Mask = EMask - IMask;
       setSmallBits(getSmallBits() | Mask);
     } else
@@ -337,8 +337,8 @@ public:
     assert(E <= size() && "Attempted to reset out-of-bounds range!");
     if (I == E) return *this;
     if (isSmall()) {
-      uintptr_t EMask = 1 << E;
-      uintptr_t IMask = 1 << I;
+      uintptr_t EMask = ((uintptr_t)1) << E;
+      uintptr_t IMask = ((uintptr_t)1) << I;
       uintptr_t Mask = EMask - IMask;
       setSmallBits(getSmallBits() & ~Mask);
     } else
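The cast in the fix above matters because the literal `1` is a 32-bit `int`: shifting it by 32 or more bits is undefined behavior, so the old code computed wrong masks for bit indices past 31 on 64-bit hosts. A standalone sketch of the difference (not from the patch):

    #include <cstdint>
    #include <cstdio>

    int main() {
      unsigned E = 40;  // a bit index >= 32, possible in small mode on 64-bit hosts
      // Undefined behavior: '1' is a 32-bit int shifted past its width.
      // uintptr_t Bad = 1 << E;
      // Correct: widen to the pointer-sized type before shifting.
      uintptr_t Good = ((uintptr_t)1) << E;
      std::printf("%llu\n", (unsigned long long)Good);  // 2^40 on 64-bit hosts
      return 0;
    }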
diff --git a/include/llvm/Analysis/ScalarEvolutionExpander.h b/include/llvm/Analysis/ScalarEvolutionExpander.h
index 3ab9c8256bb..3f8f149cb42 100644
--- a/include/llvm/Analysis/ScalarEvolutionExpander.h
+++ b/include/llvm/Analysis/ScalarEvolutionExpander.h
@@ -22,7 +22,7 @@
 #include <set>

 namespace llvm {
-  class ScalarTargetTransformInfo;
+  class TargetLowering;

   /// Return true if the given expression is safe to expand in the sense that
   /// all materialized values are safe to speculate.
@@ -129,7 +129,7 @@ namespace llvm {
     /// representative. Return the number of phis eliminated.
     unsigned replaceCongruentIVs(Loop *L, const DominatorTree *DT,
                                  SmallVectorImpl<WeakVH> &DeadInsts,
-                                 const ScalarTargetTransformInfo *STTI = NULL);
+                                 const TargetLowering *TLI = NULL);

     /// expandCodeFor - Insert code to directly compute the specified SCEV
     /// expression into the program.  The inserted code is inserted into the
diff --git a/include/llvm/Attributes.h b/include/llvm/Attributes.h
index a28aa183473..1e995f9a858 100644
--- a/include/llvm/Attributes.h
+++ b/include/llvm/Attributes.h
@@ -290,16 +290,7 @@ struct AttributeWithIndex {

   static AttributeWithIndex get(LLVMContext &C, unsigned Idx,
                                 ArrayRef<Attributes::AttrVal> Attrs) {
-    AttrBuilder B;
-
-    for (ArrayRef<Attributes::AttrVal>::iterator I = Attrs.begin(),
-           E = Attrs.end(); I != E; ++I)
-      B.addAttribute(*I);
-
-    AttributeWithIndex P;
-    P.Index = Idx;
-    P.Attrs = Attributes::get(C, B);
-    return P;
+    return get(Idx, Attributes::get(C, Attrs));
   }
   static AttributeWithIndex get(unsigned Idx, Attributes Attrs) {
     AttributeWithIndex P;
@@ -324,8 +315,8 @@ public:
     FunctionIndex = ~0U
   };
 private:
-  /// AttrList - The attributes that we are managing.  This can be null
-  /// to represent the empty attributes list.
+  /// AttrList - The attributes that we are managing.  This can be null to
+  /// represent the empty attributes list.
   AttributeListImpl *AttrList;
 public:
   AttrListPtr() : AttrList(0) {}
diff --git a/include/llvm/CodeGen/MachineFrameInfo.h b/include/llvm/CodeGen/MachineFrameInfo.h
index 7188b1abbdf..0e4e132e40d 100644
--- a/include/llvm/CodeGen/MachineFrameInfo.h
+++ b/include/llvm/CodeGen/MachineFrameInfo.h
@@ -29,6 +29,7 @@ class MachineBasicBlock;
 class TargetFrameLowering;
 class BitVector;
 class Value;
+class AllocaInst;

 /// The CalleeSavedInfo class tracks the information need to locate where a
 /// callee saved register is in the current frame.
@@ -106,14 +107,14 @@ class MachineFrameInfo {

     /// Alloca - If this stack object is originated from an Alloca instruction
     /// this value saves the original IR allocation. Can be NULL.
-    const Value *Alloca;
+    const AllocaInst *Alloca;

     // PreAllocated - If true, the object was mapped into the local frame
     // block and doesn't need additional handling for allocation beyond that.
     bool PreAllocated;

     StackObject(uint64_t Sz, unsigned Al, int64_t SP, bool IM,
-                bool isSS, bool NSP, const Value *Val)
+                bool isSS, bool NSP, const AllocaInst *Val)
       : SPOffset(SP), Size(Sz), Alignment(Al), isImmutable(IM),
         isSpillSlot(isSS), MayNeedSP(NSP), Alloca(Val), PreAllocated(false) {}
   };
@@ -369,7 +370,7 @@ public:

   /// getObjectAllocation - Return the underlying Alloca of the specified
   /// stack object if it exists. Returns 0 if none exists.
-  const Value* getObjectAllocation(int ObjectIdx) const {
+  const AllocaInst* getObjectAllocation(int ObjectIdx) const {
     assert(unsigned(ObjectIdx+NumFixedObjects) < Objects.size() &&
            "Invalid Object Idx!");
     return Objects[ObjectIdx+NumFixedObjects].Alloca;
@@ -495,7 +496,7 @@ public:
   /// a nonnegative identifier to represent it.
   ///
   int CreateStackObject(uint64_t Size, unsigned Alignment, bool isSS,
-                        bool MayNeedSP = false, const Value *Alloca = 0) {
+                        bool MayNeedSP = false, const AllocaInst *Alloca = 0) {
     assert(Size != 0 && "Cannot allocate zero size stack objects!");
     Objects.push_back(StackObject(Size, Alignment, 0, false, isSS, MayNeedSP,
                                   Alloca));
diff --git a/include/llvm/CodeGen/MachineRegisterInfo.h b/include/llvm/CodeGen/MachineRegisterInfo.h
index a5bc7f7d391..4e86363f071 100644
--- a/include/llvm/CodeGen/MachineRegisterInfo.h
+++ b/include/llvm/CodeGen/MachineRegisterInfo.h
@@ -77,16 +77,20 @@ class MachineRegisterInfo {
     return MO->Contents.Reg.Next;
   }

-  /// UsedPhysRegs - This is a bit vector that is computed and set by the
+  /// UsedRegUnits - This is a bit vector that is computed and set by the
   /// register allocator, and must be kept up to date by passes that run after
   /// register allocation (though most don't modify this).  This is used
   /// so that the code generator knows which callee save registers to save and
   /// for other target specific uses.
-  /// This vector only has bits set for registers explicitly used, not their
-  /// aliases.
-  BitVector UsedPhysRegs;
-
-  /// UsedPhysRegMask - Additional used physregs, but including aliases.
+  /// This vector has bits set for register units that are modified in the
+  /// current function. It doesn't include registers clobbered by function
+  /// calls with register mask operands.
+  BitVector UsedRegUnits;
+
+  /// UsedPhysRegMask - Additional used physregs including aliases.
+  /// This bit vector represents all the registers clobbered by function calls.
+  /// It can model things that UsedRegUnits can't, such as function calls that
+  /// clobber ymm7 but preserve the low half in xmm7.
   BitVector UsedPhysRegMask;

   /// ReservedRegs - This is a bit vector of reserved registers.  The target
@@ -357,29 +361,27 @@ public:
   //===--------------------------------------------------------------------===//

   /// isPhysRegUsed - Return true if the specified register is used in this
-  /// function.  This only works after register allocation.
+  /// function. Also check for clobbered aliases and registers clobbered by
+  /// function calls with register mask operands.
+  ///
+  /// This only works after register allocation. It is primarily used by
+  /// PrologEpilogInserter to determine which callee-saved registers need
+  /// spilling.
   bool isPhysRegUsed(unsigned Reg) const {
-    return UsedPhysRegs.test(Reg) || UsedPhysRegMask.test(Reg);
-  }
-
-  /// isPhysRegOrOverlapUsed - Return true if Reg or any overlapping register
-  /// is used in this function.
-  bool isPhysRegOrOverlapUsed(unsigned Reg) const {
     if (UsedPhysRegMask.test(Reg))
       return true;
-    for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
-      if (UsedPhysRegs.test(*AI))
+    for (MCRegUnitIterator Units(Reg, TRI); Units.isValid(); ++Units)
+      if (UsedRegUnits.test(*Units))
         return true;
     return false;
   }

   /// setPhysRegUsed - Mark the specified register used in this function.
   /// This should only be called during and after register allocation.
-  void setPhysRegUsed(unsigned Reg) { UsedPhysRegs.set(Reg); }
-
-  /// addPhysRegsUsed - Mark the specified registers used in this function.
-  /// This should only be called during and after register allocation.
-  void addPhysRegsUsed(const BitVector &Regs) { UsedPhysRegs |= Regs; }
+  void setPhysRegUsed(unsigned Reg) {
+    for (MCRegUnitIterator Units(Reg, TRI); Units.isValid(); ++Units)
+      UsedRegUnits.set(*Units);
+  }

   /// addPhysRegsUsedFromRegMask - Mark any registers not in RegMask as used.
   /// This corresponds to the bit mask attached to register mask operands.
@@ -390,8 +392,9 @@ public:
   /// setPhysRegUnused - Mark the specified register unused in this function.
   /// This should only be called during and after register allocation.
   void setPhysRegUnused(unsigned Reg) {
-    UsedPhysRegs.reset(Reg);
     UsedPhysRegMask.reset(Reg);
+    for (MCRegUnitIterator Units(Reg, TRI); Units.isValid(); ++Units)
+      UsedRegUnits.reset(*Units);
   }
diff --git a/include/llvm/CodeGen/SchedulerRegistry.h b/include/llvm/CodeGen/SchedulerRegistry.h
index a582b0c40c8..836b73a15a2 100644
--- a/include/llvm/CodeGen/SchedulerRegistry.h
+++ b/include/llvm/CodeGen/SchedulerRegistry.h
@@ -102,6 +102,11 @@ ScheduleDAGSDNodes *createVLIWDAGScheduler(SelectionDAGISel *IS,
 ScheduleDAGSDNodes *createDefaultScheduler(SelectionDAGISel *IS,
                                            CodeGenOpt::Level OptLevel);

+/// createDAGLinearizer - This creates a "no-scheduling" scheduler which
+/// linearize the DAG using topological order.
+ScheduleDAGSDNodes *createDAGLinearizer(SelectionDAGISel *IS,
+                                        CodeGenOpt::Level OptLevel);
+
 } // end namespace llvm

 #endif
diff --git a/include/llvm/ExecutionEngine/RuntimeDyld.h b/include/llvm/ExecutionEngine/RuntimeDyld.h
index 7da4a4e09a3..a71b1411c8a 100644
--- a/include/llvm/ExecutionEngine/RuntimeDyld.h
+++ b/include/llvm/ExecutionEngine/RuntimeDyld.h
@@ -63,7 +63,7 @@ protected:
   // Any relocations already associated with the symbol will be re-resolved.
   void reassignSectionAddress(unsigned SectionID, uint64_t Addr);
 public:
-  RuntimeDyld(RTDyldMemoryManager*);
+  RuntimeDyld(RTDyldMemoryManager *);
   ~RuntimeDyld();

   /// loadObject - prepare the object contained in the input buffer for
diff --git a/include/llvm/InitializePasses.h b/include/llvm/InitializePasses.h
index ee9b1c5852e..a5f7008b6fb 100644
--- a/include/llvm/InitializePasses.h
+++ b/include/llvm/InitializePasses.h
@@ -66,6 +66,7 @@ void initializeAliasDebuggerPass(PassRegistry&);
 void initializeAliasSetPrinterPass(PassRegistry&);
 void initializeAlwaysInlinerPass(PassRegistry&);
 void initializeArgPromotionPass(PassRegistry&);
+void initializeBarrierNoopPass(PassRegistry&);
 void initializeBasicAliasAnalysisPass(PassRegistry&);
 void initializeBasicCallGraphPass(PassRegistry&);
 void initializeBlockExtractorPassPass(PassRegistry&);
@@ -261,6 +262,7 @@ void initializeVirtRegRewriterPass(PassRegistry&);
 void initializeInstSimplifierPass(PassRegistry&);
 void initializeUnpackMachineBundlesPass(PassRegistry&);
 void initializeFinalizeMachineBundlesPass(PassRegistry&);
+void initializeLoopVectorizePass(PassRegistry&);
 void initializeBBVectorizePass(PassRegistry&);
 void initializeMachineFunctionPrinterPassPass(PassRegistry&);
 }
diff --git a/include/llvm/Intrinsics.td b/include/llvm/Intrinsics.td
index 63d8d2c0f29..059bd804aa1 100644
--- a/include/llvm/Intrinsics.td
+++ b/include/llvm/Intrinsics.td
@@ -285,9 +285,9 @@ let Properties = [IntrNoMem] in {

 // NOTE: these are internal interfaces.
 def int_setjmp     : Intrinsic<[llvm_i32_ty],  [llvm_ptr_ty]>;
-def int_longjmp    : Intrinsic<[], [llvm_ptr_ty, llvm_i32_ty]>;
+def int_longjmp    : Intrinsic<[], [llvm_ptr_ty, llvm_i32_ty], [IntrNoReturn]>;
 def int_sigsetjmp  : Intrinsic<[llvm_i32_ty] , [llvm_ptr_ty, llvm_i32_ty]>;
-def int_siglongjmp : Intrinsic<[], [llvm_ptr_ty, llvm_i32_ty]>;
+def int_siglongjmp : Intrinsic<[], [llvm_ptr_ty, llvm_i32_ty], [IntrNoReturn]>;

 // Internal interface for object size checking
 def int_objectsize : Intrinsic<[llvm_anyint_ty], [llvm_ptr_ty, llvm_i1_ty],
@@ -345,7 +345,7 @@ let Properties = [IntrNoMem] in {
 }
 def int_eh_sjlj_functioncontext : Intrinsic<[], [llvm_ptr_ty]>;
 def int_eh_sjlj_setjmp          : Intrinsic<[llvm_i32_ty], [llvm_ptr_ty]>;
-def int_eh_sjlj_longjmp         : Intrinsic<[], [llvm_ptr_ty]>;
+def int_eh_sjlj_longjmp         : Intrinsic<[], [llvm_ptr_ty], [IntrNoReturn]>;

 //===---------------- Generic Variable Attribute Intrinsics----------------===//
 //
diff --git a/include/llvm/LinkAllPasses.h b/include/llvm/LinkAllPasses.h
index 4b10d0e5415..8652acd941f 100644
--- a/include/llvm/LinkAllPasses.h
+++ b/include/llvm/LinkAllPasses.h
@@ -156,6 +156,7 @@ namespace {
       (void) llvm::createCorrelatedValuePropagationPass();
       (void) llvm::createMemDepPrinter();
       (void) llvm::createInstructionSimplifierPass();
+      (void) llvm::createLoopVectorizePass();
      (void) llvm::createBBVectorizePass();

       (void)new llvm::IntervalPartition();
diff --git a/include/llvm/MC/MCParser/MCAsmParser.h b/include/llvm/MC/MCParser/MCAsmParser.h
index 08758cda226..fcf5b6a9b59 100644
--- a/include/llvm/MC/MCParser/MCAsmParser.h
+++ b/include/llvm/MC/MCParser/MCAsmParser.h
@@ -20,6 +20,8 @@ class MCAsmLexer;
 class MCAsmParserExtension;
 class MCContext;
 class MCExpr;
+class MCInstPrinter;
+class MCInstrInfo;
 class MCParsedAsmOperand;
 class MCStreamer;
 class MCTargetAsmParser;
@@ -29,6 +31,14 @@ class SourceMgr;
 class StringRef;
 class Twine;

+/// MCAsmParserSemaCallback - Generic Sema callback for assembly parser.
+class MCAsmParserSemaCallback {
+public:
+  virtual ~MCAsmParserSemaCallback();
+  virtual void *LookupInlineAsmIdentifier(StringRef Name, void *Loc,
+                                          unsigned &Size) = 0;
+};
+
 /// MCAsmParser - Generic assembler parser interface, for use by target specific
 /// assembly parsers.
 class MCAsmParser {
@@ -75,26 +85,21 @@ public:
   virtual bool Run(bool NoInitialTextSection, bool NoFinalize = false) = 0;

   virtual void setParsingInlineAsm(bool V) = 0;
+  virtual bool isParsingInlineAsm() = 0;
+
+  /// ParseMSInlineAsm - Parse ms-style inline assembly.
+  virtual bool ParseMSInlineAsm(void *AsmLoc, std::string &AsmString,
+                                unsigned &NumOutputs, unsigned &NumInputs,
+                                SmallVectorImpl<void *> &OpDecls,
+                                SmallVectorImpl<std::string> &Constraints,
+                                SmallVectorImpl<std::string> &Clobbers,
+                                const MCInstrInfo *MII,
+                                const MCInstPrinter *IP,
+                                MCAsmParserSemaCallback &SI) = 0;

   /// ParseStatement - Parse the next statement.
   virtual bool ParseStatement() = 0;

-  /// getNumParsedOperands - Returns the number of MCAsmParsedOperands from the
-  /// previously parsed statement.
-  virtual unsigned getNumParsedOperands() = 0;
-
-  /// getParsedOperand - Get a MCAsmParsedOperand.
-  virtual MCParsedAsmOperand &getParsedOperand(unsigned OpNum) = 0;
-
-  /// freeParsedOperands - Free the MCAsmParsedOperands.
-  virtual void freeParsedOperands() = 0;
-
-  /// isInstruction - Was the previously parsed statement an instruction?
-  virtual bool isInstruction() = 0;
-
-  /// getOpcode - Get the opcode from the previously parsed instruction.
-  virtual unsigned getOpcode() = 0;
-
   /// Warning - Emit a warning at the location \p L, with the message \p Msg.
   ///
   /// \return The return value is true, if warnings are fatal.
diff --git a/include/llvm/MC/MCParser/MCParsedAsmOperand.h b/include/llvm/MC/MCParser/MCParsedAsmOperand.h
index 280145bfbc8..7d0914ebb49 100644
--- a/include/llvm/MC/MCParser/MCParsedAsmOperand.h
+++ b/include/llvm/MC/MCParser/MCParsedAsmOperand.h
@@ -57,12 +57,17 @@ public:

   /// isMem - Is this a memory operand?
   virtual bool isMem() const = 0;
+  virtual unsigned getMemSize() const { return 0; }

   /// getStartLoc - Get the location of the first token of this operand.
   virtual SMLoc getStartLoc() const = 0;

   /// getEndLoc - Get the location of the last token of this operand.
   virtual SMLoc getEndLoc() const = 0;

+  /// needSizeDirective - Do we need to emit a sizing directive for this
+  /// operand?  Only valid when parsing MS-style inline assembly.
+  virtual bool needSizeDirective() const { return false; }
+
   /// print - Print a debug representation of the operand to the given stream.
   virtual void print(raw_ostream &OS) const = 0;
   /// dump - Print to the debug stream.
diff --git a/include/llvm/MC/MCSchedule.h b/include/llvm/MC/MCSchedule.h
index 0504dc13c8f..c9a060c79b1 100644
--- a/include/llvm/MC/MCSchedule.h
+++ b/include/llvm/MC/MCSchedule.h
@@ -54,10 +54,12 @@ struct MCWriteProcResEntry {
 };

 /// Specify the latency in cpu cycles for a particular scheduling class and def
-/// index. Also identify the WriteResources of this def. When the operand
-/// expands to a sequence of writes, this ID is the last write in the sequence.
+/// index. -1 indicates an invalid latency. Heuristics would typically consider
+/// an instruction with invalid latency to have infinite latency. Also identify
+/// the WriteResources of this def. When the operand expands to a sequence of
+/// writes, this ID is the last write in the sequence.
 struct MCWriteLatencyEntry {
-  unsigned Cycles;
+  int Cycles;
   unsigned WriteResourceID;

   bool operator==(const MCWriteLatencyEntry &Other) const {
diff --git a/include/llvm/MC/MCTargetAsmParser.h b/include/llvm/MC/MCTargetAsmParser.h
index c9ea5ae4846..05537f9211f 100644
--- a/include/llvm/MC/MCTargetAsmParser.h
+++ b/include/llvm/MC/MCTargetAsmParser.h
@@ -41,12 +41,26 @@ protected: // Can only create subclasses.
   /// AvailableFeatures - The current set of available features.
   unsigned AvailableFeatures;

+  /// ParsingInlineAsm - Are we parsing ms-style inline assembly?
+  bool ParsingInlineAsm;
+
+  /// SemaCallback - The Sema callback implementation.  Must be set when parsing
+  /// ms-style inline assembly.
+  MCAsmParserSemaCallback *SemaCallback;
+
 public:
   virtual ~MCTargetAsmParser();

   unsigned getAvailableFeatures() const { return AvailableFeatures; }
   void setAvailableFeatures(unsigned Value) { AvailableFeatures = Value; }

+  bool isParsingInlineAsm () { return ParsingInlineAsm; }
+  void setParsingInlineAsm (bool Value) { ParsingInlineAsm = Value; }
+
+  void setSemaCallback(MCAsmParserSemaCallback *Callback) {
+    SemaCallback = Callback;
+  }
+
   virtual bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc,
                              SMLoc &EndLoc) = 0;
diff --git a/include/llvm/Object/ObjectFile.h b/include/llvm/Object/ObjectFile.h
index 41959bd34e1..4185ffea130 100644
--- a/include/llvm/Object/ObjectFile.h
+++ b/include/llvm/Object/ObjectFile.h
@@ -266,7 +266,7 @@ const uint64_t UnknownAddressOrSize = ~0ULL;

 /// ObjectFile - This class is the base class for all object file types.
 /// Concrete instances of this object are created by createObjectFile, which
-/// figure out which type to create.
+/// figures out which type to create.
 class ObjectFile : public Binary {
   virtual void anchor();
   ObjectFile() LLVM_DELETED_FUNCTION;
diff --git a/include/llvm/Operator.h b/include/llvm/Operator.h
index bc5da8e8aa3..462324a6694 100644
--- a/include/llvm/Operator.h
+++ b/include/llvm/Operator.h
@@ -36,8 +36,8 @@ private:
   void *operator new(size_t, unsigned) LLVM_DELETED_FUNCTION;
   void *operator new(size_t s) LLVM_DELETED_FUNCTION;
   Operator() LLVM_DELETED_FUNCTION;
-  // NOTE: cannot use LLVM_DELETED_FUNCTION because gcc errors when deleting
-  // an override of a non-deleted function.
+  // NOTE: cannot use LLVM_DELETED_FUNCTION because it's not legal to delete
+  // an overridden method that's not deleted in the base class.
   ~Operator();

 public:
@@ -191,7 +191,7 @@ public:
 /// opcodes.
 template<typename SuperClass, unsigned Opc>
 class ConcreteOperator : public SuperClass {
-  ~ConcreteOperator() LLVM_DELETED_FUNCTION;
+  ~ConcreteOperator(); // DO NOT IMPLEMENT
 public:
   static inline bool classof(const Instruction *I) {
     return I->getOpcode() == Opc;
@@ -207,44 +207,44 @@ public:
 class AddOperator
   : public ConcreteOperator<OverflowingBinaryOperator, Instruction::Add> {
-  ~AddOperator() LLVM_DELETED_FUNCTION;
+  ~AddOperator(); // DO NOT IMPLEMENT
 };
 class SubOperator
   : public ConcreteOperator<OverflowingBinaryOperator, Instruction::Sub> {
-  ~SubOperator() LLVM_DELETED_FUNCTION;
+  ~SubOperator(); // DO NOT IMPLEMENT
 };
 class MulOperator
   : public ConcreteOperator<OverflowingBinaryOperator, Instruction::Mul> {
-  ~MulOperator() LLVM_DELETED_FUNCTION;
+  ~MulOperator(); // DO NOT IMPLEMENT
 };
 class ShlOperator
   : public ConcreteOperator<OverflowingBinaryOperator, Instruction::Shl> {
-  ~ShlOperator() LLVM_DELETED_FUNCTION;
+  ~ShlOperator(); // DO NOT IMPLEMENT
 };
-
+
 class SDivOperator
   : public ConcreteOperator<PossiblyExactOperator, Instruction::SDiv> {
-  ~SDivOperator() LLVM_DELETED_FUNCTION;
+  ~SDivOperator(); // DO NOT IMPLEMENT
 };
 class UDivOperator
   : public ConcreteOperator<PossiblyExactOperator, Instruction::UDiv> {
-  ~UDivOperator() LLVM_DELETED_FUNCTION;
+  ~UDivOperator(); // DO NOT IMPLEMENT
 };
 class AShrOperator
   : public ConcreteOperator<PossiblyExactOperator, Instruction::AShr> {
-  ~AShrOperator() LLVM_DELETED_FUNCTION;
+  ~AShrOperator(); // DO NOT IMPLEMENT
 };
 class LShrOperator
   : public ConcreteOperator<PossiblyExactOperator, Instruction::LShr> {
-  ~LShrOperator() LLVM_DELETED_FUNCTION;
+  ~LShrOperator(); // DO NOT IMPLEMENT
 };
-
-
-
+
+
+
 class GEPOperator
   : public ConcreteOperator<Operator, Instruction::GetElementPtr> {
-  ~GEPOperator() LLVM_DELETED_FUNCTION;
+  ~GEPOperator(); // DO NOT IMPLEMENT

   enum {
     IsInBounds = (1 << 0)
diff --git a/include/llvm/Transforms/IPO.h b/include/llvm/Transforms/IPO.h
index 962cb63758d..08d3bbd941d 100644
--- a/include/llvm/Transforms/IPO.h
+++ b/include/llvm/Transforms/IPO.h
@@ -198,6 +198,11 @@ ModulePass *createPartialInliningPass();
 //
 ModulePass *createMetaRenamerPass();

+//===----------------------------------------------------------------------===//
+/// createBarrierNoopPass - This pass is purely a module pass barrier in a pass
+/// manager.
+ModulePass *createBarrierNoopPass();
+
 } // End llvm namespace

 #endif
diff --git a/include/llvm/Transforms/Scalar.h b/include/llvm/Transforms/Scalar.h
index 3b665bf4b68..a5d8eed7462 100644
--- a/include/llvm/Transforms/Scalar.h
+++ b/include/llvm/Transforms/Scalar.h
@@ -119,7 +119,7 @@ Pass *createLICMPass();
 // optional parameter used to consult the target machine whether certain
 // transformations are profitable.
 //
-Pass *createLoopStrengthReducePass();
+Pass *createLoopStrengthReducePass(const TargetLowering *TLI = 0);

 Pass *createGlobalMergePass(const TargetLowering *TLI = 0);

@@ -249,8 +249,9 @@ extern char &LowerSwitchID;
 // purpose "my LLVM-to-LLVM pass doesn't support the invoke instruction yet"
 // lowering pass.
 //
-FunctionPass *createLowerInvokePass();
-FunctionPass *createLowerInvokePass(bool useExpensiveEHSupport);
+FunctionPass *createLowerInvokePass(const TargetLowering *TLI = 0);
+FunctionPass *createLowerInvokePass(const TargetLowering *TLI,
+                                    bool useExpensiveEHSupport);
 extern char &LowerInvokePassID;

 //===----------------------------------------------------------------------===//
diff --git a/include/llvm/Transforms/Utils/BasicBlockUtils.h b/include/llvm/Transforms/Utils/BasicBlockUtils.h
index 2510aecc69c..b810f1a818c 100644
--- a/include/llvm/Transforms/Utils/BasicBlockUtils.h
+++ b/include/llvm/Transforms/Utils/BasicBlockUtils.h
@@ -25,9 +25,11 @@ namespace llvm {

 class AliasAnalysis;
 class Instruction;
+class MDNode;
 class Pass;
 class ReturnInst;
 class TargetLibraryInfo;
+class TerminatorInst;

 /// DeleteDeadBlock - Delete the specified block, which must have no
 /// predecessors.
@@ -203,6 +205,29 @@ void SplitLandingPadPredecessors(BasicBlock *OrigBB,ArrayRef<BasicBlock*> Preds,
 ReturnInst *FoldReturnIntoUncondBranch(ReturnInst *RI, BasicBlock *BB,
                                        BasicBlock *Pred);

+/// SplitBlockAndInsertIfThen - Split the containing block at the
+/// specified instruction - everything before and including Cmp stays
+/// in the old basic block, and everything after Cmp is moved to a
+/// new block. The two blocks are connected by a conditional branch
+/// (with value of Cmp being the condition).
+/// Before:
+///   Head
+///   Cmp
+///   Tail
+/// After:
+///   Head
+///   Cmp
+///   if (Cmp)
+///     ThenBlock
+///   Tail
+///
+/// If Unreachable is true, then ThenBlock ends with
+/// UnreachableInst, otherwise it branches to Tail.
+/// Returns the NewBasicBlock's terminator.
+
+TerminatorInst *SplitBlockAndInsertIfThen(Instruction *Cmp,
+    bool Unreachable, MDNode *BranchWeights = 0);
+
 } // End llvm namespace

 #endif
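A rough usage sketch for the new helper; only SplitBlockAndInsertIfThen itself comes from this patch, and the instrumentation scenario and function name are hypothetical.

    #include "llvm/Instructions.h"
    #include "llvm/Transforms/Utils/BasicBlockUtils.h"
    using namespace llvm;

    // 'Cond' is an i1-valued comparison instruction already inside a block.
    // After the call, everything past Cond has moved to a "Tail" block, and a
    // new "then" block (entered when Cond is true) ends in 'unreachable'.
    void insertTrapPath(Instruction *Cond) {
      TerminatorInst *ThenTerm =
          SplitBlockAndInsertIfThen(Cond, /*Unreachable=*/true);
      // Error-handling code (e.g. a call to a trap routine) would be
      // inserted before ThenTerm, inside the new ThenBlock.
      (void)ThenTerm;
    }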
diff --git a/include/llvm/Transforms/Vectorize.h b/include/llvm/Transforms/Vectorize.h
index 1e49a9c01e6..41e53a83e2f 100644
--- a/include/llvm/Transforms/Vectorize.h
+++ b/include/llvm/Transforms/Vectorize.h
@@ -107,6 +107,12 @@ BasicBlockPass *
 createBBVectorizePass(const VectorizeConfig &C = VectorizeConfig());

 //===----------------------------------------------------------------------===//
+//
+// LoopVectorize - Create a loop vectorization pass.
+//
+Pass * createLoopVectorizePass();
+
+//===----------------------------------------------------------------------===//
 /// @brief Vectorize the BasicBlock.
 ///
 /// @param BB The BasicBlock to be vectorized
diff --git a/lib/Analysis/ScalarEvolutionExpander.cpp b/lib/Analysis/ScalarEvolutionExpander.cpp
index 5c2a49e767f..111bfb4a6a7 100644
--- a/lib/Analysis/ScalarEvolutionExpander.cpp
+++ b/lib/Analysis/ScalarEvolutionExpander.cpp
@@ -19,8 +19,8 @@
 #include "llvm/LLVMContext.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/DataLayout.h"
+#include "llvm/Target/TargetLowering.h"
 #include "llvm/ADT/STLExtras.h"
-#include "llvm/TargetTransformInfo.h"

 using namespace llvm;

@@ -1599,15 +1599,15 @@ static bool width_descending(Value *lhs, Value *rhs) {
 /// This does not depend on any SCEVExpander state but should be used in
 /// the same context that SCEVExpander is used.
 unsigned SCEVExpander::replaceCongruentIVs(Loop *L, const DominatorTree *DT,
-                                  SmallVectorImpl<WeakVH> &DeadInsts,
-                                  const ScalarTargetTransformInfo *STTI) {
+                                           SmallVectorImpl<WeakVH> &DeadInsts,
+                                           const TargetLowering *TLI) {
   // Find integer phis in order of increasing width.
   SmallVector<PHINode*, 8> Phis;
   for (BasicBlock::iterator I = L->getHeader()->begin();
        PHINode *Phi = dyn_cast<PHINode>(I); ++I) {
     Phis.push_back(Phi);
   }
-  if (STTI)
+  if (TLI)
     std::sort(Phis.begin(), Phis.end(), width_descending);

   unsigned NumElim = 0;
@@ -1618,14 +1618,25 @@ unsigned SCEVExpander::replaceCongruentIVs(Loop *L, const DominatorTree *DT,
        PEnd = Phis.end(); PIter != PEnd; ++PIter) {
     PHINode *Phi = *PIter;

+    // Fold constant phis. They may be congruent to other constant phis and
+    // would confuse the logic below that expects proper IVs.
+    if (Value *V = Phi->hasConstantValue()) {
+      Phi->replaceAllUsesWith(V);
+      DeadInsts.push_back(Phi);
+      ++NumElim;
+      DEBUG_WITH_TYPE(DebugType, dbgs()
+                      << "INDVARS: Eliminated constant iv: " << *Phi << '\n');
+      continue;
+    }
+
     if (!SE.isSCEVable(Phi->getType()))
       continue;

     PHINode *&OrigPhiRef = ExprToIVMap[SE.getSCEV(Phi)];
     if (!OrigPhiRef) {
       OrigPhiRef = Phi;
-      if (Phi->getType()->isIntegerTy() && STTI &&
-          STTI->isTruncateFree(Phi->getType(), Phis.back()->getType())) {
+      if (Phi->getType()->isIntegerTy() && TLI
+          && TLI->isTruncateFree(Phi->getType(), Phis.back()->getType())) {
         // This phi can be freely truncated to the narrowest phi type. Map the
         // truncated expression to it so it will be reused for narrow types.
         const SCEV *TruncExpr =
diff --git a/lib/CodeGen/ExecutionDepsFix.cpp b/lib/CodeGen/ExecutionDepsFix.cpp
index 2311842671a..ed78f194215 100644
--- a/lib/CodeGen/ExecutionDepsFix.cpp
+++ b/lib/CodeGen/ExecutionDepsFix.cpp
@@ -657,7 +657,7 @@ bool ExeDepsFix::runOnMachineFunction(MachineFunction &mf) {
   bool anyregs = false;
   for (TargetRegisterClass::const_iterator I = RC->begin(), E = RC->end();
        I != E; ++I)
-    if (MF->getRegInfo().isPhysRegOrOverlapUsed(*I)) {
+    if (MF->getRegInfo().isPhysRegUsed(*I)) {
       anyregs = true;
       break;
     }
diff --git a/lib/CodeGen/LiveRangeEdit.cpp b/lib/CodeGen/LiveRangeEdit.cpp
index 0dfb084f1e1..f8fbc7ddf0c 100644
--- a/lib/CodeGen/LiveRangeEdit.cpp
+++ b/lib/CodeGen/LiveRangeEdit.cpp
@@ -96,6 +96,13 @@ bool LiveRangeEdit::allUsesAvailableAt(const MachineInstr *OrigMI,
     const VNInfo *OVNI = li.getVNInfoAt(OrigIdx);
     if (!OVNI)
       continue;
+
+    // Don't allow rematerialization immediately after the original def.
+    // It would be incorrect if OrigMI redefines the register.
+    // See PR14098.
+    if (SlotIndex::isSameInstr(OrigIdx, UseIdx))
+      return false;
+
     if (OVNI != li.getVNInfoAt(UseIdx))
       return false;
   }
diff --git a/lib/CodeGen/MachineFunctionPrinterPass.cpp b/lib/CodeGen/MachineFunctionPrinterPass.cpp
index 0102ac708d0..ed94efb9355 100644
--- a/lib/CodeGen/MachineFunctionPrinterPass.cpp
+++ b/lib/CodeGen/MachineFunctionPrinterPass.cpp
@@ -51,7 +51,7 @@ struct MachineFunctionPrinterPass : public MachineFunctionPass {
 char MachineFunctionPrinterPass::ID = 0;
 }

-char &MachineFunctionPrinterPassID = MachineFunctionPrinterPass::ID;
+char &llvm::MachineFunctionPrinterPassID = MachineFunctionPrinterPass::ID;

 INITIALIZE_PASS(MachineFunctionPrinterPass, "print-machineinstrs",
                 "Machine Function Printer", false, false)
diff --git a/lib/CodeGen/MachineRegisterInfo.cpp b/lib/CodeGen/MachineRegisterInfo.cpp
index ae7c15be158..95d7a7dd689 100644
--- a/lib/CodeGen/MachineRegisterInfo.cpp
+++ b/lib/CodeGen/MachineRegisterInfo.cpp
@@ -21,7 +21,7 @@ MachineRegisterInfo::MachineRegisterInfo(const TargetRegisterInfo &TRI)
   : TRI(&TRI), IsSSA(true), TracksLiveness(true) {
   VRegInfo.reserve(256);
   RegAllocHints.reserve(256);
-  UsedPhysRegs.resize(TRI.getNumRegs());
+  UsedRegUnits.resize(TRI.getNumRegUnits());
   UsedPhysRegMask.resize(TRI.getNumRegs());

   // Create the physreg use/def lists.
@@ -32,7 +32,7 @@ MachineRegisterInfo::MachineRegisterInfo(const TargetRegisterInfo &TRI)
 MachineRegisterInfo::~MachineRegisterInfo() {
 #ifndef NDEBUG
   clearVirtRegs();
-  for (unsigned i = 0, e = UsedPhysRegs.size(); i != e; ++i)
+  for (unsigned i = 0, e = TRI->getNumRegs(); i != e; ++i)
     assert(!PhysRegUseDefLists[i] &&
            "PhysRegUseDefLists has entries after all instructions are deleted");
 #endif
diff --git a/lib/CodeGen/Passes.cpp b/lib/CodeGen/Passes.cpp
index abd62efc026..4ea21d4ff7b 100644
--- a/lib/CodeGen/Passes.cpp
+++ b/lib/CodeGen/Passes.cpp
@@ -359,7 +359,7 @@ void TargetPassConfig::addIRPasses() {

   // Run loop strength reduction before anything else.
   if (getOptLevel() != CodeGenOpt::None && !DisableLSR) {
-    addPass(createLoopStrengthReducePass());
+    addPass(createLoopStrengthReducePass(getTargetLowering()));
     if (PrintLSR)
       addPass(createPrintFunctionPass("\n\n*** Code after LSR ***\n", &dbgs()));
   }
@@ -389,7 +389,7 @@ void TargetPassConfig::addPassesToHandleExceptions() {
     addPass(createDwarfEHPass(TM));
     break;
   case ExceptionHandling::None:
-    addPass(createLowerInvokePass());
+    addPass(createLowerInvokePass(TM->getTargetLowering()));

     // The lower invoke pass may create unreachable code. Remove it.
     addPass(createUnreachableBlockEliminationPass());
diff --git a/lib/CodeGen/PrologEpilogInserter.cpp b/lib/CodeGen/PrologEpilogInserter.cpp
index 86df0a127bf..77554d691c2 100644
--- a/lib/CodeGen/PrologEpilogInserter.cpp
+++ b/lib/CodeGen/PrologEpilogInserter.cpp
@@ -227,7 +227,7 @@ void PEI::calculateCalleeSavedRegisters(MachineFunction &Fn) {
   std::vector<CalleeSavedInfo> CSI;
   for (unsigned i = 0; CSRegs[i]; ++i) {
     unsigned Reg = CSRegs[i];
-    if (Fn.getRegInfo().isPhysRegOrOverlapUsed(Reg)) {
+    if (Fn.getRegInfo().isPhysRegUsed(Reg)) {
       // If the reg is modified, save it!
       CSI.push_back(CalleeSavedInfo(Reg));
     }
diff --git a/lib/CodeGen/RegAllocFast.cpp b/lib/CodeGen/RegAllocFast.cpp
index e096240e04b..d6ed36ef95b 100644
--- a/lib/CodeGen/RegAllocFast.cpp
+++ b/lib/CodeGen/RegAllocFast.cpp
@@ -113,9 +113,11 @@ namespace {
     // PhysRegState - One of the RegState enums, or a virtreg.
     std::vector<unsigned> PhysRegState;

-    // UsedInInstr - BitVector of physregs that are used in the current
-    // instruction, and so cannot be allocated.
-    BitVector UsedInInstr;
+    typedef SparseSet<unsigned> UsedInInstrSet;
+
+    // UsedInInstr - Set of physregs that are used in the current instruction,
+    // and so cannot be allocated.
+    UsedInInstrSet UsedInInstr;

     // SkippedInstrs - Descriptors of instructions whose clobber list was
     // ignored because all registers were spilled. It is still necessary to
@@ -340,7 +342,7 @@ void RAFast::usePhysReg(MachineOperand &MO) {
     PhysRegState[PhysReg] = regFree;
     // Fall through
   case regFree:
-    UsedInInstr.set(PhysReg);
+    UsedInInstr.insert(PhysReg);
     MO.setIsKill();
     return;
   default:
@@ -360,13 +362,13 @@ void RAFast::usePhysReg(MachineOperand &MO) {
            "Instruction is not using a subregister of a reserved register");
     // Leave the superregister in the working set.
     PhysRegState[Alias] = regFree;
-    UsedInInstr.set(Alias);
+    UsedInInstr.insert(Alias);
     MO.getParent()->addRegisterKilled(Alias, TRI, true);
     return;
   case regFree:
     if (TRI->isSuperRegister(PhysReg, Alias)) {
       // Leave the superregister in the working set.
-      UsedInInstr.set(Alias);
+      UsedInInstr.insert(Alias);
       MO.getParent()->addRegisterKilled(Alias, TRI, true);
       return;
     }
@@ -380,7 +382,7 @@ void RAFast::usePhysReg(MachineOperand &MO) {

   // All aliases are disabled, bring register into working set.
   PhysRegState[PhysReg] = regFree;
-  UsedInInstr.set(PhysReg);
+  UsedInInstr.insert(PhysReg);
   MO.setIsKill();
 }

@@ -389,7 +391,7 @@ void RAFast::usePhysReg(MachineOperand &MO) {
 /// reserved instead of allocated.
 void RAFast::definePhysReg(MachineInstr *MI, unsigned PhysReg,
                            RegState NewState) {
-  UsedInInstr.set(PhysReg);
+  UsedInInstr.insert(PhysReg);
   switch (unsigned VirtReg = PhysRegState[PhysReg]) {
   case regDisabled:
     break;
@@ -429,7 +431,7 @@ void RAFast::definePhysReg(MachineInstr *MI, unsigned PhysReg,
 // can be allocated directly.
 // Returns spillImpossible when PhysReg or an alias can't be spilled.
 unsigned RAFast::calcSpillCost(unsigned PhysReg) const {
-  if (UsedInInstr.test(PhysReg)) {
+  if (UsedInInstr.count(PhysReg)) {
     DEBUG(dbgs() << PrintReg(PhysReg, TRI) << " is already used in instr.\n");
     return spillImpossible;
   }
@@ -454,7 +456,7 @@ unsigned RAFast::calcSpillCost(unsigned PhysReg) const {
   unsigned Cost = 0;
   for (MCRegAliasIterator AI(PhysReg, TRI, false); AI.isValid(); ++AI) {
     unsigned Alias = *AI;
-    if (UsedInInstr.test(Alias))
+    if (UsedInInstr.count(Alias))
       return spillImpossible;
     switch (unsigned VirtReg = PhysRegState[Alias]) {
     case regDisabled:
@@ -530,7 +532,7 @@ RAFast::LiveRegMap::iterator RAFast::allocVirtReg(MachineInstr *MI,

   // First try to find a completely free register.
   for (ArrayRef<unsigned>::iterator I = AO.begin(), E = AO.end(); I != E; ++I) {
     unsigned PhysReg = *I;
-    if (PhysRegState[PhysReg] == regFree && !UsedInInstr.test(PhysReg)) {
+    if (PhysRegState[PhysReg] == regFree && !UsedInInstr.count(PhysReg)) {
       assignVirtToPhysReg(*LRI, PhysReg);
       return LRI;
     }
@@ -596,7 +598,7 @@ RAFast::defineVirtReg(MachineInstr *MI, unsigned OpNum,
   LRI->LastUse = MI;
   LRI->LastOpNum = OpNum;
   LRI->Dirty = true;
-  UsedInInstr.set(LRI->PhysReg);
+  UsedInInstr.insert(LRI->PhysReg);
   return LRI;
 }

@@ -646,7 +648,7 @@ RAFast::reloadVirtReg(MachineInstr *MI, unsigned OpNum,
   assert(LRI->PhysReg && "Register not assigned");
   LRI->LastUse = MI;
   LRI->LastOpNum = OpNum;
-  UsedInInstr.set(LRI->PhysReg);
+  UsedInInstr.insert(LRI->PhysReg);
   return LRI;
 }

@@ -708,7 +710,7 @@ void RAFast::handleThroughOperands(MachineInstr *MI,
     unsigned Reg = MO.getReg();
     if (!Reg || !TargetRegisterInfo::isPhysicalRegister(Reg)) continue;
     for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) {
-      UsedInInstr.set(*AI);
+      UsedInInstr.insert(*AI);
       if (ThroughRegs.count(PhysRegState[*AI]))
         definePhysReg(MI, *AI, regFree);
     }
@@ -756,7 +758,7 @@ void RAFast::handleThroughOperands(MachineInstr *MI,
   }

   // Restore UsedInInstr to a state usable for allocating normal virtual uses.
-  UsedInInstr.reset();
+  UsedInInstr.clear();
   for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
     MachineOperand &MO = MI->getOperand(i);
     if (!MO.isReg() || (MO.isDef() && !MO.isEarlyClobber())) continue;
     unsigned Reg = MO.getReg();
     if (!Reg || !TargetRegisterInfo::isPhysicalRegister(Reg)) continue;
     DEBUG(dbgs() << "\tSetting " << PrintReg(Reg, TRI)
                  << " as used in instr\n");
-    UsedInInstr.set(Reg);
+    UsedInInstr.insert(Reg);
   }

   // Also mark PartialDefs as used to avoid reallocation.
   for (unsigned i = 0, e = PartialDefs.size(); i != e; ++i)
-    UsedInInstr.set(PartialDefs[i]);
+    UsedInInstr.insert(PartialDefs[i]);
 }

 /// addRetOperand - ensure that a return instruction has an operand for each
@@ -942,7 +944,7 @@ void RAFast::AllocateBasicBlock() {
     }

     // Track registers used by instruction.
-    UsedInInstr.reset();
+    UsedInInstr.clear();

     // First scan.
     // Mark physreg uses and early clobbers as used.
@@ -1016,11 +1018,13 @@ void RAFast::AllocateBasicBlock() {
       }
     }

-    MRI->addPhysRegsUsed(UsedInInstr);
+    for (UsedInInstrSet::iterator
+         I = UsedInInstr.begin(), E = UsedInInstr.end(); I != E; ++I)
+      MRI->setPhysRegUsed(*I);

     // Track registers defined by instruction - early clobbers and tied uses at
     // this point.
-    UsedInInstr.reset();
+    UsedInInstr.clear();
     if (hasEarlyClobbers) {
       for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
         MachineOperand &MO = MI->getOperand(i);
@@ -1030,7 +1034,7 @@ void RAFast::AllocateBasicBlock() {
         // Look for physreg defs and tied uses.
         if (!MO.isDef() && !MI->isRegTiedToDefOperand(i)) continue;
         for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
-          UsedInInstr.set(*AI);
+          UsedInInstr.insert(*AI);
       }
     }

@@ -1080,7 +1084,9 @@ void RAFast::AllocateBasicBlock() {
       killVirtReg(VirtDead[i]);
     VirtDead.clear();

-    MRI->addPhysRegsUsed(UsedInInstr);
+    for (UsedInInstrSet::iterator
+         I = UsedInInstr.begin(), E = UsedInInstr.end(); I != E; ++I)
+      MRI->setPhysRegUsed(*I);

     if (CopyDst && CopyDst == CopySrc && CopyDstSub == CopySrcSub) {
       DEBUG(dbgs() << "-- coalescing: " << *MI);
@@ -1118,7 +1124,8 @@ bool RAFast::runOnMachineFunction(MachineFunction &Fn) {
   TII = TM->getInstrInfo();
   MRI->freezeReservedRegs(Fn);
   RegClassInfo.runOnMachineFunction(Fn);
-  UsedInInstr.resize(TRI->getNumRegs());
+  UsedInInstr.clear();
+  UsedInInstr.setUniverse(TRI->getNumRegs());

   assert(!MRI->isSSA() && "regalloc requires leaving SSA");
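The switch from BitVector to SparseSet above trades a full per-instruction bit-vector sweep for a constant-time clear. A minimal standalone sketch of the SparseSet pattern used here (the values and assertions are illustrative):

    #include "llvm/ADT/SparseSet.h"
    #include <cassert>

    int main() {
      llvm::SparseSet<unsigned> Used;
      Used.setUniverse(256);   // keys are register numbers in [0, 256)
      Used.insert(42);
      assert(Used.count(42) && !Used.count(7));
      Used.clear();            // O(1), unlike BitVector::reset() which
                               // touches the whole universe
      return 0;
    }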
diff --git a/lib/CodeGen/RegisterCoalescer.cpp b/lib/CodeGen/RegisterCoalescer.cpp
index ba6b4569a8f..2ca67d63257 100644
--- a/lib/CodeGen/RegisterCoalescer.cpp
+++ b/lib/CodeGen/RegisterCoalescer.cpp
@@ -1302,7 +1302,7 @@ public:
                    SmallVectorImpl<unsigned> &ShrinkRegs);

   /// Get the value assignments suitable for passing to LiveInterval::join.
-  const int *getAssignments() const { return &Assignments[0]; }
+  const int *getAssignments() const { return Assignments.data(); }
 };
 } // end anonymous namespace
diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 8846247090c..17386b74e36 100644
--- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -5308,6 +5308,48 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
     if (Reduced.getNode())
       return Reduced;
   }
+  // fold (trunc (concat ... x ...)) -> (concat ..., (trunc x), ...)),
+  // where ... are all 'undef'.
+  if (N0.getOpcode() == ISD::CONCAT_VECTORS && !LegalTypes) {
+    SmallVector<EVT, 8> VTs;
+    SDValue V;
+    unsigned Idx = 0;
+    unsigned NumDefs = 0;
+
+    for (unsigned i = 0, e = N0.getNumOperands(); i != e; ++i) {
+      SDValue X = N0.getOperand(i);
+      if (X.getOpcode() != ISD::UNDEF) {
+        V = X;
+        Idx = i;
+        NumDefs++;
+      }
+      // Stop if more than one members are non-undef.
+      if (NumDefs > 1)
+        break;
+      VTs.push_back(EVT::getVectorVT(*DAG.getContext(),
+                                     VT.getVectorElementType(),
+                                     X.getValueType().getVectorNumElements()));
+    }
+
+    if (NumDefs == 0)
+      return DAG.getUNDEF(VT);
+
+    if (NumDefs == 1) {
+      assert(V.getNode() && "The single defined operand is empty!");
+      SmallVector<SDValue, 8> Opnds;
+      for (unsigned i = 0, e = VTs.size(); i != e; ++i) {
+        if (i != Idx) {
+          Opnds.push_back(DAG.getUNDEF(VTs[i]));
+          continue;
+        }
+        SDValue NV = DAG.getNode(ISD::TRUNCATE, V.getDebugLoc(), VTs[i], V);
+        AddToWorkList(NV.getNode());
+        Opnds.push_back(NV);
+      }
+      return DAG.getNode(ISD::CONCAT_VECTORS, N->getDebugLoc(), VT,
+                         &Opnds[0], Opnds.size());
+    }
+  }

   // Simplify the operands using demanded-bits information.
   if (!VT.isVector() &&
@@ -8564,8 +8606,8 @@ SDValue DAGCombiner::visitEXTRACT_SUBVECTOR(SDNode* N) {
     return SDValue();

   // Only handle cases where both indexes are constants with the same type.
-  ConstantSDNode *InsIdx = dyn_cast<ConstantSDNode>(N->getOperand(1));
-  ConstantSDNode *ExtIdx = dyn_cast<ConstantSDNode>(V->getOperand(2));
+  ConstantSDNode *ExtIdx = dyn_cast<ConstantSDNode>(N->getOperand(1));
+  ConstantSDNode *InsIdx = dyn_cast<ConstantSDNode>(V->getOperand(2));

   if (InsIdx && ExtIdx &&
       InsIdx->getValueType(0).getSizeInBits() <= 64 &&
@@ -8582,6 +8624,21 @@ SDValue DAGCombiner::visitEXTRACT_SUBVECTOR(SDNode* N) {
     }
   }

+  if (V->getOpcode() == ISD::CONCAT_VECTORS) {
+    // Combine:
+    //    (extract_subvec (concat V1, V2, ...), i)
+    // Into:
+    //    Vi if possible
+    // Only operand 0 is checked as 'concat' assumes all inputs of the same type.
+    if (V->getOperand(0).getValueType() != NVT)
+      return SDValue();
+    unsigned Idx = dyn_cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
+    unsigned NumElems = NVT.getVectorNumElements();
+    assert((Idx % NumElems) == 0 &&
+           "IDX in concat is not a multiple of the result vector length.");
+    return V->getOperand(Idx / NumElems);
+  }
+
   return SDValue();
 }

@@ -9018,6 +9075,10 @@ bool DAGCombiner::SimplifySelectOps(SDNode *TheSelect, SDValue LHS,
       if ((LLD->hasAnyUseOfValue(1) && LLD->isPredecessorOf(CondNode)) ||
           (RLD->hasAnyUseOfValue(1) && RLD->isPredecessorOf(CondNode)))
         return false;
+      // The loads must not depend on one another.
+      if (LLD->isPredecessorOf(RLD) ||
+          RLD->isPredecessorOf(LLD))
+        return false;
       Addr = DAG.getNode(ISD::SELECT, TheSelect->getDebugLoc(),
                          LLD->getBasePtr().getValueType(),
                          TheSelect->getOperand(0), LLD->getBasePtr(),
diff --git a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index 2ec129f7308..abf40b77a18 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -1240,6 +1240,19 @@ void SelectionDAGLegalize::LegalizeOp(SDNode *Node) {
     if (Action == TargetLowering::Legal)
       Action = TargetLowering::Custom;
     break;
+  case ISD::DEBUGTRAP:
+    Action = TLI.getOperationAction(Node->getOpcode(), Node->getValueType(0));
+    if (Action == TargetLowering::Expand) {
+      // replace ISD::DEBUGTRAP with ISD::TRAP
+      SDValue NewVal;
+      NewVal = DAG.getNode(ISD::TRAP, Node->getDebugLoc(), Node->getVTList(),
+                           Node->getOperand(0));
+      ReplaceNode(Node, NewVal.getNode());
+      LegalizeOp(NewVal.getNode());
+      return;
+    }
+    break;
+
   default:
     if (Node->getOpcode() >= ISD::BUILTIN_OP_END) {
       Action = TargetLowering::Legal;
+// IMPORTANT: this may not work for targets with phyreg dependency. +// +class ScheduleDAGLinearize : public ScheduleDAGSDNodes { +public: + ScheduleDAGLinearize(MachineFunction &mf) : ScheduleDAGSDNodes(mf) {} + + void Schedule(); + + MachineBasicBlock *EmitSchedule(MachineBasicBlock::iterator &InsertPos); + +private: + std::vector<SDNode*> Sequence; + DenseMap<SDNode*, SDNode*> GluedMap; // Cache glue to its user + + void ScheduleNode(SDNode *N); +}; +} // end anonymous namespace + +void ScheduleDAGLinearize::ScheduleNode(SDNode *N) { + if (N->getNodeId() != 0) + llvm_unreachable(0); + + if (!N->isMachineOpcode() && + (N->getOpcode() == ISD::EntryToken || isPassiveNode(N))) + // These nodes do not need to be translated into MIs. + return; + + DEBUG(dbgs() << "\n*** Scheduling: "); + DEBUG(N->dump(DAG)); + Sequence.push_back(N); + + unsigned NumOps = N->getNumOperands(); + if (unsigned NumLeft = NumOps) { + SDNode *GluedOpN = 0; + do { + const SDValue &Op = N->getOperand(NumLeft-1); + SDNode *OpN = Op.getNode(); + + if (NumLeft == NumOps && Op.getValueType() == MVT::Glue) { + // Schedule glue operand right above N. + GluedOpN = OpN; + assert(OpN->getNodeId() != 0 && "Glue operand not ready?"); + OpN->setNodeId(0); + ScheduleNode(OpN); + continue; + } + + if (OpN == GluedOpN) + // Glue operand is already scheduled. + continue; + + DenseMap<SDNode*, SDNode*>::iterator DI = GluedMap.find(OpN); + if (DI != GluedMap.end() && DI->second != N) + // Users of glues are counted against the glued users. + OpN = DI->second; + + unsigned Degree = OpN->getNodeId(); + assert(Degree > 0 && "Predecessor over-released!"); + OpN->setNodeId(--Degree); + if (Degree == 0) + ScheduleNode(OpN); + } while (--NumLeft); + } +} + +/// findGluedUser - Find the representative use of a glue value by walking +/// the use chain. +static SDNode *findGluedUser(SDNode *N) { + while (SDNode *Glued = N->getGluedUser()) + N = Glued; + return N; +} + +void ScheduleDAGLinearize::Schedule() { + DEBUG(dbgs() << "********** DAG Linearization **********\n"); + + SmallVector<SDNode*, 8> Glues; + unsigned DAGSize = 0; + for (SelectionDAG::allnodes_iterator I = DAG->allnodes_begin(), + E = DAG->allnodes_end(); I != E; ++I) { + SDNode *N = I; + + // Use node id to record degree. + unsigned Degree = N->use_size(); + N->setNodeId(Degree); + unsigned NumVals = N->getNumValues(); + if (NumVals && N->getValueType(NumVals-1) == MVT::Glue && + N->hasAnyUseOfValue(NumVals-1)) { + SDNode *User = findGluedUser(N); + if (User) { + Glues.push_back(N); + GluedMap.insert(std::make_pair(N, User)); + } + } + + if (N->isMachineOpcode() || + (N->getOpcode() != ISD::EntryToken && !isPassiveNode(N))) + ++DAGSize; + } + + for (unsigned i = 0, e = Glues.size(); i != e; ++i) { + SDNode *Glue = Glues[i]; + SDNode *GUser = GluedMap[Glue]; + unsigned Degree = Glue->getNodeId(); + unsigned UDegree = GUser->getNodeId(); + + // Glue user must be scheduled together with the glue operand. So other + // users of the glue operand must be treated as its users. 
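ScheduleDAGLinearize above repurposes each node's id as a countdown of unscheduled users: a node is appended to Sequence as soon as its last user has been emitted, yielding a reverse topological order that EmitSchedule later walks backwards. A self-contained sketch of that discipline, with the glue special-casing omitted (Node, scheduleNode, and the tiny graph in main are illustrative):

#include <cstddef>
#include <cstdio>
#include <vector>

// Degree starts at the node's use count, mirroring how the linearizer
// reuses NodeId as a countdown of not-yet-scheduled users.
struct Node {
  const char *Name;
  std::vector<Node*> Operands;
  unsigned Degree;
};

static std::vector<Node*> Sequence;

// Emit N, then release each operand in reverse order; an operand is
// scheduled only once its last user has been scheduled.
void scheduleNode(Node *N) {
  Sequence.push_back(N);
  for (std::size_t i = N->Operands.size(); i != 0; --i) {
    Node *Op = N->Operands[i - 1];
    if (--Op->Degree == 0)
      scheduleNode(Op);
  }
}

int main() {
  Node A = {"A", {}, 2}, B = {"B", {}, 1};      // A has two users, B one
  Node C = {"C", {&A, &B}, 1};
  Node Root = {"Root", {&A, &C}, 0};
  scheduleNode(&Root);                          // Sequence: Root, C, B, A
  for (std::size_t i = 0; i != Sequence.size(); ++i)
    std::printf("%s\n", Sequence[i]->Name);
}

The emitter then visits Sequence back to front, so A is materialized first; that reversal is exactly what the EmitSchedule loop in the patch does with Sequence[NumNodes-i-1].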
+ SDNode *ImmGUser = Glue->getGluedUser(); + for (SDNode::use_iterator ui = Glue->use_begin(), ue = Glue->use_end(); + ui != ue; ++ui) + if (*ui == ImmGUser) + --Degree; + GUser->setNodeId(UDegree + Degree); + Glue->setNodeId(1); + } + + Sequence.reserve(DAGSize); + ScheduleNode(DAG->getRoot().getNode()); +} + +MachineBasicBlock* +ScheduleDAGLinearize::EmitSchedule(MachineBasicBlock::iterator &InsertPos) { + InstrEmitter Emitter(BB, InsertPos); + DenseMap<SDValue, unsigned> VRBaseMap; + + DEBUG({ + dbgs() << "\n*** Final schedule ***\n"; + }); + + // FIXME: Handle dbg_values. + unsigned NumNodes = Sequence.size(); + for (unsigned i = 0; i != NumNodes; ++i) { + SDNode *N = Sequence[NumNodes-i-1]; + DEBUG(N->dump(DAG)); + Emitter.EmitNode(N, false, false, VRBaseMap); + } + + DEBUG(dbgs() << '\n'); + + InsertPos = Emitter.getInsertPos(); + return Emitter.getBlock(); +} + //===----------------------------------------------------------------------===// // Public Constructor Functions //===----------------------------------------------------------------------===// @@ -637,3 +791,8 @@ llvm::ScheduleDAGSDNodes * llvm::createFastDAGScheduler(SelectionDAGISel *IS, CodeGenOpt::Level) { return new ScheduleDAGFast(*IS->MF); } + +llvm::ScheduleDAGSDNodes * +llvm::createDAGLinearizer(SelectionDAGISel *IS, CodeGenOpt::Level) { + return new ScheduleDAGLinearize(*IS->MF); +} diff --git a/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp b/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp index 660223a5059..714471f559e 100644 --- a/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp +++ b/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp @@ -831,8 +831,7 @@ EmitSchedule(MachineBasicBlock::iterator &InsertPos) { } SmallVector<SDNode *, 4> GluedNodes; - for (SDNode *N = SU->getNode()->getGluedNode(); N; - N = N->getGluedNode()) + for (SDNode *N = SU->getNode()->getGluedNode(); N; N = N->getGluedNode()) GluedNodes.push_back(N); while (!GluedNodes.empty()) { SDNode *N = GluedNodes.back(); diff --git a/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.h b/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.h index 8e7bd822014..907356fd212 100644 --- a/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.h +++ b/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.h @@ -114,7 +114,8 @@ namespace llvm { /// EmitSchedule - Insert MachineInstrs into the MachineBasicBlock /// according to the order specified in Sequence. /// - MachineBasicBlock *EmitSchedule(MachineBasicBlock::iterator &InsertPos); + virtual MachineBasicBlock* + EmitSchedule(MachineBasicBlock::iterator &InsertPos); virtual void dumpNode(const SUnit *SU) const; diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index cdebff94e59..db159941749 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -5175,10 +5175,13 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { return 0; } + case Intrinsic::debugtrap: case Intrinsic::trap: { StringRef TrapFuncName = TM.Options.getTrapFunctionName(); if (TrapFuncName.empty()) { - DAG.setRoot(DAG.getNode(ISD::TRAP, dl,MVT::Other, getRoot())); + ISD::NodeType Op = (Intrinsic == Intrinsic::trap) ? 
+ ISD::TRAP : ISD::DEBUGTRAP; + DAG.setRoot(DAG.getNode(Op, dl,MVT::Other, getRoot())); return 0; } TargetLowering::ArgListTy Args; @@ -5193,10 +5196,7 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { DAG.setRoot(Result.second); return 0; } - case Intrinsic::debugtrap: { - DAG.setRoot(DAG.getNode(ISD::DEBUGTRAP, dl,MVT::Other, getRoot())); - return 0; - } + case Intrinsic::uadd_with_overflow: case Intrinsic::sadd_with_overflow: case Intrinsic::usub_with_overflow: diff --git a/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/lib/CodeGen/SelectionDAG/TargetLowering.cpp index cd485ac235c..5abc55ba8e2 100644 --- a/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -583,6 +583,11 @@ TargetLowering::TargetLowering(const TargetMachine &tm, // Default ISD::TRAP to expand (which turns it into abort). setOperationAction(ISD::TRAP, MVT::Other, Expand); + // On most systems, DEBUGTRAP and TRAP have no difference. The "Expand" + // here is to inform DAG Legalizer to replace DEBUGTRAP with TRAP. + // + setOperationAction(ISD::DEBUGTRAP, MVT::Other, Expand); + IsLittleEndian = TD->isLittleEndian(); PointerTy = MVT::getIntegerVT(8*TD->getPointerSize(0)); memset(RegClassForVT, 0,MVT::LAST_VALUETYPE*sizeof(TargetRegisterClass*)); diff --git a/lib/CodeGen/StackColoring.cpp b/lib/CodeGen/StackColoring.cpp index 54d8c8cde7e..1cbee843a12 100644 --- a/lib/CodeGen/StackColoring.cpp +++ b/lib/CodeGen/StackColoring.cpp @@ -48,6 +48,7 @@ #include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/SlotIndexes.h" #include "llvm/DebugInfo.h" +#include "llvm/Instructions.h" #include "llvm/MC/MCInstrItineraries.h" #include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetRegisterInfo.h" @@ -260,7 +261,7 @@ unsigned StackColoring::collectMarkers(unsigned NumSlot) { MarkersFound++; - const Value *Allocation = MFI->getObjectAllocation(Slot); + const AllocaInst *Allocation = MFI->getObjectAllocation(Slot); if (Allocation) { DEBUG(dbgs()<<"Found a lifetime marker for slot #"<<Slot<< " with allocation: "<< Allocation->getName()<<"\n"); @@ -480,11 +481,11 @@ void StackColoring::remapInstructions(DenseMap<int, int> &SlotRemap) { } // Keep a list of *allocas* which need to be remapped. - DenseMap<const Value*, const Value*> Allocas; + DenseMap<const AllocaInst*, const AllocaInst*> Allocas; for (DenseMap<int, int>::iterator it = SlotRemap.begin(), e = SlotRemap.end(); it != e; ++it) { - const Value *From = MFI->getObjectAllocation(it->first); - const Value *To = MFI->getObjectAllocation(it->second); + const AllocaInst *From = MFI->getObjectAllocation(it->first); + const AllocaInst *To = MFI->getObjectAllocation(it->second); assert(To && From && "Invalid allocation object"); Allocas[From] = To; } @@ -514,10 +515,17 @@ void StackColoring::remapInstructions(DenseMap<int, int> &SlotRemap) { V = GetUnderlyingObject(V); // If we did not find one, or if the one that we found is not in our // map, then move on. - if (!V || !Allocas.count(V)) + if (!V || !isa<AllocaInst>(V)) { + // Clear mem operand since we don't know for sure that it doesn't + // alias a merged alloca. 
+ MMO->setValue(0); + continue; + } + const AllocaInst *AI= cast<AllocaInst>(V); + if (!Allocas.count(AI)) continue; - MMO->setValue(Allocas[V]); + MMO->setValue(Allocas[AI]); FixedMemOp++; } diff --git a/lib/CodeGen/TargetSchedule.cpp b/lib/CodeGen/TargetSchedule.cpp index 7a6e2604d77..6a096a16c4e 100644 --- a/lib/CodeGen/TargetSchedule.cpp +++ b/lib/CodeGen/TargetSchedule.cpp @@ -58,6 +58,14 @@ unsigned TargetSchedModel::getNumMicroOps(MachineInstr *MI) const { return MI->isTransient() ? 0 : 1; } +// The machine model may explicitly specify an invalid latency, which +// effectively means infinite latency. Since users of the TargetSchedule API +// don't know how to handle this, we convert it to a very large latency that is +// easy to distinguish when debugging the DAG but won't induce overflow. +static unsigned convertLatency(int Cycles) { + return Cycles >= 0 ? Cycles : 1000; +} + /// If we can determine the operand latency from the def only, without machine /// model or itinerary lookup, do so. Otherwise return -1. int TargetSchedModel::getDefLatency(const MachineInstr *DefMI, @@ -178,7 +186,7 @@ unsigned TargetSchedModel::computeOperandLatency( const MCWriteLatencyEntry *WLEntry = STI->getWriteLatencyEntry(SCDesc, DefIdx); unsigned WriteID = WLEntry->WriteResourceID; - unsigned Latency = WLEntry->Cycles; + unsigned Latency = convertLatency(WLEntry->Cycles); if (!UseMI) return Latency; @@ -219,7 +227,7 @@ unsigned TargetSchedModel::computeInstrLatency(const MachineInstr *MI) const { // Lookup the definition's write latency in SubtargetInfo. const MCWriteLatencyEntry *WLEntry = STI->getWriteLatencyEntry(SCDesc, DefIdx); - Latency = std::max(Latency, WLEntry->Cycles); + Latency = std::max(Latency, convertLatency(WLEntry->Cycles)); } return Latency; } diff --git a/lib/DebugInfo/DWARFContext.cpp b/lib/DebugInfo/DWARFContext.cpp index 241f55eaed2..afd614cc356 100644 --- a/lib/DebugInfo/DWARFContext.cpp +++ b/lib/DebugInfo/DWARFContext.cpp @@ -17,6 +17,8 @@ using namespace llvm; using namespace dwarf; +typedef DWARFDebugLine::LineTable DWARFLineTable; + void DWARFContext::dump(raw_ostream &OS) { OS << ".debug_abbrev contents:\n"; getDebugAbbrev()->dump(OS); @@ -94,7 +96,7 @@ const DWARFDebugAranges *DWARFContext::getDebugAranges() { return Aranges.get(); } -const DWARFDebugLine::LineTable * +const DWARFLineTable * DWARFContext::getLineTableForCompileUnit(DWARFCompileUnit *cu) { if (!Line) Line.reset(new DWARFDebugLine()); @@ -106,7 +108,7 @@ DWARFContext::getLineTableForCompileUnit(DWARFCompileUnit *cu) { return 0; // No line table for this compile unit. // See if the line table is cached. - if (const DWARFDebugLine::LineTable *lt = Line->getLineTable(stmtOffset)) + if (const DWARFLineTable *lt = Line->getLineTable(stmtOffset)) return lt; // We have to parse it first. 
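convertLatency() above is small enough to test in isolation; the harness below just pins down the sentinel behavior (the 1000-cycle cap is the value from the patch, chosen to be visibly large when debugging the DAG yet safe from overflow in later max/add operations):

#include <algorithm>
#include <cassert>

// Negative cycle counts in the machine model mean "unknown/infinite"
// latency; clamp them to a large finite value as the patch does.
static unsigned convertLatency(int Cycles) {
  return Cycles >= 0 ? static_cast<unsigned>(Cycles) : 1000;
}

int main() {
  assert(convertLatency(3) == 3);
  assert(convertLatency(-1) == 1000);              // sentinel for "invalid"
  assert(std::max(7u, convertLatency(-1)) == 1000); // dominates real latencies
}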
@@ -117,11 +119,11 @@ DWARFContext::getLineTableForCompileUnit(DWARFCompileUnit *cu) { void DWARFContext::parseCompileUnits() { uint32_t offset = 0; - const DataExtractor &debug_info_data = DataExtractor(getInfoSection(), - isLittleEndian(), 0); - while (debug_info_data.isValidOffset(offset)) { + const DataExtractor &DIData = DataExtractor(getInfoSection(), + isLittleEndian(), 0); + while (DIData.isValidOffset(offset)) { CUs.push_back(DWARFCompileUnit(*this)); - if (!CUs.back().extract(debug_info_data, &offset)) { + if (!CUs.back().extract(DIData, &offset)) { CUs.pop_back(); break; } @@ -163,9 +165,11 @@ DWARFCompileUnit *DWARFContext::getCompileUnitForAddress(uint64_t Address) { return getCompileUnitForOffset(CUOffset); } -static bool getFileNameForCompileUnit( - DWARFCompileUnit *CU, const DWARFDebugLine::LineTable *LineTable, - uint64_t FileIndex, bool NeedsAbsoluteFilePath, std::string &FileName) { +static bool getFileNameForCompileUnit(DWARFCompileUnit *CU, + const DWARFLineTable *LineTable, + uint64_t FileIndex, + bool NeedsAbsoluteFilePath, + std::string &FileName) { if (CU == 0 || LineTable == 0 || !LineTable->getFileNameByIndex(FileIndex, NeedsAbsoluteFilePath, @@ -183,10 +187,12 @@ static bool getFileNameForCompileUnit( return true; } -static bool getFileLineInfoForCompileUnit( - DWARFCompileUnit *CU, const DWARFDebugLine::LineTable *LineTable, - uint64_t Address, bool NeedsAbsoluteFilePath, std::string &FileName, - uint32_t &Line, uint32_t &Column) { +static bool getFileLineInfoForCompileUnit(DWARFCompileUnit *CU, + const DWARFLineTable *LineTable, + uint64_t Address, + bool NeedsAbsoluteFilePath, + std::string &FileName, + uint32_t &Line, uint32_t &Column) { if (CU == 0 || LineTable == 0) return false; // Get the index of row we're looking for in the line table. @@ -225,8 +231,7 @@ DILineInfo DWARFContext::getLineInfoForAddress(uint64_t Address, } } if (Specifier.needs(DILineInfoSpecifier::FileLineInfo)) { - const DWARFDebugLine::LineTable *LineTable = - getLineTableForCompileUnit(CU); + const DWARFLineTable *LineTable = getLineTableForCompileUnit(CU); const bool NeedsAbsoluteFilePath = Specifier.needs(DILineInfoSpecifier::AbsoluteFilePath); getFileLineInfoForCompileUnit(CU, LineTable, Address, @@ -250,7 +255,7 @@ DIInliningInfo DWARFContext::getInliningInfoForAddress(uint64_t Address, DIInliningInfo InliningInfo; uint32_t CallFile = 0, CallLine = 0, CallColumn = 0; - const DWARFDebugLine::LineTable *LineTable = 0; + const DWARFLineTable *LineTable = 0; for (uint32_t i = 0, n = InlinedChain.size(); i != n; i++) { const DWARFDebugInfoEntryMinimal &FunctionDIE = InlinedChain[i]; std::string FileName = "<invalid>"; diff --git a/lib/MC/ELFObjectWriter.cpp b/lib/MC/ELFObjectWriter.cpp index 6e37b5c1a03..b8b3188ce48 100644 --- a/lib/MC/ELFObjectWriter.cpp +++ b/lib/MC/ELFObjectWriter.cpp @@ -733,8 +733,7 @@ void ELFObjectWriter::RecordRelocation(const MCAssembler &Asm, Index = -1; } Addend = Value; - // Compensate for the addend on i386. 
- if (is64Bit()) + if (hasRelocationAddend()) Value = 0; } diff --git a/lib/MC/MCParser/AsmParser.cpp b/lib/MC/MCParser/AsmParser.cpp index 0406ff8d446..f22b2754f68 100644 --- a/lib/MC/MCParser/AsmParser.cpp +++ b/lib/MC/MCParser/AsmParser.cpp @@ -19,6 +19,8 @@ #include "llvm/MC/MCContext.h" #include "llvm/MC/MCDwarf.h" #include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCInstPrinter.h" +#include "llvm/MC/MCInstrInfo.h" #include "llvm/MC/MCParser/AsmCond.h" #include "llvm/MC/MCParser/AsmLexer.h" #include "llvm/MC/MCParser/MCAsmParser.h" @@ -35,6 +37,8 @@ #include "llvm/Support/SourceMgr.h" #include "llvm/Support/raw_ostream.h" #include <cctype> +#include <set> +#include <string> #include <vector> using namespace llvm; @@ -42,6 +46,8 @@ static cl::opt<bool> FatalAssemblerWarnings("fatal-assembler-warnings", cl::desc("Consider warnings as error")); +MCAsmParserSemaCallback::~MCAsmParserSemaCallback() {} + namespace { /// \brief Helper class for tracking macro definitions. @@ -139,7 +145,8 @@ private: /// ParsedOperands - The parsed operands from the last parsed statement. SmallVector<MCParsedAsmOperand*, 8> ParsedOperands; - /// Opcode - The opcode from the last parsed instruction. + /// Opcode - The opcode from the last parsed instruction. This is MS-style + /// inline asm specific. unsigned Opcode; public: @@ -180,20 +187,17 @@ public: virtual const AsmToken &Lex(); - bool ParseStatement(); void setParsingInlineAsm(bool V) { ParsingInlineAsm = V; } - unsigned getNumParsedOperands() { return ParsedOperands.size(); } - MCParsedAsmOperand &getParsedOperand(unsigned OpNum) { - assert (ParsedOperands.size() > OpNum); - return *ParsedOperands[OpNum]; - } - void freeParsedOperands() { - for (unsigned i = 0, e = ParsedOperands.size(); i != e; ++i) - delete ParsedOperands[i]; - ParsedOperands.clear(); - } - bool isInstruction() { return Opcode != (unsigned)~0x0; } - unsigned getOpcode() { return Opcode; } + bool isParsingInlineAsm() { return ParsingInlineAsm; } + + bool ParseMSInlineAsm(void *AsmLoc, std::string &AsmString, + unsigned &NumOutputs, unsigned &NumInputs, + SmallVectorImpl<void *> &OpDecls, + SmallVectorImpl<std::string> &Constraints, + SmallVectorImpl<std::string> &Clobbers, + const MCInstrInfo *MII, + const MCInstPrinter *IP, + MCAsmParserSemaCallback &SI); bool ParseExpression(const MCExpr *&Res); virtual bool ParseExpression(const MCExpr *&Res, SMLoc &EndLoc); @@ -205,6 +209,7 @@ public: private: void CheckForValidSection(); + bool ParseStatement(); void EatToEndOfLine(); bool ParseCppHashLineFilenameComment(const SMLoc &L); @@ -310,6 +315,11 @@ private: bool ParseDirectiveIrp(SMLoc DirectiveLoc); // ".irp" bool ParseDirectiveIrpc(SMLoc DirectiveLoc); // ".irpc" bool ParseDirectiveEndr(SMLoc DirectiveLoc); // ".endr" + + // MS-style inline assembly parsing. + bool isInstruction() { return Opcode != (unsigned)~0x0; } + unsigned getOpcode() { return Opcode; } + void setOpcode(unsigned Value) { Opcode = Value; } }; /// \brief Generic implementations of directive handling, etc. which is shared @@ -1377,10 +1387,13 @@ bool AsmParser::ParseStatement() { ParsingInlineAsm); } - // Free any parsed operands. If parsing ms-style inline assembly it is the - // responsibility of the caller (i.e., clang) to free the parsed operands. - if (!ParsingInlineAsm) - freeParsedOperands(); + // Free any parsed operands. If parsing ms-style inline assembly the operands + // will be freed by the ParseMSInlineAsm() function. 
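The ELFObjectWriter change above replaces an is64Bit() proxy with the property that actually matters: whether the target's relocations carry an explicit addend. A rough model of the distinction as I read the hunk (Reloc and recordRelocation are illustrative, not the writer's real interface): with explicit addends the addend travels in the relocation record and the patched-in section value is zeroed; with implicit addends the value stays in the section bytes.

#include <cassert>
#include <cstdint>

struct Reloc { int64_t Addend; int64_t ValueInSection; };

Reloc recordRelocation(int64_t Value, bool HasRelocationAddend) {
  Reloc R;
  R.Addend = Value;                                   // kept for RELA targets
  R.ValueInSection = HasRelocationAddend ? 0 : Value; // REL: addend in place
  return R;
}

int main() {
  assert(recordRelocation(42, true).ValueInSection == 0);   // RELA-style
  assert(recordRelocation(42, false).ValueInSection == 42); // REL-style
}

Keying this off hasRelocationAddend() rather than is64Bit() matters because the two properties only coincide on some targets; the old code happened to work for i386 and x86-64 but not in general.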
+ if (!ParsingInlineAsm) { + for (unsigned i = 0, e = ParsedOperands.size(); i != e; ++i) + delete ParsedOperands[i]; + ParsedOperands.clear(); + } // Don't skip the rest of the line, the instruction parser is responsible for // that. @@ -3561,6 +3574,222 @@ bool AsmParser::ParseDirectiveEndr(SMLoc DirectiveLoc) { return false; } +namespace { +enum AsmRewriteKind { + AOK_Imm, + AOK_Input, + AOK_Output, + AOK_SizeDirective, + AOK_Skip +}; + +struct AsmRewrite { + AsmRewriteKind Kind; + SMLoc Loc; + unsigned Len; + unsigned Size; +public: + AsmRewrite(AsmRewriteKind kind, SMLoc loc, unsigned len, unsigned size = 0) + : Kind(kind), Loc(loc), Len(len), Size(size) { } +}; +} + +bool AsmParser::ParseMSInlineAsm(void *AsmLoc, std::string &AsmString, + unsigned &NumOutputs, unsigned &NumInputs, + SmallVectorImpl<void *> &OpDecls, + SmallVectorImpl<std::string> &Constraints, + SmallVectorImpl<std::string> &Clobbers, + const MCInstrInfo *MII, + const MCInstPrinter *IP, + MCAsmParserSemaCallback &SI) { + SmallVector<void*, 4> InputDecls; + SmallVector<void*, 4> OutputDecls; + SmallVector<std::string, 4> InputConstraints; + SmallVector<std::string, 4> OutputConstraints; + std::set<std::string> ClobberRegs; + + SmallVector<struct AsmRewrite, 4> AsmStrRewrites; + + // Prime the lexer. + Lex(); + + // While we have input, parse each statement. + unsigned InputIdx = 0; + unsigned OutputIdx = 0; + while (getLexer().isNot(AsmToken::Eof)) { + // Clear the opcode. + setOpcode(~0x0); + + // Save the conditional ignore state of the parser prior to parsing the statement. + bool PreParseCondStateIgnore = TheCondState.Ignore; + + // Save the starting point of this statement in case we need to skip it. + SMLoc Start = getLexer().getLoc(); + + if (ParseStatement()) + return true; + + // If PreParseCondStateIgnore is false, but TheCondState.Ignore is true, then we + // just parsed a directive that changed the state to ignore. Don't skip + // emitting this directive. + if (PreParseCondStateIgnore && TheCondState.Ignore) { + unsigned Len = getLexer().getLoc().getPointer() - Start.getPointer(); + AsmStrRewrites.push_back(AsmRewrite(AOK_Skip, Start, Len)); + continue; + } + + if (isInstruction()) { + const MCInstrDesc &Desc = MII->get(getOpcode()); + + // Build the list of clobbers, outputs and inputs. + for (unsigned i = 1, e = ParsedOperands.size(); i != e; ++i) { + MCParsedAsmOperand *Operand = ParsedOperands[i]; + + // Immediate. + if (Operand->isImm()) { + AsmStrRewrites.push_back(AsmRewrite(AOK_Imm, + Operand->getStartLoc(), + Operand->getNameLen())); + continue; + } + + // Register operand. + if (Operand->isReg()) { + unsigned NumDefs = Desc.getNumDefs(); + // Clobber. + if (NumDefs && Operand->getMCOperandNum() < NumDefs) { + std::string Reg; + raw_string_ostream OS(Reg); + IP->printRegName(OS, Operand->getReg()); + ClobberRegs.insert(StringRef(OS.str())); + } + continue; + } + + // Expr/Input or Output. 
+ unsigned Size; + void *OpDecl = SI.LookupInlineAsmIdentifier(Operand->getName(), AsmLoc, + Size); + if (OpDecl) { + bool isOutput = (i == 1) && Desc.mayStore(); + if (Operand->needSizeDirective()) + AsmStrRewrites.push_back(AsmRewrite(AOK_SizeDirective, + Operand->getStartLoc(), 0, + Operand->getMemSize())); + + if (isOutput) { + std::string Constraint = "="; + ++InputIdx; + OutputDecls.push_back(OpDecl); + Constraint += Operand->getConstraint().str(); + OutputConstraints.push_back(Constraint); + AsmStrRewrites.push_back(AsmRewrite(AOK_Output, + Operand->getStartLoc(), + Operand->getNameLen())); + } else { + InputDecls.push_back(OpDecl); + InputConstraints.push_back(Operand->getConstraint().str()); + AsmStrRewrites.push_back(AsmRewrite(AOK_Input, + Operand->getStartLoc(), + Operand->getNameLen())); + } + } + } + // Free any parsed operands. + for (unsigned i = 0, e = ParsedOperands.size(); i != e; ++i) + delete ParsedOperands[i]; + ParsedOperands.clear(); + } + } + + // Set the number of Outputs and Inputs. + NumOutputs = OutputDecls.size(); + NumInputs = InputDecls.size(); + + // Set the unique clobbers. + for (std::set<std::string>::iterator I = ClobberRegs.begin(), + E = ClobberRegs.end(); I != E; ++I) + Clobbers.push_back(*I); + + // Merge the various outputs and inputs. Output are expected first. + if (NumOutputs || NumInputs) { + unsigned NumExprs = NumOutputs + NumInputs; + OpDecls.resize(NumExprs); + Constraints.resize(NumExprs); + for (unsigned i = 0; i < NumOutputs; ++i) { + OpDecls[i] = OutputDecls[i]; + Constraints[i] = OutputConstraints[i]; + } + for (unsigned i = 0, j = NumOutputs; i < NumInputs; ++i, ++j) { + OpDecls[j] = InputDecls[i]; + Constraints[j] = InputConstraints[i]; + } + } + + // Build the IR assembly string. + std::string AsmStringIR; + AsmRewriteKind PrevKind = AOK_Imm; + raw_string_ostream OS(AsmStringIR); + const char *Start = SrcMgr.getMemoryBuffer(0)->getBufferStart(); + for (SmallVectorImpl<struct AsmRewrite>::iterator + I = AsmStrRewrites.begin(), E = AsmStrRewrites.end(); I != E; ++I) { + const char *Loc = (*I).Loc.getPointer(); + + AsmRewriteKind Kind = (*I).Kind; + + // Emit everything up to the immediate/expression. If the previous rewrite + // was a size directive, then this has already been done. + if (PrevKind != AOK_SizeDirective) + OS << StringRef(Start, Loc - Start); + PrevKind = Kind; + + // Skip the original expression. + if (Kind == AOK_Skip) { + Start = Loc + (*I).Len; + continue; + } + + // Rewrite expressions in $N notation. + switch (Kind) { + default: break; + case AOK_Imm: + OS << Twine("$$") + StringRef(Loc, (*I).Len); + break; + case AOK_Input: + OS << '$'; + OS << InputIdx++; + break; + case AOK_Output: + OS << '$'; + OS << OutputIdx++; + break; + case AOK_SizeDirective: + switch((*I).Size) { + default: break; + case 8: OS << "byte ptr "; break; + case 16: OS << "word ptr "; break; + case 32: OS << "dword ptr "; break; + case 64: OS << "qword ptr "; break; + case 80: OS << "xword ptr "; break; + case 128: OS << "xmmword ptr "; break; + case 256: OS << "ymmword ptr "; break; + } + } + + // Skip the original expression. + if (Kind != AOK_SizeDirective) + Start = Loc + (*I).Len; + } + + // Emit the remainder of the asm string. + const char *AsmEnd = SrcMgr.getMemoryBuffer(0)->getBufferEnd(); + if (Start != AsmEnd) + OS << StringRef(Start, AsmEnd - Start); + + AsmString = OS.str(); + return false; +} + /// \brief Create an MCAsmParser instance. 
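The second half of ParseMSInlineAsm above is a classic span rewriter: it copies the source asm verbatim between recorded ranges and substitutes $$imm or $N at each range. A compact standalone version of just that splice loop (Rewrite, rewriteAsm, and the offsets in main are hand-made illustrations; the size-directive and skip kinds are left out):

#include <cstddef>
#include <cstdio>
#include <string>
#include <vector>

enum Kind { Imm, Input, Output };
struct Rewrite { Kind K; std::size_t Off, Len; unsigned OpIdx; };

std::string rewriteAsm(const std::string &Src,
                       const std::vector<Rewrite> &Rs) {
  std::string Out;
  std::size_t Cursor = 0;
  for (std::size_t i = 0; i != Rs.size(); ++i) {
    const Rewrite &R = Rs[i];
    Out.append(Src, Cursor, R.Off - Cursor);   // copy text before the operand
    switch (R.K) {
    case Imm:
      Out += "$$";                             // escape literal immediates
      Out.append(Src, R.Off, R.Len);
      break;
    case Input:
    case Output:
      Out += '$';                              // operand placeholder $N
      Out += std::to_string(R.OpIdx);
      break;
    }
    Cursor = R.Off + R.Len;                    // skip the original spelling
  }
  Out.append(Src, Cursor, std::string::npos);  // emit the tail
  return Out;
}

int main() {
  Rewrite R = {Input, 9, 3, 0};                // "Sym" at offset 9 -> $0
  std::vector<Rewrite> Rs(1, R);
  std::printf("%s\n", rewriteAsm("mov eax, Sym", Rs).c_str()); // mov eax, $0
}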
MCAsmParser *llvm::createMCAsmParser(SourceMgr &SM, MCContext &C, MCStreamer &Out, diff --git a/lib/MC/MCParser/MCTargetAsmParser.cpp b/lib/MC/MCParser/MCTargetAsmParser.cpp index 6fb1ba4216f..60a3a3b59a3 100644 --- a/lib/MC/MCParser/MCTargetAsmParser.cpp +++ b/lib/MC/MCParser/MCTargetAsmParser.cpp @@ -11,7 +11,7 @@ using namespace llvm; MCTargetAsmParser::MCTargetAsmParser() - : AvailableFeatures(0) + : AvailableFeatures(0), ParsingInlineAsm(false) { } diff --git a/lib/MC/SubtargetFeature.cpp b/lib/MC/SubtargetFeature.cpp index 378c4184a83..7625abd465f 100644 --- a/lib/MC/SubtargetFeature.cpp +++ b/lib/MC/SubtargetFeature.cpp @@ -119,8 +119,8 @@ void SubtargetFeatures::AddFeature(const StringRef String, } /// Find KV in array using binary search. -const SubtargetFeatureKV *Find(const StringRef S, const SubtargetFeatureKV *A, - size_t L) { +static const SubtargetFeatureKV *Find(StringRef S, const SubtargetFeatureKV *A, + size_t L) { // Make the lower bound element we're looking for SubtargetFeatureKV KV; KV.Key = S.data(); diff --git a/lib/Support/Unix/Signals.inc b/lib/Support/Unix/Signals.inc index cf64fa8f385..9e94068c9c3 100644 --- a/lib/Support/Unix/Signals.inc +++ b/lib/Support/Unix/Signals.inc @@ -121,17 +121,29 @@ static void UnregisterHandlers() { /// NB: This must be an async signal safe function. It cannot allocate or free /// memory, even in debug builds. static void RemoveFilesToRemove() { - // Note: avoid iterators in case of debug iterators that allocate or release + // We avoid iterators in case of debug iterators that allocate or release // memory. for (unsigned i = 0, e = FilesToRemove.size(); i != e; ++i) { - // Note that we don't want to use any external code here, and we don't care - // about errors. We're going to try as hard as we can as often as we need - // to to make these files go away. If these aren't files, too bad. - // - // We do however rely on a std::string implementation for which repeated - // calls to 'c_str()' don't allocate memory. We pre-call 'c_str()' on all - // of these strings to try to ensure this is safe. - unlink(FilesToRemove[i].c_str()); + // We rely on a std::string implementation for which repeated calls to + // 'c_str()' don't allocate memory. We pre-call 'c_str()' on all of these + // strings to try to ensure this is safe. + const char *path = FilesToRemove[i].c_str(); + + // Get the status so we can determine if it's a file or directory. If we + // can't stat the file, ignore it. + struct stat buf; + if (stat(path, &buf) != 0) + continue; + + // If this is not a regular file, ignore it. We want to prevent removal of + // special files like /dev/null, even if the compiler is being run with the + // super-user permissions. + if (!S_ISREG(buf.st_mode)) + continue; + + // Otherwise, remove the file. We ignore any errors here as there is nothing + // else we can do. + unlink(path); } } diff --git a/lib/Target/ARM/ARMFrameLowering.cpp b/lib/Target/ARM/ARMFrameLowering.cpp index 3e70d578231..1d4bbb74d33 100644 --- a/lib/Target/ARM/ARMFrameLowering.cpp +++ b/lib/Target/ARM/ARMFrameLowering.cpp @@ -1176,7 +1176,7 @@ static void checkNumAlignedDPRCS2Regs(MachineFunction &MF) { MachineRegisterInfo &MRI = MF.getRegInfo(); unsigned NumSpills = 0; for (; NumSpills < 8; ++NumSpills) - if (!MRI.isPhysRegOrOverlapUsed(ARM::D8 + NumSpills)) + if (!MRI.isPhysRegUsed(ARM::D8 + NumSpills)) break; // Don't do this for just one d-register. It's not worth it. 
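The Signals.inc rewrite above adds two guards before unlink(): the path must stat() successfully and must be a regular file, so special files like /dev/null survive even under elevated permissions. Distilled into a free function (same logic, minus the FilesToRemove bookkeeping):

#include <sys/stat.h>
#include <unistd.h>

// Must stay async-signal safe: stat() and unlink() are on the safe list,
// while allocation and stdio are not -- which is why the surrounding code
// pre-calls c_str() on every path before any signal can arrive.
static void removeIfRegularFile(const char *Path) {
  struct stat Buf;
  if (stat(Path, &Buf) != 0)
    return;               // cannot stat: ignore, nothing else we can do
  if (!S_ISREG(Buf.st_mode))
    return;               // not a regular file: never remove
  (void)unlink(Path);     // errors deliberately ignored
}

int main() {
  removeIfRegularFile("/tmp/llvm-does-not-exist");  // stat fails: no-op
}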
@@ -1249,7 +1249,7 @@ ARMFrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF, for (unsigned i = 0; CSRegs[i]; ++i) { unsigned Reg = CSRegs[i]; bool Spilled = false; - if (MF.getRegInfo().isPhysRegOrOverlapUsed(Reg)) { + if (MF.getRegInfo().isPhysRegUsed(Reg)) { Spilled = true; CanEliminateFrame = false; } diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp index b2eb5784879..8de23872349 100644 --- a/lib/Target/ARM/ARMISelLowering.cpp +++ b/lib/Target/ARM/ARMISelLowering.cpp @@ -2554,7 +2554,8 @@ ARMTargetLowering::VarArgStyleRegisters(CCState &CCInfo, SelectionDAG &DAG, DebugLoc dl, SDValue &Chain, const Value *OrigArg, unsigned OffsetFromOrigArg, - unsigned ArgOffset) const { + unsigned ArgOffset, + bool ForceMutable) const { MachineFunction &MF = DAG.getMachineFunction(); MachineFrameInfo *MFI = MF.getFrameInfo(); ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); @@ -2603,7 +2604,8 @@ ARMTargetLowering::VarArgStyleRegisters(CCState &CCInfo, SelectionDAG &DAG, &MemOps[0], MemOps.size()); } else // This will point to the next argument passed via stack. - AFI->setVarArgsFrameIndex(MFI->CreateFixedObject(4, ArgOffset, true)); + AFI->setVarArgsFrameIndex( + MFI->CreateFixedObject(4, ArgOffset, !ForceMutable)); } SDValue @@ -2729,15 +2731,20 @@ ARMTargetLowering::LowerFormalArguments(SDValue Chain, // Since they could be overwritten by lowering of arguments in case of // a tail call. if (Flags.isByVal()) { - unsigned VARegSize, VARegSaveSize; - computeRegArea(CCInfo, MF, VARegSize, VARegSaveSize); - VarArgStyleRegisters(CCInfo, DAG, - dl, Chain, CurOrigArg, Ins[VA.getValNo()].PartOffset, 0); - unsigned Bytes = Flags.getByValSize() - VARegSize; - if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects. 
- int FI = MFI->CreateFixedObject(Bytes, - VA.getLocMemOffset(), false); - InVals.push_back(DAG.getFrameIndex(FI, getPointerTy())); + ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); + if (!AFI->getVarArgsFrameIndex()) { + VarArgStyleRegisters(CCInfo, DAG, + dl, Chain, CurOrigArg, + Ins[VA.getValNo()].PartOffset, + VA.getLocMemOffset(), + true /*force mutable frames*/); + int VAFrameIndex = AFI->getVarArgsFrameIndex(); + InVals.push_back(DAG.getFrameIndex(VAFrameIndex, getPointerTy())); + } else { + int FI = MFI->CreateFixedObject(Flags.getByValSize(), + VA.getLocMemOffset(), false); + InVals.push_back(DAG.getFrameIndex(FI, getPointerTy())); + } } else { int FI = MFI->CreateFixedObject(VA.getLocVT().getSizeInBits()/8, VA.getLocMemOffset(), true); diff --git a/lib/Target/ARM/ARMISelLowering.h b/lib/Target/ARM/ARMISelLowering.h index 9acab0b0834..4eb3b2cb515 100644 --- a/lib/Target/ARM/ARMISelLowering.h +++ b/lib/Target/ARM/ARMISelLowering.h @@ -469,7 +469,8 @@ namespace llvm { DebugLoc dl, SDValue &Chain, const Value *OrigArg, unsigned OffsetFromOrigArg, - unsigned ArgOffset) + unsigned ArgOffset, + bool ForceMutable = false) const; void computeRegArea(CCState &CCInfo, MachineFunction &MF, diff --git a/lib/Target/ARM/Disassembler/ARMDisassembler.cpp b/lib/Target/ARM/Disassembler/ARMDisassembler.cpp index bf0dabb4a0f..d2b1cc37f21 100644 --- a/lib/Target/ARM/Disassembler/ARMDisassembler.cpp +++ b/lib/Target/ARM/Disassembler/ARMDisassembler.cpp @@ -525,8 +525,9 @@ static bool tryAddingSymbolicOperand(uint64_t Address, int32_t Value, else ReferenceType = LLVMDisassembler_ReferenceType_InOut_None; const char *ReferenceName; - const char *Name = SymbolLookUp(DisInfo, Value, &ReferenceType, Address, - &ReferenceName); + uint64_t SymbolValue = 0x00000000ffffffffULL & Value; + const char *Name = SymbolLookUp(DisInfo, SymbolValue, &ReferenceType, + Address, &ReferenceName); if (Name) { SymbolicOp.AddSymbol.Name = Name; SymbolicOp.AddSymbol.Present = true; diff --git a/lib/Target/MSP430/MSP430FrameLowering.cpp b/lib/Target/MSP430/MSP430FrameLowering.cpp index a312c8d5b25..2e170f17bf9 100644 --- a/lib/Target/MSP430/MSP430FrameLowering.cpp +++ b/lib/Target/MSP430/MSP430FrameLowering.cpp @@ -221,3 +221,17 @@ MSP430FrameLowering::restoreCalleeSavedRegisters(MachineBasicBlock &MBB, return true; } + +void +MSP430FrameLowering::processFunctionBeforeFrameFinalized(MachineFunction &MF) + const { + const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering(); + + // Create a frame entry for the FPW register that must be saved. 
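The ARMDisassembler fix above is a one-liner worth isolating: a 32-bit operand value has to be zero-extended before being handed to the symbol-lookup callback as a 64-bit address, or negative displacements turn into bogus 0xffffffffxxxxxxxx addresses.

#include <cassert>
#include <cstdint>

// Mask after the implicit sign extension so the result is always a
// 32-bit address, exactly as the patched call site does.
static uint64_t toSymbolValue(int32_t Value) {
  return 0xffffffffULL & static_cast<uint64_t>(Value);
}

int main() {
  assert(toSymbolValue(-4) == 0xfffffffcULL);   // stays a 32-bit address
  assert(toSymbolValue(0x1000) == 0x1000ULL);   // positive values unchanged
}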
+ if (TFI->hasFP(MF)) { + int FrameIdx = MF.getFrameInfo()->CreateFixedObject(2, -4, true); + (void)FrameIdx; + assert(FrameIdx == MF.getFrameInfo()->getObjectIndexBegin() && + "Slot for FPW register must be last in order to be found!"); + } +} diff --git a/lib/Target/MSP430/MSP430FrameLowering.h b/lib/Target/MSP430/MSP430FrameLowering.h index b636827da7b..cb02545852b 100644 --- a/lib/Target/MSP430/MSP430FrameLowering.h +++ b/lib/Target/MSP430/MSP430FrameLowering.h @@ -46,6 +46,7 @@ public: bool hasFP(const MachineFunction &MF) const; bool hasReservedCallFrame(const MachineFunction &MF) const; + void processFunctionBeforeFrameFinalized(MachineFunction &MF) const; }; } // End llvm namespace diff --git a/lib/Target/MSP430/MSP430RegisterInfo.cpp b/lib/Target/MSP430/MSP430RegisterInfo.cpp index aed46a2ec59..9ae238f66f5 100644 --- a/lib/Target/MSP430/MSP430RegisterInfo.cpp +++ b/lib/Target/MSP430/MSP430RegisterInfo.cpp @@ -220,20 +220,6 @@ MSP430RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, MI.getOperand(i+1).ChangeToImmediate(Offset); } -void -MSP430RegisterInfo::processFunctionBeforeFrameFinalized(MachineFunction &MF) - const { - const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering(); - - // Create a frame entry for the FPW register that must be saved. - if (TFI->hasFP(MF)) { - int FrameIdx = MF.getFrameInfo()->CreateFixedObject(2, -4, true); - (void)FrameIdx; - assert(FrameIdx == MF.getFrameInfo()->getObjectIndexBegin() && - "Slot for FPW register must be last in order to be found!"); - } -} - unsigned MSP430RegisterInfo::getFrameRegister(const MachineFunction &MF) const { const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering(); diff --git a/lib/Target/MSP430/MSP430RegisterInfo.h b/lib/Target/MSP430/MSP430RegisterInfo.h index 9ee0a03f631..64a43bcafbb 100644 --- a/lib/Target/MSP430/MSP430RegisterInfo.h +++ b/lib/Target/MSP430/MSP430RegisterInfo.h @@ -49,8 +49,6 @@ public: void eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj, RegScavenger *RS = NULL) const; - void processFunctionBeforeFrameFinalized(MachineFunction &MF) const; - // Debug information queries. unsigned getFrameRegister(const MachineFunction &MF) const; }; diff --git a/lib/Target/Mips/Mips16InstrInfo.cpp b/lib/Target/Mips/Mips16InstrInfo.cpp index 8991433005d..5e33fed0cc9 100644 --- a/lib/Target/Mips/Mips16InstrInfo.cpp +++ b/lib/Target/Mips/Mips16InstrInfo.cpp @@ -25,7 +25,7 @@ using namespace llvm; Mips16InstrInfo::Mips16InstrInfo(MipsTargetMachine &tm) - : MipsInstrInfo(tm, /* FIXME: set mips16 unconditional br */ 0), + : MipsInstrInfo(tm, Mips::BimmX16), RI(*tm.getSubtargetImpl()) {} const MipsRegisterInfo &Mips16InstrInfo::getRegisterInfo() const { @@ -137,12 +137,39 @@ bool Mips16InstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const { /// GetOppositeBranchOpc - Return the inverse of the specified /// opcode, e.g. turning BEQ to BNE. 
unsigned Mips16InstrInfo::GetOppositeBranchOpc(unsigned Opc) const { + switch (Opc) { + default: llvm_unreachable("Illegal opcode!"); + case Mips::BeqzRxImmX16: return Mips::BnezRxImmX16; + case Mips::BnezRxImmX16: return Mips::BeqzRxImmX16; + case Mips::BteqzT8CmpX16: return Mips::BtnezT8CmpX16; + case Mips::BteqzT8SltX16: return Mips::BtnezT8SltX16; + case Mips::BteqzT8SltiX16: return Mips::BtnezT8SltiX16; + case Mips::BtnezX16: return Mips::BteqzX16; + case Mips::BtnezT8CmpiX16: return Mips::BteqzT8CmpiX16; + case Mips::BtnezT8SltuX16: return Mips::BteqzT8SltuX16; + case Mips::BtnezT8SltiuX16: return Mips::BteqzT8SltiuX16; + case Mips::BteqzX16: return Mips::BtnezX16; + case Mips::BteqzT8CmpiX16: return Mips::BtnezT8CmpiX16; + case Mips::BteqzT8SltuX16: return Mips::BtnezT8SltuX16; + case Mips::BteqzT8SltiuX16: return Mips::BtnezT8SltiuX16; + case Mips::BtnezT8CmpX16: return Mips::BteqzT8CmpX16; + case Mips::BtnezT8SltX16: return Mips::BteqzT8SltX16; + case Mips::BtnezT8SltiX16: return Mips::BteqzT8SltiX16; + } assert(false && "Implement this function."); return 0; } unsigned Mips16InstrInfo::GetAnalyzableBrOpc(unsigned Opc) const { - return 0; + return (Opc == Mips::BeqzRxImmX16 || Opc == Mips::BimmX16 || + Opc == Mips::BnezRxImmX16 || Opc == Mips::BteqzX16 || + Opc == Mips::BteqzT8CmpX16 || Opc == Mips::BteqzT8CmpiX16 || + Opc == Mips::BteqzT8SltX16 || Opc == Mips::BteqzT8SltuX16 || + Opc == Mips::BteqzT8SltiX16 || Opc == Mips::BteqzT8SltiuX16 || + Opc == Mips::BtnezX16 || Opc == Mips::BtnezT8CmpX16 || + Opc == Mips::BtnezT8CmpiX16 || Opc == Mips::BtnezT8SltX16 || + Opc == Mips::BtnezT8SltuX16 || Opc == Mips::BtnezT8SltiX16 || + Opc == Mips::BtnezT8SltiuX16 ) ? Opc : 0; } void Mips16InstrInfo::ExpandRetRA16(MachineBasicBlock &MBB, diff --git a/lib/Target/Mips/Mips16InstrInfo.td b/lib/Target/Mips/Mips16InstrInfo.td index eba201a0ea9..2694b09206f 100644 --- a/lib/Target/Mips/Mips16InstrInfo.td +++ b/lib/Target/Mips/Mips16InstrInfo.td @@ -21,6 +21,26 @@ def mem16 : Operand<i32> { } // +// EXT-I instruction format +// +class FEXT_I16_ins<bits<5> eop, string asmstr, InstrItinClass itin> : + FEXT_I16<eop, (outs), (ins brtarget:$imm16), + !strconcat(asmstr, "\t$imm16"),[], itin>; + +// +// EXT-I8 instruction format +// + +class FEXT_I816_ins_base<bits<3> _func, string asmstr, + string asmstr2, InstrItinClass itin>: + FEXT_I816<_func, (outs), (ins uimm16:$imm), !strconcat(asmstr, asmstr2), + [], itin>; + +class FEXT_I816_ins<bits<3> _func, string asmstr, + InstrItinClass itin>: + FEXT_I816_ins_base<_func, asmstr, "\t$imm", itin>; + +// // Assembler formats in alphabetical order. // Natural and pseudos are mixed together. 
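The GetOppositeBranchOpc switch above defines a self-inverse pairing over the Mips16 branch opcodes; note that the trailing assert(false) is now unreachable, since every case returns and the default is llvm_unreachable. The involution property is easy to check mechanically; a toy version with a four-element opcode set (BrOpc is illustrative, not the real Mips16 opcode list):

#include <cassert>

enum BrOpc { Beqz, Bnez, Bteqz, Btnez };

// A total, self-inverse mapping between conditional-branch opcodes.
BrOpc getOppositeBranchOpc(BrOpc Opc) {
  switch (Opc) {
  case Beqz:  return Bnez;
  case Bnez:  return Beqz;
  case Bteqz: return Btnez;
  case Btnez: return Bteqz;
  }
  return Opc;  // unreachable: the switch is total
}

int main() {
  // Inverting twice must round-trip; the same check works as a unit test
  // for the real table.
  for (int i = Beqz; i <= Btnez; ++i) {
    BrOpc B = static_cast<BrOpc>(i);
    assert(getOppositeBranchOpc(getOppositeBranchOpc(B)) == B);
  }
}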
// @@ -40,6 +60,11 @@ class FEXT_RI16_ins<bits<5> _op, string asmstr, class FEXT_RI16_PC_ins<bits<5> _op, string asmstr, InstrItinClass itin>: FEXT_RI16_ins_base<_op, asmstr, "\t$rx, $$pc, $imm", itin>; +class FEXT_RI16_B_ins<bits<5> _op, string asmstr, + InstrItinClass itin>: + FEXT_RI16<_op, (outs), (ins CPU16Regs:$rx, brtarget:$imm), + !strconcat(asmstr, "\t$rx, $imm"), [], itin>; + class FEXT_2RI16_ins<bits<5> _op, string asmstr, InstrItinClass itin>: FEXT_RI16<_op, (outs CPU16Regs:$rx), (ins CPU16Regs:$rx_, simm16:$imm), @@ -47,6 +72,7 @@ class FEXT_2RI16_ins<bits<5> _op, string asmstr, let Constraints = "$rx_ = $rx"; } + // this has an explicit sp argument that we ignore to work around a problem // in the compiler class FEXT_RI16_SP_explicit_ins<bits<5> _op, string asmstr, @@ -75,6 +101,31 @@ class FEXT_SHIFT16_ins<bits<2> _f, string asmstr, InstrItinClass itin>: FEXT_SHIFT16<_f, (outs CPU16Regs:$rx), (ins CPU16Regs:$ry, shamt:$sa), !strconcat(asmstr, "\t$rx, $ry, $sa"), [], itin>; +// +// EXT-T8I8 +// +class FEXT_T8I816_ins<bits<3> _func, string asmstr, string asmstr2, + InstrItinClass itin>: + FEXT_I816<_func, (outs), + (ins CPU16Regs:$rx, CPU16Regs:$ry, brtarget:$imm), + !strconcat(asmstr2, !strconcat("\t$rx, $ry\n\t", + !strconcat(asmstr, "\t$imm"))),[], itin> { + let isCodeGenOnly=1; +} + +// +// EXT-T8I8I +// +class FEXT_T8I8I16_ins<bits<3> _func, string asmstr, string asmstr2, + InstrItinClass itin>: + FEXT_I816<_func, (outs), + (ins CPU16Regs:$rx, simm16:$imm, brtarget:$targ), + !strconcat(asmstr2, !strconcat("\t$rx, $imm\n\t", + !strconcat(asmstr, "\t$targ"))), [], itin> { + let isCodeGenOnly=1; +} +// + // // I8_MOVR32 instruction format (used only by the MOVR32 instructio @@ -165,6 +216,17 @@ class ArithLogic16Defs<bit isCom=0> { bit neverHasSideEffects = 1; } +class branch16 { + bit isBranch = 1; + bit isTerminator = 1; + bit isBarrier = 1; +} + +class cbranch16 { + bit isBranch = 1; + bit isTerminator = 1; +} + class MayLoad { bit mayLoad = 1; } @@ -204,6 +266,69 @@ def AdduRxRyRz16: FRRR16_ins<01, "addu", IIAlu>, ArithLogic16Defs<1>; // To do a bitwise logical AND. def AndRxRxRy16: FRxRxRy16_ins<0b01100, "and", IIAlu>, ArithLogic16Defs<1>; + + +// +// Format: BEQZ rx, offset MIPS16e +// Purpose: Branch on Equal to Zero (Extended) +// To test a GPR then do a PC-relative conditional branch. +// +def BeqzRxImmX16: FEXT_RI16_B_ins<0b00100, "beqz", IIAlu>, cbranch16; + +// Format: B offset MIPS16e +// Purpose: Unconditional Branch +// To do an unconditional PC-relative branch. +// +def BimmX16: FEXT_I16_ins<0b00010, "b", IIAlu>, branch16; + +// +// Format: BNEZ rx, offset MIPS16e +// Purpose: Branch on Not Equal to Zero (Extended) +// To test a GPR then do a PC-relative conditional branch. +// +def BnezRxImmX16: FEXT_RI16_B_ins<0b00101, "bnez", IIAlu>, cbranch16; + +// +// Format: BTEQZ offset MIPS16e +// Purpose: Branch on T Equal to Zero (Extended) +// To test special register T then do a PC-relative conditional branch. 
+// +def BteqzX16: FEXT_I816_ins<0b000, "bteqz", IIAlu>, cbranch16; + +def BteqzT8CmpX16: FEXT_T8I816_ins<0b000, "bteqz", "cmp", IIAlu>, cbranch16; + +def BteqzT8CmpiX16: FEXT_T8I8I16_ins<0b000, "bteqz", "cmpi", IIAlu>, + cbranch16; + +def BteqzT8SltX16: FEXT_T8I816_ins<0b000, "bteqz", "slt", IIAlu>, cbranch16; + +def BteqzT8SltuX16: FEXT_T8I816_ins<0b000, "bteqz", "sltu", IIAlu>, cbranch16; + +def BteqzT8SltiX16: FEXT_T8I8I16_ins<0b000, "bteqz", "slti", IIAlu>, cbranch16; + +def BteqzT8SltiuX16: FEXT_T8I8I16_ins<0b000, "bteqz", "sltiu", IIAlu>, + cbranch16; + +// +// Format: BTNEZ offset MIPS16e +// Purpose: Branch on T Not Equal to Zero (Extended) +// To test special register T then do a PC-relative conditional branch. +// +def BtnezX16: FEXT_I816_ins<0b001, "btnez", IIAlu> ,cbranch16; + +def BtnezT8CmpX16: FEXT_T8I816_ins<0b000, "btnez", "cmp", IIAlu>, cbranch16; + +def BtnezT8CmpiX16: FEXT_T8I8I16_ins<0b000, "btnez", "cmpi", IIAlu>, cbranch16; + +def BtnezT8SltX16: FEXT_T8I816_ins<0b000, "btnez", "slt", IIAlu>, cbranch16; + +def BtnezT8SltuX16: FEXT_T8I816_ins<0b000, "btnez", "sltu", IIAlu>, cbranch16; + +def BtnezT8SltiX16: FEXT_T8I8I16_ins<0b000, "btnez", "slti", IIAlu>, cbranch16; + +def BtnezT8SltiuX16: FEXT_T8I8I16_ins<0b000, "btnez", "sltiu", IIAlu>, + cbranch16; + // // Format: DIV rx, ry MIPS16e // Purpose: Divide Word @@ -562,6 +687,11 @@ def: StoreM16_pat<truncstorei8, SbRxRyOffMemX16>; def: StoreM16_pat<truncstorei16, ShRxRyOffMemX16>; def: StoreM16_pat<store, SwRxRyOffMemX16>; +// Unconditional branch +class UncondBranch16_pat<SDNode OpNode, Instruction I>: + Mips16Pat<(OpNode bb:$imm16), (I bb:$imm16)> { + let Predicates = [RelocPIC, InMips16Mode]; + } // Jump and Link (Call) let isCall=1, hasDelaySlot=1 in @@ -574,7 +704,144 @@ let isReturn=1, isTerminator=1, hasDelaySlot=1, isBarrier=1, hasCtrlDep=1, hasExtraSrcRegAllocReq = 1 in def RetRA16 : MipsPseudo16<(outs), (ins), "", [(MipsRet)]>; + +// +// Some branch conditional patterns are not generated by llvm at this time. +// Some are for seemingly arbitrary reasons not used: i.e. with signed number +// comparison they are used and for unsigned a different pattern is used. +// I am pushing upstream from the full mips16 port and it seemed that I needed +// these earlier and the mips32 port has these but now I cannot create test +// cases that use these patterns. While I sort this all out I will leave these +// extra patterns commented out and if I can be sure they are really not used, +// I will delete the code. I don't want to check the code in uncommented without +// a valid test case. In some cases, the compiler is generating patterns with +// setcc instead and earlier I had implemented setcc first so may have masked +// the problem. The setcc variants are suboptimal for mips16 so I may wantto +// figure out how to enable the brcond patterns or else possibly new +// combinations of of brcond and setcc. +// +// +// bcond-seteq +// +def: Mips16Pat + <(brcond (i32 (seteq CPU16Regs:$rx, CPU16Regs:$ry)), bb:$imm16), + (BteqzT8CmpX16 CPU16Regs:$rx, CPU16Regs:$ry, bb:$imm16) + >; + + +def: Mips16Pat + <(brcond (i32 (seteq CPU16Regs:$rx, immZExt16:$imm)), bb:$targ16), + (BteqzT8CmpiX16 CPU16Regs:$rx, immSExt16:$imm, bb:$targ16) + >; + +def: Mips16Pat + <(brcond (i32 (seteq CPU16Regs:$rx, 0)), bb:$targ16), + (BeqzRxImmX16 CPU16Regs:$rx, bb:$targ16) + >; + +// +// bcond-setgt (do we need to have this pair of setlt, setgt??) 
+// +def: Mips16Pat + <(brcond (i32 (setgt CPU16Regs:$rx, CPU16Regs:$ry)), bb:$imm16), + (BtnezT8SltX16 CPU16Regs:$ry, CPU16Regs:$rx, bb:$imm16) + >; + +// +// bcond-setge +// +def: Mips16Pat + <(brcond (i32 (setge CPU16Regs:$rx, CPU16Regs:$ry)), bb:$imm16), + (BteqzT8SltX16 CPU16Regs:$rx, CPU16Regs:$ry, bb:$imm16) + >; + +// +// never called because compiler transforms a >= k to a > (k-1) +//def: Mips16Pat +// <(brcond (i32 (setge CPU16Regs:$rx, immSExt16:$imm)), bb:$imm16), +// (BteqzT8SltiX16 CPU16Regs:$rx, immSExt16:$imm, bb:$imm16) +// >; + +// +// bcond-setlt +// +def: Mips16Pat + <(brcond (i32 (setlt CPU16Regs:$rx, CPU16Regs:$ry)), bb:$imm16), + (BtnezT8SltX16 CPU16Regs:$rx, CPU16Regs:$ry, bb:$imm16) + >; + +def: Mips16Pat + <(brcond (i32 (setlt CPU16Regs:$rx, immSExt16:$imm)), bb:$imm16), + (BtnezT8SltiX16 CPU16Regs:$rx, immSExt16:$imm, bb:$imm16) + >; + +// +// bcond-setle +// +def: Mips16Pat + <(brcond (i32 (setle CPU16Regs:$rx, CPU16Regs:$ry)), bb:$imm16), + (BteqzT8SltX16 CPU16Regs:$ry, CPU16Regs:$rx, bb:$imm16) + >; + +// +// bcond-setne +// +def: Mips16Pat + <(brcond (i32 (setne CPU16Regs:$rx, CPU16Regs:$ry)), bb:$imm16), + (BtnezT8CmpX16 CPU16Regs:$rx, CPU16Regs:$ry, bb:$imm16) + >; + +def: Mips16Pat + <(brcond (i32 (setne CPU16Regs:$rx, immZExt16:$imm)), bb:$targ16), + (BtnezT8CmpiX16 CPU16Regs:$rx, immSExt16:$imm, bb:$targ16) + >; + +def: Mips16Pat + <(brcond (i32 (setne CPU16Regs:$rx, 0)), bb:$targ16), + (BnezRxImmX16 CPU16Regs:$rx, bb:$targ16) + >; + +// +// This needs to be there but I forget which code will generate it +// +def: Mips16Pat + <(brcond CPU16Regs:$rx, bb:$targ16), + (BnezRxImmX16 CPU16Regs:$rx, bb:$targ16) + >; + +// + +// +// bcond-setugt +// +//def: Mips16Pat +// <(brcond (i32 (setugt CPU16Regs:$rx, CPU16Regs:$ry)), bb:$imm16), +// (BtnezT8SltuX16 CPU16Regs:$ry, CPU16Regs:$rx, bb:$imm16) +// >; + +// +// bcond-setuge +// +//def: Mips16Pat +// <(brcond (i32 (setuge CPU16Regs:$rx, CPU16Regs:$ry)), bb:$imm16), +// (BteqzT8SltuX16 CPU16Regs:$rx, CPU16Regs:$ry, bb:$imm16) +// >; + + +// +// bcond-setult +// +//def: Mips16Pat +// <(brcond (i32 (setult CPU16Regs:$rx, CPU16Regs:$ry)), bb:$imm16), +// (BtnezT8SltuX16 CPU16Regs:$rx, CPU16Regs:$ry, bb:$imm16) +// >; + +def: UncondBranch16_pat<br, BimmX16>; + // Small immediates +def: Mips16Pat<(i32 immSExt16:$in), + (AddiuRxRxImmX16 (Move32R16 ZERO), immSExt16:$in)>; + def: Mips16Pat<(i32 immZExt16:$in), (LiRxImmX16 immZExt16:$in)>; // diff --git a/lib/Target/Mips/Mips64InstrInfo.td b/lib/Target/Mips/Mips64InstrInfo.td index 99a9f25abd2..ed0ea0e849a 100644 --- a/lib/Target/Mips/Mips64InstrInfo.td +++ b/lib/Target/Mips/Mips64InstrInfo.td @@ -177,6 +177,7 @@ def BLTZ64 : CBranchZero<0x01, 0, "bltz", setlt, CPU64Regs>; } let DecoderNamespace = "Mips64" in def JALR64 : JumpLinkReg<0x00, 0x09, "jalr", CPU64Regs>; +def TAILCALL64_R : JumpFR<CPU64Regs, MipsTailCall>, IsTailCall; let DecoderNamespace = "Mips64" in { /// Multiply and Divide Instructions. 
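The brcond patterns above all funnel through the single Mips16 condition register T: cmp/slt-style instructions set it, bteqz/btnez consume it, and the operand swap in the setgt/setle patterns is what turns slt into the comparison that is actually needed. A small simulation of that pairing (Mips16Sim is illustrative; it models T as a bool rather than the real register):

#include <cassert>

struct Mips16Sim {
  bool T;                                    // the T condition register
  void slt(int a, int b) { T = a < b; }      // set T on less-than
  void cmp(int a, int b) { T = (a != b); }   // set T on inequality
  bool bteqz() const { return !T; }          // branch if T == 0
  bool btnez() const { return T; }           // branch if T != 0
};

int main() {
  Mips16Sim S;
  int a = 5, b = 3;
  S.slt(b, a);                   // setgt a,b  ->  slt b,a ; btnez
  assert(S.btnez() == (a > b));
  S.slt(a, b);                   // setge a,b  ->  slt a,b ; bteqz
  assert(S.bteqz() == (a >= b));
  S.cmp(a, b);                   // setne a,b  ->  cmp a,b ; btnez
  assert(S.btnez() == (a != b));
}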
diff --git a/lib/Target/Mips/MipsISelLowering.cpp b/lib/Target/Mips/MipsISelLowering.cpp index 2236af9a7b6..5cc9662864e 100644 --- a/lib/Target/Mips/MipsISelLowering.cpp +++ b/lib/Target/Mips/MipsISelLowering.cpp @@ -25,6 +25,7 @@ #include "llvm/GlobalVariable.h" #include "llvm/Intrinsics.h" #include "llvm/CallingConv.h" +#include "llvm/ADT/Statistic.h" #include "llvm/CodeGen/CallingConvLower.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" @@ -32,12 +33,19 @@ #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/SelectionDAGISel.h" #include "llvm/CodeGen/ValueTypes.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" using namespace llvm; +STATISTIC(NumTailCalls, "Number of tail calls"); + +static cl::opt<bool> +EnableMipsTailCalls("enable-mips-tail-calls", cl::Hidden, + cl::desc("MIPS: Enable tail calls."), cl::init(false)); + // If I is a shifted mask, set the size (Size) and the first bit of the // mask (Pos), and return true. // For example, if I is 0x003ff800, (Pos, Size) = (11, 11). @@ -58,6 +66,7 @@ static SDValue GetGlobalReg(SelectionDAG &DAG, EVT Ty) { const char *MipsTargetLowering::getTargetNodeName(unsigned Opcode) const { switch (Opcode) { case MipsISD::JmpLink: return "MipsISD::JmpLink"; + case MipsISD::TailCall: return "MipsISD::TailCall"; case MipsISD::Hi: return "MipsISD::Hi"; case MipsISD::Lo: return "MipsISD::Lo"; case MipsISD::GPRel: return "MipsISD::GPRel"; @@ -2870,9 +2879,26 @@ PassByValArg64(SDValue Chain, DebugLoc dl, MemOpChains.push_back(Chain); } +/// IsEligibleForTailCallOptimization - Check whether the call is eligible +/// for tail call optimization. +bool MipsTargetLowering:: +IsEligibleForTailCallOptimization(CallingConv::ID CalleeCC, + unsigned NextStackOffset) const { + if (!EnableMipsTailCalls) + return false; + + // Do not tail-call optimize if there is an argument passed on stack. + if (IsO32 && (CalleeCC != CallingConv::Fast)) { + if (NextStackOffset > 16) + return false; + } else if (NextStackOffset) + return false; + + return true; +} + /// LowerCall - functions arguments are copied from virtual regs to /// (physical regs)/(stack frame), CALLSEQ_START and CALLSEQ_END are emitted. -/// TODO: isTailCall. SDValue MipsTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, SmallVectorImpl<SDValue> &InVals) const { @@ -2887,14 +2913,10 @@ MipsTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, CallingConv::ID CallConv = CLI.CallConv; bool isVarArg = CLI.IsVarArg; - // MIPs target does not yet support tail call optimization. - isTailCall = false; - MachineFunction &MF = DAG.getMachineFunction(); MachineFrameInfo *MFI = MF.getFrameInfo(); const TargetFrameLowering *TFL = MF.getTarget().getFrameLowering(); bool IsPIC = getTargetMachine().getRelocationModel() == Reloc::PIC_; - MipsFunctionInfo *MipsFI = MF.getInfo<MipsFunctionInfo>(); // Analyze operands of the call, assigning locations to each operand. SmallVector<CCValAssign, 16> ArgLocs; @@ -2921,19 +2943,25 @@ MipsTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, if (IsO32 && (CallConv != CallingConv::Fast)) NextStackOffset = std::max(NextStackOffset, (unsigned)16); + // Check if it's really possible to do a tail call. 
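IsEligibleForTailCallOptimization above gates the new tail-call path behind the -enable-mips-tail-calls flag and a stack-usage check: on O32 with a non-fastcc callee, arguments may still live in the 16-byte register area; otherwise nothing at all may be passed on the stack. Restated as a pure function with its corner cases pinned down (the enum is a stand-in for CallingConv):

#include <cassert>

enum CallConv { CC_C, CC_Fast };

bool isEligibleForTailCall(bool EnableTailCalls, bool IsO32,
                           CallConv CalleeCC, unsigned NextStackOffset) {
  if (!EnableTailCalls)
    return false;                   // feature is off by default
  if (IsO32 && CalleeCC != CC_Fast)
    return NextStackOffset <= 16;   // args fit the O32 register-save area
  return NextStackOffset == 0;      // otherwise: nothing on the stack
}

int main() {
  assert(!isEligibleForTailCall(false, true, CC_C, 0));  // flag off
  assert(isEligibleForTailCall(true, true, CC_C, 16));   // O32, fits
  assert(!isEligibleForTailCall(true, true, CC_C, 24));  // spills to stack
  assert(!isEligibleForTailCall(true, false, CC_C, 8));  // non-O32, has args
}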
+ if (isTailCall) + isTailCall = IsEligibleForTailCallOptimization(CallConv, NextStackOffset); + + if (isTailCall) + ++NumTailCalls; + // Chain is the output chain of the last Load/Store or CopyToReg node. // ByValChain is the output chain of the last Memcpy node created for copying // byval arguments to the stack. SDValue NextStackOffsetVal = DAG.getIntPtrConstant(NextStackOffset, true); - Chain = DAG.getCALLSEQ_START(Chain, NextStackOffsetVal); + + if (!isTailCall) + Chain = DAG.getCALLSEQ_START(Chain, NextStackOffsetVal); SDValue StackPtr = DAG.getCopyFromReg(Chain, dl, IsN64 ? Mips::SP_64 : Mips::SP, getPointerTy()); - if (MipsFI->getMaxCallFrameSize() < NextStackOffset) - MipsFI->setMaxCallFrameSize(NextStackOffset); - // With EABI is it possible to have 16 args on registers. SmallVector<std::pair<unsigned, SDValue>, 16> RegsToPass; SmallVector<SDValue, 8> MemOpChains; @@ -3138,6 +3166,9 @@ MipsTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, if (InFlag.getNode()) Ops.push_back(InFlag); + if (isTailCall) + return DAG.getNode(MipsISD::TailCall, dl, MVT::Other, &Ops[0], Ops.size()); + Chain = DAG.getNode(MipsISD::JmpLink, dl, NodeTys, &Ops[0], Ops.size()); InFlag = Chain.getValue(1); @@ -3379,7 +3410,8 @@ MipsTargetLowering::LowerFormalArguments(SDValue Chain, if (DAG.getMachineFunction().getFunction()->hasStructRetAttr()) { unsigned Reg = MipsFI->getSRetReturnReg(); if (!Reg) { - Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(MVT::i32)); + Reg = MF.getRegInfo(). + createVirtualRegister(getRegClassFor(IsN64 ? MVT::i64 : MVT::i32)); MipsFI->setSRetReturnReg(Reg); } SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[0]); @@ -3508,7 +3540,8 @@ MipsTargetLowering::LowerReturn(SDValue Chain, llvm_unreachable("sret virtual register not created in the entry block"); SDValue Val = DAG.getCopyFromReg(Chain, dl, Reg, getPointerTy()); - Chain = DAG.getCopyToReg(Chain, dl, Mips::V0, Val, Flag); + Chain = DAG.getCopyToReg(Chain, dl, IsN64 ? Mips::V0_64 : Mips::V0, Val, + Flag); Flag = Chain.getValue(1); } diff --git a/lib/Target/Mips/MipsISelLowering.h b/lib/Target/Mips/MipsISelLowering.h index 28d126b62c6..b75a513fa7e 100644 --- a/lib/Target/Mips/MipsISelLowering.h +++ b/lib/Target/Mips/MipsISelLowering.h @@ -29,6 +29,9 @@ namespace llvm { // Jump and link (call) JmpLink, + // Tail call + TailCall, + // Get the Higher 16 bits from a 32-bit immediate // No relation with Mips Hi register Hi, @@ -205,6 +208,11 @@ namespace llvm { SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const; SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) const; + /// IsEligibleForTailCallOptimization - Check whether the call is eligible + /// for tail call optimization. + bool IsEligibleForTailCallOptimization(CallingConv::ID CalleeCC, + unsigned NextStackOffset) const; + virtual SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, diff --git a/lib/Target/Mips/MipsInstrInfo.td b/lib/Target/Mips/MipsInstrInfo.td index 5a51fc80eb6..cc216c391dd 100644 --- a/lib/Target/Mips/MipsInstrInfo.td +++ b/lib/Target/Mips/MipsInstrInfo.td @@ -52,6 +52,10 @@ def MipsJmpLink : SDNode<"MipsISD::JmpLink",SDT_MipsJmpLink, [SDNPHasChain, SDNPOutGlue, SDNPOptInGlue, SDNPVariadic]>; +// Tail call +def MipsTailCall : SDNode<"MipsISD::TailCall", SDT_MipsJmpLink, + [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>; + // Hi and Lo nodes are used to handle global addresses. 
Used on // MipsISelLowering to lower stuff like GlobalAddress, ExternalSymbol // static model. (nothing to do with Mips Registers Hi and Lo) @@ -175,6 +179,27 @@ class MipsPat<dag pattern, dag result> : Pat<pattern, result> { let Predicates = [HasStandardEncoding]; } +class IsBranch { + bit isBranch = 1; +} + +class IsReturn { + bit isReturn = 1; +} + +class IsCall { + bit isCall = 1; +} + +class IsTailCall { + bit isCall = 1; + bit isTerminator = 1; + bit isReturn = 1; + bit isBarrier = 1; + bit hasExtraSrcRegAllocReq = 1; + bit isCodeGenOnly = 1; +} + //===----------------------------------------------------------------------===// // Instruction format superclass //===----------------------------------------------------------------------===// @@ -573,14 +598,13 @@ class SetCC_I<bits<6> op, string instr_asm, PatFrag cond_op, Operand Od, IIAlu>; // Jump -class JumpFJ<bits<6> op, string instr_asm>: - FJ<op, (outs), (ins jmptarget:$target), - !strconcat(instr_asm, "\t$target"), [(br bb:$target)], IIBranch> { - let isBranch=1; +class JumpFJ<bits<6> op, DAGOperand opnd, string instr_asm, + SDPatternOperator operator, SDPatternOperator targetoperator>: + FJ<op, (outs), (ins opnd:$target), !strconcat(instr_asm, "\t$target"), + [(operator targetoperator:$target)], IIBranch> { let isTerminator=1; let isBarrier=1; let hasDelaySlot = 1; - let Predicates = [RelocStatic, HasStandardEncoding]; let DecoderMethod = "DecodeJumpTarget"; let Defs = [AT]; } @@ -601,21 +625,21 @@ class UncondBranch<bits<6> op, string instr_asm>: // Base class for indirect branch and return instruction classes. let isTerminator=1, isBarrier=1, hasDelaySlot = 1 in -class JumpFR<RegisterClass RC, list<dag> pattern>: - FR<0, 0x8, (outs), (ins RC:$rs), "jr\t$rs", pattern, IIBranch> { +class JumpFR<RegisterClass RC, SDPatternOperator operator = null_frag>: + FR<0, 0x8, (outs), (ins RC:$rs), "jr\t$rs", [(operator RC:$rs)], IIBranch> { let rt = 0; let rd = 0; let shamt = 0; } // Indirect branch -class IndirectBranch<RegisterClass RC>: JumpFR<RC, [(brind RC:$rs)]> { +class IndirectBranch<RegisterClass RC>: JumpFR<RC, brind> { let isBranch = 1; let isIndirectBranch = 1; } // Return instruction -class RetBase<RegisterClass RC>: JumpFR<RC, []> { +class RetBase<RegisterClass RC>: JumpFR<RC> { let isReturn = 1; let isCodeGenOnly = 1; let hasCtrlDep = 1; @@ -980,7 +1004,8 @@ def SC_P8 : SCBase<0x38, "sc", CPURegs, mem64>, } /// Jump and Branch Instructions -def J : JumpFJ<0x02, "j">; +def J : JumpFJ<0x02, jmptarget, "j", br, bb>, + Requires<[RelocStatic, HasStandardEncoding]>, IsBranch; def JR : IndirectBranch<CPURegs>; def B : UncondBranch<0x04, "b">; def BEQ : CBranch<0x04, "beq", seteq, CPURegs>; @@ -998,6 +1023,8 @@ def JAL : JumpLink<0x03, "jal">; def JALR : JumpLinkReg<0x00, 0x09, "jalr", CPURegs>; def BGEZAL : BranchLink<"bgezal", 0x11, CPURegs>; def BLTZAL : BranchLink<"bltzal", 0x10, CPURegs>; +def TAILCALL : JumpFJ<0x02, calltarget, "j", MipsTailCall, imm>, IsTailCall; +def TAILCALL_R : JumpFR<CPURegs, MipsTailCall>, IsTailCall; def RET : RetBase<CPURegs>; @@ -1107,6 +1134,11 @@ def : MipsPat<(MipsJmpLink (i32 texternalsym:$dst)), //def : MipsPat<(MipsJmpLink CPURegs:$dst), // (JALR CPURegs:$dst)>; +// Tail call +def : MipsPat<(MipsTailCall (iPTR tglobaladdr:$dst)), + (TAILCALL tglobaladdr:$dst)>; +def : MipsPat<(MipsTailCall (iPTR texternalsym:$dst)), + (TAILCALL texternalsym:$dst)>; // hi/lo relocs def : MipsPat<(MipsHi tglobaladdr:$in), (LUi tglobaladdr:$in)>; def : MipsPat<(MipsHi tblockaddress:$in), (LUi tblockaddress:$in)>; 
diff --git a/lib/Target/Mips/MipsMachineFunction.h b/lib/Target/Mips/MipsMachineFunction.h index 93ce94803a0..5b766f22a80 100644 --- a/lib/Target/Mips/MipsMachineFunction.h +++ b/lib/Target/Mips/MipsMachineFunction.h @@ -46,15 +46,13 @@ class MipsFunctionInfo : public MachineFunctionInfo { // InArgFIRange: Range of indices of all frame objects created during call to // LowerFormalArguments. std::pair<int, int> InArgFIRange; - unsigned MaxCallFrameSize; bool EmitNOAT; public: MipsFunctionInfo(MachineFunction& MF) : MF(MF), SRetReturnReg(0), GlobalBaseReg(0), - VarArgsFrameIndex(0), InArgFIRange(std::make_pair(-1, 0)), - MaxCallFrameSize(0), EmitNOAT(false) + VarArgsFrameIndex(0), InArgFIRange(std::make_pair(-1, 0)), EmitNOAT(false) {} bool isInArgFI(int FI) const { @@ -71,9 +69,6 @@ public: int getVarArgsFrameIndex() const { return VarArgsFrameIndex; } void setVarArgsFrameIndex(int Index) { VarArgsFrameIndex = Index; } - unsigned getMaxCallFrameSize() const { return MaxCallFrameSize; } - void setMaxCallFrameSize(unsigned S) { MaxCallFrameSize = S; } - bool getEmitNOAT() const { return EmitNOAT; } void setEmitNOAT() { EmitNOAT = true; } }; diff --git a/lib/Target/PowerPC/PPCISelLowering.cpp b/lib/Target/PowerPC/PPCISelLowering.cpp index c18250a78f7..36db4b51799 100644 --- a/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/lib/Target/PowerPC/PPCISelLowering.cpp @@ -4224,7 +4224,52 @@ SDValue PPCTargetLowering::LowerSINT_TO_FP(SDValue Op, return SDValue(); if (Op.getOperand(0).getValueType() == MVT::i64) { - SDValue Bits = DAG.getNode(ISD::BITCAST, dl, MVT::f64, Op.getOperand(0)); + SDValue SINT = Op.getOperand(0); + // When converting to single-precision, we actually need to convert + // to double-precision first and then round to single-precision. + // To avoid double-rounding effects during that operation, we have + // to prepare the input operand. Bits that might be truncated when + // converting to double-precision are replaced by a bit that won't + // be lost at this stage, but is below the single-precision rounding + // position. + // + // However, if -enable-unsafe-fp-math is in effect, accept double + // rounding to avoid the extra overhead. + if (Op.getValueType() == MVT::f32 && + !DAG.getTarget().Options.UnsafeFPMath) { + + // Twiddle input to make sure the low 11 bits are zero. (If this + // is the case, we are guaranteed the value will fit into the 53 bit + // mantissa of an IEEE double-precision value without rounding.) + // If any of those low 11 bits were not zero originally, make sure + // bit 12 (value 2048) is set instead, so that the final rounding + // to single-precision gets the correct result. + SDValue Round = DAG.getNode(ISD::AND, dl, MVT::i64, + SINT, DAG.getConstant(2047, MVT::i64)); + Round = DAG.getNode(ISD::ADD, dl, MVT::i64, + Round, DAG.getConstant(2047, MVT::i64)); + Round = DAG.getNode(ISD::OR, dl, MVT::i64, Round, SINT); + Round = DAG.getNode(ISD::AND, dl, MVT::i64, + Round, DAG.getConstant(-2048, MVT::i64)); + + // However, we cannot use that value unconditionally: if the magnitude + // of the input value is small, the bit-twiddling we did above might + // end up visibly changing the output. Fortunately, in that case, we + // don't need to twiddle bits since the original input will convert + // exactly to double-precision floating-point already. Therefore, + // construct a conditional to use the original value if the top 11 + // bits are all sign-bit copies, and use the rounded value computed + // above otherwise. 
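The comment block above describes how the i64 operand is prepared before the i64 -> f64 -> f32 conversion. A scalar C++ sketch of the same bit manipulation (illustrative only; the helper name is invented, and the real code builds the equivalent SelectionDAG nodes, as the hunk below shows):

#include <cstdint>

int64_t prepareSIntForF32(int64_t SInt) {
  uint64_t U = static_cast<uint64_t>(SInt);
  // Sticky-round: if any of the low 11 bits is set, force bit 11 on, then
  // clear the low 11 bits. The result fits a double's 53-bit mantissa
  // exactly, so only the final f64 -> f32 round-off remains.
  uint64_t Round = (U & 2047) + 2047;   // bit 11 set iff low 11 bits nonzero
  Round = (Round | U) & ~uint64_t(2047);
  // If bits 53..63 are all sign-bit copies, the input already converts to
  // double exactly; keep it unchanged in that case.
  int64_t Top = SInt >> 53;             // arithmetic shift
  return (Top == 0 || Top == -1) ? SInt : static_cast<int64_t>(Round);
}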
+ SDValue Cond = DAG.getNode(ISD::SRA, dl, MVT::i64, + SINT, DAG.getConstant(53, MVT::i32)); + Cond = DAG.getNode(ISD::ADD, dl, MVT::i64, + Cond, DAG.getConstant(1, MVT::i64)); + Cond = DAG.getSetCC(dl, MVT::i32, + Cond, DAG.getConstant(1, MVT::i64), ISD::SETUGT); + + SINT = DAG.getNode(ISD::SELECT, dl, MVT::i64, Cond, Round, SINT); + } + SDValue Bits = DAG.getNode(ISD::BITCAST, dl, MVT::f64, SINT); SDValue FP = DAG.getNode(PPCISD::FCFID, dl, MVT::f64, Bits); if (Op.getValueType() == MVT::f32) FP = DAG.getNode(ISD::FP_ROUND, dl, diff --git a/lib/Target/X86/AsmParser/X86AsmParser.cpp b/lib/Target/X86/AsmParser/X86AsmParser.cpp index 454664e3ed1..e07fafcb207 100644 --- a/lib/Target/X86/AsmParser/X86AsmParser.cpp +++ b/lib/Target/X86/AsmParser/X86AsmParser.cpp @@ -11,6 +11,7 @@ #include "llvm/MC/MCTargetAsmParser.h" #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCSymbol.h" #include "llvm/MC/MCInst.h" #include "llvm/MC/MCRegisterInfo.h" #include "llvm/MC/MCSubtargetInfo.h" @@ -180,6 +181,7 @@ struct X86Operand : public MCParsedAsmOperand { unsigned IndexReg; unsigned Scale; unsigned Size; + bool NeedSizeDir; } Mem; }; @@ -312,6 +314,16 @@ struct X86Operand : public MCParsedAsmOperand { return isImmSExti64i32Value(CE->getValue()); } + unsigned getMemSize() const { + assert(Kind == Memory && "Invalid access!"); + return Mem.Size; + } + + bool needSizeDirective() const { + assert(Kind == Memory && "Invalid access!"); + return Mem.NeedSizeDir; + } + bool isMem() const { return Kind == Memory; } bool isMem8() const { return Kind == Memory && (!Mem.Size || Mem.Size == 8); @@ -451,7 +463,8 @@ struct X86Operand : public MCParsedAsmOperand { /// Create an absolute memory operand. static X86Operand *CreateMem(const MCExpr *Disp, SMLoc StartLoc, - SMLoc EndLoc, unsigned Size = 0) { + SMLoc EndLoc, unsigned Size = 0, + bool NeedSizeDir = false) { X86Operand *Res = new X86Operand(Memory, StartLoc, EndLoc); Res->Mem.SegReg = 0; Res->Mem.Disp = Disp; @@ -459,6 +472,7 @@ struct X86Operand : public MCParsedAsmOperand { Res->Mem.IndexReg = 0; Res->Mem.Scale = 1; Res->Mem.Size = Size; + Res->Mem.NeedSizeDir = NeedSizeDir; return Res; } @@ -466,7 +480,7 @@ struct X86Operand : public MCParsedAsmOperand { static X86Operand *CreateMem(unsigned SegReg, const MCExpr *Disp, unsigned BaseReg, unsigned IndexReg, unsigned Scale, SMLoc StartLoc, SMLoc EndLoc, - unsigned Size = 0) { + unsigned Size = 0, bool NeedSizeDir = false) { // We should never just have a displacement, that should be parsed as an // absolute memory operand. assert((SegReg || BaseReg || IndexReg) && "Invalid memory operand!"); @@ -481,6 +495,7 @@ struct X86Operand : public MCParsedAsmOperand { Res->Mem.IndexReg = IndexReg; Res->Mem.Scale = Scale; Res->Mem.Size = Size; + Res->Mem.NeedSizeDir = NeedSizeDir; return Res; } }; @@ -753,7 +768,19 @@ X86Operand *X86AsmParser::ParseIntelMemOperand(unsigned SegReg, SMLoc Start) { const MCExpr *Disp = MCConstantExpr::Create(0, getParser().getContext()); if (getParser().ParseExpression(Disp, End)) return 0; End = Parser.getTok().getLoc(); - return X86Operand::CreateMem(Disp, Start, End, Size); + + bool NeedSizeDir = false; + if (!Size && isParsingInlineAsm()) { + if (const MCSymbolRefExpr *SymRef = dyn_cast<MCSymbolRefExpr>(Disp)) { + const MCSymbol &Sym = SymRef->getSymbol(); + // FIXME: The SemaLookup will fail if the name is anything other then an + // identifier. + // FIXME: Pass a valid SMLoc. 
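The NeedSizeDir flag threaded through X86Operand above exists because an Intel-syntax memory operand that names a C/C++ variable does not encode an operand width by itself; the code that follows asks Sema for the symbol's size so the operand can carry an explicit size directive. A hedged illustration of the ambiguity (MS-style inline assembly, x86 only; the variable is hypothetical):

unsigned short Flag;

void bump() {
  // Without knowing sizeof(Flag), "inc [Flag]" could be a byte, word, or
  // dword increment; the size lookup lets the parser treat it as if the
  // user had written "inc WORD PTR [Flag]".
  __asm inc [Flag]
}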
+ SemaCallback->LookupInlineAsmIdentifier(Sym.getName(), NULL, Size); + NeedSizeDir = Size > 0; + } + } + return X86Operand::CreateMem(Disp, Start, End, Size, NeedSizeDir); } X86Operand *X86AsmParser::ParseIntelOperand() { @@ -1526,9 +1553,6 @@ MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, assert(Op->isToken() && "Leading operand should always be a mnemonic!"); ArrayRef<SMRange> EmptyRanges = ArrayRef<SMRange>(); - // Clear the opcode. - Opcode = ~0x0; - // First, handle aliases that expand to multiple instructions. // FIXME: This should be replaced with a real .td file alias mechanism. // Also, MatchInstructionImpl should actually *do* the EmitInstruction diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 2f09e9e6ff1..5034cc0330a 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -459,7 +459,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::EH_RETURN , MVT::Other, Custom); // NOTE: EH_SJLJ_SETJMP/_LONGJMP supported here is NOT intended to support // SjLj exception handling but a light-weight setjmp/longjmp replacement to - // support continuation, user-level threading, and etc.. As a result, not + // support continuation, user-level threading, and etc.. As a result, no // other SjLj exception interfaces are implemented and please don't build // your own exception handling based on them. // LLVM/Clang supports zero-cost DWARF exception handling. @@ -557,6 +557,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom); setOperationAction(ISD::TRAP, MVT::Other, Legal); + setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal); // VASTART needs to be custom lowered to use the VarArgsFrameIndex setOperationAction(ISD::VASTART , MVT::Other, Custom); @@ -1058,6 +1059,10 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::FNEG, MVT::v4f64, Custom); setOperationAction(ISD::FABS, MVT::v4f64, Custom); + setOperationAction(ISD::TRUNCATE, MVT::v8i16, Custom); + + setOperationAction(ISD::FP_TO_SINT, MVT::v8i16, Custom); + setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal); setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal); setOperationAction(ISD::FP_ROUND, MVT::v4f32, Legal); @@ -1255,7 +1260,6 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setTargetDAGCombine(ISD::UINT_TO_FP); setTargetDAGCombine(ISD::SINT_TO_FP); setTargetDAGCombine(ISD::SETCC); - setTargetDAGCombine(ISD::FP_TO_SINT); if (Subtarget->is64Bit()) setTargetDAGCombine(ISD::MUL); setTargetDAGCombine(ISD::XOR); @@ -5173,6 +5177,80 @@ X86TargetLowering::LowerVectorBroadcast(SDValue Op, SelectionDAG &DAG) const { } SDValue +X86TargetLowering::buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) const { + EVT VT = Op.getValueType(); + + // Skip if insert_vec_elt is not supported. + if (!isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT)) + return SDValue(); + + DebugLoc DL = Op.getDebugLoc(); + unsigned NumElems = Op.getNumOperands(); + + SDValue VecIn1; + SDValue VecIn2; + SmallVector<unsigned, 4> InsertIndices; + SmallVector<int, 8> Mask(NumElems, -1); + + for (unsigned i = 0; i != NumElems; ++i) { + unsigned Opc = Op.getOperand(i).getOpcode(); + + if (Opc == ISD::UNDEF) + continue; + + if (Opc != ISD::EXTRACT_VECTOR_ELT) { + // Quit if more than 1 element needs inserting.
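The buildFromShuffleMostly helper begun above (its scan continues below) recognizes a BUILD_VECTOR whose elements are almost all extracts from one or two source vectors, and rewrites it as a single shuffle plus a few element inserts. A scalar analogy of the rewrite (std::array standing in for vector registers; a sketch, not DAG code):

#include <array>

// BUILD_VECTOR <A[1], A[0], X, A[3]>  ==>
//   shuffle A with mask <1, 0, undef, 3>, then insert X at lane 2.
std::array<int, 4> buildMostlyFromShuffle(const std::array<int, 4> &A, int X) {
  std::array<int, 4> V = {A[1], A[0], 0, A[3]}; // the shuffle; lane 2 is don't-care
  V[2] = X;                                     // the lone INSERT_VECTOR_ELT
  return V;
}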
+ if (InsertIndices.size() > 1) + return SDValue(); + + InsertIndices.push_back(i); + continue; + } + + SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0); + SDValue ExtIdx = Op.getOperand(i).getOperand(1); + + // Quit if extracted from a vector of a different type. + if (ExtractedFromVec.getValueType() != VT) + return SDValue(); + + // Quit if non-constant index. + if (!isa<ConstantSDNode>(ExtIdx)) + return SDValue(); + + if (VecIn1.getNode() == 0) + VecIn1 = ExtractedFromVec; + else if (VecIn1 != ExtractedFromVec) { + if (VecIn2.getNode() == 0) + VecIn2 = ExtractedFromVec; + else if (VecIn2 != ExtractedFromVec) + // Quit if more than 2 vectors to shuffle. + return SDValue(); + } + + unsigned Idx = cast<ConstantSDNode>(ExtIdx)->getZExtValue(); + + if (ExtractedFromVec == VecIn1) + Mask[i] = Idx; + else if (ExtractedFromVec == VecIn2) + Mask[i] = Idx + NumElems; + } + + if (VecIn1.getNode() == 0) + return SDValue(); + + VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT); + SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, &Mask[0]); + for (unsigned i = 0, e = InsertIndices.size(); i != e; ++i) { + unsigned Idx = InsertIndices[i]; + NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx), + DAG.getIntPtrConstant(Idx)); + } + + return NV; +} + +SDValue X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { DebugLoc dl = Op.getDebugLoc(); @@ -5448,6 +5526,11 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { if (LD.getNode()) return LD; + // Check for a build vector formed mostly from a shuffle plus a few element inserts. + SDValue Sh = buildFromShuffleMostly(Op, DAG); + if (Sh.getNode()) + return Sh; + // For SSE 4.1, use insertps to put the high elements into the low element. if (getSubtarget()->hasSSE41()) { SDValue Result; @@ -8104,10 +8187,42 @@ FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, bool IsSigned, bool IsReplace) co } } +SDValue X86TargetLowering::lowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const { + DebugLoc DL = Op.getDebugLoc(); + EVT VT = Op.getValueType(); + EVT SVT = Op.getOperand(0).getValueType(); + + if (!VT.is128BitVector() || !SVT.is256BitVector() || + VT.getVectorNumElements() != SVT.getVectorNumElements()) + return SDValue(); + + assert(Subtarget->hasAVX() && "256-bit vector is observed without AVX!"); + + unsigned NumElems = VT.getVectorNumElements(); + EVT NVT = EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), + NumElems * 2); + + SDValue In = Op.getOperand(0); + SmallVector<int, 16> MaskVec(NumElems * 2, -1); + // Prepare the truncation shuffle mask. + for (unsigned i = 0; i != NumElems; ++i) + MaskVec[i] = i * 2; + SDValue V = DAG.getVectorShuffle(NVT, DL, + DAG.getNode(ISD::BITCAST, DL, NVT, In), + DAG.getUNDEF(NVT), &MaskVec[0]); + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V, + DAG.getIntPtrConstant(0)); +} + SDValue X86TargetLowering::LowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG) const { - if (Op.getValueType().isVector()) + if (Op.getValueType().isVector()) { + if (Op.getValueType() == MVT::v8i16) + return DAG.getNode(ISD::TRUNCATE, Op.getDebugLoc(), Op.getValueType(), + DAG.getNode(ISD::FP_TO_SINT, Op.getDebugLoc(), + MVT::v8i32, Op.getOperand(0))); return SDValue(); + } std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG, /*IsSigned=*/ true, /*IsReplace=*/ false); @@ -11376,6 +11491,7 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::SRL_PARTS: return LowerShiftParts(Op, DAG); case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op,
DAG); case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG); + case ISD::TRUNCATE: return lowerTRUNCATE(Op, DAG); case ISD::FP_TO_SINT: return LowerFP_TO_SINT(Op, DAG); case ISD::FP_TO_UINT: return LowerFP_TO_UINT(Op, DAG); case ISD::FP_EXTEND: return lowerFP_EXTEND(Op, DAG); @@ -13276,7 +13392,7 @@ X86TargetLowering::emitEHSjLjSetJmp(MachineInstr *MI, // For v = setjmp(buf), we generate // // thisMBB: - // buf[Label_Offset] = ljMBB + // buf[LabelOffset] = restoreMBB // SjLjSetup restoreMBB // // mainMBB: @@ -13304,18 +13420,48 @@ X86TargetLowering::emitEHSjLjSetJmp(MachineInstr *MI, sinkMBB->transferSuccessorsAndUpdatePHIs(MBB); // thisMBB: - unsigned PtrImmStoreOpc = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi; - const int64_t Label_Offset = 1 * PVT.getStoreSize(); - + unsigned PtrStoreOpc = 0; + unsigned LabelReg = 0; + const int64_t LabelOffset = 1 * PVT.getStoreSize(); + Reloc::Model RM = getTargetMachine().getRelocationModel(); + bool UseImmLabel = (getTargetMachine().getCodeModel() == CodeModel::Small) && + (RM == Reloc::Static || RM == Reloc::DynamicNoPIC); + + // Prepare IP either in reg or imm. + if (!UseImmLabel) { + PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr; + const TargetRegisterClass *PtrRC = getRegClassFor(PVT); + LabelReg = MRI.createVirtualRegister(PtrRC); + if (Subtarget->is64Bit()) { + MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA64r), LabelReg) + .addReg(X86::RIP) + .addImm(0) + .addReg(0) + .addMBB(restoreMBB) + .addReg(0); + } else { + const X86InstrInfo *XII = static_cast<const X86InstrInfo*>(TII); + MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA32r), LabelReg) + .addReg(XII->getGlobalBaseReg(MF)) + .addImm(0) + .addReg(0) + .addMBB(restoreMBB, Subtarget->ClassifyBlockAddressReference()) + .addReg(0); + } + } else + PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi; // Store IP - MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrImmStoreOpc)); + MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrStoreOpc)); for (unsigned i = 0; i < X86::AddrNumOperands; ++i) { if (i == X86::AddrDisp) - MIB.addDisp(MI->getOperand(MemOpndSlot + i), Label_Offset); + MIB.addDisp(MI->getOperand(MemOpndSlot + i), LabelOffset); else MIB.addOperand(MI->getOperand(MemOpndSlot + i)); } - MIB.addMBB(restoreMBB); + if (!UseImmLabel) + MIB.addReg(LabelReg); + else + MIB.addMBB(restoreMBB); MIB.setMemRefs(MMOBegin, MMOEnd); // Setup MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::EH_SjLj_Setup)) @@ -13370,8 +13516,8 @@ X86TargetLowering::emitEHSjLjLongJmp(MachineInstr *MI, MachineInstrBuilder MIB; - const int64_t Label_Offset = 1 * PVT.getStoreSize(); - const int64_t SP_Offset = 2 * PVT.getStoreSize(); + const int64_t LabelOffset = 1 * PVT.getStoreSize(); + const int64_t SPOffset = 2 * PVT.getStoreSize(); unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm; unsigned IJmpOpc = (PVT == MVT::i64) ? 
X86::JMP64r : X86::JMP32r; @@ -13385,7 +13531,7 @@ X86TargetLowering::emitEHSjLjLongJmp(MachineInstr *MI, MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), Tmp); for (unsigned i = 0; i < X86::AddrNumOperands; ++i) { if (i == X86::AddrDisp) - MIB.addDisp(MI->getOperand(i), Label_Offset); + MIB.addDisp(MI->getOperand(i), LabelOffset); else MIB.addOperand(MI->getOperand(i)); } @@ -13394,7 +13540,7 @@ X86TargetLowering::emitEHSjLjLongJmp(MachineInstr *MI, MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), SP); for (unsigned i = 0; i < X86::AddrNumOperands; ++i) { if (i == X86::AddrDisp) - MIB.addDisp(MI->getOperand(i), SP_Offset); + MIB.addDisp(MI->getOperand(i), SPOffset); else MIB.addOperand(MI->getOperand(i)); } @@ -15411,11 +15557,11 @@ static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG, ISD::LoadExtType Ext = Ld->getExtensionType(); // If this is a vector EXT Load then attempt to optimize it using a - // shuffle. We need SSE4 for the shuffles. + // shuffle. We need SSSE3 shuffles. // TODO: It is possible to support ZExt by zeroing the undef values // during the shuffle phase or after the shuffle. if (RegVT.isVector() && RegVT.isInteger() && - Ext == ISD::EXTLOAD && Subtarget->hasSSE41()) { + Ext == ISD::EXTLOAD && Subtarget->hasSSSE3()) { assert(MemVT != RegVT && "Cannot extend to the same type"); assert(MemVT.isVector() && "Must load a vector from memory"); @@ -16269,20 +16415,6 @@ static SDValue PerformSINT_TO_FPCombine(SDNode *N, SelectionDAG &DAG, return SDValue(); } -static SDValue PerformFP_TO_SINTCombine(SDNode *N, SelectionDAG &DAG) { - EVT VT = N->getValueType(0); - - // v4i8 = FP_TO_SINT() -> v4i8 = TRUNCATE (V4i32 = FP_TO_SINT() - if (VT == MVT::v8i8 || VT == MVT::v4i8) { - DebugLoc dl = N->getDebugLoc(); - MVT DstVT = VT == MVT::v4i8 ? 
MVT::v4i32 : MVT::v8i32; - SDValue I = DAG.getNode(ISD::FP_TO_SINT, dl, DstVT, N->getOperand(0)); - return DAG.getNode(ISD::TRUNCATE, dl, VT, I); - } - - return SDValue(); -} - // Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS static SDValue PerformADCCombine(SDNode *N, SelectionDAG &DAG, X86TargetLowering::DAGCombinerInfo &DCI) { @@ -16421,7 +16553,6 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case ISD::STORE: return PerformSTORECombine(N, DAG, Subtarget); case ISD::UINT_TO_FP: return PerformUINT_TO_FPCombine(N, DAG); case ISD::SINT_TO_FP: return PerformSINT_TO_FPCombine(N, DAG, this); - case ISD::FP_TO_SINT: return PerformFP_TO_SINTCombine(N, DAG); case ISD::FADD: return PerformFADDCombine(N, DAG, Subtarget); case ISD::FSUB: return PerformFSUBCombine(N, DAG, Subtarget); case X86ISD::FXOR: diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h index 40e966ad676..228fab1689e 100644 --- a/lib/Target/X86/X86ISelLowering.h +++ b/lib/Target/X86/X86ISelLowering.h @@ -795,6 +795,7 @@ namespace llvm { SDValue LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) const; SDValue LowerUINT_TO_FP_i64(SDValue Op, SelectionDAG &DAG) const; SDValue LowerUINT_TO_FP_i32(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFP_TO_UINT(SDValue Op, SelectionDAG &DAG) const; SDValue lowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const; @@ -824,9 +825,10 @@ namespace llvm { SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const; - // Utility functions to help LowerVECTOR_SHUFFLE + // Utility functions to help LowerVECTOR_SHUFFLE & LowerBUILD_VECTOR SDValue LowerVectorBroadcast(SDValue Op, SelectionDAG &DAG) const; SDValue NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG) const; + SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) const; SDValue LowerVectorAllZeroTest(SDValue Op, SelectionDAG &DAG) const; diff --git a/lib/Target/X86/X86MCInstLower.cpp b/lib/Target/X86/X86MCInstLower.cpp index c44549c30ac..cfd68f74b7b 100644 --- a/lib/Target/X86/X86MCInstLower.cpp +++ b/lib/Target/X86/X86MCInstLower.cpp @@ -67,15 +67,11 @@ MachineModuleInfoMachO &X86MCInstLower::getMachOMMI() const { /// operand to an MCSymbol. 
MCSymbol *X86MCInstLower:: GetSymbolFromOperand(const MachineOperand &MO) const { - assert((MO.isGlobal() || MO.isSymbol()) && "Isn't a symbol reference"); + assert((MO.isGlobal() || MO.isSymbol() || MO.isMBB()) && "Isn't a symbol reference"); SmallString<128> Name; - if (!MO.isGlobal()) { - assert(MO.isSymbol()); - Name += MAI.getGlobalPrefix(); - Name += MO.getSymbolName(); - } else { + if (MO.isGlobal()) { const GlobalValue *GV = MO.getGlobal(); bool isImplicitlyPrivate = false; if (MO.getTargetFlags() == X86II::MO_DARWIN_STUB || @@ -85,6 +81,11 @@ GetSymbolFromOperand(const MachineOperand &MO) const { isImplicitlyPrivate = true; Mang->getNameWithPrefix(Name, GV, isImplicitlyPrivate); + } else if (MO.isSymbol()) { + Name += MAI.getGlobalPrefix(); + Name += MO.getSymbolName(); + } else if (MO.isMBB()) { + Name += MO.getMBB()->getSymbol()->getName(); } // If the target flags on the operand changes the name of the symbol, do that @@ -215,7 +216,7 @@ MCOperand X86MCInstLower::LowerSymbolOperand(const MachineOperand &MO, if (Expr == 0) Expr = MCSymbolRefExpr::Create(Sym, RefKind, Ctx); - if (!MO.isJTI() && MO.getOffset()) + if (!MO.isJTI() && !MO.isMBB() && MO.getOffset()) Expr = MCBinaryExpr::CreateAdd(Expr, MCConstantExpr::Create(MO.getOffset(), Ctx), Ctx); @@ -348,9 +349,6 @@ void X86MCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const { MCOp = MCOperand::CreateImm(MO.getImm()); break; case MachineOperand::MO_MachineBasicBlock: - MCOp = MCOperand::CreateExpr(MCSymbolRefExpr::Create( - MO.getMBB()->getSymbol(), Ctx)); - break; case MachineOperand::MO_GlobalAddress: case MachineOperand::MO_ExternalSymbol: MCOp = LowerSymbolOperand(MO, GetSymbolFromOperand(MO)); diff --git a/lib/Target/X86/X86VZeroUpper.cpp b/lib/Target/X86/X86VZeroUpper.cpp index 449eed3d8d5..c4a58874a41 100644 --- a/lib/Target/X86/X86VZeroUpper.cpp +++ b/lib/Target/X86/X86VZeroUpper.cpp @@ -147,7 +147,7 @@ bool VZeroUpperInserter::runOnMachineFunction(MachineFunction &MF) { const TargetRegisterClass *RC = &X86::VR256RegClass; for (TargetRegisterClass::iterator i = RC->begin(), e = RC->end(); i != e; i++) { - if (MRI.isPhysRegUsed(*i)) { + if (!MRI.reg_nodbg_empty(*i)) { YMMUsed = true; break; } diff --git a/lib/Transforms/IPO/BarrierNoopPass.cpp b/lib/Transforms/IPO/BarrierNoopPass.cpp new file mode 100644 index 00000000000..2e32240621f --- /dev/null +++ b/lib/Transforms/IPO/BarrierNoopPass.cpp @@ -0,0 +1,47 @@ +//===- BarrierNoopPass.cpp - A barrier pass for the pass manager ----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// NOTE: DO NOT USE THIS IF AVOIDABLE +// +// This pass is a nonce pass intended to allow manipulation of the implicitly +// nesting pass manager. For example, it can be used to cause a CGSCC pass +// manager to be closed prior to running a new collection of function passes. +// +// FIXME: This is a huge HACK. This should be removed when the pass manager's +// nesting is made explicit instead of implicit. +// +//===----------------------------------------------------------------------===// + +#include "llvm/Pass.h" +#include "llvm/Transforms/IPO.h" +using namespace llvm; + +namespace { +/// \brief A nonce module pass used to place a barrier in a pass manager. +/// +/// There is no mechanism for ending a CGSCC pass manager once one is started. 
+/// This prevents extension points from having clear deterministic ordering +/// when they are phrased as non-module passes. +class BarrierNoop : public ModulePass { +public: + static char ID; // Pass identification. + + BarrierNoop() : ModulePass(ID) { + initializeBarrierNoopPass(*PassRegistry::getPassRegistry()); + } + + bool runOnModule(Module &M) { return false; } +}; +} + +ModulePass *llvm::createBarrierNoopPass() { return new BarrierNoop(); } + +char BarrierNoop::ID = 0; +INITIALIZE_PASS(BarrierNoop, "barrier", "A No-Op Barrier Pass", + false, false) diff --git a/lib/Transforms/IPO/CMakeLists.txt b/lib/Transforms/IPO/CMakeLists.txt index 3f6b1de614d..90c1c33e6dc 100644 --- a/lib/Transforms/IPO/CMakeLists.txt +++ b/lib/Transforms/IPO/CMakeLists.txt @@ -1,5 +1,6 @@ add_llvm_library(LLVMipo ArgumentPromotion.cpp + BarrierNoopPass.cpp ConstantMerge.cpp DeadArgumentElimination.cpp ExtractGV.cpp diff --git a/lib/Transforms/IPO/PassManagerBuilder.cpp b/lib/Transforms/IPO/PassManagerBuilder.cpp index a2862022a3b..1d8f1e531a0 100644 --- a/lib/Transforms/IPO/PassManagerBuilder.cpp +++ b/lib/Transforms/IPO/PassManagerBuilder.cpp @@ -119,6 +119,14 @@ void PassManagerBuilder::populateModulePassManager(PassManagerBase &MPM) { MPM.add(Inliner); Inliner = 0; } + + // FIXME: This is a HACK! The inliner pass above implicitly creates a CGSCC + // pass manager, but we don't want to add extensions into that pass manager. + // To prevent this we must insert a no-op module pass to reset the pass + // manager to get the same behavior as EP_OptimizerLast in non-O0 builds. + if (!GlobalExtensions->empty() || !Extensions.empty()) + MPM.add(createBarrierNoopPass()); + addExtensionsToPM(EP_EnabledOnOptLevel0, MPM); return; } @@ -176,6 +184,12 @@ void PassManagerBuilder::populateModulePassManager(PassManagerBase &MPM) { MPM.add(createIndVarSimplifyPass()); // Canonicalize indvars MPM.add(createLoopIdiomPass()); // Recognize idioms like memset. MPM.add(createLoopDeletionPass()); // Delete dead loops + + if (Vectorize) { + MPM.add(createLoopVectorizePass()); + MPM.add(createLICMPass()); + } + if (!DisableUnrollLoops) MPM.add(createLoopUnrollPass()); // Unroll small loops addExtensionsToPM(EP_LoopOptimizerEnd, MPM); diff --git a/lib/Transforms/InstCombine/InstCombineSelect.cpp b/lib/Transforms/InstCombine/InstCombineSelect.cpp index ecce242a81e..a2d4c888f2c 100644 --- a/lib/Transforms/InstCombine/InstCombineSelect.cpp +++ b/lib/Transforms/InstCombine/InstCombineSelect.cpp @@ -333,6 +333,10 @@ static Value *SimplifyWithOpReplaced(Value *V, Value *Op, Value *RepOp, // All operands were constants, fold it. if (ConstOps.size() == I->getNumOperands()) { + if (CmpInst *C = dyn_cast<CmpInst>(I)) + return ConstantFoldCompareInstOperands(C->getPredicate(), ConstOps[0], + ConstOps[1], TD, TLI); + if (LoadInst *LI = dyn_cast<LoadInst>(I)) if (!LI->isVolatile()) return ConstantFoldLoadFromConstPtr(ConstOps[0], TD); diff --git a/lib/Transforms/InstCombine/InstCombineVectorOps.cpp b/lib/Transforms/InstCombine/InstCombineVectorOps.cpp index cf60f0f426d..dd7ea14e8a8 100644 --- a/lib/Transforms/InstCombine/InstCombineVectorOps.cpp +++ b/lib/Transforms/InstCombine/InstCombineVectorOps.cpp @@ -636,8 +636,11 @@ Instruction *InstCombiner::visitShuffleVectorInst(ShuffleVectorInst &SVI) { // If LHS's width is changed, shift the mask value accordingly. // If newRHS == NULL, i.e. LHSOp0 == RHSOp0, we want to remap any - // references to RHSOp0 to LHSOp0, so we don't need to shift the mask. 
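Returning to the BarrierNoopPass introduced above: a sketch of its intended use, matching the PassManagerBuilder hunk earlier in this patch (the extension pass name is a placeholder, not a real factory):

PassManager MPM;
MPM.add(Inliner);                   // implicitly opens a CGSCC pass manager
MPM.add(createBarrierNoopPass());   // module pass: forces that manager closed
MPM.add(createMyExtensionPass());   // hypothetical; now scheduled after the
                                    // inliner rather than nested inside it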
- if (eltMask >= 0 && newRHS != NULL) + // references from RHSOp0 to LHSOp0, so we don't need to shift the mask. + // If newRHS == newLHS, we want to remap any references from newRHS to + // newLHS so that we can properly identify splats that may occur due to + // obfuscation across the two vectors. + if (eltMask >= 0 && newRHS != NULL && newLHS != newRHS) eltMask += newLHSWidth; } diff --git a/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/lib/Transforms/Instrumentation/AddressSanitizer.cpp index b566994edfc..4abaeca0c5c 100644 --- a/lib/Transforms/Instrumentation/AddressSanitizer.cpp +++ b/lib/Transforms/Instrumentation/AddressSanitizer.cpp @@ -245,38 +245,6 @@ static GlobalVariable *createPrivateGlobalForString(Module &M, StringRef Str) { GlobalValue::PrivateLinkage, StrConst, ""); } -// Split the basic block and insert an if-then code. -// Before: -// Head -// Cmp -// Tail -// After: -// Head -// if (Cmp) -// ThenBlock -// Tail -// -// ThenBlock block is created and its terminator is returned. -// If Unreachable, ThenBlock is terminated with UnreachableInst, otherwise -// it is terminated with BranchInst to Tail. -static TerminatorInst *splitBlockAndInsertIfThen(Value *Cmp, bool Unreachable) { - Instruction *SplitBefore = cast<Instruction>(Cmp)->getNextNode(); - BasicBlock *Head = SplitBefore->getParent(); - BasicBlock *Tail = Head->splitBasicBlock(SplitBefore); - TerminatorInst *HeadOldTerm = Head->getTerminator(); - LLVMContext &C = Head->getParent()->getParent()->getContext(); - BasicBlock *ThenBlock = BasicBlock::Create(C, "", Head->getParent(), Tail); - TerminatorInst *CheckTerm; - if (Unreachable) - CheckTerm = new UnreachableInst(C, ThenBlock); - else - CheckTerm = BranchInst::Create(Tail, ThenBlock); - BranchInst *HeadNewTerm = - BranchInst::Create(/*ifTrue*/ThenBlock, /*ifFalse*/Tail, Cmp); - ReplaceInstWithInst(HeadOldTerm, HeadNewTerm); - return CheckTerm; -} - Value *AddressSanitizer::memToShadow(Value *Shadow, IRBuilder<> &IRB) { // Shadow >> scale Shadow = IRB.CreateLShr(Shadow, MappingScale); @@ -324,7 +292,7 @@ bool AddressSanitizer::instrumentMemIntrinsic(MemIntrinsic *MI) { Value *Cmp = IRB.CreateICmpNE(Length, Constant::getNullValue(Length->getType())); - InsertBefore = splitBlockAndInsertIfThen(Cmp, false); + InsertBefore = SplitBlockAndInsertIfThen(cast<Instruction>(Cmp), false); } instrumentMemIntrinsicParam(MI, Dst, Length, InsertBefore, true); @@ -480,7 +448,8 @@ void AddressSanitizer::instrumentAddress(Instruction *OrigIns, TerminatorInst *CrashTerm = 0; if (ClAlwaysSlowPath || (TypeSize < 8 * Granularity)) { - TerminatorInst *CheckTerm = splitBlockAndInsertIfThen(Cmp, false); + TerminatorInst *CheckTerm = + SplitBlockAndInsertIfThen(cast<Instruction>(Cmp), false); assert(dyn_cast<BranchInst>(CheckTerm)->isUnconditional()); BasicBlock *NextBB = CheckTerm->getSuccessor(0); IRB.SetInsertPoint(CheckTerm); @@ -491,7 +460,7 @@ void AddressSanitizer::instrumentAddress(Instruction *OrigIns, BranchInst *NewTerm = BranchInst::Create(CrashBlock, NextBB, Cmp2); ReplaceInstWithInst(CheckTerm, NewTerm); } else { - CrashTerm = splitBlockAndInsertIfThen(Cmp, true); + CrashTerm = SplitBlockAndInsertIfThen(cast<Instruction>(Cmp), true); } Instruction *Crash = @@ -534,7 +503,7 @@ void AddressSanitizer::createInitializerPoisonCalls(Module &M, bool AddressSanitizer::ShouldInstrumentGlobal(GlobalVariable *G) { Type *Ty = cast<PointerType>(G->getType())->getElementType(); - DEBUG(dbgs() << "GLOBAL: " << *G); + DEBUG(dbgs() << "GLOBAL: " << *G << "\n"); if (BL->isIn(*G))
return false; if (!Ty->isSized()) return false; @@ -682,7 +651,7 @@ bool AddressSanitizer::insertGlobalRedzones(Module &M) { FirstDynamic = LastDynamic; } - DEBUG(dbgs() << "NEW GLOBAL:\n" << *NewGlobal); + DEBUG(dbgs() << "NEW GLOBAL: " << *NewGlobal << "\n"); } ArrayType *ArrayOfGlobalStructTy = ArrayType::get(GlobalStructTy, n); @@ -851,6 +820,7 @@ bool AddressSanitizer::maybeInsertAsanInitAtFunctionEntry(Function &F) { bool AddressSanitizer::runOnFunction(Function &F) { if (BL->isIn(F)) return false; if (&F == AsanCtorFunction) return false; + DEBUG(dbgs() << "ASAN instrumenting:\n" << F << "\n"); // If needed, insert __asan_init before checking for AddressSafety attr. maybeInsertAsanInitAtFunctionEntry(F); @@ -914,8 +884,6 @@ bool AddressSanitizer::runOnFunction(Function &F) { NumInstrumented++; } - DEBUG(dbgs() << F); - bool ChangedStack = poisonStackInFunction(F); // We must unpoison the stack before every NoReturn call (throw, _exit, etc). @@ -925,6 +893,7 @@ bool AddressSanitizer::runOnFunction(Function &F) { IRBuilder<> IRB(CI); IRB.CreateCall(AsanHandleNoReturnFunc); } + DEBUG(dbgs() << "ASAN done instrumenting:\n" << F << "\n"); return NumInstrumented > 0 || ChangedStack || !NoReturnCalls.empty(); } @@ -1148,6 +1117,10 @@ bool AddressSanitizer::poisonStackInFunction(Function &F) { } } + // We are done. Remove the old unused alloca instructions. + for (size_t i = 0, n = AllocaVec.size(); i < n; i++) + AllocaVec[i]->eraseFromParent(); + if (ClDebugStack) { DEBUG(dbgs() << F); } diff --git a/lib/Transforms/Instrumentation/BlackList.cpp b/lib/Transforms/Instrumentation/BlackList.cpp index 2cb119964a3..ef34b8a56d8 100644 --- a/lib/Transforms/Instrumentation/BlackList.cpp +++ b/lib/Transforms/Instrumentation/BlackList.cpp @@ -45,6 +45,9 @@ BlackList::BlackList(const StringRef Path) { StringMap<std::string> Regexps; for (SmallVector<StringRef, 16>::iterator I = Lines.begin(), E = Lines.end(); I != E; ++I) { + // Ignore empty lines and lines starting with "#" + if (I->empty() || I->startswith("#")) + continue; // Get our prefix and unparsed regexp. std::pair<StringRef, StringRef> SplitLine = I->split(":"); StringRef Prefix = SplitLine.first; diff --git a/lib/Transforms/Instrumentation/BlackList.h b/lib/Transforms/Instrumentation/BlackList.h index 73977fc10a6..f3c05a5058c 100644 --- a/lib/Transforms/Instrumentation/BlackList.h +++ b/lib/Transforms/Instrumentation/BlackList.h @@ -12,7 +12,9 @@ // // The blacklist disables instrumentation of various functions and global // variables. Each line contains a prefix, followed by a wild card expression. +// Empty lines and lines starting with "#" are ignored. // --- +// # Blacklisted items: // fun:*_ZN4base6subtle* // global:*global_with_bad_access_or_initialization* // global-init:*global_with_initialization_issues* diff --git a/lib/Transforms/Scalar/DCE.cpp b/lib/Transforms/Scalar/DCE.cpp index 086f0a1a714..a2e074fae89 100644 --- a/lib/Transforms/Scalar/DCE.cpp +++ b/lib/Transforms/Scalar/DCE.cpp @@ -118,13 +118,8 @@ bool DCE::runOnFunction(Function &F) { I->eraseFromParent(); // Remove the instruction from the worklist if it still exists in it. 
- for (std::vector<Instruction*>::iterator WI = WorkList.begin(); - WI != WorkList.end(); ) { - if (*WI == I) - WI = WorkList.erase(WI); - else - ++WI; - } + WorkList.erase(std::remove(WorkList.begin(), WorkList.end(), I), + WorkList.end()); MadeChange = true; ++DCEEliminated; diff --git a/lib/Transforms/Scalar/IndVarSimplify.cpp b/lib/Transforms/Scalar/IndVarSimplify.cpp index 78630b2a9f3..82eb7464677 100644 --- a/lib/Transforms/Scalar/IndVarSimplify.cpp +++ b/lib/Transforms/Scalar/IndVarSimplify.cpp @@ -551,15 +551,17 @@ void IndVarSimplify::RewriteLoopExitValues(Loop *L, SCEVExpander &Rewriter) { PN->setIncomingValue(i, ExitVal); - // If this instruction is dead now, delete it. - RecursivelyDeleteTriviallyDeadInstructions(Inst, TLI); + // If this instruction is dead now, delete it. Don't do it now to avoid + // invalidating iterators. + if (isInstructionTriviallyDead(Inst, TLI)) + DeadInsts.push_back(Inst); if (NumPreds == 1) { // Completely replace a single-pred PHI. This is safe, because the // NewVal won't be variant in the loop, so we don't need an LCSSA phi // node anymore. PN->replaceAllUsesWith(ExitVal); - RecursivelyDeleteTriviallyDeadInstructions(PN, TLI); + PN->eraseFromParent(); } } if (NumPreds != 1) { diff --git a/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/lib/Transforms/Scalar/LoopStrengthReduce.cpp index 99a62dbe62f..958348d9faa 100644 --- a/lib/Transforms/Scalar/LoopStrengthReduce.cpp +++ b/lib/Transforms/Scalar/LoopStrengthReduce.cpp @@ -37,7 +37,7 @@ // // TODO: Handle multiple loops at a time. // -// TODO: Should AddrMode::BaseGV be changed to a ConstantExpr +// TODO: Should TargetLowering::AddrMode::BaseGV be changed to a ConstantExpr // instead of a GlobalValue? // // TODO: When truncation is free, truncate ICmp users' operands to make it a @@ -67,7 +67,6 @@ #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Local.h" -#include "llvm/TargetTransformInfo.h" #include "llvm/ADT/SmallBitVector.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/DenseSet.h" @@ -75,6 +74,7 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/ValueHandle.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetLowering.h" #include <algorithm> using namespace llvm; @@ -1118,7 +1118,7 @@ public: enum KindType { Basic, ///< A normal use, with no folding. Special, ///< A special case of basic, allowing -1 scales. - Address, ///< An address use; folding according to ScalarTargetTransformInfo. + Address, ///< An address use; folding according to TargetLowering ICmpZero ///< An equality icmp with both operands folded into one. // TODO: Add a generic icmp too? }; @@ -1272,12 +1272,12 @@ void LSRUse::dump() const { /// address-mode folding and special icmp tricks. static bool isLegalUse(const AddrMode &AM, LSRUse::KindType Kind, Type *AccessTy, - const ScalarTargetTransformInfo *STTI) { + const TargetLowering *TLI) { switch (Kind) { case LSRUse::Address: // If we have low-level target information, ask the target if it can // completely fold this address. - if (STTI) return STTI->isLegalAddressingMode(AM, AccessTy); + if (TLI) return TLI->isLegalAddressingMode(AM, AccessTy); // Otherwise, just guess that reg+reg addressing is legal. return !AM.BaseGV && AM.BaseOffs == 0 && AM.Scale <= 1; @@ -1300,7 +1300,7 @@ static bool isLegalUse(const AddrMode &AM, // If we have low-level target information, ask the target if it can fold an // integer immediate on an icmp. 
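Both the DCE hunk above and the LoopUnswitch hunk later in this patch swap a hand-rolled scan-and-erase loop for the standard erase-remove idiom; a self-contained illustration (int standing in for Instruction*):

#include <algorithm>
#include <vector>

// Removes every occurrence of I in one O(n) pass: std::remove compacts the
// surviving elements and returns the new logical end; erase drops the tail.
void removeFromWorklist(std::vector<int> &WorkList, int I) {
  WorkList.erase(std::remove(WorkList.begin(), WorkList.end(), I),
                 WorkList.end());
}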
if (AM.BaseOffs != 0) { - if (!STTI) + if (!TLI) return false; // We have one of: // ICmpZero BaseReg + Offset => ICmp BaseReg, -Offset @@ -1309,7 +1309,7 @@ static bool isLegalUse(const AddrMode &AM, int64_t Offs = AM.BaseOffs; if (AM.Scale == 0) Offs = -(uint64_t)Offs; // The cast does the right thing with INT64_MIN. - return STTI->isLegalICmpImmediate(Offs); + return TLI->isLegalICmpImmediate(Offs); } // ICmpZero BaseReg + -1*ScaleReg => ICmp BaseReg, ScaleReg @@ -1330,20 +1330,20 @@ static bool isLegalUse(const AddrMode &AM, static bool isLegalUse(AddrMode AM, int64_t MinOffset, int64_t MaxOffset, LSRUse::KindType Kind, Type *AccessTy, - const ScalarTargetTransformInfo *LTTI) { + const TargetLowering *TLI) { // Check for overflow. if (((int64_t)((uint64_t)AM.BaseOffs + MinOffset) > AM.BaseOffs) != (MinOffset > 0)) return false; AM.BaseOffs = (uint64_t)AM.BaseOffs + MinOffset; - if (isLegalUse(AM, Kind, AccessTy, LTTI)) { + if (isLegalUse(AM, Kind, AccessTy, TLI)) { AM.BaseOffs = (uint64_t)AM.BaseOffs - MinOffset; // Check for overflow. if (((int64_t)((uint64_t)AM.BaseOffs + MaxOffset) > AM.BaseOffs) != (MaxOffset > 0)) return false; AM.BaseOffs = (uint64_t)AM.BaseOffs + MaxOffset; - return isLegalUse(AM, Kind, AccessTy, LTTI); + return isLegalUse(AM, Kind, AccessTy, TLI); } return false; } @@ -1352,7 +1352,7 @@ static bool isAlwaysFoldable(int64_t BaseOffs, GlobalValue *BaseGV, bool HasBaseReg, LSRUse::KindType Kind, Type *AccessTy, - const ScalarTargetTransformInfo *LTTI) { + const TargetLowering *TLI) { // Fast-path: zero is always foldable. if (BaseOffs == 0 && !BaseGV) return true; @@ -1371,14 +1371,14 @@ static bool isAlwaysFoldable(int64_t BaseOffs, AM.HasBaseReg = true; } - return isLegalUse(AM, Kind, AccessTy, LTTI); + return isLegalUse(AM, Kind, AccessTy, TLI); } static bool isAlwaysFoldable(const SCEV *S, int64_t MinOffset, int64_t MaxOffset, bool HasBaseReg, LSRUse::KindType Kind, Type *AccessTy, - const ScalarTargetTransformInfo *LTTI, + const TargetLowering *TLI, ScalarEvolution &SE) { // Fast-path: zero is always foldable. if (S->isZero()) return true; @@ -1402,7 +1402,7 @@ static bool isAlwaysFoldable(const SCEV *S, AM.HasBaseReg = HasBaseReg; AM.Scale = Kind == LSRUse::ICmpZero ? -1 : 1; - return isLegalUse(AM, MinOffset, MaxOffset, Kind, AccessTy, LTTI); + return isLegalUse(AM, MinOffset, MaxOffset, Kind, AccessTy, TLI); } namespace { @@ -1502,7 +1502,7 @@ class LSRInstance { ScalarEvolution &SE; DominatorTree &DT; LoopInfo &LI; - const ScalarTargetTransformInfo *const STTI; + const TargetLowering *const TLI; Loop *const L; bool Changed; @@ -1638,7 +1638,7 @@ class LSRInstance { Pass *P); public: - LSRInstance(const ScalarTargetTransformInfo *ltti, Loop *l, Pass *P); + LSRInstance(const TargetLowering *tli, Loop *l, Pass *P); bool getChanged() const { return Changed; } @@ -1688,10 +1688,11 @@ void LSRInstance::OptimizeShadowIV() { } if (!DestTy) continue; - if (STTI) { + if (TLI) { // If target does not support DestTy natively then do not apply // this transformation. - if (!STTI->isTypeLegal(DestTy)) continue; + EVT DVT = TLI->getValueType(DestTy); + if (!TLI->isTypeLegal(DVT)) continue; } PHINode *PH = dyn_cast<PHINode>(ShadowUse->getOperand(0)); @@ -2014,18 +2015,18 @@ LSRInstance::OptimizeLoopTermCond() { if (C->getValue().getMinSignedBits() >= 64 || C->getValue().isMinSignedValue()) goto decline_post_inc; - // Without STTI, assume that any stride might be valid, and so any + // Without TLI, assume that any stride might be valid, and so any // use might be shared. 
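For reference, the AddrMode structure queried throughout these LSR hunks describes a candidate addressing expression of the form BaseGV + BaseOffs + BaseReg + Scale*ScaleReg. A sketch of such a legality query, assuming the TargetLowering API of this period (TLI and AccessTy as in the surrounding code):

AddrMode AM;            // models "[BaseReg + 4*ScaleReg + 8]"
AM.HasBaseReg = true;
AM.Scale = 4;
AM.BaseOffs = 8;
bool Folds = TLI->isLegalAddressingMode(AM, AccessTy);
// Typically true on x86 (scaled-index addressing); false on targets that
// only support a plain base register plus a small immediate offset.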
- if (!STTI) + if (!TLI) goto decline_post_inc; // Check for possible scaled-address reuse. Type *AccessTy = getAccessType(UI->getUser()); AddrMode AM; AM.Scale = C->getSExtValue(); - if (STTI->isLegalAddressingMode(AM, AccessTy)) + if (TLI->isLegalAddressingMode(AM, AccessTy)) goto decline_post_inc; AM.Scale = -AM.Scale; - if (STTI->isLegalAddressingMode(AM, AccessTy)) + if (TLI->isLegalAddressingMode(AM, AccessTy)) goto decline_post_inc; } } @@ -2096,12 +2097,12 @@ LSRInstance::reconcileNewOffset(LSRUse &LU, int64_t NewOffset, bool HasBaseReg, // Conservatively assume HasBaseReg is true for now. if (NewOffset < LU.MinOffset) { if (!isAlwaysFoldable(LU.MaxOffset - NewOffset, 0, HasBaseReg, - Kind, AccessTy, STTI)) + Kind, AccessTy, TLI)) return false; NewMinOffset = NewOffset; } else if (NewOffset > LU.MaxOffset) { if (!isAlwaysFoldable(NewOffset - LU.MinOffset, 0, HasBaseReg, - Kind, AccessTy, STTI)) + Kind, AccessTy, TLI)) return false; NewMaxOffset = NewOffset; } @@ -2130,7 +2131,7 @@ LSRInstance::getUse(const SCEV *&Expr, int64_t Offset = ExtractImmediate(Expr, SE); // Basic uses can't accept any offset, for example. - if (!isAlwaysFoldable(Offset, 0, /*HasBaseReg=*/true, Kind, AccessTy, STTI)) { + if (!isAlwaysFoldable(Offset, 0, /*HasBaseReg=*/true, Kind, AccessTy, TLI)) { Expr = Copy; Offset = 0; } @@ -2395,7 +2396,7 @@ bool IVChain::isProfitableIncrement(const SCEV *OperExpr, /// TODO: Consider IVInc free if it's already used in another chains. static bool isProfitableChain(IVChain &Chain, SmallPtrSet<Instruction*, 4> &Users, - ScalarEvolution &SE, const ScalarTargetTransformInfo *STTI) { + ScalarEvolution &SE, const TargetLowering *TLI) { if (StressIVChain) return true; @@ -2653,7 +2654,7 @@ void LSRInstance::CollectChains() { for (unsigned UsersIdx = 0, NChains = IVChainVec.size(); UsersIdx < NChains; ++UsersIdx) { if (!isProfitableChain(IVChainVec[UsersIdx], - ChainUsersVec[UsersIdx].FarUsers, SE, STTI)) + ChainUsersVec[UsersIdx].FarUsers, SE, TLI)) continue; // Preserve the chain at UsesIdx. if (ChainIdx != UsersIdx) @@ -2680,8 +2681,7 @@ void LSRInstance::FinalizeChain(IVChain &Chain) { /// Return true if the IVInc can be folded into an addressing mode. static bool canFoldIVIncExpr(const SCEV *IncExpr, Instruction *UserInst, - Value *Operand, - const ScalarTargetTransformInfo *STTI) { + Value *Operand, const TargetLowering *TLI) { const SCEVConstant *IncConst = dyn_cast<SCEVConstant>(IncExpr); if (!IncConst || !isAddressUse(UserInst, Operand)) return false; @@ -2691,7 +2691,7 @@ static bool canFoldIVIncExpr(const SCEV *IncExpr, Instruction *UserInst, int64_t IncOffset = IncConst->getValue()->getSExtValue(); if (!isAlwaysFoldable(IncOffset, /*BaseGV=*/0, /*HaseBaseReg=*/false, - LSRUse::Address, getAccessType(UserInst), STTI)) + LSRUse::Address, getAccessType(UserInst), TLI)) return false; return true; @@ -2762,7 +2762,7 @@ void LSRInstance::GenerateIVChain(const IVChain &Chain, SCEVExpander &Rewriter, // If an IV increment can't be folded, use it as the next IV value. if (!canFoldIVIncExpr(LeftOverExpr, IncI->UserInst, IncI->IVOperand, - STTI)) { + TLI)) { assert(IVTy == IVOper->getType() && "inconsistent IV increment type"); IVSrc = IVOper; LeftOverExpr = 0; @@ -3108,7 +3108,7 @@ void LSRInstance::GenerateReassociations(LSRUse &LU, unsigned LUIdx, // into an immediate field. if (isAlwaysFoldable(*J, LU.MinOffset, LU.MaxOffset, Base.getNumRegs() > 1, - LU.Kind, LU.AccessTy, STTI, SE)) + LU.Kind, LU.AccessTy, TLI, SE)) continue; // Collect all operands except *J. 
@@ -3122,7 +3122,7 @@ void LSRInstance::GenerateReassociations(LSRUse &LU, unsigned LUIdx, if (InnerAddOps.size() == 1 && isAlwaysFoldable(InnerAddOps[0], LU.MinOffset, LU.MaxOffset, Base.getNumRegs() > 1, - LU.Kind, LU.AccessTy, STTI, SE)) + LU.Kind, LU.AccessTy, TLI, SE)) continue; const SCEV *InnerSum = SE.getAddExpr(InnerAddOps); @@ -3132,9 +3132,9 @@ void LSRInstance::GenerateReassociations(LSRUse &LU, unsigned LUIdx, // Add the remaining pieces of the add back into the new formula. const SCEVConstant *InnerSumSC = dyn_cast<SCEVConstant>(InnerSum); - if (STTI && InnerSumSC && + if (TLI && InnerSumSC && SE.getTypeSizeInBits(InnerSumSC->getType()) <= 64 && - STTI->isLegalAddImmediate((uint64_t)F.UnfoldedOffset + + TLI->isLegalAddImmediate((uint64_t)F.UnfoldedOffset + InnerSumSC->getValue()->getZExtValue())) { F.UnfoldedOffset = (uint64_t)F.UnfoldedOffset + InnerSumSC->getValue()->getZExtValue(); @@ -3144,8 +3144,8 @@ void LSRInstance::GenerateReassociations(LSRUse &LU, unsigned LUIdx, // Add J as its own register, or an unfolded immediate. const SCEVConstant *SC = dyn_cast<SCEVConstant>(*J); - if (STTI && SC && SE.getTypeSizeInBits(SC->getType()) <= 64 && - STTI->isLegalAddImmediate((uint64_t)F.UnfoldedOffset + + if (TLI && SC && SE.getTypeSizeInBits(SC->getType()) <= 64 && + TLI->isLegalAddImmediate((uint64_t)F.UnfoldedOffset + SC->getValue()->getZExtValue())) F.UnfoldedOffset = (uint64_t)F.UnfoldedOffset + SC->getValue()->getZExtValue(); @@ -3205,7 +3205,7 @@ void LSRInstance::GenerateSymbolicOffsets(LSRUse &LU, unsigned LUIdx, Formula F = Base; F.AM.BaseGV = GV; if (!isLegalUse(F.AM, LU.MinOffset, LU.MaxOffset, - LU.Kind, LU.AccessTy, STTI)) + LU.Kind, LU.AccessTy, TLI)) continue; F.BaseRegs[i] = G; (void)InsertFormula(LU, LUIdx, F); @@ -3230,7 +3230,7 @@ void LSRInstance::GenerateConstantOffsets(LSRUse &LU, unsigned LUIdx, Formula F = Base; F.AM.BaseOffs = (uint64_t)Base.AM.BaseOffs - *I; if (isLegalUse(F.AM, LU.MinOffset - *I, LU.MaxOffset - *I, - LU.Kind, LU.AccessTy, STTI)) { + LU.Kind, LU.AccessTy, TLI)) { // Add the offset to the base register. const SCEV *NewG = SE.getAddExpr(SE.getConstant(G->getType(), *I), G); // If it cancelled out, drop the base register, otherwise update it. @@ -3250,7 +3250,7 @@ void LSRInstance::GenerateConstantOffsets(LSRUse &LU, unsigned LUIdx, Formula F = Base; F.AM.BaseOffs = (uint64_t)F.AM.BaseOffs + Imm; if (!isLegalUse(F.AM, LU.MinOffset, LU.MaxOffset, - LU.Kind, LU.AccessTy, STTI)) + LU.Kind, LU.AccessTy, TLI)) continue; F.BaseRegs[i] = G; (void)InsertFormula(LU, LUIdx, F); @@ -3297,7 +3297,7 @@ void LSRInstance::GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx, F.AM.BaseOffs = NewBaseOffs; // Check that this scale is legal. - if (!isLegalUse(F.AM, Offset, Offset, LU.Kind, LU.AccessTy, STTI)) + if (!isLegalUse(F.AM, Offset, Offset, LU.Kind, LU.AccessTy, TLI)) continue; // Compensate for the use having MinOffset built into it. @@ -3353,12 +3353,12 @@ void LSRInstance::GenerateScales(LSRUse &LU, unsigned LUIdx, Formula Base) { Base.AM.HasBaseReg = Base.BaseRegs.size() > 1; // Check whether this scale is going to be legal. if (!isLegalUse(Base.AM, LU.MinOffset, LU.MaxOffset, - LU.Kind, LU.AccessTy, STTI)) { + LU.Kind, LU.AccessTy, TLI)) { // As a special-case, handle special out-of-loop Basic users specially. // TODO: Reconsider this special case. 
if (LU.Kind == LSRUse::Basic && isLegalUse(Base.AM, LU.MinOffset, LU.MaxOffset, - LSRUse::Special, LU.AccessTy, STTI) && + LSRUse::Special, LU.AccessTy, TLI) && LU.AllFixupsOutsideLoop) LU.Kind = LSRUse::Special; else @@ -3391,8 +3391,8 @@ void LSRInstance::GenerateScales(LSRUse &LU, unsigned LUIdx, Formula Base) { /// GenerateTruncates - Generate reuse formulae from different IV types. void LSRInstance::GenerateTruncates(LSRUse &LU, unsigned LUIdx, Formula Base) { - // This requires ScalarTargetTransformInfo to tell us which truncates are free. - if (!STTI) return; + // This requires TargetLowering to tell us which truncates are free. + if (!TLI) return; // Don't bother truncating symbolic values. if (Base.AM.BaseGV) return; @@ -3405,7 +3405,7 @@ void LSRInstance::GenerateTruncates(LSRUse &LU, unsigned LUIdx, Formula Base) { for (SmallSetVector<Type *, 4>::const_iterator I = Types.begin(), E = Types.end(); I != E; ++I) { Type *SrcTy = *I; - if (SrcTy != DstTy && STTI->isTruncateFree(SrcTy, DstTy)) { + if (SrcTy != DstTy && TLI->isTruncateFree(SrcTy, DstTy)) { Formula F = Base; if (F.ScaledReg) F.ScaledReg = SE.getAnyExtendExpr(F.ScaledReg, *I); @@ -3561,7 +3561,7 @@ void LSRInstance::GenerateCrossUseConstantOffsets() { Formula NewF = F; NewF.AM.BaseOffs = Offs; if (!isLegalUse(NewF.AM, LU.MinOffset, LU.MaxOffset, - LU.Kind, LU.AccessTy, STTI)) + LU.Kind, LU.AccessTy, TLI)) continue; NewF.ScaledReg = SE.getAddExpr(NegImmS, NewF.ScaledReg); @@ -3586,9 +3586,9 @@ void LSRInstance::GenerateCrossUseConstantOffsets() { Formula NewF = F; NewF.AM.BaseOffs = (uint64_t)NewF.AM.BaseOffs + Imm; if (!isLegalUse(NewF.AM, LU.MinOffset, LU.MaxOffset, - LU.Kind, LU.AccessTy, STTI)) { - if (!STTI || - !STTI->isLegalAddImmediate((uint64_t)NewF.UnfoldedOffset + Imm)) + LU.Kind, LU.AccessTy, TLI)) { + if (!TLI || + !TLI->isLegalAddImmediate((uint64_t)NewF.UnfoldedOffset + Imm)) continue; NewF = F; NewF.UnfoldedOffset = (uint64_t)NewF.UnfoldedOffset + Imm; @@ -3900,7 +3900,7 @@ void LSRInstance::NarrowSearchSpaceByCollapsingUnrolledCode() { Formula &F = LUThatHas->Formulae[i]; if (!isLegalUse(F.AM, LUThatHas->MinOffset, LUThatHas->MaxOffset, - LUThatHas->Kind, LUThatHas->AccessTy, STTI)) { + LUThatHas->Kind, LUThatHas->AccessTy, TLI)) { DEBUG(dbgs() << " Deleting "; F.print(dbgs()); dbgs() << '\n'); LUThatHas->DeleteFormula(F); @@ -4589,12 +4589,12 @@ LSRInstance::ImplementSolution(const SmallVectorImpl<const Formula *> &Solution, Changed |= DeleteTriviallyDeadInstructions(DeadInsts); } -LSRInstance::LSRInstance(const ScalarTargetTransformInfo *stti, Loop *l, Pass *P) +LSRInstance::LSRInstance(const TargetLowering *tli, Loop *l, Pass *P) : IU(P->getAnalysis<IVUsers>()), SE(P->getAnalysis<ScalarEvolution>()), DT(P->getAnalysis<DominatorTree>()), LI(P->getAnalysis<LoopInfo>()), - STTI(stti), L(l), Changed(false), IVIncInsertPos(0) { + TLI(tli), L(l), Changed(false), IVIncInsertPos(0) { // If LoopSimplify form is not available, stay out of trouble. 
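The constructor and factory changes above restore the older construction scheme, in which each target hands its TargetLowering to the pass when the codegen pipeline is assembled; a sketch of the assumed call sites (shape of the API in this era, not quoted from the patch):

// In a target's pipeline setup:
PM.add(createLoopStrengthReducePass(getTargetLowering()));
// Without a TargetLowering the pass still runs, but falls back to
// conservative guesses about which addressing modes fold:
PM.add(createLoopStrengthReducePass());   // TLI == 0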
if (!L->isLoopSimplifyForm()) @@ -4684,7 +4684,7 @@ LSRInstance::LSRInstance(const ScalarTargetTransformInfo *stti, Loop *l, Pass *P for (SmallVectorImpl<Formula>::const_iterator J = LU.Formulae.begin(), JE = LU.Formulae.end(); J != JE; ++J) assert(isLegalUse(J->AM, LU.MinOffset, LU.MaxOffset, - LU.Kind, LU.AccessTy, STTI) && + LU.Kind, LU.AccessTy, TLI) && "Illegal formula generated!"); }; #endif @@ -4757,13 +4757,13 @@ void LSRInstance::dump() const { namespace { class LoopStrengthReduce : public LoopPass { - /// ScalarTargetTransformInfo provides target information that is needed - /// for strength reducing loops. - const ScalarTargetTransformInfo *STTI; + /// TLI - Keep a pointer of a TargetLowering to consult for determining + /// transformation profitability. + const TargetLowering *const TLI; public: static char ID; // Pass ID, replacement for typeid - LoopStrengthReduce(); + explicit LoopStrengthReduce(const TargetLowering *tli = 0); private: bool runOnLoop(Loop *L, LPPassManager &LPM); @@ -4783,12 +4783,13 @@ INITIALIZE_PASS_DEPENDENCY(LoopSimplify) INITIALIZE_PASS_END(LoopStrengthReduce, "loop-reduce", "Loop Strength Reduction", false, false) -Pass *llvm::createLoopStrengthReducePass() { - return new LoopStrengthReduce(); + +Pass *llvm::createLoopStrengthReducePass(const TargetLowering *TLI) { + return new LoopStrengthReduce(TLI); } -LoopStrengthReduce::LoopStrengthReduce() - : LoopPass(ID), STTI(0) { +LoopStrengthReduce::LoopStrengthReduce(const TargetLowering *tli) + : LoopPass(ID), TLI(tli) { initializeLoopStrengthReducePass(*PassRegistry::getPassRegistry()); } @@ -4814,13 +4815,8 @@ void LoopStrengthReduce::getAnalysisUsage(AnalysisUsage &AU) const { bool LoopStrengthReduce::runOnLoop(Loop *L, LPPassManager & /*LPM*/) { bool Changed = false; - TargetTransformInfo *TTI = getAnalysisIfAvailable<TargetTransformInfo>(); - - if (TTI) - STTI = TTI->getScalarTargetTransformInfo(); - // Run the main LSR transformation. - Changed |= LSRInstance(STTI, L, this).getChanged(); + Changed |= LSRInstance(TLI, L, this).getChanged(); // Remove any extra phis created by processing inner loops. Changed |= DeleteDeadPHIs(L->getHeader()); @@ -4831,7 +4827,7 @@ bool LoopStrengthReduce::runOnLoop(Loop *L, LPPassManager & /*LPM*/) { Rewriter.setDebugType(DEBUG_TYPE); #endif unsigned numFolded = Rewriter. - replaceCongruentIVs(L, &getAnalysis<DominatorTree>(), DeadInsts, STTI); + replaceCongruentIVs(L, &getAnalysis<DominatorTree>(), DeadInsts, TLI); if (numFolded) { Changed = true; DeleteTriviallyDeadInstructions(DeadInsts); diff --git a/lib/Transforms/Scalar/LoopUnswitch.cpp b/lib/Transforms/Scalar/LoopUnswitch.cpp index 15b168fe2aa..047b43eb84f 100644 --- a/lib/Transforms/Scalar/LoopUnswitch.cpp +++ b/lib/Transforms/Scalar/LoopUnswitch.cpp @@ -907,13 +907,9 @@ void LoopUnswitch::UnswitchNontrivialCondition(Value *LIC, Constant *Val, /// specified. 
static void RemoveFromWorklist(Instruction *I, std::vector<Instruction*> &Worklist) { - std::vector<Instruction*>::iterator WI = std::find(Worklist.begin(), - Worklist.end(), I); - while (WI != Worklist.end()) { - unsigned Offset = WI-Worklist.begin(); - Worklist.erase(WI); - WI = std::find(Worklist.begin()+Offset, Worklist.end(), I); - } + + Worklist.erase(std::remove(Worklist.begin(), Worklist.end(), I), + Worklist.end()); } /// ReplaceUsesOfWith - When we find that I really equals V, remove I from the diff --git a/lib/Transforms/Scalar/SROA.cpp b/lib/Transforms/Scalar/SROA.cpp index 3e84a91c1db..71c62257e7f 100644 --- a/lib/Transforms/Scalar/SROA.cpp +++ b/lib/Transforms/Scalar/SROA.cpp @@ -132,9 +132,6 @@ public: /// /// We flag partitions as splittable when they are formed entirely due to /// accesses by trivially splittable operations such as memset and memcpy. - /// - /// FIXME: At some point we should consider loads and stores of FCAs to be - /// splittable and eagerly split them into scalar values. bool IsSplittable; /// \brief Test whether a partition has been marked as dead. @@ -1785,9 +1782,9 @@ static Value *getNaturalGEPWithType(IRBuilder<> &IRB, const DataLayout &TD, break; if (SequentialType *SeqTy = dyn_cast<SequentialType>(ElementTy)) { ElementTy = SeqTy->getElementType(); - Indices.push_back(IRB.getInt(APInt(TD.getPointerSizeInBits( - ElementTy->isPointerTy() ? - cast<PointerType>(ElementTy)->getAddressSpace(): 0), 0))); + // Note that we use the default address space as this index is over an + // array or a vector, not a pointer. + Indices.push_back(IRB.getInt(APInt(TD.getPointerSizeInBits(0), 0))); } else if (StructType *STy = dyn_cast<StructType>(ElementTy)) { if (STy->element_begin() == STy->element_end()) break; // Nothing left to descend into. @@ -1828,7 +1825,7 @@ static Value *getNaturalGEPRecursively(IRBuilder<> &IRB, const DataLayout &TD, if (ElementSizeInBits % 8) return 0; // GEPs over non-multiple of 8 size vector elements are invalid. APInt ElementSize(Offset.getBitWidth(), ElementSizeInBits / 8); - APInt NumSkippedElements = Offset.udiv(ElementSize); + APInt NumSkippedElements = Offset.sdiv(ElementSize); if (NumSkippedElements.ugt(VecTy->getNumElements())) return 0; Offset -= NumSkippedElements * ElementSize; @@ -1840,7 +1837,7 @@ static Value *getNaturalGEPRecursively(IRBuilder<> &IRB, const DataLayout &TD, if (ArrayType *ArrTy = dyn_cast<ArrayType>(Ty)) { Type *ElementTy = ArrTy->getElementType(); APInt ElementSize(Offset.getBitWidth(), TD.getTypeAllocSize(ElementTy)); - APInt NumSkippedElements = Offset.udiv(ElementSize); + APInt NumSkippedElements = Offset.sdiv(ElementSize); if (NumSkippedElements.ugt(ArrTy->getNumElements())) return 0; @@ -1896,7 +1893,7 @@ static Value *getNaturalGEPWithOffset(IRBuilder<> &IRB, const DataLayout &TD, APInt ElementSize(Offset.getBitWidth(), TD.getTypeAllocSize(ElementTy)); if (ElementSize == 0) return 0; // Zero-length arrays can't help us build a natural GEP. 
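On the udiv -> sdiv changes in these SROA hunks: the byte offset being divided can be negative by this point, and an unsigned divide would turn a small negative offset into an enormous element count instead of a small negative index. A short APInt demonstration (sketch):

#include "llvm/ADT/APInt.h"
using llvm::APInt;

void divisionDemo() {
  APInt Offset(64, uint64_t(-8), /*isSigned=*/true); // -8 bytes
  APInt ElementSize(64, 4);                          // 4-byte elements
  APInt S = Offset.sdiv(ElementSize); // -2: the intended element index
  APInt U = Offset.udiv(ElementSize); // 0x3FFFFFFFFFFFFFFE: far out of range
}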
- APInt NumSkippedElements = Offset.udiv(ElementSize); + APInt NumSkippedElements = Offset.sdiv(ElementSize); Offset -= NumSkippedElements * ElementSize; Indices.push_back(IRB.getInt(NumSkippedElements)); @@ -2211,6 +2208,48 @@ static bool isIntegerWideningViable(const DataLayout &TD, return WholeAllocaOp; } +static Value *extractInteger(const DataLayout &DL, IRBuilder<> &IRB, Value *V, + IntegerType *Ty, uint64_t Offset, + const Twine &Name) { + IntegerType *IntTy = cast<IntegerType>(V->getType()); + assert(DL.getTypeStoreSize(Ty) + Offset <= DL.getTypeStoreSize(IntTy) && + "Element extends past full value"); + uint64_t ShAmt = 8*Offset; + if (DL.isBigEndian()) + ShAmt = 8*(DL.getTypeStoreSize(IntTy) - DL.getTypeStoreSize(Ty) - Offset); + if (ShAmt) + V = IRB.CreateLShr(V, ShAmt, Name + ".shift"); + assert(Ty->getBitWidth() <= IntTy->getBitWidth() && + "Cannot extract to a larger integer!"); + if (Ty != IntTy) + V = IRB.CreateTrunc(V, Ty, Name + ".trunc"); + return V; +} + +static Value *insertInteger(const DataLayout &DL, IRBuilder<> &IRB, Value *Old, + Value *V, uint64_t Offset, const Twine &Name) { + IntegerType *IntTy = cast<IntegerType>(Old->getType()); + IntegerType *Ty = cast<IntegerType>(V->getType()); + assert(Ty->getBitWidth() <= IntTy->getBitWidth() && + "Cannot insert a larger integer!"); + if (Ty != IntTy) + V = IRB.CreateZExt(V, IntTy, Name + ".ext"); + assert(DL.getTypeStoreSize(Ty) + Offset <= DL.getTypeStoreSize(IntTy) && + "Element store outside of alloca store"); + uint64_t ShAmt = 8*Offset; + if (DL.isBigEndian()) + ShAmt = 8*(DL.getTypeStoreSize(IntTy) - DL.getTypeStoreSize(Ty) - Offset); + if (ShAmt) + V = IRB.CreateShl(V, ShAmt, Name + ".shift"); + + if (ShAmt || Ty->getBitWidth() < IntTy->getBitWidth()) { + APInt Mask = ~Ty->getMask().zext(IntTy->getBitWidth()).shl(ShAmt); + Old = IRB.CreateAnd(Old, Mask, Name + ".mask"); + V = IRB.CreateOr(Old, V, Name + ".insert"); + } + return V; +} + namespace { /// \brief Visitor to rewrite instructions using a partition of an alloca to /// use a new alloca. 
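The extractInteger/insertInteger helpers added above do all of their work with shift amounts derived from DataLayout store sizes, flipping the shift for big-endian layouts, and insertInteger masks out the destination bits before ORing in the new value. A self-contained sketch of the same bit arithmetic on fixed-width types, little-endian case only (the helper names here are mine, not the patch's):

    #include <cassert>
    #include <cstdint>

    // Extract a ByteWidth-byte field starting Offset bytes into Whole
    // (little-endian: lower byte offsets live in lower bits).
    static uint64_t extractField(uint64_t Whole, unsigned Offset,
                                 unsigned ByteWidth) {
      uint64_t Mask = (ByteWidth == 8) ? ~0ULL : ((1ULL << (8 * ByteWidth)) - 1);
      return (Whole >> (8 * Offset)) & Mask;
    }

    // Insert Field at the same position: clear the old bits, then OR in the
    // shifted new value -- the read-modify-write sequence SROA emits.
    static uint64_t insertField(uint64_t Whole, uint64_t Field, unsigned Offset,
                                unsigned ByteWidth) {
      uint64_t Mask = (ByteWidth == 8) ? ~0ULL : ((1ULL << (8 * ByteWidth)) - 1);
      Whole &= ~(Mask << (8 * Offset));
      return Whole | ((Field & Mask) << (8 * Offset));
    }

    int main() {
      uint64_t V = 0x1122334455667788ULL;
      assert(extractField(V, 2, 2) == 0x5566);
      assert(insertField(V, 0xAABB, 2, 2) == 0x11223344AABB7788ULL);
      return 0;
    }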
@@ -2371,60 +2410,6 @@ private: return IRB.getInt32(Index); } - Value *extractInteger(IRBuilder<> &IRB, IntegerType *TargetTy, - uint64_t Offset) { - assert(IntTy && "We cannot extract an integer from the alloca"); - Value *V = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), - getName(".load")); - V = convertValue(TD, IRB, V, IntTy); - assert(Offset >= NewAllocaBeginOffset && "Out of bounds offset"); - uint64_t RelOffset = Offset - NewAllocaBeginOffset; - assert(TD.getTypeStoreSize(TargetTy) + RelOffset <= - TD.getTypeStoreSize(IntTy) && - "Element load outside of alloca store"); - uint64_t ShAmt = 8*RelOffset; - if (TD.isBigEndian()) - ShAmt = 8*(TD.getTypeStoreSize(IntTy) - - TD.getTypeStoreSize(TargetTy) - RelOffset); - if (ShAmt) - V = IRB.CreateLShr(V, ShAmt, getName(".shift")); - assert(TargetTy->getBitWidth() <= IntTy->getBitWidth() && - "Cannot extract to a larger integer!"); - if (TargetTy != IntTy) - V = IRB.CreateTrunc(V, TargetTy, getName(".trunc")); - return V; - } - - StoreInst *insertInteger(IRBuilder<> &IRB, Value *V, uint64_t Offset) { - IntegerType *Ty = cast<IntegerType>(V->getType()); - assert(Ty->getBitWidth() <= IntTy->getBitWidth() && - "Cannot insert a larger integer!"); - if (Ty != IntTy) - V = IRB.CreateZExt(V, IntTy, getName(".ext")); - assert(Offset >= NewAllocaBeginOffset && "Out of bounds offset"); - uint64_t RelOffset = Offset - NewAllocaBeginOffset; - assert(TD.getTypeStoreSize(Ty) + RelOffset <= - TD.getTypeStoreSize(IntTy) && - "Element store outside of alloca store"); - uint64_t ShAmt = 8*RelOffset; - if (TD.isBigEndian()) - ShAmt = 8*(TD.getTypeStoreSize(IntTy) - TD.getTypeStoreSize(Ty) - - RelOffset); - if (ShAmt) - V = IRB.CreateShl(V, ShAmt, getName(".shift")); - - if (ShAmt || Ty->getBitWidth() < IntTy->getBitWidth()) { - APInt Mask = ~Ty->getMask().zext(IntTy->getBitWidth()).shl(ShAmt); - Value *Old = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), - getName(".oldload")); - Old = convertValue(TD, IRB, Old, IntTy); - Old = IRB.CreateAnd(Old, Mask, getName(".mask")); - V = IRB.CreateOr(Old, V, getName(".insert")); - } - V = convertValue(TD, IRB, V, NewAllocaTy); - return IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlignment()); - } - void deleteIfTriviallyDead(Value *V) { Instruction *I = cast<Instruction>(V); if (isInstructionTriviallyDead(I)) @@ -2452,12 +2437,18 @@ private: } bool rewriteIntegerLoad(IRBuilder<> &IRB, LoadInst &LI) { + assert(IntTy && "We cannot insert an integer to the alloca"); assert(!LI.isVolatile()); - Value *Result = extractInteger(IRB, cast<IntegerType>(LI.getType()), - BeginOffset); - LI.replaceAllUsesWith(Result); + Value *V = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), + getName(".load")); + V = convertValue(TD, IRB, V, IntTy); + assert(BeginOffset >= NewAllocaBeginOffset && "Out of bounds offset"); + uint64_t Offset = BeginOffset - NewAllocaBeginOffset; + V = extractInteger(TD, IRB, V, cast<IntegerType>(LI.getType()), Offset, + getName(".extract")); + LI.replaceAllUsesWith(V); Pass.DeadInsts.push_back(&LI); - DEBUG(dbgs() << " to: " << *Result << "\n"); + DEBUG(dbgs() << " to: " << *V << "\n"); return true; } @@ -2519,8 +2510,20 @@ private: } bool rewriteIntegerStore(IRBuilder<> &IRB, StoreInst &SI) { + assert(IntTy && "We cannot extract an integer from the alloca"); assert(!SI.isVolatile()); - StoreInst *Store = insertInteger(IRB, SI.getValueOperand(), BeginOffset); + Value *V = SI.getValueOperand(); + if (TD.getTypeSizeInBits(V->getType()) != IntTy->getBitWidth()) { + Value *Old = 
IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), + getName(".oldload")); + Old = convertValue(TD, IRB, Old, IntTy); + assert(BeginOffset >= NewAllocaBeginOffset && "Out of bounds offset"); + uint64_t Offset = BeginOffset - NewAllocaBeginOffset; + V = insertInteger(TD, IRB, Old, SI.getValueOperand(), Offset, + getName(".insert")); + } + V = convertValue(TD, IRB, V, NewAllocaTy); + StoreInst *Store = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlignment()); Pass.DeadInsts.push_back(&SI); (void)Store; DEBUG(dbgs() << " to: " << *Store << "\n"); @@ -2652,10 +2655,12 @@ private: if (IntTy && (BeginOffset > NewAllocaBeginOffset || EndOffset < NewAllocaEndOffset)) { assert(!II.isVolatile()); - StoreInst *Store = insertInteger(IRB, V, BeginOffset); - (void)Store; - DEBUG(dbgs() << " to: " << *Store << "\n"); - return true; + Value *Old = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), + getName(".oldload")); + Old = convertValue(TD, IRB, Old, IntTy); + assert(BeginOffset >= NewAllocaBeginOffset && "Out of bounds offset"); + uint64_t Offset = BeginOffset - NewAllocaBeginOffset; + V = insertInteger(TD, IRB, Old, V, Offset, getName(".insert")); } if (V->getType() != AllocaTy) @@ -2811,17 +2816,25 @@ private: getIndex(IRB, BeginOffset), getName(".copyextract")); } else if (IntTy && !IsWholeAlloca && !IsDest) { - Src = extractInteger(IRB, SubIntTy, BeginOffset); + Src = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), + getName(".load")); + Src = convertValue(TD, IRB, Src, IntTy); + assert(BeginOffset >= NewAllocaBeginOffset && "Out of bounds offset"); + uint64_t Offset = BeginOffset - NewAllocaBeginOffset; + Src = extractInteger(TD, IRB, Src, SubIntTy, Offset, getName(".extract")); } else { Src = IRB.CreateAlignedLoad(SrcPtr, Align, II.isVolatile(), getName(".copyload")); } if (IntTy && !IsWholeAlloca && IsDest) { - StoreInst *Store = insertInteger(IRB, Src, BeginOffset); - (void)Store; - DEBUG(dbgs() << " to: " << *Store << "\n"); - return true; + Value *Old = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), + getName(".oldload")); + Old = convertValue(TD, IRB, Old, IntTy); + assert(BeginOffset >= NewAllocaBeginOffset && "Out of bounds offset"); + uint64_t Offset = BeginOffset - NewAllocaBeginOffset; + Src = insertInteger(TD, IRB, Old, Src, Offset, getName(".insert")); + Src = convertValue(TD, IRB, Src, NewAllocaTy); } if (IsVectorElement && IsDest) { @@ -2875,10 +2888,7 @@ private: Value *NewPtr = getAdjustedAllocaPtr(PtrBuilder, OldPtr->getType()); // Replace the operands which were using the old pointer. - User::op_iterator OI = PN.op_begin(), OE = PN.op_end(); - for (; OI != OE; ++OI) - if (*OI == OldPtr) - *OI = NewPtr; + std::replace(PN.op_begin(), PN.op_end(), cast<Value>(OldPtr), NewPtr); DEBUG(dbgs() << " to: " << PN << "\n"); deleteIfTriviallyDead(OldPtr); @@ -3248,11 +3258,7 @@ static Type *getTypePartition(const DataLayout &TD, Type *Ty, } // Try to build up a sub-structure. 
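The PHI-operand hunk above replaces a hand-rolled operand walk with std::replace, which substitutes every matching element in a range; the cast<Value>(OldPtr) is only there so both value arguments have the same type. A tiny usage sketch:

    #include <algorithm>
    #include <cassert>
    #include <vector>

    int main() {
      std::vector<int> Ops = {4, 9, 4, 2};
      std::replace(Ops.begin(), Ops.end(), 4, 7); // every 4 becomes 7
      assert((Ops == std::vector<int>{7, 9, 7, 2}));
      return 0;
    }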
- SmallVector<Type *, 4> ElementTys; - do { - ElementTys.push_back(*EI++); - } while (EI != EE); - StructType *SubTy = StructType::get(STy->getContext(), ElementTys, + StructType *SubTy = StructType::get(STy->getContext(), makeArrayRef(EI, EE), STy->isPacked()); const StructLayout *SubSL = TD.getStructLayout(SubTy); if (Size != SubSL->getSizeInBytes()) diff --git a/lib/Transforms/Scalar/SimplifyLibCalls.cpp b/lib/Transforms/Scalar/SimplifyLibCalls.cpp index d86c4cbc9f6..c82a00fc2c8 100644 --- a/lib/Transforms/Scalar/SimplifyLibCalls.cpp +++ b/lib/Transforms/Scalar/SimplifyLibCalls.cpp @@ -135,47 +135,6 @@ static bool IsOnlyUsedInEqualityComparison(Value *V, Value *With) { namespace { //===---------------------------------------===// -// 'strcpy' Optimizations - -struct StrCpyOpt : public LibCallOptimization { - bool OptChkCall; // True if it's optimizing a __strcpy_chk libcall. - - StrCpyOpt(bool c) : OptChkCall(c) {} - - virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { - // Verify the "strcpy" function prototype. - unsigned NumParams = OptChkCall ? 3 : 2; - FunctionType *FT = Callee->getFunctionType(); - if (FT->getNumParams() != NumParams || - FT->getReturnType() != FT->getParamType(0) || - FT->getParamType(0) != FT->getParamType(1) || - FT->getParamType(0) != B.getInt8PtrTy()) - return 0; - - Value *Dst = CI->getArgOperand(0), *Src = CI->getArgOperand(1); - if (Dst == Src) // strcpy(x,x) -> x - return Src; - - // These optimizations require DataLayout. - if (!TD) return 0; - - // See if we can get the length of the input string. - uint64_t Len = GetStringLength(Src); - if (Len == 0) return 0; - - // We have enough information to now generate the memcpy call to do the - // concatenation for us. Make a memcpy to copy the nul byte with align = 1. - if (!OptChkCall || - !EmitMemCpyChk(Dst, Src, - ConstantInt::get(TD->getIntPtrType(*Context), Len), - CI->getArgOperand(2), B, TD, TLI)) - B.CreateMemCpy(Dst, Src, - ConstantInt::get(TD->getIntPtrType(*Context), Len), 1); - return Dst; - } -}; - -//===---------------------------------------===// // 'stpcpy' Optimizations struct StpCpyOpt: public LibCallOptimization { @@ -813,8 +772,8 @@ struct FFSOpt : public LibCallOptimization { // Constant fold. if (ConstantInt *CI = dyn_cast<ConstantInt>(Op)) { - if (CI->getValue() == 0) // ffs(0) -> 0. - return Constant::getNullValue(CI->getType()); + if (CI->isZero()) // ffs(0) -> 0. + return B.getInt32(0); // ffs(c) -> cttz(c)+1 return B.getInt32(CI->getValue().countTrailingZeros() + 1); } @@ -1275,7 +1234,6 @@ namespace { StringMap<LibCallOptimization*> Optimizations; // String and Memory LibCall Optimizations - StrCpyOpt StrCpy; StrCpyOpt StrCpyChk; StpCpyOpt StpCpy; StpCpyOpt StpCpyChk; StrNCpyOpt StrNCpy; StrLenOpt StrLen; StrPBrkOpt StrPBrk; @@ -1295,8 +1253,7 @@ namespace { bool Modified; // This is only used by doInitialization. public: static char ID; // Pass identification - SimplifyLibCalls() : FunctionPass(ID), StrCpy(false), StrCpyChk(true), - StpCpy(false), StpCpyChk(true), + SimplifyLibCalls() : FunctionPass(ID), StpCpy(false), StpCpyChk(true), UnaryDoubleFP(false), UnsafeUnaryDoubleFP(true) { initializeSimplifyLibCallsPass(*PassRegistry::getPassRegistry()); } @@ -1348,7 +1305,6 @@ void SimplifyLibCalls::AddOpt(LibFunc::Func F1, LibFunc::Func F2, /// we know. 
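The FFSOpt hunk above constant-folds ffs(c) to cttz(c)+1 and now materializes a proper i32 zero for ffs(0). The same identity in plain C++, using GCC/Clang's __builtin_ctz (an assumption of this sketch, not something the patch relies on):

    #include <cassert>

    // ffs(x) is one plus the index of the least significant set bit, or 0
    // when x == 0 -- exactly cttz(x) + 1 for nonzero x.
    static int ffsViaCttz(unsigned x) {
      return x == 0 ? 0 : __builtin_ctz(x) + 1;
    }

    int main() {
      assert(ffsViaCttz(0) == 0);  // ffs(0) -> 0
      assert(ffsViaCttz(1) == 1);  // bit 0 set
      assert(ffsViaCttz(8) == 4);  // bit 3 set -> 3 + 1
      return 0;
    }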
void SimplifyLibCalls::InitOptimizations() { // String and Memory LibCall Optimizations - Optimizations["strcpy"] = &StrCpy; Optimizations["strncpy"] = &StrNCpy; Optimizations["stpcpy"] = &StpCpy; Optimizations["strlen"] = &StrLen; @@ -1369,7 +1325,6 @@ void SimplifyLibCalls::InitOptimizations() { AddOpt(LibFunc::memset, &MemSet); // _chk variants of String and Memory LibCall Optimizations. - Optimizations["__strcpy_chk"] = &StrCpyChk; Optimizations["__stpcpy_chk"] = &StpCpyChk; // Math Library Optimizations diff --git a/lib/Transforms/Scalar/TailRecursionElimination.cpp b/lib/Transforms/Scalar/TailRecursionElimination.cpp index 6557d630a94..28c278f0362 100644 --- a/lib/Transforms/Scalar/TailRecursionElimination.cpp +++ b/lib/Transforms/Scalar/TailRecursionElimination.cpp @@ -69,6 +69,7 @@ #include "llvm/Support/CFG.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Support/ValueHandle.h" #include "llvm/ADT/Statistic.h" #include "llvm/ADT/STLExtras.h" using namespace llvm; @@ -117,34 +118,45 @@ FunctionPass *llvm::createTailCallEliminationPass() { return new TailCallElim(); } -/// AllocaMightEscapeToCalls - Return true if this alloca may be accessed by -/// callees of this function. We only do very simple analysis right now, this -/// could be expanded in the future to use mod/ref information for particular -/// call sites if desired. -static bool AllocaMightEscapeToCalls(AllocaInst *AI) { - // FIXME: do simple 'address taken' analysis. - return true; +/// CanTRE - Scan the specified basic block for alloca instructions. +/// If it contains any that are variable-sized or not in the entry block, +/// returns false. +static bool CanTRE(AllocaInst *AI) { + // Because of PR962, we don't TRE allocas outside the entry block. + + // If this alloca is in the body of the function, or if it is a variable + // sized allocation, we cannot tail call eliminate calls marked 'tail' + // with this mechanism. + BasicBlock *BB = AI->getParent(); + return BB == &BB->getParent()->getEntryBlock() && + isa<ConstantInt>(AI->getArraySize()); } -/// CheckForEscapingAllocas - Scan the specified basic block for alloca -/// instructions. If it contains any that might be accessed by calls, return -/// true. -static bool CheckForEscapingAllocas(BasicBlock *BB, - bool &CannotTCETailMarkedCall) { - bool RetVal = false; - for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) - if (AllocaInst *AI = dyn_cast<AllocaInst>(I)) { - RetVal |= AllocaMightEscapeToCalls(AI); - - // If this alloca is in the body of the function, or if it is a variable - // sized allocation, we cannot tail call eliminate calls marked 'tail' - // with this mechanism. 
- if (BB != &BB->getParent()->getEntryBlock() || - !isa<ConstantInt>(AI->getArraySize())) - CannotTCETailMarkedCall = true; - } - return RetVal; -} +struct AllocaCaptureTracker : public CaptureTracker { + AllocaCaptureTracker() : Captured(false) {} + + void tooManyUses() { Captured = true; } + + bool shouldExplore(Use *U) { + Value *V = U->getUser(); + if (isa<CallInst>(V) || isa<InvokeInst>(V)) + UsesAlloca.push_back(V); + + return true; + } + + bool captured(Use *U) { + if (isa<ReturnInst>(U->getUser())) + return false; + + Captured = true; + return true; + } + + SmallVector<WeakVH, 64> UsesAlloca; + + bool Captured; +}; bool TailCallElim::runOnFunction(Function &F) { // If this function is a varargs function, we won't be able to PHI the args @@ -157,38 +169,34 @@ bool TailCallElim::runOnFunction(Function &F) { bool MadeChange = false; bool FunctionContainsEscapingAllocas = false; - // CannotTCETailMarkedCall - If true, we cannot perform TCE on tail calls + // CanTRETailMarkedCall - If false, we cannot perform TRE on tail calls // marked with the 'tail' attribute, because doing so would cause the stack - // size to increase (real TCE would deallocate variable sized allocas, TCE + // size to increase (real TRE would deallocate variable sized allocas, TRE // doesn't). - bool CannotTCETailMarkedCall = false; - - // Loop over the function, looking for any returning blocks, and keeping track - // of whether this function has any non-trivially used allocas. - for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) { - if (FunctionContainsEscapingAllocas && CannotTCETailMarkedCall) - break; - - FunctionContainsEscapingAllocas |= - CheckForEscapingAllocas(BB, CannotTCETailMarkedCall); + bool CanTRETailMarkedCall = true; + + // Find calls that can be marked tail. + AllocaCaptureTracker ACT; + for (Function::iterator BB = F.begin(), EE = F.end(); BB != EE; ++BB) { + for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) { + if (AllocaInst *AI = dyn_cast<AllocaInst>(I)) { + CanTRETailMarkedCall &= CanTRE(AI); + PointerMayBeCaptured(AI, &ACT); + if (ACT.Captured) + return false; + } + } } - /// FIXME: The code generator produces really bad code when an 'escaping - /// alloca' is changed from being a static alloca to being a dynamic alloca. - /// Until this is resolved, disable this transformation if that would ever - /// happen. This bug is PR962. - if (FunctionContainsEscapingAllocas) - return false; - - // Second pass, change any tail calls to loops. + // Second pass, change any tail recursive calls to loops. for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) { if (ReturnInst *Ret = dyn_cast<ReturnInst>(BB->getTerminator())) { bool Change = ProcessReturningBlock(Ret, OldEntry, TailCallsAreMarkedTail, - ArgumentPHIs,CannotTCETailMarkedCall); + ArgumentPHIs, !CanTRETailMarkedCall); if (!Change && BB->getFirstNonPHIOrDbg() == Ret) Change = FoldReturnAndProcessPred(BB, Ret, OldEntry, TailCallsAreMarkedTail, ArgumentPHIs, - CannotTCETailMarkedCall); + !CanTRETailMarkedCall); MadeChange |= Change; } } @@ -210,21 +218,24 @@ bool TailCallElim::runOnFunction(Function &F) { } } - // Finally, if this function contains no non-escaping allocas, or calls - // setjmp, mark all calls in the function as eligible for tail calls - //(there is no stack memory for them to access). 
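The AllocaCaptureTracker above refines the old all-or-nothing escape check: instead of giving up whenever any alloca might be visible to a callee, it records exactly which call or invoke instructions see an alloca's address, and only those calls lose tail-call eligibility. At the source level the distinction looks roughly like this (a sketch; the function names are invented for illustration):

    void takePtr(int *P);  // hypothetical callee that receives an address
    int pureCall(int X);   // hypothetical callee that never sees the stack

    int example(int X) {
      int Local = X;
      takePtr(&Local);     // &Local escapes into this call: it must not be
                           // marked 'tail', or Local's stack slot could be
                           // reused while takePtr still holds the pointer.
      return pureCall(X);  // no alloca address reaches this call, so it
                           // remains eligible for the 'tail' marker.
    }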
+ // Finally, if this function contains no non-escaping allocas and doesn't + // call setjmp, mark all calls in the function as eligible for tail calls + // (there is no stack memory for them to access). + std::sort(ACT.UsesAlloca.begin(), ACT.UsesAlloca.end()); + if (!FunctionContainsEscapingAllocas && !F.callsFunctionThatReturnsTwice()) for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) - if (CallInst *CI = dyn_cast<CallInst>(I)) { - CI->setTailCall(); - MadeChange = true; - } + if (CallInst *CI = dyn_cast<CallInst>(I)) + if (!std::binary_search(ACT.UsesAlloca.begin(), ACT.UsesAlloca.end(), + CI)) { + CI->setTailCall(); + MadeChange = true; + } return MadeChange; } - /// CanMoveAboveCall - Return true if it is safe to move the specified /// instruction from after the call to before the call, assuming that all /// instructions between the call and this instruction are movable. diff --git a/lib/Transforms/Utils/BasicBlockUtils.cpp b/lib/Transforms/Utils/BasicBlockUtils.cpp index edeebe1b79a..9fea11391a1 100644 --- a/lib/Transforms/Utils/BasicBlockUtils.cpp +++ b/lib/Transforms/Utils/BasicBlockUtils.cpp @@ -687,3 +687,42 @@ ReturnInst *llvm::FoldReturnIntoUncondBranch(ReturnInst *RI, BasicBlock *BB, return cast<ReturnInst>(NewRet); } +/// SplitBlockAndInsertIfThen - Split the containing block at the +/// specified instruction - everything before and including Cmp stays +/// in the old basic block, and everything after Cmp is moved to a +/// new block. The two blocks are connected by a conditional branch +/// (with value of Cmp being the condition). +/// Before: +/// Head +/// Cmp +/// Tail +/// After: +/// Head +/// Cmp +/// if (Cmp) +/// ThenBlock +/// Tail +/// +/// If Unreachable is true, then ThenBlock ends with +/// UnreachableInst, otherwise it branches to Tail. +/// Returns the NewBasicBlock's terminator. 
+ +TerminatorInst *llvm::SplitBlockAndInsertIfThen(Instruction *Cmp, + bool Unreachable, MDNode *BranchWeights) { + Instruction *SplitBefore = Cmp->getNextNode(); + BasicBlock *Head = SplitBefore->getParent(); + BasicBlock *Tail = Head->splitBasicBlock(SplitBefore); + TerminatorInst *HeadOldTerm = Head->getTerminator(); + LLVMContext &C = Head->getContext(); + BasicBlock *ThenBlock = BasicBlock::Create(C, "", Head->getParent(), Tail); + TerminatorInst *CheckTerm; + if (Unreachable) + CheckTerm = new UnreachableInst(C, ThenBlock); + else + CheckTerm = BranchInst::Create(Tail, ThenBlock); + BranchInst *HeadNewTerm = + BranchInst::Create(/*ifTrue*/ThenBlock, /*ifFalse*/Tail, Cmp); + HeadNewTerm->setMetadata(LLVMContext::MD_prof, BranchWeights); + ReplaceInstWithInst(HeadOldTerm, HeadNewTerm); + return CheckTerm; +} diff --git a/lib/Transforms/Utils/LowerInvoke.cpp b/lib/Transforms/Utils/LowerInvoke.cpp index f35cbbdde5e..930555424de 100644 --- a/lib/Transforms/Utils/LowerInvoke.cpp +++ b/lib/Transforms/Utils/LowerInvoke.cpp @@ -45,10 +45,10 @@ #include "llvm/Pass.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Local.h" -#include "llvm/TargetTransformInfo.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/Support/CommandLine.h" +#include "llvm/Target/TargetLowering.h" #include <csetjmp> #include <set> using namespace llvm; @@ -70,14 +70,15 @@ namespace { Constant *SetJmpFn, *LongJmpFn, *StackSaveFn, *StackRestoreFn; bool useExpensiveEHSupport; - // We peek in STTI to grab the target's jmp_buf size and alignment - const ScalarTargetTransformInfo *STTI; + // We peek in TLI to grab the target's jmp_buf size and alignment + const TargetLowering *TLI; public: static char ID; // Pass identification, replacement for typeid - explicit LowerInvoke(bool useExpensiveEHSupport = ExpensiveEHSupport) + explicit LowerInvoke(const TargetLowering *tli = NULL, + bool useExpensiveEHSupport = ExpensiveEHSupport) : FunctionPass(ID), useExpensiveEHSupport(useExpensiveEHSupport), - STTI(0) { + TLI(tli) { initializeLowerInvokePass(*PassRegistry::getPassRegistry()); } bool doInitialization(Module &M); @@ -107,24 +108,21 @@ INITIALIZE_PASS(LowerInvoke, "lowerinvoke", char &llvm::LowerInvokePassID = LowerInvoke::ID; // Public Interface To the LowerInvoke pass. -FunctionPass *llvm::createLowerInvokePass() { - return new LowerInvoke(ExpensiveEHSupport); +FunctionPass *llvm::createLowerInvokePass(const TargetLowering *TLI) { + return new LowerInvoke(TLI, ExpensiveEHSupport); } -FunctionPass *llvm::createLowerInvokePass(bool useExpensiveEHSupport) { - return new LowerInvoke(useExpensiveEHSupport); +FunctionPass *llvm::createLowerInvokePass(const TargetLowering *TLI, + bool useExpensiveEHSupport) { + return new LowerInvoke(TLI, useExpensiveEHSupport); } // doInitialization - Make sure that there is a prototype for abort in the // current module. bool LowerInvoke::doInitialization(Module &M) { - TargetTransformInfo *TTI = getAnalysisIfAvailable<TargetTransformInfo>(); - if (TTI) - STTI = TTI->getScalarTargetTransformInfo(); - Type *VoidPtrTy = Type::getInt8PtrTy(M.getContext()); if (useExpensiveEHSupport) { // Insert a type for the linked list of jump buffers. - unsigned JBSize = STTI ? STTI->getJumpBufSize() : 0; + unsigned JBSize = TLI ? TLI->getJumpBufSize() : 0; JBSize = JBSize ? 
JBSize : 200; Type *JmpBufTy = ArrayType::get(VoidPtrTy, JBSize); @@ -432,7 +430,7 @@ bool LowerInvoke::insertExpensiveEHSupport(Function &F) { // Create an alloca for the incoming jump buffer ptr and the new jump buffer // that needs to be restored on all exits from the function. This is an // alloca because the value needs to be live across invokes. - unsigned Align = STTI ? STTI->getJumpBufAlignment() : 0; + unsigned Align = TLI ? TLI->getJumpBufAlignment() : 0; AllocaInst *JmpBuf = new AllocaInst(JBLinkTy, 0, Align, "jblink", F.begin()->begin()); @@ -577,10 +575,6 @@ bool LowerInvoke::insertExpensiveEHSupport(Function &F) { } bool LowerInvoke::runOnFunction(Function &F) { - TargetTransformInfo *TTI = getAnalysisIfAvailable<TargetTransformInfo>(); - if (TTI) - STTI = TTI->getScalarTargetTransformInfo(); - if (useExpensiveEHSupport) return insertExpensiveEHSupport(F); else diff --git a/lib/Transforms/Utils/SimplifyLibCalls.cpp b/lib/Transforms/Utils/SimplifyLibCalls.cpp index bd28ec35273..b15acdff633 100644 --- a/lib/Transforms/Utils/SimplifyLibCalls.cpp +++ b/lib/Transforms/Utils/SimplifyLibCalls.cpp @@ -183,14 +183,30 @@ struct StrCpyChkOpt : public InstFortifiedLibCallOptimization { FT->getParamType(2) != TD->getIntPtrType(Context)) return 0; + Value *Dst = CI->getArgOperand(0), *Src = CI->getArgOperand(1); + if (Dst == Src) // __strcpy_chk(x,x) -> x + return Src; + // If a) we don't have any length information, or b) we know this will // fit then just lower to a plain st[rp]cpy. Otherwise we'll keep our // st[rp]cpy_chk call which may fail at runtime if the size is too long. // TODO: It might be nice to get a maximum length out of the possible // string lengths for varying. if (isFoldable(2, 1, true)) { - Value *Ret = EmitStrCpy(CI->getArgOperand(0), CI->getArgOperand(1), B, TD, - TLI, Name.substr(2, 6)); + Value *Ret = EmitStrCpy(Dst, Src, B, TD, TLI, Name.substr(2, 6)); return Ret; } else { + // Maybe we can still fold __strcpy_chk to __memcpy_chk. + uint64_t Len = GetStringLength(Src); + if (Len == 0) return 0; + + // This optimization requires DataLayout. + if (!TD) return 0; + + Value *Ret = + EmitMemCpyChk(Dst, Src, + ConstantInt::get(TD->getIntPtrType(Context), Len), + CI->getArgOperand(2), B, TD, TLI); return Ret; } return 0; @@ -497,6 +513,35 @@ struct StrNCmpOpt : public LibCallOptimization { } }; +struct StrCpyOpt : public LibCallOptimization { + virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { + // Verify the "strcpy" function prototype. + FunctionType *FT = Callee->getFunctionType(); + if (FT->getNumParams() != 2 || + FT->getReturnType() != FT->getParamType(0) || + FT->getParamType(0) != FT->getParamType(1) || + FT->getParamType(0) != B.getInt8PtrTy()) + return 0; + + Value *Dst = CI->getArgOperand(0), *Src = CI->getArgOperand(1); + if (Dst == Src) // strcpy(x,x) -> x + return Src; + + // These optimizations require DataLayout. + if (!TD) return 0; + + // See if we can get the length of the input string. + uint64_t Len = GetStringLength(Src); + if (Len == 0) return 0; + + // We have enough information to now generate the memcpy call to do the + // copy for us. Make a memcpy to copy the nul byte with align = 1. + B.CreateMemCpy(Dst, Src, + ConstantInt::get(TD->getIntPtrType(*Context), Len), 1); + return Dst; + } +}; + } // End anonymous namespace.
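Both the __strcpy_chk fallback above and the relocated StrCpyOpt rest on the same observation: once the source string's length is statically known, the copy (including the nul terminator) can become a fixed-size memcpy. In source terms the transformation looks like this (illustrative):

    #include <cstring>

    void before(char *Dst) {
      strcpy(Dst, "hello");     // source length is known: 5 chars + nul
    }

    void after(char *Dst) {
      memcpy(Dst, "hello", 6);  // same bytes, copies the nul explicitly,
                                // and the size is a compile-time constant
    }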
namespace llvm { @@ -520,6 +565,7 @@ class LibCallSimplifierImpl { StrRChrOpt StrRChr; StrCmpOpt StrCmp; StrNCmpOpt StrNCmp; + StrCpyOpt StrCpy; void initOptimizations(); public: @@ -540,14 +586,15 @@ void LibCallSimplifierImpl::initOptimizations() { Optimizations["__stpcpy_chk"] = &StrCpyChk; Optimizations["__strncpy_chk"] = &StrNCpyChk; Optimizations["__stpncpy_chk"] = &StrNCpyChk; - Optimizations["strcmp"] = &StrCmp; - Optimizations["strncmp"] = &StrNCmp; // String and memory library call optimizations. Optimizations["strcat"] = &StrCat; Optimizations["strncat"] = &StrNCat; Optimizations["strchr"] = &StrChr; Optimizations["strrchr"] = &StrRChr; + Optimizations["strcmp"] = &StrCmp; + Optimizations["strncmp"] = &StrNCmp; + Optimizations["strcpy"] = &StrCpy; } Value *LibCallSimplifierImpl::optimizeCall(CallInst *CI) { diff --git a/lib/Transforms/Vectorize/CMakeLists.txt b/lib/Transforms/Vectorize/CMakeLists.txt index 06cf1e4e532..e64034ab26b 100644 --- a/lib/Transforms/Vectorize/CMakeLists.txt +++ b/lib/Transforms/Vectorize/CMakeLists.txt @@ -1,6 +1,7 @@ add_llvm_library(LLVMVectorize BBVectorize.cpp Vectorize.cpp + LoopVectorize.cpp ) add_dependencies(LLVMVectorize intrinsics_gen) diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp new file mode 100644 index 00000000000..6fbf342d06c --- /dev/null +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -0,0 +1,1358 @@ +//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops +// and generates target-independent LLVM-IR. Legalization of the IR is done +// in the codegen. However, the vectorizer uses (will use) the codegen +// interfaces to generate IR that is likely to result in an optimal binary. +// +// The loop vectorizer combines consecutive loop iterations into a single +// 'wide' iteration. After this transformation the index is incremented +// by the SIMD vector width, and not by one. +// +// This pass has three parts: +// 1. The main loop pass that drives the different parts. +// 2. LoopVectorizationLegality - A helper class that checks for the legality +// of the vectorization. +// 3. SingleBlockLoopVectorizer - A helper class that performs the actual +// widening of instructions. +//===----------------------------------------------------------------------===// +// +// The reduction-variable vectorization is based on the paper: +// D. Nuzman and R. Henderson. Multi-platform Auto-vectorization. +// +// Variable uniformity checks are inspired by: +// Karrenberg, R. and Hack, S. Whole Function Vectorization. +// +// Other ideas/concepts are from: +// A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
+// +//===----------------------------------------------------------------------===// +#define LV_NAME "loop-vectorize" +#define DEBUG_TYPE LV_NAME +#include "llvm/Constants.h" +#include "llvm/DerivedTypes.h" +#include "llvm/Instructions.h" +#include "llvm/LLVMContext.h" +#include "llvm/Pass.h" +#include "llvm/Analysis/LoopPass.h" +#include "llvm/Value.h" +#include "llvm/Function.h" +#include "llvm/Analysis/Verifier.h" +#include "llvm/Module.h" +#include "llvm/Type.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/AliasSetTracker.h" +#include "llvm/Transforms/Scalar.h" +#include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/ScalarEvolutionExpressions.h" +#include "llvm/Analysis/ScalarEvolutionExpander.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/DataLayout.h" +#include "llvm/Transforms/Utils/Local.h" +#include <algorithm> +using namespace llvm; + +static cl::opt<unsigned> +DefaultVectorizationFactor("default-loop-vectorize-width", + cl::init(4), cl::Hidden, + cl::desc("Set the default loop vectorization width")); +namespace { + +// Forward declaration. +class LoopVectorizationLegality; + +/// SingleBlockLoopVectorizer vectorizes loops which contain only one basic +/// block to a specified vectorization factor (VF). +/// This class performs the widening of scalars into vectors, or multiple +/// scalars. This class also implements the following features: +/// * It inserts an epilogue loop for handling loops that don't have iteration +/// counts that are known to be a multiple of the vectorization factor. +/// * It handles the code generation for reduction variables. +/// * Scalarization (implementation using scalars) of un-vectorizable +/// instructions. +/// SingleBlockLoopVectorizer does not perform any vectorization-legality +/// checks, and relies on the caller to check for the different legality +/// aspects. The SingleBlockLoopVectorizer relies on the +/// LoopVectorizationLegality class to provide information about the induction +/// and reduction variables that were found to a given vectorization factor. +class SingleBlockLoopVectorizer { +public: + /// Ctor. + SingleBlockLoopVectorizer(Loop *Orig, ScalarEvolution *Se, LoopInfo *Li, + LPPassManager *Lpm, unsigned VecWidth): + OrigLoop(Orig), SE(Se), LI(Li), LPM(Lpm), VF(VecWidth), + Builder(Se->getContext()), Induction(0), OldInduction(0) { } + + // Perform the actual loop widening (vectorization). + void vectorize(LoopVectorizationLegality *Legal) { + ///Create a new empty loop. Unlink the old loop and connect the new one. + createEmptyLoop(Legal); + /// Widen each instruction in the old loop to a new one in the new loop. + /// Use the Legality module to find the induction and reduction variables. + vectorizeLoop(Legal); + // register the new loop. + cleanup(); + } + +private: + /// Create an empty loop, based on the loop ranges of the old loop. + void createEmptyLoop(LoopVectorizationLegality *Legal); + /// Copy and widen the instructions from the old loop. + void vectorizeLoop(LoopVectorizationLegality *Legal); + /// Insert the new loop to the loop hierarchy and pass manager. + void cleanup(); + + /// This instruction is un-vectorizable. Implement it as a sequence + /// of scalars. 
+ void scalarizeInstruction(Instruction *Instr); + + /// Create a broadcast instruction. This method generates a broadcast + /// instruction (shuffle) for loop invariant values and for the induction + /// value. If this is the induction variable then we extend it to N, N+1, ... + /// this is needed because each iteration in the loop corresponds to a SIMD + /// element. + Value *getBroadcastInstrs(Value *V); + + /// This is a helper function used by getBroadcastInstrs. It adds 0, 1, 2 .. + /// for each element in the vector. Starting from zero. + Value *getConsecutiveVector(Value* Val); + + /// When we go over instructions in the basic block we rely on previous + /// values within the current basic block or on loop invariant values. + /// When we widen (vectorize) values we place them in the map. If the values + /// are not within the map, they have to be loop invariant, so we simply + /// broadcast them into a vector. + Value *getVectorValue(Value *V); + + /// Get a uniform vector of constant integers. We use this to get + /// vectors of ones and zeros for the reduction code. + Constant* getUniformVector(unsigned Val, Type* ScalarTy); + + typedef DenseMap<Value*, Value*> ValueMap; + + /// The original loop. + Loop *OrigLoop; + // Scev analysis to use. + ScalarEvolution *SE; + // Loop Info. + LoopInfo *LI; + // Loop Pass Manager; + LPPassManager *LPM; + // The vectorization factor to use. + unsigned VF; + + // The builder that we use + IRBuilder<> Builder; + + // --- Vectorization state --- + + /// Middle Block between the vector and the scalar. + BasicBlock *LoopMiddleBlock; + ///The ExitBlock of the scalar loop. + BasicBlock *LoopExitBlock; + ///The vector loop body. + BasicBlock *LoopVectorBody; + ///The scalar loop body. + BasicBlock *LoopScalarBody; + ///The first bypass block. + BasicBlock *LoopBypassBlock; + + /// The new Induction variable which was added to the new block. + PHINode *Induction; + /// The induction variable of the old basic block. + PHINode *OldInduction; + // Maps scalars to widened vectors. + ValueMap WidenMap; +}; + +/// LoopVectorizationLegality checks if it is legal to vectorize a loop, and +/// to what vectorization factor. +/// This class does not look at the profitability of vectorization, only the +/// legality. This class has two main kinds of checks: +/// * Memory checks - The code in canVectorizeMemory checks if vectorization +/// will change the order of memory accesses in a way that will change the +/// correctness of the program. +/// * Scalars checks - The code in canVectorizeBlock checks for a number +/// of different conditions, such as the availability of a single induction +/// variable, that all types are supported and vectorize-able, etc. +/// This code reflects the capabilities of SingleBlockLoopVectorizer. +/// This class is also used by SingleBlockLoopVectorizer for identifying +/// induction variable and the different reduction variables. +class LoopVectorizationLegality { +public: + LoopVectorizationLegality(Loop *Lp, ScalarEvolution *Se, DataLayout *Dl): + TheLoop(Lp), SE(Se), DL(Dl), Induction(0) { } + + /// This represents the kinds of reductions that we support. + /// We use the enum values to hold the 'identity' value for + /// each operand. This value does not change the result if applied. + enum ReductionKind { + NoReduction = -1, /// Not a reduction. + IntegerAdd = 0, /// Sum of numbers. + IntegerMult = 1 /// Product of numbers. + }; + + /// This POD struct holds information about reduction variables. 
+ struct ReductionDescriptor { + // Default C'tor + ReductionDescriptor(): + StartValue(0), LoopExitInstr(0), Kind(NoReduction) {} + + // C'tor. + ReductionDescriptor(Value *Start, Instruction *Exit, ReductionKind K): + StartValue(Start), LoopExitInstr(Exit), Kind(K) {} + + // The starting value of the reduction. + // It does not have to be zero! + Value *StartValue; + // The instruction who's value is used outside the loop. + Instruction *LoopExitInstr; + // The kind of the reduction. + ReductionKind Kind; + }; + + /// ReductionList contains the reduction descriptors for all + /// of the reductions that were found in the loop. + typedef DenseMap<PHINode*, ReductionDescriptor> ReductionList; + + /// Returns the maximum vectorization factor that we *can* use to vectorize + /// this loop. This does not mean that it is profitable to vectorize this + /// loop, only that it is legal to do so. This may be a large number. We + /// can vectorize to any SIMD width below this number. + unsigned getLoopMaxVF(); + + /// Returns the Induction variable. + PHINode *getInduction() {return Induction;} + + /// Returns the reduction variables found in the loop. + ReductionList *getReductionVars() { return &Reductions; } + + /// Check if the pointer returned by this GEP is consecutive + /// when the index is vectorized. This happens when the last + /// index of the GEP is consecutive, like the induction variable. + /// This check allows us to vectorize A[idx] into a wide load/store. + bool isConsecutiveGep(Value *Ptr); + +private: + /// Check if a single basic block loop is vectorizable. + /// At this point we know that this is a loop with a constant trip count + /// and we only need to check individual instructions. + bool canVectorizeBlock(BasicBlock &BB); + + /// When we vectorize loops we may change the order in which + /// we read and write from memory. This method checks if it is + /// legal to vectorize the code, considering only memory constrains. + /// Returns true if BB is vectorizable + bool canVectorizeMemory(BasicBlock &BB); + + // Check if a pointer value is known to be disjoint. + // Example: Alloca, Global, NoAlias. + bool isIdentifiedSafeObject(Value* Val); + + /// Returns True, if 'Phi' is the kind of reduction variable for type + /// 'Kind'. If this is a reduction variable, it adds it to ReductionList. + bool AddReductionVar(PHINode *Phi, ReductionKind Kind); + /// Returns true if the instruction I can be a reduction variable of type + /// 'Kind'. + bool isReductionInstr(Instruction *I, ReductionKind Kind); + /// Returns True, if 'Phi' is an induction variable. + bool isInductionVariable(PHINode *Phi); + + /// The loop that we evaluate. + Loop *TheLoop; + /// Scev analysis. + ScalarEvolution *SE; + /// DataLayout analysis. + DataLayout *DL; + + // --- vectorization state --- // + + /// Holds the induction variable. + PHINode *Induction; + /// Holds the reduction variables. + ReductionList Reductions; + /// Allowed outside users. This holds the reduction + /// vars which can be accessed from outside the loop. + SmallPtrSet<Value*, 4> AllowedExit; +}; + +struct LoopVectorize : public LoopPass { + static char ID; // Pass identification, replacement for typeid + + LoopVectorize() : LoopPass(ID) { + initializeLoopVectorizePass(*PassRegistry::getPassRegistry()); + } + + ScalarEvolution *SE; + DataLayout *DL; + LoopInfo *LI; + + virtual bool runOnLoop(Loop *L, LPPassManager &LPM) { + // We only vectorize innermost loops. 
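The ReductionDescriptor/ReductionList machinery above captures scalar loops of the following shape; the fields map onto the source as annotated (names invented for illustration):

    int sumArray(const int *A, int N) {
      int Sum = 42;          // StartValue: any loop-invariant value, not
                             // necessarily the identity (0 for addition)
      for (int i = 0; i < N; ++i)
        Sum += A[i];         // the add is the LoopExitInstr: its value is
                             // the one consumed after the loop exits
      return Sum;            // Kind == IntegerAdd
    }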
+ if (!L->empty()) + return false; + + SE = &getAnalysis<ScalarEvolution>(); + DL = getAnalysisIfAvailable<DataLayout>(); + LI = &getAnalysis<LoopInfo>(); + + DEBUG(dbgs() << "LV: Checking a loop in \"" << + L->getHeader()->getParent()->getName() << "\"\n"); + + // Check if it is legal to vectorize the loop. + LoopVectorizationLegality LVL(L, SE, DL); + unsigned MaxVF = LVL.getLoopMaxVF(); + + // Check that we can vectorize this loop using the chosen vectorization + // width. + if (MaxVF < DefaultVectorizationFactor) { + DEBUG(dbgs() << "LV: non-vectorizable MaxVF ("<< MaxVF << ").\n"); + return false; + } + + DEBUG(dbgs() << "LV: Found a vectorizable loop ("<< MaxVF << ").\n"); + + // If we decided that it is *legal* to vectorize the loop then do it. + SingleBlockLoopVectorizer LB(L, SE, LI, &LPM, DefaultVectorizationFactor); + LB.vectorize(&LVL); + + DEBUG(verifyFunction(*L->getHeader()->getParent())); + return true; + } + + virtual void getAnalysisUsage(AnalysisUsage &AU) const { + LoopPass::getAnalysisUsage(AU); + AU.addRequiredID(LoopSimplifyID); + AU.addRequiredID(LCSSAID); + AU.addRequired<LoopInfo>(); + AU.addRequired<ScalarEvolution>(); + } + +}; + +Value *SingleBlockLoopVectorizer::getBroadcastInstrs(Value *V) { + // Instructions that access the old induction variable + // actually want to get the new one. + if (V == OldInduction) + V = Induction; + // Create the types. + LLVMContext &C = V->getContext(); + Type *VTy = VectorType::get(V->getType(), VF); + Type *I32 = IntegerType::getInt32Ty(C); + Constant *Zero = ConstantInt::get(I32, 0); + Value *Zeros = ConstantAggregateZero::get(VectorType::get(I32, VF)); + Value *UndefVal = UndefValue::get(VTy); + // Insert the value into a new vector. + Value *SingleElem = Builder.CreateInsertElement(UndefVal, V, Zero); + // Broadcast the scalar into all locations in the vector. + Value *Shuf = Builder.CreateShuffleVector(SingleElem, UndefVal, Zeros, + "broadcast"); + // We are accessing the induction variable. Make sure to promote the + // index for each consecutive SIMD lane. This adds 0,1,2 ... to all lanes. + if (V == Induction) + return getConsecutiveVector(Shuf); + return Shuf; +} + +Value *SingleBlockLoopVectorizer::getConsecutiveVector(Value* Val) { + assert(Val->getType()->isVectorTy() && "Must be a vector"); + assert(Val->getType()->getScalarType()->isIntegerTy() && + "Elem must be an integer"); + // Create the types. + Type *ITy = Val->getType()->getScalarType(); + VectorType *Ty = cast<VectorType>(Val->getType()); + unsigned VLen = Ty->getNumElements(); + SmallVector<Constant*, 8> Indices; + + // Create a vector of consecutive numbers from zero to VF. + for (unsigned i = 0; i < VLen; ++i) + Indices.push_back(ConstantInt::get(ITy, i)); + + // Add the consecutive indices to the vector value. + Constant *Cv = ConstantVector::get(Indices); + assert(Cv->getType() == Val->getType() && "Invalid consecutive vec"); + return Builder.CreateAdd(Val, Cv, "induction"); +} + +bool LoopVectorizationLegality::isConsecutiveGep(Value *Ptr) { + GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr); + if (!Gep) + return false; + + unsigned NumOperands = Gep->getNumOperands(); + Value *LastIndex = Gep->getOperand(NumOperands - 1); + + // Check that all of the gep indices are uniform except for the last. + for (unsigned i = 0; i < NumOperands - 1; ++i) + if (!SE->isLoopInvariant(SE->getSCEV(Gep->getOperand(i)), TheLoop)) + return false; + + // We can emit wide load/stores only if the last index is the induction + // variable.
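The consecutive-GEP rule stated above (and completed by the SCEV step check that follows) accepts only addresses whose final index advances by one per iteration while all other indices stay loop-invariant. Conceptually it separates these two access patterns (a sketch, not pass code):

    void patterns(int *Out, const int *In, const int *Idx, int N) {
      for (int i = 0; i < N; ++i) {
        Out[i] = In[i];        // last GEP index is the induction variable:
                               // consecutive, becomes one wide load/store
        Out[i] += In[Idx[i]];  // last index is data-dependent: a gather,
                               // so the access must be scalarized
      }
    }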
+ const SCEV *Last = SE->getSCEV(LastIndex); + if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Last)) { + const SCEV *Step = AR->getStepRecurrence(*SE); + + // The memory is consecutive because the last index is consecutive + // and all other indices are loop invariant. + if (Step->isOne()) + return true; + } + + return false; +} + +Value *SingleBlockLoopVectorizer::getVectorValue(Value *V) { + assert(!V->getType()->isVectorTy() && "Can't widen a vector"); + // If we saved a vectorized copy of V, use it. + Value *&MapEntry = WidenMap[V]; + if (MapEntry) + return MapEntry; + + // Broadcast V and save the value for future uses. + Value *B = getBroadcastInstrs(V); + MapEntry = B; + return B; +} + +Constant* +SingleBlockLoopVectorizer::getUniformVector(unsigned Val, Type* ScalarTy) { + SmallVector<Constant*, 8> Indices; + // Create a vector of consecutive numbers from zero to VF. + for (unsigned i = 0; i < VF; ++i) + Indices.push_back(ConstantInt::get(ScalarTy, Val)); + + // Add the consecutive indices to the vector value. + return ConstantVector::get(Indices); +} + +void SingleBlockLoopVectorizer::scalarizeInstruction(Instruction *Instr) { + assert(!Instr->getType()->isAggregateType() && "Can't handle vectors"); + // Holds vector parameters or scalars, in case of uniform vals. + SmallVector<Value*, 8> Params; + + // Find all of the vectorized parameters. + for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) { + Value *SrcOp = Instr->getOperand(op); + + // If we are accessing the old induction variable, use the new one. + if (SrcOp == OldInduction) { + Params.push_back(getBroadcastInstrs(Induction)); + continue; + } + + // Try using previously calculated values. + Instruction *SrcInst = dyn_cast<Instruction>(SrcOp); + + // If the src is an instruction that appeared earlier in the basic block + // then it should already be vectorized. + if (SrcInst && SrcInst->getParent() == Instr->getParent()) { + assert(WidenMap.count(SrcInst) && "Source operand is unavailable"); + // The parameter is a vector value from earlier. + Params.push_back(WidenMap[SrcInst]); + } else { + // The parameter is a scalar from outside the loop. Maybe even a constant. + Params.push_back(SrcOp); + } + } + + assert(Params.size() == Instr->getNumOperands() && + "Invalid number of operands"); + + // Does this instruction return a value? + bool IsVoidRetTy = Instr->getType()->isVoidTy(); + Value *VecResults = 0; + + // If we have a return value, create an empty vector. We place the scalarized + // instructions in this vector. + if (!IsVoidRetTy) + VecResults = UndefValue::get(VectorType::get(Instr->getType(), VF)); + + // For each scalar that we create: + for (unsigned i = 0; i < VF; ++i) { + Instruction *Cloned = Instr->clone(); + if (!IsVoidRetTy) + Cloned->setName(Instr->getName() + ".cloned"); + // Replace the operands of the cloned instructions with extracted scalars. + for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) { + Value *Op = Params[op]; + // Param is a vector. Need to extract the right lane. + if (Op->getType()->isVectorTy()) + Op = Builder.CreateExtractElement(Op, Builder.getInt32(i)); + Cloned->setOperand(op, Op); + } + + // Place the cloned scalar in the new loop. + Builder.Insert(Cloned); + + // If the original scalar returns a value we need to place it in a vector + // so that future users will be able to use it.
+ if (!IsVoidRetTy) + VecResults = Builder.CreateInsertElement(VecResults, Cloned, + Builder.getInt32(i)); + } + + if (!IsVoidRetTy) + WidenMap[Instr] = VecResults; +} + +void SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) { + /* + In this function we generate a new loop. The new loop will contain + the vectorized instructions while the old loop will continue to run the + scalar remainder. + + [ ] <-- vector loop bypass. + / | + / v +| [ ] <-- vector pre header. +| | +| v +| [ ] \ +| [ ]_| <-- vector loop. +| | + \ v + >[ ] <--- middle-block. + / | + / v +| [ ] <--- new preheader. +| | +| v +| [ ] \ +| [ ]_| <-- old scalar loop to handle remainder. + \ | + \ v + >[ ] <-- exit block. + ... + */ + + // This is the original scalar-loop preheader. + BasicBlock *BypassBlock = OrigLoop->getLoopPreheader(); + BasicBlock *ExitBlock = OrigLoop->getExitBlock(); + assert(ExitBlock && "Must have an exit block"); + + assert(OrigLoop->getNumBlocks() == 1 && "Invalid loop"); + assert(BypassBlock && "Invalid loop structure"); + + BasicBlock *VectorPH = + BypassBlock->splitBasicBlock(BypassBlock->getTerminator(), "vector.ph"); + BasicBlock *VecBody = VectorPH->splitBasicBlock(VectorPH->getTerminator(), + "vector.body"); + + BasicBlock *MiddleBlock = VecBody->splitBasicBlock(VecBody->getTerminator(), + "middle.block"); + BasicBlock *ScalarPH = + MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(), + "scalar.preheader"); + // Find the induction variable. + BasicBlock *OldBasicBlock = OrigLoop->getHeader(); + OldInduction = Legal->getInduction(); + assert(OldInduction && "We must have a single phi node."); + Type *IdxTy = OldInduction->getType(); + + // Use this IR builder to create the loop instructions (Phi, Br, Cmp) + // inside the loop. + Builder.SetInsertPoint(VecBody->getFirstInsertionPt()); + + // Generate the induction variable. + Induction = Builder.CreatePHI(IdxTy, 2, "index"); + Constant *Zero = ConstantInt::get(IdxTy, 0); + Constant *Step = ConstantInt::get(IdxTy, VF); + + // Find the loop boundaries. + const SCEV *ExitCount = SE->getExitCount(OrigLoop, OrigLoop->getHeader()); + assert(ExitCount != SE->getCouldNotCompute() && "Invalid loop count"); + + // Get the total trip count from the count by adding 1. + ExitCount = SE->getAddExpr(ExitCount, + SE->getConstant(ExitCount->getType(), 1)); + + // Expand the trip count and place the new instructions in the preheader. + // Notice that the pre-header does not change, only the loop body. + SCEVExpander Exp(*SE, "induction"); + Instruction *Loc = BypassBlock->getTerminator(); + + // We may need to extend the index in case there is a type mismatch. + // We know that the count starts at zero and does not overflow. + // We are using Zext because it should be less expensive. + if (ExitCount->getType() != Induction->getType()) + ExitCount = SE->getZeroExtendExpr(ExitCount, IdxTy); + + // Count holds the overall loop count (N). + Value *Count = Exp.expandCodeFor(ExitCount, Induction->getType(), Loc); + // Now we need to generate the expression for N - (N % VF), which is + // the part that the vectorized body will execute. + Constant *CIVF = ConstantInt::get(IdxTy, VF); + Value *R = BinaryOperator::CreateURem(Count, CIVF, "n.mod.vf", Loc); + Value *CountRoundDown = BinaryOperator::CreateSub(Count, R, "n.vec", Loc); + + // Now, compare the new count to zero. If it is zero, jump to the scalar part. 
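The trip-count arithmetic just computed (Count, n.mod.vf, n.vec) realizes the loop skeleton from the ASCII diagram above. Modeled directly in C++, assuming VF = 4 (a sketch of the emitted control flow, not pass code):

    void axpy(float *Y, const float *X, float A, unsigned N) {
      unsigned NVec = N - N % 4;       // "n.vec": what the vector body covers
      unsigned i = 0;
      if (NVec != 0)                   // "cmp.zero": bypass straight to the
        for (; i < NVec; i += 4)       // scalar loop when there is no work
          for (unsigned l = 0; l < 4; ++l)  // one wide iteration = 4 lanes
            Y[i + l] += A * X[i + l];
      for (; i < N; ++i)               // old scalar loop runs the remainder
        Y[i] += A * X[i];
    }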
+ Value *Cmp = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, + CountRoundDown, ConstantInt::getNullValue(IdxTy), + "cmp.zero", Loc); + BranchInst::Create(MiddleBlock, VectorPH, Cmp, Loc); + // Remove the old terminator. + Loc->eraseFromParent(); + + // Add a check in the middle block to see if we have completed + // all of the iterations in the first vector loop. + // If (N - N%VF) == N, then we *don't* need to run the remainder. + Value *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count, + CountRoundDown, "cmp.n", + MiddleBlock->getTerminator()); + + BranchInst::Create(ExitBlock, ScalarPH, CmpN, MiddleBlock->getTerminator()); + // Remove the old terminator. + MiddleBlock->getTerminator()->eraseFromParent(); + + // Create i+1 and fill the PHINode. + Value *NextIdx = Builder.CreateAdd(Induction, Step, "index.next"); + Induction->addIncoming(Zero, VectorPH); + Induction->addIncoming(NextIdx, VecBody); + // Create the compare. + Value *ICmp = Builder.CreateICmpEQ(NextIdx, CountRoundDown); + Builder.CreateCondBr(ICmp, MiddleBlock, VecBody); + + // Now we have two terminators. Remove the old one from the block. + VecBody->getTerminator()->eraseFromParent(); + + // Fix the scalar body iteration count. + unsigned BlockIdx = OldInduction->getBasicBlockIndex(ScalarPH); + OldInduction->setIncomingValue(BlockIdx, CountRoundDown); + + // Get ready to start creating new instructions into the vectorized body. + Builder.SetInsertPoint(VecBody->getFirstInsertionPt()); + + // Register the new loop. + Loop* Lp = new Loop(); + LPM->insertLoop(Lp, OrigLoop->getParentLoop()); + + Lp->addBasicBlockToLoop(VecBody, LI->getBase()); + + Loop *ParentLoop = OrigLoop->getParentLoop(); + if (ParentLoop) { + ParentLoop->addBasicBlockToLoop(ScalarPH, LI->getBase()); + ParentLoop->addBasicBlockToLoop(VectorPH, LI->getBase()); + ParentLoop->addBasicBlockToLoop(MiddleBlock, LI->getBase()); + } + + // Save the state. + LoopMiddleBlock = MiddleBlock; + LoopExitBlock = ExitBlock; + LoopVectorBody = VecBody; + LoopScalarBody = OldBasicBlock; + LoopBypassBlock = BypassBlock; +} + +void +SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { + typedef SmallVector<PHINode*, 4> PhiVector; + BasicBlock &BB = *OrigLoop->getHeader(); + Constant *Zero = ConstantInt::get( + IntegerType::getInt32Ty(BB.getContext()), 0); + + // In order to support reduction variables we need to be able to vectorize + // Phi nodes. Phi nodes have cycles, so we need to vectorize them in two + // stages. First, we create a new vector PHI node with no incoming edges. + // We use this value when we vectorize all of the instructions that use the + // PHI. Next, after all of the instructions in the block are complete we + // add the new incoming edges to the PHI. At this point all of the + // instructions in the basic block are vectorized, so we can use them to + // construct the PHI. + PhiVector PHIsToFix; + + // For each instruction in the old loop. + for (BasicBlock::iterator it = BB.begin(), e = BB.end(); it != e; ++it) { + Instruction *Inst = it; + + switch (Inst->getOpcode()) { + case Instruction::Br: + // Nothing to do for PHIs and BR, since we already took care of the + // loop control flow instructions. + continue; + case Instruction::PHI:{ + PHINode* P = cast<PHINode>(Inst); + // Special handling for the induction var. + if (OldInduction == Inst) + continue; + // This is phase one of vectorizing PHIs. + // This has to be a reduction variable.
+ assert(Legal->getReductionVars()->count(P) && "Not a Reduction"); + Type *VecTy = VectorType::get(Inst->getType(), VF); + WidenMap[Inst] = Builder.CreatePHI(VecTy, 2, "vec.phi"); + PHIsToFix.push_back(P); + continue; + } + case Instruction::Add: + case Instruction::FAdd: + case Instruction::Sub: + case Instruction::FSub: + case Instruction::Mul: + case Instruction::FMul: + case Instruction::UDiv: + case Instruction::SDiv: + case Instruction::FDiv: + case Instruction::URem: + case Instruction::SRem: + case Instruction::FRem: + case Instruction::Shl: + case Instruction::LShr: + case Instruction::AShr: + case Instruction::And: + case Instruction::Or: + case Instruction::Xor: { + // Just widen binops. + BinaryOperator *BinOp = dyn_cast<BinaryOperator>(Inst); + Value *A = getVectorValue(Inst->getOperand(0)); + Value *B = getVectorValue(Inst->getOperand(1)); + // Use this vector value for all users of the original instruction. + WidenMap[Inst] = Builder.CreateBinOp(BinOp->getOpcode(), A, B); + break; + } + case Instruction::Select: { + // Widen selects. + // If the selector is loop invariant we can create a select + // instruction with a scalar condition. Otherwise, use vector-select. + Value *Cond = Inst->getOperand(0); + bool InvariantCond = SE->isLoopInvariant(SE->getSCEV(Cond), OrigLoop); + + // The condition can be loop invariant but still defined inside the + // loop. This means that we can't just use the original 'cond' value. + // We have to take the 'vectorized' value and pick the first lane. + // Instcombine will make this a no-op. + Cond = getVectorValue(Cond); + if (InvariantCond) + Cond = Builder.CreateExtractElement(Cond, Builder.getInt32(0)); + + Value *Op0 = getVectorValue(Inst->getOperand(1)); + Value *Op1 = getVectorValue(Inst->getOperand(2)); + WidenMap[Inst] = Builder.CreateSelect(Cond, Op0, Op1); + break; + } + + case Instruction::ICmp: + case Instruction::FCmp: { + // Widen compares. Generate vector compares. + bool FCmp = (Inst->getOpcode() == Instruction::FCmp); + CmpInst *Cmp = dyn_cast<CmpInst>(Inst); + Value *A = getVectorValue(Inst->getOperand(0)); + Value *B = getVectorValue(Inst->getOperand(1)); + if (FCmp) + WidenMap[Inst] = Builder.CreateFCmp(Cmp->getPredicate(), A, B); + else + WidenMap[Inst] = Builder.CreateICmp(Cmp->getPredicate(), A, B); + break; + } + + case Instruction::Store: { + // Attempt to issue a wide store. + StoreInst *SI = dyn_cast<StoreInst>(Inst); + Type *StTy = VectorType::get(SI->getValueOperand()->getType(), VF); + Value *Ptr = SI->getPointerOperand(); + unsigned Alignment = SI->getAlignment(); + GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr); + // This store does not use GEPs. + if (!Legal->isConsecutiveGep(Gep)) { + scalarizeInstruction(Inst); + break; + } + + // The last index does not have to be the induction. It can be + // consecutive and be a function of the index. For example A[I+1]; + unsigned NumOperands = Gep->getNumOperands(); + Value *LastIndex = getVectorValue(Gep->getOperand(NumOperands -1)); + LastIndex = Builder.CreateExtractElement(LastIndex, Builder.getInt32(0)); + + // Create the new GEP with the new induction variable. + GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone()); + Gep2->setOperand(NumOperands - 1, LastIndex); + Ptr = Builder.Insert(Gep2); + Ptr = Builder.CreateBitCast(Ptr, StTy->getPointerTo()); + Value *Val = getVectorValue(SI->getValueOperand()); + Builder.CreateStore(Val, Ptr)->setAlignment(Alignment); + break; + } + case Instruction::Load: { + // Attempt to issue a wide load. 
+      LoadInst *LI = dyn_cast<LoadInst>(Inst);
+      Type *RetTy = VectorType::get(LI->getType(), VF);
+      Value *Ptr = LI->getPointerOperand();
+      unsigned Alignment = LI->getAlignment();
+      GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr);
+
+      // We don't have a gep. Scalarize the load.
+      if (!Legal->isConsecutiveGep(Gep)) {
+        scalarizeInstruction(Inst);
+        break;
+      }
+
+      // The last index does not have to be the induction. It can be
+      // consecutive and be a function of the index. For example A[I+1];
+      unsigned NumOperands = Gep->getNumOperands();
+      Value *LastIndex = getVectorValue(Gep->getOperand(NumOperands - 1));
+      LastIndex = Builder.CreateExtractElement(LastIndex, Builder.getInt32(0));
+
+      // Create the new GEP with the new induction variable.
+      GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone());
+      Gep2->setOperand(NumOperands - 1, LastIndex);
+      Ptr = Builder.Insert(Gep2);
+      Ptr = Builder.CreateBitCast(Ptr, RetTy->getPointerTo());
+      LI = Builder.CreateLoad(Ptr);
+      LI->setAlignment(Alignment);
+      // Use this vector value for all users of the load.
+      WidenMap[Inst] = LI;
+      break;
+    }
+    case Instruction::ZExt:
+    case Instruction::SExt:
+    case Instruction::FPToUI:
+    case Instruction::FPToSI:
+    case Instruction::FPExt:
+    case Instruction::PtrToInt:
+    case Instruction::IntToPtr:
+    case Instruction::SIToFP:
+    case Instruction::UIToFP:
+    case Instruction::Trunc:
+    case Instruction::FPTrunc:
+    case Instruction::BitCast: {
+      // Vectorize casts.
+      CastInst *CI = dyn_cast<CastInst>(Inst);
+      Value *A = getVectorValue(Inst->getOperand(0));
+      Type *DestTy = VectorType::get(CI->getType()->getScalarType(), VF);
+      WidenMap[Inst] = Builder.CreateCast(CI->getOpcode(), A, DestTy);
+      break;
+    }
+
+    default:
+      // All other instructions are unsupported. Scalarize them.
+      scalarizeInstruction(Inst);
+      break;
+    }// end of switch.
+  }// end of for_each instr.
+
+  // At this point every instruction in the original loop is widened to
+  // a vector form. We are almost done. Now, we need to fix the PHI nodes
+  // that we vectorized. The PHI nodes are currently empty because we did
+  // not want to introduce cycles. Notice that the remaining PHI nodes
+  // that we need to fix are reduction variables.
+
+  // Create the 'reduced' values for each of the reduction vars.
+  // The reduced values are the vector values that we scalarize and combine
+  // after the loop is finished.
+  for (PhiVector::iterator it = PHIsToFix.begin(), e = PHIsToFix.end();
+       it != e; ++it) {
+    PHINode *RdxPhi = *it;
+    PHINode *VecRdxPhi = dyn_cast<PHINode>(WidenMap[RdxPhi]);
+    assert(RdxPhi && "Unable to recover vectorized PHI");
+
+    // Find the reduction variable descriptor.
+    assert(Legal->getReductionVars()->count(RdxPhi) &&
+           "Unable to find the reduction variable");
+    LoopVectorizationLegality::ReductionDescriptor RdxDesc =
+      (*Legal->getReductionVars())[RdxPhi];
+
+    // We need to generate a reduction vector from the incoming scalar.
+    // To do so, we need to generate the 'identity' vector and override
+    // one of the elements with the incoming scalar reduction. We need
+    // to do it in the vector-loop preheader.
+    Builder.SetInsertPoint(LoopBypassBlock->getTerminator());
+
+    // This is the vector-clone of the value that leaves the loop.
+    Value *VectorExit = getVectorValue(RdxDesc.LoopExitInstr);
+    Type *VecTy = VectorExit->getType();
+
+    // Find the reduction identity variable. The value of the enum is the
+    // identity. Zero for addition, one for multiplication.
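+    // For example (illustrative, VF = 4), an integer add reduction with
+    // scalar start value %s conceptually uses
+    //   Identity    = <0, 0, 0, 0>
+    //   VectorStart = <%s, 0, 0, 0>
+    // while a multiply reduction would use 1 in place of 0.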
+ unsigned IdentitySclr = RdxDesc.Kind; + Constant *Identity = getUniformVector(IdentitySclr, + VecTy->getScalarType()); + + // This vector is the Identity vector where the first element is the + // incoming scalar reduction. + Value *VectorStart = Builder.CreateInsertElement(Identity, + RdxDesc.StartValue, Zero); + + + // Fix the vector-loop phi. + // We created the induction variable so we know that the + // preheader is the first entry. + BasicBlock *VecPreheader = Induction->getIncomingBlock(0); + + // Reductions do not have to start at zero. They can start with + // any loop invariant values. + VecRdxPhi->addIncoming(VectorStart, VecPreheader); + unsigned SelfEdgeIdx = (RdxPhi)->getBasicBlockIndex(LoopScalarBody); + Value *Val = getVectorValue(RdxPhi->getIncomingValue(SelfEdgeIdx)); + VecRdxPhi->addIncoming(Val, LoopVectorBody); + + // Before each round, move the insertion point right between + // the PHIs and the values we are going to write. + // This allows us to write both PHINodes and the extractelement + // instructions. + Builder.SetInsertPoint(LoopMiddleBlock->getFirstInsertionPt()); + + // This PHINode contains the vectorized reduction variable, or + // the initial value vector, if we bypass the vector loop. + PHINode *NewPhi = Builder.CreatePHI(VecTy, 2, "rdx.vec.exit.phi"); + NewPhi->addIncoming(VectorStart, LoopBypassBlock); + NewPhi->addIncoming(getVectorValue(RdxDesc.LoopExitInstr), LoopVectorBody); + + // Extract the first scalar. + Value *Scalar0 = + Builder.CreateExtractElement(NewPhi, Builder.getInt32(0)); + // Extract and sum the remaining vector elements. + for (unsigned i=1; i < VF; ++i) { + Value *Scalar1 = + Builder.CreateExtractElement(NewPhi, Builder.getInt32(i)); + if (RdxDesc.Kind == LoopVectorizationLegality::IntegerAdd) { + Scalar0 = Builder.CreateAdd(Scalar0, Scalar1); + } else { + Scalar0 = Builder.CreateMul(Scalar0, Scalar1); + } + } + + // Now, we need to fix the users of the reduction variable + // inside and outside of the scalar remainder loop. + // We know that the loop is in LCSSA form. We need to update the + // PHI nodes in the exit blocks. + for (BasicBlock::iterator LEI = LoopExitBlock->begin(), + LEE = LoopExitBlock->end(); LEI != LEE; ++LEI) { + PHINode *LCSSAPhi = dyn_cast<PHINode>(LEI); + if (!LCSSAPhi) continue; + + // All PHINodes need to have a single entry edge, or two if + // we already fixed them. + assert(LCSSAPhi->getNumIncomingValues() < 3 && "Invalid LCSSA PHI"); + + // We found our reduction value exit-PHI. Update it with the + // incoming bypass edge. + if (LCSSAPhi->getIncomingValue(0) == RdxDesc.LoopExitInstr) { + // Add an edge coming from the bypass. + LCSSAPhi->addIncoming(Scalar0, LoopMiddleBlock); + break; + } + }// end of the LCSSA phi scan. + + // Fix the scalar loop reduction variable with the incoming reduction sum + // from the vector body and from the backedge value. + int IncomingEdgeBlockIdx = (RdxPhi)->getBasicBlockIndex(LoopScalarBody); + int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1); // The other block. + (RdxPhi)->setIncomingValue(SelfEdgeBlockIdx, Scalar0); + (RdxPhi)->setIncomingValue(IncomingEdgeBlockIdx, RdxDesc.LoopExitInstr); + }// end of for each redux variable. +} + +void SingleBlockLoopVectorizer::cleanup() { + // The original basic block. + SE->forgetLoop(OrigLoop); +} + +unsigned LoopVectorizationLegality::getLoopMaxVF() { + if (!TheLoop->getLoopPreheader()) { + assert(false && "No preheader!!"); + DEBUG(dbgs() << "LV: Loop not normalized." 
+          << "\n");
+    return 1;
+  }
+
+  // We can only vectorize single basic block loops.
+  unsigned NumBlocks = TheLoop->getNumBlocks();
+  if (NumBlocks != 1) {
+    DEBUG(dbgs() << "LV: Too many blocks:" << NumBlocks << "\n");
+    return 1;
+  }
+
+  // We need to have a loop header.
+  BasicBlock *BB = TheLoop->getHeader();
+  DEBUG(dbgs() << "LV: Found a loop: " << BB->getName() << "\n");
+
+  // Go over each instruction and look at memory deps.
+  if (!canVectorizeBlock(*BB)) {
+    DEBUG(dbgs() << "LV: Can't vectorize this loop header\n");
+    return 1;
+  }
+
+  // ScalarEvolution needs to be able to find the exit count.
+  const SCEV *ExitCount = SE->getExitCount(TheLoop, BB);
+  if (ExitCount == SE->getCouldNotCompute()) {
+    DEBUG(dbgs() << "LV: SCEV could not compute the loop exit count.\n");
+    return 1;
+  }
+
+  DEBUG(dbgs() << "LV: We can vectorize this loop!\n");
+
+  // Okay! We can vectorize. At this point we don't have any other mem analysis
+  // which may limit our maximum vectorization factor, so just return the
+  // maximum SIMD size.
+  return DefaultVectorizationFactor;
+}
+
+bool LoopVectorizationLegality::canVectorizeBlock(BasicBlock &BB) {
+  // Scan the instructions in the block and look for hazards.
+  for (BasicBlock::iterator it = BB.begin(), e = BB.end(); it != e; ++it) {
+    Instruction *I = it;
+
+    PHINode *Phi = dyn_cast<PHINode>(I);
+    if (Phi) {
+      // This should not happen because the loop should be normalized.
+      if (Phi->getNumIncomingValues() != 2) {
+        DEBUG(dbgs() << "LV: Found an invalid PHI.\n");
+        return false;
+      }
+      // We only look at integer phi nodes.
+      if (!Phi->getType()->isIntegerTy()) {
+        DEBUG(dbgs() << "LV: Found a non-int PHI.\n");
+        return false;
+      }
+
+      if (isInductionVariable(Phi)) {
+        if (Induction) {
+          DEBUG(dbgs() << "LV: Found too many inductions."<< *Phi <<"\n");
+          return false;
+        }
+        DEBUG(dbgs() << "LV: Found the induction PHI."<< *Phi <<"\n");
+        Induction = Phi;
+        continue;
+      }
+      if (AddReductionVar(Phi, IntegerAdd)) {
+        DEBUG(dbgs() << "LV: Found an ADD reduction PHI."<< *Phi <<"\n");
+        continue;
+      }
+      if (AddReductionVar(Phi, IntegerMult)) {
+        DEBUG(dbgs() << "LV: Found a Mult reduction PHI."<< *Phi <<"\n");
+        continue;
+      }
+
+      DEBUG(dbgs() << "LV: Found an unidentified PHI."<< *Phi <<"\n");
+      return false;
+    }// end of PHI handling
+
+    // We still don't handle function calls.
+    CallInst *CI = dyn_cast<CallInst>(I);
+    if (CI) {
+      DEBUG(dbgs() << "LV: Found a call site:" <<
+            CI->getCalledFunction()->getName() << "\n");
+      return false;
+    }
+
+    // We do not re-vectorize vectors.
+    if (!VectorType::isValidElementType(I->getType()) &&
+        !I->getType()->isVoidTy()) {
+      DEBUG(dbgs() << "LV: Found unvectorizable type." << "\n");
+      return false;
+    }
+
+    // Reduction instructions are allowed to have exit users.
+    // All other instructions must not have external users.
+    if (!AllowedExit.count(I))
+      // Check that all of the users of the loop are inside the BB.
+      for (Value::use_iterator it = I->use_begin(), e = I->use_end();
+           it != e; ++it) {
+        Instruction *U = cast<Instruction>(*it);
+        // This user may be a reduction exit value.
+        BasicBlock *Parent = U->getParent();
+        if (Parent != &BB) {
+          DEBUG(dbgs() << "LV: Found an outside user for: "<< *U << "\n");
+          return false;
+        }
+      }
+  } // next instr.
+
+  if (!Induction) {
+    DEBUG(dbgs() << "LV: Did not find an induction var.\n");
+    return false;
+  }
+
+  // If the memory dependencies do not prevent us from
+  // vectorizing, then vectorize.
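+  // As an illustrative example, a C loop such as
+  //   for (i = 0; i < n; ++i) sum += A[i];
+  // (with 'sum' used after the loop) passes the checks above: 'i' is
+  // recognized as the induction PHI and 'sum' as an integer-add
+  // reduction, so the answer now depends only on the memory check below.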
+  return canVectorizeMemory(BB);
+}
+
+bool LoopVectorizationLegality::canVectorizeMemory(BasicBlock &BB) {
+  typedef SmallVector<Value*, 16> ValueVector;
+  typedef SmallPtrSet<Value*, 16> ValueSet;
+  // Holds the Load and Store *instructions*.
+  ValueVector Loads;
+  ValueVector Stores;
+
+  // Scan the BB and collect legal loads and stores.
+  for (BasicBlock::iterator it = BB.begin(), e = BB.end(); it != e; ++it) {
+    Instruction *I = it;
+
+    // If this is a load, save it. If this instruction can read from memory
+    // but is not a load, then we quit. Notice that we don't handle function
+    // calls that read or write.
+    if (I->mayReadFromMemory()) {
+      LoadInst *Ld = dyn_cast<LoadInst>(I);
+      if (!Ld) return false;
+      if (!Ld->isSimple()) {
+        DEBUG(dbgs() << "LV: Found a non-simple load.\n");
+        return false;
+      }
+      Loads.push_back(Ld);
+      continue;
+    }
+
+    // Save store instructions. Abort if other instructions write to memory.
+    if (I->mayWriteToMemory()) {
+      StoreInst *St = dyn_cast<StoreInst>(I);
+      if (!St) return false;
+      if (!St->isSimple()) {
+        DEBUG(dbgs() << "LV: Found a non-simple store.\n");
+        return false;
+      }
+      Stores.push_back(St);
+    }
+  } // next instr.
+
+  // Now we have two lists that hold the loads and the stores.
+  // Next, we find the pointers that they use.
+
+  // Check if we see any stores. If there are no stores, then we don't
+  // care if the pointers are *restrict*.
+  if (!Stores.size()) {
+    DEBUG(dbgs() << "LV: Found a read-only loop!\n");
+    return true;
+  }
+
+  // Holds the read and read-write *pointers* that we find.
+  ValueVector Reads;
+  ValueVector ReadWrites;
+
+  // Holds the analyzed pointers. We don't want to call GetUnderlyingObjects
+  // multiple times on the same object. If the ptr is accessed twice, once
+  // for read and once for write, it will only appear once (on the write
+  // list). This is okay, since we are going to check for conflicts between
+  // writes and between reads and writes, but not between reads and reads.
+  ValueSet Seen;
+
+  ValueVector::iterator I, IE;
+  for (I = Stores.begin(), IE = Stores.end(); I != IE; ++I) {
+    StoreInst *ST = dyn_cast<StoreInst>(*I);
+    assert(ST && "Bad StoreInst");
+    Value* Ptr = ST->getPointerOperand();
+    // If we did *not* see this pointer before, insert it into
+    // the read-write list. At this phase it is only a 'write' list.
+    if (Seen.insert(Ptr))
+      ReadWrites.push_back(Ptr);
+  }
+
+  for (I = Loads.begin(), IE = Loads.end(); I != IE; ++I) {
+    LoadInst *LD = dyn_cast<LoadInst>(*I);
+    assert(LD && "Bad LoadInst");
+    Value* Ptr = LD->getPointerOperand();
+    // If we did *not* see this pointer before, insert it into the
+    // read list. If we *did* see it before, then it is already in
+    // the read-write list. This allows us to vectorize expressions
+    // such as A[i] += x, because the address of A[i] is a read-write
+    // pointer. This only works if the index of A[i] is consecutive.
+    // If the address of i is unknown (for example A[B[i]]) then we may
+    // read a few words, modify, and write a few words, and some of the
+    // words may be written to the same address.
+    if (Seen.insert(Ptr) || !isConsecutiveGep(Ptr))
+      Reads.push_back(Ptr);
+  }
+
+  // Now that the pointers are in two lists (Reads and ReadWrites), we
+  // can check that there are no conflicts between each of the writes and
+  // between the writes and the reads.
+  ValueSet WriteObjects;
+  ValueVector TempObjects;
+
+  // Check that the read-writes do not conflict with other read-write
+  // pointers.
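+  // For instance (illustrative), given 'noalias' arguments A and B in
+  //   A[i] = B[i] + 1;
+  // A's pointer lands in ReadWrites and B's in Reads; both underlying
+  // objects are identified and distinct, so the loop is accepted. With
+  // arbitrary pointers the underlying objects cannot be identified as
+  // safe and the loop is rejected below.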
+  for (I = ReadWrites.begin(), IE = ReadWrites.end(); I != IE; ++I) {
+    GetUnderlyingObjects(*I, TempObjects, DL);
+    for (ValueVector::iterator it=TempObjects.begin(), e=TempObjects.end();
+         it != e; ++it) {
+      if (!isIdentifiedSafeObject(*it)) {
+        DEBUG(dbgs() << "LV: Found an unidentified write ptr:"<< **it <<"\n");
+        return false;
+      }
+      if (!WriteObjects.insert(*it)) {
+        DEBUG(dbgs() << "LV: Found a possible write-write reorder:"
+              << **it <<"\n");
+        return false;
+      }
+    }
+    TempObjects.clear();
+  }
+
+  // Check that the reads don't conflict with the read-writes.
+  for (I = Reads.begin(), IE = Reads.end(); I != IE; ++I) {
+    GetUnderlyingObjects(*I, TempObjects, DL);
+    for (ValueVector::iterator it=TempObjects.begin(), e=TempObjects.end();
+         it != e; ++it) {
+      if (!isIdentifiedSafeObject(*it)) {
+        DEBUG(dbgs() << "LV: Found an unidentified read ptr:"<< **it <<"\n");
+        return false;
+      }
+      if (WriteObjects.count(*it)) {
+        DEBUG(dbgs() << "LV: Found a possible read/write reorder:"
+              << **it <<"\n");
+        return false;
+      }
+    }
+    TempObjects.clear();
+  }
+
+  // All is okay.
+  return true;
+}
+
+/// Checks if the value is a global variable, an alloca, or an argument
+/// marked with the NoAlias attribute.
+bool LoopVectorizationLegality::isIdentifiedSafeObject(Value* Val) {
+  assert(Val && "Invalid value");
+  if (isa<GlobalValue>(Val))
+    return true;
+  if (isa<AllocaInst>(Val))
+    return true;
+  if (Argument *A = dyn_cast<Argument>(Val))
+    return A->hasNoAliasAttr();
+  return false;
+}
+
+bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi,
+                                                ReductionKind Kind) {
+  if (Phi->getNumIncomingValues() != 2)
+    return false;
+
+  // Find the possible incoming reduction variable.
+  BasicBlock *BB = Phi->getParent();
+  int SelfEdgeIdx = Phi->getBasicBlockIndex(BB);
+  int InEdgeBlockIdx = (SelfEdgeIdx ? 0 : 1); // The other entry.
+  Value *RdxStart = Phi->getIncomingValue(InEdgeBlockIdx);
+
+  // ExitInstruction is the single value which is used outside the loop.
+  // We only allow for a single reduction value to be used outside the loop.
+  // This includes users of the reduction variables (which form a cycle
+  // that ends in the phi node).
+  Instruction *ExitInstruction = 0;
+
+  // Iter is our iterator. We start with the PHI node and scan for all of the
+  // users of this instruction. All users must be instructions which can be
+  // used as reduction variables (such as ADD). We may have a single
+  // out-of-block user. The cycle must end with the original PHI.
+  // Also, we can't have multiple block-local users.
+  Instruction *Iter = Phi;
+  while (true) {
+    // Any reduction instr must be of one of the allowed kinds.
+    if (!isReductionInstr(Iter, Kind))
+      return false;
+
+    // Did we find a user inside this block?
+    bool FoundInBlockUser = false;
+    // Did we reach the initial PHI node?
+    bool FoundStartPHI = false;
+
+    // If the instruction has no users then this is a broken
+    // chain and can't be a reduction variable.
+    if (Iter->use_empty())
+      return false;
+
+    // For each of the *users* of iter.
+    for (Value::use_iterator it = Iter->use_begin(), e = Iter->use_end();
+         it != e; ++it) {
+      Instruction *U = cast<Instruction>(*it);
+      // We already know that the PHI is a user.
+      if (U == Phi) {
+        FoundStartPHI = true;
+        continue;
+      }
+      // Check if we found the exit user.
+      BasicBlock *Parent = U->getParent();
+      if (Parent != BB) {
+        // We must have a single exit instruction.
+        if (ExitInstruction != 0)
+          return false;
+        ExitInstruction = Iter;
+      }
+      // We can't have multiple inside users.
+ if (FoundInBlockUser) + return false; + FoundInBlockUser = true; + Iter = U; + } + + // We found a reduction var if we have reached the original + // phi node and we only have a single instruction with out-of-loop + // users. + if (FoundStartPHI && ExitInstruction) { + // This instruction is allowed to have out-of-loop users. + AllowedExit.insert(ExitInstruction); + + // Save the description of this reduction variable. + ReductionDescriptor RD(RdxStart, ExitInstruction, Kind); + Reductions[Phi] = RD; + return true; + } + } +} + +bool +LoopVectorizationLegality::isReductionInstr(Instruction *I, + ReductionKind Kind) { + switch (I->getOpcode()) { + default: + return false; + case Instruction::PHI: + // possibly. + return true; + case Instruction::Add: + case Instruction::Sub: + return Kind == IntegerAdd; + case Instruction::Mul: + case Instruction::UDiv: + case Instruction::SDiv: + return Kind == IntegerMult; + } +} + +bool LoopVectorizationLegality::isInductionVariable(PHINode *Phi) { + // Check that the PHI is consecutive and starts at zero. + const SCEV *PhiScev = SE->getSCEV(Phi); + const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PhiScev); + if (!AR) { + DEBUG(dbgs() << "LV: PHI is not a poly recurrence.\n"); + return false; + } + const SCEV *Step = AR->getStepRecurrence(*SE); + const SCEV *Start = AR->getStart(); + + if (!Step->isOne() || !Start->isZero()) { + DEBUG(dbgs() << "LV: PHI does not start at zero or steps by one.\n"); + return false; + } + return true; +} + +} // namespace + +char LoopVectorize::ID = 0; +static const char lv_name[] = "Loop Vectorization"; +INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false) +INITIALIZE_AG_DEPENDENCY(AliasAnalysis) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) +INITIALIZE_PASS_DEPENDENCY(LoopSimplify) +INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false) + +namespace llvm { + Pass *createLoopVectorizePass() { + return new LoopVectorize(); + } +} + diff --git a/lib/Transforms/Vectorize/Vectorize.cpp b/lib/Transforms/Vectorize/Vectorize.cpp index 1ef60029bcf..d26973a7b38 100644 --- a/lib/Transforms/Vectorize/Vectorize.cpp +++ b/lib/Transforms/Vectorize/Vectorize.cpp @@ -7,7 +7,7 @@ // //===----------------------------------------------------------------------===// // -// This file implements common infrastructure for libLLVMVectorizeOpts.a, which +// This file implements common infrastructure for libLLVMVectorizeOpts.a, which // implements several vectorization transformations over the LLVM intermediate // representation, including the C bindings for that library. // @@ -23,10 +23,11 @@ using namespace llvm; -/// initializeVectorizationPasses - Initialize all passes linked into the +/// initializeVectorizationPasses - Initialize all passes linked into the /// Vectorization library. void llvm::initializeVectorization(PassRegistry &Registry) { initializeBBVectorizePass(Registry); + initializeLoopVectorizePass(Registry); } void LLVMInitializeVectorization(LLVMPassRegistryRef R) { @@ -37,3 +38,6 @@ void LLVMAddBBVectorizePass(LLVMPassManagerRef PM) { unwrap(PM)->add(createBBVectorizePass()); } +void LLVMAddLoopVectorizePass(LLVMPassManagerRef PM) { + unwrap(PM)->add(createLoopVectorizePass()); +} diff --git a/lib/VMCore/DataLayout.cpp b/lib/VMCore/DataLayout.cpp index 3f75069a60a..e6994be257c 100644 --- a/lib/VMCore/DataLayout.cpp +++ b/lib/VMCore/DataLayout.cpp @@ -559,8 +559,10 @@ uint64_t DataLayout::getTypeSizeInBits(Type *Ty) const { // only 80 bits contain information. 
case Type::X86_FP80TyID: return 80; - case Type::VectorTyID: - return cast<VectorType>(Ty)->getBitWidth(); + case Type::VectorTyID: { + VectorType *VTy = cast<VectorType>(Ty); + return VTy->getNumElements()*getTypeSizeInBits(VTy->getElementType()); + } default: llvm_unreachable("DataLayout::getTypeSizeInBits(): Unsupported type"); } diff --git a/test/BugPoint/crash-narrowfunctiontest.ll b/test/BugPoint/crash-narrowfunctiontest.ll index d080d9dd4b0..9df823ab973 100644 --- a/test/BugPoint/crash-narrowfunctiontest.ll +++ b/test/BugPoint/crash-narrowfunctiontest.ll @@ -2,6 +2,7 @@ ; ; RUN: bugpoint -load %llvmshlibdir/BugpointPasses%shlibext %s -output-prefix %t -bugpoint-crashcalls -silence-passes > /dev/null ; REQUIRES: loadable_module +; XFAIL: lto define i32 @foo() { ret i32 1 } diff --git a/test/BugPoint/metadata.ll b/test/BugPoint/metadata.ll index 0eda5667ba4..98c79ee03a6 100644 --- a/test/BugPoint/metadata.ll +++ b/test/BugPoint/metadata.ll @@ -1,6 +1,7 @@ ; RUN: bugpoint -load %llvmshlibdir/BugpointPasses%shlibext %s -output-prefix %t -bugpoint-crashcalls -silence-passes > /dev/null ; RUN: llvm-dis %t-reduced-simplified.bc -o - | FileCheck %s ; REQUIRES: loadable_module +; XFAIL: lto ; Bugpoint should keep the call's metadata attached to the call. diff --git a/test/BugPoint/remove_arguments_test.ll b/test/BugPoint/remove_arguments_test.ll index 29a03b83107..13aa9c5a653 100644 --- a/test/BugPoint/remove_arguments_test.ll +++ b/test/BugPoint/remove_arguments_test.ll @@ -1,6 +1,7 @@ ; RUN: bugpoint -load %llvmshlibdir/BugpointPasses%shlibext %s -output-prefix %t -bugpoint-crashcalls -silence-passes ; RUN: llvm-dis %t-reduced-simplified.bc -o - | FileCheck %s ; REQUIRES: loadable_module +; XFAIL: lto ; Test to make sure that arguments are removed from the function if they are ; unnecessary. And clean up any types that that frees up too. diff --git a/test/CodeGen/ARM/2012-10-18-PR14099-ByvalFrameAddress.ll b/test/CodeGen/ARM/2012-10-18-PR14099-ByvalFrameAddress.ll new file mode 100644 index 00000000000..fcc6a7f7e96 --- /dev/null +++ b/test/CodeGen/ARM/2012-10-18-PR14099-ByvalFrameAddress.ll @@ -0,0 +1,29 @@ +; RUN: llc < %s -mtriple=armv7-linux-gnueabi | FileCheck %s + +%struct.s = type { [4 x i32] } +@v = constant %struct.s zeroinitializer; + +declare void @f(%struct.s* %p); + +; CHECK: t: +define void @t(i32 %a, %struct.s* byval %s) nounwind { +entry: + +; Here we need to only check proper start address of restored %s argument. 
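+; (Illustrative reading of the sequence below: the first three words of
+; %s arrive in r1-r3 and are stored back just above the pushed
+; {r11, lr} pair, so the rebuilt struct must start at sp+12.)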
+; CHECK: sub sp, sp, #16 +; CHECK: push {r11, lr} +; CHECK: add r0, sp, #12 +; CHECK: stm r0, {r1, r2, r3} +; CHECK: add r0, sp, #12 +; CHECK-NEXT: bl f + call void @f(%struct.s* %s) + ret void +} + +; CHECK: caller: +define void @caller() { + +; CHECK: ldm r0, {r1, r2, r3} + call void @t(i32 0, %struct.s* @v); + ret void +} diff --git a/test/CodeGen/ARM/trap.ll b/test/CodeGen/ARM/trap.ll index 38842a9646f..21865f8e4ae 100644 --- a/test/CodeGen/ARM/trap.ll +++ b/test/CodeGen/ARM/trap.ll @@ -14,4 +14,16 @@ entry: unreachable } +define void @t2() nounwind { +entry: +; INSTR: t2: +; INSTR: trap + +; FUNC: t2: +; FUNC: bl __trap + call void @llvm.debugtrap() + unreachable +} + declare void @llvm.trap() nounwind +declare void @llvm.debugtrap() nounwind diff --git a/test/CodeGen/MSP430/fp.ll b/test/CodeGen/MSP430/fp.ll new file mode 100644 index 00000000000..c3273eff05c --- /dev/null +++ b/test/CodeGen/MSP430/fp.ll @@ -0,0 +1,17 @@ +; RUN: llc -O0 -disable-fp-elim < %s | FileCheck %s + +target datalayout = "e-p:16:16:16-i8:8:8-i16:16:16-i32:16:32-n8:16" +target triple = "msp430---elf" + +define void @fp() nounwind { +entry: +; CHECK: fp: +; CHECK: push.w r4 +; CHECK: mov.w r1, r4 +; CHECK: sub.w #2, r1 + %i = alloca i16, align 2 +; CHECK: mov.w #0, -2(r4) + store i16 0, i16* %i, align 2 +; CHECK: pop.w r4 + ret void +} diff --git a/test/CodeGen/Mips/brconeq.ll b/test/CodeGen/Mips/brconeq.ll new file mode 100644 index 00000000000..613391557ef --- /dev/null +++ b/test/CodeGen/Mips/brconeq.ll @@ -0,0 +1,38 @@ +; RUN: llc -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16 + +@i = global i32 5, align 4 +@j = global i32 10, align 4 +@result = global i32 0, align 4 + +define void @test() nounwind { +entry: + %0 = load i32* @i, align 4 + %1 = load i32* @j, align 4 + %cmp = icmp eq i32 %0, %1 +; 16: cmp ${{[0-9]+}}, ${{[0-9]+}} +; 16: bteqz $[[LABEL:[0-9A-Ba-b_]+]] +; 16: $[[LABEL]]: + br i1 %cmp, label %if.end, label %if.then + +if.then: ; preds = %entry + store i32 1, i32* @result, align 4 + br label %if.end + +if.end: ; preds = %entry, %if.then + ret void +} + + + + + + + + + + + + + + + diff --git a/test/CodeGen/Mips/brconeqk.ll b/test/CodeGen/Mips/brconeqk.ll new file mode 100644 index 00000000000..2c0e72dabd2 --- /dev/null +++ b/test/CodeGen/Mips/brconeqk.ll @@ -0,0 +1,22 @@ +; RUN: llc -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16 + +@i = global i32 5, align 4 +@result = global i32 0, align 4 + +define void @test() nounwind { +entry: + %0 = load i32* @i, align 4 + %cmp = icmp eq i32 %0, 10 + br i1 %cmp, label %if.end, label %if.then +; 16: cmpi ${{[0-9]+}}, {{[0-9]+}} +; 16: bteqz $[[LABEL:[0-9A-Ba-b_]+]] +; 16: $[[LABEL]]: +if.then: ; preds = %entry + store i32 1, i32* @result, align 4 + br label %if.end + +if.end: ; preds = %entry, %if.then + ret void +} + + diff --git a/test/CodeGen/Mips/brconeqz.ll b/test/CodeGen/Mips/brconeqz.ll new file mode 100644 index 00000000000..5586e7b976d --- /dev/null +++ b/test/CodeGen/Mips/brconeqz.ll @@ -0,0 +1,20 @@ +; RUN: llc -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16 + +@i = global i32 5, align 4 +@result = global i32 0, align 4 + +define void @test() nounwind { +entry: + %0 = load i32* @i, align 4 + %cmp = icmp eq i32 %0, 0 + br i1 %cmp, label %if.end, label %if.then +; 16: beqz ${{[0-9]+}}, $[[LABEL:[0-9A-Ba-b_]+]] +; 16: $[[LABEL]]: +if.then: ; preds = %entry + store i32 1, i32* @result, align 4 + br label %if.end + +if.end: ; 
preds = %entry, %if.then + ret void +} + diff --git a/test/CodeGen/Mips/brconge.ll b/test/CodeGen/Mips/brconge.ll new file mode 100644 index 00000000000..02f0a633b31 --- /dev/null +++ b/test/CodeGen/Mips/brconge.ll @@ -0,0 +1,37 @@ +; RUN: llc -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16 + +@i = global i32 5, align 4 +@j = global i32 10, align 4 +@k = global i32 5, align 4 +@result1 = global i32 0, align 4 +@result2 = global i32 1, align 4 + +define void @test() nounwind { +entry: + %0 = load i32* @i, align 4 + %1 = load i32* @j, align 4 + %cmp = icmp slt i32 %0, %1 + br i1 %cmp, label %if.then, label %if.end + +; 16: slt ${{[0-9]+}}, ${{[0-9]+}} +; 16: bteqz $[[LABEL:[0-9A-Ba-b_]+]] +; 16: $[[LABEL]]: + +if.then: ; preds = %entry + store i32 1, i32* @result1, align 4 + br label %if.end + +if.end: ; preds = %if.then, %entry + %2 = load i32* @k, align 4 + %cmp1 = icmp slt i32 %0, %2 + br i1 %cmp1, label %if.then2, label %if.end3 + +if.then2: ; preds = %if.end + store i32 1, i32* @result1, align 4 + br label %if.end3 + +if.end3: ; preds = %if.then2, %if.end + ret void +} + + diff --git a/test/CodeGen/Mips/brcongt.ll b/test/CodeGen/Mips/brcongt.ll new file mode 100644 index 00000000000..767b51b21b9 --- /dev/null +++ b/test/CodeGen/Mips/brcongt.ll @@ -0,0 +1,25 @@ +; RUN: llc -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16 + +@i = global i32 5, align 4 +@j = global i32 10, align 4 +@k = global i32 5, align 4 +@result = global i32 0, align 4 + +define void @test() nounwind { +entry: + %0 = load i32* @i, align 4 + %1 = load i32* @j, align 4 + %cmp = icmp sgt i32 %0, %1 + br i1 %cmp, label %if.end, label %if.then +; 16: slt ${{[0-9]+}}, ${{[0-9]+}} +; 16: btnez $[[LABEL:[0-9A-Ba-b_]+]] +; 16: $[[LABEL]]: +if.then: ; preds = %entry + store i32 1, i32* @result, align 4 + br label %if.end + +if.end: ; preds = %entry, %if.then + ret void +} + + diff --git a/test/CodeGen/Mips/brconle.ll b/test/CodeGen/Mips/brconle.ll new file mode 100644 index 00000000000..854b2481c6e --- /dev/null +++ b/test/CodeGen/Mips/brconle.ll @@ -0,0 +1,37 @@ +; RUN: llc -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16 + +@i = global i32 -5, align 4 +@j = global i32 10, align 4 +@k = global i32 -5, align 4 +@result1 = global i32 0, align 4 +@result2 = global i32 1, align 4 + +define void @test() nounwind { +entry: + %0 = load i32* @j, align 4 + %1 = load i32* @i, align 4 + %cmp = icmp sgt i32 %0, %1 + br i1 %cmp, label %if.then, label %if.end + +; 16: slt ${{[0-9]+}}, ${{[0-9]+}} +; 16: bteqz $[[LABEL:[0-9A-Ba-b_]+]] +; 16: $[[LABEL]]: + +if.then: ; preds = %entry + store i32 1, i32* @result1, align 4 + br label %if.end + +if.end: ; preds = %if.then, %entry + %2 = load i32* @k, align 4 + %cmp1 = icmp sgt i32 %1, %2 + br i1 %cmp1, label %if.then2, label %if.end3 + +if.then2: ; preds = %if.end + store i32 0, i32* @result1, align 4 + br label %if.end3 + +if.end3: ; preds = %if.then2, %if.end + ret void +} + + diff --git a/test/CodeGen/Mips/brconlt.ll b/test/CodeGen/Mips/brconlt.ll new file mode 100644 index 00000000000..931a3e8c7ba --- /dev/null +++ b/test/CodeGen/Mips/brconlt.ll @@ -0,0 +1,27 @@ +; RUN: llc -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16 + +@i = global i32 5, align 4 +@j = global i32 10, align 4 +@k = global i32 5, align 4 +@result = global i32 0, align 4 + +define void @test() nounwind { +entry: + %0 = load i32* @j, align 4 + %1 = load i32* @i, 
align 4 + %cmp = icmp slt i32 %0, %1 + br i1 %cmp, label %if.end, label %if.then + +; 16: slt ${{[0-9]+}}, ${{[0-9]+}} +; 16: btnez $[[LABEL:[0-9A-Ba-b_]+]] +; 16: $[[LABEL]]: + +if.then: ; preds = %entry + store i32 1, i32* @result, align 4 + br label %if.end + +if.end: ; preds = %entry, %if.then + ret void +} + + diff --git a/test/CodeGen/Mips/brconne.ll b/test/CodeGen/Mips/brconne.ll new file mode 100644 index 00000000000..5d5bde3fcf9 --- /dev/null +++ b/test/CodeGen/Mips/brconne.ll @@ -0,0 +1,26 @@ +; RUN: llc -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16 + +@i = global i32 5, align 4 +@j = global i32 5, align 4 +@result = global i32 0, align 4 + +define void @test() nounwind { +entry: + %0 = load i32* @j, align 4 + %1 = load i32* @i, align 4 + %cmp = icmp eq i32 %0, %1 + br i1 %cmp, label %if.then, label %if.end +; 16: cmp ${{[0-9]+}}, ${{[0-9]+}} +; 16: btnez $[[LABEL:[0-9A-Ba-b_]+]] +; 16: lw ${{[0-9]+}}, %got(result)(${{[0-9]+}}) +; 16: $[[LABEL]]: + +if.then: ; preds = %entry + store i32 1, i32* @result, align 4 + br label %if.end + +if.end: ; preds = %if.then, %entry + ret void +} + + diff --git a/test/CodeGen/Mips/brconnek.ll b/test/CodeGen/Mips/brconnek.ll new file mode 100644 index 00000000000..6208d7c5a04 --- /dev/null +++ b/test/CodeGen/Mips/brconnek.ll @@ -0,0 +1,25 @@ +; RUN: llc -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16 + +@j = global i32 5, align 4 +@result = global i32 0, align 4 + +define void @test() nounwind { +entry: + %0 = load i32* @j, align 4 + %cmp = icmp eq i32 %0, 5 + br i1 %cmp, label %if.then, label %if.end + +; 16: cmpi ${{[0-9]+}}, {{[0-9]+}} +; 16: btnez $[[LABEL:[0-9A-Ba-b_]+]] +; 16: lw ${{[0-9]+}}, %got(result)(${{[0-9]+}}) +; 16: $[[LABEL]]: + +if.then: ; preds = %entry + store i32 1, i32* @result, align 4 + br label %if.end + +if.end: ; preds = %if.then, %entry + ret void +} + + diff --git a/test/CodeGen/Mips/brconnez.ll b/test/CodeGen/Mips/brconnez.ll new file mode 100644 index 00000000000..47db7901b51 --- /dev/null +++ b/test/CodeGen/Mips/brconnez.ll @@ -0,0 +1,24 @@ +; RUN: llc -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16 + +@j = global i32 0, align 4 +@result = global i32 0, align 4 + +define void @test() nounwind { +entry: + %0 = load i32* @j, align 4 + %cmp = icmp eq i32 %0, 0 + br i1 %cmp, label %if.then, label %if.end + +; 16: bnez ${{[0-9]+}}, $[[LABEL:[0-9A-Ba-b_]+]] +; 16: lw ${{[0-9]+}}, %got(result)(${{[0-9]+}}) +; 16: $[[LABEL]]: + +if.then: ; preds = %entry + store i32 1, i32* @result, align 4 + br label %if.end + +if.end: ; preds = %if.then, %entry + ret void +} + + diff --git a/test/CodeGen/Mips/mips64-sret.ll b/test/CodeGen/Mips/mips64-sret.ll new file mode 100644 index 00000000000..498c5fe1747 --- /dev/null +++ b/test/CodeGen/Mips/mips64-sret.ll @@ -0,0 +1,14 @@ +; RUN: llc -march=mips64el -mcpu=mips64r2 -mattr=n64 -O0 < %s + +%struct.S = type { [8 x i32] } + +@g = common global %struct.S zeroinitializer, align 4 + +define void @f(%struct.S* noalias sret %agg.result) nounwind { +entry: + %0 = bitcast %struct.S* %agg.result to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* bitcast (%struct.S* @g to i8*), i64 32, i32 4, i1 false) + ret void +} + +declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i32, i1) nounwind diff --git a/test/CodeGen/Mips/tailcall.ll b/test/CodeGen/Mips/tailcall.ll new file mode 100644 index 00000000000..4989636a20d --- /dev/null +++ 
b/test/CodeGen/Mips/tailcall.ll @@ -0,0 +1,100 @@ +; RUN: llc -march=mipsel -relocation-model=pic -enable-mips-tail-calls < %s | \ +; RUN: FileCheck %s -check-prefix=PIC32 +; RUN: llc -march=mipsel -relocation-model=static \ +; RUN: -enable-mips-tail-calls < %s | FileCheck %s -check-prefix=STATIC32 +; RUN: llc -march=mips64el -mcpu=mips64r2 -mattr=+n64 -enable-mips-tail-calls \ +; RUN: < %s | FileCheck %s -check-prefix=N64 + +@g0 = common global i32 0, align 4 +@g1 = common global i32 0, align 4 +@g2 = common global i32 0, align 4 +@g3 = common global i32 0, align 4 +@g4 = common global i32 0, align 4 +@g5 = common global i32 0, align 4 +@g6 = common global i32 0, align 4 +@g7 = common global i32 0, align 4 +@g8 = common global i32 0, align 4 +@g9 = common global i32 0, align 4 + +define i32 @caller1(i32 %a0) nounwind { +entry: +; PIC32-NOT: jalr +; STATIC32-NOT: jal +; N64-NOT: jalr + + %call = tail call i32 @callee1(i32 1, i32 1, i32 1, i32 %a0) nounwind + ret i32 %call +} + +declare i32 @callee1(i32, i32, i32, i32) + +define i32 @caller2(i32 %a0, i32 %a1, i32 %a2, i32 %a3) nounwind { +entry: +; PIC32: jalr +; STATIC32: jal +; N64-NOT: jalr + + %call = tail call i32 @callee2(i32 1, i32 %a0, i32 %a1, i32 %a2, i32 %a3) nounwind + ret i32 %call +} + +declare i32 @callee2(i32, i32, i32, i32, i32) + +define i32 @caller3(i32 %a0, i32 %a1, i32 %a2, i32 %a3, i32 %a4) nounwind { +entry: +; PIC32: jalr +; STATIC32: jal +; N64-NOT: jalr + + %call = tail call i32 @callee3(i32 1, i32 1, i32 1, i32 %a0, i32 %a1, i32 %a2, i32 %a3, i32 %a4) nounwind + ret i32 %call +} + +declare i32 @callee3(i32, i32, i32, i32, i32, i32, i32, i32) + +define i32 @caller4(i32 %a0, i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a5, i32 %a6, i32 %a7) nounwind { +entry: +; PIC32: jalr +; STATIC32: jal +; N64: jalr + + %call = tail call i32 @callee4(i32 1, i32 %a0, i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a5, i32 %a6, i32 %a7) nounwind + ret i32 %call +} + +declare i32 @callee4(i32, i32, i32, i32, i32, i32, i32, i32, i32) + +define i32 @caller5() nounwind readonly { +entry: +; PIC32-NOT: jalr +; STATIC32-NOT: jal +; N64-NOT: jalr + + %0 = load i32* @g0, align 4 + %1 = load i32* @g1, align 4 + %2 = load i32* @g2, align 4 + %3 = load i32* @g3, align 4 + %4 = load i32* @g4, align 4 + %5 = load i32* @g5, align 4 + %6 = load i32* @g6, align 4 + %7 = load i32* @g7, align 4 + %8 = load i32* @g8, align 4 + %9 = load i32* @g9, align 4 + %call = tail call fastcc i32 @callee5(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i32 %8, i32 %9) + ret i32 %call +} + +define internal fastcc i32 @callee5(i32 %a0, i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a5, i32 %a6, i32 %a7, i32 %a8, i32 %a9) nounwind readnone noinline { +entry: + %add = add nsw i32 %a1, %a0 + %add1 = add nsw i32 %add, %a2 + %add2 = add nsw i32 %add1, %a3 + %add3 = add nsw i32 %add2, %a4 + %add4 = add nsw i32 %add3, %a5 + %add5 = add nsw i32 %add4, %a6 + %add6 = add nsw i32 %add5, %a7 + %add7 = add nsw i32 %add6, %a8 + %add8 = add nsw i32 %add7, %a9 + ret i32 %add8 +} + diff --git a/test/CodeGen/PowerPC/i64_fp_round.ll b/test/CodeGen/PowerPC/i64_fp_round.ll new file mode 100644 index 00000000000..5a0c072c9c5 --- /dev/null +++ b/test/CodeGen/PowerPC/i64_fp_round.ll @@ -0,0 +1,27 @@ +; RUN: llc -mcpu=pwr7 < %s | FileCheck %s +target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v128:128:128-n32:64" +target triple = "powerpc64-unknown-linux-gnu" + +define float @test(i64 %x) nounwind readnone { +entry: + %conv = sitofp i64 %x 
 to float
+  ret float %conv
+}
+
+; Verify that we get the code sequence needed to avoid double-rounding.
+; Note that only parts of the sequence are checked for here, to allow
+; for minor code generation differences.
+
+; CHECK: sradi [[REGISTER:[0-9]+]], 3, 53
+; CHECK: addi [[REGISTER:[0-9]+]], [[REGISTER]], 1
+; CHECK: cmpldi 0, [[REGISTER]], 1
+; CHECK: isel [[REGISTER:[0-9]+]], {{[0-9]+}}, 3, 1
+; CHECK: std [[REGISTER]], -{{[0-9]+}}(1)
+
+
+; Also check that with -enable-unsafe-fp-math we do not get that extra
+; code sequence. Simply verify that there is no "isel" present.
+
+; RUN: llc -mcpu=pwr7 -enable-unsafe-fp-math < %s | FileCheck %s -check-prefix=UNSAFE
+; UNSAFE-NOT: isel
+
diff --git a/test/CodeGen/X86/2012-10-18-crash-dagco.ll b/test/CodeGen/X86/2012-10-18-crash-dagco.ll new file mode 100644 index 00000000000..5b98624a37b --- /dev/null +++ b/test/CodeGen/X86/2012-10-18-crash-dagco.ll @@ -0,0 +1,61 @@ +; RUN: llc -march=x86-64 -mcpu=corei7 -disable-cgp-select2branch < %s
+
+; We should not crash on this test.
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32"
+target triple = "i386-apple-darwin9.0.0"
+
+@global = external constant [411 x i8], align 1
+
+define void @snork() nounwind {
+bb:
+  br i1 undef, label %bb26, label %bb27
+
+bb26:                                        ; preds = %bb48, %bb26, %bb
+  switch i32 undef, label %bb26 [
+    i32 142771596, label %bb28
+  ]
+
+bb27:                                        ; preds = %bb48, %bb
+  switch i32 undef, label %bb49 [
+    i32 142771596, label %bb28
+  ]
+
+bb28:                                        ; preds = %bb27, %bb26
+  %tmp = load i32* null
+  %tmp29 = trunc i32 %tmp to i8
+  store i8* undef, i8** undef
+  %tmp30 = load i32* null
+  %tmp31 = icmp eq i32 %tmp30, 0
+  %tmp32 = getelementptr inbounds [411 x i8]* @global, i32 0, i32 undef
+  %tmp33 = load i8* %tmp32, align 1
+  %tmp34 = getelementptr inbounds [411 x i8]* @global, i32 0, i32 0
+  %tmp35 = load i8* %tmp34, align 1
+  %tmp36 = select i1 %tmp31, i8 %tmp35, i8 %tmp33
+  %tmp37 = select i1 undef, i8 %tmp29, i8 %tmp36
+  %tmp38 = zext i8 %tmp37 to i32
+  %tmp39 = select i1 undef, i32 0, i32 %tmp38
+  %tmp40 = getelementptr inbounds i32* null, i32 %tmp39
+  %tmp41 = load i32* %tmp40, align 4
+  %tmp42 = load i32* undef, align 4
+  %tmp43 = load i32* undef
+  %tmp44 = xor i32 %tmp42, %tmp43
+  %tmp45 = lshr i32 %tmp44, 8
+  %tmp46 = lshr i32 %tmp44, 7
+  call void @spam()
+  unreachable
+
+bb47:                                        ; No predecessors!
+  ret void
+
+bb48:                                        ; No predecessors!
+ br i1 undef, label %bb27, label %bb26 + +bb49: ; preds = %bb49, %bb27 + br label %bb49 + +bb50: ; preds = %bb50 + br label %bb50 +} + +declare void @spam() noreturn nounwind diff --git a/test/CodeGen/X86/buildvec-insertvec.ll b/test/CodeGen/X86/buildvec-insertvec.ll new file mode 100644 index 00000000000..3fb69a48b3c --- /dev/null +++ b/test/CodeGen/X86/buildvec-insertvec.ll @@ -0,0 +1,15 @@ +; RUN: llc < %s -mcpu=corei7 -mtriple=x86_64-unknown-linux-gnu | FileCheck %s + +define void @foo(<3 x float> %in, <4 x i8>* nocapture %out) nounwind { + %t0 = fptoui <3 x float> %in to <3 x i8> + %t1 = shufflevector <3 x i8> %t0, <3 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 undef> + %t2 = insertelement <4 x i8> %t1, i8 -1, i32 3 + store <4 x i8> %t2, <4 x i8>* %out, align 4 + ret void +; CHECK: foo +; CHECK: cvttps2dq +; CHECK-NOT: pextrd +; CHECK: pinsrd +; CHECK-NEXT: pshufb +; CHECK: ret +} diff --git a/test/CodeGen/X86/extract-concat.ll b/test/CodeGen/X86/extract-concat.ll new file mode 100644 index 00000000000..704309eb650 --- /dev/null +++ b/test/CodeGen/X86/extract-concat.ll @@ -0,0 +1,17 @@ +; RUN: llc < %s -mcpu=corei7 -mtriple=x86_64-unknown-linux-gnu | FileCheck %s + +define void @foo(<4 x float> %in, <4 x i8>* %out) { + %t0 = fptosi <4 x float> %in to <4 x i32> + %t1 = trunc <4 x i32> %t0 to <4 x i16> + %t2 = shufflevector <4 x i16> %t1, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + %t3 = trunc <8 x i16> %t2 to <8 x i8> + %t4 = shufflevector <8 x i8> %t3, <8 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + %t5 = insertelement <4 x i8> %t4, i8 -1, i32 3 + store <4 x i8> %t5, <4 x i8>* %out + ret void +; CHECK: foo +; CHECK: cvttps2dq +; CHECK-NOT: pextrd +; CHECK: pshufb +; CHECK: ret +} diff --git a/test/CodeGen/X86/pr14090.ll b/test/CodeGen/X86/pr14090.ll new file mode 100644 index 00000000000..d76b912fd8e --- /dev/null +++ b/test/CodeGen/X86/pr14090.ll @@ -0,0 +1,76 @@ +; RUN: llc < %s -march=x86-64 -print-before=stack-coloring -print-after=stack-coloring >%t 2>&1 && FileCheck <%t %s + +define void @foo(i64* %retval.i, i32 %call, i32* %.ph.i80, i32 %fourteen, i32* %out.lo, i32* %out.hi) nounwind align 2 { +entry: + %_Tmp.i39 = alloca i64, align 8 + %retval.i33 = alloca i64, align 8 + %_Tmp.i = alloca i64, align 8 + %retval.i.i = alloca i64, align 8 + %_First.i = alloca i64, align 8 + + %0 = load i64* %retval.i, align 8 + + %1 = load i64* %retval.i, align 8 + + %_Tmp.i39.0.cast73 = bitcast i64* %_Tmp.i39 to i8* + call void @llvm.lifetime.start(i64 8, i8* %_Tmp.i39.0.cast73) + store i64 %1, i64* %_Tmp.i39, align 8 + %cmp.i.i.i40 = icmp slt i32 %call, 0 + %2 = lshr i64 %1, 32 + %3 = trunc i64 %2 to i32 + %sub.i.i.i44 = sub i32 0, %call + %cmp2.i.i.i45 = icmp ult i32 %3, %sub.i.i.i44 + %or.cond.i.i.i46 = and i1 %cmp.i.i.i40, %cmp2.i.i.i45 + %add.i.i.i47 = add i32 %3, %call + %sub5.i.i.i48 = lshr i32 %add.i.i.i47, 5 + %trunc.i50 = trunc i64 %1 to i32 + %inttoptr.i51 = inttoptr i32 %trunc.i50 to i32* + %add61617.i.i.i52 = or i32 %sub5.i.i.i48, -134217728 + %add61617.i.sub5.i.i.i53 = select i1 %or.cond.i.i.i46, i32 %add61617.i.i.i52, i32 %sub5.i.i.i48 + %storemerge2.i.i54 = getelementptr inbounds i32* %inttoptr.i51, i32 %add61617.i.sub5.i.i.i53 + %_Tmp.i39.0.cast74 = bitcast i64* %_Tmp.i39 to i32** + store i32* %storemerge2.i.i54, i32** %_Tmp.i39.0.cast74, align 8 + %storemerge.i.i55 = and i32 %add.i.i.i47, 31 + %_Tmp.i39.4.raw_idx = getelementptr inbounds i8* %_Tmp.i39.0.cast73, i32 4 + %_Tmp.i39.4.cast = bitcast i8* %_Tmp.i39.4.raw_idx to 
i32* + store i32 %storemerge.i.i55, i32* %_Tmp.i39.4.cast, align 4 + %srcval.i56 = load i64* %_Tmp.i39, align 8 + call void @llvm.lifetime.end(i64 8, i8* %_Tmp.i39.0.cast73) + +; CHECK: Before Merge disjoint stack slots +; CHECK: [[PREFIX15:MOV64mr.*<fi#]]{{[0-9]}}[[SUFFIX15:.*;]] mem:ST8[%fifteen] +; CHECK: [[PREFIX87:MOV32mr.*;]] mem:ST4[%sunkaddr87] + +; CHECK: After Merge disjoint stack slots +; CHECK: [[PREFIX15]]{{[0-9]}}[[SUFFIX15]] mem:ST8[%_Tmp.i39] +; CHECK: [[PREFIX87]] mem:ST4[<unknown>] + + %fifteen = bitcast i64* %retval.i.i to i32** + %sixteen = bitcast i64* %retval.i.i to i8* + call void @llvm.lifetime.start(i64 8, i8* %sixteen) + store i32* %.ph.i80, i32** %fifteen, align 8, !tbaa !0 + %sunkaddr = ptrtoint i64* %retval.i.i to i32 + %sunkaddr86 = add i32 %sunkaddr, 4 + %sunkaddr87 = inttoptr i32 %sunkaddr86 to i32* + store i32 %fourteen, i32* %sunkaddr87, align 4, !tbaa !3 + %seventeen = load i64* %retval.i.i, align 8 + call void @llvm.lifetime.end(i64 8, i8* %sixteen) + %eighteen = lshr i64 %seventeen, 32 + %nineteen = trunc i64 %eighteen to i32 + %shl.i.i.i = shl i32 1, %nineteen + + store i32 %shl.i.i.i, i32* %out.lo, align 8 + store i32 %nineteen, i32* %out.hi, align 8 + + ret void +} + +declare void @llvm.lifetime.start(i64, i8* nocapture) nounwind + +declare void @llvm.lifetime.end(i64, i8* nocapture) nounwind + +!0 = metadata !{metadata !"int", metadata !1} +!1 = metadata !{metadata !"omnipotent char", metadata !2} +!2 = metadata !{metadata !"Simple C/C++ TBAA"} +!3 = metadata !{metadata !"any pointer", metadata !1} +!4 = metadata !{metadata !"vtable pointer", metadata !2} diff --git a/test/CodeGen/X86/pr14098.ll b/test/CodeGen/X86/pr14098.ll new file mode 100644 index 00000000000..6ce2449ab6a --- /dev/null +++ b/test/CodeGen/X86/pr14098.ll @@ -0,0 +1,23 @@ +; RUN: llc -mtriple i386-unknown-linux-gnu -relocation-model=pic -verify-machineinstrs < %s +; We used to crash on this. 
+ +declare void @foo() +declare void @foo3(i1 %x) +define void @bar(i1 %a1, i16 %a2) nounwind align 2 { +bb0: + %a3 = trunc i16 %a2 to i8 + %a4 = lshr i16 %a2, 8 + %a5 = trunc i16 %a4 to i8 + br i1 %a1, label %bb1, label %bb2 +bb1: + br label %bb2 +bb2: + %a6 = phi i8 [ 3, %bb0 ], [ %a5, %bb1 ] + %a7 = phi i8 [ 9, %bb0 ], [ %a3, %bb1 ] + %a8 = icmp eq i8 %a6, 1 + call void @foo() + %a9 = icmp eq i8 %a7, 0 + call void @foo3(i1 %a9) + call void @foo3(i1 %a8) + ret void +} diff --git a/test/CodeGen/X86/sjlj.ll b/test/CodeGen/X86/sjlj.ll index d594e982994..681db009438 100644 --- a/test/CodeGen/X86/sjlj.ll +++ b/test/CodeGen/X86/sjlj.ll @@ -1,5 +1,7 @@ ; RUN: llc < %s -mtriple=i386-pc-linux -mcpu=corei7 -relocation-model=static | FileCheck --check-prefix=X86 %s -; RUN: llc < %s -mtriple=x86_64-pc-linux -mcpu=corei7 | FileCheck --check-prefix=X64 %s +; RUN: llc < %s -mtriple=i386-pc-linux -mcpu=corei7 -relocation-model=pic | FileCheck --check-prefix=PIC86 %s +; RUN: llc < %s -mtriple=x86_64-pc-linux -mcpu=corei7 -relocation-model=static | FileCheck --check-prefix=X64 %s +; RUN: llc < %s -mtriple=x86_64-pc-linux -mcpu=corei7 -relocation-model=pic | FileCheck --check-prefix=PIC64 %s @buf = internal global [5 x i8*] zeroinitializer @@ -20,14 +22,26 @@ define i32 @sj0() nounwind { ret i32 %r ; X86: sj0 ; x86: movl %ebp, buf -; x86: movl ${{.*LBB.*}}, buf+4 ; X86: movl %esp, buf+8 +; x86: movl ${{.*LBB.*}}, buf+4 ; X86: ret +; PIC86: sj0 +; PIC86: movl %ebp, buf@GOTOFF(%[[GOT:.*]]) +; PIC86: movl %esp, buf@GOTOFF+8(%[[GOT]]) +; PIC86: leal {{.*LBB.*}}@GOTOFF(%[[GOT]]), %[[LREG:.*]] +; PIC86: movl %[[LREG]], buf@GOTOFF+4 +; PIC86: ret ; X64: sj0 ; x64: movq %rbp, buf(%rip) ; x64: movq ${{.*LBB.*}}, buf+8(%rip) ; X64: movq %rsp, buf+16(%rip) ; X64: ret +; PIC64: sj0 +; PIC64: movq %rbp, buf(%rip) +; PIC64: movq %rsp, buf+16(%rip) +; PIC64: leaq {{.*LBB.*}}(%rip), %[[LREG:.*]] +; PIC64: movq %[[LREG]], buf+8(%rip) +; PIC64: ret } define void @lj0() nounwind { diff --git a/test/Instrumentation/AddressSanitizer/basic.ll b/test/Instrumentation/AddressSanitizer/basic.ll index d1900018706..655f69c16fd 100644 --- a/test/Instrumentation/AddressSanitizer/basic.ll +++ b/test/Instrumentation/AddressSanitizer/basic.ll @@ -69,3 +69,23 @@ entry: store i32 42, i32* %a ret void } + +; Check that asan leaves just one alloca. + +declare void @alloca_test_use([10 x i8]*) +define void @alloca_test() address_safety { +entry: + %x = alloca [10 x i8], align 1 + %y = alloca [10 x i8], align 1 + %z = alloca [10 x i8], align 1 + call void @alloca_test_use([10 x i8]* %x) + call void @alloca_test_use([10 x i8]* %y) + call void @alloca_test_use([10 x i8]* %z) + ret void +} + +; CHECK: define void @alloca_test() +; CHECK: = alloca +; CHECK-NOT: = alloca +; CHECK: ret void + diff --git a/test/Makefile b/test/Makefile index 3c6b5b68a4f..810fdded465 100644 --- a/test/Makefile +++ b/test/Makefile @@ -117,6 +117,16 @@ else ENABLE_ASSERTIONS=1 endif +# Derive whether or not LTO is enabled by checking the extra options. +LTO_IS_ENABLED := 0 +ifneq ($(findstring -flto,$(CompileCommonOpts)),) +LTO_IS_ENABLED := 1 +else +ifneq ($(findstring -O4,$(CompileCommonOpts)),) +LTO_IS_ENABLED := 1 +endif +endif + lit.site.cfg: FORCE @echo "Making LLVM 'lit.site.cfg' file..." 
@$(ECHOPATH) s=@TARGET_TRIPLE@=$(TARGET_TRIPLE)=g > lit.tmp @@ -129,6 +139,7 @@ lit.site.cfg: FORCE @$(ECHOPATH) s=@OCAMLOPT@=$(OCAMLOPT) -cc $(subst *,'\\\"',*$(subst =,"\\=",$(CXX_FOR_OCAMLOPT))*) -I $(LibDir)/ocaml=g >> lit.tmp @$(ECHOPATH) s=@ENABLE_SHARED@=$(ENABLE_SHARED)=g >> lit.tmp @$(ECHOPATH) s=@ENABLE_ASSERTIONS@=$(ENABLE_ASSERTIONS)=g >> lit.tmp + @$(ECHOPATH) s=@LTO_IS_ENABLED@=$(LTO_IS_ENABLED)=g >> lit.tmp @$(ECHOPATH) s=@TARGETS_TO_BUILD@=$(TARGETS_TO_BUILD)=g >> lit.tmp @$(ECHOPATH) s=@LLVM_BINDINGS@=$(BINDINGS_TO_BUILD)=g >> lit.tmp @$(ECHOPATH) s=@HOST_OS@=$(HOST_OS)=g >> lit.tmp diff --git a/test/Transforms/BBVectorize/simple-ldstr-ptrs.ll b/test/Transforms/BBVectorize/simple-ldstr-ptrs.ll index f992d415477..6294543cd81 100644 --- a/test/Transforms/BBVectorize/simple-ldstr-ptrs.ll +++ b/test/Transforms/BBVectorize/simple-ldstr-ptrs.ll @@ -79,3 +79,53 @@ entry: ; CHECK-AO-NOT: <2 x } +; Simple 3-pair chain with loads and stores (using ptrs and gep) +; using pointer vectors. +define void @test3(<2 x i64*>* %a, <2 x i64*>* %b, <2 x i64*>* %c) nounwind uwtable readonly { +entry: + %i0 = load <2 x i64*>* %a, align 8 + %i1 = load <2 x i64*>* %b, align 8 + %arrayidx3 = getelementptr inbounds <2 x i64*>* %a, i64 1 + %i3 = load <2 x i64*>* %arrayidx3, align 8 + %arrayidx4 = getelementptr inbounds <2 x i64*>* %b, i64 1 + %i4 = load <2 x i64*>* %arrayidx4, align 8 + %j1 = extractelement <2 x i64*> %i1, i32 0 + %j4 = extractelement <2 x i64*> %i4, i32 0 + %o1 = load i64* %j1, align 8 + %o4 = load i64* %j4, align 8 + %j0 = extractelement <2 x i64*> %i0, i32 0 + %j3 = extractelement <2 x i64*> %i3, i32 0 + %ptr0 = getelementptr inbounds i64* %j0, i64 %o1 + %ptr3 = getelementptr inbounds i64* %j3, i64 %o4 + %qtr0 = insertelement <2 x i64*> undef, i64* %ptr0, i32 0 + %rtr0 = insertelement <2 x i64*> %qtr0, i64* %ptr0, i32 1 + %qtr3 = insertelement <2 x i64*> undef, i64* %ptr3, i32 0 + %rtr3 = insertelement <2 x i64*> %qtr3, i64* %ptr3, i32 1 + store <2 x i64*> %rtr0, <2 x i64*>* %c, align 8 + %arrayidx5 = getelementptr inbounds <2 x i64*>* %c, i64 1 + store <2 x i64*> %rtr3, <2 x i64*>* %arrayidx5, align 8 + ret void +; CHECK: @test3 +; CHECK: %i0.v.i0 = bitcast <2 x i64*>* %a to <4 x i64*>* +; CHECK: %i1 = load <2 x i64*>* %b, align 8 +; CHECK: %i0 = load <4 x i64*>* %i0.v.i0, align 8 +; CHECK: %arrayidx4 = getelementptr inbounds <2 x i64*>* %b, i64 1 +; CHECK: %i4 = load <2 x i64*>* %arrayidx4, align 8 +; CHECK: %j1 = extractelement <2 x i64*> %i1, i32 0 +; CHECK: %j4 = extractelement <2 x i64*> %i4, i32 0 +; CHECK: %o1 = load i64* %j1, align 8 +; CHECK: %o4 = load i64* %j4, align 8 +; CHECK: %ptr0.v.i1.1 = insertelement <2 x i64> undef, i64 %o1, i32 0 +; CHECK: %ptr0.v.i1.2 = insertelement <2 x i64> %ptr0.v.i1.1, i64 %o4, i32 1 +; CHECK: %ptr0.v.i0 = shufflevector <4 x i64*> %i0, <4 x i64*> undef, <2 x i32> <i32 0, i32 2> +; CHECK: %ptr0 = getelementptr inbounds <2 x i64*> %ptr0.v.i0, <2 x i64> %ptr0.v.i1.2 +; CHECK: %rtr0 = shufflevector <2 x i64*> %ptr0, <2 x i64*> undef, <2 x i32> zeroinitializer +; CHECK: %rtr3 = shufflevector <2 x i64*> %ptr0, <2 x i64*> undef, <2 x i32> <i32 1, i32 1> +; CHECK: %0 = bitcast <2 x i64*>* %c to <4 x i64*>* +; CHECK: %1 = shufflevector <2 x i64*> %rtr0, <2 x i64*> %rtr3, <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; CHECK: store <4 x i64*> %1, <4 x i64*>* %0, align 8 +; CHECK: ret void +; CHECK-AO: @test3 +; CHECK-AO-NOT: <4 x +} + diff --git a/test/Transforms/IndVarSimplify/2012-10-19-congruent-constant.ll 
b/test/Transforms/IndVarSimplify/2012-10-19-congruent-constant.ll new file mode 100644 index 00000000000..5c478669d29 --- /dev/null +++ b/test/Transforms/IndVarSimplify/2012-10-19-congruent-constant.ll @@ -0,0 +1,27 @@ +; RUN: opt -S -indvars < %s | FileCheck %s + +; PR12627 +define void @test1(i32 %x) nounwind uwtable ssp { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %phi1 = phi i1 [ false, %entry ], [ %cmpa, %for.body ] + %phi2 = phi i1 [ false, %entry ], [ %cmpb, %for.body ] + %i.07 = phi i32 [ 0, %entry ], [ %inc, %for.body ] + tail call void @aux(i1 %phi1, i1 %phi2) nounwind + %cmpa = icmp sgt i32 %i.07, 200 + %cmpb = icmp sgt i32 %i.07, 100 + %inc = add nsw i32 %i.07, 1 + %exitcond = icmp eq i32 %inc, 100 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret void + +; CHECK: @test1 +; CHECK-NOT: phi i1 +; CHECK: call void @aux(i1 false, i1 false) +} + +declare void @aux(i1, i1) diff --git a/test/Transforms/IndVarSimplify/crash.ll b/test/Transforms/IndVarSimplify/crash.ll index 62af42b9d68..1b702a3b1a3 100644 --- a/test/Transforms/IndVarSimplify/crash.ll +++ b/test/Transforms/IndVarSimplify/crash.ll @@ -113,3 +113,21 @@ bb9: ret void } +; PR12536 +define void @fn1() noreturn nounwind { +entry: + br label %for.cond + +for.cond: ; preds = %for.end, %entry + %b.0 = phi i32 [ undef, %entry ], [ %conv, %for.end ] + br label %for.cond1 + +for.cond1: ; preds = %for.cond1, %for.cond + %c.0 = phi i32 [ %b.0, %for.cond1 ], [ 0, %for.cond ] + br i1 undef, label %for.cond1, label %for.end + +for.end: ; preds = %for.cond1 + %cmp2 = icmp slt i32 %c.0, 1 + %conv = zext i1 %cmp2 to i32 + br label %for.cond +} diff --git a/test/Transforms/IndVarSimplify/no-iv-rewrite.ll b/test/Transforms/IndVarSimplify/no-iv-rewrite.ll index bfdd000e38e..507f695e67c 100644 --- a/test/Transforms/IndVarSimplify/no-iv-rewrite.ll +++ b/test/Transforms/IndVarSimplify/no-iv-rewrite.ll @@ -199,7 +199,6 @@ entry: ; back to the loop iv. 
; ; CHECK: loop: -; CHECK: phi i32 ; CHECK-NOT: phi ; CHECK: exit: loop: diff --git a/test/Transforms/InstCombine/obfuscated_splat.ll b/test/Transforms/InstCombine/obfuscated_splat.ll new file mode 100644 index 00000000000..c25dade168a --- /dev/null +++ b/test/Transforms/InstCombine/obfuscated_splat.ll @@ -0,0 +1,11 @@ +; RUN: opt -instcombine -S %s | FileCheck %s + +define void @test(<4 x float> *%in_ptr, <4 x float> *%out_ptr) { + %A = load <4 x float>* %in_ptr, align 16 + %B = shufflevector <4 x float> %A, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 undef, i32 undef> + %C = shufflevector <4 x float> %B, <4 x float> %A, <4 x i32> <i32 0, i32 1, i32 4, i32 undef> + %D = shufflevector <4 x float> %C, <4 x float> %A, <4 x i32> <i32 0, i32 1, i32 2, i32 4> +; CHECK: %D = shufflevector <4 x float> %A, <4 x float> undef, <4 x i32> zeroinitializer + store <4 x float> %D, <4 x float> *%out_ptr + ret void +} diff --git a/test/Transforms/InstCombine/select.ll b/test/Transforms/InstCombine/select.ll index 4baae2618dd..cc3aacdce3c 100644 --- a/test/Transforms/InstCombine/select.ll +++ b/test/Transforms/InstCombine/select.ll @@ -829,3 +829,37 @@ define i1 @test63(i1 %A, i1 %B) { ; CHECK: %C = or i1 %B, %not ; CHECK: ret i1 %C } + +; PR14131 +define void @test64(i32 %p, i16 %b) noreturn nounwind { +entry: + %p.addr.0.insert.mask = and i32 %p, -65536 + %conv2 = and i32 %p, 65535 + br i1 undef, label %lor.rhs, label %lor.end + +lor.rhs: + %p.addr.0.extract.trunc = trunc i32 %p.addr.0.insert.mask to i16 + %phitmp = zext i16 %p.addr.0.extract.trunc to i32 + br label %lor.end + +lor.end: + %t.1 = phi i32 [ 0, %entry ], [ %phitmp, %lor.rhs ] + %conv6 = zext i16 %b to i32 + %div = udiv i32 %conv6, %t.1 + %tobool8 = icmp eq i32 %div, 0 + %cmp = icmp eq i32 %t.1, 0 + %cmp12 = icmp ult i32 %conv2, 2 + %cmp.sink = select i1 %tobool8, i1 %cmp12, i1 %cmp + br i1 %cmp.sink, label %cond.end17, label %cond.false16 + +cond.false16: + br label %cond.end17 + +cond.end17: + br label %while.body + +while.body: + br label %while.body +; CHECK: @test64 +; CHECK-NOT: select +} diff --git a/test/Transforms/InstCombine/strcpy-1.ll b/test/Transforms/InstCombine/strcpy-1.ll new file mode 100644 index 00000000000..b6cf048b2a8 --- /dev/null +++ b/test/Transforms/InstCombine/strcpy-1.ll @@ -0,0 +1,45 @@ +; Test that the strcpy library call simplifier works correctly. +; rdar://6839935 +; RUN: opt < %s -instcombine -S | FileCheck %s +; +; This transformation requires the pointer size, as it assumes that size_t is +; the size of a pointer. 
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32" + +@hello = constant [6 x i8] c"hello\00" +@a = common global [32 x i8] zeroinitializer, align 1 +@b = common global [32 x i8] zeroinitializer, align 1 + +declare i8* @strcpy(i8*, i8*) + +define void @test_simplify1() { +; CHECK: @test_simplify1 + + %dst = getelementptr [32 x i8]* @a, i32 0, i32 0 + %src = getelementptr [6 x i8]* @hello, i32 0, i32 0 + + call i8* @strcpy(i8* %dst, i8* %src) +; CHECK: @llvm.memcpy.p0i8.p0i8.i32 + ret void +} + +define i8* @test_simplify2() { +; CHECK: @test_simplify2 + + %dst = getelementptr [32 x i8]* @a, i32 0, i32 0 + + %ret = call i8* @strcpy(i8* %dst, i8* %dst) +; CHECK: ret i8* getelementptr inbounds ([32 x i8]* @a, i32 0, i32 0) + ret i8* %ret +} + +define i8* @test_no_simplify1() { +; CHECK: @test_no_simplify1 + + %dst = getelementptr [32 x i8]* @a, i32 0, i32 0 + %src = getelementptr [32 x i8]* @b, i32 0, i32 0 + + %ret = call i8* @strcpy(i8* %dst, i8* %src) +; CHECK: call i8* @strcpy + ret i8* %ret +} diff --git a/test/Transforms/InstCombine/strcpy-2.ll b/test/Transforms/InstCombine/strcpy-2.ll new file mode 100644 index 00000000000..779e9fdd959 --- /dev/null +++ b/test/Transforms/InstCombine/strcpy-2.ll @@ -0,0 +1,22 @@ +; Test that the strcpy library call simplifier works correctly. +; RUN: opt < %s -instcombine -S | FileCheck %s +; +; This transformation requires the pointer size, as it assumes that size_t is +; the size of a pointer. +target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32" + +@hello = constant [6 x i8] c"hello\00" +@a = common global [32 x i8] zeroinitializer, align 1 + +declare i16* @strcpy(i8*, i8*) + +define void @test_no_simplify1() { +; CHECK: @test_no_simplify1 + + %dst = getelementptr [32 x i8]* @a, i32 0, i32 0 + %src = getelementptr [6 x i8]* @hello, i32 0, i32 0 + + call i16* @strcpy(i8* %dst, i8* %src) +; CHECK: call i16* @strcpy + ret void +} diff --git a/test/Transforms/InstCombine/strcpy_chk-1.ll b/test/Transforms/InstCombine/strcpy_chk-1.ll index c03e8a348b8..3e48f4fd305 100644 --- a/test/Transforms/InstCombine/strcpy_chk-1.ll +++ b/test/Transforms/InstCombine/strcpy_chk-1.ll @@ -7,16 +7,16 @@ target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f3 @a = common global [60 x i8] zeroinitializer, align 1 @b = common global [60 x i8] zeroinitializer, align 1 -@.str = private constant [8 x i8] c"abcdefg\00" +@.str = private constant [12 x i8] c"abcdefghijk\00" ; Check cases where slen >= strlen (src). 
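; (Here "slen" is the object-size argument of __strcpy_chk. @.str holds
; "abcdefghijk\00", so strlen(src) + 1 == 12; whenever slen is known to be at
; least 12 -- 60 and 12 below, with -1 serving as "unknown/unlimited" -- the
; overflow check is provably redundant and the call can be lowered to a plain
; memcpy of the constant length, which is what the updated CHECK lines expect.)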
define void @test_simplify1() { ; CHECK: @test_simplify1 %dst = getelementptr inbounds [60 x i8]* @a, i32 0, i32 0 - %src = getelementptr inbounds [8 x i8]* @.str, i32 0, i32 0 + %src = getelementptr inbounds [12 x i8]* @.str, i32 0, i32 0 -; CHECK-NEXT: call i8* @strcpy +; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32 call i8* @__strcpy_chk(i8* %dst, i8* %src, i32 60) ret void } @@ -24,19 +24,19 @@ define void @test_simplify1() { define void @test_simplify2() { ; CHECK: @test_simplify2 %dst = getelementptr inbounds [60 x i8]* @a, i32 0, i32 0 - %src = getelementptr inbounds [8 x i8]* @.str, i32 0, i32 0 + %src = getelementptr inbounds [12 x i8]* @.str, i32 0, i32 0 -; CHECK-NEXT: call i8* @strcpy - call i8* @__strcpy_chk(i8* %dst, i8* %src, i32 8) +; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32 + call i8* @__strcpy_chk(i8* %dst, i8* %src, i32 12) ret void } define void @test_simplify3() { ; CHECK: @test_simplify3 %dst = getelementptr inbounds [60 x i8]* @a, i32 0, i32 0 - %src = getelementptr inbounds [8 x i8]* @.str, i32 0, i32 0 + %src = getelementptr inbounds [12 x i8]* @.str, i32 0, i32 0 -; CHECK-NEXT: call i8* @strcpy +; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32 call i8* @__strcpy_chk(i8* %dst, i8* %src, i32 -1) ret void } @@ -53,36 +53,42 @@ define void @test_simplify4() { ret void } -define void @test_no_simplify1() { -; CHECK: @test_no_simplify1 +; Check case where the string length is not constant. + +define void @test_simplify5() { +; CHECK: @test_simplify5 %dst = getelementptr inbounds [60 x i8]* @a, i32 0, i32 0 - %src = getelementptr inbounds [60 x i8]* @b, i32 0, i32 0 + %src = getelementptr inbounds [12 x i8]* @.str, i32 0, i32 0 -; CHECK-NEXT: call i8* @__strcpy_chk - call i8* @__strcpy_chk(i8* %dst, i8* %src, i32 8) +; CHECK: @__memcpy_chk + %len = call i32 @llvm.objectsize.i32(i8* %dst, i1 false) + call i8* @__strcpy_chk(i8* %dst, i8* %src, i32 %len) ret void } -; Check case were slen < strlen (src). +; Check case where the source and destination are the same. -define void @test_no_simplify2() { -; CHECK: @test_no_simplify2 +define i8* @test_simplify6() { +; CHECK: @test_simplify6 %dst = getelementptr inbounds [60 x i8]* @a, i32 0, i32 0 - %src = getelementptr inbounds [8 x i8]* @.str, i32 0, i32 0 -; CHECK-NEXT: call i8* @__strcpy_chk - call i8* @__strcpy_chk(i8* %dst, i8* %src, i32 3) - ret void +; CHECK: getelementptr inbounds ([60 x i8]* @a, i32 0, i32 0) + %len = call i32 @llvm.objectsize.i32(i8* %dst, i1 false) + %ret = call i8* @__strcpy_chk(i8* %dst, i8* %dst, i32 %len) + ret i8* %ret } -define void @test_no_simplify3() { -; CHECK: @test_no_simplify3 +; Check case where slen < strlen (src). + +define void @test_no_simplify1() { +; CHECK: @test_no_simplify1 %dst = getelementptr inbounds [60 x i8]* @a, i32 0, i32 0 - %src = getelementptr inbounds [8 x i8]* @.str, i32 0, i32 0 + %src = getelementptr inbounds [60 x i8]* @b, i32 0, i32 0 ; CHECK-NEXT: call i8* @__strcpy_chk - call i8* @__strcpy_chk(i8* %dst, i8* %src, i32 0) + call i8* @__strcpy_chk(i8* %dst, i8* %src, i32 8) ret void } declare i8* @__strcpy_chk(i8*, i8*, i32) nounwind +declare i32 @llvm.objectsize.i32(i8*, i1) nounwind readonly diff --git a/test/Transforms/LoopVectorize/2012-10-20-infloop.ll b/test/Transforms/LoopVectorize/2012-10-20-infloop.ll new file mode 100644 index 00000000000..5caaffc8dde --- /dev/null +++ b/test/Transforms/LoopVectorize/2012-10-20-infloop.ll @@ -0,0 +1,27 @@ +; RUN: opt < %s -loop-vectorize -dce + +; Check that we don't fall into an infinite loop. 
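+;
+; (The first loop below never leaves %for.body and the second exits only on an
+; undef condition; the test merely requires that -loop-vectorize terminates on
+; such CFGs, which is why the RUN line carries no FileCheck patterns.)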
+define void @test() nounwind { +entry: + br label %for.body + +for.body: + %0 = phi i32 [ 1, %entry ], [ 0, %for.body ] + br label %for.body +} + + + +define void @test2() nounwind { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv47 = phi i64 [ 0, %entry ], [ %indvars.iv.next48, %for.body ] + %0 = phi i32 [ 1, %entry ], [ 0, %for.body ] + %indvars.iv.next48 = add i64 %indvars.iv47, 1 + br i1 undef, label %for.end, label %for.body + +for.end: ; preds = %for.body + unreachable +} diff --git a/test/Transforms/LoopVectorize/gcc-examples.ll b/test/Transforms/LoopVectorize/gcc-examples.ll new file mode 100644 index 00000000000..6fb1792b2c8 --- /dev/null +++ b/test/Transforms/LoopVectorize/gcc-examples.ll @@ -0,0 +1,648 @@ +; RUN: opt < %s -loop-vectorize -dce -instcombine -licm -S | FileCheck %s + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.8.0" + +@b = common global [2048 x i32] zeroinitializer, align 16 +@c = common global [2048 x i32] zeroinitializer, align 16 +@a = common global [2048 x i32] zeroinitializer, align 16 +@G = common global [32 x [1024 x i32]] zeroinitializer, align 16 +@ub = common global [1024 x i32] zeroinitializer, align 16 +@uc = common global [1024 x i32] zeroinitializer, align 16 +@d = common global [2048 x i32] zeroinitializer, align 16 +@fa = common global [1024 x float] zeroinitializer, align 16 +@fb = common global [1024 x float] zeroinitializer, align 16 +@ic = common global [1024 x i32] zeroinitializer, align 16 +@da = common global [1024 x float] zeroinitializer, align 16 +@db = common global [1024 x float] zeroinitializer, align 16 +@dc = common global [1024 x float] zeroinitializer, align 16 +@dd = common global [1024 x float] zeroinitializer, align 16 +@dj = common global [1024 x i32] zeroinitializer, align 16 + +;CHECK: @example1 +;CHECK: load <4 x i32> +;CHECK: add <4 x i32> +;CHECK: store <4 x i32> +;CHECK: ret void +define void @example1() nounwind uwtable ssp { + br label %1 + +; <label>:1 ; preds = %1, %0 + %indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %1 ] + %2 = getelementptr inbounds [2048 x i32]* @b, i64 0, i64 %indvars.iv + %3 = load i32* %2, align 4 + %4 = getelementptr inbounds [2048 x i32]* @c, i64 0, i64 %indvars.iv + %5 = load i32* %4, align 4 + %6 = add nsw i32 %5, %3 + %7 = getelementptr inbounds [2048 x i32]* @a, i64 0, i64 %indvars.iv + store i32 %6, i32* %7, align 4 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, 256 + br i1 %exitcond, label %8, label %1 + +; <label>:8 ; preds = %1 + ret void +} + +;CHECK: @example2 +;CHECK: store <4 x i32> +;CHECK: ret void +define void @example2(i32 %n, i32 %x) nounwind uwtable ssp { + %1 = icmp sgt i32 %n, 0 + br i1 %1, label %.lr.ph5, label %.preheader + +..preheader_crit_edge: ; preds = %.lr.ph5 + %phitmp = sext i32 %n to i64 + br label %.preheader + +.preheader: ; preds = %..preheader_crit_edge, %0 + %i.0.lcssa = phi i64 [ %phitmp, %..preheader_crit_edge ], [ 0, %0 ] + %2 = icmp eq i32 %n, 0 + br i1 %2, label %._crit_edge, label %.lr.ph + +.lr.ph5: ; preds = %0, %.lr.ph5 + %indvars.iv6 = phi i64 [ %indvars.iv.next7, %.lr.ph5 ], [ 0, %0 ] + %3 = getelementptr inbounds [2048 x i32]* @b, i64 0, i64 %indvars.iv6 + store i32 %x, i32* %3, align 4 + %indvars.iv.next7 = add i64 %indvars.iv6, 1 + %lftr.wideiv = trunc i64 
%indvars.iv.next7 to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %n + br i1 %exitcond, label %..preheader_crit_edge, label %.lr.ph5 + +.lr.ph: ; preds = %.preheader, %.lr.ph + %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ %i.0.lcssa, %.preheader ] + %.02 = phi i32 [ %4, %.lr.ph ], [ %n, %.preheader ] + %4 = add nsw i32 %.02, -1 + %5 = getelementptr inbounds [2048 x i32]* @b, i64 0, i64 %indvars.iv + %6 = load i32* %5, align 4 + %7 = getelementptr inbounds [2048 x i32]* @c, i64 0, i64 %indvars.iv + %8 = load i32* %7, align 4 + %9 = and i32 %8, %6 + %10 = getelementptr inbounds [2048 x i32]* @a, i64 0, i64 %indvars.iv + store i32 %9, i32* %10, align 4 + %indvars.iv.next = add i64 %indvars.iv, 1 + %11 = icmp eq i32 %4, 0 + br i1 %11, label %._crit_edge, label %.lr.ph + +._crit_edge: ; preds = %.lr.ph, %.preheader + ret void +} + +; We can't vectorize this loop because it has non constant loop bounds. +;CHECK: @example3 +;CHECK-NOT: <4 x i32> +;CHECK: ret void +define void @example3(i32 %n, i32* noalias nocapture %p, i32* noalias nocapture %q) nounwind uwtable ssp { + %1 = icmp eq i32 %n, 0 + br i1 %1, label %._crit_edge, label %.lr.ph + +.lr.ph: ; preds = %0, %.lr.ph + %.05 = phi i32 [ %2, %.lr.ph ], [ %n, %0 ] + %.014 = phi i32* [ %5, %.lr.ph ], [ %p, %0 ] + %.023 = phi i32* [ %3, %.lr.ph ], [ %q, %0 ] + %2 = add nsw i32 %.05, -1 + %3 = getelementptr inbounds i32* %.023, i64 1 + %4 = load i32* %.023, align 16 + %5 = getelementptr inbounds i32* %.014, i64 1 + store i32 %4, i32* %.014, align 16 + %6 = icmp eq i32 %2, 0 + br i1 %6, label %._crit_edge, label %.lr.ph + +._crit_edge: ; preds = %.lr.ph, %0 + ret void +} + +;CHECK: @example4 +;CHECK: load <4 x i32> +;CHECK: ret void +define void @example4(i32 %n, i32* noalias nocapture %p, i32* noalias nocapture %q) nounwind uwtable ssp { + %1 = add nsw i32 %n, -1 + %2 = icmp eq i32 %n, 0 + br i1 %2, label %.preheader4, label %.lr.ph10 + +.preheader4: ; preds = %0 + %3 = icmp sgt i32 %1, 0 + br i1 %3, label %.lr.ph6, label %._crit_edge + +.lr.ph10: ; preds = %0, %.lr.ph10 + %4 = phi i32 [ %9, %.lr.ph10 ], [ %1, %0 ] + %.018 = phi i32* [ %8, %.lr.ph10 ], [ %p, %0 ] + %.027 = phi i32* [ %5, %.lr.ph10 ], [ %q, %0 ] + %5 = getelementptr inbounds i32* %.027, i64 1 + %6 = load i32* %.027, align 16 + %7 = add nsw i32 %6, 5 + %8 = getelementptr inbounds i32* %.018, i64 1 + store i32 %7, i32* %.018, align 16 + %9 = add nsw i32 %4, -1 + %10 = icmp eq i32 %4, 0 + br i1 %10, label %._crit_edge, label %.lr.ph10 + +.preheader: ; preds = %.lr.ph6 + br i1 %3, label %.lr.ph, label %._crit_edge + +.lr.ph6: ; preds = %.preheader4, %.lr.ph6 + %indvars.iv11 = phi i64 [ %indvars.iv.next12, %.lr.ph6 ], [ 0, %.preheader4 ] + %indvars.iv.next12 = add i64 %indvars.iv11, 1 + %11 = getelementptr inbounds [2048 x i32]* @b, i64 0, i64 %indvars.iv.next12 + %12 = load i32* %11, align 4 + %13 = add nsw i64 %indvars.iv11, 3 + %14 = getelementptr inbounds [2048 x i32]* @c, i64 0, i64 %13 + %15 = load i32* %14, align 4 + %16 = add nsw i32 %15, %12 + %17 = getelementptr inbounds [2048 x i32]* @a, i64 0, i64 %indvars.iv11 + store i32 %16, i32* %17, align 4 + %lftr.wideiv13 = trunc i64 %indvars.iv.next12 to i32 + %exitcond14 = icmp eq i32 %lftr.wideiv13, %1 + br i1 %exitcond14, label %.preheader, label %.lr.ph6 + +.lr.ph: ; preds = %.preheader, %.lr.ph + %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %.preheader ] + %18 = getelementptr inbounds [2048 x i32]* @a, i64 0, i64 %indvars.iv + %19 = load i32* %18, align 4 + %20 = icmp sgt i32 %19, 4 + %21 = select i1 %20, 
i32 4, i32 0 + %22 = getelementptr inbounds [2048 x i32]* @b, i64 0, i64 %indvars.iv + store i32 %21, i32* %22, align 4 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %1 + br i1 %exitcond, label %._crit_edge, label %.lr.ph + +._crit_edge: ; preds = %.lr.ph10, %.preheader4, %.lr.ph, %.preheader + ret void +} + +;CHECK: @example8 +;CHECK: store <4 x i32> +;CHECK: ret void +define void @example8(i32 %x) nounwind uwtable ssp { + br label %.preheader + +.preheader: ; preds = %3, %0 + %indvars.iv3 = phi i64 [ 0, %0 ], [ %indvars.iv.next4, %3 ] + br label %1 + +; <label>:1 ; preds = %1, %.preheader + %indvars.iv = phi i64 [ 0, %.preheader ], [ %indvars.iv.next, %1 ] + %2 = getelementptr inbounds [32 x [1024 x i32]]* @G, i64 0, i64 %indvars.iv3, i64 %indvars.iv + store i32 %x, i32* %2, align 4 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, 1024 + br i1 %exitcond, label %3, label %1 + +; <label>:3 ; preds = %1 + %indvars.iv.next4 = add i64 %indvars.iv3, 1 + %lftr.wideiv5 = trunc i64 %indvars.iv.next4 to i32 + %exitcond6 = icmp eq i32 %lftr.wideiv5, 32 + br i1 %exitcond6, label %4, label %.preheader + +; <label>:4 ; preds = %3 + ret void +} + +;CHECK: @example9 +;CHECK: phi <4 x i32> +;CHECK: ret i32 +define i32 @example9() nounwind uwtable readonly ssp { + br label %1 + +; <label>:1 ; preds = %1, %0 + %indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %1 ] + %diff.01 = phi i32 [ 0, %0 ], [ %7, %1 ] + %2 = getelementptr inbounds [1024 x i32]* @ub, i64 0, i64 %indvars.iv + %3 = load i32* %2, align 4 + %4 = getelementptr inbounds [1024 x i32]* @uc, i64 0, i64 %indvars.iv + %5 = load i32* %4, align 4 + %6 = add i32 %3, %diff.01 + %7 = sub i32 %6, %5 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, 1024 + br i1 %exitcond, label %8, label %1 + +; <label>:8 ; preds = %1 + ret i32 %7 +} + +;CHECK: @example10a +;CHECK: load <4 x i16> +;CHECK: add <4 x i16> +;CHECK: store <4 x i16> +;CHECK: ret void +define void @example10a(i16* noalias nocapture %sa, i16* noalias nocapture %sb, i16* noalias nocapture %sc, i32* noalias nocapture %ia, i32* noalias nocapture %ib, i32* noalias nocapture %ic) nounwind uwtable ssp { + br label %1 + +; <label>:1 ; preds = %1, %0 + %indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %1 ] + %2 = getelementptr inbounds i32* %ib, i64 %indvars.iv + %3 = load i32* %2, align 4 + %4 = getelementptr inbounds i32* %ic, i64 %indvars.iv + %5 = load i32* %4, align 4 + %6 = add nsw i32 %5, %3 + %7 = getelementptr inbounds i32* %ia, i64 %indvars.iv + store i32 %6, i32* %7, align 4 + %8 = getelementptr inbounds i16* %sb, i64 %indvars.iv + %9 = load i16* %8, align 2 + %10 = getelementptr inbounds i16* %sc, i64 %indvars.iv + %11 = load i16* %10, align 2 + %12 = add i16 %11, %9 + %13 = getelementptr inbounds i16* %sa, i64 %indvars.iv + store i16 %12, i16* %13, align 2 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, 1024 + br i1 %exitcond, label %14, label %1 + +; <label>:14 ; preds = %1 + ret void +} + +;CHECK: @example10b +;CHECK: load <4 x i16> +;CHECK: sext <4 x i16> +;CHECK: store <4 x i32> +;CHECK: ret void +define void @example10b(i16* noalias nocapture %sa, i16* noalias nocapture %sb, i16* noalias nocapture %sc, i32* noalias nocapture %ia, i32* 
noalias nocapture %ib, i32* noalias nocapture %ic) nounwind uwtable ssp { + br label %1 + +; <label>:1 ; preds = %1, %0 + %indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %1 ] + %2 = getelementptr inbounds i16* %sb, i64 %indvars.iv + %3 = load i16* %2, align 2 + %4 = sext i16 %3 to i32 + %5 = getelementptr inbounds i32* %ia, i64 %indvars.iv + store i32 %4, i32* %5, align 4 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, 1024 + br i1 %exitcond, label %6, label %1 + +; <label>:6 ; preds = %1 + ret void +} + +;CHECK: @example11 +;CHECK: load i32 +;CHECK: load i32 +;CHECK: load i32 +;CHECK: load i32 +;CHECK: insertelement +;CHECK: insertelement +;CHECK: insertelement +;CHECK: insertelement +;CHECK: ret void +define void @example11() nounwind uwtable ssp { + br label %1 + +; <label>:1 ; preds = %1, %0 + %indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %1 ] + %2 = shl nsw i64 %indvars.iv, 1 + %3 = or i64 %2, 1 + %4 = getelementptr inbounds [2048 x i32]* @b, i64 0, i64 %3 + %5 = load i32* %4, align 4 + %6 = getelementptr inbounds [2048 x i32]* @c, i64 0, i64 %3 + %7 = load i32* %6, align 4 + %8 = mul nsw i32 %7, %5 + %9 = getelementptr inbounds [2048 x i32]* @b, i64 0, i64 %2 + %10 = load i32* %9, align 8 + %11 = getelementptr inbounds [2048 x i32]* @c, i64 0, i64 %2 + %12 = load i32* %11, align 8 + %13 = mul nsw i32 %12, %10 + %14 = sub nsw i32 %8, %13 + %15 = getelementptr inbounds [2048 x i32]* @a, i64 0, i64 %indvars.iv + store i32 %14, i32* %15, align 4 + %16 = mul nsw i32 %7, %10 + %17 = mul nsw i32 %12, %5 + %18 = add nsw i32 %17, %16 + %19 = getelementptr inbounds [2048 x i32]* @d, i64 0, i64 %indvars.iv + store i32 %18, i32* %19, align 4 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, 512 + br i1 %exitcond, label %20, label %1 + +; <label>:20 ; preds = %1 + ret void +} + +;CHECK: @example12 +;CHECK: trunc <4 x i64> +;CHECK: store <4 x i32> +;CHECK: ret void +define void @example12() nounwind uwtable ssp { + br label %1 + +; <label>:1 ; preds = %1, %0 + %indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %1 ] + %2 = getelementptr inbounds [2048 x i32]* @a, i64 0, i64 %indvars.iv + %3 = trunc i64 %indvars.iv to i32 + store i32 %3, i32* %2, align 4 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, 1024 + br i1 %exitcond, label %4, label %1 + +; <label>:4 ; preds = %1 + ret void +} + +; Can't vectorize because of reductions. 
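+; (Roughly: for each of 32 rows, the inner loop accumulates A[i][j] - B[i][j]
+; with stride 8 into a scalar that is stored to out[i] once the inner loop
+; finishes; per the comment above, that scalar reduction is what blocks
+; vectorization at this point.)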
+;CHECK: @example13 +;CHECK-NOT: <4 x i32> +;CHECK: ret void +define void @example13(i32** nocapture %A, i32** nocapture %B, i32* nocapture %out) nounwind uwtable ssp { + br label %.preheader + +.preheader: ; preds = %14, %0 + %indvars.iv4 = phi i64 [ 0, %0 ], [ %indvars.iv.next5, %14 ] + %1 = getelementptr inbounds i32** %A, i64 %indvars.iv4 + %2 = load i32** %1, align 8 + %3 = getelementptr inbounds i32** %B, i64 %indvars.iv4 + %4 = load i32** %3, align 8 + br label %5 + +; <label>:5 ; preds = %.preheader, %5 + %indvars.iv = phi i64 [ 0, %.preheader ], [ %indvars.iv.next, %5 ] + %diff.02 = phi i32 [ 0, %.preheader ], [ %11, %5 ] + %6 = getelementptr inbounds i32* %2, i64 %indvars.iv + %7 = load i32* %6, align 4 + %8 = getelementptr inbounds i32* %4, i64 %indvars.iv + %9 = load i32* %8, align 4 + %10 = add i32 %7, %diff.02 + %11 = sub i32 %10, %9 + %indvars.iv.next = add i64 %indvars.iv, 8 + %12 = trunc i64 %indvars.iv.next to i32 + %13 = icmp slt i32 %12, 1024 + br i1 %13, label %5, label %14 + +; <label>:14 ; preds = %5 + %15 = getelementptr inbounds i32* %out, i64 %indvars.iv4 + store i32 %11, i32* %15, align 4 + %indvars.iv.next5 = add i64 %indvars.iv4, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next5 to i32 + %exitcond = icmp eq i32 %lftr.wideiv, 32 + br i1 %exitcond, label %16, label %.preheader + +; <label>:16 ; preds = %14 + ret void +} + +; Can't vectorize because of reductions. +;CHECK: @example14 +;CHECK-NOT: <4 x i32> +;CHECK: ret void +define void @example14(i32** nocapture %in, i32** nocapture %coeff, i32* nocapture %out) nounwind uwtable ssp { +.preheader3: + br label %.preheader + +.preheader: ; preds = %11, %.preheader3 + %indvars.iv7 = phi i64 [ 0, %.preheader3 ], [ %indvars.iv.next8, %11 ] + %sum.05 = phi i32 [ 0, %.preheader3 ], [ %10, %11 ] + br label %0 + +; <label>:0 ; preds = %0, %.preheader + %indvars.iv = phi i64 [ 0, %.preheader ], [ %indvars.iv.next, %0 ] + %sum.12 = phi i32 [ %sum.05, %.preheader ], [ %10, %0 ] + %1 = getelementptr inbounds i32** %in, i64 %indvars.iv + %2 = load i32** %1, align 8 + %3 = getelementptr inbounds i32* %2, i64 %indvars.iv7 + %4 = load i32* %3, align 4 + %5 = getelementptr inbounds i32** %coeff, i64 %indvars.iv + %6 = load i32** %5, align 8 + %7 = getelementptr inbounds i32* %6, i64 %indvars.iv7 + %8 = load i32* %7, align 4 + %9 = mul nsw i32 %8, %4 + %10 = add nsw i32 %9, %sum.12 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, 1024 + br i1 %exitcond, label %11, label %0 + +; <label>:11 ; preds = %0 + %indvars.iv.next8 = add i64 %indvars.iv7, 1 + %lftr.wideiv9 = trunc i64 %indvars.iv.next8 to i32 + %exitcond10 = icmp eq i32 %lftr.wideiv9, 32 + br i1 %exitcond10, label %.preheader3.1, label %.preheader + +.preheader3.1: ; preds = %11 + store i32 %10, i32* %out, align 4 + br label %.preheader.1 + +.preheader.1: ; preds = %24, %.preheader3.1 + %indvars.iv7.1 = phi i64 [ 0, %.preheader3.1 ], [ %indvars.iv.next8.1, %24 ] + %sum.05.1 = phi i32 [ 0, %.preheader3.1 ], [ %23, %24 ] + br label %12 + +; <label>:12 ; preds = %12, %.preheader.1 + %indvars.iv.1 = phi i64 [ 0, %.preheader.1 ], [ %13, %12 ] + %sum.12.1 = phi i32 [ %sum.05.1, %.preheader.1 ], [ %23, %12 ] + %13 = add nsw i64 %indvars.iv.1, 1 + %14 = getelementptr inbounds i32** %in, i64 %13 + %15 = load i32** %14, align 8 + %16 = getelementptr inbounds i32* %15, i64 %indvars.iv7.1 + %17 = load i32* %16, align 4 + %18 = getelementptr inbounds i32** %coeff, i64 %indvars.iv.1 + %19 = load i32** %18, 
align 8 + %20 = getelementptr inbounds i32* %19, i64 %indvars.iv7.1 + %21 = load i32* %20, align 4 + %22 = mul nsw i32 %21, %17 + %23 = add nsw i32 %22, %sum.12.1 + %lftr.wideiv.1 = trunc i64 %13 to i32 + %exitcond.1 = icmp eq i32 %lftr.wideiv.1, 1024 + br i1 %exitcond.1, label %24, label %12 + +; <label>:24 ; preds = %12 + %indvars.iv.next8.1 = add i64 %indvars.iv7.1, 1 + %lftr.wideiv9.1 = trunc i64 %indvars.iv.next8.1 to i32 + %exitcond10.1 = icmp eq i32 %lftr.wideiv9.1, 32 + br i1 %exitcond10.1, label %.preheader3.2, label %.preheader.1 + +.preheader3.2: ; preds = %24 + %25 = getelementptr inbounds i32* %out, i64 1 + store i32 %23, i32* %25, align 4 + br label %.preheader.2 + +.preheader.2: ; preds = %38, %.preheader3.2 + %indvars.iv7.2 = phi i64 [ 0, %.preheader3.2 ], [ %indvars.iv.next8.2, %38 ] + %sum.05.2 = phi i32 [ 0, %.preheader3.2 ], [ %37, %38 ] + br label %26 + +; <label>:26 ; preds = %26, %.preheader.2 + %indvars.iv.2 = phi i64 [ 0, %.preheader.2 ], [ %indvars.iv.next.2, %26 ] + %sum.12.2 = phi i32 [ %sum.05.2, %.preheader.2 ], [ %37, %26 ] + %27 = add nsw i64 %indvars.iv.2, 2 + %28 = getelementptr inbounds i32** %in, i64 %27 + %29 = load i32** %28, align 8 + %30 = getelementptr inbounds i32* %29, i64 %indvars.iv7.2 + %31 = load i32* %30, align 4 + %32 = getelementptr inbounds i32** %coeff, i64 %indvars.iv.2 + %33 = load i32** %32, align 8 + %34 = getelementptr inbounds i32* %33, i64 %indvars.iv7.2 + %35 = load i32* %34, align 4 + %36 = mul nsw i32 %35, %31 + %37 = add nsw i32 %36, %sum.12.2 + %indvars.iv.next.2 = add i64 %indvars.iv.2, 1 + %lftr.wideiv.2 = trunc i64 %indvars.iv.next.2 to i32 + %exitcond.2 = icmp eq i32 %lftr.wideiv.2, 1024 + br i1 %exitcond.2, label %38, label %26 + +; <label>:38 ; preds = %26 + %indvars.iv.next8.2 = add i64 %indvars.iv7.2, 1 + %lftr.wideiv9.2 = trunc i64 %indvars.iv.next8.2 to i32 + %exitcond10.2 = icmp eq i32 %lftr.wideiv9.2, 32 + br i1 %exitcond10.2, label %.preheader3.3, label %.preheader.2 + +.preheader3.3: ; preds = %38 + %39 = getelementptr inbounds i32* %out, i64 2 + store i32 %37, i32* %39, align 4 + br label %.preheader.3 + +.preheader.3: ; preds = %52, %.preheader3.3 + %indvars.iv7.3 = phi i64 [ 0, %.preheader3.3 ], [ %indvars.iv.next8.3, %52 ] + %sum.05.3 = phi i32 [ 0, %.preheader3.3 ], [ %51, %52 ] + br label %40 + +; <label>:40 ; preds = %40, %.preheader.3 + %indvars.iv.3 = phi i64 [ 0, %.preheader.3 ], [ %indvars.iv.next.3, %40 ] + %sum.12.3 = phi i32 [ %sum.05.3, %.preheader.3 ], [ %51, %40 ] + %41 = add nsw i64 %indvars.iv.3, 3 + %42 = getelementptr inbounds i32** %in, i64 %41 + %43 = load i32** %42, align 8 + %44 = getelementptr inbounds i32* %43, i64 %indvars.iv7.3 + %45 = load i32* %44, align 4 + %46 = getelementptr inbounds i32** %coeff, i64 %indvars.iv.3 + %47 = load i32** %46, align 8 + %48 = getelementptr inbounds i32* %47, i64 %indvars.iv7.3 + %49 = load i32* %48, align 4 + %50 = mul nsw i32 %49, %45 + %51 = add nsw i32 %50, %sum.12.3 + %indvars.iv.next.3 = add i64 %indvars.iv.3, 1 + %lftr.wideiv.3 = trunc i64 %indvars.iv.next.3 to i32 + %exitcond.3 = icmp eq i32 %lftr.wideiv.3, 1024 + br i1 %exitcond.3, label %52, label %40 + +; <label>:52 ; preds = %40 + %indvars.iv.next8.3 = add i64 %indvars.iv7.3, 1 + %lftr.wideiv9.3 = trunc i64 %indvars.iv.next8.3 to i32 + %exitcond10.3 = icmp eq i32 %lftr.wideiv9.3, 32 + br i1 %exitcond10.3, label %53, label %.preheader.3 + +; <label>:53 ; preds = %52 + %54 = getelementptr inbounds i32* %out, i64 3 + store i32 %51, i32* %54, align 4 + ret void +} + +; Can't vectorize because 
the src and dst pointers are not disjoint. +;CHECK: @example21 +;CHECK-NOT: <4 x i32> +;CHECK: ret i32 +define i32 @example21(i32* nocapture %b, i32 %n) nounwind uwtable readonly ssp { + %1 = icmp sgt i32 %n, 0 + br i1 %1, label %.lr.ph, label %._crit_edge + +.lr.ph: ; preds = %0 + %2 = sext i32 %n to i64 + br label %3 + +; <label>:3 ; preds = %.lr.ph, %3 + %indvars.iv = phi i64 [ %2, %.lr.ph ], [ %indvars.iv.next, %3 ] + %a.02 = phi i32 [ 0, %.lr.ph ], [ %6, %3 ] + %indvars.iv.next = add i64 %indvars.iv, -1 + %4 = getelementptr inbounds i32* %b, i64 %indvars.iv.next + %5 = load i32* %4, align 4 + %6 = add nsw i32 %5, %a.02 + %7 = trunc i64 %indvars.iv.next to i32 + %8 = icmp sgt i32 %7, 0 + br i1 %8, label %3, label %._crit_edge + +._crit_edge: ; preds = %3, %0 + %a.0.lcssa = phi i32 [ 0, %0 ], [ %6, %3 ] + ret i32 %a.0.lcssa +} + +; Can't vectorize because there are multiple PHIs. +;CHECK: @example23 +;CHECK-NOT: <4 x i32> +;CHECK: ret void +define void @example23(i16* nocapture %src, i32* nocapture %dst) nounwind uwtable ssp { + br label %1 + +; <label>:1 ; preds = %1, %0 + %.04 = phi i16* [ %src, %0 ], [ %2, %1 ] + %.013 = phi i32* [ %dst, %0 ], [ %6, %1 ] + %i.02 = phi i32 [ 0, %0 ], [ %7, %1 ] + %2 = getelementptr inbounds i16* %.04, i64 1 + %3 = load i16* %.04, align 2 + %4 = zext i16 %3 to i32 + %5 = shl nuw nsw i32 %4, 7 + %6 = getelementptr inbounds i32* %.013, i64 1 + store i32 %5, i32* %.013, align 4 + %7 = add nsw i32 %i.02, 1 + %exitcond = icmp eq i32 %7, 256 + br i1 %exitcond, label %8, label %1 + +; <label>:8 ; preds = %1 + ret void +} + +;CHECK: @example24 +;CHECK: shufflevector <4 x i16> +;CHECK: ret void +define void @example24(i16 signext %x, i16 signext %y) nounwind uwtable ssp { + br label %1 + +; <label>:1 ; preds = %1, %0 + %indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %1 ] + %2 = getelementptr inbounds [1024 x float]* @fa, i64 0, i64 %indvars.iv + %3 = load float* %2, align 4 + %4 = getelementptr inbounds [1024 x float]* @fb, i64 0, i64 %indvars.iv + %5 = load float* %4, align 4 + %6 = fcmp olt float %3, %5 + %x.y = select i1 %6, i16 %x, i16 %y + %7 = sext i16 %x.y to i32 + %8 = getelementptr inbounds [1024 x i32]* @ic, i64 0, i64 %indvars.iv + store i32 %7, i32* %8, align 4 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, 1024 + br i1 %exitcond, label %9, label %1 + +; <label>:9 ; preds = %1 + ret void +} + +;CHECK: @example25 +;CHECK: and <4 x i1> +;CHECK: zext <4 x i1> +;CHECK: ret void +define void @example25() nounwind uwtable ssp { + br label %1 + +; <label>:1 ; preds = %1, %0 + %indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %1 ] + %2 = getelementptr inbounds [1024 x float]* @da, i64 0, i64 %indvars.iv + %3 = load float* %2, align 4 + %4 = getelementptr inbounds [1024 x float]* @db, i64 0, i64 %indvars.iv + %5 = load float* %4, align 4 + %6 = fcmp olt float %3, %5 + %7 = getelementptr inbounds [1024 x float]* @dc, i64 0, i64 %indvars.iv + %8 = load float* %7, align 4 + %9 = getelementptr inbounds [1024 x float]* @dd, i64 0, i64 %indvars.iv + %10 = load float* %9, align 4 + %11 = fcmp olt float %8, %10 + %12 = and i1 %6, %11 + %13 = zext i1 %12 to i32 + %14 = getelementptr inbounds [1024 x i32]* @dj, i64 0, i64 %indvars.iv + store i32 %13, i32* %14, align 4 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, 1024 + br i1 %exitcond, label %15, label %1 + +; <label>:15 ; preds = %1 + 
ret void +} + diff --git a/test/Transforms/LoopVectorize/increment.ll b/test/Transforms/LoopVectorize/increment.ll new file mode 100644 index 00000000000..e818d685626 --- /dev/null +++ b/test/Transforms/LoopVectorize/increment.ll @@ -0,0 +1,66 @@ +; RUN: opt < %s -loop-vectorize -dce -instcombine -licm -S | FileCheck %s + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.8.0" + +@a = common global [2048 x i32] zeroinitializer, align 16 + +; This is the loop. +; for (i=0; i<n; i++){ +; a[i] += i; +; } +;CHECK: @inc +;CHECK: load <4 x i32> +;CHECK: add <4 x i32> +;CHECK: store <4 x i32> +;CHECK: ret void +define void @inc(i32 %n) nounwind uwtable noinline ssp { + %1 = icmp sgt i32 %n, 0 + br i1 %1, label %.lr.ph, label %._crit_edge + +.lr.ph: ; preds = %0, %.lr.ph + %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ] + %2 = getelementptr inbounds [2048 x i32]* @a, i64 0, i64 %indvars.iv + %3 = load i32* %2, align 4 + %4 = trunc i64 %indvars.iv to i32 + %5 = add nsw i32 %3, %4 + store i32 %5, i32* %2, align 4 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %n + br i1 %exitcond, label %._crit_edge, label %.lr.ph + +._crit_edge: ; preds = %.lr.ph, %0 + ret void +} + +; Can't vectorize this loop because the access to A[X] is non linear. +; +; for (i = 0; i < n; ++i) { +; A[B[i]]++; +; +;CHECK: @histogram +;CHECK-NOT: <4 x i32> +;CHECK: ret i32 +define i32 @histogram(i32* nocapture noalias %A, i32* nocapture noalias %B, i32 %n) nounwind uwtable ssp { +entry: + %cmp6 = icmp sgt i32 %n, 0 + br i1 %cmp6, label %for.body, label %for.end + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds i32* %B, i64 %indvars.iv + %0 = load i32* %arrayidx, align 4 + %idxprom1 = sext i32 %0 to i64 + %arrayidx2 = getelementptr inbounds i32* %A, i64 %idxprom1 + %1 = load i32* %arrayidx2, align 4 + %inc = add nsw i32 %1, 1 + store i32 %inc, i32* %arrayidx2, align 4 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %n + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret i32 0 +} diff --git a/test/Transforms/LoopVectorize/induction_plus.ll b/test/Transforms/LoopVectorize/induction_plus.ll new file mode 100644 index 00000000000..bd90113e523 --- /dev/null +++ b/test/Transforms/LoopVectorize/induction_plus.ll @@ -0,0 +1,30 @@ +; RUN: opt < %s -loop-vectorize -instcombine -S | FileCheck %s + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.8.0" + +@array = common global [1024 x i32] zeroinitializer, align 16 + +;CHECK: @array_at_plus_one +;CHECK: add <4 x i64> +;CHECK: trunc <4 x i64> +;CHECK: add i64 %index, 12 +;CHECK: ret i32 +define i32 @array_at_plus_one(i32 %n) nounwind uwtable ssp { + %1 = icmp sgt i32 %n, 0 + br i1 %1, label %.lr.ph, label %._crit_edge + +.lr.ph: ; preds = %0, %.lr.ph + %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ] + %2 = add nsw i64 %indvars.iv, 12 + %3 = getelementptr inbounds [1024 x i32]* @array, i64 0, i64 %2 + %4 = trunc i64 %indvars.iv to i32 + 
store i32 %4, i32* %3, align 4 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %n + br i1 %exitcond, label %._crit_edge, label %.lr.ph + +._crit_edge: ; preds = %.lr.ph, %0 + ret i32 undef +} diff --git a/test/Transforms/LoopVectorize/lit.local.cfg b/test/Transforms/LoopVectorize/lit.local.cfg new file mode 100644 index 00000000000..19eebc0ac7a --- /dev/null +++ b/test/Transforms/LoopVectorize/lit.local.cfg @@ -0,0 +1 @@ +config.suffixes = ['.ll', '.c', '.cpp'] diff --git a/test/Transforms/LoopVectorize/non-const-n.ll b/test/Transforms/LoopVectorize/non-const-n.ll new file mode 100644 index 00000000000..04c5c84a4f4 --- /dev/null +++ b/test/Transforms/LoopVectorize/non-const-n.ll @@ -0,0 +1,38 @@ +; RUN: opt < %s -loop-vectorize -dce -instcombine -licm -S | FileCheck %s + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.8.0" + +@b = common global [2048 x i32] zeroinitializer, align 16 +@c = common global [2048 x i32] zeroinitializer, align 16 +@a = common global [2048 x i32] zeroinitializer, align 16 + +;CHECK: @example1 +;CHECK: shl i32 +;CHECK: zext i32 +;CHECK: load <4 x i32> +;CHECK: add <4 x i32> +;CHECK: store <4 x i32> +;CHECK: ret void +define void @example1(i32 %n) nounwind uwtable ssp { + %n4 = shl i32 %n, 2 + br label %1 + +; <label>:1 ; preds = %1, %0 + %indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %1 ] + %2 = getelementptr inbounds [2048 x i32]* @b, i64 0, i64 %indvars.iv + %3 = load i32* %2, align 4 + %4 = getelementptr inbounds [2048 x i32]* @c, i64 0, i64 %indvars.iv + %5 = load i32* %4, align 4 + %6 = add nsw i32 %5, %3 + %7 = getelementptr inbounds [2048 x i32]* @a, i64 0, i64 %indvars.iv + store i32 %6, i32* %7, align 4 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %n4 + br i1 %exitcond, label %8, label %1 + +; <label>:8 ; preds = %1 + ret void +} + diff --git a/test/Transforms/LoopVectorize/read-only.ll b/test/Transforms/LoopVectorize/read-only.ll new file mode 100644 index 00000000000..4095ea68ef7 --- /dev/null +++ b/test/Transforms/LoopVectorize/read-only.ll @@ -0,0 +1,32 @@ +; RUN: opt < %s -loop-vectorize -dce -instcombine -licm -S | FileCheck %s + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.8.0" + +;CHECK: @read_only_func +;CHECK: load <4 x i32> +;CHECK: ret i32 +define i32 @read_only_func(i32* nocapture %A, i32* nocapture %B, i32 %n) nounwind uwtable readonly ssp { + %1 = icmp sgt i32 %n, 0 + br i1 %1, label %.lr.ph, label %._crit_edge + +.lr.ph: ; preds = %0, %.lr.ph + %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ] + %sum.02 = phi i32 [ %9, %.lr.ph ], [ 0, %0 ] + %2 = getelementptr inbounds i32* %A, i64 %indvars.iv + %3 = load i32* %2, align 4 + %4 = add nsw i64 %indvars.iv, 13 + %5 = getelementptr inbounds i32* %B, i64 %4 + %6 = load i32* %5, align 4 + %7 = shl i32 %6, 1 + %8 = add i32 %3, %sum.02 + %9 = add i32 %8, %7 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %n + br i1 %exitcond, label %._crit_edge, label %.lr.ph + +._crit_edge: ; preds = %.lr.ph, %0 + 
%sum.0.lcssa = phi i32 [ 0, %0 ], [ %9, %.lr.ph ] + ret i32 %sum.0.lcssa +} diff --git a/test/Transforms/LoopVectorize/reduction.ll b/test/Transforms/LoopVectorize/reduction.ll new file mode 100644 index 00000000000..3e871b229b5 --- /dev/null +++ b/test/Transforms/LoopVectorize/reduction.ll @@ -0,0 +1,152 @@ +; RUN: opt < %s -loop-vectorize -dce -instcombine -licm -S | FileCheck %s + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.8.0" + +;CHECK: @reduction_sum +;CHECK: phi <4 x i32> +;CHECK: load <4 x i32> +;CHECK: add <4 x i32> +;CHECK: ret i32 +define i32 @reduction_sum(i32 %n, i32* noalias nocapture %A, i32* noalias nocapture %B) nounwind uwtable readonly noinline ssp { + %1 = icmp sgt i32 %n, 0 + br i1 %1, label %.lr.ph, label %._crit_edge + +.lr.ph: ; preds = %0, %.lr.ph + %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ] + %sum.02 = phi i32 [ %9, %.lr.ph ], [ 0, %0 ] + %2 = getelementptr inbounds i32* %A, i64 %indvars.iv + %3 = load i32* %2, align 4 + %4 = getelementptr inbounds i32* %B, i64 %indvars.iv + %5 = load i32* %4, align 4 + %6 = trunc i64 %indvars.iv to i32 + %7 = add i32 %sum.02, %6 + %8 = add i32 %7, %3 + %9 = add i32 %8, %5 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %n + br i1 %exitcond, label %._crit_edge, label %.lr.ph + +._crit_edge: ; preds = %.lr.ph, %0 + %sum.0.lcssa = phi i32 [ 0, %0 ], [ %9, %.lr.ph ] + ret i32 %sum.0.lcssa +} + +;CHECK: @reduction_prod +;CHECK: phi <4 x i32> +;CHECK: load <4 x i32> +;CHECK: mul <4 x i32> +;CHECK: ret i32 +define i32 @reduction_prod(i32 %n, i32* noalias nocapture %A, i32* noalias nocapture %B) nounwind uwtable readonly noinline ssp { + %1 = icmp sgt i32 %n, 0 + br i1 %1, label %.lr.ph, label %._crit_edge + +.lr.ph: ; preds = %0, %.lr.ph + %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ] + %prod.02 = phi i32 [ %9, %.lr.ph ], [ 1, %0 ] + %2 = getelementptr inbounds i32* %A, i64 %indvars.iv + %3 = load i32* %2, align 4 + %4 = getelementptr inbounds i32* %B, i64 %indvars.iv + %5 = load i32* %4, align 4 + %6 = trunc i64 %indvars.iv to i32 + %7 = mul i32 %prod.02, %6 + %8 = mul i32 %7, %3 + %9 = mul i32 %8, %5 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %n + br i1 %exitcond, label %._crit_edge, label %.lr.ph + +._crit_edge: ; preds = %.lr.ph, %0 + %prod.0.lcssa = phi i32 [ 1, %0 ], [ %9, %.lr.ph ] + ret i32 %prod.0.lcssa +} + +;CHECK: @reduction_mix +;CHECK: phi <4 x i32> +;CHECK: load <4 x i32> +;CHECK: mul <4 x i32> +;CHECK: ret i32 +define i32 @reduction_mix(i32 %n, i32* noalias nocapture %A, i32* noalias nocapture %B) nounwind uwtable readonly noinline ssp { + %1 = icmp sgt i32 %n, 0 + br i1 %1, label %.lr.ph, label %._crit_edge + +.lr.ph: ; preds = %0, %.lr.ph + %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ] + %sum.02 = phi i32 [ %9, %.lr.ph ], [ 0, %0 ] + %2 = getelementptr inbounds i32* %A, i64 %indvars.iv + %3 = load i32* %2, align 4 + %4 = getelementptr inbounds i32* %B, i64 %indvars.iv + %5 = load i32* %4, align 4 + %6 = mul nsw i32 %5, %3 + %7 = trunc i64 %indvars.iv to i32 + %8 = add i32 %sum.02, %7 + %9 = add i32 %8, %6 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 
%lftr.wideiv, %n + br i1 %exitcond, label %._crit_edge, label %.lr.ph + +._crit_edge: ; preds = %.lr.ph, %0 + %sum.0.lcssa = phi i32 [ 0, %0 ], [ %9, %.lr.ph ] + ret i32 %sum.0.lcssa +} + +;CHECK: @reduction_mul +;CHECK: mul <4 x i32> +;CHECK: ret i32 +define i32 @reduction_mul(i32 %n, i32* noalias nocapture %A, i32* noalias nocapture %B) nounwind uwtable readonly noinline ssp { + %1 = icmp sgt i32 %n, 0 + br i1 %1, label %.lr.ph, label %._crit_edge + +.lr.ph: ; preds = %0, %.lr.ph + %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ] + %sum.02 = phi i32 [ %9, %.lr.ph ], [ 19, %0 ] + %2 = getelementptr inbounds i32* %A, i64 %indvars.iv + %3 = load i32* %2, align 4 + %4 = getelementptr inbounds i32* %B, i64 %indvars.iv + %5 = load i32* %4, align 4 + %6 = trunc i64 %indvars.iv to i32 + %7 = add i32 %3, %6 + %8 = add i32 %7, %5 + %9 = mul i32 %8, %sum.02 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %n + br i1 %exitcond, label %._crit_edge, label %.lr.ph + +._crit_edge: ; preds = %.lr.ph, %0 + %sum.0.lcssa = phi i32 [ 0, %0 ], [ %9, %.lr.ph ] + ret i32 %sum.0.lcssa +} + +;CHECK: @start_at_non_zero +;CHECK: phi <4 x i32> +;CHECK: <i32 120, i32 0, i32 0, i32 0> +;CHECK: ret i32 +define i32 @start_at_non_zero(i32* nocapture %in, i32* nocapture %coeff, i32* nocapture %out, i32 %n) nounwind uwtable readonly ssp { +entry: + %cmp7 = icmp sgt i32 %n, 0 + br i1 %cmp7, label %for.body, label %for.end + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ] + %sum.09 = phi i32 [ %add, %for.body ], [ 120, %entry ] + %arrayidx = getelementptr inbounds i32* %in, i64 %indvars.iv + %0 = load i32* %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds i32* %coeff, i64 %indvars.iv + %1 = load i32* %arrayidx2, align 4 + %mul = mul nsw i32 %1, %0 + %add = add nsw i32 %mul, %sum.09 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %n + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + %sum.0.lcssa = phi i32 [ 120, %entry ], [ %add, %for.body ] + ret i32 %sum.0.lcssa +} + + diff --git a/test/Transforms/LoopVectorize/scalar-select.ll b/test/Transforms/LoopVectorize/scalar-select.ll new file mode 100644 index 00000000000..8d5b6fd8aff --- /dev/null +++ b/test/Transforms/LoopVectorize/scalar-select.ll @@ -0,0 +1,37 @@ +; RUN: opt < %s -loop-vectorize -dce -instcombine -licm -S | FileCheck %s + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.8.0" + +@a = common global [2048 x i32] zeroinitializer, align 16 +@b = common global [2048 x i32] zeroinitializer, align 16 +@c = common global [2048 x i32] zeroinitializer, align 16 + +;CHECK: @example1 +;CHECK: load <4 x i32> +; make sure that we have a scalar condition and a vector operand. 
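+; (Because %cond is loop-invariant, only the value operands of the select are
+; widened to <4 x i32>; the i1 condition itself stays scalar, which the first
+; CHECK line below pins down.)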
+;CHECK: select i1 %cond, <4 x i32>
+;CHECK: store <4 x i32>
+;CHECK: ret void
+define void @example1(i1 %cond) nounwind uwtable ssp {
+ br label %1
+
+; <label>:1 ; preds = %1, %0
+ %indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %1 ]
+ %2 = getelementptr inbounds [2048 x i32]* @b, i64 0, i64 %indvars.iv
+ %3 = load i32* %2, align 4
+ %4 = getelementptr inbounds [2048 x i32]* @c, i64 0, i64 %indvars.iv
+ %5 = load i32* %4, align 4
+ %6 = add nsw i32 %5, %3
+ %7 = getelementptr inbounds [2048 x i32]* @a, i64 0, i64 %indvars.iv
+ %sel = select i1 %cond, i32 %6, i32 zeroinitializer
+ store i32 %sel, i32* %7, align 4
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, 256
+ br i1 %exitcond, label %8, label %1
+
+; <label>:8 ; preds = %1
+ ret void
+}
+
diff --git a/test/Transforms/SROA/basictest.ll b/test/Transforms/SROA/basictest.ll index 644fda167d4..03120f7a327 100644
--- a/test/Transforms/SROA/basictest.ll
+++ b/test/Transforms/SROA/basictest.ll
@@ -1063,3 +1063,23 @@ entry:
 call void @llvm.lifetime.end(i64 -1, i8* %0)
 ret void
}
+
+define void @PR14105({ [16 x i8] }* %ptr) {
+; Ensure that when rewriting the GEP index '-1' for this alloca we preserve its
+; sign as negative. We use a volatile memcpy to ensure promotion never actually
+; occurs.
+; CHECK: @PR14105
+
+entry:
+ %a = alloca { [16 x i8] }, align 8
+; CHECK: alloca [16 x i8], align 8
+
+ %gep = getelementptr inbounds { [16 x i8] }* %ptr, i64 -1
+; CHECK-NEXT: getelementptr inbounds { [16 x i8] }* %ptr, i64 -1, i32 0, i64 0
+
+ %cast1 = bitcast { [16 x i8 ] }* %gep to i8*
+ %cast2 = bitcast { [16 x i8 ] }* %a to i8*
+ call void @llvm.memcpy.p0i8.p0i8.i32(i8* %cast1, i8* %cast2, i32 16, i32 8, i1 true)
+ ret void
+; CHECK: ret
+}
diff --git a/test/Transforms/SimplifyLibCalls/FFS.ll b/test/Transforms/SimplifyLibCalls/FFS.ll index e38d78349d4..6aecbeacd7e 100644
--- a/test/Transforms/SimplifyLibCalls/FFS.ll
+++ b/test/Transforms/SimplifyLibCalls/FFS.ll
@@ -1,6 +1,7 @@
-; Test that the ToAsciiOptimizer works correctly
-; RUN: opt < %s -simplify-libcalls -S | \
-; RUN: not grep "call.*@ffs"
+; Test that FFSOpt works correctly
+; RUN: opt < %s -simplify-libcalls -S | FileCheck %s
+
+; CHECK-NOT: call{{.*}}@ffs
 @non_const = external global i32 ; <i32*> [#uses=1]
@@ -34,3 +35,11 @@ define i32 @a(i64) nounwind {
 %2 = call i32 @ffsll(i64 %0) ; <i32> [#uses=1]
 ret i32 %2
}
+
+; PR13028
+define i32 @b() nounwind {
+ %ffs = call i32 @ffsll(i64 0)
+ ret i32 %ffs
+; CHECK: @b
+; CHECK-NEXT: ret i32 0
+}
diff --git a/test/Transforms/SimplifyLibCalls/StrCpy.ll b/test/Transforms/SimplifyLibCalls/StrCpy.ll deleted file mode 100644 index 83406ff8f86..00000000000
--- a/test/Transforms/SimplifyLibCalls/StrCpy.ll
+++ /dev/null
@@ -1,37 +0,0 @@
-; Test that the StrCpyOptimizer works correctly
-; RUN: opt < %s -simplify-libcalls -S | FileCheck %s
-
-; This transformation requires the pointer size, as it assumes that size_t is
-; the size of a pointer.
-target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32"
-
-@hello = constant [6 x i8] c"hello\00"
-
-declare i8* @strcpy(i8*, i8*)
-
-declare i8* @__strcpy_chk(i8*, i8*, i32) nounwind
-
-declare i32 @llvm.objectsize.i32(i8*, i1) nounwind readonly
-
-; rdar://6839935
-
-define i32 @t1() {
-; CHECK: @t1
- %target = alloca [1024 x i8]
- %arg1 = getelementptr [1024 x i8]* %target, i32 0, i32 0
- %arg2 = getelementptr [6 x i8]* @hello, i32 0, i32 0
- %rslt1 = call i8* @strcpy( i8* %arg1, i8* %arg2 )
-; CHECK: @llvm.memcpy.p0i8.p0i8.i32
- ret i32 0
-}
-
-define i32 @t2() {
-; CHECK: @t2
- %target = alloca [1024 x i8]
- %arg1 = getelementptr [1024 x i8]* %target, i32 0, i32 0
- %arg2 = getelementptr [6 x i8]* @hello, i32 0, i32 0
- %tmp1 = call i32 @llvm.objectsize.i32(i8* %arg1, i1 false)
- %rslt1 = call i8* @__strcpy_chk(i8* %arg1, i8* %arg2, i32 %tmp1)
-; CHECK: @__memcpy_chk
- ret i32 0
-}
diff --git a/test/Transforms/TailCallElim/nocapture.ll b/test/Transforms/TailCallElim/nocapture.ll index 87cb9dd427b..5a1a9a6e7ce 100644
--- a/test/Transforms/TailCallElim/nocapture.ll
+++ b/test/Transforms/TailCallElim/nocapture.ll
@@ -1,9 +1,9 @@
 ; RUN: opt %s -tailcallelim -S | FileCheck %s
-; XFAIL: *
 declare void @use(i8* nocapture, i8* nocapture)
+declare void @boring()
-define i8* @foo(i8* nocapture %A, i1 %cond) {
+define i8* @test1(i8* nocapture %A, i1 %cond) {
 ; CHECK: tailrecurse:
 ; CHECK: %A.tr = phi i8* [ %A, %0 ], [ %B, %cond_true ]
 ; CHECK: %cond.tr = phi i1 [ %cond, %0 ], [ false, %cond_true ]
@@ -14,12 +14,27 @@ define i8* @foo(i8* nocapture %A, i1 %cond) {
cond_true:
 ; CHECK: cond_true:
 ; CHECK: br label %tailrecurse
- call i8* @foo(i8* %B, i1 false)
+ call i8* @test1(i8* %B, i1 false)
 ret i8* null
cond_false:
 ; CHECK: cond_false
 call void @use(i8* %A, i8* %B)
-; CHECK: tail call void @use(i8* %A.tr, i8* %B)
+; CHECK: call void @use(i8* %A.tr, i8* %B)
+ call void @boring()
+; CHECK: tail call void @boring()
 ret i8* null
; CHECK: ret i8* null
}
+
+; PR14143
+define void @test2(i8* %a, i8* %b) {
+; CHECK: @test2
+; CHECK-NOT: tail call
+; CHECK: ret void
+ %c = alloca [100 x i8], align 16
+ %tmp = bitcast [100 x i8]* %c to i8*
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* %b, i8* %tmp, i64 100, i32 1, i1 false)
+ ret void
+}
+
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8*, i8*, i64, i32, i1)
diff --git a/test/lit.cfg b/test/lit.cfg index dc37317ba95..7e6760e95ae 100644
--- a/test/lit.cfg
+++ b/test/lit.cfg
@@ -146,13 +146,12 @@ if re.search(r'cygwin|mingw32|win32', mcjit_triple):
 mcjit_triple += "-elf"
 config.substitutions.append( ('%mcjit_triple', mcjit_triple) )
-# When running under valgrind, we mangle '-vg' or '-vg_leak' onto the end of the
-# triple so we can check it with XFAIL and XTARGET.
-config.target_triple += lit.valgrindTriple
-
 # Provide a substitution for those tests that need to run the jit to obtain data
 # but simply want to use the currently considered most reliable jit for platform
-defaultIsMCJIT='true' if 'arm' in config.target_triple else 'false'
+if 'arm' in config.target_triple:
+ defaultIsMCJIT = 'true'
+else:
+ defaultIsMCJIT = 'false'
 config.substitutions.append( ('%defaultjit', '-use-mcjit='+defaultIsMCJIT) )
 # Process jit implementation option
@@ -242,6 +241,10 @@ else:
 if loadable_module:
 config.available_features.add('loadable_module')
+# LTO
+if config.lto_is_enabled == "1":
+ config.available_features.add('lto')
+
 # llc knows whether it is compiled with -DNDEBUG.
import subprocess try: diff --git a/test/lit.site.cfg.in b/test/lit.site.cfg.in index 178b22f10f3..2bbe63e6348 100644 --- a/test/lit.site.cfg.in +++ b/test/lit.site.cfg.in @@ -11,6 +11,7 @@ config.python_executable = "@PYTHON_EXECUTABLE@" config.ocamlopt_executable = "@OCAMLOPT@" config.enable_shared = @ENABLE_SHARED@ config.enable_assertions = @ENABLE_ASSERTIONS@ +config.lto_is_enabled = "@LTO_IS_ENABLED@" config.targets_to_build = "@TARGETS_TO_BUILD@" config.llvm_bindings = "@LLVM_BINDINGS@" config.host_os = "@HOST_OS@" diff --git a/tools/bugpoint-passes/bugpoint.exports b/tools/bugpoint-passes/bugpoint.exports index d8fdd6a5767..e69de29bb2d 100644 --- a/tools/bugpoint-passes/bugpoint.exports +++ b/tools/bugpoint-passes/bugpoint.exports @@ -1 +0,0 @@ -_ZN4llvm14BasicBlockPass14doFinalizationERNS_6ModuleE diff --git a/tools/llvm-dwarfdump/llvm-dwarfdump.cpp b/tools/llvm-dwarfdump/llvm-dwarfdump.cpp index 38c3a1e76f5..309bc4ecd46 100644 --- a/tools/llvm-dwarfdump/llvm-dwarfdump.cpp +++ b/tools/llvm-dwarfdump/llvm-dwarfdump.cpp @@ -1,4 +1,4 @@ -//===-- llvm-dwarfdump.cpp - Debug info dumping utility for llvm -----------===// +//===-- llvm-dwarfdump.cpp - Debug info dumping utility for llvm ----------===// // // The LLVM Compiler Infrastructure // @@ -118,8 +118,8 @@ static void DumpInput(const StringRef &Filename) { if (PrintFunctions) SpecFlags |= DILineInfoSpecifier::FunctionName; if (PrintInlining) { - DIInliningInfo InliningInfo = dictx->getInliningInfoForAddress( - Address, SpecFlags); + DIInliningInfo InliningInfo = + dictx->getInliningInfoForAddress(Address, SpecFlags); uint32_t n = InliningInfo.getNumberOfFrames(); if (n == 0) { // Print one empty debug line info in any case. diff --git a/unittests/ExecutionEngine/JIT/JITTest.cpp b/unittests/ExecutionEngine/JIT/JITTest.cpp index ae6855e68bf..6933091949b 100644 --- a/unittests/ExecutionEngine/JIT/JITTest.cpp +++ b/unittests/ExecutionEngine/JIT/JITTest.cpp @@ -606,7 +606,7 @@ TEST_F(JITTest, FunctionIsRecompiledAndRelinked) { // program from the IR input to the JIT to assert that the JIT doesn't use its // definition. extern "C" int32_t JITTest_AvailableExternallyGlobal; -int32_t JITTest_AvailableExternallyGlobal = 42; +int32_t JITTest_AvailableExternallyGlobal LLVM_ATTRIBUTE_USED = 42; namespace { // Tests on ARM disabled as we're running the old jit diff --git a/unittests/ExecutionEngine/JIT/Makefile b/unittests/ExecutionEngine/JIT/Makefile index b535a6b2960..9e0bb9ea593 100644 --- a/unittests/ExecutionEngine/JIT/Makefile +++ b/unittests/ExecutionEngine/JIT/Makefile @@ -35,8 +35,15 @@ ifeq ($(USE_OPROFILE), 1) LINK_COMPONENTS += oprofilejit endif +EXPORTED_SYMBOL_FILE = $(PROJ_OBJ_DIR)/JITTests.exports include $(LLVM_SRC_ROOT)/unittests/Makefile.unittest # Permit these tests to use the JIT's symbolic lookup. LD.Flags += $(RDYNAMIC) + +# Symbol exports are necessary (at least for now) when building with LTO. 
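+# (The rule below derives JITTests.exports from JITTests.def by dropping the
+# first line with tail -n +2; the assumption is that this first line is the
+# def-file header rather than an exported symbol name.)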
+$(LLVMUnitTestExe): $(NativeExportsFile) +$(PROJ_OBJ_DIR)/JITTests.exports: $(PROJ_SRC_DIR)/JITTests.def $(PROJ_OBJ_DIR)/.dir + tail -n +2 $< > $@ + diff --git a/utils/TableGen/CodeGenInstruction.cpp b/utils/TableGen/CodeGenInstruction.cpp index 99d2f173a87..fd38672bfcf 100644 --- a/utils/TableGen/CodeGenInstruction.cpp +++ b/utils/TableGen/CodeGenInstruction.cpp @@ -233,11 +233,12 @@ static void ParseConstraint(const std::string &CStr, CGIOperandList &Ops) { if (wpos == std::string::npos) throw "Illegal format for tied-to constraint: '" + CStr + "'"; - std::pair<unsigned,unsigned> SrcOp = - Ops.ParseOperandName(Name.substr(wpos), false); - if (SrcOp > DestOp) - throw "Illegal tied-to operand constraint '" + CStr + "'"; - + std::string SrcOpName = Name.substr(wpos); + std::pair<unsigned,unsigned> SrcOp = Ops.ParseOperandName(SrcOpName, false); + if (SrcOp > DestOp) { + std::swap(SrcOp, DestOp); + std::swap(SrcOpName, DestOpName); + } unsigned FlatOpNo = Ops.getFlattenedOperandNumber(SrcOp); diff --git a/utils/lit/lit/ExampleTests/LLVM.InTree/test/Bar/bar-test.ll b/utils/lit/lit/ExampleTests/LLVM.InTree/test/Bar/bar-test.ll deleted file mode 100644 index 3017b13e48c..00000000000 --- a/utils/lit/lit/ExampleTests/LLVM.InTree/test/Bar/bar-test.ll +++ /dev/null @@ -1,3 +0,0 @@ -; RUN: true -; XFAIL: * -; XTARGET: darwin diff --git a/utils/lit/lit/ExampleTests/LLVM.InTree/test/lit.cfg b/utils/lit/lit/ExampleTests/LLVM.InTree/test/lit.cfg index e9df1e5b53b..3fdd63c2245 100644 --- a/utils/lit/lit/ExampleTests/LLVM.InTree/test/lit.cfg +++ b/utils/lit/lit/ExampleTests/LLVM.InTree/test/lit.cfg @@ -77,7 +77,7 @@ for line in open(os.path.join(config.llvm_obj_root, 'test', 'site.exp')): excludes = [] -# Provide target_triple for use in XFAIL and XTARGET. +# Provide target_triple for use in XFAIL. config.target_triple = site_exp['target_triplet'] # Provide llvm_supports_target for use in local configs. diff --git a/utils/lit/lit/ExampleTests/LLVM.OutOfTree/src/test/lit.cfg b/utils/lit/lit/ExampleTests/LLVM.OutOfTree/src/test/lit.cfg index e9df1e5b53b..3fdd63c2245 100644 --- a/utils/lit/lit/ExampleTests/LLVM.OutOfTree/src/test/lit.cfg +++ b/utils/lit/lit/ExampleTests/LLVM.OutOfTree/src/test/lit.cfg @@ -77,7 +77,7 @@ for line in open(os.path.join(config.llvm_obj_root, 'test', 'site.exp')): excludes = [] -# Provide target_triple for use in XFAIL and XTARGET. +# Provide target_triple for use in XFAIL. config.target_triple = site_exp['target_triplet'] # Provide llvm_supports_target for use in local configs. diff --git a/utils/lit/lit/ExampleTests/lit.cfg b/utils/lit/lit/ExampleTests/lit.cfg index 20ee37dcef2..2629918d9f6 100644 --- a/utils/lit/lit/ExampleTests/lit.cfg +++ b/utils/lit/lit/ExampleTests/lit.cfg @@ -23,4 +23,4 @@ config.test_exec_root = None config.target_triple = 'foo' # available_features: Used by ShTest and TclTest formats for REQUIRES checks. -config.available_features = ['some-feature-name'] +config.available_features.add('some-feature-name') diff --git a/utils/lit/lit/ExampleTests/vg-fail.c b/utils/lit/lit/ExampleTests/vg-fail.c new file mode 100644 index 00000000000..e3339ff91aa --- /dev/null +++ b/utils/lit/lit/ExampleTests/vg-fail.c @@ -0,0 +1,4 @@ +// This test should XPASS, when run without valgrind. 
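+// (The "valgrind" feature named in the XFAIL line below is only added to
+// lit's available_features when lit runs under --vg; see the TestingConfig
+// change later in this commit.)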
+ +// RUN: true +// XFAIL: valgrind diff --git a/utils/lit/lit/ExampleTests/xfail-feature.c b/utils/lit/lit/ExampleTests/xfail-feature.c new file mode 100644 index 00000000000..3444bf87008 --- /dev/null +++ b/utils/lit/lit/ExampleTests/xfail-feature.c @@ -0,0 +1,4 @@ +// This test should XPASS. + +// RUN: true +// XFAIL: some-feature-name diff --git a/utils/lit/lit/LitConfig.py b/utils/lit/lit/LitConfig.py index c71c0ccdea9..0a359a3db8e 100644 --- a/utils/lit/lit/LitConfig.py +++ b/utils/lit/lit/LitConfig.py @@ -42,14 +42,11 @@ class LitConfig: self.numWarnings = 0 self.valgrindArgs = [] - self.valgrindTriple = "" if self.useValgrind: - self.valgrindTriple = "-vg" self.valgrindArgs = ['valgrind', '-q', '--run-libc-freeres=no', '--tool=memcheck', '--trace-children=yes', '--error-exitcode=123'] if self.valgrindLeakCheck: - self.valgrindTriple += "_leak" self.valgrindArgs.append('--leak-check=full') else: # The default is 'summary'. diff --git a/utils/lit/lit/TestRunner.py b/utils/lit/lit/TestRunner.py index 71882b76f8b..0c1911ed356 100644 --- a/utils/lit/lit/TestRunner.py +++ b/utils/lit/lit/TestRunner.py @@ -370,27 +370,27 @@ def executeScript(test, litConfig, tmpBase, commands, cwd): return executeCommand(command, cwd=cwd, env=test.config.environment) -def isExpectedFail(xfails, xtargets, target_triple): - # Check if any xfail matches this target. +def isExpectedFail(test, xfails): + # Check if any of the xfails match an available feature or the target. for item in xfails: - if item == '*' or item in target_triple: - break - else: - return False + # If this is the wildcard, it always fails. + if item == '*': + return True - # If so, see if it is expected to pass on this target. - # - # FIXME: Rename XTARGET to something that makes sense, like XPASS. - for item in xtargets: - if item == '*' or item in target_triple: - return False + # If this is an exact match for one of the features, it fails. + if item in test.config.available_features: + return True + + # If this is a part of the target triple, it fails. + if item in test.suite.config.target_triple: + return True - return True + return False def parseIntegratedTestScript(test, normalize_slashes=False, extra_substitutions=[]): """parseIntegratedTestScript - Scan an LLVM/Clang style integrated test - script and extract the lines to 'RUN' as well as 'XFAIL' and 'XTARGET' + script and extract the lines to 'RUN' as well as 'XFAIL' and 'REQUIRES' information. The RUN lines also will have variable substitution performed. """ @@ -431,7 +431,6 @@ def parseIntegratedTestScript(test, normalize_slashes=False, # Collect the test lines from the script. 
script = [] xfails = [] - xtargets = [] requires = [] for ln in open(sourcepath): if 'RUN:' in ln: @@ -450,9 +449,6 @@ def parseIntegratedTestScript(test, normalize_slashes=False, elif 'XFAIL:' in ln: items = ln[ln.index('XFAIL:') + 6:].split(',') xfails.extend([s.strip() for s in items]) - elif 'XTARGET:' in ln: - items = ln[ln.index('XTARGET:') + 8:].split(',') - xtargets.extend([s.strip() for s in items]) elif 'REQUIRES:' in ln: items = ln[ln.index('REQUIRES:') + 9:].split(',') requires.extend([s.strip() for s in items]) @@ -491,7 +487,7 @@ def parseIntegratedTestScript(test, normalize_slashes=False, return (Test.UNSUPPORTED, "Test requires the following features: %s" % msg) - isXFail = isExpectedFail(xfails, xtargets, test.suite.config.target_triple) + isXFail = isExpectedFail(test, xfails) return script,isXFail,tmpBase,execdir def formatTestOutput(status, out, err, exitCode, failDueToStderr, script): diff --git a/utils/lit/lit/TestingConfig.py b/utils/lit/lit/TestingConfig.py index 223120c4fe2..a1f79a3bfc4 100644 --- a/utils/lit/lit/TestingConfig.py +++ b/utils/lit/lit/TestingConfig.py @@ -16,6 +16,7 @@ class TestingConfig: 'PATH' : os.pathsep.join(litConfig.path + [os.environ.get('PATH','')]), 'SYSTEMROOT' : os.environ.get('SYSTEMROOT',''), + 'TERM' : os.environ.get('TERM',''), 'LLVM_DISABLE_CRASH_REPORT' : '1', } @@ -28,6 +29,13 @@ class TestingConfig: 'TMP' : os.environ.get('TMP',''), }) + # Set the default available features based on the LitConfig. + available_features = [] + if litConfig.useValgrind: + available_features.append('valgrind') + if litConfig.valgrindLeakCheck: + available_features.append('vg_leak') + config = TestingConfig(parent, name = '<unnamed>', suffixes = set(), @@ -39,7 +47,7 @@ class TestingConfig: test_exec_root = None, test_source_root = None, excludes = [], - available_features = []) + available_features = available_features) if os.path.exists(path): # FIXME: Improve detection and error reporting of errors in the diff --git a/utils/vim/llvm.vim b/utils/vim/llvm.vim index 9cd483d1479..f80de83da8c 100644 --- a/utils/vim/llvm.vim +++ b/utils/vim/llvm.vim @@ -79,7 +79,6 @@ syn match llvmSpecialComment /;\s*RUN:.*$/ syn match llvmSpecialComment /;\s*PR\d*\s*$/ syn match llvmSpecialComment /;\s*END\.\s*$/ syn match llvmSpecialComment /;\s*XFAIL:.*$/ -syn match llvmSpecialComment /;\s*XTARGET:.*$/ if version >= 508 || !exists("did_c_syn_inits") if version < 508 |