author    | tstellar <tstellar@91177308-0d34-0410-b5e6-96231b3b80d8> | 2012-10-22 14:55:30 +0000
committer | tstellar <tstellar@91177308-0d34-0410-b5e6-96231b3b80d8> | 2012-10-22 14:55:30 +0000
commit    | 81d3998812b5977a220d62076cc446822747af21 (patch)
tree      | b073cb1be9c65dbdd6cd312c322efd3c04abb0af
parent    | 63a079bf76d139b91e170557cf16b17bdbecd58d (diff)
Merge master branch
git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/R600/@166411 91177308-0d34-0410-b5e6-96231b3b80d8
164 files changed, 5586 insertions, 783 deletions
diff --git a/bindings/ocaml/executionengine/llvm_executionengine.ml b/bindings/ocaml/executionengine/llvm_executionengine.ml
index b6178dc055e..ddb53bbb5af 100644
--- a/bindings/ocaml/executionengine/llvm_executionengine.ml
+++ b/bindings/ocaml/executionengine/llvm_executionengine.ml
@@ -84,7 +84,7 @@ module ExecutionEngine = struct
     = "llvm_ee_free_machine_code"

   external target_data: t -> Llvm_target.DataLayout.t
-    = "LLVMGetExecutionEngineDataLayout"
+    = "LLVMGetExecutionEngineTargetData"

   (* The following are not bound. Patches are welcome.
diff --git a/docs/CodingStandards.rst b/docs/CodingStandards.rst
index de50e6eeaf5..418e3f05a36 100644
--- a/docs/CodingStandards.rst
+++ b/docs/CodingStandards.rst
@@ -146,6 +146,132 @@ useful to use C style (``/* */``) comments however:
 To comment out a large block of code, use ``#if 0`` and ``#endif``. These nest
 properly and are better behaved in general than C style comments.

+Doxygen Use in Documentation Comments
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Use the ``\file`` command to turn the standard file header into a file-level
+comment.
+
+Include descriptive ``\brief`` paragraphs for all public interfaces (public
+classes, member and non-member functions).  Explain API use and purpose in
+``\brief`` paragraphs, don't just restate the information that can be inferred
+from the API name.  Put detailed discussion into separate paragraphs.
+
+To refer to parameter names inside a paragraph, use the ``\p name`` command.
+Don't use the ``\arg name`` command since it starts a new paragraph that
+contains documentation for the parameter.
+
+Wrap non-inline code examples in ``\code ... \endcode``.
+
+To document a function parameter, start a new paragraph with the
+``\param name`` command.  If the parameter is used as an out or an in/out
+parameter, use the ``\param [out] name`` or ``\param [in,out] name`` command,
+respectively.
+
+To describe function return value, start a new paragraph with the ``\returns``
+command.
+
+A minimal documentation comment:
+
+.. code-block:: c++
+
+  /// \brief Does foo and bar.
+  void fooBar(bool Baz);
+
+A documentation comment that uses all Doxygen features in a preferred way:
+
+.. code-block:: c++
+
+  /// \brief Does foo and bar.
+  ///
+  /// Does not do foo the usual way if \p Baz is true.
+  ///
+  /// Typical usage:
+  /// \code
+  ///   fooBar(false, "quux", Res);
+  /// \endcode
+  ///
+  /// \param Quux kind of foo to do.
+  /// \param [out] Result filled with bar sequence on foo success.
+  ///
+  /// \returns true on success.
+  bool fooBar(bool Baz, StringRef Quux, std::vector<int> &Result);
+
+Don't duplicate the documentation comment in the header file and in the
+implementation file.  Put the documentation comments for public APIs into the
+header file.  Documentation comments for private APIs can go to the
+implementation file.  In any case, implementation files can include additional
+comments (not necessarily in Doxygen markup) to explain implementation details
+as needed.
+
+Don't duplicate function or class name at the beginning of the comment.
+For humans it is obvious which function or class is being documented;
+automatic documentation processing tools are smart enough to bind the comment
+to the correct declaration.
+
+Wrong:
+
+.. code-block:: c++
+
+  // In Something.h:
+
+  /// Something - An abstraction for some complicated thing.
+  class Something {
+  public:
+    /// fooBar - Does foo and bar.
+    void fooBar();
+  };
+
+  // In Something.cpp:
+
+  /// fooBar - Does foo and bar.
+  void Something::fooBar() { ... }
+
+Correct:
+
+.. code-block:: c++
+
+  // In Something.h:
+
+  /// \brief An abstraction for some complicated thing.
+  class Something {
+  public:
+    /// \brief Does foo and bar.
+    void fooBar();
+  };
+
+  // In Something.cpp:
+
+  // Builds a B-tree in order to do foo.  See paper by...
+  void Something::fooBar() { ... }
+
+It is not required to use additional Doxygen features, but sometimes it might
+be a good idea to do so.
+
+Consider:
+
+* adding comments to any narrow namespace containing a collection of
+  related functions or types;
+
+* using top-level groups to organize a collection of related functions at
+  namespace scope where the grouping is smaller than the namespace;
+
+* using member groups and additional comments attached to member
+  groups to organize within a class.
+
+For example:
+
+.. code-block:: c++
+
+  class Something {
+    /// \name Functions that do Foo.
+    /// @{
+    void fooBar();
+    void fooBaz();
+    /// @}
+    ...
+  };
+
 ``#include`` Style
 ^^^^^^^^^^^^^^^^^^

@@ -604,8 +730,7 @@ code to be structured like this:

 .. code-block:: c++

-  /// containsFoo - Return true if the specified list has an element that is
-  /// a foo.
+  /// \returns true if the specified list has an element that is a foo.
   static bool containsFoo(const std::vector<Bar*> &List) {
     for (unsigned i = 0, e = List.size(); i != e; ++i)
       if (List[i]->isFoo())
@@ -1051,21 +1176,21 @@ If a namespace definition is small and *easily* fits on a screen (say, less than

   namespace llvm {
   namespace X86 {
-    /// RelocationType - An enum for the x86 relocation codes. Note that
+    /// \brief An enum for the x86 relocation codes.  Note that
     /// the terminology here doesn't follow x86 convention - word means
     /// 32-bit and dword means 64-bit.
     enum RelocationType {
-      /// reloc_pcrel_word - PC relative relocation, add the relocated value to
+      /// \brief PC relative relocation, add the relocated value to
       /// the value already in memory, after we adjust it for where the PC is.
       reloc_pcrel_word = 0,

-      /// reloc_picrel_word - PIC base relative relocation, add the relocated
-      /// value to the value already in memory, after we adjust it for where the
+      /// \brief PIC base relative relocation, add the relocated value to
+      /// the value already in memory, after we adjust it for where the
       /// PIC base is.
       reloc_picrel_word = 1,

-      /// reloc_absolute_word, reloc_absolute_dword - Absolute relocation, just
-      /// add the relocated value to the value already in memory.
+      /// \brief Absolute relocation, just add the relocated value to the
+      /// value already in memory.
       reloc_absolute_word = 2,
       reloc_absolute_dword = 3
     };
@@ -1084,7 +1209,7 @@ closed.  For example:

   namespace llvm {
   namespace knowledge {

-    /// Grokable - This class represents things that Smith can have an intimate
+    /// This class represents things that Smith can have an intimate
     /// understanding of and contains the data associated with it.
     class Grokable {
     ...
diff --git a/docs/CommandGuide/lit.rst b/docs/CommandGuide/lit.rst
index 3eb0be91f13..9e96cd2a4bf 100644
--- a/docs/CommandGuide/lit.rst
+++ b/docs/CommandGuide/lit.rst
@@ -125,6 +125,10 @@ EXECUTION OPTIONS
  *--error-exitcode* argument for valgrind is used so that valgrind failures will
  cause the program to exit with a non-zero status.

+ When this option is enabled, **lit** will also automatically provide a
+ "valgrind" feature that can be used to conditionally disable (or expect
+ failure in) certain tests.
+
 **--vg-arg**\ =\ *ARG*

@@ -133,6 +137,15 @@ EXECUTION OPTIONS

+**--vg-leak**
+
+ When *--vg* is used, enable memory leak checks.  When this option is enabled,
+ **lit** will also automatically provide a "vg_leak" feature that can be
+ used to conditionally disable (or expect failure in) certain tests.
+
+
+
 **--time-tests**

  Track the wall time individual tests take to execute and includes the results in
diff --git a/docs/LangRef.html b/docs/LangRef.html
index 167397ff53d..874e12fa44a 100644
--- a/docs/LangRef.html
+++ b/docs/LangRef.html
@@ -1364,11 +1364,13 @@ target datalayout = "<i>layout specification</i>"
       8-bits.  If omitted, the natural stack alignment defaults to "unspecified",
       which does not prevent any alignment promotions.</dd>

-  <dt><tt>p:<i>size</i>:<i>abi</i>:<i>pref</i></tt></dt>
+  <dt><tt>p[n]:<i>size</i>:<i>abi</i>:<i>pref</i></tt></dt>
   <dd>This specifies the <i>size</i> of a pointer and its <i>abi</i> and
-      <i>preferred</i> alignments. All sizes are in bits. Specifying
-      the <i>pref</i> alignment is optional. If omitted, the
-      preceding <tt>:</tt> should be omitted too.</dd>
+      <i>preferred</i> alignments for address space <i>n</i>. All sizes are in
+      bits. Specifying the <i>pref</i> alignment is optional. If omitted, the
+      preceding <tt>:</tt> should be omitted too. The address space,
+      <i>n</i> is optional, and if not specified, denotes the default address
+      space 0. The value of <i>n</i> must be in the range [1,2^23).</dd>

   <dt><tt>i<i>size</i>:<i>abi</i>:<i>pref</i></tt></dt>
   <dd>This specifies the alignment for an integer type of a given bit
@@ -1409,6 +1411,10 @@ target datalayout = "<i>layout specification</i>"
 <ul>
   <li><tt>E</tt> - big endian</li>
   <li><tt>p:64:64:64</tt> - 64-bit pointers with 64-bit alignment</li>
+  <li><tt>p1:32:32:32</tt> - 32-bit pointers with 32-bit alignment for
+  address space 1</li>
+  <li><tt>p2:16:32:32</tt> - 16-bit pointers with 32-bit alignment for
+  address space 2</li>
   <li><tt>i1:8:8</tt> - i1 is 8-bit (byte) aligned</li>
   <li><tt>i8:8:8</tt> - i8 is 8-bit (byte) aligned</li>
   <li><tt>i16:16:16</tt> - i16 is 16-bit aligned</li>
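A quick illustration of the new per-address-space pointer syntax, as a C++ sketch rather than anything from this patch (the module name and the exact layout string are assumptions; the `p1:` component follows the grammar documented above):

    #include "llvm/LLVMContext.h"
    #include "llvm/Module.h"

    // Hypothetical target description: default (address space 0) pointers are
    // 64-bit, while address space 1 uses 32-bit pointers with 32-bit alignment.
    int main() {
      llvm::LLVMContext Context;
      llvm::Module M("example", Context);
      M.setDataLayout("e-p:64:64:64-p1:32:32:32-i32:32:32");
      return 0;
    }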
diff --git a/docs/README.txt b/docs/README.txt
index 2fbbf987405..5ddd599d8a7 100644
--- a/docs/README.txt
+++ b/docs/README.txt
@@ -6,7 +6,7 @@ The LLVM documentation is currently written in two formats:

  * Plain HTML documentation.

  * reStructured Text documentation using the Sphinx documentation generator. It
-   is currently tested with Sphinx 1.1.3.
+   is currently tested with Sphinx 1.1.3. For more information, see the "Sphinx Introduction for LLVM Developers" document.
diff --git a/docs/ReleaseNotes.html b/docs/ReleaseNotes.html
index 75a6fd1ca10..0ef8f3d1f37 100644
--- a/docs/ReleaseNotes.html
+++ b/docs/ReleaseNotes.html
@@ -466,6 +466,20 @@ Release Notes</a>.</h1>
 <p>In addition to many minor performance tweaks and bug fixes, this release
    includes a few major enhancements and additions to the optimizers:</p>

+<p> Loop Vectorizer - We've added a loop vectorizer and we are now able to
+    vectorize small loops. The loop vectorizer is disabled by default and
+    can be enabled using the <b>-mllvm -vectorize</b> flag. <br/>
+    We can now vectorize this code:
+
+    <pre class="doc_code">
+    for (i=0; i<n; i++) {
+      a[i] = b[i+1] + c[i+3] + i;
+      sum += d[i];
+    }
+    </pre>
+
+  </p>
+
 <ul>
   <li>...</li>
 </ul>
@@ -506,6 +520,8 @@ Release Notes</a>.</h1>
    We use the lifetime markers to tell the codegen that a certain alloca
    is used within a region.</p>

+<p> We now merge consecutive loads and stores. </p>
+
 <p>We have put a significant amount of work into the code generator
    infrastructure, which allows us to implement more aggressive algorithms and
    make it run faster:</p>
@@ -645,6 +661,11 @@ Release Notes</a>.</h1>
 <p>In addition, many APIs have changed in this release.  Some of the major
    LLVM API changes are:</p>

+<p> We've added a new interface for allowing IR-level passes to access
+    target-specific information. A new IR-level pass, called
+    "TargetTransformInfo" provides a number of low-level interfaces.
+    LSR and LowerInvoke already use the new interface. </p>
+
 <ul>
   <li>...</li>
 </ul>
diff --git a/docs/TestingGuide.html b/docs/TestingGuide.html
index 1f9c9157306..ae2643fe4e8 100644
--- a/docs/TestingGuide.html
+++ b/docs/TestingGuide.html
@@ -798,14 +798,15 @@ define two separate CHECK lines that match on the same line.
 <p>Sometimes it is necessary to mark a test case as "expected fail" or XFAIL.
    You can easily mark a test as XFAIL just by including <tt>XFAIL: </tt> on a
    line near the top of the file. This signals that the test case should succeed
-   if the test fails. Such test cases are counted separately by the testing tool. To
-   specify an expected fail, use the XFAIL keyword in the comments of the test
-   program followed by a colon and one or more regular expressions (separated by
-   a comma). The regular expressions allow you to XFAIL the test conditionally by
-   host platform. The regular expressions following the : are matched against the
-   target triplet for the host machine. If there is a match, the test is expected
-   to fail. If not, the test is expected to succeed. To XFAIL everywhere just
-   specify <tt>XFAIL: *</tt>. Here is an example of an <tt>XFAIL</tt> line:</p>
+   if the test fails. Such test cases are counted separately by the testing
+   tool. To specify an expected fail, use the XFAIL keyword in the comments of
+   the test program followed by a colon and one or more failure patterns. Each
+   failure pattern can be either '*' (to specify fail everywhere), or a part of a
+   target triple (indicating the test should fail on that platform), or the name
+   of a configurable feature (for example, "loadable_module"). If there is a
+   match, the test is expected to fail. If not, the test is expected to
+   succeed. To XFAIL everywhere just specify <tt>XFAIL: *</tt>. Here is an
+   example of an <tt>XFAIL</tt> line:</p>

 <div class="doc_code">
 <pre>
diff --git a/docs/subsystems.rst b/docs/subsystems.rst
index 8c3cdf2417b..6f77b79fbe2 100644
--- a/docs/subsystems.rst
+++ b/docs/subsystems.rst
@@ -91,3 +91,10 @@ Subsystem Documentation
 * :ref:`segmented_stacks`

    This document describes segmented stacks and how they are used in LLVM.
+
+* `Howto: Implementing LLVM Integrated Assembler`_
+
+   A simple guide for how to implement an LLVM integrated assembler for an
+   architecture.
+
+.. _`Howto: Implementing LLVM Integrated Assembler`: http://www.embecosm.com/download/ean10.html
diff --git a/include/llvm-c/Transforms/Vectorize.h b/include/llvm-c/Transforms/Vectorize.h
index 9e7c7540d76..68a9bdd3885 100644
--- a/include/llvm-c/Transforms/Vectorize.h
+++ b/include/llvm-c/Transforms/Vectorize.h
@@ -36,6 +36,9 @@ extern "C" {
 /** See llvm::createBBVectorizePass function. */
 void LLVMAddBBVectorizePass(LLVMPassManagerRef PM);

+/** See llvm::createLoopVectorizePass function. */
+void LLVMAddLoopVectorizePass(LLVMPassManagerRef PM);
+
 /**
  * @}
  */
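A minimal sketch of driving the new pass from the C API: only LLVMAddLoopVectorizePass comes from this commit; the surrounding pass-manager setup is ordinary llvm-c usage and the helper name is illustrative.

    #include "llvm-c/Core.h"
    #include "llvm-c/Transforms/Vectorize.h"

    // Run the new loop vectorizer over a module via the C API.
    // 'M' is assumed to be a previously created or parsed LLVMModuleRef.
    void runLoopVectorizer(LLVMModuleRef M) {
      LLVMPassManagerRef PM = LLVMCreatePassManager();
      LLVMAddLoopVectorizePass(PM);   /* added in this commit */
      LLVMRunPassManager(PM, M);      /* nonzero if the module was modified */
      LLVMDisposePassManager(PM);
    }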
diff --git a/include/llvm/ADT/Optional.h b/include/llvm/ADT/Optional.h
index ee8b69f3d12..f43aeb1bc4d 100644
--- a/include/llvm/ADT/Optional.h
+++ b/include/llvm/ADT/Optional.h
@@ -16,8 +16,13 @@
 #ifndef LLVM_ADT_OPTIONAL
 #define LLVM_ADT_OPTIONAL

+#include "llvm/Support/Compiler.h"
 #include <cassert>

+#if LLVM_USE_RVALUE_REFERENCES
+#include <utility>
+#endif
+
 namespace llvm {

 template<typename T>
@@ -28,6 +33,10 @@ public:
   explicit Optional() : x(), hasVal(false) {}
   Optional(const T &y) : x(y), hasVal(true) {}

+#if LLVM_USE_RVALUE_REFERENCES
+  Optional(T &&y) : x(std::forward<T>(y)), hasVal(true) {}
+#endif
+
   static inline Optional create(const T* y) {
     return y ? Optional(*y) : Optional();
   }
diff --git a/include/llvm/ADT/OwningPtr.h b/include/llvm/ADT/OwningPtr.h
index ea9495d3869..05bcd40d086 100644
--- a/include/llvm/ADT/OwningPtr.h
+++ b/include/llvm/ADT/OwningPtr.h
@@ -32,6 +32,15 @@ class OwningPtr {
 public:
   explicit OwningPtr(T *P = 0) : Ptr(P) {}

+#if LLVM_USE_RVALUE_REFERENCES
+  OwningPtr(OwningPtr &&Other) : Ptr(Other.take()) {}
+
+  OwningPtr &operator=(OwningPtr &&Other) {
+    reset(Other.take());
+    return *this;
+  }
+#endif
+
   ~OwningPtr() {
     delete Ptr;
   }
@@ -86,6 +95,15 @@ class OwningArrayPtr {
 public:
   explicit OwningArrayPtr(T *P = 0) : Ptr(P) {}

+#if LLVM_USE_RVALUE_REFERENCES
+  OwningArrayPtr(OwningArrayPtr &&Other) : Ptr(Other.take()) {}
+
+  OwningArrayPtr &operator=(OwningArrayPtr &&Other) {
+    reset(Other.take());
+    return *this;
+  }
+#endif
+
   ~OwningArrayPtr() {
     delete [] Ptr;
   }
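A small usage sketch for the move support added above. It assumes a C++11 build where LLVM_USE_RVALUE_REFERENCES is set; the Widget type and helper function are illustrative only.

    #include "llvm/ADT/OwningPtr.h"
    #include <utility>

    struct Widget { int Id; };

    // Ownership is transferred via the move assignment added in this patch;
    // the raw pointer changes hands without any copy or double delete.
    void transferExample() {
      llvm::OwningPtr<Widget> A(new Widget());
      llvm::OwningPtr<Widget> B;
      B = std::move(A);   // uses the new OwningPtr::operator=(OwningPtr&&)
      // A is now empty; B owns the Widget and deletes it on destruction.
    }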
diff --git a/include/llvm/ADT/SmallBitVector.h b/include/llvm/ADT/SmallBitVector.h
index fba1d12542a..a9cd54e13b3 100644
--- a/include/llvm/ADT/SmallBitVector.h
+++ b/include/llvm/ADT/SmallBitVector.h
@@ -306,8 +306,8 @@ public:
     assert(E <= size() && "Attempted to set out-of-bounds range!");
     if (I == E) return *this;
     if (isSmall()) {
-      uintptr_t EMask = 1 << E;
-      uintptr_t IMask = 1 << I;
+      uintptr_t EMask = ((uintptr_t)1) << E;
+      uintptr_t IMask = ((uintptr_t)1) << I;
       uintptr_t Mask = EMask - IMask;
       setSmallBits(getSmallBits() | Mask);
     } else
@@ -337,8 +337,8 @@ public:
     assert(E <= size() && "Attempted to reset out-of-bounds range!");
     if (I == E) return *this;
     if (isSmall()) {
-      uintptr_t EMask = 1 << E;
-      uintptr_t IMask = 1 << I;
+      uintptr_t EMask = ((uintptr_t)1) << E;
+      uintptr_t IMask = ((uintptr_t)1) << I;
       uintptr_t Mask = EMask - IMask;
       setSmallBits(getSmallBits() & ~Mask);
     } else
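The cast in the fix above matters because the literal `1` is a 32-bit `int`: shifting it by 32 or more bits is undefined behavior, so the old code computed wrong masks for bit indices past 31 on 64-bit hosts. A standalone sketch of the difference (not from the patch):

    #include <cstdint>
    #include <cstdio>

    int main() {
      unsigned E = 40;  // a bit index >= 32, possible in small mode on 64-bit hosts
      // Undefined behavior: '1' is a 32-bit int shifted past its width.
      // uintptr_t Bad = 1 << E;
      // Correct: widen to the pointer-sized type before shifting.
      uintptr_t Good = ((uintptr_t)1) << E;
      std::printf("%llu\n", (unsigned long long)Good);  // 2^40 on 64-bit hosts
      return 0;
    }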
diff --git a/include/llvm/Analysis/ScalarEvolutionExpander.h b/include/llvm/Analysis/ScalarEvolutionExpander.h
index 3ab9c8256bb..3f8f149cb42 100644
--- a/include/llvm/Analysis/ScalarEvolutionExpander.h
+++ b/include/llvm/Analysis/ScalarEvolutionExpander.h
@@ -22,7 +22,7 @@
 #include <set>

 namespace llvm {
-  class ScalarTargetTransformInfo;
+  class TargetLowering;

   /// Return true if the given expression is safe to expand in the sense that
   /// all materialized values are safe to speculate.
@@ -129,7 +129,7 @@ namespace llvm {
     /// representative. Return the number of phis eliminated.
     unsigned replaceCongruentIVs(Loop *L, const DominatorTree *DT,
                                  SmallVectorImpl<WeakVH> &DeadInsts,
-                                 const ScalarTargetTransformInfo *STTI = NULL);
+                                 const TargetLowering *TLI = NULL);

     /// expandCodeFor - Insert code to directly compute the specified SCEV
     /// expression into the program.  The inserted code is inserted into the
diff --git a/include/llvm/Attributes.h b/include/llvm/Attributes.h
index a28aa183473..1e995f9a858 100644
--- a/include/llvm/Attributes.h
+++ b/include/llvm/Attributes.h
@@ -290,16 +290,7 @@ struct AttributeWithIndex {

   static AttributeWithIndex get(LLVMContext &C, unsigned Idx,
                                 ArrayRef<Attributes::AttrVal> Attrs) {
-    AttrBuilder B;
-
-    for (ArrayRef<Attributes::AttrVal>::iterator I = Attrs.begin(),
-           E = Attrs.end(); I != E; ++I)
-      B.addAttribute(*I);
-
-    AttributeWithIndex P;
-    P.Index = Idx;
-    P.Attrs = Attributes::get(C, B);
-    return P;
+    return get(Idx, Attributes::get(C, Attrs));
   }
   static AttributeWithIndex get(unsigned Idx, Attributes Attrs) {
     AttributeWithIndex P;
@@ -324,8 +315,8 @@ public:
     FunctionIndex = ~0U
   };
 private:
-  /// AttrList - The attributes that we are managing.  This can be null
-  /// to represent the empty attributes list.
+  /// AttrList - The attributes that we are managing.  This can be null to
+  /// represent the empty attributes list.
   AttributeListImpl *AttrList;
 public:
   AttrListPtr() : AttrList(0) {}
diff --git a/include/llvm/CodeGen/MachineFrameInfo.h b/include/llvm/CodeGen/MachineFrameInfo.h
index 7188b1abbdf..0e4e132e40d 100644
--- a/include/llvm/CodeGen/MachineFrameInfo.h
+++ b/include/llvm/CodeGen/MachineFrameInfo.h
@@ -29,6 +29,7 @@ class MachineBasicBlock;
 class TargetFrameLowering;
 class BitVector;
 class Value;
+class AllocaInst;

 /// The CalleeSavedInfo class tracks the information need to locate where a
 /// callee saved register is in the current frame.
@@ -106,14 +107,14 @@ class MachineFrameInfo {

     /// Alloca - If this stack object is originated from an Alloca instruction
     /// this value saves the original IR allocation. Can be NULL.
-    const Value *Alloca;
+    const AllocaInst *Alloca;

     // PreAllocated - If true, the object was mapped into the local frame
     // block and doesn't need additional handling for allocation beyond that.
     bool PreAllocated;

     StackObject(uint64_t Sz, unsigned Al, int64_t SP, bool IM,
-                bool isSS, bool NSP, const Value *Val)
+                bool isSS, bool NSP, const AllocaInst *Val)
       : SPOffset(SP), Size(Sz), Alignment(Al), isImmutable(IM),
         isSpillSlot(isSS), MayNeedSP(NSP), Alloca(Val), PreAllocated(false) {}
   };
@@ -369,7 +370,7 @@ public:

   /// getObjectAllocation - Return the underlying Alloca of the specified
   /// stack object if it exists. Returns 0 if none exists.
-  const Value* getObjectAllocation(int ObjectIdx) const {
+  const AllocaInst* getObjectAllocation(int ObjectIdx) const {
     assert(unsigned(ObjectIdx+NumFixedObjects) < Objects.size() &&
            "Invalid Object Idx!");
     return Objects[ObjectIdx+NumFixedObjects].Alloca;
@@ -495,7 +496,7 @@ public:
   /// a nonnegative identifier to represent it.
   ///
   int CreateStackObject(uint64_t Size, unsigned Alignment, bool isSS,
-                        bool MayNeedSP = false, const Value *Alloca = 0) {
+                        bool MayNeedSP = false, const AllocaInst *Alloca = 0) {
     assert(Size != 0 && "Cannot allocate zero size stack objects!");
     Objects.push_back(StackObject(Size, Alignment, 0, false, isSS, MayNeedSP,
                                   Alloca));
diff --git a/include/llvm/CodeGen/MachineRegisterInfo.h b/include/llvm/CodeGen/MachineRegisterInfo.h
index a5bc7f7d391..4e86363f071 100644
--- a/include/llvm/CodeGen/MachineRegisterInfo.h
+++ b/include/llvm/CodeGen/MachineRegisterInfo.h
@@ -77,16 +77,20 @@ class MachineRegisterInfo {
     return MO->Contents.Reg.Next;
   }

-  /// UsedPhysRegs - This is a bit vector that is computed and set by the
+  /// UsedRegUnits - This is a bit vector that is computed and set by the
   /// register allocator, and must be kept up to date by passes that run after
   /// register allocation (though most don't modify this).  This is used
   /// so that the code generator knows which callee save registers to save and
   /// for other target specific uses.
-  /// This vector only has bits set for registers explicitly used, not their
-  /// aliases.
-  BitVector UsedPhysRegs;
-
-  /// UsedPhysRegMask - Additional used physregs, but including aliases.
+  /// This vector has bits set for register units that are modified in the
+  /// current function. It doesn't include registers clobbered by function
+  /// calls with register mask operands.
+  BitVector UsedRegUnits;
+
+  /// UsedPhysRegMask - Additional used physregs including aliases.
+  /// This bit vector represents all the registers clobbered by function calls.
+  /// It can model things that UsedRegUnits can't, such as function calls that
+  /// clobber ymm7 but preserve the low half in xmm7.
   BitVector UsedPhysRegMask;

   /// ReservedRegs - This is a bit vector of reserved registers.  The target
@@ -357,29 +361,27 @@ public:
   //===--------------------------------------------------------------------===//

   /// isPhysRegUsed - Return true if the specified register is used in this
-  /// function.  This only works after register allocation.
+  /// function. Also check for clobbered aliases and registers clobbered by
+  /// function calls with register mask operands.
+  ///
+  /// This only works after register allocation. It is primarily used by
+  /// PrologEpilogInserter to determine which callee-saved registers need
+  /// spilling.
   bool isPhysRegUsed(unsigned Reg) const {
-    return UsedPhysRegs.test(Reg) || UsedPhysRegMask.test(Reg);
-  }
-
-  /// isPhysRegOrOverlapUsed - Return true if Reg or any overlapping register
-  /// is used in this function.
-  bool isPhysRegOrOverlapUsed(unsigned Reg) const {
     if (UsedPhysRegMask.test(Reg))
       return true;
-    for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
-      if (UsedPhysRegs.test(*AI))
+    for (MCRegUnitIterator Units(Reg, TRI); Units.isValid(); ++Units)
+      if (UsedRegUnits.test(*Units))
         return true;
     return false;
   }

   /// setPhysRegUsed - Mark the specified register used in this function.
   /// This should only be called during and after register allocation.
-  void setPhysRegUsed(unsigned Reg) { UsedPhysRegs.set(Reg); }
-
-  /// addPhysRegsUsed - Mark the specified registers used in this function.
-  /// This should only be called during and after register allocation.
-  void addPhysRegsUsed(const BitVector &Regs) { UsedPhysRegs |= Regs; }
+  void setPhysRegUsed(unsigned Reg) {
+    for (MCRegUnitIterator Units(Reg, TRI); Units.isValid(); ++Units)
+      UsedRegUnits.set(*Units);
+  }

   /// addPhysRegsUsedFromRegMask - Mark any registers not in RegMask as used.
   /// This corresponds to the bit mask attached to register mask operands.
@@ -390,8 +392,9 @@ public:
   /// setPhysRegUnused - Mark the specified register unused in this function.
   /// This should only be called during and after register allocation.
   void setPhysRegUnused(unsigned Reg) {
-    UsedPhysRegs.reset(Reg);
     UsedPhysRegMask.reset(Reg);
+    for (MCRegUnitIterator Units(Reg, TRI); Units.isValid(); ++Units)
+      UsedRegUnits.reset(*Units);
   }
diff --git a/include/llvm/CodeGen/SchedulerRegistry.h b/include/llvm/CodeGen/SchedulerRegistry.h
index a582b0c40c8..836b73a15a2 100644
--- a/include/llvm/CodeGen/SchedulerRegistry.h
+++ b/include/llvm/CodeGen/SchedulerRegistry.h
@@ -102,6 +102,11 @@ ScheduleDAGSDNodes *createVLIWDAGScheduler(SelectionDAGISel *IS,
 ScheduleDAGSDNodes *createDefaultScheduler(SelectionDAGISel *IS,
                                            CodeGenOpt::Level OptLevel);

+/// createDAGLinearizer - This creates a "no-scheduling" scheduler which
+/// linearize the DAG using topological order.
+ScheduleDAGSDNodes *createDAGLinearizer(SelectionDAGISel *IS,
+                                        CodeGenOpt::Level OptLevel);
+
 } // end namespace llvm

 #endif
diff --git a/include/llvm/ExecutionEngine/RuntimeDyld.h b/include/llvm/ExecutionEngine/RuntimeDyld.h
index 7da4a4e09a3..a71b1411c8a 100644
--- a/include/llvm/ExecutionEngine/RuntimeDyld.h
+++ b/include/llvm/ExecutionEngine/RuntimeDyld.h
@@ -63,7 +63,7 @@ protected:
   // Any relocations already associated with the symbol will be re-resolved.
   void reassignSectionAddress(unsigned SectionID, uint64_t Addr);
 public:
-  RuntimeDyld(RTDyldMemoryManager*);
+  RuntimeDyld(RTDyldMemoryManager *);
   ~RuntimeDyld();

   /// loadObject - prepare the object contained in the input buffer for
diff --git a/include/llvm/InitializePasses.h b/include/llvm/InitializePasses.h
index ee9b1c5852e..a5f7008b6fb 100644
--- a/include/llvm/InitializePasses.h
+++ b/include/llvm/InitializePasses.h
@@ -66,6 +66,7 @@ void initializeAliasDebuggerPass(PassRegistry&);
 void initializeAliasSetPrinterPass(PassRegistry&);
 void initializeAlwaysInlinerPass(PassRegistry&);
 void initializeArgPromotionPass(PassRegistry&);
+void initializeBarrierNoopPass(PassRegistry&);
 void initializeBasicAliasAnalysisPass(PassRegistry&);
 void initializeBasicCallGraphPass(PassRegistry&);
 void initializeBlockExtractorPassPass(PassRegistry&);
@@ -261,6 +262,7 @@ void initializeVirtRegRewriterPass(PassRegistry&);
 void initializeInstSimplifierPass(PassRegistry&);
 void initializeUnpackMachineBundlesPass(PassRegistry&);
 void initializeFinalizeMachineBundlesPass(PassRegistry&);
+void initializeLoopVectorizePass(PassRegistry&);
 void initializeBBVectorizePass(PassRegistry&);
 void initializeMachineFunctionPrinterPassPass(PassRegistry&);
 }
diff --git a/include/llvm/Intrinsics.td b/include/llvm/Intrinsics.td
index 63d8d2c0f29..059bd804aa1 100644
--- a/include/llvm/Intrinsics.td
+++ b/include/llvm/Intrinsics.td
@@ -285,9 +285,9 @@ let Properties = [IntrNoMem] in {

 // NOTE: these are internal interfaces.
 def int_setjmp     : Intrinsic<[llvm_i32_ty],  [llvm_ptr_ty]>;
-def int_longjmp    : Intrinsic<[], [llvm_ptr_ty, llvm_i32_ty]>;
+def int_longjmp    : Intrinsic<[], [llvm_ptr_ty, llvm_i32_ty], [IntrNoReturn]>;
 def int_sigsetjmp  : Intrinsic<[llvm_i32_ty] , [llvm_ptr_ty, llvm_i32_ty]>;
-def int_siglongjmp : Intrinsic<[], [llvm_ptr_ty, llvm_i32_ty]>;
+def int_siglongjmp : Intrinsic<[], [llvm_ptr_ty, llvm_i32_ty], [IntrNoReturn]>;

 // Internal interface for object size checking
 def int_objectsize : Intrinsic<[llvm_anyint_ty], [llvm_ptr_ty, llvm_i1_ty],
@@ -345,7 +345,7 @@ let Properties = [IntrNoMem] in {
 }
 def int_eh_sjlj_functioncontext : Intrinsic<[], [llvm_ptr_ty]>;
 def int_eh_sjlj_setjmp          : Intrinsic<[llvm_i32_ty], [llvm_ptr_ty]>;
-def int_eh_sjlj_longjmp         : Intrinsic<[], [llvm_ptr_ty]>;
+def int_eh_sjlj_longjmp         : Intrinsic<[], [llvm_ptr_ty], [IntrNoReturn]>;

 //===---------------- Generic Variable Attribute Intrinsics----------------===//
 //
diff --git a/include/llvm/LinkAllPasses.h b/include/llvm/LinkAllPasses.h
index 4b10d0e5415..8652acd941f 100644
--- a/include/llvm/LinkAllPasses.h
+++ b/include/llvm/LinkAllPasses.h
@@ -156,6 +156,7 @@ namespace {
       (void) llvm::createCorrelatedValuePropagationPass();
       (void) llvm::createMemDepPrinter();
       (void) llvm::createInstructionSimplifierPass();
+      (void) llvm::createLoopVectorizePass();
      (void) llvm::createBBVectorizePass();

       (void)new llvm::IntervalPartition();
diff --git a/include/llvm/MC/MCParser/MCAsmParser.h b/include/llvm/MC/MCParser/MCAsmParser.h
index 08758cda226..fcf5b6a9b59 100644
--- a/include/llvm/MC/MCParser/MCAsmParser.h
+++ b/include/llvm/MC/MCParser/MCAsmParser.h
@@ -20,6 +20,8 @@ class MCAsmLexer;
 class MCAsmParserExtension;
 class MCContext;
 class MCExpr;
+class MCInstPrinter;
+class MCInstrInfo;
 class MCParsedAsmOperand;
 class MCStreamer;
 class MCTargetAsmParser;
@@ -29,6 +31,14 @@ class SourceMgr;
 class StringRef;
 class Twine;

+/// MCAsmParserSemaCallback - Generic Sema callback for assembly parser.
+class MCAsmParserSemaCallback {
+public:
+  virtual ~MCAsmParserSemaCallback();
+  virtual void *LookupInlineAsmIdentifier(StringRef Name, void *Loc,
+                                          unsigned &Size) = 0;
+};
+
 /// MCAsmParser - Generic assembler parser interface, for use by target specific
 /// assembly parsers.
 class MCAsmParser {
@@ -75,26 +85,21 @@ public:
   virtual bool Run(bool NoInitialTextSection, bool NoFinalize = false) = 0;

   virtual void setParsingInlineAsm(bool V) = 0;
+  virtual bool isParsingInlineAsm() = 0;
+
+  /// ParseMSInlineAsm - Parse ms-style inline assembly.
+  virtual bool ParseMSInlineAsm(void *AsmLoc, std::string &AsmString,
+                                unsigned &NumOutputs, unsigned &NumInputs,
+                                SmallVectorImpl<void *> &OpDecls,
+                                SmallVectorImpl<std::string> &Constraints,
+                                SmallVectorImpl<std::string> &Clobbers,
+                                const MCInstrInfo *MII,
+                                const MCInstPrinter *IP,
+                                MCAsmParserSemaCallback &SI) = 0;

   /// ParseStatement - Parse the next statement.
   virtual bool ParseStatement() = 0;

-  /// getNumParsedOperands - Returns the number of MCAsmParsedOperands from the
-  /// previously parsed statement.
-  virtual unsigned getNumParsedOperands() = 0;
-
-  /// getParsedOperand - Get a MCAsmParsedOperand.
-  virtual MCParsedAsmOperand &getParsedOperand(unsigned OpNum) = 0;
-
-  /// freeParsedOperands - Free the MCAsmParsedOperands.
-  virtual void freeParsedOperands() = 0;
-
-  /// isInstruction - Was the previously parsed statement an instruction?
-  virtual bool isInstruction() = 0;
-
-  /// getOpcode - Get the opcode from the previously parsed instruction.
-  virtual unsigned getOpcode() = 0;
-
   /// Warning - Emit a warning at the location \p L, with the message \p Msg.
   ///
   /// \return The return value is true, if warnings are fatal.
diff --git a/include/llvm/MC/MCParser/MCParsedAsmOperand.h b/include/llvm/MC/MCParser/MCParsedAsmOperand.h
index 280145bfbc8..7d0914ebb49 100644
--- a/include/llvm/MC/MCParser/MCParsedAsmOperand.h
+++ b/include/llvm/MC/MCParser/MCParsedAsmOperand.h
@@ -57,12 +57,17 @@ public:

   /// isMem - Is this a memory operand?
   virtual bool isMem() const = 0;
+  virtual unsigned getMemSize() const { return 0; }

   /// getStartLoc - Get the location of the first token of this operand.
   virtual SMLoc getStartLoc() const = 0;

   /// getEndLoc - Get the location of the last token of this operand.
   virtual SMLoc getEndLoc() const = 0;

+  /// needSizeDirective - Do we need to emit a sizing directive for this
+  /// operand?  Only valid when parsing MS-style inline assembly.
+  virtual bool needSizeDirective() const { return false; }
+
   /// print - Print a debug representation of the operand to the given stream.
   virtual void print(raw_ostream &OS) const = 0;
   /// dump - Print to the debug stream.
diff --git a/include/llvm/MC/MCSchedule.h b/include/llvm/MC/MCSchedule.h
index 0504dc13c8f..c9a060c79b1 100644
--- a/include/llvm/MC/MCSchedule.h
+++ b/include/llvm/MC/MCSchedule.h
@@ -54,10 +54,12 @@ struct MCWriteProcResEntry {
 };

 /// Specify the latency in cpu cycles for a particular scheduling class and def
-/// index. Also identify the WriteResources of this def. When the operand
-/// expands to a sequence of writes, this ID is the last write in the sequence.
+/// index. -1 indicates an invalid latency. Heuristics would typically consider
+/// an instruction with invalid latency to have infinite latency. Also identify
+/// the WriteResources of this def. When the operand expands to a sequence of
+/// writes, this ID is the last write in the sequence.
 struct MCWriteLatencyEntry {
-  unsigned Cycles;
+  int Cycles;
   unsigned WriteResourceID;

   bool operator==(const MCWriteLatencyEntry &Other) const {
diff --git a/include/llvm/MC/MCTargetAsmParser.h b/include/llvm/MC/MCTargetAsmParser.h
index c9ea5ae4846..05537f9211f 100644
--- a/include/llvm/MC/MCTargetAsmParser.h
+++ b/include/llvm/MC/MCTargetAsmParser.h
@@ -41,12 +41,26 @@ protected: // Can only create subclasses.
   /// AvailableFeatures - The current set of available features.
   unsigned AvailableFeatures;

+  /// ParsingInlineAsm - Are we parsing ms-style inline assembly?
+  bool ParsingInlineAsm;
+
+  /// SemaCallback - The Sema callback implementation.  Must be set when parsing
+  /// ms-style inline assembly.
+  MCAsmParserSemaCallback *SemaCallback;
+
 public:
   virtual ~MCTargetAsmParser();

   unsigned getAvailableFeatures() const { return AvailableFeatures; }
   void setAvailableFeatures(unsigned Value) { AvailableFeatures = Value; }

+  bool isParsingInlineAsm () { return ParsingInlineAsm; }
+  void setParsingInlineAsm (bool Value) { ParsingInlineAsm = Value; }
+
+  void setSemaCallback(MCAsmParserSemaCallback *Callback) {
+    SemaCallback = Callback;
+  }
+
   virtual bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc,
                              SMLoc &EndLoc) = 0;
diff --git a/include/llvm/Object/ObjectFile.h b/include/llvm/Object/ObjectFile.h
index 41959bd34e1..4185ffea130 100644
--- a/include/llvm/Object/ObjectFile.h
+++ b/include/llvm/Object/ObjectFile.h
@@ -266,7 +266,7 @@ const uint64_t UnknownAddressOrSize = ~0ULL;

 /// ObjectFile - This class is the base class for all object file types.
 /// Concrete instances of this object are created by createObjectFile, which
-/// figure out which type to create.
+/// figures out which type to create.
 class ObjectFile : public Binary {
   virtual void anchor();
   ObjectFile() LLVM_DELETED_FUNCTION;
diff --git a/include/llvm/Operator.h b/include/llvm/Operator.h
index bc5da8e8aa3..462324a6694 100644
--- a/include/llvm/Operator.h
+++ b/include/llvm/Operator.h
@@ -36,8 +36,8 @@ private:
   void *operator new(size_t, unsigned) LLVM_DELETED_FUNCTION;
   void *operator new(size_t s) LLVM_DELETED_FUNCTION;
   Operator() LLVM_DELETED_FUNCTION;
-  // NOTE: cannot use LLVM_DELETED_FUNCTION because gcc errors when deleting
-  // an override of a non-deleted function.
+  // NOTE: cannot use LLVM_DELETED_FUNCTION because it's not legal to delete
+  // an overridden method that's not deleted in the base class.
   ~Operator();

 public:
@@ -191,7 +191,7 @@ public:
 /// opcodes.
 template<typename SuperClass, unsigned Opc>
 class ConcreteOperator : public SuperClass {
-  ~ConcreteOperator() LLVM_DELETED_FUNCTION;
+  ~ConcreteOperator(); // DO NOT IMPLEMENT
 public:
   static inline bool classof(const Instruction *I) {
     return I->getOpcode() == Opc;
@@ -207,44 +207,44 @@ public:
 class AddOperator
   : public ConcreteOperator<OverflowingBinaryOperator, Instruction::Add> {
-  ~AddOperator() LLVM_DELETED_FUNCTION;
+  ~AddOperator(); // DO NOT IMPLEMENT
 };
 class SubOperator
   : public ConcreteOperator<OverflowingBinaryOperator, Instruction::Sub> {
-  ~SubOperator() LLVM_DELETED_FUNCTION;
+  ~SubOperator(); // DO NOT IMPLEMENT
 };
 class MulOperator
   : public ConcreteOperator<OverflowingBinaryOperator, Instruction::Mul> {
-  ~MulOperator() LLVM_DELETED_FUNCTION;
+  ~MulOperator(); // DO NOT IMPLEMENT
 };
 class ShlOperator
   : public ConcreteOperator<OverflowingBinaryOperator, Instruction::Shl> {
-  ~ShlOperator() LLVM_DELETED_FUNCTION;
+  ~ShlOperator(); // DO NOT IMPLEMENT
 };
-
+
 class SDivOperator
   : public ConcreteOperator<PossiblyExactOperator, Instruction::SDiv> {
-  ~SDivOperator() LLVM_DELETED_FUNCTION;
+  ~SDivOperator(); // DO NOT IMPLEMENT
 };
 class UDivOperator
   : public ConcreteOperator<PossiblyExactOperator, Instruction::UDiv> {
-  ~UDivOperator() LLVM_DELETED_FUNCTION;
+  ~UDivOperator(); // DO NOT IMPLEMENT
 };
 class AShrOperator
   : public ConcreteOperator<PossiblyExactOperator, Instruction::AShr> {
-  ~AShrOperator() LLVM_DELETED_FUNCTION;
+  ~AShrOperator(); // DO NOT IMPLEMENT
 };
 class LShrOperator
   : public ConcreteOperator<PossiblyExactOperator, Instruction::LShr> {
-  ~LShrOperator() LLVM_DELETED_FUNCTION;
+  ~LShrOperator(); // DO NOT IMPLEMENT
 };
-
-
-
+
+
+
 class GEPOperator
   : public ConcreteOperator<Operator, Instruction::GetElementPtr> {
-  ~GEPOperator() LLVM_DELETED_FUNCTION;
+  ~GEPOperator(); // DO NOT IMPLEMENT

   enum {
     IsInBounds = (1 << 0)
diff --git a/include/llvm/Transforms/IPO.h b/include/llvm/Transforms/IPO.h
index 962cb63758d..08d3bbd941d 100644
--- a/include/llvm/Transforms/IPO.h
+++ b/include/llvm/Transforms/IPO.h
@@ -198,6 +198,11 @@ ModulePass *createPartialInliningPass();
 //
 ModulePass *createMetaRenamerPass();

+//===----------------------------------------------------------------------===//
+/// createBarrierNoopPass - This pass is purely a module pass barrier in a pass
+/// manager.
+ModulePass *createBarrierNoopPass();
+
 } // End llvm namespace

 #endif
diff --git a/include/llvm/Transforms/Scalar.h b/include/llvm/Transforms/Scalar.h
index 3b665bf4b68..a5d8eed7462 100644
--- a/include/llvm/Transforms/Scalar.h
+++ b/include/llvm/Transforms/Scalar.h
@@ -119,7 +119,7 @@ Pass *createLICMPass();
 // optional parameter used to consult the target machine whether certain
 // transformations are profitable.
 //
-Pass *createLoopStrengthReducePass();
+Pass *createLoopStrengthReducePass(const TargetLowering *TLI = 0);

 Pass *createGlobalMergePass(const TargetLowering *TLI = 0);

@@ -249,8 +249,9 @@ extern char &LowerSwitchID;
 // purpose "my LLVM-to-LLVM pass doesn't support the invoke instruction yet"
 // lowering pass.
 //
-FunctionPass *createLowerInvokePass();
-FunctionPass *createLowerInvokePass(bool useExpensiveEHSupport);
+FunctionPass *createLowerInvokePass(const TargetLowering *TLI = 0);
+FunctionPass *createLowerInvokePass(const TargetLowering *TLI,
+                                    bool useExpensiveEHSupport);
 extern char &LowerInvokePassID;

 //===----------------------------------------------------------------------===//
diff --git a/include/llvm/Transforms/Utils/BasicBlockUtils.h b/include/llvm/Transforms/Utils/BasicBlockUtils.h
index 2510aecc69c..b810f1a818c 100644
--- a/include/llvm/Transforms/Utils/BasicBlockUtils.h
+++ b/include/llvm/Transforms/Utils/BasicBlockUtils.h
@@ -25,9 +25,11 @@ namespace llvm {

 class AliasAnalysis;
 class Instruction;
+class MDNode;
 class Pass;
 class ReturnInst;
 class TargetLibraryInfo;
+class TerminatorInst;

 /// DeleteDeadBlock - Delete the specified block, which must have no
 /// predecessors.
@@ -203,6 +205,29 @@ void SplitLandingPadPredecessors(BasicBlock *OrigBB,ArrayRef<BasicBlock*> Preds,
 ReturnInst *FoldReturnIntoUncondBranch(ReturnInst *RI, BasicBlock *BB,
                                        BasicBlock *Pred);

+/// SplitBlockAndInsertIfThen - Split the containing block at the
+/// specified instruction - everything before and including Cmp stays
+/// in the old basic block, and everything after Cmp is moved to a
+/// new block. The two blocks are connected by a conditional branch
+/// (with value of Cmp being the condition).
+/// Before:
+///   Head
+///   Cmp
+///   Tail
+/// After:
+///   Head
+///   Cmp
+///   if (Cmp)
+///     ThenBlock
+///   Tail
+///
+/// If Unreachable is true, then ThenBlock ends with
+/// UnreachableInst, otherwise it branches to Tail.
+/// Returns the NewBasicBlock's terminator.
+
+TerminatorInst *SplitBlockAndInsertIfThen(Instruction *Cmp,
+    bool Unreachable, MDNode *BranchWeights = 0);
+
 } // End llvm namespace

 #endif
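A rough usage sketch for the new helper; only SplitBlockAndInsertIfThen itself comes from this patch, and the instrumentation scenario and function name are hypothetical.

    #include "llvm/Instructions.h"
    #include "llvm/Transforms/Utils/BasicBlockUtils.h"
    using namespace llvm;

    // 'Cond' is an i1-valued comparison instruction already inside a block.
    // After the call, everything past Cond has moved to a "Tail" block, and a
    // new "then" block (entered when Cond is true) ends in 'unreachable'.
    void insertTrapPath(Instruction *Cond) {
      TerminatorInst *ThenTerm =
          SplitBlockAndInsertIfThen(Cond, /*Unreachable=*/true);
      // Error-handling code (e.g. a call to a trap routine) would be
      // inserted before ThenTerm, inside the new ThenBlock.
      (void)ThenTerm;
    }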
diff --git a/include/llvm/Transforms/Vectorize.h b/include/llvm/Transforms/Vectorize.h
index 1e49a9c01e6..41e53a83e2f 100644
--- a/include/llvm/Transforms/Vectorize.h
+++ b/include/llvm/Transforms/Vectorize.h
@@ -107,6 +107,12 @@ BasicBlockPass *
 createBBVectorizePass(const VectorizeConfig &C = VectorizeConfig());

 //===----------------------------------------------------------------------===//
+//
+// LoopVectorize - Create a loop vectorization pass.
+//
+Pass * createLoopVectorizePass();
+
+//===----------------------------------------------------------------------===//
 /// @brief Vectorize the BasicBlock.
 ///
 /// @param BB The BasicBlock to be vectorized
diff --git a/lib/Analysis/ScalarEvolutionExpander.cpp b/lib/Analysis/ScalarEvolutionExpander.cpp
index 5c2a49e767f..111bfb4a6a7 100644
--- a/lib/Analysis/ScalarEvolutionExpander.cpp
+++ b/lib/Analysis/ScalarEvolutionExpander.cpp
@@ -19,8 +19,8 @@
 #include "llvm/LLVMContext.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/DataLayout.h"
+#include "llvm/Target/TargetLowering.h"
 #include "llvm/ADT/STLExtras.h"
-#include "llvm/TargetTransformInfo.h"

 using namespace llvm;

@@ -1599,15 +1599,15 @@ static bool width_descending(Value *lhs, Value *rhs) {
 /// This does not depend on any SCEVExpander state but should be used in
 /// the same context that SCEVExpander is used.
 unsigned SCEVExpander::replaceCongruentIVs(Loop *L, const DominatorTree *DT,
-                                  SmallVectorImpl<WeakVH> &DeadInsts,
-                                  const ScalarTargetTransformInfo *STTI) {
+                                           SmallVectorImpl<WeakVH> &DeadInsts,
+                                           const TargetLowering *TLI) {
   // Find integer phis in order of increasing width.
   SmallVector<PHINode*, 8> Phis;
   for (BasicBlock::iterator I = L->getHeader()->begin();
        PHINode *Phi = dyn_cast<PHINode>(I); ++I) {
     Phis.push_back(Phi);
   }
-  if (STTI)
+  if (TLI)
     std::sort(Phis.begin(), Phis.end(), width_descending);

   unsigned NumElim = 0;
@@ -1618,14 +1618,25 @@ unsigned SCEVExpander::replaceCongruentIVs(Loop *L, const DominatorTree *DT,
        PEnd = Phis.end(); PIter != PEnd; ++PIter) {
     PHINode *Phi = *PIter;

+    // Fold constant phis. They may be congruent to other constant phis and
+    // would confuse the logic below that expects proper IVs.
+    if (Value *V = Phi->hasConstantValue()) {
+      Phi->replaceAllUsesWith(V);
+      DeadInsts.push_back(Phi);
+      ++NumElim;
+      DEBUG_WITH_TYPE(DebugType, dbgs()
+                      << "INDVARS: Eliminated constant iv: " << *Phi << '\n');
+      continue;
+    }
+
     if (!SE.isSCEVable(Phi->getType()))
       continue;

     PHINode *&OrigPhiRef = ExprToIVMap[SE.getSCEV(Phi)];
     if (!OrigPhiRef) {
       OrigPhiRef = Phi;
-      if (Phi->getType()->isIntegerTy() && STTI &&
-          STTI->isTruncateFree(Phi->getType(), Phis.back()->getType())) {
+      if (Phi->getType()->isIntegerTy() && TLI
+          && TLI->isTruncateFree(Phi->getType(), Phis.back()->getType())) {
         // This phi can be freely truncated to the narrowest phi type. Map the
         // truncated expression to it so it will be reused for narrow types.
         const SCEV *TruncExpr =
diff --git a/lib/CodeGen/ExecutionDepsFix.cpp b/lib/CodeGen/ExecutionDepsFix.cpp
index 2311842671a..ed78f194215 100644
--- a/lib/CodeGen/ExecutionDepsFix.cpp
+++ b/lib/CodeGen/ExecutionDepsFix.cpp
@@ -657,7 +657,7 @@ bool ExeDepsFix::runOnMachineFunction(MachineFunction &mf) {
   bool anyregs = false;
   for (TargetRegisterClass::const_iterator I = RC->begin(), E = RC->end();
        I != E; ++I)
-    if (MF->getRegInfo().isPhysRegOrOverlapUsed(*I)) {
+    if (MF->getRegInfo().isPhysRegUsed(*I)) {
       anyregs = true;
       break;
     }
diff --git a/lib/CodeGen/LiveRangeEdit.cpp b/lib/CodeGen/LiveRangeEdit.cpp
index 0dfb084f1e1..f8fbc7ddf0c 100644
--- a/lib/CodeGen/LiveRangeEdit.cpp
+++ b/lib/CodeGen/LiveRangeEdit.cpp
@@ -96,6 +96,13 @@ bool LiveRangeEdit::allUsesAvailableAt(const MachineInstr *OrigMI,
     const VNInfo *OVNI = li.getVNInfoAt(OrigIdx);
     if (!OVNI)
       continue;
+
+    // Don't allow rematerialization immediately after the original def.
+    // It would be incorrect if OrigMI redefines the register.
+    // See PR14098.
+    if (SlotIndex::isSameInstr(OrigIdx, UseIdx))
+      return false;
+
     if (OVNI != li.getVNInfoAt(UseIdx))
       return false;
   }
diff --git a/lib/CodeGen/MachineFunctionPrinterPass.cpp b/lib/CodeGen/MachineFunctionPrinterPass.cpp
index 0102ac708d0..ed94efb9355 100644
--- a/lib/CodeGen/MachineFunctionPrinterPass.cpp
+++ b/lib/CodeGen/MachineFunctionPrinterPass.cpp
@@ -51,7 +51,7 @@ struct MachineFunctionPrinterPass : public MachineFunctionPass {
 char MachineFunctionPrinterPass::ID = 0;
 }

-char &MachineFunctionPrinterPassID = MachineFunctionPrinterPass::ID;
+char &llvm::MachineFunctionPrinterPassID = MachineFunctionPrinterPass::ID;

 INITIALIZE_PASS(MachineFunctionPrinterPass, "print-machineinstrs",
                 "Machine Function Printer", false, false)
diff --git a/lib/CodeGen/MachineRegisterInfo.cpp b/lib/CodeGen/MachineRegisterInfo.cpp
index ae7c15be158..95d7a7dd689 100644
--- a/lib/CodeGen/MachineRegisterInfo.cpp
+++ b/lib/CodeGen/MachineRegisterInfo.cpp
@@ -21,7 +21,7 @@ MachineRegisterInfo::MachineRegisterInfo(const TargetRegisterInfo &TRI)
   : TRI(&TRI), IsSSA(true), TracksLiveness(true) {
   VRegInfo.reserve(256);
   RegAllocHints.reserve(256);
-  UsedPhysRegs.resize(TRI.getNumRegs());
+  UsedRegUnits.resize(TRI.getNumRegUnits());
   UsedPhysRegMask.resize(TRI.getNumRegs());

   // Create the physreg use/def lists.
@@ -32,7 +32,7 @@ MachineRegisterInfo::MachineRegisterInfo(const TargetRegisterInfo &TRI)
 MachineRegisterInfo::~MachineRegisterInfo() {
 #ifndef NDEBUG
   clearVirtRegs();
-  for (unsigned i = 0, e = UsedPhysRegs.size(); i != e; ++i)
+  for (unsigned i = 0, e = TRI->getNumRegs(); i != e; ++i)
     assert(!PhysRegUseDefLists[i] &&
            "PhysRegUseDefLists has entries after all instructions are deleted");
 #endif
diff --git a/lib/CodeGen/Passes.cpp b/lib/CodeGen/Passes.cpp
index abd62efc026..4ea21d4ff7b 100644
--- a/lib/CodeGen/Passes.cpp
+++ b/lib/CodeGen/Passes.cpp
@@ -359,7 +359,7 @@ void TargetPassConfig::addIRPasses() {

   // Run loop strength reduction before anything else.
   if (getOptLevel() != CodeGenOpt::None && !DisableLSR) {
-    addPass(createLoopStrengthReducePass());
+    addPass(createLoopStrengthReducePass(getTargetLowering()));
     if (PrintLSR)
       addPass(createPrintFunctionPass("\n\n*** Code after LSR ***\n", &dbgs()));
   }
@@ -389,7 +389,7 @@ void TargetPassConfig::addPassesToHandleExceptions() {
     addPass(createDwarfEHPass(TM));
     break;
   case ExceptionHandling::None:
-    addPass(createLowerInvokePass());
+    addPass(createLowerInvokePass(TM->getTargetLowering()));

     // The lower invoke pass may create unreachable code. Remove it.
     addPass(createUnreachableBlockEliminationPass());
diff --git a/lib/CodeGen/PrologEpilogInserter.cpp b/lib/CodeGen/PrologEpilogInserter.cpp
index 86df0a127bf..77554d691c2 100644
--- a/lib/CodeGen/PrologEpilogInserter.cpp
+++ b/lib/CodeGen/PrologEpilogInserter.cpp
@@ -227,7 +227,7 @@ void PEI::calculateCalleeSavedRegisters(MachineFunction &Fn) {
   std::vector<CalleeSavedInfo> CSI;
   for (unsigned i = 0; CSRegs[i]; ++i) {
     unsigned Reg = CSRegs[i];
-    if (Fn.getRegInfo().isPhysRegOrOverlapUsed(Reg)) {
+    if (Fn.getRegInfo().isPhysRegUsed(Reg)) {
       // If the reg is modified, save it!
       CSI.push_back(CalleeSavedInfo(Reg));
     }
diff --git a/lib/CodeGen/RegAllocFast.cpp b/lib/CodeGen/RegAllocFast.cpp
index e096240e04b..d6ed36ef95b 100644
--- a/lib/CodeGen/RegAllocFast.cpp
+++ b/lib/CodeGen/RegAllocFast.cpp
@@ -113,9 +113,11 @@ namespace {
     // PhysRegState - One of the RegState enums, or a virtreg.
     std::vector<unsigned> PhysRegState;

-    // UsedInInstr - BitVector of physregs that are used in the current
-    // instruction, and so cannot be allocated.
-    BitVector UsedInInstr;
+    typedef SparseSet<unsigned> UsedInInstrSet;
+
+    // UsedInInstr - Set of physregs that are used in the current instruction,
+    // and so cannot be allocated.
+    UsedInInstrSet UsedInInstr;

     // SkippedInstrs - Descriptors of instructions whose clobber list was
     // ignored because all registers were spilled. It is still necessary to
@@ -340,7 +342,7 @@ void RAFast::usePhysReg(MachineOperand &MO) {
     PhysRegState[PhysReg] = regFree;
     // Fall through
   case regFree:
-    UsedInInstr.set(PhysReg);
+    UsedInInstr.insert(PhysReg);
     MO.setIsKill();
     return;
   default:
@@ -360,13 +362,13 @@ void RAFast::usePhysReg(MachineOperand &MO) {
            "Instruction is not using a subregister of a reserved register");
     // Leave the superregister in the working set.
     PhysRegState[Alias] = regFree;
-    UsedInInstr.set(Alias);
+    UsedInInstr.insert(Alias);
     MO.getParent()->addRegisterKilled(Alias, TRI, true);
     return;
   case regFree:
     if (TRI->isSuperRegister(PhysReg, Alias)) {
       // Leave the superregister in the working set.
-      UsedInInstr.set(Alias);
+      UsedInInstr.insert(Alias);
       MO.getParent()->addRegisterKilled(Alias, TRI, true);
       return;
     }
@@ -380,7 +382,7 @@ void RAFast::usePhysReg(MachineOperand &MO) {

   // All aliases are disabled, bring register into working set.
   PhysRegState[PhysReg] = regFree;
-  UsedInInstr.set(PhysReg);
+  UsedInInstr.insert(PhysReg);
   MO.setIsKill();
 }

@@ -389,7 +391,7 @@ void RAFast::usePhysReg(MachineOperand &MO) {
 /// reserved instead of allocated.
 void RAFast::definePhysReg(MachineInstr *MI, unsigned PhysReg,
                            RegState NewState) {
-  UsedInInstr.set(PhysReg);
+  UsedInInstr.insert(PhysReg);
   switch (unsigned VirtReg = PhysRegState[PhysReg]) {
   case regDisabled:
     break;
@@ -429,7 +431,7 @@ void RAFast::definePhysReg(MachineInstr *MI, unsigned PhysReg,
 // can be allocated directly.
 // Returns spillImpossible when PhysReg or an alias can't be spilled.
 unsigned RAFast::calcSpillCost(unsigned PhysReg) const {
-  if (UsedInInstr.test(PhysReg)) {
+  if (UsedInInstr.count(PhysReg)) {
     DEBUG(dbgs() << PrintReg(PhysReg, TRI) << " is already used in instr.\n");
     return spillImpossible;
   }
@@ -454,7 +456,7 @@ unsigned RAFast::calcSpillCost(unsigned PhysReg) const {
   unsigned Cost = 0;
   for (MCRegAliasIterator AI(PhysReg, TRI, false); AI.isValid(); ++AI) {
     unsigned Alias = *AI;
-    if (UsedInInstr.test(Alias))
+    if (UsedInInstr.count(Alias))
       return spillImpossible;
     switch (unsigned VirtReg = PhysRegState[Alias]) {
     case regDisabled:
@@ -530,7 +532,7 @@ RAFast::LiveRegMap::iterator RAFast::allocVirtReg(MachineInstr *MI,

   // First try to find a completely free register.
   for (ArrayRef<unsigned>::iterator I = AO.begin(), E = AO.end(); I != E; ++I) {
     unsigned PhysReg = *I;
-    if (PhysRegState[PhysReg] == regFree && !UsedInInstr.test(PhysReg)) {
+    if (PhysRegState[PhysReg] == regFree && !UsedInInstr.count(PhysReg)) {
       assignVirtToPhysReg(*LRI, PhysReg);
       return LRI;
     }
@@ -596,7 +598,7 @@ RAFast::defineVirtReg(MachineInstr *MI, unsigned OpNum,
   LRI->LastUse = MI;
   LRI->LastOpNum = OpNum;
   LRI->Dirty = true;
-  UsedInInstr.set(LRI->PhysReg);
+  UsedInInstr.insert(LRI->PhysReg);
   return LRI;
 }

@@ -646,7 +648,7 @@ RAFast::reloadVirtReg(MachineInstr *MI, unsigned OpNum,
   assert(LRI->PhysReg && "Register not assigned");
   LRI->LastUse = MI;
   LRI->LastOpNum = OpNum;
-  UsedInInstr.set(LRI->PhysReg);
+  UsedInInstr.insert(LRI->PhysReg);
   return LRI;
 }

@@ -708,7 +710,7 @@ void RAFast::handleThroughOperands(MachineInstr *MI,
     unsigned Reg = MO.getReg();
     if (!Reg || !TargetRegisterInfo::isPhysicalRegister(Reg)) continue;
     for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) {
-      UsedInInstr.set(*AI);
+      UsedInInstr.insert(*AI);
       if (ThroughRegs.count(PhysRegState[*AI]))
         definePhysReg(MI, *AI, regFree);
     }
@@ -756,7 +758,7 @@ void RAFast::handleThroughOperands(MachineInstr *MI,
   }

   // Restore UsedInInstr to a state usable for allocating normal virtual uses.
-  UsedInInstr.reset();
+  UsedInInstr.clear();
   for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
     MachineOperand &MO = MI->getOperand(i);
     if (!MO.isReg() || (MO.isDef() && !MO.isEarlyClobber())) continue;
     unsigned Reg = MO.getReg();
     if (!Reg || !TargetRegisterInfo::isPhysicalRegister(Reg)) continue;
     DEBUG(dbgs() << "\tSetting " << PrintReg(Reg, TRI)
                  << " as used in instr\n");
-    UsedInInstr.set(Reg);
+    UsedInInstr.insert(Reg);
   }

   // Also mark PartialDefs as used to avoid reallocation.
   for (unsigned i = 0, e = PartialDefs.size(); i != e; ++i)
-    UsedInInstr.set(PartialDefs[i]);
+    UsedInInstr.insert(PartialDefs[i]);
 }

 /// addRetOperand - ensure that a return instruction has an operand for each
@@ -942,7 +944,7 @@ void RAFast::AllocateBasicBlock() {
     }

     // Track registers used by instruction.
-    UsedInInstr.reset();
+    UsedInInstr.clear();

     // First scan.
     // Mark physreg uses and early clobbers as used.
@@ -1016,11 +1018,13 @@ void RAFast::AllocateBasicBlock() {
       }
     }

-    MRI->addPhysRegsUsed(UsedInInstr);
+    for (UsedInInstrSet::iterator
+         I = UsedInInstr.begin(), E = UsedInInstr.end(); I != E; ++I)
+      MRI->setPhysRegUsed(*I);

     // Track registers defined by instruction - early clobbers and tied uses at
     // this point.
-    UsedInInstr.reset();
+    UsedInInstr.clear();
     if (hasEarlyClobbers) {
       for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
         MachineOperand &MO = MI->getOperand(i);
@@ -1030,7 +1034,7 @@ void RAFast::AllocateBasicBlock() {
         // Look for physreg defs and tied uses.
         if (!MO.isDef() && !MI->isRegTiedToDefOperand(i)) continue;
         for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
-          UsedInInstr.set(*AI);
+          UsedInInstr.insert(*AI);
       }
     }

@@ -1080,7 +1084,9 @@ void RAFast::AllocateBasicBlock() {
       killVirtReg(VirtDead[i]);
     VirtDead.clear();

-    MRI->addPhysRegsUsed(UsedInInstr);
+    for (UsedInInstrSet::iterator
+         I = UsedInInstr.begin(), E = UsedInInstr.end(); I != E; ++I)
+      MRI->setPhysRegUsed(*I);

     if (CopyDst && CopyDst == CopySrc && CopyDstSub == CopySrcSub) {
       DEBUG(dbgs() << "-- coalescing: " << *MI);
@@ -1118,7 +1124,8 @@ bool RAFast::runOnMachineFunction(MachineFunction &Fn) {
   TII = TM->getInstrInfo();
   MRI->freezeReservedRegs(Fn);
   RegClassInfo.runOnMachineFunction(Fn);
-  UsedInInstr.resize(TRI->getNumRegs());
+  UsedInInstr.clear();
+  UsedInInstr.setUniverse(TRI->getNumRegs());

   assert(!MRI->isSSA() && "regalloc requires leaving SSA");
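The switch from BitVector to SparseSet above trades a full per-instruction bit-vector sweep for a constant-time clear. A minimal standalone sketch of the SparseSet pattern used here (the values and assertions are illustrative):

    #include "llvm/ADT/SparseSet.h"
    #include <cassert>

    int main() {
      llvm::SparseSet<unsigned> Used;
      Used.setUniverse(256);   // keys are register numbers in [0, 256)
      Used.insert(42);
      assert(Used.count(42) && !Used.count(7));
      Used.clear();            // O(1), unlike BitVector::reset() which
                               // touches the whole universe
      return 0;
    }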
diff --git a/lib/CodeGen/RegisterCoalescer.cpp b/lib/CodeGen/RegisterCoalescer.cpp
index ba6b4569a8f..2ca67d63257 100644
--- a/lib/CodeGen/RegisterCoalescer.cpp
+++ b/lib/CodeGen/RegisterCoalescer.cpp
@@ -1302,7 +1302,7 @@ public:
                    SmallVectorImpl<unsigned> &ShrinkRegs);

   /// Get the value assignments suitable for passing to LiveInterval::join.
-  const int *getAssignments() const { return &Assignments[0]; }
+  const int *getAssignments() const { return Assignments.data(); }
 };
 } // end anonymous namespace
diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 8846247090c..17386b74e36 100644
--- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -5308,6 +5308,48 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
     if (Reduced.getNode())
       return Reduced;
   }
+  // fold (trunc (concat ... x ...)) -> (concat ..., (trunc x), ...)),
+  // where ... are all 'undef'.
+  if (N0.getOpcode() == ISD::CONCAT_VECTORS && !LegalTypes) {
+    SmallVector<EVT, 8> VTs;
+    SDValue V;
+    unsigned Idx = 0;
+    unsigned NumDefs = 0;
+
+    for (unsigned i = 0, e = N0.getNumOperands(); i != e; ++i) {
+      SDValue X = N0.getOperand(i);
+      if (X.getOpcode() != ISD::UNDEF) {
+        V = X;
+        Idx = i;
+        NumDefs++;
+      }
+      // Stop if more than one members are non-undef.
+      if (NumDefs > 1)
+        break;
+      VTs.push_back(EVT::getVectorVT(*DAG.getContext(),
+                                     VT.getVectorElementType(),
+                                     X.getValueType().getVectorNumElements()));
+    }
+
+    if (NumDefs == 0)
+      return DAG.getUNDEF(VT);
+
+    if (NumDefs == 1) {
+      assert(V.getNode() && "The single defined operand is empty!");
+      SmallVector<SDValue, 8> Opnds;
+      for (unsigned i = 0, e = VTs.size(); i != e; ++i) {
+        if (i != Idx) {
+          Opnds.push_back(DAG.getUNDEF(VTs[i]));
+          continue;
+        }
+        SDValue NV = DAG.getNode(ISD::TRUNCATE, V.getDebugLoc(), VTs[i], V);
+        AddToWorkList(NV.getNode());
+        Opnds.push_back(NV);
+      }
+      return DAG.getNode(ISD::CONCAT_VECTORS, N->getDebugLoc(), VT,
+                         &Opnds[0], Opnds.size());
+    }
+  }

   // Simplify the operands using demanded-bits information.
   if (!VT.isVector() &&
@@ -8564,8 +8606,8 @@ SDValue DAGCombiner::visitEXTRACT_SUBVECTOR(SDNode* N) {
     return SDValue();

   // Only handle cases where both indexes are constants with the same type.
-  ConstantSDNode *InsIdx = dyn_cast<ConstantSDNode>(N->getOperand(1));
-  ConstantSDNode *ExtIdx = dyn_cast<ConstantSDNode>(V->getOperand(2));
+  ConstantSDNode *ExtIdx = dyn_cast<ConstantSDNode>(N->getOperand(1));
+  ConstantSDNode *InsIdx = dyn_cast<ConstantSDNode>(V->getOperand(2));

   if (InsIdx && ExtIdx &&
       InsIdx->getValueType(0).getSizeInBits() <= 64 &&
@@ -8582,6 +8624,21 @@ SDValue DAGCombiner::visitEXTRACT_SUBVECTOR(SDNode* N) {
     }
   }

+  if (V->getOpcode() == ISD::CONCAT_VECTORS) {
+    // Combine:
+    //    (extract_subvec (concat V1, V2, ...), i)
+    // Into:
+    //    Vi if possible
+    // Only operand 0 is checked as 'concat' assumes all inputs of the same type.
+    if (V->getOperand(0).getValueType() != NVT)
+      return SDValue();
+    unsigned Idx = dyn_cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
+    unsigned NumElems = NVT.getVectorNumElements();
+    assert((Idx % NumElems) == 0 &&
+           "IDX in concat is not a multiple of the result vector length.");
+    return V->getOperand(Idx / NumElems);
+  }
+
   return SDValue();
 }

@@ -9018,6 +9075,10 @@ bool DAGCombiner::SimplifySelectOps(SDNode *TheSelect, SDValue LHS,
       if ((LLD->hasAnyUseOfValue(1) && LLD->isPredecessorOf(CondNode)) ||
           (RLD->hasAnyUseOfValue(1) && RLD->isPredecessorOf(CondNode)))
         return false;
+      // The loads must not depend on one another.
+      if (LLD->isPredecessorOf(RLD) ||
+          RLD->isPredecessorOf(LLD))
+        return false;
       Addr = DAG.getNode(ISD::SELECT, TheSelect->getDebugLoc(),
                          LLD->getBasePtr().getValueType(),
                          TheSelect->getOperand(0), LLD->getBasePtr(),
diff --git a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index 2ec129f7308..abf40b77a18 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -1240,6 +1240,19 @@ void SelectionDAGLegalize::LegalizeOp(SDNode *Node) {
     if (Action == TargetLowering::Legal)
       Action = TargetLowering::Custom;
     break;
+  case ISD::DEBUGTRAP:
+    Action = TLI.getOperationAction(Node->getOpcode(), Node->getValueType(0));
+    if (Action == TargetLowering::Expand) {
+      // replace ISD::DEBUGTRAP with ISD::TRAP
+      SDValue NewVal;
+      NewVal = DAG.getNode(ISD::TRAP, Node->getDebugLoc(), Node->getVTList(),
+                           Node->getOperand(0));
+      ReplaceNode(Node, NewVal.getNode());
+      LegalizeOp(NewVal.getNode());
+      return;
+    }
+    break;
+
   default:
     if (Node->getOpcode() >= ISD::BUILTIN_OP_END) {
       Action = TargetLowering::Legal;
+// IMPORTANT: this may not work for targets with phyreg dependency. +// +class ScheduleDAGLinearize : public ScheduleDAGSDNodes { +public: + ScheduleDAGLinearize(MachineFunction &mf) : ScheduleDAGSDNodes(mf) {} + + void Schedule(); + + MachineBasicBlock *EmitSchedule(MachineBasicBlock::iterator &InsertPos); + +private: + std::vector<SDNode*> Sequence; + DenseMap<SDNode*, SDNode*> GluedMap; // Cache glue to its user + + void ScheduleNode(SDNode *N); +}; +} // end anonymous namespace + +void ScheduleDAGLinearize::ScheduleNode(SDNode *N) { + if (N->getNodeId() != 0) + llvm_unreachable(0); + + if (!N->isMachineOpcode() && + (N->getOpcode() == ISD::EntryToken || isPassiveNode(N))) + // These nodes do not need to be translated into MIs. + return; + + DEBUG(dbgs() << "\n*** Scheduling: "); + DEBUG(N->dump(DAG)); + Sequence.push_back(N); + + unsigned NumOps = N->getNumOperands(); + if (unsigned NumLeft = NumOps) { + SDNode *GluedOpN = 0; + do { + const SDValue &Op = N->getOperand(NumLeft-1); + SDNode *OpN = Op.getNode(); + + if (NumLeft == NumOps && Op.getValueType() == MVT::Glue) { + // Schedule glue operand right above N. + GluedOpN = OpN; + assert(OpN->getNodeId() != 0 && "Glue operand not ready?"); + OpN->setNodeId(0); + ScheduleNode(OpN); + continue; + } + + if (OpN == GluedOpN) + // Glue operand is already scheduled. + continue; + + DenseMap<SDNode*, SDNode*>::iterator DI = GluedMap.find(OpN); + if (DI != GluedMap.end() && DI->second != N) + // Users of glues are counted against the glued users. + OpN = DI->second; + + unsigned Degree = OpN->getNodeId(); + assert(Degree > 0 && "Predecessor over-released!"); + OpN->setNodeId(--Degree); + if (Degree == 0) + ScheduleNode(OpN); + } while (--NumLeft); + } +} + +/// findGluedUser - Find the representative use of a glue value by walking +/// the use chain. +static SDNode *findGluedUser(SDNode *N) { + while (SDNode *Glued = N->getGluedUser()) + N = Glued; + return N; +} + +void ScheduleDAGLinearize::Schedule() { + DEBUG(dbgs() << "********** DAG Linearization **********\n"); + + SmallVector<SDNode*, 8> Glues; + unsigned DAGSize = 0; + for (SelectionDAG::allnodes_iterator I = DAG->allnodes_begin(), + E = DAG->allnodes_end(); I != E; ++I) { + SDNode *N = I; + + // Use node id to record degree. + unsigned Degree = N->use_size(); + N->setNodeId(Degree); + unsigned NumVals = N->getNumValues(); + if (NumVals && N->getValueType(NumVals-1) == MVT::Glue && + N->hasAnyUseOfValue(NumVals-1)) { + SDNode *User = findGluedUser(N); + if (User) { + Glues.push_back(N); + GluedMap.insert(std::make_pair(N, User)); + } + } + + if (N->isMachineOpcode() || + (N->getOpcode() != ISD::EntryToken && !isPassiveNode(N))) + ++DAGSize; + } + + for (unsigned i = 0, e = Glues.size(); i != e; ++i) { + SDNode *Glue = Glues[i]; + SDNode *GUser = GluedMap[Glue]; + unsigned Degree = Glue->getNodeId(); + unsigned UDegree = GUser->getNodeId(); + + // Glue user must be scheduled together with the glue operand. So other + // users of the glue operand must be treated as its users. 
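ScheduleDAGLinearize above repurposes each node's id as a countdown of unscheduled users: a node is appended to Sequence as soon as its last user has been emitted, yielding a reverse topological order that EmitSchedule later walks backwards. A self-contained sketch of that discipline, with the glue special-casing omitted (Node, scheduleNode, and the tiny graph in main are illustrative):

#include <cstddef>
#include <cstdio>
#include <vector>

// Degree starts at the node's use count, mirroring how the linearizer
// reuses NodeId as a countdown of not-yet-scheduled users.
struct Node {
  const char *Name;
  std::vector<Node*> Operands;
  unsigned Degree;
};

static std::vector<Node*> Sequence;

// Emit N, then release each operand in reverse order; an operand is
// scheduled only once its last user has been scheduled.
void scheduleNode(Node *N) {
  Sequence.push_back(N);
  for (std::size_t i = N->Operands.size(); i != 0; --i) {
    Node *Op = N->Operands[i - 1];
    if (--Op->Degree == 0)
      scheduleNode(Op);
  }
}

int main() {
  Node A = {"A", {}, 2}, B = {"B", {}, 1};      // A has two users, B one
  Node C = {"C", {&A, &B}, 1};
  Node Root = {"Root", {&A, &C}, 0};
  scheduleNode(&Root);                          // Sequence: Root, C, B, A
  for (std::size_t i = 0; i != Sequence.size(); ++i)
    std::printf("%s\n", Sequence[i]->Name);
}

The emitter then visits Sequence back to front, so A is materialized first; that reversal is exactly what the EmitSchedule loop in the patch does with Sequence[NumNodes-i-1].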
+ SDNode *ImmGUser = Glue->getGluedUser(); + for (SDNode::use_iterator ui = Glue->use_begin(), ue = Glue->use_end(); + ui != ue; ++ui) + if (*ui == ImmGUser) + --Degree; + GUser->setNodeId(UDegree + Degree); + Glue->setNodeId(1); + } + + Sequence.reserve(DAGSize); + ScheduleNode(DAG->getRoot().getNode()); +} + +MachineBasicBlock* +ScheduleDAGLinearize::EmitSchedule(MachineBasicBlock::iterator &InsertPos) { + InstrEmitter Emitter(BB, InsertPos); + DenseMap<SDValue, unsigned> VRBaseMap; + + DEBUG({ + dbgs() << "\n*** Final schedule ***\n"; + }); + + // FIXME: Handle dbg_values. + unsigned NumNodes = Sequence.size(); + for (unsigned i = 0; i != NumNodes; ++i) { + SDNode *N = Sequence[NumNodes-i-1]; + DEBUG(N->dump(DAG)); + Emitter.EmitNode(N, false, false, VRBaseMap); + } + + DEBUG(dbgs() << '\n'); + + InsertPos = Emitter.getInsertPos(); + return Emitter.getBlock(); +} + //===----------------------------------------------------------------------===// // Public Constructor Functions //===----------------------------------------------------------------------===// @@ -637,3 +791,8 @@ llvm::ScheduleDAGSDNodes * llvm::createFastDAGScheduler(SelectionDAGISel *IS, CodeGenOpt::Level) { return new ScheduleDAGFast(*IS->MF); } + +llvm::ScheduleDAGSDNodes * +llvm::createDAGLinearizer(SelectionDAGISel *IS, CodeGenOpt::Level) { + return new ScheduleDAGLinearize(*IS->MF); +} diff --git a/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp b/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp index 660223a5059..714471f559e 100644 --- a/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp +++ b/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp @@ -831,8 +831,7 @@ EmitSchedule(MachineBasicBlock::iterator &InsertPos) { } SmallVector<SDNode *, 4> GluedNodes; - for (SDNode *N = SU->getNode()->getGluedNode(); N; - N = N->getGluedNode()) + for (SDNode *N = SU->getNode()->getGluedNode(); N; N = N->getGluedNode()) GluedNodes.push_back(N); while (!GluedNodes.empty()) { SDNode *N = GluedNodes.back(); diff --git a/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.h b/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.h index 8e7bd822014..907356fd212 100644 --- a/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.h +++ b/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.h @@ -114,7 +114,8 @@ namespace llvm { /// EmitSchedule - Insert MachineInstrs into the MachineBasicBlock /// according to the order specified in Sequence. /// - MachineBasicBlock *EmitSchedule(MachineBasicBlock::iterator &InsertPos); + virtual MachineBasicBlock* + EmitSchedule(MachineBasicBlock::iterator &InsertPos); virtual void dumpNode(const SUnit *SU) const; diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index cdebff94e59..db159941749 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -5175,10 +5175,13 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { return 0; } + case Intrinsic::debugtrap: case Intrinsic::trap: { StringRef TrapFuncName = TM.Options.getTrapFunctionName(); if (TrapFuncName.empty()) { - DAG.setRoot(DAG.getNode(ISD::TRAP, dl,MVT::Other, getRoot())); + ISD::NodeType Op = (Intrinsic == Intrinsic::trap) ? 
+ ISD::TRAP : ISD::DEBUGTRAP; + DAG.setRoot(DAG.getNode(Op, dl,MVT::Other, getRoot())); return 0; } TargetLowering::ArgListTy Args; @@ -5193,10 +5196,7 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { DAG.setRoot(Result.second); return 0; } - case Intrinsic::debugtrap: { - DAG.setRoot(DAG.getNode(ISD::DEBUGTRAP, dl,MVT::Other, getRoot())); - return 0; - } + case Intrinsic::uadd_with_overflow: case Intrinsic::sadd_with_overflow: case Intrinsic::usub_with_overflow: diff --git a/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/lib/CodeGen/SelectionDAG/TargetLowering.cpp index cd485ac235c..5abc55ba8e2 100644 --- a/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -583,6 +583,11 @@ TargetLowering::TargetLowering(const TargetMachine &tm, // Default ISD::TRAP to expand (which turns it into abort). setOperationAction(ISD::TRAP, MVT::Other, Expand); + // On most systems, DEBUGTRAP and TRAP have no difference. The "Expand" + // here is to inform DAG Legalizer to replace DEBUGTRAP with TRAP. + // + setOperationAction(ISD::DEBUGTRAP, MVT::Other, Expand); + IsLittleEndian = TD->isLittleEndian(); PointerTy = MVT::getIntegerVT(8*TD->getPointerSize(0)); memset(RegClassForVT, 0,MVT::LAST_VALUETYPE*sizeof(TargetRegisterClass*)); diff --git a/lib/CodeGen/StackColoring.cpp b/lib/CodeGen/StackColoring.cpp index 54d8c8cde7e..1cbee843a12 100644 --- a/lib/CodeGen/StackColoring.cpp +++ b/lib/CodeGen/StackColoring.cpp @@ -48,6 +48,7 @@ #include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/SlotIndexes.h" #include "llvm/DebugInfo.h" +#include "llvm/Instructions.h" #include "llvm/MC/MCInstrItineraries.h" #include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetRegisterInfo.h" @@ -260,7 +261,7 @@ unsigned StackColoring::collectMarkers(unsigned NumSlot) { MarkersFound++; - const Value *Allocation = MFI->getObjectAllocation(Slot); + const AllocaInst *Allocation = MFI->getObjectAllocation(Slot); if (Allocation) { DEBUG(dbgs()<<"Found a lifetime marker for slot #"<<Slot<< " with allocation: "<< Allocation->getName()<<"\n"); @@ -480,11 +481,11 @@ void StackColoring::remapInstructions(DenseMap<int, int> &SlotRemap) { } // Keep a list of *allocas* which need to be remapped. - DenseMap<const Value*, const Value*> Allocas; + DenseMap<const AllocaInst*, const AllocaInst*> Allocas; for (DenseMap<int, int>::iterator it = SlotRemap.begin(), e = SlotRemap.end(); it != e; ++it) { - const Value *From = MFI->getObjectAllocation(it->first); - const Value *To = MFI->getObjectAllocation(it->second); + const AllocaInst *From = MFI->getObjectAllocation(it->first); + const AllocaInst *To = MFI->getObjectAllocation(it->second); assert(To && From && "Invalid allocation object"); Allocas[From] = To; } @@ -514,10 +515,17 @@ void StackColoring::remapInstructions(DenseMap<int, int> &SlotRemap) { V = GetUnderlyingObject(V); // If we did not find one, or if the one that we found is not in our // map, then move on. - if (!V || !Allocas.count(V)) + if (!V || !isa<AllocaInst>(V)) { + // Clear mem operand since we don't know for sure that it doesn't + // alias a merged alloca. 
+ MMO->setValue(0); + continue; + } + const AllocaInst *AI= cast<AllocaInst>(V); + if (!Allocas.count(AI)) continue; - MMO->setValue(Allocas[V]); + MMO->setValue(Allocas[AI]); FixedMemOp++; } diff --git a/lib/CodeGen/TargetSchedule.cpp b/lib/CodeGen/TargetSchedule.cpp index 7a6e2604d77..6a096a16c4e 100644 --- a/lib/CodeGen/TargetSchedule.cpp +++ b/lib/CodeGen/TargetSchedule.cpp @@ -58,6 +58,14 @@ unsigned TargetSchedModel::getNumMicroOps(MachineInstr *MI) const { return MI->isTransient() ? 0 : 1; } +// The machine model may explicitly specify an invalid latency, which +// effectively means infinite latency. Since users of the TargetSchedule API +// don't know how to handle this, we convert it to a very large latency that is +// easy to distinguish when debugging the DAG but won't induce overflow. +static unsigned convertLatency(int Cycles) { + return Cycles >= 0 ? Cycles : 1000; +} + /// If we can determine the operand latency from the def only, without machine /// model or itinerary lookup, do so. Otherwise return -1. int TargetSchedModel::getDefLatency(const MachineInstr *DefMI, @@ -178,7 +186,7 @@ unsigned TargetSchedModel::computeOperandLatency( const MCWriteLatencyEntry *WLEntry = STI->getWriteLatencyEntry(SCDesc, DefIdx); unsigned WriteID = WLEntry->WriteResourceID; - unsigned Latency = WLEntry->Cycles; + unsigned Latency = convertLatency(WLEntry->Cycles); if (!UseMI) return Latency; @@ -219,7 +227,7 @@ unsigned TargetSchedModel::computeInstrLatency(const MachineInstr *MI) const { // Lookup the definition's write latency in SubtargetInfo. const MCWriteLatencyEntry *WLEntry = STI->getWriteLatencyEntry(SCDesc, DefIdx); - Latency = std::max(Latency, WLEntry->Cycles); + Latency = std::max(Latency, convertLatency(WLEntry->Cycles)); } return Latency; } diff --git a/lib/DebugInfo/DWARFContext.cpp b/lib/DebugInfo/DWARFContext.cpp index 241f55eaed2..afd614cc356 100644 --- a/lib/DebugInfo/DWARFContext.cpp +++ b/lib/DebugInfo/DWARFContext.cpp @@ -17,6 +17,8 @@ using namespace llvm; using namespace dwarf; +typedef DWARFDebugLine::LineTable DWARFLineTable; + void DWARFContext::dump(raw_ostream &OS) { OS << ".debug_abbrev contents:\n"; getDebugAbbrev()->dump(OS); @@ -94,7 +96,7 @@ const DWARFDebugAranges *DWARFContext::getDebugAranges() { return Aranges.get(); } -const DWARFDebugLine::LineTable * +const DWARFLineTable * DWARFContext::getLineTableForCompileUnit(DWARFCompileUnit *cu) { if (!Line) Line.reset(new DWARFDebugLine()); @@ -106,7 +108,7 @@ DWARFContext::getLineTableForCompileUnit(DWARFCompileUnit *cu) { return 0; // No line table for this compile unit. // See if the line table is cached. - if (const DWARFDebugLine::LineTable *lt = Line->getLineTable(stmtOffset)) + if (const DWARFLineTable *lt = Line->getLineTable(stmtOffset)) return lt; // We have to parse it first. 
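convertLatency() above is small enough to test in isolation; the harness below just pins down the sentinel behavior (the 1000-cycle cap is the value from the patch, chosen to be visibly large when debugging the DAG yet safe from overflow in later max/add operations):

#include <algorithm>
#include <cassert>

// Negative cycle counts in the machine model mean "unknown/infinite"
// latency; clamp them to a large finite value as the patch does.
static unsigned convertLatency(int Cycles) {
  return Cycles >= 0 ? static_cast<unsigned>(Cycles) : 1000;
}

int main() {
  assert(convertLatency(3) == 3);
  assert(convertLatency(-1) == 1000);              // sentinel for "invalid"
  assert(std::max(7u, convertLatency(-1)) == 1000); // dominates real latencies
}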
@@ -117,11 +119,11 @@ DWARFContext::getLineTableForCompileUnit(DWARFCompileUnit *cu) { void DWARFContext::parseCompileUnits() { uint32_t offset = 0; - const DataExtractor &debug_info_data = DataExtractor(getInfoSection(), - isLittleEndian(), 0); - while (debug_info_data.isValidOffset(offset)) { + const DataExtractor &DIData = DataExtractor(getInfoSection(), + isLittleEndian(), 0); + while (DIData.isValidOffset(offset)) { CUs.push_back(DWARFCompileUnit(*this)); - if (!CUs.back().extract(debug_info_data, &offset)) { + if (!CUs.back().extract(DIData, &offset)) { CUs.pop_back(); break; } @@ -163,9 +165,11 @@ DWARFCompileUnit *DWARFContext::getCompileUnitForAddress(uint64_t Address) { return getCompileUnitForOffset(CUOffset); } -static bool getFileNameForCompileUnit( - DWARFCompileUnit *CU, const DWARFDebugLine::LineTable *LineTable, - uint64_t FileIndex, bool NeedsAbsoluteFilePath, std::string &FileName) { +static bool getFileNameForCompileUnit(DWARFCompileUnit *CU, + const DWARFLineTable *LineTable, + uint64_t FileIndex, + bool NeedsAbsoluteFilePath, + std::string &FileName) { if (CU == 0 || LineTable == 0 || !LineTable->getFileNameByIndex(FileIndex, NeedsAbsoluteFilePath, @@ -183,10 +187,12 @@ static bool getFileNameForCompileUnit( return true; } -static bool getFileLineInfoForCompileUnit( - DWARFCompileUnit *CU, const DWARFDebugLine::LineTable *LineTable, - uint64_t Address, bool NeedsAbsoluteFilePath, std::string &FileName, - uint32_t &Line, uint32_t &Column) { +static bool getFileLineInfoForCompileUnit(DWARFCompileUnit *CU, + const DWARFLineTable *LineTable, + uint64_t Address, + bool NeedsAbsoluteFilePath, + std::string &FileName, + uint32_t &Line, uint32_t &Column) { if (CU == 0 || LineTable == 0) return false; // Get the index of row we're looking for in the line table. @@ -225,8 +231,7 @@ DILineInfo DWARFContext::getLineInfoForAddress(uint64_t Address, } } if (Specifier.needs(DILineInfoSpecifier::FileLineInfo)) { - const DWARFDebugLine::LineTable *LineTable = - getLineTableForCompileUnit(CU); + const DWARFLineTable *LineTable = getLineTableForCompileUnit(CU); const bool NeedsAbsoluteFilePath = Specifier.needs(DILineInfoSpecifier::AbsoluteFilePath); getFileLineInfoForCompileUnit(CU, LineTable, Address, @@ -250,7 +255,7 @@ DIInliningInfo DWARFContext::getInliningInfoForAddress(uint64_t Address, DIInliningInfo InliningInfo; uint32_t CallFile = 0, CallLine = 0, CallColumn = 0; - const DWARFDebugLine::LineTable *LineTable = 0; + const DWARFLineTable *LineTable = 0; for (uint32_t i = 0, n = InlinedChain.size(); i != n; i++) { const DWARFDebugInfoEntryMinimal &FunctionDIE = InlinedChain[i]; std::string FileName = "<invalid>"; diff --git a/lib/MC/ELFObjectWriter.cpp b/lib/MC/ELFObjectWriter.cpp index 6e37b5c1a03..b8b3188ce48 100644 --- a/lib/MC/ELFObjectWriter.cpp +++ b/lib/MC/ELFObjectWriter.cpp @@ -733,8 +733,7 @@ void ELFObjectWriter::RecordRelocation(const MCAssembler &Asm, Index = -1; } Addend = Value; - // Compensate for the addend on i386. 
- if (is64Bit()) + if (hasRelocationAddend()) Value = 0; } diff --git a/lib/MC/MCParser/AsmParser.cpp b/lib/MC/MCParser/AsmParser.cpp index 0406ff8d446..f22b2754f68 100644 --- a/lib/MC/MCParser/AsmParser.cpp +++ b/lib/MC/MCParser/AsmParser.cpp @@ -19,6 +19,8 @@ #include "llvm/MC/MCContext.h" #include "llvm/MC/MCDwarf.h" #include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCInstPrinter.h" +#include "llvm/MC/MCInstrInfo.h" #include "llvm/MC/MCParser/AsmCond.h" #include "llvm/MC/MCParser/AsmLexer.h" #include "llvm/MC/MCParser/MCAsmParser.h" @@ -35,6 +37,8 @@ #include "llvm/Support/SourceMgr.h" #include "llvm/Support/raw_ostream.h" #include <cctype> +#include <set> +#include <string> #include <vector> using namespace llvm; @@ -42,6 +46,8 @@ static cl::opt<bool> FatalAssemblerWarnings("fatal-assembler-warnings", cl::desc("Consider warnings as error")); +MCAsmParserSemaCallback::~MCAsmParserSemaCallback() {} + namespace { /// \brief Helper class for tracking macro definitions. @@ -139,7 +145,8 @@ private: /// ParsedOperands - The parsed operands from the last parsed statement. SmallVector<MCParsedAsmOperand*, 8> ParsedOperands; - /// Opcode - The opcode from the last parsed instruction. + /// Opcode - The opcode from the last parsed instruction. This is MS-style + /// inline asm specific. unsigned Opcode; public: @@ -180,20 +187,17 @@ public: virtual const AsmToken &Lex(); - bool ParseStatement(); void setParsingInlineAsm(bool V) { ParsingInlineAsm = V; } - unsigned getNumParsedOperands() { return ParsedOperands.size(); } - MCParsedAsmOperand &getParsedOperand(unsigned OpNum) { - assert (ParsedOperands.size() > OpNum); - return *ParsedOperands[OpNum]; - } - void freeParsedOperands() { - for (unsigned i = 0, e = ParsedOperands.size(); i != e; ++i) - delete ParsedOperands[i]; - ParsedOperands.clear(); - } - bool isInstruction() { return Opcode != (unsigned)~0x0; } - unsigned getOpcode() { return Opcode; } + bool isParsingInlineAsm() { return ParsingInlineAsm; } + + bool ParseMSInlineAsm(void *AsmLoc, std::string &AsmString, + unsigned &NumOutputs, unsigned &NumInputs, + SmallVectorImpl<void *> &OpDecls, + SmallVectorImpl<std::string> &Constraints, + SmallVectorImpl<std::string> &Clobbers, + const MCInstrInfo *MII, + const MCInstPrinter *IP, + MCAsmParserSemaCallback &SI); bool ParseExpression(const MCExpr *&Res); virtual bool ParseExpression(const MCExpr *&Res, SMLoc &EndLoc); @@ -205,6 +209,7 @@ public: private: void CheckForValidSection(); + bool ParseStatement(); void EatToEndOfLine(); bool ParseCppHashLineFilenameComment(const SMLoc &L); @@ -310,6 +315,11 @@ private: bool ParseDirectiveIrp(SMLoc DirectiveLoc); // ".irp" bool ParseDirectiveIrpc(SMLoc DirectiveLoc); // ".irpc" bool ParseDirectiveEndr(SMLoc DirectiveLoc); // ".endr" + + // MS-style inline assembly parsing. + bool isInstruction() { return Opcode != (unsigned)~0x0; } + unsigned getOpcode() { return Opcode; } + void setOpcode(unsigned Value) { Opcode = Value; } }; /// \brief Generic implementations of directive handling, etc. which is shared @@ -1377,10 +1387,13 @@ bool AsmParser::ParseStatement() { ParsingInlineAsm); } - // Free any parsed operands. If parsing ms-style inline assembly it is the - // responsibility of the caller (i.e., clang) to free the parsed operands. - if (!ParsingInlineAsm) - freeParsedOperands(); + // Free any parsed operands. If parsing ms-style inline assembly the operands + // will be freed by the ParseMSInlineAsm() function. 
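The ELFObjectWriter change above replaces an is64Bit() proxy with the property that actually matters: whether the target's relocations carry an explicit addend. A rough model of the distinction as I read the hunk (Reloc and recordRelocation are illustrative, not the writer's real interface): with explicit addends the addend travels in the relocation record and the patched-in section value is zeroed; with implicit addends the value stays in the section bytes.

#include <cassert>
#include <cstdint>

struct Reloc { int64_t Addend; int64_t ValueInSection; };

Reloc recordRelocation(int64_t Value, bool HasRelocationAddend) {
  Reloc R;
  R.Addend = Value;                                   // kept for RELA targets
  R.ValueInSection = HasRelocationAddend ? 0 : Value; // REL: addend in place
  return R;
}

int main() {
  assert(recordRelocation(42, true).ValueInSection == 0);   // RELA-style
  assert(recordRelocation(42, false).ValueInSection == 42); // REL-style
}

Keying this off hasRelocationAddend() rather than is64Bit() matters because the two properties only coincide on some targets; the old code happened to work for i386 and x86-64 but not in general.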
+ if (!ParsingInlineAsm) { + for (unsigned i = 0, e = ParsedOperands.size(); i != e; ++i) + delete ParsedOperands[i]; + ParsedOperands.clear(); + } // Don't skip the rest of the line, the instruction parser is responsible for // that. @@ -3561,6 +3574,222 @@ bool AsmParser::ParseDirectiveEndr(SMLoc DirectiveLoc) { return false; } +namespace { +enum AsmRewriteKind { + AOK_Imm, + AOK_Input, + AOK_Output, + AOK_SizeDirective, + AOK_Skip +}; + +struct AsmRewrite { + AsmRewriteKind Kind; + SMLoc Loc; + unsigned Len; + unsigned Size; +public: + AsmRewrite(AsmRewriteKind kind, SMLoc loc, unsigned len, unsigned size = 0) + : Kind(kind), Loc(loc), Len(len), Size(size) { } +}; +} + +bool AsmParser::ParseMSInlineAsm(void *AsmLoc, std::string &AsmString, + unsigned &NumOutputs, unsigned &NumInputs, + SmallVectorImpl<void *> &OpDecls, + SmallVectorImpl<std::string> &Constraints, + SmallVectorImpl<std::string> &Clobbers, + const MCInstrInfo *MII, + const MCInstPrinter *IP, + MCAsmParserSemaCallback &SI) { + SmallVector<void*, 4> InputDecls; + SmallVector<void*, 4> OutputDecls; + SmallVector<std::string, 4> InputConstraints; + SmallVector<std::string, 4> OutputConstraints; + std::set<std::string> ClobberRegs; + + SmallVector<struct AsmRewrite, 4> AsmStrRewrites; + + // Prime the lexer. + Lex(); + + // While we have input, parse each statement. + unsigned InputIdx = 0; + unsigned OutputIdx = 0; + while (getLexer().isNot(AsmToken::Eof)) { + // Clear the opcode. + setOpcode(~0x0); + + // Save the conditional ignore state of the parser prior to parsing the statement. + bool PreParseCondStateIgnore = TheCondState.Ignore; + + // Save the starting point of this statement in case we need to skip it. + SMLoc Start = getLexer().getLoc(); + + if (ParseStatement()) + return true; + + // If PreParseCondStateIgnore is false, but TheCondState.Ignore is true, then we + // just parsed a directive that changed the state to ignore. Don't skip + // emitting this directive. + if (PreParseCondStateIgnore && TheCondState.Ignore) { + unsigned Len = getLexer().getLoc().getPointer() - Start.getPointer(); + AsmStrRewrites.push_back(AsmRewrite(AOK_Skip, Start, Len)); + continue; + } + + if (isInstruction()) { + const MCInstrDesc &Desc = MII->get(getOpcode()); + + // Build the list of clobbers, outputs and inputs. + for (unsigned i = 1, e = ParsedOperands.size(); i != e; ++i) { + MCParsedAsmOperand *Operand = ParsedOperands[i]; + + // Immediate. + if (Operand->isImm()) { + AsmStrRewrites.push_back(AsmRewrite(AOK_Imm, + Operand->getStartLoc(), + Operand->getNameLen())); + continue; + } + + // Register operand. + if (Operand->isReg()) { + unsigned NumDefs = Desc.getNumDefs(); + // Clobber. + if (NumDefs && Operand->getMCOperandNum() < NumDefs) { + std::string Reg; + raw_string_ostream OS(Reg); + IP->printRegName(OS, Operand->getReg()); + ClobberRegs.insert(StringRef(OS.str())); + } + continue; + } + + // Expr/Input or Output. 
+ unsigned Size; + void *OpDecl = SI.LookupInlineAsmIdentifier(Operand->getName(), AsmLoc, + Size); + if (OpDecl) { + bool isOutput = (i == 1) && Desc.mayStore(); + if (Operand->needSizeDirective()) + AsmStrRewrites.push_back(AsmRewrite(AOK_SizeDirective, + Operand->getStartLoc(), 0, + Operand->getMemSize())); + + if (isOutput) { + std::string Constraint = "="; + ++InputIdx; + OutputDecls.push_back(OpDecl); + Constraint += Operand->getConstraint().str(); + OutputConstraints.push_back(Constraint); + AsmStrRewrites.push_back(AsmRewrite(AOK_Output, + Operand->getStartLoc(), + Operand->getNameLen())); + } else { + InputDecls.push_back(OpDecl); + InputConstraints.push_back(Operand->getConstraint().str()); + AsmStrRewrites.push_back(AsmRewrite(AOK_Input, + Operand->getStartLoc(), + Operand->getNameLen())); + } + } + } + // Free any parsed operands. + for (unsigned i = 0, e = ParsedOperands.size(); i != e; ++i) + delete ParsedOperands[i]; + ParsedOperands.clear(); + } + } + + // Set the number of Outputs and Inputs. + NumOutputs = OutputDecls.size(); + NumInputs = InputDecls.size(); + + // Set the unique clobbers. + for (std::set<std::string>::iterator I = ClobberRegs.begin(), + E = ClobberRegs.end(); I != E; ++I) + Clobbers.push_back(*I); + + // Merge the various outputs and inputs. Output are expected first. + if (NumOutputs || NumInputs) { + unsigned NumExprs = NumOutputs + NumInputs; + OpDecls.resize(NumExprs); + Constraints.resize(NumExprs); + for (unsigned i = 0; i < NumOutputs; ++i) { + OpDecls[i] = OutputDecls[i]; + Constraints[i] = OutputConstraints[i]; + } + for (unsigned i = 0, j = NumOutputs; i < NumInputs; ++i, ++j) { + OpDecls[j] = InputDecls[i]; + Constraints[j] = InputConstraints[i]; + } + } + + // Build the IR assembly string. + std::string AsmStringIR; + AsmRewriteKind PrevKind = AOK_Imm; + raw_string_ostream OS(AsmStringIR); + const char *Start = SrcMgr.getMemoryBuffer(0)->getBufferStart(); + for (SmallVectorImpl<struct AsmRewrite>::iterator + I = AsmStrRewrites.begin(), E = AsmStrRewrites.end(); I != E; ++I) { + const char *Loc = (*I).Loc.getPointer(); + + AsmRewriteKind Kind = (*I).Kind; + + // Emit everything up to the immediate/expression. If the previous rewrite + // was a size directive, then this has already been done. + if (PrevKind != AOK_SizeDirective) + OS << StringRef(Start, Loc - Start); + PrevKind = Kind; + + // Skip the original expression. + if (Kind == AOK_Skip) { + Start = Loc + (*I).Len; + continue; + } + + // Rewrite expressions in $N notation. + switch (Kind) { + default: break; + case AOK_Imm: + OS << Twine("$$") + StringRef(Loc, (*I).Len); + break; + case AOK_Input: + OS << '$'; + OS << InputIdx++; + break; + case AOK_Output: + OS << '$'; + OS << OutputIdx++; + break; + case AOK_SizeDirective: + switch((*I).Size) { + default: break; + case 8: OS << "byte ptr "; break; + case 16: OS << "word ptr "; break; + case 32: OS << "dword ptr "; break; + case 64: OS << "qword ptr "; break; + case 80: OS << "xword ptr "; break; + case 128: OS << "xmmword ptr "; break; + case 256: OS << "ymmword ptr "; break; + } + } + + // Skip the original expression. + if (Kind != AOK_SizeDirective) + Start = Loc + (*I).Len; + } + + // Emit the remainder of the asm string. + const char *AsmEnd = SrcMgr.getMemoryBuffer(0)->getBufferEnd(); + if (Start != AsmEnd) + OS << StringRef(Start, AsmEnd - Start); + + AsmString = OS.str(); + return false; +} + /// \brief Create an MCAsmParser instance. 
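The second half of ParseMSInlineAsm above is a classic span rewriter: it copies the source asm verbatim between recorded ranges and substitutes $$imm or $N at each range. A compact standalone version of just that splice loop (Rewrite, rewriteAsm, and the offsets in main are hand-made illustrations; the size-directive and skip kinds are left out):

#include <cstddef>
#include <cstdio>
#include <string>
#include <vector>

enum Kind { Imm, Input, Output };
struct Rewrite { Kind K; std::size_t Off, Len; unsigned OpIdx; };

std::string rewriteAsm(const std::string &Src,
                       const std::vector<Rewrite> &Rs) {
  std::string Out;
  std::size_t Cursor = 0;
  for (std::size_t i = 0; i != Rs.size(); ++i) {
    const Rewrite &R = Rs[i];
    Out.append(Src, Cursor, R.Off - Cursor);   // copy text before the operand
    switch (R.K) {
    case Imm:
      Out += "$$";                             // escape literal immediates
      Out.append(Src, R.Off, R.Len);
      break;
    case Input:
    case Output:
      Out += '$';                              // operand placeholder $N
      Out += std::to_string(R.OpIdx);
      break;
    }
    Cursor = R.Off + R.Len;                    // skip the original spelling
  }
  Out.append(Src, Cursor, std::string::npos);  // emit the tail
  return Out;
}

int main() {
  Rewrite R = {Input, 9, 3, 0};                // "Sym" at offset 9 -> $0
  std::vector<Rewrite> Rs(1, R);
  std::printf("%s\n", rewriteAsm("mov eax, Sym", Rs).c_str()); // mov eax, $0
}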
MCAsmParser *llvm::createMCAsmParser(SourceMgr &SM, MCContext &C, MCStreamer &Out, diff --git a/lib/MC/MCParser/MCTargetAsmParser.cpp b/lib/MC/MCParser/MCTargetAsmParser.cpp index 6fb1ba4216f..60a3a3b59a3 100644 --- a/lib/MC/MCParser/MCTargetAsmParser.cpp +++ b/lib/MC/MCParser/MCTargetAsmParser.cpp @@ -11,7 +11,7 @@ using namespace llvm; MCTargetAsmParser::MCTargetAsmParser() - : AvailableFeatures(0) + : AvailableFeatures(0), ParsingInlineAsm(false) { } diff --git a/lib/MC/SubtargetFeature.cpp b/lib/MC/SubtargetFeature.cpp index 378c4184a83..7625abd465f 100644 --- a/lib/MC/SubtargetFeature.cpp +++ b/lib/MC/SubtargetFeature.cpp @@ -119,8 +119,8 @@ void SubtargetFeatures::AddFeature(const StringRef String, } /// Find KV in array using binary search. -const SubtargetFeatureKV *Find(const StringRef S, const SubtargetFeatureKV *A, - size_t L) { +static const SubtargetFeatureKV *Find(StringRef S, const SubtargetFeatureKV *A, + size_t L) { // Make the lower bound element we're looking for SubtargetFeatureKV KV; KV.Key = S.data(); diff --git a/lib/Support/Unix/Signals.inc b/lib/Support/Unix/Signals.inc index cf64fa8f385..9e94068c9c3 100644 --- a/lib/Support/Unix/Signals.inc +++ b/lib/Support/Unix/Signals.inc @@ -121,17 +121,29 @@ static void UnregisterHandlers() { /// NB: This must be an async signal safe function. It cannot allocate or free /// memory, even in debug builds. static void RemoveFilesToRemove() { - // Note: avoid iterators in case of debug iterators that allocate or release + // We avoid iterators in case of debug iterators that allocate or release // memory. for (unsigned i = 0, e = FilesToRemove.size(); i != e; ++i) { - // Note that we don't want to use any external code here, and we don't care - // about errors. We're going to try as hard as we can as often as we need - // to to make these files go away. If these aren't files, too bad. - // - // We do however rely on a std::string implementation for which repeated - // calls to 'c_str()' don't allocate memory. We pre-call 'c_str()' on all - // of these strings to try to ensure this is safe. - unlink(FilesToRemove[i].c_str()); + // We rely on a std::string implementation for which repeated calls to + // 'c_str()' don't allocate memory. We pre-call 'c_str()' on all of these + // strings to try to ensure this is safe. + const char *path = FilesToRemove[i].c_str(); + + // Get the status so we can determine if it's a file or directory. If we + // can't stat the file, ignore it. + struct stat buf; + if (stat(path, &buf) != 0) + continue; + + // If this is not a regular file, ignore it. We want to prevent removal of + // special files like /dev/null, even if the compiler is being run with the + // super-user permissions. + if (!S_ISREG(buf.st_mode)) + continue; + + // Otherwise, remove the file. We ignore any errors here as there is nothing + // else we can do. + unlink(path); } } diff --git a/lib/Target/ARM/ARMFrameLowering.cpp b/lib/Target/ARM/ARMFrameLowering.cpp index 3e70d578231..1d4bbb74d33 100644 --- a/lib/Target/ARM/ARMFrameLowering.cpp +++ b/lib/Target/ARM/ARMFrameLowering.cpp @@ -1176,7 +1176,7 @@ static void checkNumAlignedDPRCS2Regs(MachineFunction &MF) { MachineRegisterInfo &MRI = MF.getRegInfo(); unsigned NumSpills = 0; for (; NumSpills < 8; ++NumSpills) - if (!MRI.isPhysRegOrOverlapUsed(ARM::D8 + NumSpills)) + if (!MRI.isPhysRegUsed(ARM::D8 + NumSpills)) break; // Don't do this for just one d-register. It's not worth it. 
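The Signals.inc rewrite above adds two guards before unlink(): the path must stat() successfully and must be a regular file, so special files like /dev/null survive even under elevated permissions. Distilled into a free function (same logic, minus the FilesToRemove bookkeeping):

#include <sys/stat.h>
#include <unistd.h>

// Must stay async-signal safe: stat() and unlink() are on the safe list,
// while allocation and stdio are not -- which is why the surrounding code
// pre-calls c_str() on every path before any signal can arrive.
static void removeIfRegularFile(const char *Path) {
  struct stat Buf;
  if (stat(Path, &Buf) != 0)
    return;               // cannot stat: ignore, nothing else we can do
  if (!S_ISREG(Buf.st_mode))
    return;               // not a regular file: never remove
  (void)unlink(Path);     // errors deliberately ignored
}

int main() {
  removeIfRegularFile("/tmp/llvm-does-not-exist");  // stat fails: no-op
}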
@@ -1249,7 +1249,7 @@ ARMFrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF, for (unsigned i = 0; CSRegs[i]; ++i) { unsigned Reg = CSRegs[i]; bool Spilled = false; - if (MF.getRegInfo().isPhysRegOrOverlapUsed(Reg)) { + if (MF.getRegInfo().isPhysRegUsed(Reg)) { Spilled = true; CanEliminateFrame = false; } diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp index b2eb5784879..8de23872349 100644 --- a/lib/Target/ARM/ARMISelLowering.cpp +++ b/lib/Target/ARM/ARMISelLowering.cpp @@ -2554,7 +2554,8 @@ ARMTargetLowering::VarArgStyleRegisters(CCState &CCInfo, SelectionDAG &DAG, DebugLoc dl, SDValue &Chain, const Value *OrigArg, unsigned OffsetFromOrigArg, - unsigned ArgOffset) const { + unsigned ArgOffset, + bool ForceMutable) const { MachineFunction &MF = DAG.getMachineFunction(); MachineFrameInfo *MFI = MF.getFrameInfo(); ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); @@ -2603,7 +2604,8 @@ ARMTargetLowering::VarArgStyleRegisters(CCState &CCInfo, SelectionDAG &DAG, &MemOps[0], MemOps.size()); } else // This will point to the next argument passed via stack. - AFI->setVarArgsFrameIndex(MFI->CreateFixedObject(4, ArgOffset, true)); + AFI->setVarArgsFrameIndex( + MFI->CreateFixedObject(4, ArgOffset, !ForceMutable)); } SDValue @@ -2729,15 +2731,20 @@ ARMTargetLowering::LowerFormalArguments(SDValue Chain, // Since they could be overwritten by lowering of arguments in case of // a tail call. if (Flags.isByVal()) { - unsigned VARegSize, VARegSaveSize; - computeRegArea(CCInfo, MF, VARegSize, VARegSaveSize); - VarArgStyleRegisters(CCInfo, DAG, - dl, Chain, CurOrigArg, Ins[VA.getValNo()].PartOffset, 0); - unsigned Bytes = Flags.getByValSize() - VARegSize; - if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects. 
- int FI = MFI->CreateFixedObject(Bytes, - VA.getLocMemOffset(), false); - InVals.push_back(DAG.getFrameIndex(FI, getPointerTy())); + ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); + if (!AFI->getVarArgsFrameIndex()) { + VarArgStyleRegisters(CCInfo, DAG, + dl, Chain, CurOrigArg, + Ins[VA.getValNo()].PartOffset, + VA.getLocMemOffset(), + true /*force mutable frames*/); + int VAFrameIndex = AFI->getVarArgsFrameIndex(); + InVals.push_back(DAG.getFrameIndex(VAFrameIndex, getPointerTy())); + } else { + int FI = MFI->CreateFixedObject(Flags.getByValSize(), + VA.getLocMemOffset(), false); + InVals.push_back(DAG.getFrameIndex(FI, getPointerTy())); + } } else { int FI = MFI->CreateFixedObject(VA.getLocVT().getSizeInBits()/8, VA.getLocMemOffset(), true); diff --git a/lib/Target/ARM/ARMISelLowering.h b/lib/Target/ARM/ARMISelLowering.h index 9acab0b0834..4eb3b2cb515 100644 --- a/lib/Target/ARM/ARMISelLowering.h +++ b/lib/Target/ARM/ARMISelLowering.h @@ -469,7 +469,8 @@ namespace llvm { DebugLoc dl, SDValue &Chain, const Value *OrigArg, unsigned OffsetFromOrigArg, - unsigned ArgOffset) + unsigned ArgOffset, + bool ForceMutable = false) const; void computeRegArea(CCState &CCInfo, MachineFunction &MF, diff --git a/lib/Target/ARM/Disassembler/ARMDisassembler.cpp b/lib/Target/ARM/Disassembler/ARMDisassembler.cpp index bf0dabb4a0f..d2b1cc37f21 100644 --- a/lib/Target/ARM/Disassembler/ARMDisassembler.cpp +++ b/lib/Target/ARM/Disassembler/ARMDisassembler.cpp @@ -525,8 +525,9 @@ static bool tryAddingSymbolicOperand(uint64_t Address, int32_t Value, else ReferenceType = LLVMDisassembler_ReferenceType_InOut_None; const char *ReferenceName; - const char *Name = SymbolLookUp(DisInfo, Value, &ReferenceType, Address, - &ReferenceName); + uint64_t SymbolValue = 0x00000000ffffffffULL & Value; + const char *Name = SymbolLookUp(DisInfo, SymbolValue, &ReferenceType, + Address, &ReferenceName); if (Name) { SymbolicOp.AddSymbol.Name = Name; SymbolicOp.AddSymbol.Present = true; diff --git a/lib/Target/MSP430/MSP430FrameLowering.cpp b/lib/Target/MSP430/MSP430FrameLowering.cpp index a312c8d5b25..2e170f17bf9 100644 --- a/lib/Target/MSP430/MSP430FrameLowering.cpp +++ b/lib/Target/MSP430/MSP430FrameLowering.cpp @@ -221,3 +221,17 @@ MSP430FrameLowering::restoreCalleeSavedRegisters(MachineBasicBlock &MBB, return true; } + +void +MSP430FrameLowering::processFunctionBeforeFrameFinalized(MachineFunction &MF) + const { + const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering(); + + // Create a frame entry for the FPW register that must be saved. 
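The ARMDisassembler fix above is a one-liner worth isolating: a 32-bit operand value has to be zero-extended before being handed to the symbol-lookup callback as a 64-bit address, or negative displacements turn into bogus 0xffffffffxxxxxxxx addresses.

#include <cassert>
#include <cstdint>

// Mask after the implicit sign extension so the result is always a
// 32-bit address, exactly as the patched call site does.
static uint64_t toSymbolValue(int32_t Value) {
  return 0xffffffffULL & static_cast<uint64_t>(Value);
}

int main() {
  assert(toSymbolValue(-4) == 0xfffffffcULL);   // stays a 32-bit address
  assert(toSymbolValue(0x1000) == 0x1000ULL);   // positive values unchanged
}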
+ if (TFI->hasFP(MF)) { + int FrameIdx = MF.getFrameInfo()->CreateFixedObject(2, -4, true); + (void)FrameIdx; + assert(FrameIdx == MF.getFrameInfo()->getObjectIndexBegin() && + "Slot for FPW register must be last in order to be found!"); + } +} diff --git a/lib/Target/MSP430/MSP430FrameLowering.h b/lib/Target/MSP430/MSP430FrameLowering.h index b636827da7b..cb02545852b 100644 --- a/lib/Target/MSP430/MSP430FrameLowering.h +++ b/lib/Target/MSP430/MSP430FrameLowering.h @@ -46,6 +46,7 @@ public: bool hasFP(const MachineFunction &MF) const; bool hasReservedCallFrame(const MachineFunction &MF) const; + void processFunctionBeforeFrameFinalized(MachineFunction &MF) const; }; } // End llvm namespace diff --git a/lib/Target/MSP430/MSP430RegisterInfo.cpp b/lib/Target/MSP430/MSP430RegisterInfo.cpp index aed46a2ec59..9ae238f66f5 100644 --- a/lib/Target/MSP430/MSP430RegisterInfo.cpp +++ b/lib/Target/MSP430/MSP430RegisterInfo.cpp @@ -220,20 +220,6 @@ MSP430RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, MI.getOperand(i+1).ChangeToImmediate(Offset); } -void -MSP430RegisterInfo::processFunctionBeforeFrameFinalized(MachineFunction &MF) - const { - const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering(); - - // Create a frame entry for the FPW register that must be saved. - if (TFI->hasFP(MF)) { - int FrameIdx = MF.getFrameInfo()->CreateFixedObject(2, -4, true); - (void)FrameIdx; - assert(FrameIdx == MF.getFrameInfo()->getObjectIndexBegin() && - "Slot for FPW register must be last in order to be found!"); - } -} - unsigned MSP430RegisterInfo::getFrameRegister(const MachineFunction &MF) const { const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering(); diff --git a/lib/Target/MSP430/MSP430RegisterInfo.h b/lib/Target/MSP430/MSP430RegisterInfo.h index 9ee0a03f631..64a43bcafbb 100644 --- a/lib/Target/MSP430/MSP430RegisterInfo.h +++ b/lib/Target/MSP430/MSP430RegisterInfo.h @@ -49,8 +49,6 @@ public: void eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj, RegScavenger *RS = NULL) const; - void processFunctionBeforeFrameFinalized(MachineFunction &MF) const; - // Debug information queries. unsigned getFrameRegister(const MachineFunction &MF) const; }; diff --git a/lib/Target/Mips/Mips16InstrInfo.cpp b/lib/Target/Mips/Mips16InstrInfo.cpp index 8991433005d..5e33fed0cc9 100644 --- a/lib/Target/Mips/Mips16InstrInfo.cpp +++ b/lib/Target/Mips/Mips16InstrInfo.cpp @@ -25,7 +25,7 @@ using namespace llvm; Mips16InstrInfo::Mips16InstrInfo(MipsTargetMachine &tm) - : MipsInstrInfo(tm, /* FIXME: set mips16 unconditional br */ 0), + : MipsInstrInfo(tm, Mips::BimmX16), RI(*tm.getSubtargetImpl()) {} const MipsRegisterInfo &Mips16InstrInfo::getRegisterInfo() const { @@ -137,12 +137,39 @@ bool Mips16InstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const { /// GetOppositeBranchOpc - Return the inverse of the specified /// opcode, e.g. turning BEQ to BNE. 
unsigned Mips16InstrInfo::GetOppositeBranchOpc(unsigned Opc) const { + switch (Opc) { + default: llvm_unreachable("Illegal opcode!"); + case Mips::BeqzRxImmX16: return Mips::BnezRxImmX16; + case Mips::BnezRxImmX16: return Mips::BeqzRxImmX16; + case Mips::BteqzT8CmpX16: return Mips::BtnezT8CmpX16; + case Mips::BteqzT8SltX16: return Mips::BtnezT8SltX16; + case Mips::BteqzT8SltiX16: return Mips::BtnezT8SltiX16; + case Mips::BtnezX16: return Mips::BteqzX16; + case Mips::BtnezT8CmpiX16: return Mips::BteqzT8CmpiX16; + case Mips::BtnezT8SltuX16: return Mips::BteqzT8SltuX16; + case Mips::BtnezT8SltiuX16: return Mips::BteqzT8SltiuX16; + case Mips::BteqzX16: return Mips::BtnezX16; + case Mips::BteqzT8CmpiX16: return Mips::BtnezT8CmpiX16; + case Mips::BteqzT8SltuX16: return Mips::BtnezT8SltuX16; + case Mips::BteqzT8SltiuX16: return Mips::BtnezT8SltiuX16; + case Mips::BtnezT8CmpX16: return Mips::BteqzT8CmpX16; + case Mips::BtnezT8SltX16: return Mips::BteqzT8SltX16; + case Mips::BtnezT8SltiX16: return Mips::BteqzT8SltiX16; + } assert(false && "Implement this function."); return 0; } unsigned Mips16InstrInfo::GetAnalyzableBrOpc(unsigned Opc) const { - return 0; + return (Opc == Mips::BeqzRxImmX16 || Opc == Mips::BimmX16 || + Opc == Mips::BnezRxImmX16 || Opc == Mips::BteqzX16 || + Opc == Mips::BteqzT8CmpX16 || Opc == Mips::BteqzT8CmpiX16 || + Opc == Mips::BteqzT8SltX16 || Opc == Mips::BteqzT8SltuX16 || + Opc == Mips::BteqzT8SltiX16 || Opc == Mips::BteqzT8SltiuX16 || + Opc == Mips::BtnezX16 || Opc == Mips::BtnezT8CmpX16 || + Opc == Mips::BtnezT8CmpiX16 || Opc == Mips::BtnezT8SltX16 || + Opc == Mips::BtnezT8SltuX16 || Opc == Mips::BtnezT8SltiX16 || + Opc == Mips::BtnezT8SltiuX16 ) ? Opc : 0; } void Mips16InstrInfo::ExpandRetRA16(MachineBasicBlock &MBB, diff --git a/lib/Target/Mips/Mips16InstrInfo.td b/lib/Target/Mips/Mips16InstrInfo.td index eba201a0ea9..2694b09206f 100644 --- a/lib/Target/Mips/Mips16InstrInfo.td +++ b/lib/Target/Mips/Mips16InstrInfo.td @@ -21,6 +21,26 @@ def mem16 : Operand<i32> { } // +// EXT-I instruction format +// +class FEXT_I16_ins<bits<5> eop, string asmstr, InstrItinClass itin> : + FEXT_I16<eop, (outs), (ins brtarget:$imm16), + !strconcat(asmstr, "\t$imm16"),[], itin>; + +// +// EXT-I8 instruction format +// + +class FEXT_I816_ins_base<bits<3> _func, string asmstr, + string asmstr2, InstrItinClass itin>: + FEXT_I816<_func, (outs), (ins uimm16:$imm), !strconcat(asmstr, asmstr2), + [], itin>; + +class FEXT_I816_ins<bits<3> _func, string asmstr, + InstrItinClass itin>: + FEXT_I816_ins_base<_func, asmstr, "\t$imm", itin>; + +// // Assembler formats in alphabetical order. // Natural and pseudos are mixed together. 
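The GetOppositeBranchOpc switch above defines a self-inverse pairing over the Mips16 branch opcodes; note that the trailing assert(false) is now unreachable, since every case returns and the default is llvm_unreachable. The involution property is easy to check mechanically; a toy version with a four-element opcode set (BrOpc is illustrative, not the real Mips16 opcode list):

#include <cassert>

enum BrOpc { Beqz, Bnez, Bteqz, Btnez };

// A total, self-inverse mapping between conditional-branch opcodes.
BrOpc getOppositeBranchOpc(BrOpc Opc) {
  switch (Opc) {
  case Beqz:  return Bnez;
  case Bnez:  return Beqz;
  case Bteqz: return Btnez;
  case Btnez: return Bteqz;
  }
  return Opc;  // unreachable: the switch is total
}

int main() {
  // Inverting twice must round-trip; the same check works as a unit test
  // for the real table.
  for (int i = Beqz; i <= Btnez; ++i) {
    BrOpc B = static_cast<BrOpc>(i);
    assert(getOppositeBranchOpc(getOppositeBranchOpc(B)) == B);
  }
}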
// @@ -40,6 +60,11 @@ class FEXT_RI16_ins<bits<5> _op, string asmstr, class FEXT_RI16_PC_ins<bits<5> _op, string asmstr, InstrItinClass itin>: FEXT_RI16_ins_base<_op, asmstr, "\t$rx, $$pc, $imm", itin>; +class FEXT_RI16_B_ins<bits<5> _op, string asmstr, + InstrItinClass itin>: + FEXT_RI16<_op, (outs), (ins CPU16Regs:$rx, brtarget:$imm), + !strconcat(asmstr, "\t$rx, $imm"), [], itin>; + class FEXT_2RI16_ins<bits<5> _op, string asmstr, InstrItinClass itin>: FEXT_RI16<_op, (outs CPU16Regs:$rx), (ins CPU16Regs:$rx_, simm16:$imm), @@ -47,6 +72,7 @@ class FEXT_2RI16_ins<bits<5> _op, string asmstr, let Constraints = "$rx_ = $rx"; } + // this has an explicit sp argument that we ignore to work around a problem // in the compiler class FEXT_RI16_SP_explicit_ins<bits<5> _op, string asmstr, @@ -75,6 +101,31 @@ class FEXT_SHIFT16_ins<bits<2> _f, string asmstr, InstrItinClass itin>: FEXT_SHIFT16<_f, (outs CPU16Regs:$rx), (ins CPU16Regs:$ry, shamt:$sa), !strconcat(asmstr, "\t$rx, $ry, $sa"), [], itin>; +// +// EXT-T8I8 +// +class FEXT_T8I816_ins<bits<3> _func, string asmstr, string asmstr2, + InstrItinClass itin>: + FEXT_I816<_func, (outs), + (ins CPU16Regs:$rx, CPU16Regs:$ry, brtarget:$imm), + !strconcat(asmstr2, !strconcat("\t$rx, $ry\n\t", + !strconcat(asmstr, "\t$imm"))),[], itin> { + let isCodeGenOnly=1; +} + +// +// EXT-T8I8I +// +class FEXT_T8I8I16_ins<bits<3> _func, string asmstr, string asmstr2, + InstrItinClass itin>: + FEXT_I816<_func, (outs), + (ins CPU16Regs:$rx, simm16:$imm, brtarget:$targ), + !strconcat(asmstr2, !strconcat("\t$rx, $imm\n\t", + !strconcat(asmstr, "\t$targ"))), [], itin> { + let isCodeGenOnly=1; +} +// + // // I8_MOVR32 instruction format (used only by the MOVR32 instructio @@ -165,6 +216,17 @@ class ArithLogic16Defs<bit isCom=0> { bit neverHasSideEffects = 1; } +class branch16 { + bit isBranch = 1; + bit isTerminator = 1; + bit isBarrier = 1; +} + +class cbranch16 { + bit isBranch = 1; + bit isTerminator = 1; +} + class MayLoad { bit mayLoad = 1; } @@ -204,6 +266,69 @@ def AdduRxRyRz16: FRRR16_ins<01, "addu", IIAlu>, ArithLogic16Defs<1>; // To do a bitwise logical AND. def AndRxRxRy16: FRxRxRy16_ins<0b01100, "and", IIAlu>, ArithLogic16Defs<1>; + + +// +// Format: BEQZ rx, offset MIPS16e +// Purpose: Branch on Equal to Zero (Extended) +// To test a GPR then do a PC-relative conditional branch. +// +def BeqzRxImmX16: FEXT_RI16_B_ins<0b00100, "beqz", IIAlu>, cbranch16; + +// Format: B offset MIPS16e +// Purpose: Unconditional Branch +// To do an unconditional PC-relative branch. +// +def BimmX16: FEXT_I16_ins<0b00010, "b", IIAlu>, branch16; + +// +// Format: BNEZ rx, offset MIPS16e +// Purpose: Branch on Not Equal to Zero (Extended) +// To test a GPR then do a PC-relative conditional branch. +// +def BnezRxImmX16: FEXT_RI16_B_ins<0b00101, "bnez", IIAlu>, cbranch16; + +// +// Format: BTEQZ offset MIPS16e +// Purpose: Branch on T Equal to Zero (Extended) +// To test special register T then do a PC-relative conditional branch. 
+// +def BteqzX16: FEXT_I816_ins<0b000, "bteqz", IIAlu>, cbranch16; + +def BteqzT8CmpX16: FEXT_T8I816_ins<0b000, "bteqz", "cmp", IIAlu>, cbranch16; + +def BteqzT8CmpiX16: FEXT_T8I8I16_ins<0b000, "bteqz", "cmpi", IIAlu>, + cbranch16; + +def BteqzT8SltX16: FEXT_T8I816_ins<0b000, "bteqz", "slt", IIAlu>, cbranch16; + +def BteqzT8SltuX16: FEXT_T8I816_ins<0b000, "bteqz", "sltu", IIAlu>, cbranch16; + +def BteqzT8SltiX16: FEXT_T8I8I16_ins<0b000, "bteqz", "slti", IIAlu>, cbranch16; + +def BteqzT8SltiuX16: FEXT_T8I8I16_ins<0b000, "bteqz", "sltiu", IIAlu>, + cbranch16; + +// +// Format: BTNEZ offset MIPS16e +// Purpose: Branch on T Not Equal to Zero (Extended) +// To test special register T then do a PC-relative conditional branch. +// +def BtnezX16: FEXT_I816_ins<0b001, "btnez", IIAlu> ,cbranch16; + +def BtnezT8CmpX16: FEXT_T8I816_ins<0b000, "btnez", "cmp", IIAlu>, cbranch16; + +def BtnezT8CmpiX16: FEXT_T8I8I16_ins<0b000, "btnez", "cmpi", IIAlu>, cbranch16; + +def BtnezT8SltX16: FEXT_T8I816_ins<0b000, "btnez", "slt", IIAlu>, cbranch16; + +def BtnezT8SltuX16: FEXT_T8I816_ins<0b000, "btnez", "sltu", IIAlu>, cbranch16; + +def BtnezT8SltiX16: FEXT_T8I8I16_ins<0b000, "btnez", "slti", IIAlu>, cbranch16; + +def BtnezT8SltiuX16: FEXT_T8I8I16_ins<0b000, "btnez", "sltiu", IIAlu>, + cbranch16; + // // Format: DIV rx, ry MIPS16e // Purpose: Divide Word @@ -562,6 +687,11 @@ def: StoreM16_pat<truncstorei8, SbRxRyOffMemX16>; def: StoreM16_pat<truncstorei16, ShRxRyOffMemX16>; def: StoreM16_pat<store, SwRxRyOffMemX16>; +// Unconditional branch +class UncondBranch16_pat<SDNode OpNode, Instruction I>: + Mips16Pat<(OpNode bb:$imm16), (I bb:$imm16)> { + let Predicates = [RelocPIC, InMips16Mode]; + } // Jump and Link (Call) let isCall=1, hasDelaySlot=1 in @@ -574,7 +704,144 @@ let isReturn=1, isTerminator=1, hasDelaySlot=1, isBarrier=1, hasCtrlDep=1, hasExtraSrcRegAllocReq = 1 in def RetRA16 : MipsPseudo16<(outs), (ins), "", [(MipsRet)]>; + +// +// Some branch conditional patterns are not generated by llvm at this time. +// Some are for seemingly arbitrary reasons not used: i.e. with signed number +// comparison they are used and for unsigned a different pattern is used. +// I am pushing upstream from the full mips16 port and it seemed that I needed +// these earlier and the mips32 port has these but now I cannot create test +// cases that use these patterns. While I sort this all out I will leave these +// extra patterns commented out and if I can be sure they are really not used, +// I will delete the code. I don't want to check the code in uncommented without +// a valid test case. In some cases, the compiler is generating patterns with +// setcc instead and earlier I had implemented setcc first so may have masked +// the problem. The setcc variants are suboptimal for mips16 so I may wantto +// figure out how to enable the brcond patterns or else possibly new +// combinations of of brcond and setcc. +// +// +// bcond-seteq +// +def: Mips16Pat + <(brcond (i32 (seteq CPU16Regs:$rx, CPU16Regs:$ry)), bb:$imm16), + (BteqzT8CmpX16 CPU16Regs:$rx, CPU16Regs:$ry, bb:$imm16) + >; + + +def: Mips16Pat + <(brcond (i32 (seteq CPU16Regs:$rx, immZExt16:$imm)), bb:$targ16), + (BteqzT8CmpiX16 CPU16Regs:$rx, immSExt16:$imm, bb:$targ16) + >; + +def: Mips16Pat + <(brcond (i32 (seteq CPU16Regs:$rx, 0)), bb:$targ16), + (BeqzRxImmX16 CPU16Regs:$rx, bb:$targ16) + >; + +// +// bcond-setgt (do we need to have this pair of setlt, setgt??) 
+// +def: Mips16Pat + <(brcond (i32 (setgt CPU16Regs:$rx, CPU16Regs:$ry)), bb:$imm16), + (BtnezT8SltX16 CPU16Regs:$ry, CPU16Regs:$rx, bb:$imm16) + >; + +// +// bcond-setge +// +def: Mips16Pat + <(brcond (i32 (setge CPU16Regs:$rx, CPU16Regs:$ry)), bb:$imm16), + (BteqzT8SltX16 CPU16Regs:$rx, CPU16Regs:$ry, bb:$imm16) + >; + +// +// never called because compiler transforms a >= k to a > (k-1) +//def: Mips16Pat +// <(brcond (i32 (setge CPU16Regs:$rx, immSExt16:$imm)), bb:$imm16), +// (BteqzT8SltiX16 CPU16Regs:$rx, immSExt16:$imm, bb:$imm16) +// >; + +// +// bcond-setlt +// +def: Mips16Pat + <(brcond (i32 (setlt CPU16Regs:$rx, CPU16Regs:$ry)), bb:$imm16), + (BtnezT8SltX16 CPU16Regs:$rx, CPU16Regs:$ry, bb:$imm16) + >; + +def: Mips16Pat + <(brcond (i32 (setlt CPU16Regs:$rx, immSExt16:$imm)), bb:$imm16), + (BtnezT8SltiX16 CPU16Regs:$rx, immSExt16:$imm, bb:$imm16) + >; + +// +// bcond-setle +// +def: Mips16Pat + <(brcond (i32 (setle CPU16Regs:$rx, CPU16Regs:$ry)), bb:$imm16), + (BteqzT8SltX16 CPU16Regs:$ry, CPU16Regs:$rx, bb:$imm16) + >; + +// +// bcond-setne +// +def: Mips16Pat + <(brcond (i32 (setne CPU16Regs:$rx, CPU16Regs:$ry)), bb:$imm16), + (BtnezT8CmpX16 CPU16Regs:$rx, CPU16Regs:$ry, bb:$imm16) + >; + +def: Mips16Pat + <(brcond (i32 (setne CPU16Regs:$rx, immZExt16:$imm)), bb:$targ16), + (BtnezT8CmpiX16 CPU16Regs:$rx, immSExt16:$imm, bb:$targ16) + >; + +def: Mips16Pat + <(brcond (i32 (setne CPU16Regs:$rx, 0)), bb:$targ16), + (BnezRxImmX16 CPU16Regs:$rx, bb:$targ16) + >; + +// +// This needs to be there but I forget which code will generate it +// +def: Mips16Pat + <(brcond CPU16Regs:$rx, bb:$targ16), + (BnezRxImmX16 CPU16Regs:$rx, bb:$targ16) + >; + +// + +// +// bcond-setugt +// +//def: Mips16Pat +// <(brcond (i32 (setugt CPU16Regs:$rx, CPU16Regs:$ry)), bb:$imm16), +// (BtnezT8SltuX16 CPU16Regs:$ry, CPU16Regs:$rx, bb:$imm16) +// >; + +// +// bcond-setuge +// +//def: Mips16Pat +// <(brcond (i32 (setuge CPU16Regs:$rx, CPU16Regs:$ry)), bb:$imm16), +// (BteqzT8SltuX16 CPU16Regs:$rx, CPU16Regs:$ry, bb:$imm16) +// >; + + +// +// bcond-setult +// +//def: Mips16Pat +// <(brcond (i32 (setult CPU16Regs:$rx, CPU16Regs:$ry)), bb:$imm16), +// (BtnezT8SltuX16 CPU16Regs:$rx, CPU16Regs:$ry, bb:$imm16) +// >; + +def: UncondBranch16_pat<br, BimmX16>; + // Small immediates +def: Mips16Pat<(i32 immSExt16:$in), + (AddiuRxRxImmX16 (Move32R16 ZERO), immSExt16:$in)>; + def: Mips16Pat<(i32 immZExt16:$in), (LiRxImmX16 immZExt16:$in)>; // diff --git a/lib/Target/Mips/Mips64InstrInfo.td b/lib/Target/Mips/Mips64InstrInfo.td index 99a9f25abd2..ed0ea0e849a 100644 --- a/lib/Target/Mips/Mips64InstrInfo.td +++ b/lib/Target/Mips/Mips64InstrInfo.td @@ -177,6 +177,7 @@ def BLTZ64 : CBranchZero<0x01, 0, "bltz", setlt, CPU64Regs>; } let DecoderNamespace = "Mips64" in def JALR64 : JumpLinkReg<0x00, 0x09, "jalr", CPU64Regs>; +def TAILCALL64_R : JumpFR<CPU64Regs, MipsTailCall>, IsTailCall; let DecoderNamespace = "Mips64" in { /// Multiply and Divide Instructions. 
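The brcond patterns above all funnel through the single Mips16 condition register T: cmp/slt-style instructions set it, bteqz/btnez consume it, and the operand swap in the setgt/setle patterns is what turns slt into the comparison that is actually needed. A small simulation of that pairing (Mips16Sim is illustrative; it models T as a bool rather than the real register):

#include <cassert>

struct Mips16Sim {
  bool T;                                    // the T condition register
  void slt(int a, int b) { T = a < b; }      // set T on less-than
  void cmp(int a, int b) { T = (a != b); }   // set T on inequality
  bool bteqz() const { return !T; }          // branch if T == 0
  bool btnez() const { return T; }           // branch if T != 0
};

int main() {
  Mips16Sim S;
  int a = 5, b = 3;
  S.slt(b, a);                   // setgt a,b  ->  slt b,a ; btnez
  assert(S.btnez() == (a > b));
  S.slt(a, b);                   // setge a,b  ->  slt a,b ; bteqz
  assert(S.bteqz() == (a >= b));
  S.cmp(a, b);                   // setne a,b  ->  cmp a,b ; btnez
  assert(S.btnez() == (a != b));
}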
diff --git a/lib/Target/Mips/MipsISelLowering.cpp b/lib/Target/Mips/MipsISelLowering.cpp index 2236af9a7b6..5cc9662864e 100644 --- a/lib/Target/Mips/MipsISelLowering.cpp +++ b/lib/Target/Mips/MipsISelLowering.cpp @@ -25,6 +25,7 @@ #include "llvm/GlobalVariable.h" #include "llvm/Intrinsics.h" #include "llvm/CallingConv.h" +#include "llvm/ADT/Statistic.h" #include "llvm/CodeGen/CallingConvLower.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" @@ -32,12 +33,19 @@ #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/SelectionDAGISel.h" #include "llvm/CodeGen/ValueTypes.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" using namespace llvm; +STATISTIC(NumTailCalls, "Number of tail calls"); + +static cl::opt<bool> +EnableMipsTailCalls("enable-mips-tail-calls", cl::Hidden, + cl::desc("MIPS: Enable tail calls."), cl::init(false)); + // If I is a shifted mask, set the size (Size) and the first bit of the // mask (Pos), and return true. // For example, if I is 0x003ff800, (Pos, Size) = (11, 11). @@ -58,6 +66,7 @@ static SDValue GetGlobalReg(SelectionDAG &DAG, EVT Ty) { const char *MipsTargetLowering::getTargetNodeName(unsigned Opcode) const { switch (Opcode) { case MipsISD::JmpLink: return "MipsISD::JmpLink"; + case MipsISD::TailCall: return "MipsISD::TailCall"; case MipsISD::Hi: return "MipsISD::Hi"; case MipsISD::Lo: return "MipsISD::Lo"; case MipsISD::GPRel: return "MipsISD::GPRel"; @@ -2870,9 +2879,26 @@ PassByValArg64(SDValue Chain, DebugLoc dl, MemOpChains.push_back(Chain); } +/// IsEligibleForTailCallOptimization - Check whether the call is eligible +/// for tail call optimization. +bool MipsTargetLowering:: +IsEligibleForTailCallOptimization(CallingConv::ID CalleeCC, + unsigned NextStackOffset) const { + if (!EnableMipsTailCalls) + return false; + + // Do not tail-call optimize if there is an argument passed on stack. + if (IsO32 && (CalleeCC != CallingConv::Fast)) { + if (NextStackOffset > 16) + return false; + } else if (NextStackOffset) + return false; + + return true; +} + /// LowerCall - functions arguments are copied from virtual regs to /// (physical regs)/(stack frame), CALLSEQ_START and CALLSEQ_END are emitted. -/// TODO: isTailCall. SDValue MipsTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, SmallVectorImpl<SDValue> &InVals) const { @@ -2887,14 +2913,10 @@ MipsTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, CallingConv::ID CallConv = CLI.CallConv; bool isVarArg = CLI.IsVarArg; - // MIPs target does not yet support tail call optimization. - isTailCall = false; - MachineFunction &MF = DAG.getMachineFunction(); MachineFrameInfo *MFI = MF.getFrameInfo(); const TargetFrameLowering *TFL = MF.getTarget().getFrameLowering(); bool IsPIC = getTargetMachine().getRelocationModel() == Reloc::PIC_; - MipsFunctionInfo *MipsFI = MF.getInfo<MipsFunctionInfo>(); // Analyze operands of the call, assigning locations to each operand. SmallVector<CCValAssign, 16> ArgLocs; @@ -2921,19 +2943,25 @@ MipsTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, if (IsO32 && (CallConv != CallingConv::Fast)) NextStackOffset = std::max(NextStackOffset, (unsigned)16); + // Check if it's really possible to do a tail call. 
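IsEligibleForTailCallOptimization above gates the new tail-call path behind the -enable-mips-tail-calls flag and a stack-usage check: on O32 with a non-fastcc callee, arguments may still live in the 16-byte register area; otherwise nothing at all may be passed on the stack. Restated as a pure function with its corner cases pinned down (the enum is a stand-in for CallingConv):

#include <cassert>

enum CallConv { CC_C, CC_Fast };

bool isEligibleForTailCall(bool EnableTailCalls, bool IsO32,
                           CallConv CalleeCC, unsigned NextStackOffset) {
  if (!EnableTailCalls)
    return false;                   // feature is off by default
  if (IsO32 && CalleeCC != CC_Fast)
    return NextStackOffset <= 16;   // args fit the O32 register-save area
  return NextStackOffset == 0;      // otherwise: nothing on the stack
}

int main() {
  assert(!isEligibleForTailCall(false, true, CC_C, 0));  // flag off
  assert(isEligibleForTailCall(true, true, CC_C, 16));   // O32, fits
  assert(!isEligibleForTailCall(true, true, CC_C, 24));  // spills to stack
  assert(!isEligibleForTailCall(true, false, CC_C, 8));  // non-O32, has args
}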
+ if (isTailCall) + isTailCall = IsEligibleForTailCallOptimization(CallConv, NextStackOffset); + + if (isTailCall) + ++NumTailCalls; + // Chain is the output chain of the last Load/Store or CopyToReg node. // ByValChain is the output chain of the last Memcpy node created for copying // byval arguments to the stack. SDValue NextStackOffsetVal = DAG.getIntPtrConstant(NextStackOffset, true); - Chain = DAG.getCALLSEQ_START(Chain, NextStackOffsetVal); + + if (!isTailCall) + Chain = DAG.getCALLSEQ_START(Chain, NextStackOffsetVal); SDValue StackPtr = DAG.getCopyFromReg(Chain, dl, IsN64 ? Mips::SP_64 : Mips::SP, getPointerTy()); - if (MipsFI->getMaxCallFrameSize() < NextStackOffset) - MipsFI->setMaxCallFrameSize(NextStackOffset); - // With EABI is it possible to have 16 args on registers. SmallVector<std::pair<unsigned, SDValue>, 16> RegsToPass; SmallVector<SDValue, 8> MemOpChains; @@ -3138,6 +3166,9 @@ MipsTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, if (InFlag.getNode()) Ops.push_back(InFlag); + if (isTailCall) + return DAG.getNode(MipsISD::TailCall, dl, MVT::Other, &Ops[0], Ops.size()); + Chain = DAG.getNode(MipsISD::JmpLink, dl, NodeTys, &Ops[0], Ops.size()); InFlag = Chain.getValue(1); @@ -3379,7 +3410,8 @@ MipsTargetLowering::LowerFormalArguments(SDValue Chain, if (DAG.getMachineFunction().getFunction()->hasStructRetAttr()) { unsigned Reg = MipsFI->getSRetReturnReg(); if (!Reg) { - Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(MVT::i32)); + Reg = MF.getRegInfo(). + createVirtualRegister(getRegClassFor(IsN64 ? MVT::i64 : MVT::i32)); MipsFI->setSRetReturnReg(Reg); } SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[0]); @@ -3508,7 +3540,8 @@ MipsTargetLowering::LowerReturn(SDValue Chain, llvm_unreachable("sret virtual register not created in the entry block"); SDValue Val = DAG.getCopyFromReg(Chain, dl, Reg, getPointerTy()); - Chain = DAG.getCopyToReg(Chain, dl, Mips::V0, Val, Flag); + Chain = DAG.getCopyToReg(Chain, dl, IsN64 ? Mips::V0_64 : Mips::V0, Val, + Flag); Flag = Chain.getValue(1); } diff --git a/lib/Target/Mips/MipsISelLowering.h b/lib/Target/Mips/MipsISelLowering.h index 28d126b62c6..b75a513fa7e 100644 --- a/lib/Target/Mips/MipsISelLowering.h +++ b/lib/Target/Mips/MipsISelLowering.h @@ -29,6 +29,9 @@ namespace llvm { // Jump and link (call) JmpLink, + // Tail call + TailCall, + // Get the Higher 16 bits from a 32-bit immediate // No relation with Mips Hi register Hi, @@ -205,6 +208,11 @@ namespace llvm { SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const; SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) const; + /// IsEligibleForTailCallOptimization - Check whether the call is eligible + /// for tail call optimization. + bool IsEligibleForTailCallOptimization(CallingConv::ID CalleeCC, + unsigned NextStackOffset) const; + virtual SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, diff --git a/lib/Target/Mips/MipsInstrInfo.td b/lib/Target/Mips/MipsInstrInfo.td index 5a51fc80eb6..cc216c391dd 100644 --- a/lib/Target/Mips/MipsInstrInfo.td +++ b/lib/Target/Mips/MipsInstrInfo.td @@ -52,6 +52,10 @@ def MipsJmpLink : SDNode<"MipsISD::JmpLink",SDT_MipsJmpLink, [SDNPHasChain, SDNPOutGlue, SDNPOptInGlue, SDNPVariadic]>; +// Tail call +def MipsTailCall : SDNode<"MipsISD::TailCall", SDT_MipsJmpLink, + [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>; + // Hi and Lo nodes are used to handle global addresses. 
Used on // MipsISelLowering to lower stuff like GlobalAddress, ExternalSymbol // static model. (nothing to do with Mips Registers Hi and Lo) @@ -175,6 +179,27 @@ class MipsPat<dag pattern, dag result> : Pat<pattern, result> { let Predicates = [HasStandardEncoding]; } +class IsBranch { + bit isBranch = 1; +} + +class IsReturn { + bit isReturn = 1; +} + +class IsCall { + bit isCall = 1; +} + +class IsTailCall { + bit isCall = 1; + bit isTerminator = 1; + bit isReturn = 1; + bit isBarrier = 1; + bit hasExtraSrcRegAllocReq = 1; + bit isCodeGenOnly = 1; +} + //===----------------------------------------------------------------------===// // Instruction format superclass //===----------------------------------------------------------------------===// @@ -573,14 +598,13 @@ class SetCC_I<bits<6> op, string instr_asm, PatFrag cond_op, Operand Od, IIAlu>; // Jump -class JumpFJ<bits<6> op, string instr_asm>: - FJ<op, (outs), (ins jmptarget:$target), - !strconcat(instr_asm, "\t$target"), [(br bb:$target)], IIBranch> { - let isBranch=1; +class JumpFJ<bits<6> op, DAGOperand opnd, string instr_asm, + SDPatternOperator operator, SDPatternOperator targetoperator>: + FJ<op, (outs), (ins opnd:$target), !strconcat(instr_asm, "\t$target"), + [(operator targetoperator:$target)], IIBranch> { let isTerminator=1; let isBarrier=1; let hasDelaySlot = 1; - let Predicates = [RelocStatic, HasStandardEncoding]; let DecoderMethod = "DecodeJumpTarget"; let Defs = [AT]; } @@ -601,21 +625,21 @@ class UncondBranch<bits<6> op, string instr_asm>: // Base class for indirect branch and return instruction classes. let isTerminator=1, isBarrier=1, hasDelaySlot = 1 in -class JumpFR<RegisterClass RC, list<dag> pattern>: - FR<0, 0x8, (outs), (ins RC:$rs), "jr\t$rs", pattern, IIBranch> { +class JumpFR<RegisterClass RC, SDPatternOperator operator = null_frag>: + FR<0, 0x8, (outs), (ins RC:$rs), "jr\t$rs", [(operator RC:$rs)], IIBranch> { let rt = 0; let rd = 0; let shamt = 0; } // Indirect branch -class IndirectBranch<RegisterClass RC>: JumpFR<RC, [(brind RC:$rs)]> { +class IndirectBranch<RegisterClass RC>: JumpFR<RC, brind> { let isBranch = 1; let isIndirectBranch = 1; } // Return instruction -class RetBase<RegisterClass RC>: JumpFR<RC, []> { +class RetBase<RegisterClass RC>: JumpFR<RC> { let isReturn = 1; let isCodeGenOnly = 1; let hasCtrlDep = 1; @@ -980,7 +1004,8 @@ def SC_P8 : SCBase<0x38, "sc", CPURegs, mem64>, } /// Jump and Branch Instructions -def J : JumpFJ<0x02, "j">; +def J : JumpFJ<0x02, jmptarget, "j", br, bb>, + Requires<[RelocStatic, HasStandardEncoding]>, IsBranch; def JR : IndirectBranch<CPURegs>; def B : UncondBranch<0x04, "b">; def BEQ : CBranch<0x04, "beq", seteq, CPURegs>; @@ -998,6 +1023,8 @@ def JAL : JumpLink<0x03, "jal">; def JALR : JumpLinkReg<0x00, 0x09, "jalr", CPURegs>; def BGEZAL : BranchLink<"bgezal", 0x11, CPURegs>; def BLTZAL : BranchLink<"bltzal", 0x10, CPURegs>; +def TAILCALL : JumpFJ<0x02, calltarget, "j", MipsTailCall, imm>, IsTailCall; +def TAILCALL_R : JumpFR<CPURegs, MipsTailCall>, IsTailCall; def RET : RetBase<CPURegs>; @@ -1107,6 +1134,11 @@ def : MipsPat<(MipsJmpLink (i32 texternalsym:$dst)), //def : MipsPat<(MipsJmpLink CPURegs:$dst), // (JALR CPURegs:$dst)>; +// Tail call +def : MipsPat<(MipsTailCall (iPTR tglobaladdr:$dst)), + (TAILCALL tglobaladdr:$dst)>; +def : MipsPat<(MipsTailCall (iPTR texternalsym:$dst)), + (TAILCALL texternalsym:$dst)>; // hi/lo relocs def : MipsPat<(MipsHi tglobaladdr:$in), (LUi tglobaladdr:$in)>; def : MipsPat<(MipsHi tblockaddress:$in), (LUi tblockaddress:$in)>; 
diff --git a/lib/Target/Mips/MipsMachineFunction.h b/lib/Target/Mips/MipsMachineFunction.h index 93ce94803a0..5b766f22a80 100644 --- a/lib/Target/Mips/MipsMachineFunction.h +++ b/lib/Target/Mips/MipsMachineFunction.h @@ -46,15 +46,13 @@ class MipsFunctionInfo : public MachineFunctionInfo { // InArgFIRange: Range of indices of all frame objects created during call to // LowerFormalArguments. std::pair<int, int> InArgFIRange; - unsigned MaxCallFrameSize; bool EmitNOAT; public: MipsFunctionInfo(MachineFunction& MF) : MF(MF), SRetReturnReg(0), GlobalBaseReg(0), - VarArgsFrameIndex(0), InArgFIRange(std::make_pair(-1, 0)), - MaxCallFrameSize(0), EmitNOAT(false) + VarArgsFrameIndex(0), InArgFIRange(std::make_pair(-1, 0)), EmitNOAT(false) {} bool isInArgFI(int FI) const { @@ -71,9 +69,6 @@ public: int getVarArgsFrameIndex() const { return VarArgsFrameIndex; } void setVarArgsFrameIndex(int Index) { VarArgsFrameIndex = Index; } - unsigned getMaxCallFrameSize() const { return MaxCallFrameSize; } - void setMaxCallFrameSize(unsigned S) { MaxCallFrameSize = S; } - bool getEmitNOAT() const { return EmitNOAT; } void setEmitNOAT() { EmitNOAT = true; } }; diff --git a/lib/Target/PowerPC/PPCISelLowering.cpp b/lib/Target/PowerPC/PPCISelLowering.cpp index c18250a78f7..36db4b51799 100644 --- a/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/lib/Target/PowerPC/PPCISelLowering.cpp @@ -4224,7 +4224,52 @@ SDValue PPCTargetLowering::LowerSINT_TO_FP(SDValue Op, return SDValue(); if (Op.getOperand(0).getValueType() == MVT::i64) { - SDValue Bits = DAG.getNode(ISD::BITCAST, dl, MVT::f64, Op.getOperand(0)); + SDValue SINT = Op.getOperand(0); + // When converting to single-precision, we actually need to convert + // to double-precision first and then round to single-precision. + // To avoid double-rounding effects during that operation, we have + // to prepare the input operand. Bits that might be truncated when + // converting to double-precision are replaced by a bit that won't + // be lost at this stage, but is below the single-precision rounding + // position. + // + // However, if -enable-unsafe-fp-math is in effect, accept double + // rounding to avoid the extra overhead. + if (Op.getValueType() == MVT::f32 && + !DAG.getTarget().Options.UnsafeFPMath) { + + // Twiddle input to make sure the low 11 bits are zero. (If this + // is the case, we are guaranteed the value will fit into the 53 bit + // mantissa of an IEEE double-precision value without rounding.) + // If any of those low 11 bits were not zero originally, make sure + // bit 12 (value 2048) is set instead, so that the final rounding + // to single-precision gets the correct result. + SDValue Round = DAG.getNode(ISD::AND, dl, MVT::i64, + SINT, DAG.getConstant(2047, MVT::i64)); + Round = DAG.getNode(ISD::ADD, dl, MVT::i64, + Round, DAG.getConstant(2047, MVT::i64)); + Round = DAG.getNode(ISD::OR, dl, MVT::i64, Round, SINT); + Round = DAG.getNode(ISD::AND, dl, MVT::i64, + Round, DAG.getConstant(-2048, MVT::i64)); + + // However, we cannot use that value unconditionally: if the magnitude + // of the input value is small, the bit-twiddling we did above might + // end up visibly changing the output. Fortunately, in that case, we + // don't need to twiddle bits since the original input will convert + // exactly to double-precision floating-point already. Therefore, + // construct a conditional to use the original value if the top 11 + // bits are all sign-bit copies, and use the rounded value computed + // above otherwise. 
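The comment block above describes how the i64 operand is prepared before the i64 -> f64 -> f32 conversion. A scalar C++ sketch of the same bit manipulation (illustrative only; the helper name is invented, and the real code builds the equivalent SelectionDAG nodes, as the hunk below shows):

#include <cstdint>

int64_t prepareSIntForF32(int64_t SInt) {
  uint64_t U = static_cast<uint64_t>(SInt);
  // Sticky-round: if any of the low 11 bits is set, force bit 11 on, then
  // clear the low 11 bits. The result fits a double's 53-bit mantissa
  // exactly, so only the final f64 -> f32 round-off remains.
  uint64_t Round = (U & 2047) + 2047;   // bit 11 set iff low 11 bits nonzero
  Round = (Round | U) & ~uint64_t(2047);
  // If bits 53..63 are all sign-bit copies, the input already converts to
  // double exactly; keep it unchanged in that case.
  int64_t Top = SInt >> 53;             // arithmetic shift
  return (Top == 0 || Top == -1) ? SInt : static_cast<int64_t>(Round);
}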
+ SDValue Cond = DAG.getNode(ISD::SRA, dl, MVT::i64, + SINT, DAG.getConstant(53, MVT::i32)); + Cond = DAG.getNode(ISD::ADD, dl, MVT::i64, + Cond, DAG.getConstant(1, MVT::i64)); + Cond = DAG.getSetCC(dl, MVT::i32, + Cond, DAG.getConstant(1, MVT::i64), ISD::SETUGT); + + SINT = DAG.getNode(ISD::SELECT, dl, MVT::i64, Cond, Round, SINT); + } + SDValue Bits = DAG.getNode(ISD::BITCAST, dl, MVT::f64, SINT); SDValue FP = DAG.getNode(PPCISD::FCFID, dl, MVT::f64, Bits); if (Op.getValueType() == MVT::f32) FP = DAG.getNode(ISD::FP_ROUND, dl, diff --git a/lib/Target/X86/AsmParser/X86AsmParser.cpp b/lib/Target/X86/AsmParser/X86AsmParser.cpp index 454664e3ed1..e07fafcb207 100644 --- a/lib/Target/X86/AsmParser/X86AsmParser.cpp +++ b/lib/Target/X86/AsmParser/X86AsmParser.cpp @@ -11,6 +11,7 @@ #include "llvm/MC/MCTargetAsmParser.h" #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCSymbol.h" #include "llvm/MC/MCInst.h" #include "llvm/MC/MCRegisterInfo.h" #include "llvm/MC/MCSubtargetInfo.h" @@ -180,6 +181,7 @@ struct X86Operand : public MCParsedAsmOperand { unsigned IndexReg; unsigned Scale; unsigned Size; + bool NeedSizeDir; } Mem; }; @@ -312,6 +314,16 @@ struct X86Operand : public MCParsedAsmOperand { return isImmSExti64i32Value(CE->getValue()); } + unsigned getMemSize() const { + assert(Kind == Memory && "Invalid access!"); + return Mem.Size; + } + + bool needSizeDirective() const { + assert(Kind == Memory && "Invalid access!"); + return Mem.NeedSizeDir; + } + bool isMem() const { return Kind == Memory; } bool isMem8() const { return Kind == Memory && (!Mem.Size || Mem.Size == 8); @@ -451,7 +463,8 @@ struct X86Operand : public MCParsedAsmOperand { /// Create an absolute memory operand. static X86Operand *CreateMem(const MCExpr *Disp, SMLoc StartLoc, - SMLoc EndLoc, unsigned Size = 0) { + SMLoc EndLoc, unsigned Size = 0, + bool NeedSizeDir = false) { X86Operand *Res = new X86Operand(Memory, StartLoc, EndLoc); Res->Mem.SegReg = 0; Res->Mem.Disp = Disp; @@ -459,6 +472,7 @@ struct X86Operand : public MCParsedAsmOperand { Res->Mem.IndexReg = 0; Res->Mem.Scale = 1; Res->Mem.Size = Size; + Res->Mem.NeedSizeDir = NeedSizeDir; return Res; } @@ -466,7 +480,7 @@ struct X86Operand : public MCParsedAsmOperand { static X86Operand *CreateMem(unsigned SegReg, const MCExpr *Disp, unsigned BaseReg, unsigned IndexReg, unsigned Scale, SMLoc StartLoc, SMLoc EndLoc, - unsigned Size = 0) { + unsigned Size = 0, bool NeedSizeDir = false) { // We should never just have a displacement, that should be parsed as an // absolute memory operand. assert((SegReg || BaseReg || IndexReg) && "Invalid memory operand!"); @@ -481,6 +495,7 @@ struct X86Operand : public MCParsedAsmOperand { Res->Mem.IndexReg = IndexReg; Res->Mem.Scale = Scale; Res->Mem.Size = Size; + Res->Mem.NeedSizeDir = NeedSizeDir; return Res; } }; @@ -753,7 +768,19 @@ X86Operand *X86AsmParser::ParseIntelMemOperand(unsigned SegReg, SMLoc Start) { const MCExpr *Disp = MCConstantExpr::Create(0, getParser().getContext()); if (getParser().ParseExpression(Disp, End)) return 0; End = Parser.getTok().getLoc(); - return X86Operand::CreateMem(Disp, Start, End, Size); + + bool NeedSizeDir = false; + if (!Size && isParsingInlineAsm()) { + if (const MCSymbolRefExpr *SymRef = dyn_cast<MCSymbolRefExpr>(Disp)) { + const MCSymbol &Sym = SymRef->getSymbol(); + // FIXME: The SemaLookup will fail if the name is anything other then an + // identifier. + // FIXME: Pass a valid SMLoc. 
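The NeedSizeDir flag threaded through X86Operand above exists because an Intel-syntax memory operand that names a C/C++ variable does not encode an operand width by itself; the code that follows asks Sema for the symbol's size so the operand can carry an explicit size directive. A hedged illustration of the ambiguity (MS-style inline assembly, x86 only; the variable is hypothetical):

unsigned short Flag;

void bump() {
  // Without knowing sizeof(Flag), "inc [Flag]" could be a byte, word, or
  // dword increment; the size lookup lets the parser treat it as if the
  // user had written "inc WORD PTR [Flag]".
  __asm inc [Flag]
}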
+ SemaCallback->LookupInlineAsmIdentifier(Sym.getName(), NULL, Size); + NeedSizeDir = Size > 0; + } + } + return X86Operand::CreateMem(Disp, Start, End, Size, NeedSizeDir); } X86Operand *X86AsmParser::ParseIntelOperand() { @@ -1526,9 +1553,6 @@ MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, assert(Op->isToken() && "Leading operand should always be a mnemonic!"); ArrayRef<SMRange> EmptyRanges = ArrayRef<SMRange>(); - // Clear the opcode. - Opcode = ~0x0; - // First, handle aliases that expand to multiple instructions. // FIXME: This should be replaced with a real .td file alias mechanism. // Also, MatchInstructionImpl should actually *do* the EmitInstruction diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 2f09e9e6ff1..5034cc0330a 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -459,7 +459,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::EH_RETURN , MVT::Other, Custom); // NOTE: EH_SJLJ_SETJMP/_LONGJMP supported here is NOT intended to support // SjLj exception handling but a light-weight setjmp/longjmp replacement to - // support continuation, user-level threading, and etc.. As a result, not + // support continuation, user-level threading, and etc.. As a result, no // other SjLj exception interfaces are implemented and please don't build // your own exception handling based on them. // LLVM/Clang supports zero-cost DWARF exception handling. @@ -557,6 +557,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom); setOperationAction(ISD::TRAP, MVT::Other, Legal); + setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal); // VASTART needs to be custom lowered to use the VarArgsFrameIndex setOperationAction(ISD::VASTART , MVT::Other, Custom); @@ -1058,6 +1059,10 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::FNEG, MVT::v4f64, Custom); setOperationAction(ISD::FABS, MVT::v4f64, Custom); + setOperationAction(ISD::TRUNCATE, MVT::v8i16, Custom); + + setOperationAction(ISD::FP_TO_SINT, MVT::v8i16, Custom); + setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal); setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal); setOperationAction(ISD::FP_ROUND, MVT::v4f32, Legal); @@ -1255,7 +1260,6 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setTargetDAGCombine(ISD::UINT_TO_FP); setTargetDAGCombine(ISD::SINT_TO_FP); setTargetDAGCombine(ISD::SETCC); - setTargetDAGCombine(ISD::FP_TO_SINT); if (Subtarget->is64Bit()) setTargetDAGCombine(ISD::MUL); setTargetDAGCombine(ISD::XOR); @@ -5173,6 +5177,80 @@ X86TargetLowering::LowerVectorBroadcast(SDValue Op, SelectionDAG &DAG) const { } SDValue +X86TargetLowering::buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) const { + EVT VT = Op.getValueType(); + + // Skip if insert_vec_elt is not supported. + if (!isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT)) + return SDValue(); + + DebugLoc DL = Op.getDebugLoc(); + unsigned NumElems = Op.getNumOperands(); + + SDValue VecIn1; + SDValue VecIn2; + SmallVector<unsigned, 4> InsertIndices; + SmallVector<int, 8> Mask(NumElems, -1); + + for (unsigned i = 0; i != NumElems; ++i) { + unsigned Opc = Op.getOperand(i).getOpcode(); + + if (Opc == ISD::UNDEF) + continue; + + if (Opc != ISD::EXTRACT_VECTOR_ELT) { + // Quit if more than 1 element needs inserting.
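The buildFromShuffleMostly helper begun above (its scan continues below) recognizes a BUILD_VECTOR whose elements are almost all extracts from one or two source vectors, and rewrites it as a single shuffle plus a few element inserts. A scalar analogy of the rewrite (std::array standing in for vector registers; a sketch, not DAG code):

#include <array>

// BUILD_VECTOR <A[1], A[0], X, A[3]>  ==>
//   shuffle A with mask <1, 0, undef, 3>, then insert X at lane 2.
std::array<int, 4> buildMostlyFromShuffle(const std::array<int, 4> &A, int X) {
  std::array<int, 4> V = {A[1], A[0], 0, A[3]}; // the shuffle; lane 2 is don't-care
  V[2] = X;                                     // the lone INSERT_VECTOR_ELT
  return V;
}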
+ if (InsertIndices.size() > 1) + return SDValue(); + + InsertIndices.push_back(i); + continue; + } + + SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0); + SDValue ExtIdx = Op.getOperand(i).getOperand(1); + + // Quit if extracted from a vector of a different type. + if (ExtractedFromVec.getValueType() != VT) + return SDValue(); + + // Quit if non-constant index. + if (!isa<ConstantSDNode>(ExtIdx)) + return SDValue(); + + if (VecIn1.getNode() == 0) + VecIn1 = ExtractedFromVec; + else if (VecIn1 != ExtractedFromVec) { + if (VecIn2.getNode() == 0) + VecIn2 = ExtractedFromVec; + else if (VecIn2 != ExtractedFromVec) + // Quit if more than 2 vectors to shuffle. + return SDValue(); + } + + unsigned Idx = cast<ConstantSDNode>(ExtIdx)->getZExtValue(); + + if (ExtractedFromVec == VecIn1) + Mask[i] = Idx; + else if (ExtractedFromVec == VecIn2) + Mask[i] = Idx + NumElems; + } + + if (VecIn1.getNode() == 0) + return SDValue(); + + VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT); + SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, &Mask[0]); + for (unsigned i = 0, e = InsertIndices.size(); i != e; ++i) { + unsigned Idx = InsertIndices[i]; + NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx), + DAG.getIntPtrConstant(Idx)); + } + + return NV; +} + +SDValue X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { DebugLoc dl = Op.getDebugLoc(); @@ -5448,6 +5526,11 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { if (LD.getNode()) return LD; + // Check for a build vector formed mostly from a shuffle plus a few element inserts. + SDValue Sh = buildFromShuffleMostly(Op, DAG); + if (Sh.getNode()) + return Sh; + // For SSE 4.1, use insertps to put the high elements into the low element. if (getSubtarget()->hasSSE41()) { SDValue Result; @@ -8104,10 +8187,42 @@ FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, bool IsSigned, bool IsReplace) co } } +SDValue X86TargetLowering::lowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const { + DebugLoc DL = Op.getDebugLoc(); + EVT VT = Op.getValueType(); + EVT SVT = Op.getOperand(0).getValueType(); + + if (!VT.is128BitVector() || !SVT.is256BitVector() || + VT.getVectorNumElements() != SVT.getVectorNumElements()) + return SDValue(); + + assert(Subtarget->hasAVX() && "256-bit vector is observed without AVX!"); + + unsigned NumElems = VT.getVectorNumElements(); + EVT NVT = EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), + NumElems * 2); + + SDValue In = Op.getOperand(0); + SmallVector<int, 16> MaskVec(NumElems * 2, -1); + // Prepare the truncation shuffle mask. + for (unsigned i = 0; i != NumElems; ++i) + MaskVec[i] = i * 2; + SDValue V = DAG.getVectorShuffle(NVT, DL, + DAG.getNode(ISD::BITCAST, DL, NVT, In), + DAG.getUNDEF(NVT), &MaskVec[0]); + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V, + DAG.getIntPtrConstant(0)); +} + SDValue X86TargetLowering::LowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG) const { - if (Op.getValueType().isVector()) + if (Op.getValueType().isVector()) { + if (Op.getValueType() == MVT::v8i16) + return DAG.getNode(ISD::TRUNCATE, Op.getDebugLoc(), Op.getValueType(), + DAG.getNode(ISD::FP_TO_SINT, Op.getDebugLoc(), + MVT::v8i32, Op.getOperand(0))); return SDValue(); + } std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG, /*IsSigned=*/ true, /*IsReplace=*/ false); @@ -11376,6 +11491,7 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::SRL_PARTS: return LowerShiftParts(Op, DAG); case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op,
DAG); case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG); + case ISD::TRUNCATE: return lowerTRUNCATE(Op, DAG); case ISD::FP_TO_SINT: return LowerFP_TO_SINT(Op, DAG); case ISD::FP_TO_UINT: return LowerFP_TO_UINT(Op, DAG); case ISD::FP_EXTEND: return lowerFP_EXTEND(Op, DAG); @@ -13276,7 +13392,7 @@ X86TargetLowering::emitEHSjLjSetJmp(MachineInstr *MI, // For v = setjmp(buf), we generate // // thisMBB: - // buf[Label_Offset] = ljMBB + // buf[LabelOffset] = restoreMBB // SjLjSetup restoreMBB // // mainMBB: @@ -13304,18 +13420,48 @@ X86TargetLowering::emitEHSjLjSetJmp(MachineInstr *MI, sinkMBB->transferSuccessorsAndUpdatePHIs(MBB); // thisMBB: - unsigned PtrImmStoreOpc = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi; - const int64_t Label_Offset = 1 * PVT.getStoreSize(); - + unsigned PtrStoreOpc = 0; + unsigned LabelReg = 0; + const int64_t LabelOffset = 1 * PVT.getStoreSize(); + Reloc::Model RM = getTargetMachine().getRelocationModel(); + bool UseImmLabel = (getTargetMachine().getCodeModel() == CodeModel::Small) && + (RM == Reloc::Static || RM == Reloc::DynamicNoPIC); + + // Prepare IP either in reg or imm. + if (!UseImmLabel) { + PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr; + const TargetRegisterClass *PtrRC = getRegClassFor(PVT); + LabelReg = MRI.createVirtualRegister(PtrRC); + if (Subtarget->is64Bit()) { + MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA64r), LabelReg) + .addReg(X86::RIP) + .addImm(0) + .addReg(0) + .addMBB(restoreMBB) + .addReg(0); + } else { + const X86InstrInfo *XII = static_cast<const X86InstrInfo*>(TII); + MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA32r), LabelReg) + .addReg(XII->getGlobalBaseReg(MF)) + .addImm(0) + .addReg(0) + .addMBB(restoreMBB, Subtarget->ClassifyBlockAddressReference()) + .addReg(0); + } + } else + PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi; // Store IP - MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrImmStoreOpc)); + MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrStoreOpc)); for (unsigned i = 0; i < X86::AddrNumOperands; ++i) { if (i == X86::AddrDisp) - MIB.addDisp(MI->getOperand(MemOpndSlot + i), Label_Offset); + MIB.addDisp(MI->getOperand(MemOpndSlot + i), LabelOffset); else MIB.addOperand(MI->getOperand(MemOpndSlot + i)); } - MIB.addMBB(restoreMBB); + if (!UseImmLabel) + MIB.addReg(LabelReg); + else + MIB.addMBB(restoreMBB); MIB.setMemRefs(MMOBegin, MMOEnd); // Setup MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::EH_SjLj_Setup)) @@ -13370,8 +13516,8 @@ X86TargetLowering::emitEHSjLjLongJmp(MachineInstr *MI, MachineInstrBuilder MIB; - const int64_t Label_Offset = 1 * PVT.getStoreSize(); - const int64_t SP_Offset = 2 * PVT.getStoreSize(); + const int64_t LabelOffset = 1 * PVT.getStoreSize(); + const int64_t SPOffset = 2 * PVT.getStoreSize(); unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm; unsigned IJmpOpc = (PVT == MVT::i64) ? 
X86::JMP64r : X86::JMP32r; @@ -13385,7 +13531,7 @@ X86TargetLowering::emitEHSjLjLongJmp(MachineInstr *MI, MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), Tmp); for (unsigned i = 0; i < X86::AddrNumOperands; ++i) { if (i == X86::AddrDisp) - MIB.addDisp(MI->getOperand(i), Label_Offset); + MIB.addDisp(MI->getOperand(i), LabelOffset); else MIB.addOperand(MI->getOperand(i)); } @@ -13394,7 +13540,7 @@ X86TargetLowering::emitEHSjLjLongJmp(MachineInstr *MI, MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), SP); for (unsigned i = 0; i < X86::AddrNumOperands; ++i) { if (i == X86::AddrDisp) - MIB.addDisp(MI->getOperand(i), SP_Offset); + MIB.addDisp(MI->getOperand(i), SPOffset); else MIB.addOperand(MI->getOperand(i)); } @@ -15411,11 +15557,11 @@ static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG, ISD::LoadExtType Ext = Ld->getExtensionType(); // If this is a vector EXT Load then attempt to optimize it using a - // shuffle. We need SSE4 for the shuffles. + // shuffle. We need SSSE3 shuffles. // TODO: It is possible to support ZExt by zeroing the undef values // during the shuffle phase or after the shuffle. if (RegVT.isVector() && RegVT.isInteger() && - Ext == ISD::EXTLOAD && Subtarget->hasSSE41()) { + Ext == ISD::EXTLOAD && Subtarget->hasSSSE3()) { assert(MemVT != RegVT && "Cannot extend to the same type"); assert(MemVT.isVector() && "Must load a vector from memory"); @@ -16269,20 +16415,6 @@ static SDValue PerformSINT_TO_FPCombine(SDNode *N, SelectionDAG &DAG, return SDValue(); } -static SDValue PerformFP_TO_SINTCombine(SDNode *N, SelectionDAG &DAG) { - EVT VT = N->getValueType(0); - - // v4i8 = FP_TO_SINT() -> v4i8 = TRUNCATE (V4i32 = FP_TO_SINT() - if (VT == MVT::v8i8 || VT == MVT::v4i8) { - DebugLoc dl = N->getDebugLoc(); - MVT DstVT = VT == MVT::v4i8 ? 
MVT::v4i32 : MVT::v8i32; - SDValue I = DAG.getNode(ISD::FP_TO_SINT, dl, DstVT, N->getOperand(0)); - return DAG.getNode(ISD::TRUNCATE, dl, VT, I); - } - - return SDValue(); -} - // Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS static SDValue PerformADCCombine(SDNode *N, SelectionDAG &DAG, X86TargetLowering::DAGCombinerInfo &DCI) { @@ -16421,7 +16553,6 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case ISD::STORE: return PerformSTORECombine(N, DAG, Subtarget); case ISD::UINT_TO_FP: return PerformUINT_TO_FPCombine(N, DAG); case ISD::SINT_TO_FP: return PerformSINT_TO_FPCombine(N, DAG, this); - case ISD::FP_TO_SINT: return PerformFP_TO_SINTCombine(N, DAG); case ISD::FADD: return PerformFADDCombine(N, DAG, Subtarget); case ISD::FSUB: return PerformFSUBCombine(N, DAG, Subtarget); case X86ISD::FXOR: diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h index 40e966ad676..228fab1689e 100644 --- a/lib/Target/X86/X86ISelLowering.h +++ b/lib/Target/X86/X86ISelLowering.h @@ -795,6 +795,7 @@ namespace llvm { SDValue LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) const; SDValue LowerUINT_TO_FP_i64(SDValue Op, SelectionDAG &DAG) const; SDValue LowerUINT_TO_FP_i32(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFP_TO_UINT(SDValue Op, SelectionDAG &DAG) const; SDValue lowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const; @@ -824,9 +825,10 @@ namespace llvm { SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const; - // Utility functions to help LowerVECTOR_SHUFFLE + // Utility functions to help LowerVECTOR_SHUFFLE & LowerBUILD_VECTOR SDValue LowerVectorBroadcast(SDValue Op, SelectionDAG &DAG) const; SDValue NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG) const; + SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) const; SDValue LowerVectorAllZeroTest(SDValue Op, SelectionDAG &DAG) const; diff --git a/lib/Target/X86/X86MCInstLower.cpp b/lib/Target/X86/X86MCInstLower.cpp index c44549c30ac..cfd68f74b7b 100644 --- a/lib/Target/X86/X86MCInstLower.cpp +++ b/lib/Target/X86/X86MCInstLower.cpp @@ -67,15 +67,11 @@ MachineModuleInfoMachO &X86MCInstLower::getMachOMMI() const { /// operand to an MCSymbol. 
MCSymbol *X86MCInstLower:: GetSymbolFromOperand(const MachineOperand &MO) const { - assert((MO.isGlobal() || MO.isSymbol()) && "Isn't a symbol reference"); + assert((MO.isGlobal() || MO.isSymbol() || MO.isMBB()) && "Isn't a symbol reference"); SmallString<128> Name; - if (!MO.isGlobal()) { - assert(MO.isSymbol()); - Name += MAI.getGlobalPrefix(); - Name += MO.getSymbolName(); - } else { + if (MO.isGlobal()) { const GlobalValue *GV = MO.getGlobal(); bool isImplicitlyPrivate = false; if (MO.getTargetFlags() == X86II::MO_DARWIN_STUB || @@ -85,6 +81,11 @@ GetSymbolFromOperand(const MachineOperand &MO) const { isImplicitlyPrivate = true; Mang->getNameWithPrefix(Name, GV, isImplicitlyPrivate); + } else if (MO.isSymbol()) { + Name += MAI.getGlobalPrefix(); + Name += MO.getSymbolName(); + } else if (MO.isMBB()) { + Name += MO.getMBB()->getSymbol()->getName(); } // If the target flags on the operand changes the name of the symbol, do that @@ -215,7 +216,7 @@ MCOperand X86MCInstLower::LowerSymbolOperand(const MachineOperand &MO, if (Expr == 0) Expr = MCSymbolRefExpr::Create(Sym, RefKind, Ctx); - if (!MO.isJTI() && MO.getOffset()) + if (!MO.isJTI() && !MO.isMBB() && MO.getOffset()) Expr = MCBinaryExpr::CreateAdd(Expr, MCConstantExpr::Create(MO.getOffset(), Ctx), Ctx); @@ -348,9 +349,6 @@ void X86MCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const { MCOp = MCOperand::CreateImm(MO.getImm()); break; case MachineOperand::MO_MachineBasicBlock: - MCOp = MCOperand::CreateExpr(MCSymbolRefExpr::Create( - MO.getMBB()->getSymbol(), Ctx)); - break; case MachineOperand::MO_GlobalAddress: case MachineOperand::MO_ExternalSymbol: MCOp = LowerSymbolOperand(MO, GetSymbolFromOperand(MO)); diff --git a/lib/Target/X86/X86VZeroUpper.cpp b/lib/Target/X86/X86VZeroUpper.cpp index 449eed3d8d5..c4a58874a41 100644 --- a/lib/Target/X86/X86VZeroUpper.cpp +++ b/lib/Target/X86/X86VZeroUpper.cpp @@ -147,7 +147,7 @@ bool VZeroUpperInserter::runOnMachineFunction(MachineFunction &MF) { const TargetRegisterClass *RC = &X86::VR256RegClass; for (TargetRegisterClass::iterator i = RC->begin(), e = RC->end(); i != e; i++) { - if (MRI.isPhysRegUsed(*i)) { + if (!MRI.reg_nodbg_empty(*i)) { YMMUsed = true; break; } diff --git a/lib/Transforms/IPO/BarrierNoopPass.cpp b/lib/Transforms/IPO/BarrierNoopPass.cpp new file mode 100644 index 00000000000..2e32240621f --- /dev/null +++ b/lib/Transforms/IPO/BarrierNoopPass.cpp @@ -0,0 +1,47 @@ +//===- BarrierNoopPass.cpp - A barrier pass for the pass manager ----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// NOTE: DO NOT USE THIS IF AVOIDABLE +// +// This pass is a nonce pass intended to allow manipulation of the implicitly +// nesting pass manager. For example, it can be used to cause a CGSCC pass +// manager to be closed prior to running a new collection of function passes. +// +// FIXME: This is a huge HACK. This should be removed when the pass manager's +// nesting is made explicit instead of implicit. +// +//===----------------------------------------------------------------------===// + +#include "llvm/Pass.h" +#include "llvm/Transforms/IPO.h" +using namespace llvm; + +namespace { +/// \brief A nonce module pass used to place a barrier in a pass manager. +/// +/// There is no mechanism for ending a CGSCC pass manager once one is started. 
+/// This prevents extension points from having clear deterministic ordering +/// when they are phrased as non-module passes. +class BarrierNoop : public ModulePass { +public: + static char ID; // Pass identification. + + BarrierNoop() : ModulePass(ID) { + initializeBarrierNoopPass(*PassRegistry::getPassRegistry()); + } + + bool runOnModule(Module &M) { return false; } +}; +} + +ModulePass *llvm::createBarrierNoopPass() { return new BarrierNoop(); } + +char BarrierNoop::ID = 0; +INITIALIZE_PASS(BarrierNoop, "barrier", "A No-Op Barrier Pass", + false, false) diff --git a/lib/Transforms/IPO/CMakeLists.txt b/lib/Transforms/IPO/CMakeLists.txt index 3f6b1de614d..90c1c33e6dc 100644 --- a/lib/Transforms/IPO/CMakeLists.txt +++ b/lib/Transforms/IPO/CMakeLists.txt @@ -1,5 +1,6 @@ add_llvm_library(LLVMipo ArgumentPromotion.cpp + BarrierNoopPass.cpp ConstantMerge.cpp DeadArgumentElimination.cpp ExtractGV.cpp diff --git a/lib/Transforms/IPO/PassManagerBuilder.cpp b/lib/Transforms/IPO/PassManagerBuilder.cpp index a2862022a3b..1d8f1e531a0 100644 --- a/lib/Transforms/IPO/PassManagerBuilder.cpp +++ b/lib/Transforms/IPO/PassManagerBuilder.cpp @@ -119,6 +119,14 @@ void PassManagerBuilder::populateModulePassManager(PassManagerBase &MPM) { MPM.add(Inliner); Inliner = 0; } + + // FIXME: This is a HACK! The inliner pass above implicitly creates a CGSCC + // pass manager, but we don't want to add extensions into that pass manager. + // To prevent this we must insert a no-op module pass to reset the pass + // manager to get the same behavior as EP_OptimizerLast in non-O0 builds. + if (!GlobalExtensions->empty() || !Extensions.empty()) + MPM.add(createBarrierNoopPass()); + addExtensionsToPM(EP_EnabledOnOptLevel0, MPM); return; } @@ -176,6 +184,12 @@ void PassManagerBuilder::populateModulePassManager(PassManagerBase &MPM) { MPM.add(createIndVarSimplifyPass()); // Canonicalize indvars MPM.add(createLoopIdiomPass()); // Recognize idioms like memset. MPM.add(createLoopDeletionPass()); // Delete dead loops + + if (Vectorize) { + MPM.add(createLoopVectorizePass()); + MPM.add(createLICMPass()); + } + if (!DisableUnrollLoops) MPM.add(createLoopUnrollPass()); // Unroll small loops addExtensionsToPM(EP_LoopOptimizerEnd, MPM); diff --git a/lib/Transforms/InstCombine/InstCombineSelect.cpp b/lib/Transforms/InstCombine/InstCombineSelect.cpp index ecce242a81e..a2d4c888f2c 100644 --- a/lib/Transforms/InstCombine/InstCombineSelect.cpp +++ b/lib/Transforms/InstCombine/InstCombineSelect.cpp @@ -333,6 +333,10 @@ static Value *SimplifyWithOpReplaced(Value *V, Value *Op, Value *RepOp, // All operands were constants, fold it. if (ConstOps.size() == I->getNumOperands()) { + if (CmpInst *C = dyn_cast<CmpInst>(I)) + return ConstantFoldCompareInstOperands(C->getPredicate(), ConstOps[0], + ConstOps[1], TD, TLI); + if (LoadInst *LI = dyn_cast<LoadInst>(I)) if (!LI->isVolatile()) return ConstantFoldLoadFromConstPtr(ConstOps[0], TD); diff --git a/lib/Transforms/InstCombine/InstCombineVectorOps.cpp b/lib/Transforms/InstCombine/InstCombineVectorOps.cpp index cf60f0f426d..dd7ea14e8a8 100644 --- a/lib/Transforms/InstCombine/InstCombineVectorOps.cpp +++ b/lib/Transforms/InstCombine/InstCombineVectorOps.cpp @@ -636,8 +636,11 @@ Instruction *InstCombiner::visitShuffleVectorInst(ShuffleVectorInst &SVI) { // If LHS's width is changed, shift the mask value accordingly. // If newRHS == NULL, i.e. LHSOp0 == RHSOp0, we want to remap any - // references to RHSOp0 to LHSOp0, so we don't need to shift the mask. 
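Returning to the BarrierNoopPass introduced above: a sketch of its intended use, matching the PassManagerBuilder hunk earlier in this patch (the extension pass name is a placeholder, not a real factory):

PassManager MPM;
MPM.add(Inliner);                   // implicitly opens a CGSCC pass manager
MPM.add(createBarrierNoopPass());   // module pass: forces that manager closed
MPM.add(createMyExtensionPass());   // hypothetical; now scheduled after the
                                    // inliner rather than nested inside it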
- if (eltMask >= 0 && newRHS != NULL) + // references from RHSOp0 to LHSOp0, so we don't need to shift the mask. + // If newRHS == newLHS, we want to remap any references from newRHS to + // newLHS so that we can properly identify splats that may occur due to + // obfuscation across the two vectors. + if (eltMask >= 0 && newRHS != NULL && newLHS != newRHS) eltMask += newLHSWidth; } diff --git a/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/lib/Transforms/Instrumentation/AddressSanitizer.cpp index b566994edfc..4abaeca0c5c 100644 --- a/lib/Transforms/Instrumentation/AddressSanitizer.cpp +++ b/lib/Transforms/Instrumentation/AddressSanitizer.cpp @@ -245,38 +245,6 @@ static GlobalVariable *createPrivateGlobalForString(Module &M, StringRef Str) { GlobalValue::PrivateLinkage, StrConst, ""); } -// Split the basic block and insert an if-then code. -// Before: -// Head -// Cmp -// Tail -// After: -// Head -// if (Cmp) -// ThenBlock -// Tail -// -// ThenBlock block is created and its terminator is returned. -// If Unreachable, ThenBlock is terminated with UnreachableInst, otherwise -// it is terminated with BranchInst to Tail. -static TerminatorInst *splitBlockAndInsertIfThen(Value *Cmp, bool Unreachable) { - Instruction *SplitBefore = cast<Instruction>(Cmp)->getNextNode(); - BasicBlock *Head = SplitBefore->getParent(); - BasicBlock *Tail = Head->splitBasicBlock(SplitBefore); - TerminatorInst *HeadOldTerm = Head->getTerminator(); - LLVMContext &C = Head->getParent()->getParent()->getContext(); - BasicBlock *ThenBlock = BasicBlock::Create(C, "", Head->getParent(), Tail); - TerminatorInst *CheckTerm; - if (Unreachable) - CheckTerm = new UnreachableInst(C, ThenBlock); - else - CheckTerm = BranchInst::Create(Tail, ThenBlock); - BranchInst *HeadNewTerm = - BranchInst::Create(/*ifTrue*/ThenBlock, /*ifFalse*/Tail, Cmp); - ReplaceInstWithInst(HeadOldTerm, HeadNewTerm); - return CheckTerm; -} - Value *AddressSanitizer::memToShadow(Value *Shadow, IRBuilder<> &IRB) { // Shadow >> scale Shadow = IRB.CreateLShr(Shadow, MappingScale); @@ -324,7 +292,7 @@ bool AddressSanitizer::instrumentMemIntrinsic(MemIntrinsic *MI) { Value *Cmp = IRB.CreateICmpNE(Length, Constant::getNullValue(Length->getType())); - InsertBefore = splitBlockAndInsertIfThen(Cmp, false); + InsertBefore = SplitBlockAndInsertIfThen(cast<Instruction>(Cmp), false); } instrumentMemIntrinsicParam(MI, Dst, Length, InsertBefore, true); @@ -480,7 +448,8 @@ void AddressSanitizer::instrumentAddress(Instruction *OrigIns, TerminatorInst *CrashTerm = 0; if (ClAlwaysSlowPath || (TypeSize < 8 * Granularity)) { - TerminatorInst *CheckTerm = splitBlockAndInsertIfThen(Cmp, false); + TerminatorInst *CheckTerm = + SplitBlockAndInsertIfThen(cast<Instruction>(Cmp), false); assert(dyn_cast<BranchInst>(CheckTerm)->isUnconditional()); BasicBlock *NextBB = CheckTerm->getSuccessor(0); IRB.SetInsertPoint(CheckTerm); @@ -491,7 +460,7 @@ void AddressSanitizer::instrumentAddress(Instruction *OrigIns, BranchInst *NewTerm = BranchInst::Create(CrashBlock, NextBB, Cmp2); ReplaceInstWithInst(CheckTerm, NewTerm); } else { - CrashTerm = splitBlockAndInsertIfThen(Cmp, true); + CrashTerm = SplitBlockAndInsertIfThen(cast<Instruction>(Cmp), true); } Instruction *Crash = @@ -534,7 +503,7 @@ void AddressSanitizer::createInitializerPoisonCalls(Module &M, bool AddressSanitizer::ShouldInstrumentGlobal(GlobalVariable *G) { Type *Ty = cast<PointerType>(G->getType())->getElementType(); - DEBUG(dbgs() << "GLOBAL: " << *G); + DEBUG(dbgs() << "GLOBAL: " << *G << "\n"); if (BL->isIn(*G))
return false; if (!Ty->isSized()) return false; @@ -682,7 +651,7 @@ bool AddressSanitizer::insertGlobalRedzones(Module &M) { FirstDynamic = LastDynamic; } - DEBUG(dbgs() << "NEW GLOBAL:\n" << *NewGlobal); + DEBUG(dbgs() << "NEW GLOBAL: " << *NewGlobal << "\n"); } ArrayType *ArrayOfGlobalStructTy = ArrayType::get(GlobalStructTy, n); @@ -851,6 +820,7 @@ bool AddressSanitizer::maybeInsertAsanInitAtFunctionEntry(Function &F) { bool AddressSanitizer::runOnFunction(Function &F) { if (BL->isIn(F)) return false; if (&F == AsanCtorFunction) return false; + DEBUG(dbgs() << "ASAN instrumenting:\n" << F << "\n"); // If needed, insert __asan_init before checking for AddressSafety attr. maybeInsertAsanInitAtFunctionEntry(F); @@ -914,8 +884,6 @@ bool AddressSanitizer::runOnFunction(Function &F) { NumInstrumented++; } - DEBUG(dbgs() << F); - bool ChangedStack = poisonStackInFunction(F); // We must unpoison the stack before every NoReturn call (throw, _exit, etc). @@ -925,6 +893,7 @@ bool AddressSanitizer::runOnFunction(Function &F) { IRBuilder<> IRB(CI); IRB.CreateCall(AsanHandleNoReturnFunc); } + DEBUG(dbgs() << "ASAN done instrumenting:\n" << F << "\n"); return NumInstrumented > 0 || ChangedStack || !NoReturnCalls.empty(); } @@ -1148,6 +1117,10 @@ bool AddressSanitizer::poisonStackInFunction(Function &F) { } } + // We are done. Remove the old unused alloca instructions. + for (size_t i = 0, n = AllocaVec.size(); i < n; i++) + AllocaVec[i]->eraseFromParent(); + if (ClDebugStack) { DEBUG(dbgs() << F); } diff --git a/lib/Transforms/Instrumentation/BlackList.cpp b/lib/Transforms/Instrumentation/BlackList.cpp index 2cb119964a3..ef34b8a56d8 100644 --- a/lib/Transforms/Instrumentation/BlackList.cpp +++ b/lib/Transforms/Instrumentation/BlackList.cpp @@ -45,6 +45,9 @@ BlackList::BlackList(const StringRef Path) { StringMap<std::string> Regexps; for (SmallVector<StringRef, 16>::iterator I = Lines.begin(), E = Lines.end(); I != E; ++I) { + // Ignore empty lines and lines starting with "#" + if (I->empty() || I->startswith("#")) + continue; // Get our prefix and unparsed regexp. std::pair<StringRef, StringRef> SplitLine = I->split(":"); StringRef Prefix = SplitLine.first; diff --git a/lib/Transforms/Instrumentation/BlackList.h b/lib/Transforms/Instrumentation/BlackList.h index 73977fc10a6..f3c05a5058c 100644 --- a/lib/Transforms/Instrumentation/BlackList.h +++ b/lib/Transforms/Instrumentation/BlackList.h @@ -12,7 +12,9 @@ // // The blacklist disables instrumentation of various functions and global // variables. Each line contains a prefix, followed by a wild card expression. +// Empty lines and lines starting with "#" are ignored. // --- +// # Blacklisted items: // fun:*_ZN4base6subtle* // global:*global_with_bad_access_or_initialization* // global-init:*global_with_initialization_issues* diff --git a/lib/Transforms/Scalar/DCE.cpp b/lib/Transforms/Scalar/DCE.cpp index 086f0a1a714..a2e074fae89 100644 --- a/lib/Transforms/Scalar/DCE.cpp +++ b/lib/Transforms/Scalar/DCE.cpp @@ -118,13 +118,8 @@ bool DCE::runOnFunction(Function &F) { I->eraseFromParent(); // Remove the instruction from the worklist if it still exists in it. 
- for (std::vector<Instruction*>::iterator WI = WorkList.begin(); - WI != WorkList.end(); ) { - if (*WI == I) - WI = WorkList.erase(WI); - else - ++WI; - } + WorkList.erase(std::remove(WorkList.begin(), WorkList.end(), I), + WorkList.end()); MadeChange = true; ++DCEEliminated; diff --git a/lib/Transforms/Scalar/IndVarSimplify.cpp b/lib/Transforms/Scalar/IndVarSimplify.cpp index 78630b2a9f3..82eb7464677 100644 --- a/lib/Transforms/Scalar/IndVarSimplify.cpp +++ b/lib/Transforms/Scalar/IndVarSimplify.cpp @@ -551,15 +551,17 @@ void IndVarSimplify::RewriteLoopExitValues(Loop *L, SCEVExpander &Rewriter) { PN->setIncomingValue(i, ExitVal); - // If this instruction is dead now, delete it. - RecursivelyDeleteTriviallyDeadInstructions(Inst, TLI); + // If this instruction is dead now, delete it. Don't do it now to avoid + // invalidating iterators. + if (isInstructionTriviallyDead(Inst, TLI)) + DeadInsts.push_back(Inst); if (NumPreds == 1) { // Completely replace a single-pred PHI. This is safe, because the // NewVal won't be variant in the loop, so we don't need an LCSSA phi // node anymore. PN->replaceAllUsesWith(ExitVal); - RecursivelyDeleteTriviallyDeadInstructions(PN, TLI); + PN->eraseFromParent(); } } if (NumPreds != 1) { diff --git a/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/lib/Transforms/Scalar/LoopStrengthReduce.cpp index 99a62dbe62f..958348d9faa 100644 --- a/lib/Transforms/Scalar/LoopStrengthReduce.cpp +++ b/lib/Transforms/Scalar/LoopStrengthReduce.cpp @@ -37,7 +37,7 @@ // // TODO: Handle multiple loops at a time. // -// TODO: Should AddrMode::BaseGV be changed to a ConstantExpr +// TODO: Should TargetLowering::AddrMode::BaseGV be changed to a ConstantExpr // instead of a GlobalValue? // // TODO: When truncation is free, truncate ICmp users' operands to make it a @@ -67,7 +67,6 @@ #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Local.h" -#include "llvm/TargetTransformInfo.h" #include "llvm/ADT/SmallBitVector.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/DenseSet.h" @@ -75,6 +74,7 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/ValueHandle.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetLowering.h" #include <algorithm> using namespace llvm; @@ -1118,7 +1118,7 @@ public: enum KindType { Basic, ///< A normal use, with no folding. Special, ///< A special case of basic, allowing -1 scales. - Address, ///< An address use; folding according to ScalarTargetTransformInfo. + Address, ///< An address use; folding according to TargetLowering ICmpZero ///< An equality icmp with both operands folded into one. // TODO: Add a generic icmp too? }; @@ -1272,12 +1272,12 @@ void LSRUse::dump() const { /// address-mode folding and special icmp tricks. static bool isLegalUse(const AddrMode &AM, LSRUse::KindType Kind, Type *AccessTy, - const ScalarTargetTransformInfo *STTI) { + const TargetLowering *TLI) { switch (Kind) { case LSRUse::Address: // If we have low-level target information, ask the target if it can // completely fold this address. - if (STTI) return STTI->isLegalAddressingMode(AM, AccessTy); + if (TLI) return TLI->isLegalAddressingMode(AM, AccessTy); // Otherwise, just guess that reg+reg addressing is legal. return !AM.BaseGV && AM.BaseOffs == 0 && AM.Scale <= 1; @@ -1300,7 +1300,7 @@ static bool isLegalUse(const AddrMode &AM, // If we have low-level target information, ask the target if it can fold an // integer immediate on an icmp. 
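Both the DCE hunk above and the LoopUnswitch hunk later in this patch swap a hand-rolled scan-and-erase loop for the standard erase-remove idiom; a self-contained illustration (int standing in for Instruction*):

#include <algorithm>
#include <vector>

// Removes every occurrence of I in one O(n) pass: std::remove compacts the
// surviving elements and returns the new logical end; erase drops the tail.
void removeFromWorklist(std::vector<int> &WorkList, int I) {
  WorkList.erase(std::remove(WorkList.begin(), WorkList.end(), I),
                 WorkList.end());
}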
if (AM.BaseOffs != 0) { - if (!STTI) + if (!TLI) return false; // We have one of: // ICmpZero BaseReg + Offset => ICmp BaseReg, -Offset @@ -1309,7 +1309,7 @@ static bool isLegalUse(const AddrMode &AM, int64_t Offs = AM.BaseOffs; if (AM.Scale == 0) Offs = -(uint64_t)Offs; // The cast does the right thing with INT64_MIN. - return STTI->isLegalICmpImmediate(Offs); + return TLI->isLegalICmpImmediate(Offs); } // ICmpZero BaseReg + -1*ScaleReg => ICmp BaseReg, ScaleReg @@ -1330,20 +1330,20 @@ static bool isLegalUse(const AddrMode &AM, static bool isLegalUse(AddrMode AM, int64_t MinOffset, int64_t MaxOffset, LSRUse::KindType Kind, Type *AccessTy, - const ScalarTargetTransformInfo *LTTI) { + const TargetLowering *TLI) { // Check for overflow. if (((int64_t)((uint64_t)AM.BaseOffs + MinOffset) > AM.BaseOffs) != (MinOffset > 0)) return false; AM.BaseOffs = (uint64_t)AM.BaseOffs + MinOffset; - if (isLegalUse(AM, Kind, AccessTy, LTTI)) { + if (isLegalUse(AM, Kind, AccessTy, TLI)) { AM.BaseOffs = (uint64_t)AM.BaseOffs - MinOffset; // Check for overflow. if (((int64_t)((uint64_t)AM.BaseOffs + MaxOffset) > AM.BaseOffs) != (MaxOffset > 0)) return false; AM.BaseOffs = (uint64_t)AM.BaseOffs + MaxOffset; - return isLegalUse(AM, Kind, AccessTy, LTTI); + return isLegalUse(AM, Kind, AccessTy, TLI); } return false; } @@ -1352,7 +1352,7 @@ static bool isAlwaysFoldable(int64_t BaseOffs, GlobalValue *BaseGV, bool HasBaseReg, LSRUse::KindType Kind, Type *AccessTy, - const ScalarTargetTransformInfo *LTTI) { + const TargetLowering *TLI) { // Fast-path: zero is always foldable. if (BaseOffs == 0 && !BaseGV) return true; @@ -1371,14 +1371,14 @@ static bool isAlwaysFoldable(int64_t BaseOffs, AM.HasBaseReg = true; } - return isLegalUse(AM, Kind, AccessTy, LTTI); + return isLegalUse(AM, Kind, AccessTy, TLI); } static bool isAlwaysFoldable(const SCEV *S, int64_t MinOffset, int64_t MaxOffset, bool HasBaseReg, LSRUse::KindType Kind, Type *AccessTy, - const ScalarTargetTransformInfo *LTTI, + const TargetLowering *TLI, ScalarEvolution &SE) { // Fast-path: zero is always foldable. if (S->isZero()) return true; @@ -1402,7 +1402,7 @@ static bool isAlwaysFoldable(const SCEV *S, AM.HasBaseReg = HasBaseReg; AM.Scale = Kind == LSRUse::ICmpZero ? -1 : 1; - return isLegalUse(AM, MinOffset, MaxOffset, Kind, AccessTy, LTTI); + return isLegalUse(AM, MinOffset, MaxOffset, Kind, AccessTy, TLI); } namespace { @@ -1502,7 +1502,7 @@ class LSRInstance { ScalarEvolution &SE; DominatorTree &DT; LoopInfo &LI; - const ScalarTargetTransformInfo *const STTI; + const TargetLowering *const TLI; Loop *const L; bool Changed; @@ -1638,7 +1638,7 @@ class LSRInstance { Pass *P); public: - LSRInstance(const ScalarTargetTransformInfo *ltti, Loop *l, Pass *P); + LSRInstance(const TargetLowering *tli, Loop *l, Pass *P); bool getChanged() const { return Changed; } @@ -1688,10 +1688,11 @@ void LSRInstance::OptimizeShadowIV() { } if (!DestTy) continue; - if (STTI) { + if (TLI) { // If target does not support DestTy natively then do not apply // this transformation. - if (!STTI->isTypeLegal(DestTy)) continue; + EVT DVT = TLI->getValueType(DestTy); + if (!TLI->isTypeLegal(DVT)) continue; } PHINode *PH = dyn_cast<PHINode>(ShadowUse->getOperand(0)); @@ -2014,18 +2015,18 @@ LSRInstance::OptimizeLoopTermCond() { if (C->getValue().getMinSignedBits() >= 64 || C->getValue().isMinSignedValue()) goto decline_post_inc; - // Without STTI, assume that any stride might be valid, and so any + // Without TLI, assume that any stride might be valid, and so any // use might be shared. 
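For reference, the AddrMode structure queried throughout these LSR hunks describes a candidate addressing expression of the form BaseGV + BaseOffs + BaseReg + Scale*ScaleReg. A sketch of such a legality query, assuming the TargetLowering API of this period (TLI and AccessTy as in the surrounding code):

AddrMode AM;            // models "[BaseReg + 4*ScaleReg + 8]"
AM.HasBaseReg = true;
AM.Scale = 4;
AM.BaseOffs = 8;
bool Folds = TLI->isLegalAddressingMode(AM, AccessTy);
// Typically true on x86 (scaled-index addressing); false on targets that
// only support a plain base register plus a small immediate offset.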
- if (!STTI) + if (!TLI) goto decline_post_inc; // Check for possible scaled-address reuse. Type *AccessTy = getAccessType(UI->getUser()); AddrMode AM; AM.Scale = C->getSExtValue(); - if (STTI->isLegalAddressingMode(AM, AccessTy)) + if (TLI->isLegalAddressingMode(AM, AccessTy)) goto decline_post_inc; AM.Scale = -AM.Scale; - if (STTI->isLegalAddressingMode(AM, AccessTy)) + if (TLI->isLegalAddressingMode(AM, AccessTy)) goto decline_post_inc; } } @@ -2096,12 +2097,12 @@ LSRInstance::reconcileNewOffset(LSRUse &LU, int64_t NewOffset, bool HasBaseReg, // Conservatively assume HasBaseReg is true for now. if (NewOffset < LU.MinOffset) { if (!isAlwaysFoldable(LU.MaxOffset - NewOffset, 0, HasBaseReg, - Kind, AccessTy, STTI)) + Kind, AccessTy, TLI)) return false; NewMinOffset = NewOffset; } else if (NewOffset > LU.MaxOffset) { if (!isAlwaysFoldable(NewOffset - LU.MinOffset, 0, HasBaseReg, - Kind, AccessTy, STTI)) + Kind, AccessTy, TLI)) return false; NewMaxOffset = NewOffset; } @@ -2130,7 +2131,7 @@ LSRInstance::getUse(const SCEV *&Expr, int64_t Offset = ExtractImmediate(Expr, SE); // Basic uses can't accept any offset, for example. - if (!isAlwaysFoldable(Offset, 0, /*HasBaseReg=*/true, Kind, AccessTy, STTI)) { + if (!isAlwaysFoldable(Offset, 0, /*HasBaseReg=*/true, Kind, AccessTy, TLI)) { Expr = Copy; Offset = 0; } @@ -2395,7 +2396,7 @@ bool IVChain::isProfitableIncrement(const SCEV *OperExpr, /// TODO: Consider IVInc free if it's already used in another chains. static bool isProfitableChain(IVChain &Chain, SmallPtrSet<Instruction*, 4> &Users, - ScalarEvolution &SE, const ScalarTargetTransformInfo *STTI) { + ScalarEvolution &SE, const TargetLowering *TLI) { if (StressIVChain) return true; @@ -2653,7 +2654,7 @@ void LSRInstance::CollectChains() { for (unsigned UsersIdx = 0, NChains = IVChainVec.size(); UsersIdx < NChains; ++UsersIdx) { if (!isProfitableChain(IVChainVec[UsersIdx], - ChainUsersVec[UsersIdx].FarUsers, SE, STTI)) + ChainUsersVec[UsersIdx].FarUsers, SE, TLI)) continue; // Preserve the chain at UsesIdx. if (ChainIdx != UsersIdx) @@ -2680,8 +2681,7 @@ void LSRInstance::FinalizeChain(IVChain &Chain) { /// Return true if the IVInc can be folded into an addressing mode. static bool canFoldIVIncExpr(const SCEV *IncExpr, Instruction *UserInst, - Value *Operand, - const ScalarTargetTransformInfo *STTI) { + Value *Operand, const TargetLowering *TLI) { const SCEVConstant *IncConst = dyn_cast<SCEVConstant>(IncExpr); if (!IncConst || !isAddressUse(UserInst, Operand)) return false; @@ -2691,7 +2691,7 @@ static bool canFoldIVIncExpr(const SCEV *IncExpr, Instruction *UserInst, int64_t IncOffset = IncConst->getValue()->getSExtValue(); if (!isAlwaysFoldable(IncOffset, /*BaseGV=*/0, /*HaseBaseReg=*/false, - LSRUse::Address, getAccessType(UserInst), STTI)) + LSRUse::Address, getAccessType(UserInst), TLI)) return false; return true; @@ -2762,7 +2762,7 @@ void LSRInstance::GenerateIVChain(const IVChain &Chain, SCEVExpander &Rewriter, // If an IV increment can't be folded, use it as the next IV value. if (!canFoldIVIncExpr(LeftOverExpr, IncI->UserInst, IncI->IVOperand, - STTI)) { + TLI)) { assert(IVTy == IVOper->getType() && "inconsistent IV increment type"); IVSrc = IVOper; LeftOverExpr = 0; @@ -3108,7 +3108,7 @@ void LSRInstance::GenerateReassociations(LSRUse &LU, unsigned LUIdx, // into an immediate field. if (isAlwaysFoldable(*J, LU.MinOffset, LU.MaxOffset, Base.getNumRegs() > 1, - LU.Kind, LU.AccessTy, STTI, SE)) + LU.Kind, LU.AccessTy, TLI, SE)) continue; // Collect all operands except *J. 
@@ -3122,7 +3122,7 @@ void LSRInstance::GenerateReassociations(LSRUse &LU, unsigned LUIdx, if (InnerAddOps.size() == 1 && isAlwaysFoldable(InnerAddOps[0], LU.MinOffset, LU.MaxOffset, Base.getNumRegs() > 1, - LU.Kind, LU.AccessTy, STTI, SE)) + LU.Kind, LU.AccessTy, TLI, SE)) continue; const SCEV *InnerSum = SE.getAddExpr(InnerAddOps); @@ -3132,9 +3132,9 @@ void LSRInstance::GenerateReassociations(LSRUse &LU, unsigned LUIdx, // Add the remaining pieces of the add back into the new formula. const SCEVConstant *InnerSumSC = dyn_cast<SCEVConstant>(InnerSum); - if (STTI && InnerSumSC && + if (TLI && InnerSumSC && SE.getTypeSizeInBits(InnerSumSC->getType()) <= 64 && - STTI->isLegalAddImmediate((uint64_t)F.UnfoldedOffset + + TLI->isLegalAddImmediate((uint64_t)F.UnfoldedOffset + InnerSumSC->getValue()->getZExtValue())) { F.UnfoldedOffset = (uint64_t)F.UnfoldedOffset + InnerSumSC->getValue()->getZExtValue(); @@ -3144,8 +3144,8 @@ void LSRInstance::GenerateReassociations(LSRUse &LU, unsigned LUIdx, // Add J as its own register, or an unfolded immediate. const SCEVConstant *SC = dyn_cast<SCEVConstant>(*J); - if (STTI && SC && SE.getTypeSizeInBits(SC->getType()) <= 64 && - STTI->isLegalAddImmediate((uint64_t)F.UnfoldedOffset + + if (TLI && SC && SE.getTypeSizeInBits(SC->getType()) <= 64 && + TLI->isLegalAddImmediate((uint64_t)F.UnfoldedOffset + SC->getValue()->getZExtValue())) F.UnfoldedOffset = (uint64_t)F.UnfoldedOffset + SC->getValue()->getZExtValue(); @@ -3205,7 +3205,7 @@ void LSRInstance::GenerateSymbolicOffsets(LSRUse &LU, unsigned LUIdx, Formula F = Base; F.AM.BaseGV = GV; if (!isLegalUse(F.AM, LU.MinOffset, LU.MaxOffset, - LU.Kind, LU.AccessTy, STTI)) + LU.Kind, LU.AccessTy, TLI)) continue; F.BaseRegs[i] = G; (void)InsertFormula(LU, LUIdx, F); @@ -3230,7 +3230,7 @@ void LSRInstance::GenerateConstantOffsets(LSRUse &LU, unsigned LUIdx, Formula F = Base; F.AM.BaseOffs = (uint64_t)Base.AM.BaseOffs - *I; if (isLegalUse(F.AM, LU.MinOffset - *I, LU.MaxOffset - *I, - LU.Kind, LU.AccessTy, STTI)) { + LU.Kind, LU.AccessTy, TLI)) { // Add the offset to the base register. const SCEV *NewG = SE.getAddExpr(SE.getConstant(G->getType(), *I), G); // If it cancelled out, drop the base register, otherwise update it. @@ -3250,7 +3250,7 @@ void LSRInstance::GenerateConstantOffsets(LSRUse &LU, unsigned LUIdx, Formula F = Base; F.AM.BaseOffs = (uint64_t)F.AM.BaseOffs + Imm; if (!isLegalUse(F.AM, LU.MinOffset, LU.MaxOffset, - LU.Kind, LU.AccessTy, STTI)) + LU.Kind, LU.AccessTy, TLI)) continue; F.BaseRegs[i] = G; (void)InsertFormula(LU, LUIdx, F); @@ -3297,7 +3297,7 @@ void LSRInstance::GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx, F.AM.BaseOffs = NewBaseOffs; // Check that this scale is legal. - if (!isLegalUse(F.AM, Offset, Offset, LU.Kind, LU.AccessTy, STTI)) + if (!isLegalUse(F.AM, Offset, Offset, LU.Kind, LU.AccessTy, TLI)) continue; // Compensate for the use having MinOffset built into it. @@ -3353,12 +3353,12 @@ void LSRInstance::GenerateScales(LSRUse &LU, unsigned LUIdx, Formula Base) { Base.AM.HasBaseReg = Base.BaseRegs.size() > 1; // Check whether this scale is going to be legal. if (!isLegalUse(Base.AM, LU.MinOffset, LU.MaxOffset, - LU.Kind, LU.AccessTy, STTI)) { + LU.Kind, LU.AccessTy, TLI)) { // As a special-case, handle special out-of-loop Basic users specially. // TODO: Reconsider this special case. 
if (LU.Kind == LSRUse::Basic && isLegalUse(Base.AM, LU.MinOffset, LU.MaxOffset, - LSRUse::Special, LU.AccessTy, STTI) && + LSRUse::Special, LU.AccessTy, TLI) && LU.AllFixupsOutsideLoop) LU.Kind = LSRUse::Special; else @@ -3391,8 +3391,8 @@ void LSRInstance::GenerateScales(LSRUse &LU, unsigned LUIdx, Formula Base) { /// GenerateTruncates - Generate reuse formulae from different IV types. void LSRInstance::GenerateTruncates(LSRUse &LU, unsigned LUIdx, Formula Base) { - // This requires ScalarTargetTransformInfo to tell us which truncates are free. - if (!STTI) return; + // This requires TargetLowering to tell us which truncates are free. + if (!TLI) return; // Don't bother truncating symbolic values. if (Base.AM.BaseGV) return; @@ -3405,7 +3405,7 @@ void LSRInstance::GenerateTruncates(LSRUse &LU, unsigned LUIdx, Formula Base) { for (SmallSetVector<Type *, 4>::const_iterator I = Types.begin(), E = Types.end(); I != E; ++I) { Type *SrcTy = *I; - if (SrcTy != DstTy && STTI->isTruncateFree(SrcTy, DstTy)) { + if (SrcTy != DstTy && TLI->isTruncateFree(SrcTy, DstTy)) { Formula F = Base; if (F.ScaledReg) F.ScaledReg = SE.getAnyExtendExpr(F.ScaledReg, *I); @@ -3561,7 +3561,7 @@ void LSRInstance::GenerateCrossUseConstantOffsets() { Formula NewF = F; NewF.AM.BaseOffs = Offs; if (!isLegalUse(NewF.AM, LU.MinOffset, LU.MaxOffset, - LU.Kind, LU.AccessTy, STTI)) + LU.Kind, LU.AccessTy, TLI)) continue; NewF.ScaledReg = SE.getAddExpr(NegImmS, NewF.ScaledReg); @@ -3586,9 +3586,9 @@ void LSRInstance::GenerateCrossUseConstantOffsets() { Formula NewF = F; NewF.AM.BaseOffs = (uint64_t)NewF.AM.BaseOffs + Imm; if (!isLegalUse(NewF.AM, LU.MinOffset, LU.MaxOffset, - LU.Kind, LU.AccessTy, STTI)) { - if (!STTI || - !STTI->isLegalAddImmediate((uint64_t)NewF.UnfoldedOffset + Imm)) + LU.Kind, LU.AccessTy, TLI)) { + if (!TLI || + !TLI->isLegalAddImmediate((uint64_t)NewF.UnfoldedOffset + Imm)) continue; NewF = F; NewF.UnfoldedOffset = (uint64_t)NewF.UnfoldedOffset + Imm; @@ -3900,7 +3900,7 @@ void LSRInstance::NarrowSearchSpaceByCollapsingUnrolledCode() { Formula &F = LUThatHas->Formulae[i]; if (!isLegalUse(F.AM, LUThatHas->MinOffset, LUThatHas->MaxOffset, - LUThatHas->Kind, LUThatHas->AccessTy, STTI)) { + LUThatHas->Kind, LUThatHas->AccessTy, TLI)) { DEBUG(dbgs() << " Deleting "; F.print(dbgs()); dbgs() << '\n'); LUThatHas->DeleteFormula(F); @@ -4589,12 +4589,12 @@ LSRInstance::ImplementSolution(const SmallVectorImpl<const Formula *> &Solution, Changed |= DeleteTriviallyDeadInstructions(DeadInsts); } -LSRInstance::LSRInstance(const ScalarTargetTransformInfo *stti, Loop *l, Pass *P) +LSRInstance::LSRInstance(const TargetLowering *tli, Loop *l, Pass *P) : IU(P->getAnalysis<IVUsers>()), SE(P->getAnalysis<ScalarEvolution>()), DT(P->getAnalysis<DominatorTree>()), LI(P->getAnalysis<LoopInfo>()), - STTI(stti), L(l), Changed(false), IVIncInsertPos(0) { + TLI(tli), L(l), Changed(false), IVIncInsertPos(0) { // If LoopSimplify form is not available, stay out of trouble. 
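The constructor and factory changes above restore the older construction scheme, in which each target hands its TargetLowering to the pass when the codegen pipeline is assembled; a sketch of the assumed call sites (shape of the API in this era, not quoted from the patch):

// In a target's pipeline setup:
PM.add(createLoopStrengthReducePass(getTargetLowering()));
// Without a TargetLowering the pass still runs, but falls back to
// conservative guesses about which addressing modes fold:
PM.add(createLoopStrengthReducePass());   // TLI == 0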
if (!L->isLoopSimplifyForm()) @@ -4684,7 +4684,7 @@ LSRInstance::LSRInstance(const ScalarTargetTransformInfo *stti, Loop *l, Pass *P for (SmallVectorImpl<Formula>::const_iterator J = LU.Formulae.begin(), JE = LU.Formulae.end(); J != JE; ++J) assert(isLegalUse(J->AM, LU.MinOffset, LU.MaxOffset, - LU.Kind, LU.AccessTy, STTI) && + LU.Kind, LU.AccessTy, TLI) && "Illegal formula generated!"); }; #endif @@ -4757,13 +4757,13 @@ void LSRInstance::dump() const { namespace { class LoopStrengthReduce : public LoopPass { - /// ScalarTargetTransformInfo provides target information that is needed - /// for strength reducing loops. - const ScalarTargetTransformInfo *STTI; + /// TLI - Keep a pointer of a TargetLowering to consult for determining + /// transformation profitability. + const TargetLowering *const TLI; public: static char ID; // Pass ID, replacement for typeid - LoopStrengthReduce(); + explicit LoopStrengthReduce(const TargetLowering *tli = 0); private: bool runOnLoop(Loop *L, LPPassManager &LPM); @@ -4783,12 +4783,13 @@ INITIALIZE_PASS_DEPENDENCY(LoopSimplify) INITIALIZE_PASS_END(LoopStrengthReduce, "loop-reduce", "Loop Strength Reduction", false, false) -Pass *llvm::createLoopStrengthReducePass() { - return new LoopStrengthReduce(); + +Pass *llvm::createLoopStrengthReducePass(const TargetLowering *TLI) { + return new LoopStrengthReduce(TLI); } -LoopStrengthReduce::LoopStrengthReduce() - : LoopPass(ID), STTI(0) { +LoopStrengthReduce::LoopStrengthReduce(const TargetLowering *tli) + : LoopPass(ID), TLI(tli) { initializeLoopStrengthReducePass(*PassRegistry::getPassRegistry()); } @@ -4814,13 +4815,8 @@ void LoopStrengthReduce::getAnalysisUsage(AnalysisUsage &AU) const { bool LoopStrengthReduce::runOnLoop(Loop *L, LPPassManager & /*LPM*/) { bool Changed = false; - TargetTransformInfo *TTI = getAnalysisIfAvailable<TargetTransformInfo>(); - - if (TTI) - STTI = TTI->getScalarTargetTransformInfo(); - // Run the main LSR transformation. - Changed |= LSRInstance(STTI, L, this).getChanged(); + Changed |= LSRInstance(TLI, L, this).getChanged(); // Remove any extra phis created by processing inner loops. Changed |= DeleteDeadPHIs(L->getHeader()); @@ -4831,7 +4827,7 @@ bool LoopStrengthReduce::runOnLoop(Loop *L, LPPassManager & /*LPM*/) { Rewriter.setDebugType(DEBUG_TYPE); #endif unsigned numFolded = Rewriter. - replaceCongruentIVs(L, &getAnalysis<DominatorTree>(), DeadInsts, STTI); + replaceCongruentIVs(L, &getAnalysis<DominatorTree>(), DeadInsts, TLI); if (numFolded) { Changed = true; DeleteTriviallyDeadInstructions(DeadInsts); diff --git a/lib/Transforms/Scalar/LoopUnswitch.cpp b/lib/Transforms/Scalar/LoopUnswitch.cpp index 15b168fe2aa..047b43eb84f 100644 --- a/lib/Transforms/Scalar/LoopUnswitch.cpp +++ b/lib/Transforms/Scalar/LoopUnswitch.cpp @@ -907,13 +907,9 @@ void LoopUnswitch::UnswitchNontrivialCondition(Value *LIC, Constant *Val, /// specified. 
static void RemoveFromWorklist(Instruction *I, std::vector<Instruction*> &Worklist) { - std::vector<Instruction*>::iterator WI = std::find(Worklist.begin(), - Worklist.end(), I); - while (WI != Worklist.end()) { - unsigned Offset = WI-Worklist.begin(); - Worklist.erase(WI); - WI = std::find(Worklist.begin()+Offset, Worklist.end(), I); - } + + Worklist.erase(std::remove(Worklist.begin(), Worklist.end(), I), + Worklist.end()); } /// ReplaceUsesOfWith - When we find that I really equals V, remove I from the diff --git a/lib/Transforms/Scalar/SROA.cpp b/lib/Transforms/Scalar/SROA.cpp index 3e84a91c1db..71c62257e7f 100644 --- a/lib/Transforms/Scalar/SROA.cpp +++ b/lib/Transforms/Scalar/SROA.cpp @@ -132,9 +132,6 @@ public: /// /// We flag partitions as splittable when they are formed entirely due to /// accesses by trivially splittable operations such as memset and memcpy. - /// - /// FIXME: At some point we should consider loads and stores of FCAs to be - /// splittable and eagerly split them into scalar values. bool IsSplittable; /// \brief Test whether a partition has been marked as dead. @@ -1785,9 +1782,9 @@ static Value *getNaturalGEPWithType(IRBuilder<> &IRB, const DataLayout &TD, break; if (SequentialType *SeqTy = dyn_cast<SequentialType>(ElementTy)) { ElementTy = SeqTy->getElementType(); - Indices.push_back(IRB.getInt(APInt(TD.getPointerSizeInBits( - ElementTy->isPointerTy() ? - cast<PointerType>(ElementTy)->getAddressSpace(): 0), 0))); + // Note that we use the default address space as this index is over an + // array or a vector, not a pointer. + Indices.push_back(IRB.getInt(APInt(TD.getPointerSizeInBits(0), 0))); } else if (StructType *STy = dyn_cast<StructType>(ElementTy)) { if (STy->element_begin() == STy->element_end()) break; // Nothing left to descend into. @@ -1828,7 +1825,7 @@ static Value *getNaturalGEPRecursively(IRBuilder<> &IRB, const DataLayout &TD, if (ElementSizeInBits % 8) return 0; // GEPs over non-multiple of 8 size vector elements are invalid. APInt ElementSize(Offset.getBitWidth(), ElementSizeInBits / 8); - APInt NumSkippedElements = Offset.udiv(ElementSize); + APInt NumSkippedElements = Offset.sdiv(ElementSize); if (NumSkippedElements.ugt(VecTy->getNumElements())) return 0; Offset -= NumSkippedElements * ElementSize; @@ -1840,7 +1837,7 @@ static Value *getNaturalGEPRecursively(IRBuilder<> &IRB, const DataLayout &TD, if (ArrayType *ArrTy = dyn_cast<ArrayType>(Ty)) { Type *ElementTy = ArrTy->getElementType(); APInt ElementSize(Offset.getBitWidth(), TD.getTypeAllocSize(ElementTy)); - APInt NumSkippedElements = Offset.udiv(ElementSize); + APInt NumSkippedElements = Offset.sdiv(ElementSize); if (NumSkippedElements.ugt(ArrTy->getNumElements())) return 0; @@ -1896,7 +1893,7 @@ static Value *getNaturalGEPWithOffset(IRBuilder<> &IRB, const DataLayout &TD, APInt ElementSize(Offset.getBitWidth(), TD.getTypeAllocSize(ElementTy)); if (ElementSize == 0) return 0; // Zero-length arrays can't help us build a natural GEP. 
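On the udiv -> sdiv changes in these SROA hunks: the byte offset being divided can be negative by this point, and an unsigned divide would turn a small negative offset into an enormous element count instead of a small negative index. A short APInt demonstration (sketch):

#include "llvm/ADT/APInt.h"
using llvm::APInt;

void divisionDemo() {
  APInt Offset(64, uint64_t(-8), /*isSigned=*/true); // -8 bytes
  APInt ElementSize(64, 4);                          // 4-byte elements
  APInt S = Offset.sdiv(ElementSize); // -2: the intended element index
  APInt U = Offset.udiv(ElementSize); // 0x3FFFFFFFFFFFFFFE: far out of range
}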
- APInt NumSkippedElements = Offset.udiv(ElementSize); + APInt NumSkippedElements = Offset.sdiv(ElementSize); Offset -= NumSkippedElements * ElementSize; Indices.push_back(IRB.getInt(NumSkippedElements)); @@ -2211,6 +2208,48 @@ static bool isIntegerWideningViable(const DataLayout &TD, return WholeAllocaOp; } +static Value *extractInteger(const DataLayout &DL, IRBuilder<> &IRB, Value *V, + IntegerType *Ty, uint64_t Offset, + const Twine &Name) { + IntegerType *IntTy = cast<IntegerType>(V->getType()); + assert(DL.getTypeStoreSize(Ty) + Offset <= DL.getTypeStoreSize(IntTy) && + "Element extends past full value"); + uint64_t ShAmt = 8*Offset; + if (DL.isBigEndian()) + ShAmt = 8*(DL.getTypeStoreSize(IntTy) - DL.getTypeStoreSize(Ty) - Offset); + if (ShAmt) + V = IRB.CreateLShr(V, ShAmt, Name + ".shift"); + assert(Ty->getBitWidth() <= IntTy->getBitWidth() && + "Cannot extract to a larger integer!"); + if (Ty != IntTy) + V = IRB.CreateTrunc(V, Ty, Name + ".trunc"); + return V; +} + +static Value *insertInteger(const DataLayout &DL, IRBuilder<> &IRB, Value *Old, + Value *V, uint64_t Offset, const Twine &Name) { + IntegerType *IntTy = cast<IntegerType>(Old->getType()); + IntegerType *Ty = cast<IntegerType>(V->getType()); + assert(Ty->getBitWidth() <= IntTy->getBitWidth() && + "Cannot insert a larger integer!"); + if (Ty != IntTy) + V = IRB.CreateZExt(V, IntTy, Name + ".ext"); + assert(DL.getTypeStoreSize(Ty) + Offset <= DL.getTypeStoreSize(IntTy) && + "Element store outside of alloca store"); + uint64_t ShAmt = 8*Offset; + if (DL.isBigEndian()) + ShAmt = 8*(DL.getTypeStoreSize(IntTy) - DL.getTypeStoreSize(Ty) - Offset); + if (ShAmt) + V = IRB.CreateShl(V, ShAmt, Name + ".shift"); + + if (ShAmt || Ty->getBitWidth() < IntTy->getBitWidth()) { + APInt Mask = ~Ty->getMask().zext(IntTy->getBitWidth()).shl(ShAmt); + Old = IRB.CreateAnd(Old, Mask, Name + ".mask"); + V = IRB.CreateOr(Old, V, Name + ".insert"); + } + return V; +} + namespace { /// \brief Visitor to rewrite instructions using a partition of an alloca to /// use a new alloca. 
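The extractInteger/insertInteger helpers added above do all of their work with shift amounts derived from DataLayout store sizes, flipping the shift for big-endian layouts, and insertInteger masks out the destination bits before ORing in the new value. A self-contained sketch of the same bit arithmetic on fixed-width types, little-endian case only (the helper names here are mine, not the patch's):

    #include <cassert>
    #include <cstdint>

    // Extract a ByteWidth-byte field starting Offset bytes into Whole
    // (little-endian: lower byte offsets live in lower bits).
    static uint64_t extractField(uint64_t Whole, unsigned Offset,
                                 unsigned ByteWidth) {
      uint64_t Mask = (ByteWidth == 8) ? ~0ULL : ((1ULL << (8 * ByteWidth)) - 1);
      return (Whole >> (8 * Offset)) & Mask;
    }

    // Insert Field at the same position: clear the old bits, then OR in the
    // shifted new value -- the read-modify-write sequence SROA emits.
    static uint64_t insertField(uint64_t Whole, uint64_t Field, unsigned Offset,
                                unsigned ByteWidth) {
      uint64_t Mask = (ByteWidth == 8) ? ~0ULL : ((1ULL << (8 * ByteWidth)) - 1);
      Whole &= ~(Mask << (8 * Offset));
      return Whole | ((Field & Mask) << (8 * Offset));
    }

    int main() {
      uint64_t V = 0x1122334455667788ULL;
      assert(extractField(V, 2, 2) == 0x5566);
      assert(insertField(V, 0xAABB, 2, 2) == 0x11223344AABB7788ULL);
      return 0;
    }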
@@ -2371,60 +2410,6 @@ private: return IRB.getInt32(Index); } - Value *extractInteger(IRBuilder<> &IRB, IntegerType *TargetTy, - uint64_t Offset) { - assert(IntTy && "We cannot extract an integer from the alloca"); - Value *V = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), - getName(".load")); - V = convertValue(TD, IRB, V, IntTy); - assert(Offset >= NewAllocaBeginOffset && "Out of bounds offset"); - uint64_t RelOffset = Offset - NewAllocaBeginOffset; - assert(TD.getTypeStoreSize(TargetTy) + RelOffset <= - TD.getTypeStoreSize(IntTy) && - "Element load outside of alloca store"); - uint64_t ShAmt = 8*RelOffset; - if (TD.isBigEndian()) - ShAmt = 8*(TD.getTypeStoreSize(IntTy) - - TD.getTypeStoreSize(TargetTy) - RelOffset); - if (ShAmt) - V = IRB.CreateLShr(V, ShAmt, getName(".shift")); - assert(TargetTy->getBitWidth() <= IntTy->getBitWidth() && - "Cannot extract to a larger integer!"); - if (TargetTy != IntTy) - V = IRB.CreateTrunc(V, TargetTy, getName(".trunc")); - return V; - } - - StoreInst *insertInteger(IRBuilder<> &IRB, Value *V, uint64_t Offset) { - IntegerType *Ty = cast<IntegerType>(V->getType()); - assert(Ty->getBitWidth() <= IntTy->getBitWidth() && - "Cannot insert a larger integer!"); - if (Ty != IntTy) - V = IRB.CreateZExt(V, IntTy, getName(".ext")); - assert(Offset >= NewAllocaBeginOffset && "Out of bounds offset"); - uint64_t RelOffset = Offset - NewAllocaBeginOffset; - assert(TD.getTypeStoreSize(Ty) + RelOffset <= - TD.getTypeStoreSize(IntTy) && - "Element store outside of alloca store"); - uint64_t ShAmt = 8*RelOffset; - if (TD.isBigEndian()) - ShAmt = 8*(TD.getTypeStoreSize(IntTy) - TD.getTypeStoreSize(Ty) - - RelOffset); - if (ShAmt) - V = IRB.CreateShl(V, ShAmt, getName(".shift")); - - if (ShAmt || Ty->getBitWidth() < IntTy->getBitWidth()) { - APInt Mask = ~Ty->getMask().zext(IntTy->getBitWidth()).shl(ShAmt); - Value *Old = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), - getName(".oldload")); - Old = convertValue(TD, IRB, Old, IntTy); - Old = IRB.CreateAnd(Old, Mask, getName(".mask")); - V = IRB.CreateOr(Old, V, getName(".insert")); - } - V = convertValue(TD, IRB, V, NewAllocaTy); - return IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlignment()); - } - void deleteIfTriviallyDead(Value *V) { Instruction *I = cast<Instruction>(V); if (isInstructionTriviallyDead(I)) @@ -2452,12 +2437,18 @@ private: } bool rewriteIntegerLoad(IRBuilder<> &IRB, LoadInst &LI) { + assert(IntTy && "We cannot insert an integer to the alloca"); assert(!LI.isVolatile()); - Value *Result = extractInteger(IRB, cast<IntegerType>(LI.getType()), - BeginOffset); - LI.replaceAllUsesWith(Result); + Value *V = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), + getName(".load")); + V = convertValue(TD, IRB, V, IntTy); + assert(BeginOffset >= NewAllocaBeginOffset && "Out of bounds offset"); + uint64_t Offset = BeginOffset - NewAllocaBeginOffset; + V = extractInteger(TD, IRB, V, cast<IntegerType>(LI.getType()), Offset, + getName(".extract")); + LI.replaceAllUsesWith(V); Pass.DeadInsts.push_back(&LI); - DEBUG(dbgs() << " to: " << *Result << "\n"); + DEBUG(dbgs() << " to: " << *V << "\n"); return true; } @@ -2519,8 +2510,20 @@ private: } bool rewriteIntegerStore(IRBuilder<> &IRB, StoreInst &SI) { + assert(IntTy && "We cannot extract an integer from the alloca"); assert(!SI.isVolatile()); - StoreInst *Store = insertInteger(IRB, SI.getValueOperand(), BeginOffset); + Value *V = SI.getValueOperand(); + if (TD.getTypeSizeInBits(V->getType()) != IntTy->getBitWidth()) { + Value *Old = 
IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), + getName(".oldload")); + Old = convertValue(TD, IRB, Old, IntTy); + assert(BeginOffset >= NewAllocaBeginOffset && "Out of bounds offset"); + uint64_t Offset = BeginOffset - NewAllocaBeginOffset; + V = insertInteger(TD, IRB, Old, SI.getValueOperand(), Offset, + getName(".insert")); + } + V = convertValue(TD, IRB, V, NewAllocaTy); + StoreInst *Store = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlignment()); Pass.DeadInsts.push_back(&SI); (void)Store; DEBUG(dbgs() << " to: " << *Store << "\n"); @@ -2652,10 +2655,12 @@ private: if (IntTy && (BeginOffset > NewAllocaBeginOffset || EndOffset < NewAllocaEndOffset)) { assert(!II.isVolatile()); - StoreInst *Store = insertInteger(IRB, V, BeginOffset); - (void)Store; - DEBUG(dbgs() << " to: " << *Store << "\n"); - return true; + Value *Old = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), + getName(".oldload")); + Old = convertValue(TD, IRB, Old, IntTy); + assert(BeginOffset >= NewAllocaBeginOffset && "Out of bounds offset"); + uint64_t Offset = BeginOffset - NewAllocaBeginOffset; + V = insertInteger(TD, IRB, Old, V, Offset, getName(".insert")); } if (V->getType() != AllocaTy) @@ -2811,17 +2816,25 @@ private: getIndex(IRB, BeginOffset), getName(".copyextract")); } else if (IntTy && !IsWholeAlloca && !IsDest) { - Src = extractInteger(IRB, SubIntTy, BeginOffset); + Src = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), + getName(".load")); + Src = convertValue(TD, IRB, Src, IntTy); + assert(BeginOffset >= NewAllocaBeginOffset && "Out of bounds offset"); + uint64_t Offset = BeginOffset - NewAllocaBeginOffset; + Src = extractInteger(TD, IRB, Src, SubIntTy, Offset, getName(".extract")); } else { Src = IRB.CreateAlignedLoad(SrcPtr, Align, II.isVolatile(), getName(".copyload")); } if (IntTy && !IsWholeAlloca && IsDest) { - StoreInst *Store = insertInteger(IRB, Src, BeginOffset); - (void)Store; - DEBUG(dbgs() << " to: " << *Store << "\n"); - return true; + Value *Old = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), + getName(".oldload")); + Old = convertValue(TD, IRB, Old, IntTy); + assert(BeginOffset >= NewAllocaBeginOffset && "Out of bounds offset"); + uint64_t Offset = BeginOffset - NewAllocaBeginOffset; + Src = insertInteger(TD, IRB, Old, Src, Offset, getName(".insert")); + Src = convertValue(TD, IRB, Src, NewAllocaTy); } if (IsVectorElement && IsDest) { @@ -2875,10 +2888,7 @@ private: Value *NewPtr = getAdjustedAllocaPtr(PtrBuilder, OldPtr->getType()); // Replace the operands which were using the old pointer. - User::op_iterator OI = PN.op_begin(), OE = PN.op_end(); - for (; OI != OE; ++OI) - if (*OI == OldPtr) - *OI = NewPtr; + std::replace(PN.op_begin(), PN.op_end(), cast<Value>(OldPtr), NewPtr); DEBUG(dbgs() << " to: " << PN << "\n"); deleteIfTriviallyDead(OldPtr); @@ -3248,11 +3258,7 @@ static Type *getTypePartition(const DataLayout &TD, Type *Ty, } // Try to build up a sub-structure. 
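The PHI-operand hunk above replaces a hand-rolled operand walk with std::replace, which substitutes every matching element in a range; the cast<Value>(OldPtr) is only there so both value arguments have the same type. A tiny usage sketch:

    #include <algorithm>
    #include <cassert>
    #include <vector>

    int main() {
      std::vector<int> Ops = {4, 9, 4, 2};
      std::replace(Ops.begin(), Ops.end(), 4, 7); // every 4 becomes 7
      assert((Ops == std::vector<int>{7, 9, 7, 2}));
      return 0;
    }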
- SmallVector<Type *, 4> ElementTys; - do { - ElementTys.push_back(*EI++); - } while (EI != EE); - StructType *SubTy = StructType::get(STy->getContext(), ElementTys, + StructType *SubTy = StructType::get(STy->getContext(), makeArrayRef(EI, EE), STy->isPacked()); const StructLayout *SubSL = TD.getStructLayout(SubTy); if (Size != SubSL->getSizeInBytes()) diff --git a/lib/Transforms/Scalar/SimplifyLibCalls.cpp b/lib/Transforms/Scalar/SimplifyLibCalls.cpp index d86c4cbc9f6..c82a00fc2c8 100644 --- a/lib/Transforms/Scalar/SimplifyLibCalls.cpp +++ b/lib/Transforms/Scalar/SimplifyLibCalls.cpp @@ -135,47 +135,6 @@ static bool IsOnlyUsedInEqualityComparison(Value *V, Value *With) { namespace { //===---------------------------------------===// -// 'strcpy' Optimizations - -struct StrCpyOpt : public LibCallOptimization { - bool OptChkCall; // True if it's optimizing a __strcpy_chk libcall. - - StrCpyOpt(bool c) : OptChkCall(c) {} - - virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { - // Verify the "strcpy" function prototype. - unsigned NumParams = OptChkCall ? 3 : 2; - FunctionType *FT = Callee->getFunctionType(); - if (FT->getNumParams() != NumParams || - FT->getReturnType() != FT->getParamType(0) || - FT->getParamType(0) != FT->getParamType(1) || - FT->getParamType(0) != B.getInt8PtrTy()) - return 0; - - Value *Dst = CI->getArgOperand(0), *Src = CI->getArgOperand(1); - if (Dst == Src) // strcpy(x,x) -> x - return Src; - - // These optimizations require DataLayout. - if (!TD) return 0; - - // See if we can get the length of the input string. - uint64_t Len = GetStringLength(Src); - if (Len == 0) return 0; - - // We have enough information to now generate the memcpy call to do the - // concatenation for us. Make a memcpy to copy the nul byte with align = 1. - if (!OptChkCall || - !EmitMemCpyChk(Dst, Src, - ConstantInt::get(TD->getIntPtrType(*Context), Len), - CI->getArgOperand(2), B, TD, TLI)) - B.CreateMemCpy(Dst, Src, - ConstantInt::get(TD->getIntPtrType(*Context), Len), 1); - return Dst; - } -}; - -//===---------------------------------------===// // 'stpcpy' Optimizations struct StpCpyOpt: public LibCallOptimization { @@ -813,8 +772,8 @@ struct FFSOpt : public LibCallOptimization { // Constant fold. if (ConstantInt *CI = dyn_cast<ConstantInt>(Op)) { - if (CI->getValue() == 0) // ffs(0) -> 0. - return Constant::getNullValue(CI->getType()); + if (CI->isZero()) // ffs(0) -> 0. + return B.getInt32(0); // ffs(c) -> cttz(c)+1 return B.getInt32(CI->getValue().countTrailingZeros() + 1); } @@ -1275,7 +1234,6 @@ namespace { StringMap<LibCallOptimization*> Optimizations; // String and Memory LibCall Optimizations - StrCpyOpt StrCpy; StrCpyOpt StrCpyChk; StpCpyOpt StpCpy; StpCpyOpt StpCpyChk; StrNCpyOpt StrNCpy; StrLenOpt StrLen; StrPBrkOpt StrPBrk; @@ -1295,8 +1253,7 @@ namespace { bool Modified; // This is only used by doInitialization. public: static char ID; // Pass identification - SimplifyLibCalls() : FunctionPass(ID), StrCpy(false), StrCpyChk(true), - StpCpy(false), StpCpyChk(true), + SimplifyLibCalls() : FunctionPass(ID), StpCpy(false), StpCpyChk(true), UnaryDoubleFP(false), UnsafeUnaryDoubleFP(true) { initializeSimplifyLibCallsPass(*PassRegistry::getPassRegistry()); } @@ -1348,7 +1305,6 @@ void SimplifyLibCalls::AddOpt(LibFunc::Func F1, LibFunc::Func F2, /// we know. 
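The FFSOpt hunk above constant-folds ffs(c) to cttz(c)+1 and now materializes a proper i32 zero for ffs(0). The same identity in plain C++, using GCC/Clang's __builtin_ctz (an assumption of this sketch, not something the patch relies on):

    #include <cassert>

    // ffs(x) is one plus the index of the least significant set bit, or 0
    // when x == 0 -- exactly cttz(x) + 1 for nonzero x.
    static int ffsViaCttz(unsigned x) {
      return x == 0 ? 0 : __builtin_ctz(x) + 1;
    }

    int main() {
      assert(ffsViaCttz(0) == 0);  // ffs(0) -> 0
      assert(ffsViaCttz(1) == 1);  // bit 0 set
      assert(ffsViaCttz(8) == 4);  // bit 3 set -> 3 + 1
      return 0;
    }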
void SimplifyLibCalls::InitOptimizations() { // String and Memory LibCall Optimizations - Optimizations["strcpy"] = &StrCpy; Optimizations["strncpy"] = &StrNCpy; Optimizations["stpcpy"] = &StpCpy; Optimizations["strlen"] = &StrLen; @@ -1369,7 +1325,6 @@ void SimplifyLibCalls::InitOptimizations() { AddOpt(LibFunc::memset, &MemSet); // _chk variants of String and Memory LibCall Optimizations. - Optimizations["__strcpy_chk"] = &StrCpyChk; Optimizations["__stpcpy_chk"] = &StpCpyChk; // Math Library Optimizations diff --git a/lib/Transforms/Scalar/TailRecursionElimination.cpp b/lib/Transforms/Scalar/TailRecursionElimination.cpp index 6557d630a94..28c278f0362 100644 --- a/lib/Transforms/Scalar/TailRecursionElimination.cpp +++ b/lib/Transforms/Scalar/TailRecursionElimination.cpp @@ -69,6 +69,7 @@ #include "llvm/Support/CFG.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Support/ValueHandle.h" #include "llvm/ADT/Statistic.h" #include "llvm/ADT/STLExtras.h" using namespace llvm; @@ -117,34 +118,45 @@ FunctionPass *llvm::createTailCallEliminationPass() { return new TailCallElim(); } -/// AllocaMightEscapeToCalls - Return true if this alloca may be accessed by -/// callees of this function. We only do very simple analysis right now, this -/// could be expanded in the future to use mod/ref information for particular -/// call sites if desired. -static bool AllocaMightEscapeToCalls(AllocaInst *AI) { - // FIXME: do simple 'address taken' analysis. - return true; +/// CanTRE - Scan the specified basic block for alloca instructions. +/// If it contains any that are variable-sized or not in the entry block, +/// returns false. +static bool CanTRE(AllocaInst *AI) { + // Because of PR962, we don't TRE allocas outside the entry block. + + // If this alloca is in the body of the function, or if it is a variable + // sized allocation, we cannot tail call eliminate calls marked 'tail' + // with this mechanism. + BasicBlock *BB = AI->getParent(); + return BB == &BB->getParent()->getEntryBlock() && + isa<ConstantInt>(AI->getArraySize()); } -/// CheckForEscapingAllocas - Scan the specified basic block for alloca -/// instructions. If it contains any that might be accessed by calls, return -/// true. -static bool CheckForEscapingAllocas(BasicBlock *BB, - bool &CannotTCETailMarkedCall) { - bool RetVal = false; - for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) - if (AllocaInst *AI = dyn_cast<AllocaInst>(I)) { - RetVal |= AllocaMightEscapeToCalls(AI); - - // If this alloca is in the body of the function, or if it is a variable - // sized allocation, we cannot tail call eliminate calls marked 'tail' - // with this mechanism. 
- if (BB != &BB->getParent()->getEntryBlock() || - !isa<ConstantInt>(AI->getArraySize())) - CannotTCETailMarkedCall = true; - } - return RetVal; -} +struct AllocaCaptureTracker : public CaptureTracker { + AllocaCaptureTracker() : Captured(false) {} + + void tooManyUses() { Captured = true; } + + bool shouldExplore(Use *U) { + Value *V = U->getUser(); + if (isa<CallInst>(V) || isa<InvokeInst>(V)) + UsesAlloca.push_back(V); + + return true; + } + + bool captured(Use *U) { + if (isa<ReturnInst>(U->getUser())) + return false; + + Captured = true; + return true; + } + + SmallVector<WeakVH, 64> UsesAlloca; + + bool Captured; +}; bool TailCallElim::runOnFunction(Function &F) { // If this function is a varargs function, we won't be able to PHI the args @@ -157,38 +169,34 @@ bool TailCallElim::runOnFunction(Function &F) { bool MadeChange = false; bool FunctionContainsEscapingAllocas = false; - // CannotTCETailMarkedCall - If true, we cannot perform TCE on tail calls + // CanTRETailMarkedCall - If false, we cannot perform TRE on tail calls // marked with the 'tail' attribute, because doing so would cause the stack - // size to increase (real TCE would deallocate variable sized allocas, TCE + // size to increase (real TRE would deallocate variable sized allocas, TRE // doesn't). - bool CannotTCETailMarkedCall = false; - - // Loop over the function, looking for any returning blocks, and keeping track - // of whether this function has any non-trivially used allocas. - for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) { - if (FunctionContainsEscapingAllocas && CannotTCETailMarkedCall) - break; - - FunctionContainsEscapingAllocas |= - CheckForEscapingAllocas(BB, CannotTCETailMarkedCall); + bool CanTRETailMarkedCall = true; + + // Find calls that can be marked tail. + AllocaCaptureTracker ACT; + for (Function::iterator BB = F.begin(), EE = F.end(); BB != EE; ++BB) { + for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) { + if (AllocaInst *AI = dyn_cast<AllocaInst>(I)) { + CanTRETailMarkedCall &= CanTRE(AI); + PointerMayBeCaptured(AI, &ACT); + if (ACT.Captured) + return false; + } + } } - /// FIXME: The code generator produces really bad code when an 'escaping - /// alloca' is changed from being a static alloca to being a dynamic alloca. - /// Until this is resolved, disable this transformation if that would ever - /// happen. This bug is PR962. - if (FunctionContainsEscapingAllocas) - return false; - - // Second pass, change any tail calls to loops. + // Second pass, change any tail recursive calls to loops. for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) { if (ReturnInst *Ret = dyn_cast<ReturnInst>(BB->getTerminator())) { bool Change = ProcessReturningBlock(Ret, OldEntry, TailCallsAreMarkedTail, - ArgumentPHIs,CannotTCETailMarkedCall); + ArgumentPHIs, !CanTRETailMarkedCall); if (!Change && BB->getFirstNonPHIOrDbg() == Ret) Change = FoldReturnAndProcessPred(BB, Ret, OldEntry, TailCallsAreMarkedTail, ArgumentPHIs, - CannotTCETailMarkedCall); + !CanTRETailMarkedCall); MadeChange |= Change; } } @@ -210,21 +218,24 @@ bool TailCallElim::runOnFunction(Function &F) { } } - // Finally, if this function contains no non-escaping allocas, or calls - // setjmp, mark all calls in the function as eligible for tail calls - //(there is no stack memory for them to access). 
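The AllocaCaptureTracker above refines the old all-or-nothing escape check: instead of giving up whenever any alloca might be visible to a callee, it records exactly which call or invoke instructions see an alloca's address, and only those calls lose tail-call eligibility. At the source level the distinction looks roughly like this (a sketch; the function names are invented for illustration):

    void takePtr(int *P);  // hypothetical callee that receives an address
    int pureCall(int X);   // hypothetical callee that never sees the stack

    int example(int X) {
      int Local = X;
      takePtr(&Local);     // &Local escapes into this call: it must not be
                           // marked 'tail', or Local's stack slot could be
                           // reused while takePtr still holds the pointer.
      return pureCall(X);  // no alloca address reaches this call, so it
                           // remains eligible for the 'tail' marker.
    }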
+ // Finally, if this function contains no non-escaping allocas and doesn't + // call setjmp, mark all calls in the function as eligible for tail calls + // (there is no stack memory for them to access). + std::sort(ACT.UsesAlloca.begin(), ACT.UsesAlloca.end()); + if (!FunctionContainsEscapingAllocas && !F.callsFunctionThatReturnsTwice()) for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) - if (CallInst *CI = dyn_cast<CallInst>(I)) { - CI->setTailCall(); - MadeChange = true; - } + if (CallInst *CI = dyn_cast<CallInst>(I)) + if (!std::binary_search(ACT.UsesAlloca.begin(), ACT.UsesAlloca.end(), + CI)) { + CI->setTailCall(); + MadeChange = true; + } return MadeChange; } - /// CanMoveAboveCall - Return true if it is safe to move the specified /// instruction from after the call to before the call, assuming that all /// instructions between the call and this instruction are movable. diff --git a/lib/Transforms/Utils/BasicBlockUtils.cpp b/lib/Transforms/Utils/BasicBlockUtils.cpp index edeebe1b79a..9fea11391a1 100644 --- a/lib/Transforms/Utils/BasicBlockUtils.cpp +++ b/lib/Transforms/Utils/BasicBlockUtils.cpp @@ -687,3 +687,42 @@ ReturnInst *llvm::FoldReturnIntoUncondBranch(ReturnInst *RI, BasicBlock *BB, return cast<ReturnInst>(NewRet); } +/// SplitBlockAndInsertIfThen - Split the containing block at the +/// specified instruction - everything before and including Cmp stays +/// in the old basic block, and everything after Cmp is moved to a +/// new block. The two blocks are connected by a conditional branch +/// (with value of Cmp being the condition). +/// Before: +/// Head +/// Cmp +/// Tail +/// After: +/// Head +/// Cmp +/// if (Cmp) +/// ThenBlock +/// Tail +/// +/// If Unreachable is true, then ThenBlock ends with +/// UnreachableInst, otherwise it branches to Tail. +/// Returns the NewBasicBlock's terminator. 
+ +TerminatorInst *llvm::SplitBlockAndInsertIfThen(Instruction *Cmp, + bool Unreachable, MDNode *BranchWeights) { + Instruction *SplitBefore = Cmp->getNextNode(); + BasicBlock *Head = SplitBefore->getParent(); + BasicBlock *Tail = Head->splitBasicBlock(SplitBefore); + TerminatorInst *HeadOldTerm = Head->getTerminator(); + LLVMContext &C = Head->getContext(); + BasicBlock *ThenBlock = BasicBlock::Create(C, "", Head->getParent(), Tail); + TerminatorInst *CheckTerm; + if (Unreachable) + CheckTerm = new UnreachableInst(C, ThenBlock); + else + CheckTerm = BranchInst::Create(Tail, ThenBlock); + BranchInst *HeadNewTerm = + BranchInst::Create(/*ifTrue*/ThenBlock, /*ifFalse*/Tail, Cmp); + HeadNewTerm->setMetadata(LLVMContext::MD_prof, BranchWeights); + ReplaceInstWithInst(HeadOldTerm, HeadNewTerm); + return CheckTerm; +} diff --git a/lib/Transforms/Utils/LowerInvoke.cpp b/lib/Transforms/Utils/LowerInvoke.cpp index f35cbbdde5e..930555424de 100644 --- a/lib/Transforms/Utils/LowerInvoke.cpp +++ b/lib/Transforms/Utils/LowerInvoke.cpp @@ -45,10 +45,10 @@ #include "llvm/Pass.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Local.h" -#include "llvm/TargetTransformInfo.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/Support/CommandLine.h" +#include "llvm/Target/TargetLowering.h" #include <csetjmp> #include <set> using namespace llvm; @@ -70,14 +70,15 @@ namespace { Constant *SetJmpFn, *LongJmpFn, *StackSaveFn, *StackRestoreFn; bool useExpensiveEHSupport; - // We peek in STTI to grab the target's jmp_buf size and alignment - const ScalarTargetTransformInfo *STTI; + // We peek in TLI to grab the target's jmp_buf size and alignment + const TargetLowering *TLI; public: static char ID; // Pass identification, replacement for typeid - explicit LowerInvoke(bool useExpensiveEHSupport = ExpensiveEHSupport) + explicit LowerInvoke(const TargetLowering *tli = NULL, + bool useExpensiveEHSupport = ExpensiveEHSupport) : FunctionPass(ID), useExpensiveEHSupport(useExpensiveEHSupport), - STTI(0) { + TLI(tli) { initializeLowerInvokePass(*PassRegistry::getPassRegistry()); } bool doInitialization(Module &M); @@ -107,24 +108,21 @@ INITIALIZE_PASS(LowerInvoke, "lowerinvoke", char &llvm::LowerInvokePassID = LowerInvoke::ID; // Public Interface To the LowerInvoke pass. -FunctionPass *llvm::createLowerInvokePass() { - return new LowerInvoke(ExpensiveEHSupport); +FunctionPass *llvm::createLowerInvokePass(const TargetLowering *TLI) { + return new LowerInvoke(TLI, ExpensiveEHSupport); } -FunctionPass *llvm::createLowerInvokePass(bool useExpensiveEHSupport) { - return new LowerInvoke(useExpensiveEHSupport); +FunctionPass *llvm::createLowerInvokePass(const TargetLowering *TLI, + bool useExpensiveEHSupport) { + return new LowerInvoke(TLI, useExpensiveEHSupport); } // doInitialization - Make sure that there is a prototype for abort in the // current module. bool LowerInvoke::doInitialization(Module &M) { - TargetTransformInfo *TTI = getAnalysisIfAvailable<TargetTransformInfo>(); - if (TTI) - STTI = TTI->getScalarTargetTransformInfo(); - Type *VoidPtrTy = Type::getInt8PtrTy(M.getContext()); if (useExpensiveEHSupport) { // Insert a type for the linked list of jump buffers. - unsigned JBSize = STTI ? STTI->getJumpBufSize() : 0; + unsigned JBSize = TLI ? TLI->getJumpBufSize() : 0; JBSize = JBSize ? 
JBSize : 200; Type *JmpBufTy = ArrayType::get(VoidPtrTy, JBSize); @@ -432,7 +430,7 @@ bool LowerInvoke::insertExpensiveEHSupport(Function &F) { // Create an alloca for the incoming jump buffer ptr and the new jump buffer // that needs to be restored on all exits from the function. This is an // alloca because the value needs to be live across invokes. - unsigned Align = STTI ? STTI->getJumpBufAlignment() : 0; + unsigned Align = TLI ? TLI->getJumpBufAlignment() : 0; AllocaInst *JmpBuf = new AllocaInst(JBLinkTy, 0, Align, "jblink", F.begin()->begin()); @@ -577,10 +575,6 @@ bool LowerInvoke::insertExpensiveEHSupport(Function &F) { } bool LowerInvoke::runOnFunction(Function &F) { - TargetTransformInfo *TTI = getAnalysisIfAvailable<TargetTransformInfo>(); - if (TTI) - STTI = TTI->getScalarTargetTransformInfo(); - if (useExpensiveEHSupport) return insertExpensiveEHSupport(F); else diff --git a/lib/Transforms/Utils/SimplifyLibCalls.cpp b/lib/Transforms/Utils/SimplifyLibCalls.cpp index bd28ec35273..b15acdff633 100644 --- a/lib/Transforms/Utils/SimplifyLibCalls.cpp +++ b/lib/Transforms/Utils/SimplifyLibCalls.cpp @@ -183,14 +183,30 @@ struct StrCpyChkOpt : public InstFortifiedLibCallOptimization { FT->getParamType(2) != TD->getIntPtrType(Context)) return 0; + Value *Dst = CI->getArgOperand(0), *Src = CI->getArgOperand(1); + if (Dst == Src) // __strcpy_chk(x,x) -> x + return Src; + // If a) we don't have any length information, or b) we know this will // fit then just lower to a plain st[rp]cpy. Otherwise we'll keep our // st[rp]cpy_chk call which may fail at runtime if the size is too long. // TODO: It might be nice to get a maximum length out of the possible // string lengths for varying. if (isFoldable(2, 1, true)) { - Value *Ret = EmitStrCpy(CI->getArgOperand(0), CI->getArgOperand(1), B, TD, - TLI, Name.substr(2, 6)); + Value *Ret = EmitStrCpy(Dst, Src, B, TD, TLI, Name.substr(2, 6)); return Ret; } else { + // Maybe we can still fold __strcpy_chk to __memcpy_chk. + uint64_t Len = GetStringLength(Src); + if (Len == 0) return 0; + + // This optimization requires DataLayout. + if (!TD) return 0; + + Value *Ret = + EmitMemCpyChk(Dst, Src, + ConstantInt::get(TD->getIntPtrType(Context), Len), + CI->getArgOperand(2), B, TD, TLI); return Ret; } return 0; @@ -497,6 +513,35 @@ struct StrNCmpOpt : public LibCallOptimization { } }; +struct StrCpyOpt : public LibCallOptimization { + virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { + // Verify the "strcpy" function prototype. + FunctionType *FT = Callee->getFunctionType(); + if (FT->getNumParams() != 2 || + FT->getReturnType() != FT->getParamType(0) || + FT->getParamType(0) != FT->getParamType(1) || + FT->getParamType(0) != B.getInt8PtrTy()) + return 0; + + Value *Dst = CI->getArgOperand(0), *Src = CI->getArgOperand(1); + if (Dst == Src) // strcpy(x,x) -> x + return Src; + + // These optimizations require DataLayout. + if (!TD) return 0; + + // See if we can get the length of the input string. + uint64_t Len = GetStringLength(Src); + if (Len == 0) return 0; + + // We have enough information to now generate the memcpy call to do the + // copy for us. Make a memcpy to copy the nul byte with align = 1. + B.CreateMemCpy(Dst, Src, + ConstantInt::get(TD->getIntPtrType(*Context), Len), 1); + return Dst; + } +}; + } // End anonymous namespace.
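Both the __strcpy_chk fallback above and the relocated StrCpyOpt rest on the same observation: once the source string's length is statically known, the copy (including the nul terminator) can become a fixed-size memcpy. In source terms the transformation looks like this (illustrative):

    #include <cstring>

    void before(char *Dst) {
      strcpy(Dst, "hello");     // source length is known: 5 chars + nul
    }

    void after(char *Dst) {
      memcpy(Dst, "hello", 6);  // same bytes, copies the nul explicitly,
                                // and the size is a compile-time constant
    }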
namespace llvm { @@ -520,6 +565,7 @@ class LibCallSimplifierImpl { StrRChrOpt StrRChr; StrCmpOpt StrCmp; StrNCmpOpt StrNCmp; + StrCpyOpt StrCpy; void initOptimizations(); public: @@ -540,14 +586,15 @@ void LibCallSimplifierImpl::initOptimizations() { Optimizations["__stpcpy_chk"] = &StrCpyChk; Optimizations["__strncpy_chk"] = &StrNCpyChk; Optimizations["__stpncpy_chk"] = &StrNCpyChk; - Optimizations["strcmp"] = &StrCmp; - Optimizations["strncmp"] = &StrNCmp; // String and memory library call optimizations. Optimizations["strcat"] = &StrCat; Optimizations["strncat"] = &StrNCat; Optimizations["strchr"] = &StrChr; Optimizations["strrchr"] = &StrRChr; + Optimizations["strcmp"] = &StrCmp; + Optimizations["strncmp"] = &StrNCmp; + Optimizations["strcpy"] = &StrCpy; } Value *LibCallSimplifierImpl::optimizeCall(CallInst *CI) { diff --git a/lib/Transforms/Vectorize/CMakeLists.txt b/lib/Transforms/Vectorize/CMakeLists.txt index 06cf1e4e532..e64034ab26b 100644 --- a/lib/Transforms/Vectorize/CMakeLists.txt +++ b/lib/Transforms/Vectorize/CMakeLists.txt @@ -1,6 +1,7 @@ add_llvm_library(LLVMVectorize BBVectorize.cpp Vectorize.cpp + LoopVectorize.cpp ) add_dependencies(LLVMVectorize intrinsics_gen) diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp new file mode 100644 index 00000000000..6fbf342d06c --- /dev/null +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -0,0 +1,1358 @@ +//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops +// and generates target-independent LLVM-IR. Legalization of the IR is done +// in the codegen. However, the vectorizer uses (will use) the codegen +// interfaces to generate IR that is likely to result in an optimal binary. +// +// The loop vectorizer combines consecutive loop iterations into a single +// 'wide' iteration. After this transformation the index is incremented +// by the SIMD vector width, and not by one. +// +// This pass has three parts: +// 1. The main loop pass that drives the different parts. +// 2. LoopVectorizationLegality - A helper class that checks for the legality +// of the vectorization. +// 3. SingleBlockLoopVectorizer - A helper class that performs the actual +// widening of instructions. +//===----------------------------------------------------------------------===// +// +// The reduction-variable vectorization is based on the paper: +// D. Nuzman and R. Henderson. Multi-platform Auto-vectorization. +// +// Variable uniformity checks are inspired by: +// Karrenberg, R. and Hack, S. Whole Function Vectorization. +// +// Other ideas/concepts are from: +// A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
+// +//===----------------------------------------------------------------------===// +#define LV_NAME "loop-vectorize" +#define DEBUG_TYPE LV_NAME +#include "llvm/Constants.h" +#include "llvm/DerivedTypes.h" +#include "llvm/Instructions.h" +#include "llvm/LLVMContext.h" +#include "llvm/Pass.h" +#include "llvm/Analysis/LoopPass.h" +#include "llvm/Value.h" +#include "llvm/Function.h" +#include "llvm/Analysis/Verifier.h" +#include "llvm/Module.h" +#include "llvm/Type.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/AliasSetTracker.h" +#include "llvm/Transforms/Scalar.h" +#include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/ScalarEvolutionExpressions.h" +#include "llvm/Analysis/ScalarEvolutionExpander.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/DataLayout.h" +#include "llvm/Transforms/Utils/Local.h" +#include <algorithm> +using namespace llvm; + +static cl::opt<unsigned> +DefaultVectorizationFactor("default-loop-vectorize-width", + cl::init(4), cl::Hidden, + cl::desc("Set the default loop vectorization width")); +namespace { + +// Forward declaration. +class LoopVectorizationLegality; + +/// SingleBlockLoopVectorizer vectorizes loops which contain only one basic +/// block to a specified vectorization factor (VF). +/// This class performs the widening of scalars into vectors, or multiple +/// scalars. This class also implements the following features: +/// * It inserts an epilogue loop for handling loops that don't have iteration +/// counts that are known to be a multiple of the vectorization factor. +/// * It handles the code generation for reduction variables. +/// * Scalarization (implementation using scalars) of un-vectorizable +/// instructions. +/// SingleBlockLoopVectorizer does not perform any vectorization-legality +/// checks, and relies on the caller to check for the different legality +/// aspects. The SingleBlockLoopVectorizer relies on the +/// LoopVectorizationLegality class to provide information about the induction +/// and reduction variables that were found to a given vectorization factor. +class SingleBlockLoopVectorizer { +public: + /// Ctor. + SingleBlockLoopVectorizer(Loop *Orig, ScalarEvolution *Se, LoopInfo *Li, + LPPassManager *Lpm, unsigned VecWidth): + OrigLoop(Orig), SE(Se), LI(Li), LPM(Lpm), VF(VecWidth), + Builder(Se->getContext()), Induction(0), OldInduction(0) { } + + // Perform the actual loop widening (vectorization). + void vectorize(LoopVectorizationLegality *Legal) { + ///Create a new empty loop. Unlink the old loop and connect the new one. + createEmptyLoop(Legal); + /// Widen each instruction in the old loop to a new one in the new loop. + /// Use the Legality module to find the induction and reduction variables. + vectorizeLoop(Legal); + // register the new loop. + cleanup(); + } + +private: + /// Create an empty loop, based on the loop ranges of the old loop. + void createEmptyLoop(LoopVectorizationLegality *Legal); + /// Copy and widen the instructions from the old loop. + void vectorizeLoop(LoopVectorizationLegality *Legal); + /// Insert the new loop to the loop hierarchy and pass manager. + void cleanup(); + + /// This instruction is un-vectorizable. Implement it as a sequence + /// of scalars. 
+ void scalarizeInstruction(Instruction *Instr); + + /// Create a broadcast instruction. This method generates a broadcast + /// instruction (shuffle) for loop invariant values and for the induction + /// value. If this is the induction variable then we extend it to N, N+1, ... + /// this is needed because each iteration in the loop corresponds to a SIMD + /// element. + Value *getBroadcastInstrs(Value *V); + + /// This is a helper function used by getBroadcastInstrs. It adds 0, 1, 2 .. + /// for each element in the vector. Starting from zero. + Value *getConsecutiveVector(Value* Val); + + /// When we go over instructions in the basic block we rely on previous + /// values within the current basic block or on loop invariant values. + /// When we widen (vectorize) values we place them in the map. If the values + /// are not within the map, they have to be loop invariant, so we simply + /// broadcast them into a vector. + Value *getVectorValue(Value *V); + + /// Get a uniform vector of constant integers. We use this to get + /// vectors of ones and zeros for the reduction code. + Constant* getUniformVector(unsigned Val, Type* ScalarTy); + + typedef DenseMap<Value*, Value*> ValueMap; + + /// The original loop. + Loop *OrigLoop; + // Scev analysis to use. + ScalarEvolution *SE; + // Loop Info. + LoopInfo *LI; + // Loop Pass Manager; + LPPassManager *LPM; + // The vectorization factor to use. + unsigned VF; + + // The builder that we use + IRBuilder<> Builder; + + // --- Vectorization state --- + + /// Middle Block between the vector and the scalar. + BasicBlock *LoopMiddleBlock; + ///The ExitBlock of the scalar loop. + BasicBlock *LoopExitBlock; + ///The vector loop body. + BasicBlock *LoopVectorBody; + ///The scalar loop body. + BasicBlock *LoopScalarBody; + ///The first bypass block. + BasicBlock *LoopBypassBlock; + + /// The new Induction variable which was added to the new block. + PHINode *Induction; + /// The induction variable of the old basic block. + PHINode *OldInduction; + // Maps scalars to widened vectors. + ValueMap WidenMap; +}; + +/// LoopVectorizationLegality checks if it is legal to vectorize a loop, and +/// to what vectorization factor. +/// This class does not look at the profitability of vectorization, only the +/// legality. This class has two main kinds of checks: +/// * Memory checks - The code in canVectorizeMemory checks if vectorization +/// will change the order of memory accesses in a way that will change the +/// correctness of the program. +/// * Scalars checks - The code in canVectorizeBlock checks for a number +/// of different conditions, such as the availability of a single induction +/// variable, that all types are supported and vectorize-able, etc. +/// This code reflects the capabilities of SingleBlockLoopVectorizer. +/// This class is also used by SingleBlockLoopVectorizer for identifying +/// induction variable and the different reduction variables. +class LoopVectorizationLegality { +public: + LoopVectorizationLegality(Loop *Lp, ScalarEvolution *Se, DataLayout *Dl): + TheLoop(Lp), SE(Se), DL(Dl), Induction(0) { } + + /// This represents the kinds of reductions that we support. + /// We use the enum values to hold the 'identity' value for + /// each operand. This value does not change the result if applied. + enum ReductionKind { + NoReduction = -1, /// Not a reduction. + IntegerAdd = 0, /// Sum of numbers. + IntegerMult = 1 /// Product of numbers. + }; + + /// This POD struct holds information about reduction variables. 
+ struct ReductionDescriptor { + // Default C'tor + ReductionDescriptor(): + StartValue(0), LoopExitInstr(0), Kind(NoReduction) {} + + // C'tor. + ReductionDescriptor(Value *Start, Instruction *Exit, ReductionKind K): + StartValue(Start), LoopExitInstr(Exit), Kind(K) {} + + // The starting value of the reduction. + // It does not have to be zero! + Value *StartValue; + // The instruction who's value is used outside the loop. + Instruction *LoopExitInstr; + // The kind of the reduction. + ReductionKind Kind; + }; + + /// ReductionList contains the reduction descriptors for all + /// of the reductions that were found in the loop. + typedef DenseMap<PHINode*, ReductionDescriptor> ReductionList; + + /// Returns the maximum vectorization factor that we *can* use to vectorize + /// this loop. This does not mean that it is profitable to vectorize this + /// loop, only that it is legal to do so. This may be a large number. We + /// can vectorize to any SIMD width below this number. + unsigned getLoopMaxVF(); + + /// Returns the Induction variable. + PHINode *getInduction() {return Induction;} + + /// Returns the reduction variables found in the loop. + ReductionList *getReductionVars() { return &Reductions; } + + /// Check if the pointer returned by this GEP is consecutive + /// when the index is vectorized. This happens when the last + /// index of the GEP is consecutive, like the induction variable. + /// This check allows us to vectorize A[idx] into a wide load/store. + bool isConsecutiveGep(Value *Ptr); + +private: + /// Check if a single basic block loop is vectorizable. + /// At this point we know that this is a loop with a constant trip count + /// and we only need to check individual instructions. + bool canVectorizeBlock(BasicBlock &BB); + + /// When we vectorize loops we may change the order in which + /// we read and write from memory. This method checks if it is + /// legal to vectorize the code, considering only memory constrains. + /// Returns true if BB is vectorizable + bool canVectorizeMemory(BasicBlock &BB); + + // Check if a pointer value is known to be disjoint. + // Example: Alloca, Global, NoAlias. + bool isIdentifiedSafeObject(Value* Val); + + /// Returns True, if 'Phi' is the kind of reduction variable for type + /// 'Kind'. If this is a reduction variable, it adds it to ReductionList. + bool AddReductionVar(PHINode *Phi, ReductionKind Kind); + /// Returns true if the instruction I can be a reduction variable of type + /// 'Kind'. + bool isReductionInstr(Instruction *I, ReductionKind Kind); + /// Returns True, if 'Phi' is an induction variable. + bool isInductionVariable(PHINode *Phi); + + /// The loop that we evaluate. + Loop *TheLoop; + /// Scev analysis. + ScalarEvolution *SE; + /// DataLayout analysis. + DataLayout *DL; + + // --- vectorization state --- // + + /// Holds the induction variable. + PHINode *Induction; + /// Holds the reduction variables. + ReductionList Reductions; + /// Allowed outside users. This holds the reduction + /// vars which can be accessed from outside the loop. + SmallPtrSet<Value*, 4> AllowedExit; +}; + +struct LoopVectorize : public LoopPass { + static char ID; // Pass identification, replacement for typeid + + LoopVectorize() : LoopPass(ID) { + initializeLoopVectorizePass(*PassRegistry::getPassRegistry()); + } + + ScalarEvolution *SE; + DataLayout *DL; + LoopInfo *LI; + + virtual bool runOnLoop(Loop *L, LPPassManager &LPM) { + // We only vectorize innermost loops. 
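The ReductionDescriptor/ReductionList machinery above captures scalar loops of the following shape; the fields map onto the source as annotated (names invented for illustration):

    int sumArray(const int *A, int N) {
      int Sum = 42;          // StartValue: any loop-invariant value, not
                             // necessarily the identity (0 for addition)
      for (int i = 0; i < N; ++i)
        Sum += A[i];         // the add is the LoopExitInstr: its value is
                             // the one consumed after the loop exits
      return Sum;            // Kind == IntegerAdd
    }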
+ if (!L->empty()) + return false; + + SE = &getAnalysis<ScalarEvolution>(); + DL = getAnalysisIfAvailable<DataLayout>(); + LI = &getAnalysis<LoopInfo>(); + + DEBUG(dbgs() << "LV: Checking a loop in \"" << + L->getHeader()->getParent()->getName() << "\"\n"); + + // Check if it is legal to vectorize the loop. + LoopVectorizationLegality LVL(L, SE, DL); + unsigned MaxVF = LVL.getLoopMaxVF(); + + // Check that we can vectorize this loop using the chosen vectorization + // width. + if (MaxVF < DefaultVectorizationFactor) { + DEBUG(dbgs() << "LV: non-vectorizable MaxVF ("<< MaxVF << ").\n"); + return false; + } + + DEBUG(dbgs() << "LV: Found a vectorizable loop ("<< MaxVF << ").\n"); + + // If we decided that it is *legal* to vectorize the loop then do it. + SingleBlockLoopVectorizer LB(L, SE, LI, &LPM, DefaultVectorizationFactor); + LB.vectorize(&LVL); + + DEBUG(verifyFunction(*L->getHeader()->getParent())); + return true; + } + + virtual void getAnalysisUsage(AnalysisUsage &AU) const { + LoopPass::getAnalysisUsage(AU); + AU.addRequiredID(LoopSimplifyID); + AU.addRequiredID(LCSSAID); + AU.addRequired<LoopInfo>(); + AU.addRequired<ScalarEvolution>(); + } + +}; + +Value *SingleBlockLoopVectorizer::getBroadcastInstrs(Value *V) { + // Instructions that access the old induction variable + // actually want to get the new one. + if (V == OldInduction) + V = Induction; + // Create the types. + LLVMContext &C = V->getContext(); + Type *VTy = VectorType::get(V->getType(), VF); + Type *I32 = IntegerType::getInt32Ty(C); + Constant *Zero = ConstantInt::get(I32, 0); + Value *Zeros = ConstantAggregateZero::get(VectorType::get(I32, VF)); + Value *UndefVal = UndefValue::get(VTy); + // Insert the value into a new vector. + Value *SingleElem = Builder.CreateInsertElement(UndefVal, V, Zero); + // Broadcast the scalar into all locations in the vector. + Value *Shuf = Builder.CreateShuffleVector(SingleElem, UndefVal, Zeros, + "broadcast"); + // We are accessing the induction variable. Make sure to promote the + // index for each consecutive SIMD lane. This adds 0,1,2 ... to all lanes. + if (V == Induction) + return getConsecutiveVector(Shuf); + return Shuf; +} + +Value *SingleBlockLoopVectorizer::getConsecutiveVector(Value* Val) { + assert(Val->getType()->isVectorTy() && "Must be a vector"); + assert(Val->getType()->getScalarType()->isIntegerTy() && + "Elem must be an integer"); + // Create the types. + Type *ITy = Val->getType()->getScalarType(); + VectorType *Ty = cast<VectorType>(Val->getType()); + unsigned VLen = Ty->getNumElements(); + SmallVector<Constant*, 8> Indices; + + // Create a vector of consecutive numbers from zero to VF. + for (unsigned i = 0; i < VLen; ++i) + Indices.push_back(ConstantInt::get(ITy, i)); + + // Add the consecutive indices to the vector value. + Constant *Cv = ConstantVector::get(Indices); + assert(Cv->getType() == Val->getType() && "Invalid consecutive vec"); + return Builder.CreateAdd(Val, Cv, "induction"); +} + +bool LoopVectorizationLegality::isConsecutiveGep(Value *Ptr) { + GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr); + if (!Gep) + return false; + + unsigned NumOperands = Gep->getNumOperands(); + Value *LastIndex = Gep->getOperand(NumOperands - 1); + + // Check that all of the gep indices are uniform except for the last. + for (unsigned i = 0; i < NumOperands - 1; ++i) + if (!SE->isLoopInvariant(SE->getSCEV(Gep->getOperand(i)), TheLoop)) + return false; + + // We can emit wide load/stores only if the last index is the induction + // variable.
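The consecutive-GEP rule stated above (and completed by the SCEV step check that follows) accepts only addresses whose final index advances by one per iteration while all other indices stay loop-invariant. Conceptually it separates these two access patterns (a sketch, not pass code):

    void patterns(int *Out, const int *In, const int *Idx, int N) {
      for (int i = 0; i < N; ++i) {
        Out[i] = In[i];        // last GEP index is the induction variable:
                               // consecutive, becomes one wide load/store
        Out[i] += In[Idx[i]];  // last index is data-dependent: a gather,
                               // so the access must be scalarized
      }
    }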
+ const SCEV *Last = SE->getSCEV(LastIndex); + if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Last)) { + const SCEV *Step = AR->getStepRecurrence(*SE); + + // The memory is consecutive because the last index is consecutive + // and all other indices are loop invariant. + if (Step->isOne()) + return true; + } + + return false; +} + +Value *SingleBlockLoopVectorizer::getVectorValue(Value *V) { + assert(!V->getType()->isVectorTy() && "Can't widen a vector"); + // If we saved a vectorized copy of V, use it. + Value *&MapEntry = WidenMap[V]; + if (MapEntry) + return MapEntry; + + // Broadcast V and save the value for future uses. + Value *B = getBroadcastInstrs(V); + MapEntry = B; + return B; +} + +Constant* +SingleBlockLoopVectorizer::getUniformVector(unsigned Val, Type* ScalarTy) { + SmallVector<Constant*, 8> Indices; + // Create a vector of consecutive numbers from zero to VF. + for (unsigned i = 0; i < VF; ++i) + Indices.push_back(ConstantInt::get(ScalarTy, Val)); + + // Add the consecutive indices to the vector value. + return ConstantVector::get(Indices); +} + +void SingleBlockLoopVectorizer::scalarizeInstruction(Instruction *Instr) { + assert(!Instr->getType()->isAggregateType() && "Can't handle vectors"); + // Holds vector parameters or scalars, in case of uniform vals. + SmallVector<Value*, 8> Params; + + // Find all of the vectorized parameters. + for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) { + Value *SrcOp = Instr->getOperand(op); + + // If we are accessing the old induction variable, use the new one. + if (SrcOp == OldInduction) { + Params.push_back(getBroadcastInstrs(Induction)); + continue; + } + + // Try using previously calculated values. + Instruction *SrcInst = dyn_cast<Instruction>(SrcOp); + + // If the src is an instruction that appeared earlier in the basic block + // then it should already be vectorized. + if (SrcInst && SrcInst->getParent() == Instr->getParent()) { + assert(WidenMap.count(SrcInst) && "Source operand is unavailable"); + // The parameter is a vector value from earlier. + Params.push_back(WidenMap[SrcInst]); + } else { + // The parameter is a scalar from outside the loop. Maybe even a constant. + Params.push_back(SrcOp); + } + } + + assert(Params.size() == Instr->getNumOperands() && + "Invalid number of operands"); + + // Does this instruction return a value? + bool IsVoidRetTy = Instr->getType()->isVoidTy(); + Value *VecResults = 0; + + // If we have a return value, create an empty vector. We place the scalarized + // instructions in this vector. + if (!IsVoidRetTy) + VecResults = UndefValue::get(VectorType::get(Instr->getType(), VF)); + + // For each scalar that we create: + for (unsigned i = 0; i < VF; ++i) { + Instruction *Cloned = Instr->clone(); + if (!IsVoidRetTy) + Cloned->setName(Instr->getName() + ".cloned"); + // Replace the operands of the cloned instructions with extracted scalars. + for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) { + Value *Op = Params[op]; + // Param is a vector. Need to extract the right lane. + if (Op->getType()->isVectorTy()) + Op = Builder.CreateExtractElement(Op, Builder.getInt32(i)); + Cloned->setOperand(op, Op); + } + + // Place the cloned scalar in the new loop. + Builder.Insert(Cloned); + + // If the original scalar returns a value we need to place it in a vector + // so that future users will be able to use it.
+ if (!IsVoidRetTy) + VecResults = Builder.CreateInsertElement(VecResults, Cloned, + Builder.getInt32(i)); + } + + if (!IsVoidRetTy) + WidenMap[Instr] = VecResults; +} + +void SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) { + /* + In this function we generate a new loop. The new loop will contain + the vectorized instructions while the old loop will continue to run the + scalar remainder. + + [ ] <-- vector loop bypass. + / | + / v +| [ ] <-- vector pre header. +| | +| v +| [ ] \ +| [ ]_| <-- vector loop. +| | + \ v + >[ ] <--- middle-block. + / | + / v +| [ ] <--- new preheader. +| | +| v +| [ ] \ +| [ ]_| <-- old scalar loop to handle remainder. + \ | + \ v + >[ ] <-- exit block. + ... + */ + + // This is the original scalar-loop preheader. + BasicBlock *BypassBlock = OrigLoop->getLoopPreheader(); + BasicBlock *ExitBlock = OrigLoop->getExitBlock(); + assert(ExitBlock && "Must have an exit block"); + + assert(OrigLoop->getNumBlocks() == 1 && "Invalid loop"); + assert(BypassBlock && "Invalid loop structure"); + + BasicBlock *VectorPH = + BypassBlock->splitBasicBlock(BypassBlock->getTerminator(), "vector.ph"); + BasicBlock *VecBody = VectorPH->splitBasicBlock(VectorPH->getTerminator(), + "vector.body"); + + BasicBlock *MiddleBlock = VecBody->splitBasicBlock(VecBody->getTerminator(), + "middle.block"); + BasicBlock *ScalarPH = + MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(), + "scalar.preheader"); + // Find the induction variable. + BasicBlock *OldBasicBlock = OrigLoop->getHeader(); + OldInduction = Legal->getInduction(); + assert(OldInduction && "We must have a single phi node."); + Type *IdxTy = OldInduction->getType(); + + // Use this IR builder to create the loop instructions (Phi, Br, Cmp) + // inside the loop. + Builder.SetInsertPoint(VecBody->getFirstInsertionPt()); + + // Generate the induction variable. + Induction = Builder.CreatePHI(IdxTy, 2, "index"); + Constant *Zero = ConstantInt::get(IdxTy, 0); + Constant *Step = ConstantInt::get(IdxTy, VF); + + // Find the loop boundaries. + const SCEV *ExitCount = SE->getExitCount(OrigLoop, OrigLoop->getHeader()); + assert(ExitCount != SE->getCouldNotCompute() && "Invalid loop count"); + + // Get the total trip count from the count by adding 1. + ExitCount = SE->getAddExpr(ExitCount, + SE->getConstant(ExitCount->getType(), 1)); + + // Expand the trip count and place the new instructions in the preheader. + // Notice that the pre-header does not change, only the loop body. + SCEVExpander Exp(*SE, "induction"); + Instruction *Loc = BypassBlock->getTerminator(); + + // We may need to extend the index in case there is a type mismatch. + // We know that the count starts at zero and does not overflow. + // We are using Zext because it should be less expensive. + if (ExitCount->getType() != Induction->getType()) + ExitCount = SE->getZeroExtendExpr(ExitCount, IdxTy); + + // Count holds the overall loop count (N). + Value *Count = Exp.expandCodeFor(ExitCount, Induction->getType(), Loc); + // Now we need to generate the expression for N - (N % VF), which is + // the part that the vectorized body will execute. + Constant *CIVF = ConstantInt::get(IdxTy, VF); + Value *R = BinaryOperator::CreateURem(Count, CIVF, "n.mod.vf", Loc); + Value *CountRoundDown = BinaryOperator::CreateSub(Count, R, "n.vec", Loc); + + // Now, compare the new count to zero. If it is zero, jump to the scalar part. 
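The trip-count arithmetic just computed (Count, n.mod.vf, n.vec) realizes the loop skeleton from the ASCII diagram above. Modeled directly in C++, assuming VF = 4 (a sketch of the emitted control flow, not pass code):

    void axpy(float *Y, const float *X, float A, unsigned N) {
      unsigned NVec = N - N % 4;       // "n.vec": what the vector body covers
      unsigned i = 0;
      if (NVec != 0)                   // "cmp.zero": bypass straight to the
        for (; i < NVec; i += 4)       // scalar loop when there is no work
          for (unsigned l = 0; l < 4; ++l)  // one wide iteration = 4 lanes
            Y[i + l] += A * X[i + l];
      for (; i < N; ++i)               // old scalar loop runs the remainder
        Y[i] += A * X[i];
    }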
+ Value *Cmp = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, + CountRoundDown, ConstantInt::getNullValue(IdxTy), + "cmp.zero", Loc); + BranchInst::Create(MiddleBlock, VectorPH, Cmp, Loc); + // Remove the old terminator. + Loc->eraseFromParent(); + + // Add a check in the middle block to see if we have completed + // all of the iterations in the first vector loop. + // If (N - N%VF) == N, then we *don't* need to run the remainder. + Value *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count, + CountRoundDown, "cmp.n", + MiddleBlock->getTerminator()); + + BranchInst::Create(ExitBlock, ScalarPH, CmpN, MiddleBlock->getTerminator()); + // Remove the old terminator. + MiddleBlock->getTerminator()->eraseFromParent(); + + // Create i+1 and fill the PHINode. + Value *NextIdx = Builder.CreateAdd(Induction, Step, "index.next"); + Induction->addIncoming(Zero, VectorPH); + Induction->addIncoming(NextIdx, VecBody); + // Create the compare. + Value *ICmp = Builder.CreateICmpEQ(NextIdx, CountRoundDown); + Builder.CreateCondBr(ICmp, MiddleBlock, VecBody); + + // Now we have two terminators. Remove the old one from the block. + VecBody->getTerminator()->eraseFromParent(); + + // Fix the scalar body iteration count. + unsigned BlockIdx = OldInduction->getBasicBlockIndex(ScalarPH); + OldInduction->setIncomingValue(BlockIdx, CountRoundDown); + + // Get ready to start creating new instructions into the vectorized body. + Builder.SetInsertPoint(VecBody->getFirstInsertionPt()); + + // Register the new loop. + Loop* Lp = new Loop(); + LPM->insertLoop(Lp, OrigLoop->getParentLoop()); + + Lp->addBasicBlockToLoop(VecBody, LI->getBase()); + + Loop *ParentLoop = OrigLoop->getParentLoop(); + if (ParentLoop) { + ParentLoop->addBasicBlockToLoop(ScalarPH, LI->getBase()); + ParentLoop->addBasicBlockToLoop(VectorPH, LI->getBase()); + ParentLoop->addBasicBlockToLoop(MiddleBlock, LI->getBase()); + } + + // Save the state. + LoopMiddleBlock = MiddleBlock; + LoopExitBlock = ExitBlock; + LoopVectorBody = VecBody; + LoopScalarBody = OldBasicBlock; + LoopBypassBlock = BypassBlock; +} + +void +SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { + typedef SmallVector<PHINode*, 4> PhiVector; + BasicBlock &BB = *OrigLoop->getHeader(); + Constant *Zero = ConstantInt::get( + IntegerType::getInt32Ty(BB.getContext()), 0); + + // In order to support reduction variables we need to be able to vectorize + // Phi nodes. Phi nodes have cycles, so we need to vectorize them in two + // stages. First, we create a new vector PHI node with no incoming edges. + // We use this value when we vectorize all of the instructions that use the + // PHI. Next, after all of the instructions in the block are complete we + // add the new incoming edges to the PHI. At this point all of the + // instructions in the basic block are vectorized, so we can use them to + // construct the PHI. + PhiVector PHIsToFix; + + // For each instruction in the old loop. + for (BasicBlock::iterator it = BB.begin(), e = BB.end(); it != e; ++it) { + Instruction *Inst = it; + + switch (Inst->getOpcode()) { + case Instruction::Br: + // Nothing to do for PHIs and BR, since we already took care of the + // loop control flow instructions. + continue; + case Instruction::PHI:{ + PHINode* P = cast<PHINode>(Inst); + // Special handling for the induction var. + if (OldInduction == Inst) + continue; + // This is phase one of vectorizing PHIs. + // This has to be a reduction variable.
+ assert(Legal->getReductionVars()->count(P) && "Not a Reduction"); + Type *VecTy = VectorType::get(Inst->getType(), VF); + WidenMap[Inst] = Builder.CreatePHI(VecTy, 2, "vec.phi"); + PHIsToFix.push_back(P); + continue; + } + case Instruction::Add: + case Instruction::FAdd: + case Instruction::Sub: + case Instruction::FSub: + case Instruction::Mul: + case Instruction::FMul: + case Instruction::UDiv: + case Instruction::SDiv: + case Instruction::FDiv: + case Instruction::URem: + case Instruction::SRem: + case Instruction::FRem: + case Instruction::Shl: + case Instruction::LShr: + case Instruction::AShr: + case Instruction::And: + case Instruction::Or: + case Instruction::Xor: { + // Just widen binops. + BinaryOperator *BinOp = dyn_cast<BinaryOperator>(Inst); + Value *A = getVectorValue(Inst->getOperand(0)); + Value *B = getVectorValue(Inst->getOperand(1)); + // Use this vector value for all users of the original instruction. + WidenMap[Inst] = Builder.CreateBinOp(BinOp->getOpcode(), A, B); + break; + } + case Instruction::Select: { + // Widen selects. + // If the selector is loop invariant we can create a select + // instruction with a scalar condition. Otherwise, use vector-select. + Value *Cond = Inst->getOperand(0); + bool InvariantCond = SE->isLoopInvariant(SE->getSCEV(Cond), OrigLoop); + + // The condition can be loop invariant but still defined inside the + // loop. This means that we can't just use the original 'cond' value. + // We have to take the 'vectorized' value and pick the first lane. + // Instcombine will make this a no-op. + Cond = getVectorValue(Cond); + if (InvariantCond) + Cond = Builder.CreateExtractElement(Cond, Builder.getInt32(0)); + + Value *Op0 = getVectorValue(Inst->getOperand(1)); + Value *Op1 = getVectorValue(Inst->getOperand(2)); + WidenMap[Inst] = Builder.CreateSelect(Cond, Op0, Op1); + break; + } + + case Instruction::ICmp: + case Instruction::FCmp: { + // Widen compares. Generate vector compares. + bool FCmp = (Inst->getOpcode() == Instruction::FCmp); + CmpInst *Cmp = dyn_cast<CmpInst>(Inst); + Value *A = getVectorValue(Inst->getOperand(0)); + Value *B = getVectorValue(Inst->getOperand(1)); + if (FCmp) + WidenMap[Inst] = Builder.CreateFCmp(Cmp->getPredicate(), A, B); + else + WidenMap[Inst] = Builder.CreateICmp(Cmp->getPredicate(), A, B); + break; + } + + case Instruction::Store: { + // Attempt to issue a wide store. + StoreInst *SI = dyn_cast<StoreInst>(Inst); + Type *StTy = VectorType::get(SI->getValueOperand()->getType(), VF); + Value *Ptr = SI->getPointerOperand(); + unsigned Alignment = SI->getAlignment(); + GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr); + // This store does not use GEPs. + if (!Legal->isConsecutiveGep(Gep)) { + scalarizeInstruction(Inst); + break; + } + + // The last index does not have to be the induction. It can be + // consecutive and be a function of the index. For example A[I+1]; + unsigned NumOperands = Gep->getNumOperands(); + Value *LastIndex = getVectorValue(Gep->getOperand(NumOperands -1)); + LastIndex = Builder.CreateExtractElement(LastIndex, Builder.getInt32(0)); + + // Create the new GEP with the new induction variable. + GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone()); + Gep2->setOperand(NumOperands - 1, LastIndex); + Ptr = Builder.Insert(Gep2); + Ptr = Builder.CreateBitCast(Ptr, StTy->getPointerTo()); + Value *Val = getVectorValue(SI->getValueOperand()); + Builder.CreateStore(Val, Ptr)->setAlignment(Alignment); + break; + } + case Instruction::Load: { + // Attempt to issue a wide load. 
+      LoadInst *LI = dyn_cast<LoadInst>(Inst);
+      Type *RetTy = VectorType::get(LI->getType(), VF);
+      Value *Ptr = LI->getPointerOperand();
+      unsigned Alignment = LI->getAlignment();
+      GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr);
+
+      // We don't have a gep. Scalarize the load.
+      if (!Legal->isConsecutiveGep(Gep)) {
+        scalarizeInstruction(Inst);
+        break;
+      }
+
+      // The last index does not have to be the induction. It can be
+      // consecutive and be a function of the index. For example A[I+1];
+      unsigned NumOperands = Gep->getNumOperands();
+      Value *LastIndex = getVectorValue(Gep->getOperand(NumOperands - 1));
+      LastIndex = Builder.CreateExtractElement(LastIndex, Builder.getInt32(0));
+
+      // Create the new GEP with the new induction variable.
+      GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone());
+      Gep2->setOperand(NumOperands - 1, LastIndex);
+      Ptr = Builder.Insert(Gep2);
+      Ptr = Builder.CreateBitCast(Ptr, RetTy->getPointerTo());
+      LI = Builder.CreateLoad(Ptr);
+      LI->setAlignment(Alignment);
+      // Use this vector value for all users of the load.
+      WidenMap[Inst] = LI;
+      break;
+    }
+    case Instruction::ZExt:
+    case Instruction::SExt:
+    case Instruction::FPToUI:
+    case Instruction::FPToSI:
+    case Instruction::FPExt:
+    case Instruction::PtrToInt:
+    case Instruction::IntToPtr:
+    case Instruction::SIToFP:
+    case Instruction::UIToFP:
+    case Instruction::Trunc:
+    case Instruction::FPTrunc:
+    case Instruction::BitCast: {
+      // Vectorize casts.
+      CastInst *CI = dyn_cast<CastInst>(Inst);
+      Value *A = getVectorValue(Inst->getOperand(0));
+      Type *DestTy = VectorType::get(CI->getType()->getScalarType(), VF);
+      WidenMap[Inst] = Builder.CreateCast(CI->getOpcode(), A, DestTy);
+      break;
+    }
+
+    default:
+      // All other instructions are unsupported. Scalarize them.
+      scalarizeInstruction(Inst);
+      break;
+    }// end of switch.
+  }// end of for_each instr.
+
+  // At this point every instruction in the original loop is widened to
+  // a vector form. We are almost done. Now, we need to fix the PHI nodes
+  // that we vectorized. The PHI nodes are currently empty because we did
+  // not want to introduce cycles. Notice that the remaining PHI nodes
+  // that we need to fix are reduction variables.
+
+  // Create the 'reduced' values for each of the reduction vars.
+  // The reduced values are the vector values that we scalarize and combine
+  // after the loop is finished.
+  for (PhiVector::iterator it = PHIsToFix.begin(), e = PHIsToFix.end();
+       it != e; ++it) {
+    PHINode *RdxPhi = *it;
+    PHINode *VecRdxPhi = dyn_cast<PHINode>(WidenMap[RdxPhi]);
+    assert(RdxPhi && "Unable to recover vectorized PHI");
+
+    // Find the reduction variable descriptor.
+    assert(Legal->getReductionVars()->count(RdxPhi) &&
+           "Unable to find the reduction variable");
+    LoopVectorizationLegality::ReductionDescriptor RdxDesc =
+      (*Legal->getReductionVars())[RdxPhi];
+
+    // We need to generate a reduction vector from the incoming scalar.
+    // To do so, we need to generate the 'identity' vector and override
+    // one of the elements with the incoming scalar reduction. We need
+    // to do it in the vector-loop preheader.
+    Builder.SetInsertPoint(LoopBypassBlock->getTerminator());
+
+    // This is the vector-clone of the value that leaves the loop.
+    Value *VectorExit = getVectorValue(RdxDesc.LoopExitInstr);
+    Type *VecTy = VectorExit->getType();
+
+    // Find the reduction identity variable. The value of the enum is the
+    // identity. Zero for addition, one for multiplication.
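+    // For example (illustrative, VF = 4), an integer add reduction with
+    // scalar start value %s conceptually uses
+    //   Identity    = <0, 0, 0, 0>
+    //   VectorStart = <%s, 0, 0, 0>
+    // while a multiply reduction would use 1 in place of 0.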
+ unsigned IdentitySclr = RdxDesc.Kind; + Constant *Identity = getUniformVector(IdentitySclr, + VecTy->getScalarType()); + + // This vector is the Identity vector where the first element is the + // incoming scalar reduction. + Value *VectorStart = Builder.CreateInsertElement(Identity, + RdxDesc.StartValue, Zero); + + + // Fix the vector-loop phi. + // We created the induction variable so we know that the + // preheader is the first entry. + BasicBlock *VecPreheader = Induction->getIncomingBlock(0); + + // Reductions do not have to start at zero. They can start with + // any loop invariant values. + VecRdxPhi->addIncoming(VectorStart, VecPreheader); + unsigned SelfEdgeIdx = (RdxPhi)->getBasicBlockIndex(LoopScalarBody); + Value *Val = getVectorValue(RdxPhi->getIncomingValue(SelfEdgeIdx)); + VecRdxPhi->addIncoming(Val, LoopVectorBody); + + // Before each round, move the insertion point right between + // the PHIs and the values we are going to write. + // This allows us to write both PHINodes and the extractelement + // instructions. + Builder.SetInsertPoint(LoopMiddleBlock->getFirstInsertionPt()); + + // This PHINode contains the vectorized reduction variable, or + // the initial value vector, if we bypass the vector loop. + PHINode *NewPhi = Builder.CreatePHI(VecTy, 2, "rdx.vec.exit.phi"); + NewPhi->addIncoming(VectorStart, LoopBypassBlock); + NewPhi->addIncoming(getVectorValue(RdxDesc.LoopExitInstr), LoopVectorBody); + + // Extract the first scalar. + Value *Scalar0 = + Builder.CreateExtractElement(NewPhi, Builder.getInt32(0)); + // Extract and sum the remaining vector elements. + for (unsigned i=1; i < VF; ++i) { + Value *Scalar1 = + Builder.CreateExtractElement(NewPhi, Builder.getInt32(i)); + if (RdxDesc.Kind == LoopVectorizationLegality::IntegerAdd) { + Scalar0 = Builder.CreateAdd(Scalar0, Scalar1); + } else { + Scalar0 = Builder.CreateMul(Scalar0, Scalar1); + } + } + + // Now, we need to fix the users of the reduction variable + // inside and outside of the scalar remainder loop. + // We know that the loop is in LCSSA form. We need to update the + // PHI nodes in the exit blocks. + for (BasicBlock::iterator LEI = LoopExitBlock->begin(), + LEE = LoopExitBlock->end(); LEI != LEE; ++LEI) { + PHINode *LCSSAPhi = dyn_cast<PHINode>(LEI); + if (!LCSSAPhi) continue; + + // All PHINodes need to have a single entry edge, or two if + // we already fixed them. + assert(LCSSAPhi->getNumIncomingValues() < 3 && "Invalid LCSSA PHI"); + + // We found our reduction value exit-PHI. Update it with the + // incoming bypass edge. + if (LCSSAPhi->getIncomingValue(0) == RdxDesc.LoopExitInstr) { + // Add an edge coming from the bypass. + LCSSAPhi->addIncoming(Scalar0, LoopMiddleBlock); + break; + } + }// end of the LCSSA phi scan. + + // Fix the scalar loop reduction variable with the incoming reduction sum + // from the vector body and from the backedge value. + int IncomingEdgeBlockIdx = (RdxPhi)->getBasicBlockIndex(LoopScalarBody); + int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1); // The other block. + (RdxPhi)->setIncomingValue(SelfEdgeBlockIdx, Scalar0); + (RdxPhi)->setIncomingValue(IncomingEdgeBlockIdx, RdxDesc.LoopExitInstr); + }// end of for each redux variable. +} + +void SingleBlockLoopVectorizer::cleanup() { + // The original basic block. + SE->forgetLoop(OrigLoop); +} + +unsigned LoopVectorizationLegality::getLoopMaxVF() { + if (!TheLoop->getLoopPreheader()) { + assert(false && "No preheader!!"); + DEBUG(dbgs() << "LV: Loop not normalized." 
+          << "\n");
+    return 1;
+  }
+
+  // We can only vectorize single basic block loops.
+  unsigned NumBlocks = TheLoop->getNumBlocks();
+  if (NumBlocks != 1) {
+    DEBUG(dbgs() << "LV: Too many blocks:" << NumBlocks << "\n");
+    return 1;
+  }
+
+  // We need to have a loop header.
+  BasicBlock *BB = TheLoop->getHeader();
+  DEBUG(dbgs() << "LV: Found a loop: " << BB->getName() << "\n");
+
+  // Go over each instruction and look at memory deps.
+  if (!canVectorizeBlock(*BB)) {
+    DEBUG(dbgs() << "LV: Can't vectorize this loop header\n");
+    return 1;
+  }
+
+  // ScalarEvolution needs to be able to find the exit count.
+  const SCEV *ExitCount = SE->getExitCount(TheLoop, BB);
+  if (ExitCount == SE->getCouldNotCompute()) {
+    DEBUG(dbgs() << "LV: SCEV could not compute the loop exit count.\n");
+    return 1;
+  }
+
+  DEBUG(dbgs() << "LV: We can vectorize this loop!\n");
+
+  // Okay! We can vectorize. At this point we don't have any other mem analysis
+  // which may limit our maximum vectorization factor, so just return the
+  // maximum SIMD size.
+  return DefaultVectorizationFactor;
+}
+
+bool LoopVectorizationLegality::canVectorizeBlock(BasicBlock &BB) {
+  // Scan the instructions in the block and look for hazards.
+  for (BasicBlock::iterator it = BB.begin(), e = BB.end(); it != e; ++it) {
+    Instruction *I = it;
+
+    PHINode *Phi = dyn_cast<PHINode>(I);
+    if (Phi) {
+      // This should not happen because the loop should be normalized.
+      if (Phi->getNumIncomingValues() != 2) {
+        DEBUG(dbgs() << "LV: Found an invalid PHI.\n");
+        return false;
+      }
+      // We only look at integer phi nodes.
+      if (!Phi->getType()->isIntegerTy()) {
+        DEBUG(dbgs() << "LV: Found a non-int PHI.\n");
+        return false;
+      }
+
+      if (isInductionVariable(Phi)) {
+        if (Induction) {
+          DEBUG(dbgs() << "LV: Found too many inductions."<< *Phi <<"\n");
+          return false;
+        }
+        DEBUG(dbgs() << "LV: Found the induction PHI."<< *Phi <<"\n");
+        Induction = Phi;
+        continue;
+      }
+      if (AddReductionVar(Phi, IntegerAdd)) {
+        DEBUG(dbgs() << "LV: Found an ADD reduction PHI."<< *Phi <<"\n");
+        continue;
+      }
+      if (AddReductionVar(Phi, IntegerMult)) {
+        DEBUG(dbgs() << "LV: Found a Mult reduction PHI."<< *Phi <<"\n");
+        continue;
+      }
+
+      DEBUG(dbgs() << "LV: Found an unidentified PHI."<< *Phi <<"\n");
+      return false;
+    }// end of PHI handling
+
+    // We still don't handle function calls.
+    CallInst *CI = dyn_cast<CallInst>(I);
+    if (CI) {
+      DEBUG(dbgs() << "LV: Found a call site:" <<
+            CI->getCalledFunction()->getName() << "\n");
+      return false;
+    }
+
+    // We do not re-vectorize vectors.
+    if (!VectorType::isValidElementType(I->getType()) &&
+        !I->getType()->isVoidTy()) {
+      DEBUG(dbgs() << "LV: Found unvectorizable type." << "\n");
+      return false;
+    }
+
+    // Reduction instructions are allowed to have exit users.
+    // All other instructions must not have external users.
+    if (!AllowedExit.count(I))
+      // Check that all of the users of the loop are inside the BB.
+      for (Value::use_iterator it = I->use_begin(), e = I->use_end();
+           it != e; ++it) {
+        Instruction *U = cast<Instruction>(*it);
+        // This user may be a reduction exit value.
+        BasicBlock *Parent = U->getParent();
+        if (Parent != &BB) {
+          DEBUG(dbgs() << "LV: Found an outside user for: "<< *U << "\n");
+          return false;
+        }
+      }
+  } // next instr.
+
+  if (!Induction) {
+    DEBUG(dbgs() << "LV: Did not find an induction var.\n");
+    return false;
+  }
+
+  // If the memory dependencies do not prevent us from
+  // vectorizing, then vectorize.
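+  // As an illustrative example, a C loop such as
+  //   for (i = 0; i < n; ++i) sum += A[i];
+  // (with 'sum' used after the loop) passes the checks above: 'i' is
+  // recognized as the induction PHI and 'sum' as an integer-add
+  // reduction, so the answer now depends only on the memory check below.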
+  return canVectorizeMemory(BB);
+}
+
+bool LoopVectorizationLegality::canVectorizeMemory(BasicBlock &BB) {
+  typedef SmallVector<Value*, 16> ValueVector;
+  typedef SmallPtrSet<Value*, 16> ValueSet;
+  // Holds the Load and Store *instructions*.
+  ValueVector Loads;
+  ValueVector Stores;
+
+  // Scan the BB and collect legal loads and stores.
+  for (BasicBlock::iterator it = BB.begin(), e = BB.end(); it != e; ++it) {
+    Instruction *I = it;
+
+    // If this is a load, save it. If this instruction can read from memory
+    // but is not a load, then we quit. Notice that we don't handle function
+    // calls that read or write.
+    if (I->mayReadFromMemory()) {
+      LoadInst *Ld = dyn_cast<LoadInst>(I);
+      if (!Ld) return false;
+      if (!Ld->isSimple()) {
+        DEBUG(dbgs() << "LV: Found a non-simple load.\n");
+        return false;
+      }
+      Loads.push_back(Ld);
+      continue;
+    }
+
+    // Save store instructions. Abort if other instructions write to memory.
+    if (I->mayWriteToMemory()) {
+      StoreInst *St = dyn_cast<StoreInst>(I);
+      if (!St) return false;
+      if (!St->isSimple()) {
+        DEBUG(dbgs() << "LV: Found a non-simple store.\n");
+        return false;
+      }
+      Stores.push_back(St);
+    }
+  } // next instr.
+
+  // Now we have two lists that hold the loads and the stores.
+  // Next, we find the pointers that they use.
+
+  // Check if we see any stores. If there are no stores, then we don't
+  // care if the pointers are *restrict*.
+  if (!Stores.size()) {
+    DEBUG(dbgs() << "LV: Found a read-only loop!\n");
+    return true;
+  }
+
+  // Holds the read and read-write *pointers* that we find.
+  ValueVector Reads;
+  ValueVector ReadWrites;
+
+  // Holds the analyzed pointers. We don't want to call GetUnderlyingObjects
+  // multiple times on the same object. If the ptr is accessed twice, once
+  // for read and once for write, it will only appear once (on the write
+  // list). This is okay, since we are going to check for conflicts between
+  // writes and between reads and writes, but not between reads and reads.
+  ValueSet Seen;
+
+  ValueVector::iterator I, IE;
+  for (I = Stores.begin(), IE = Stores.end(); I != IE; ++I) {
+    StoreInst *ST = dyn_cast<StoreInst>(*I);
+    assert(ST && "Bad StoreInst");
+    Value* Ptr = ST->getPointerOperand();
+    // If we did *not* see this pointer before, insert it into
+    // the read-write list. At this phase it is only a 'write' list.
+    if (Seen.insert(Ptr))
+      ReadWrites.push_back(Ptr);
+  }
+
+  for (I = Loads.begin(), IE = Loads.end(); I != IE; ++I) {
+    LoadInst *LD = dyn_cast<LoadInst>(*I);
+    assert(LD && "Bad LoadInst");
+    Value* Ptr = LD->getPointerOperand();
+    // If we did *not* see this pointer before, insert it into the
+    // read list. If we *did* see it before, then it is already in
+    // the read-write list. This allows us to vectorize expressions
+    // such as A[i] += x, because the address of A[i] is a read-write
+    // pointer. This only works if the index of A[i] is consecutive.
+    // If the address of i is unknown (for example A[B[i]]) then we may
+    // read a few words, modify, and write a few words, and some of the
+    // words may be written to the same address.
+    if (Seen.insert(Ptr) || !isConsecutiveGep(Ptr))
+      Reads.push_back(Ptr);
+  }
+
+  // Now that the pointers are in two lists (Reads and ReadWrites), we
+  // can check that there are no conflicts between each of the writes and
+  // between the writes and the reads.
+  ValueSet WriteObjects;
+  ValueVector TempObjects;
+
+  // Check that the read-writes do not conflict with other read-write
+  // pointers.
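+  // For instance (illustrative), given 'noalias' arguments A and B in
+  //   A[i] = B[i] + 1;
+  // A's pointer lands in ReadWrites and B's in Reads; both underlying
+  // objects are identified and distinct, so the loop is accepted. With
+  // arbitrary pointers the underlying objects cannot be identified as
+  // safe and the loop is rejected below.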
+  for (I = ReadWrites.begin(), IE = ReadWrites.end(); I != IE; ++I) {
+    GetUnderlyingObjects(*I, TempObjects, DL);
+    for (ValueVector::iterator it=TempObjects.begin(), e=TempObjects.end();
+         it != e; ++it) {
+      if (!isIdentifiedSafeObject(*it)) {
+        DEBUG(dbgs() << "LV: Found an unidentified write ptr:"<< **it <<"\n");
+        return false;
+      }
+      if (!WriteObjects.insert(*it)) {
+        DEBUG(dbgs() << "LV: Found a possible write-write reorder:"
+              << **it <<"\n");
+        return false;
+      }
+    }
+    TempObjects.clear();
+  }
+
+  // Check that the reads don't conflict with the read-writes.
+  for (I = Reads.begin(), IE = Reads.end(); I != IE; ++I) {
+    GetUnderlyingObjects(*I, TempObjects, DL);
+    for (ValueVector::iterator it=TempObjects.begin(), e=TempObjects.end();
+         it != e; ++it) {
+      if (!isIdentifiedSafeObject(*it)) {
+        DEBUG(dbgs() << "LV: Found an unidentified read ptr:"<< **it <<"\n");
+        return false;
+      }
+      if (WriteObjects.count(*it)) {
+        DEBUG(dbgs() << "LV: Found a possible read/write reorder:"
+              << **it <<"\n");
+        return false;
+      }
+    }
+    TempObjects.clear();
+  }
+
+  // All is okay.
+  return true;
+}
+
+/// Checks if the value is a global variable, an alloca, or an argument
+/// marked with the NoAlias attribute.
+bool LoopVectorizationLegality::isIdentifiedSafeObject(Value* Val) {
+  assert(Val && "Invalid value");
+  if (isa<GlobalValue>(Val))
+    return true;
+  if (isa<AllocaInst>(Val))
+    return true;
+  if (Argument *A = dyn_cast<Argument>(Val))
+    return A->hasNoAliasAttr();
+  return false;
+}
+
+bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi,
+                                                ReductionKind Kind) {
+  if (Phi->getNumIncomingValues() != 2)
+    return false;
+
+  // Find the possible incoming reduction variable.
+  BasicBlock *BB = Phi->getParent();
+  int SelfEdgeIdx = Phi->getBasicBlockIndex(BB);
+  int InEdgeBlockIdx = (SelfEdgeIdx ? 0 : 1); // The other entry.
+  Value *RdxStart = Phi->getIncomingValue(InEdgeBlockIdx);
+
+  // ExitInstruction is the single value which is used outside the loop.
+  // We only allow for a single reduction value to be used outside the loop.
+  // This includes users of the reduction variables (which form a cycle
+  // that ends in the phi node).
+  Instruction *ExitInstruction = 0;
+
+  // Iter is our iterator. We start with the PHI node and scan for all of the
+  // users of this instruction. All users must be instructions which can be
+  // used as reduction variables (such as ADD). We may have a single
+  // out-of-block user. The cycle must end with the original PHI.
+  // Also, we can't have multiple block-local users.
+  Instruction *Iter = Phi;
+  while (true) {
+    // Any reduction instr must be of one of the allowed kinds.
+    if (!isReductionInstr(Iter, Kind))
+      return false;
+
+    // Did we find a user inside this block?
+    bool FoundInBlockUser = false;
+    // Did we reach the initial PHI node?
+    bool FoundStartPHI = false;
+
+    // If the instruction has no users then this is a broken
+    // chain and can't be a reduction variable.
+    if (Iter->use_empty())
+      return false;
+
+    // For each of the *users* of iter.
+    for (Value::use_iterator it = Iter->use_begin(), e = Iter->use_end();
+         it != e; ++it) {
+      Instruction *U = cast<Instruction>(*it);
+      // We already know that the PHI is a user.
+      if (U == Phi) {
+        FoundStartPHI = true;
+        continue;
+      }
+      // Check if we found the exit user.
+      BasicBlock *Parent = U->getParent();
+      if (Parent != BB) {
+        // We must have a single exit instruction.
+        if (ExitInstruction != 0)
+          return false;
+        ExitInstruction = Iter;
+      }
+      // We can't have multiple inside users.
+ if (FoundInBlockUser) + return false; + FoundInBlockUser = true; + Iter = U; + } + + // We found a reduction var if we have reached the original + // phi node and we only have a single instruction with out-of-loop + // users. + if (FoundStartPHI && ExitInstruction) { + // This instruction is allowed to have out-of-loop users. + AllowedExit.insert(ExitInstruction); + + // Save the description of this reduction variable. + ReductionDescriptor RD(RdxStart, ExitInstruction, Kind); + Reductions[Phi] = RD; + return true; + } + } +} + +bool +LoopVectorizationLegality::isReductionInstr(Instruction *I, + ReductionKind Kind) { + switch (I->getOpcode()) { + default: + return false; + case Instruction::PHI: + // possibly. + return true; + case Instruction::Add: + case Instruction::Sub: + return Kind == IntegerAdd; + case Instruction::Mul: + case Instruction::UDiv: + case Instruction::SDiv: + return Kind == IntegerMult; + } +} + +bool LoopVectorizationLegality::isInductionVariable(PHINode *Phi) { + // Check that the PHI is consecutive and starts at zero. + const SCEV *PhiScev = SE->getSCEV(Phi); + const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PhiScev); + if (!AR) { + DEBUG(dbgs() << "LV: PHI is not a poly recurrence.\n"); + return false; + } + const SCEV *Step = AR->getStepRecurrence(*SE); + const SCEV *Start = AR->getStart(); + + if (!Step->isOne() || !Start->isZero()) { + DEBUG(dbgs() << "LV: PHI does not start at zero or steps by one.\n"); + return false; + } + return true; +} + +} // namespace + +char LoopVectorize::ID = 0; +static const char lv_name[] = "Loop Vectorization"; +INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false) +INITIALIZE_AG_DEPENDENCY(AliasAnalysis) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) +INITIALIZE_PASS_DEPENDENCY(LoopSimplify) +INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false) + +namespace llvm { + Pass *createLoopVectorizePass() { + return new LoopVectorize(); + } +} + diff --git a/lib/Transforms/Vectorize/Vectorize.cpp b/lib/Transforms/Vectorize/Vectorize.cpp index 1ef60029bcf..d26973a7b38 100644 --- a/lib/Transforms/Vectorize/Vectorize.cpp +++ b/lib/Transforms/Vectorize/Vectorize.cpp @@ -7,7 +7,7 @@ // //===----------------------------------------------------------------------===// // -// This file implements common infrastructure for libLLVMVectorizeOpts.a, which +// This file implements common infrastructure for libLLVMVectorizeOpts.a, which // implements several vectorization transformations over the LLVM intermediate // representation, including the C bindings for that library. // @@ -23,10 +23,11 @@ using namespace llvm; -/// initializeVectorizationPasses - Initialize all passes linked into the +/// initializeVectorizationPasses - Initialize all passes linked into the /// Vectorization library. void llvm::initializeVectorization(PassRegistry &Registry) { initializeBBVectorizePass(Registry); + initializeLoopVectorizePass(Registry); } void LLVMInitializeVectorization(LLVMPassRegistryRef R) { @@ -37,3 +38,6 @@ void LLVMAddBBVectorizePass(LLVMPassManagerRef PM) { unwrap(PM)->add(createBBVectorizePass()); } +void LLVMAddLoopVectorizePass(LLVMPassManagerRef PM) { + unwrap(PM)->add(createLoopVectorizePass()); +} diff --git a/lib/VMCore/DataLayout.cpp b/lib/VMCore/DataLayout.cpp index 3f75069a60a..e6994be257c 100644 --- a/lib/VMCore/DataLayout.cpp +++ b/lib/VMCore/DataLayout.cpp @@ -559,8 +559,10 @@ uint64_t DataLayout::getTypeSizeInBits(Type *Ty) const { // only 80 bits contain information. 
case Type::X86_FP80TyID: return 80; - case Type::VectorTyID: - return cast<VectorType>(Ty)->getBitWidth(); + case Type::VectorTyID: { + VectorType *VTy = cast<VectorType>(Ty); + return VTy->getNumElements()*getTypeSizeInBits(VTy->getElementType()); + } default: llvm_unreachable("DataLayout::getTypeSizeInBits(): Unsupported type"); } diff --git a/test/BugPoint/crash-narrowfunctiontest.ll b/test/BugPoint/crash-narrowfunctiontest.ll index d080d9dd4b0..9df823ab973 100644 --- a/test/BugPoint/crash-narrowfunctiontest.ll +++ b/test/BugPoint/crash-narrowfunctiontest.ll @@ -2,6 +2,7 @@ ; ; RUN: bugpoint -load %llvmshlibdir/BugpointPasses%shlibext %s -output-prefix %t -bugpoint-crashcalls -silence-passes > /dev/null ; REQUIRES: loadable_module +; XFAIL: lto define i32 @foo() { ret i32 1 } diff --git a/test/BugPoint/metadata.ll b/test/BugPoint/metadata.ll index 0eda5667ba4..98c79ee03a6 100644 --- a/test/BugPoint/metadata.ll +++ b/test/BugPoint/metadata.ll @@ -1,6 +1,7 @@ ; RUN: bugpoint -load %llvmshlibdir/BugpointPasses%shlibext %s -output-prefix %t -bugpoint-crashcalls -silence-passes > /dev/null ; RUN: llvm-dis %t-reduced-simplified.bc -o - | FileCheck %s ; REQUIRES: loadable_module +; XFAIL: lto ; Bugpoint should keep the call's metadata attached to the call. diff --git a/test/BugPoint/remove_arguments_test.ll b/test/BugPoint/remove_arguments_test.ll index 29a03b83107..13aa9c5a653 100644 --- a/test/BugPoint/remove_arguments_test.ll +++ b/test/BugPoint/remove_arguments_test.ll @@ -1,6 +1,7 @@ ; RUN: bugpoint -load %llvmshlibdir/BugpointPasses%shlibext %s -output-prefix %t -bugpoint-crashcalls -silence-passes ; RUN: llvm-dis %t-reduced-simplified.bc -o - | FileCheck %s ; REQUIRES: loadable_module +; XFAIL: lto ; Test to make sure that arguments are removed from the function if they are ; unnecessary. And clean up any types that that frees up too. diff --git a/test/CodeGen/ARM/2012-10-18-PR14099-ByvalFrameAddress.ll b/test/CodeGen/ARM/2012-10-18-PR14099-ByvalFrameAddress.ll new file mode 100644 index 00000000000..fcc6a7f7e96 --- /dev/null +++ b/test/CodeGen/ARM/2012-10-18-PR14099-ByvalFrameAddress.ll @@ -0,0 +1,29 @@ +; RUN: llc < %s -mtriple=armv7-linux-gnueabi | FileCheck %s + +%struct.s = type { [4 x i32] } +@v = constant %struct.s zeroinitializer; + +declare void @f(%struct.s* %p); + +; CHECK: t: +define void @t(i32 %a, %struct.s* byval %s) nounwind { +entry: + +; Here we need to only check proper start address of restored %s argument. 
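+; (Illustrative reading of the sequence below: the first three words of
+; %s arrive in r1-r3 and are stored back just above the pushed
+; {r11, lr} pair, so the rebuilt struct must start at sp+12.)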
+; CHECK: sub sp, sp, #16 +; CHECK: push {r11, lr} +; CHECK: add r0, sp, #12 +; CHECK: stm r0, {r1, r2, r3} +; CHECK: add r0, sp, #12 +; CHECK-NEXT: bl f + call void @f(%struct.s* %s) + ret void +} + +; CHECK: caller: +define void @caller() { + +; CHECK: ldm r0, {r1, r2, r3} + call void @t(i32 0, %struct.s* @v); + ret void +} diff --git a/test/CodeGen/ARM/trap.ll b/test/CodeGen/ARM/trap.ll index 38842a9646f..21865f8e4ae 100644 --- a/test/CodeGen/ARM/trap.ll +++ b/test/CodeGen/ARM/trap.ll @@ -14,4 +14,16 @@ entry: unreachable } +define void @t2() nounwind { +entry: +; INSTR: t2: +; INSTR: trap + +; FUNC: t2: +; FUNC: bl __trap + call void @llvm.debugtrap() + unreachable +} + declare void @llvm.trap() nounwind +declare void @llvm.debugtrap() nounwind diff --git a/test/CodeGen/MSP430/fp.ll b/test/CodeGen/MSP430/fp.ll new file mode 100644 index 00000000000..c3273eff05c --- /dev/null +++ b/test/CodeGen/MSP430/fp.ll @@ -0,0 +1,17 @@ +; RUN: llc -O0 -disable-fp-elim < %s | FileCheck %s + +target datalayout = "e-p:16:16:16-i8:8:8-i16:16:16-i32:16:32-n8:16" +target triple = "msp430---elf" + +define void @fp() nounwind { +entry: +; CHECK: fp: +; CHECK: push.w r4 +; CHECK: mov.w r1, r4 +; CHECK: sub.w #2, r1 + %i = alloca i16, align 2 +; CHECK: mov.w #0, -2(r4) + store i16 0, i16* %i, align 2 +; CHECK: pop.w r4 + ret void +} diff --git a/test/CodeGen/Mips/brconeq.ll b/test/CodeGen/Mips/brconeq.ll new file mode 100644 index 00000000000..613391557ef --- /dev/null +++ b/test/CodeGen/Mips/brconeq.ll @@ -0,0 +1,38 @@ +; RUN: llc -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16 + +@i = global i32 5, align 4 +@j = global i32 10, align 4 +@result = global i32 0, align 4 + +define void @test() nounwind { +entry: + %0 = load i32* @i, align 4 + %1 = load i32* @j, align 4 + %cmp = icmp eq i32 %0, %1 +; 16: cmp ${{[0-9]+}}, ${{[0-9]+}} +; 16: bteqz $[[LABEL:[0-9A-Ba-b_]+]] +; 16: $[[LABEL]]: + br i1 %cmp, label %if.end, label %if.then + +if.then: ; preds = %entry + store i32 1, i32* @result, align 4 + br label %if.end + +if.end: ; preds = %entry, %if.then + ret void +} + + + + + + + + + + + + + + + diff --git a/test/CodeGen/Mips/brconeqk.ll b/test/CodeGen/Mips/brconeqk.ll new file mode 100644 index 00000000000..2c0e72dabd2 --- /dev/null +++ b/test/CodeGen/Mips/brconeqk.ll @@ -0,0 +1,22 @@ +; RUN: llc -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16 + +@i = global i32 5, align 4 +@result = global i32 0, align 4 + +define void @test() nounwind { +entry: + %0 = load i32* @i, align 4 + %cmp = icmp eq i32 %0, 10 + br i1 %cmp, label %if.end, label %if.then +; 16: cmpi ${{[0-9]+}}, {{[0-9]+}} +; 16: bteqz $[[LABEL:[0-9A-Ba-b_]+]] +; 16: $[[LABEL]]: +if.then: ; preds = %entry + store i32 1, i32* @result, align 4 + br label %if.end + +if.end: ; preds = %entry, %if.then + ret void +} + + diff --git a/test/CodeGen/Mips/brconeqz.ll b/test/CodeGen/Mips/brconeqz.ll new file mode 100644 index 00000000000..5586e7b976d --- /dev/null +++ b/test/CodeGen/Mips/brconeqz.ll @@ -0,0 +1,20 @@ +; RUN: llc -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16 + +@i = global i32 5, align 4 +@result = global i32 0, align 4 + +define void @test() nounwind { +entry: + %0 = load i32* @i, align 4 + %cmp = icmp eq i32 %0, 0 + br i1 %cmp, label %if.end, label %if.then +; 16: beqz ${{[0-9]+}}, $[[LABEL:[0-9A-Ba-b_]+]] +; 16: $[[LABEL]]: +if.then: ; preds = %entry + store i32 1, i32* @result, align 4 + br label %if.end + +if.end: ; 
preds = %entry, %if.then + ret void +} + diff --git a/test/CodeGen/Mips/brconge.ll b/test/CodeGen/Mips/brconge.ll new file mode 100644 index 00000000000..02f0a633b31 --- /dev/null +++ b/test/CodeGen/Mips/brconge.ll @@ -0,0 +1,37 @@ +; RUN: llc -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16 + +@i = global i32 5, align 4 +@j = global i32 10, align 4 +@k = global i32 5, align 4 +@result1 = global i32 0, align 4 +@result2 = global i32 1, align 4 + +define void @test() nounwind { +entry: + %0 = load i32* @i, align 4 + %1 = load i32* @j, align 4 + %cmp = icmp slt i32 %0, %1 + br i1 %cmp, label %if.then, label %if.end + +; 16: slt ${{[0-9]+}}, ${{[0-9]+}} +; 16: bteqz $[[LABEL:[0-9A-Ba-b_]+]] +; 16: $[[LABEL]]: + +if.then: ; preds = %entry + store i32 1, i32* @result1, align 4 + br label %if.end + +if.end: ; preds = %if.then, %entry + %2 = load i32* @k, align 4 + %cmp1 = icmp slt i32 %0, %2 + br i1 %cmp1, label %if.then2, label %if.end3 + +if.then2: ; preds = %if.end + store i32 1, i32* @result1, align 4 + br label %if.end3 + +if.end3: ; preds = %if.then2, %if.end + ret void +} + + diff --git a/test/CodeGen/Mips/brcongt.ll b/test/CodeGen/Mips/brcongt.ll new file mode 100644 index 00000000000..767b51b21b9 --- /dev/null +++ b/test/CodeGen/Mips/brcongt.ll @@ -0,0 +1,25 @@ +; RUN: llc -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16 + +@i = global i32 5, align 4 +@j = global i32 10, align 4 +@k = global i32 5, align 4 +@result = global i32 0, align 4 + +define void @test() nounwind { +entry: + %0 = load i32* @i, align 4 + %1 = load i32* @j, align 4 + %cmp = icmp sgt i32 %0, %1 + br i1 %cmp, label %if.end, label %if.then +; 16: slt ${{[0-9]+}}, ${{[0-9]+}} +; 16: btnez $[[LABEL:[0-9A-Ba-b_]+]] +; 16: $[[LABEL]]: +if.then: ; preds = %entry + store i32 1, i32* @result, align 4 + br label %if.end + +if.end: ; preds = %entry, %if.then + ret void +} + + diff --git a/test/CodeGen/Mips/brconle.ll b/test/CodeGen/Mips/brconle.ll new file mode 100644 index 00000000000..854b2481c6e --- /dev/null +++ b/test/CodeGen/Mips/brconle.ll @@ -0,0 +1,37 @@ +; RUN: llc -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16 + +@i = global i32 -5, align 4 +@j = global i32 10, align 4 +@k = global i32 -5, align 4 +@result1 = global i32 0, align 4 +@result2 = global i32 1, align 4 + +define void @test() nounwind { +entry: + %0 = load i32* @j, align 4 + %1 = load i32* @i, align 4 + %cmp = icmp sgt i32 %0, %1 + br i1 %cmp, label %if.then, label %if.end + +; 16: slt ${{[0-9]+}}, ${{[0-9]+}} +; 16: bteqz $[[LABEL:[0-9A-Ba-b_]+]] +; 16: $[[LABEL]]: + +if.then: ; preds = %entry + store i32 1, i32* @result1, align 4 + br label %if.end + +if.end: ; preds = %if.then, %entry + %2 = load i32* @k, align 4 + %cmp1 = icmp sgt i32 %1, %2 + br i1 %cmp1, label %if.then2, label %if.end3 + +if.then2: ; preds = %if.end + store i32 0, i32* @result1, align 4 + br label %if.end3 + +if.end3: ; preds = %if.then2, %if.end + ret void +} + + diff --git a/test/CodeGen/Mips/brconlt.ll b/test/CodeGen/Mips/brconlt.ll new file mode 100644 index 00000000000..931a3e8c7ba --- /dev/null +++ b/test/CodeGen/Mips/brconlt.ll @@ -0,0 +1,27 @@ +; RUN: llc -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16 + +@i = global i32 5, align 4 +@j = global i32 10, align 4 +@k = global i32 5, align 4 +@result = global i32 0, align 4 + +define void @test() nounwind { +entry: + %0 = load i32* @j, align 4 + %1 = load i32* @i, 
align 4 + %cmp = icmp slt i32 %0, %1 + br i1 %cmp, label %if.end, label %if.then + +; 16: slt ${{[0-9]+}}, ${{[0-9]+}} +; 16: btnez $[[LABEL:[0-9A-Ba-b_]+]] +; 16: $[[LABEL]]: + +if.then: ; preds = %entry + store i32 1, i32* @result, align 4 + br label %if.end + +if.end: ; preds = %entry, %if.then + ret void +} + + diff --git a/test/CodeGen/Mips/brconne.ll b/test/CodeGen/Mips/brconne.ll new file mode 100644 index 00000000000..5d5bde3fcf9 --- /dev/null +++ b/test/CodeGen/Mips/brconne.ll @@ -0,0 +1,26 @@ +; RUN: llc -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16 + +@i = global i32 5, align 4 +@j = global i32 5, align 4 +@result = global i32 0, align 4 + +define void @test() nounwind { +entry: + %0 = load i32* @j, align 4 + %1 = load i32* @i, align 4 + %cmp = icmp eq i32 %0, %1 + br i1 %cmp, label %if.then, label %if.end +; 16: cmp ${{[0-9]+}}, ${{[0-9]+}} +; 16: btnez $[[LABEL:[0-9A-Ba-b_]+]] +; 16: lw ${{[0-9]+}}, %got(result)(${{[0-9]+}}) +; 16: $[[LABEL]]: + +if.then: ; preds = %entry + store i32 1, i32* @result, align 4 + br label %if.end + +if.end: ; preds = %if.then, %entry + ret void +} + + diff --git a/test/CodeGen/Mips/brconnek.ll b/test/CodeGen/Mips/brconnek.ll new file mode 100644 index 00000000000..6208d7c5a04 --- /dev/null +++ b/test/CodeGen/Mips/brconnek.ll @@ -0,0 +1,25 @@ +; RUN: llc -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16 + +@j = global i32 5, align 4 +@result = global i32 0, align 4 + +define void @test() nounwind { +entry: + %0 = load i32* @j, align 4 + %cmp = icmp eq i32 %0, 5 + br i1 %cmp, label %if.then, label %if.end + +; 16: cmpi ${{[0-9]+}}, {{[0-9]+}} +; 16: btnez $[[LABEL:[0-9A-Ba-b_]+]] +; 16: lw ${{[0-9]+}}, %got(result)(${{[0-9]+}}) +; 16: $[[LABEL]]: + +if.then: ; preds = %entry + store i32 1, i32* @result, align 4 + br label %if.end + +if.end: ; preds = %if.then, %entry + ret void +} + + diff --git a/test/CodeGen/Mips/brconnez.ll b/test/CodeGen/Mips/brconnez.ll new file mode 100644 index 00000000000..47db7901b51 --- /dev/null +++ b/test/CodeGen/Mips/brconnez.ll @@ -0,0 +1,24 @@ +; RUN: llc -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16 + +@j = global i32 0, align 4 +@result = global i32 0, align 4 + +define void @test() nounwind { +entry: + %0 = load i32* @j, align 4 + %cmp = icmp eq i32 %0, 0 + br i1 %cmp, label %if.then, label %if.end + +; 16: bnez ${{[0-9]+}}, $[[LABEL:[0-9A-Ba-b_]+]] +; 16: lw ${{[0-9]+}}, %got(result)(${{[0-9]+}}) +; 16: $[[LABEL]]: + +if.then: ; preds = %entry + store i32 1, i32* @result, align 4 + br label %if.end + +if.end: ; preds = %if.then, %entry + ret void +} + + diff --git a/test/CodeGen/Mips/mips64-sret.ll b/test/CodeGen/Mips/mips64-sret.ll new file mode 100644 index 00000000000..498c5fe1747 --- /dev/null +++ b/test/CodeGen/Mips/mips64-sret.ll @@ -0,0 +1,14 @@ +; RUN: llc -march=mips64el -mcpu=mips64r2 -mattr=n64 -O0 < %s + +%struct.S = type { [8 x i32] } + +@g = common global %struct.S zeroinitializer, align 4 + +define void @f(%struct.S* noalias sret %agg.result) nounwind { +entry: + %0 = bitcast %struct.S* %agg.result to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* bitcast (%struct.S* @g to i8*), i64 32, i32 4, i1 false) + ret void +} + +declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i32, i1) nounwind diff --git a/test/CodeGen/Mips/tailcall.ll b/test/CodeGen/Mips/tailcall.ll new file mode 100644 index 00000000000..4989636a20d --- /dev/null +++ 
b/test/CodeGen/Mips/tailcall.ll @@ -0,0 +1,100 @@ +; RUN: llc -march=mipsel -relocation-model=pic -enable-mips-tail-calls < %s | \ +; RUN: FileCheck %s -check-prefix=PIC32 +; RUN: llc -march=mipsel -relocation-model=static \ +; RUN: -enable-mips-tail-calls < %s | FileCheck %s -check-prefix=STATIC32 +; RUN: llc -march=mips64el -mcpu=mips64r2 -mattr=+n64 -enable-mips-tail-calls \ +; RUN: < %s | FileCheck %s -check-prefix=N64 + +@g0 = common global i32 0, align 4 +@g1 = common global i32 0, align 4 +@g2 = common global i32 0, align 4 +@g3 = common global i32 0, align 4 +@g4 = common global i32 0, align 4 +@g5 = common global i32 0, align 4 +@g6 = common global i32 0, align 4 +@g7 = common global i32 0, align 4 +@g8 = common global i32 0, align 4 +@g9 = common global i32 0, align 4 + +define i32 @caller1(i32 %a0) nounwind { +entry: +; PIC32-NOT: jalr +; STATIC32-NOT: jal +; N64-NOT: jalr + + %call = tail call i32 @callee1(i32 1, i32 1, i32 1, i32 %a0) nounwind + ret i32 %call +} + +declare i32 @callee1(i32, i32, i32, i32) + +define i32 @caller2(i32 %a0, i32 %a1, i32 %a2, i32 %a3) nounwind { +entry: +; PIC32: jalr +; STATIC32: jal +; N64-NOT: jalr + + %call = tail call i32 @callee2(i32 1, i32 %a0, i32 %a1, i32 %a2, i32 %a3) nounwind + ret i32 %call +} + +declare i32 @callee2(i32, i32, i32, i32, i32) + +define i32 @caller3(i32 %a0, i32 %a1, i32 %a2, i32 %a3, i32 %a4) nounwind { +entry: +; PIC32: jalr +; STATIC32: jal +; N64-NOT: jalr + + %call = tail call i32 @callee3(i32 1, i32 1, i32 1, i32 %a0, i32 %a1, i32 %a2, i32 %a3, i32 %a4) nounwind + ret i32 %call +} + +declare i32 @callee3(i32, i32, i32, i32, i32, i32, i32, i32) + +define i32 @caller4(i32 %a0, i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a5, i32 %a6, i32 %a7) nounwind { +entry: +; PIC32: jalr +; STATIC32: jal +; N64: jalr + + %call = tail call i32 @callee4(i32 1, i32 %a0, i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a5, i32 %a6, i32 %a7) nounwind + ret i32 %call +} + +declare i32 @callee4(i32, i32, i32, i32, i32, i32, i32, i32, i32) + +define i32 @caller5() nounwind readonly { +entry: +; PIC32-NOT: jalr +; STATIC32-NOT: jal +; N64-NOT: jalr + + %0 = load i32* @g0, align 4 + %1 = load i32* @g1, align 4 + %2 = load i32* @g2, align 4 + %3 = load i32* @g3, align 4 + %4 = load i32* @g4, align 4 + %5 = load i32* @g5, align 4 + %6 = load i32* @g6, align 4 + %7 = load i32* @g7, align 4 + %8 = load i32* @g8, align 4 + %9 = load i32* @g9, align 4 + %call = tail call fastcc i32 @callee5(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i32 %8, i32 %9) + ret i32 %call +} + +define internal fastcc i32 @callee5(i32 %a0, i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a5, i32 %a6, i32 %a7, i32 %a8, i32 %a9) nounwind readnone noinline { +entry: + %add = add nsw i32 %a1, %a0 + %add1 = add nsw i32 %add, %a2 + %add2 = add nsw i32 %add1, %a3 + %add3 = add nsw i32 %add2, %a4 + %add4 = add nsw i32 %add3, %a5 + %add5 = add nsw i32 %add4, %a6 + %add6 = add nsw i32 %add5, %a7 + %add7 = add nsw i32 %add6, %a8 + %add8 = add nsw i32 %add7, %a9 + ret i32 %add8 +} + diff --git a/test/CodeGen/PowerPC/i64_fp_round.ll b/test/CodeGen/PowerPC/i64_fp_round.ll new file mode 100644 index 00000000000..5a0c072c9c5 --- /dev/null +++ b/test/CodeGen/PowerPC/i64_fp_round.ll @@ -0,0 +1,27 @@ +; RUN: llc -mcpu=pwr7 < %s | FileCheck %s +target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v128:128:128-n32:64" +target triple = "powerpc64-unknown-linux-gnu" + +define float @test(i64 %x) nounwind readnone { +entry: + %conv = sitofp i64 %x 
 to float
+  ret float %conv
+}
+
+; Verify that we get the code sequence needed to avoid double-rounding.
+; Note that only parts of the sequence are checked for here, to allow
+; for minor code generation differences.
+
+; CHECK: sradi [[REGISTER:[0-9]+]], 3, 53
+; CHECK: addi [[REGISTER:[0-9]+]], [[REGISTER]], 1
+; CHECK: cmpldi 0, [[REGISTER]], 1
+; CHECK: isel [[REGISTER:[0-9]+]], {{[0-9]+}}, 3, 1
+; CHECK: std [[REGISTER]], -{{[0-9]+}}(1)
+
+
+; Also check that with -enable-unsafe-fp-math we do not get that extra
+; code sequence. Simply verify that there is no "isel" present.
+
+; RUN: llc -mcpu=pwr7 -enable-unsafe-fp-math < %s | FileCheck %s -check-prefix=UNSAFE
+; UNSAFE-NOT: isel
+
diff --git a/test/CodeGen/X86/2012-10-18-crash-dagco.ll b/test/CodeGen/X86/2012-10-18-crash-dagco.ll new file mode 100644 index 00000000000..5b98624a37b --- /dev/null +++ b/test/CodeGen/X86/2012-10-18-crash-dagco.ll @@ -0,0 +1,61 @@ +; RUN: llc -march=x86-64 -mcpu=corei7 -disable-cgp-select2branch < %s
+
+; We should not crash on this test.
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32"
+target triple = "i386-apple-darwin9.0.0"
+
+@global = external constant [411 x i8], align 1
+
+define void @snork() nounwind {
+bb:
+  br i1 undef, label %bb26, label %bb27
+
+bb26:                                        ; preds = %bb48, %bb26, %bb
+  switch i32 undef, label %bb26 [
+    i32 142771596, label %bb28
+  ]
+
+bb27:                                        ; preds = %bb48, %bb
+  switch i32 undef, label %bb49 [
+    i32 142771596, label %bb28
+  ]
+
+bb28:                                        ; preds = %bb27, %bb26
+  %tmp = load i32* null
+  %tmp29 = trunc i32 %tmp to i8
+  store i8* undef, i8** undef
+  %tmp30 = load i32* null
+  %tmp31 = icmp eq i32 %tmp30, 0
+  %tmp32 = getelementptr inbounds [411 x i8]* @global, i32 0, i32 undef
+  %tmp33 = load i8* %tmp32, align 1
+  %tmp34 = getelementptr inbounds [411 x i8]* @global, i32 0, i32 0
+  %tmp35 = load i8* %tmp34, align 1
+  %tmp36 = select i1 %tmp31, i8 %tmp35, i8 %tmp33
+  %tmp37 = select i1 undef, i8 %tmp29, i8 %tmp36
+  %tmp38 = zext i8 %tmp37 to i32
+  %tmp39 = select i1 undef, i32 0, i32 %tmp38
+  %tmp40 = getelementptr inbounds i32* null, i32 %tmp39
+  %tmp41 = load i32* %tmp40, align 4
+  %tmp42 = load i32* undef, align 4
+  %tmp43 = load i32* undef
+  %tmp44 = xor i32 %tmp42, %tmp43
+  %tmp45 = lshr i32 %tmp44, 8
+  %tmp46 = lshr i32 %tmp44, 7
+  call void @spam()
+  unreachable
+
+bb47:                                        ; No predecessors!
+  ret void
+
+bb48:                                        ; No predecessors!
+ br i1 undef, label %bb27, label %bb26 + +bb49: ; preds = %bb49, %bb27 + br label %bb49 + +bb50: ; preds = %bb50 + br label %bb50 +} + +declare void @spam() noreturn nounwind diff --git a/test/CodeGen/X86/buildvec-insertvec.ll b/test/CodeGen/X86/buildvec-insertvec.ll new file mode 100644 index 00000000000..3fb69a48b3c --- /dev/null +++ b/test/CodeGen/X86/buildvec-insertvec.ll @@ -0,0 +1,15 @@ +; RUN: llc < %s -mcpu=corei7 -mtriple=x86_64-unknown-linux-gnu | FileCheck %s + +define void @foo(<3 x float> %in, <4 x i8>* nocapture %out) nounwind { + %t0 = fptoui <3 x float> %in to <3 x i8> + %t1 = shufflevector <3 x i8> %t0, <3 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 undef> + %t2 = insertelement <4 x i8> %t1, i8 -1, i32 3 + store <4 x i8> %t2, <4 x i8>* %out, align 4 + ret void +; CHECK: foo +; CHECK: cvttps2dq +; CHECK-NOT: pextrd +; CHECK: pinsrd +; CHECK-NEXT: pshufb +; CHECK: ret +} diff --git a/test/CodeGen/X86/extract-concat.ll b/test/CodeGen/X86/extract-concat.ll new file mode 100644 index 00000000000..704309eb650 --- /dev/null +++ b/test/CodeGen/X86/extract-concat.ll @@ -0,0 +1,17 @@ +; RUN: llc < %s -mcpu=corei7 -mtriple=x86_64-unknown-linux-gnu | FileCheck %s + +define void @foo(<4 x float> %in, <4 x i8>* %out) { + %t0 = fptosi <4 x float> %in to <4 x i32> + %t1 = trunc <4 x i32> %t0 to <4 x i16> + %t2 = shufflevector <4 x i16> %t1, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + %t3 = trunc <8 x i16> %t2 to <8 x i8> + %t4 = shufflevector <8 x i8> %t3, <8 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + %t5 = insertelement <4 x i8> %t4, i8 -1, i32 3 + store <4 x i8> %t5, <4 x i8>* %out + ret void +; CHECK: foo +; CHECK: cvttps2dq +; CHECK-NOT: pextrd +; CHECK: pshufb +; CHECK: ret +} diff --git a/test/CodeGen/X86/pr14090.ll b/test/CodeGen/X86/pr14090.ll new file mode 100644 index 00000000000..d76b912fd8e --- /dev/null +++ b/test/CodeGen/X86/pr14090.ll @@ -0,0 +1,76 @@ +; RUN: llc < %s -march=x86-64 -print-before=stack-coloring -print-after=stack-coloring >%t 2>&1 && FileCheck <%t %s + +define void @foo(i64* %retval.i, i32 %call, i32* %.ph.i80, i32 %fourteen, i32* %out.lo, i32* %out.hi) nounwind align 2 { +entry: + %_Tmp.i39 = alloca i64, align 8 + %retval.i33 = alloca i64, align 8 + %_Tmp.i = alloca i64, align 8 + %retval.i.i = alloca i64, align 8 + %_First.i = alloca i64, align 8 + + %0 = load i64* %retval.i, align 8 + + %1 = load i64* %retval.i, align 8 + + %_Tmp.i39.0.cast73 = bitcast i64* %_Tmp.i39 to i8* + call void @llvm.lifetime.start(i64 8, i8* %_Tmp.i39.0.cast73) + store i64 %1, i64* %_Tmp.i39, align 8 + %cmp.i.i.i40 = icmp slt i32 %call, 0 + %2 = lshr i64 %1, 32 + %3 = trunc i64 %2 to i32 + %sub.i.i.i44 = sub i32 0, %call + %cmp2.i.i.i45 = icmp ult i32 %3, %sub.i.i.i44 + %or.cond.i.i.i46 = and i1 %cmp.i.i.i40, %cmp2.i.i.i45 + %add.i.i.i47 = add i32 %3, %call + %sub5.i.i.i48 = lshr i32 %add.i.i.i47, 5 + %trunc.i50 = trunc i64 %1 to i32 + %inttoptr.i51 = inttoptr i32 %trunc.i50 to i32* + %add61617.i.i.i52 = or i32 %sub5.i.i.i48, -134217728 + %add61617.i.sub5.i.i.i53 = select i1 %or.cond.i.i.i46, i32 %add61617.i.i.i52, i32 %sub5.i.i.i48 + %storemerge2.i.i54 = getelementptr inbounds i32* %inttoptr.i51, i32 %add61617.i.sub5.i.i.i53 + %_Tmp.i39.0.cast74 = bitcast i64* %_Tmp.i39 to i32** + store i32* %storemerge2.i.i54, i32** %_Tmp.i39.0.cast74, align 8 + %storemerge.i.i55 = and i32 %add.i.i.i47, 31 + %_Tmp.i39.4.raw_idx = getelementptr inbounds i8* %_Tmp.i39.0.cast73, i32 4 + %_Tmp.i39.4.cast = bitcast i8* %_Tmp.i39.4.raw_idx to 
i32* + store i32 %storemerge.i.i55, i32* %_Tmp.i39.4.cast, align 4 + %srcval.i56 = load i64* %_Tmp.i39, align 8 + call void @llvm.lifetime.end(i64 8, i8* %_Tmp.i39.0.cast73) + +; CHECK: Before Merge disjoint stack slots +; CHECK: [[PREFIX15:MOV64mr.*<fi#]]{{[0-9]}}[[SUFFIX15:.*;]] mem:ST8[%fifteen] +; CHECK: [[PREFIX87:MOV32mr.*;]] mem:ST4[%sunkaddr87] + +; CHECK: After Merge disjoint stack slots +; CHECK: [[PREFIX15]]{{[0-9]}}[[SUFFIX15]] mem:ST8[%_Tmp.i39] +; CHECK: [[PREFIX87]] mem:ST4[<unknown>] + + %fifteen = bitcast i64* %retval.i.i to i32** + %sixteen = bitcast i64* %retval.i.i to i8* + call void @llvm.lifetime.start(i64 8, i8* %sixteen) + store i32* %.ph.i80, i32** %fifteen, align 8, !tbaa !0 + %sunkaddr = ptrtoint i64* %retval.i.i to i32 + %sunkaddr86 = add i32 %sunkaddr, 4 + %sunkaddr87 = inttoptr i32 %sunkaddr86 to i32* + store i32 %fourteen, i32* %sunkaddr87, align 4, !tbaa !3 + %seventeen = load i64* %retval.i.i, align 8 + call void @llvm.lifetime.end(i64 8, i8* %sixteen) + %eighteen = lshr i64 %seventeen, 32 + %nineteen = trunc i64 %eighteen to i32 + %shl.i.i.i = shl i32 1, %nineteen + + store i32 %shl.i.i.i, i32* %out.lo, align 8 + store i32 %nineteen, i32* %out.hi, align 8 + + ret void +} + +declare void @llvm.lifetime.start(i64, i8* nocapture) nounwind + +declare void @llvm.lifetime.end(i64, i8* nocapture) nounwind + +!0 = metadata !{metadata !"int", metadata !1} +!1 = metadata !{metadata !"omnipotent char", metadata !2} +!2 = metadata !{metadata !"Simple C/C++ TBAA"} +!3 = metadata !{metadata !"any pointer", metadata !1} +!4 = metadata !{metadata !"vtable pointer", metadata !2} diff --git a/test/CodeGen/X86/pr14098.ll b/test/CodeGen/X86/pr14098.ll new file mode 100644 index 00000000000..6ce2449ab6a --- /dev/null +++ b/test/CodeGen/X86/pr14098.ll @@ -0,0 +1,23 @@ +; RUN: llc -mtriple i386-unknown-linux-gnu -relocation-model=pic -verify-machineinstrs < %s +; We used to crash on this. 
+ +declare void @foo() +declare void @foo3(i1 %x) +define void @bar(i1 %a1, i16 %a2) nounwind align 2 { +bb0: + %a3 = trunc i16 %a2 to i8 + %a4 = lshr i16 %a2, 8 + %a5 = trunc i16 %a4 to i8 + br i1 %a1, label %bb1, label %bb2 +bb1: + br label %bb2 +bb2: + %a6 = phi i8 [ 3, %bb0 ], [ %a5, %bb1 ] + %a7 = phi i8 [ 9, %bb0 ], [ %a3, %bb1 ] + %a8 = icmp eq i8 %a6, 1 + call void @foo() + %a9 = icmp eq i8 %a7, 0 + call void @foo3(i1 %a9) + call void @foo3(i1 %a8) + ret void +} diff --git a/test/CodeGen/X86/sjlj.ll b/test/CodeGen/X86/sjlj.ll index d594e982994..681db009438 100644 --- a/test/CodeGen/X86/sjlj.ll +++ b/test/CodeGen/X86/sjlj.ll @@ -1,5 +1,7 @@ ; RUN: llc < %s -mtriple=i386-pc-linux -mcpu=corei7 -relocation-model=static | FileCheck --check-prefix=X86 %s -; RUN: llc < %s -mtriple=x86_64-pc-linux -mcpu=corei7 | FileCheck --check-prefix=X64 %s +; RUN: llc < %s -mtriple=i386-pc-linux -mcpu=corei7 -relocation-model=pic | FileCheck --check-prefix=PIC86 %s +; RUN: llc < %s -mtriple=x86_64-pc-linux -mcpu=corei7 -relocation-model=static | FileCheck --check-prefix=X64 %s +; RUN: llc < %s -mtriple=x86_64-pc-linux -mcpu=corei7 -relocation-model=pic | FileCheck --check-prefix=PIC64 %s @buf = internal global [5 x i8*] zeroinitializer @@ -20,14 +22,26 @@ define i32 @sj0() nounwind { ret i32 %r ; X86: sj0 ; x86: movl %ebp, buf -; x86: movl ${{.*LBB.*}}, buf+4 ; X86: movl %esp, buf+8 +; x86: movl ${{.*LBB.*}}, buf+4 ; X86: ret +; PIC86: sj0 +; PIC86: movl %ebp, buf@GOTOFF(%[[GOT:.*]]) +; PIC86: movl %esp, buf@GOTOFF+8(%[[GOT]]) +; PIC86: leal {{.*LBB.*}}@GOTOFF(%[[GOT]]), %[[LREG:.*]] +; PIC86: movl %[[LREG]], buf@GOTOFF+4 +; PIC86: ret ; X64: sj0 ; x64: movq %rbp, buf(%rip) ; x64: movq ${{.*LBB.*}}, buf+8(%rip) ; X64: movq %rsp, buf+16(%rip) ; X64: ret +; PIC64: sj0 +; PIC64: movq %rbp, buf(%rip) +; PIC64: movq %rsp, buf+16(%rip) +; PIC64: leaq {{.*LBB.*}}(%rip), %[[LREG:.*]] +; PIC64: movq %[[LREG]], buf+8(%rip) +; PIC64: ret } define void @lj0() nounwind { diff --git a/test/Instrumentation/AddressSanitizer/basic.ll b/test/Instrumentation/AddressSanitizer/basic.ll index d1900018706..655f69c16fd 100644 --- a/test/Instrumentation/AddressSanitizer/basic.ll +++ b/test/Instrumentation/AddressSanitizer/basic.ll @@ -69,3 +69,23 @@ entry: store i32 42, i32* %a ret void } + +; Check that asan leaves just one alloca. + +declare void @alloca_test_use([10 x i8]*) +define void @alloca_test() address_safety { +entry: + %x = alloca [10 x i8], align 1 + %y = alloca [10 x i8], align 1 + %z = alloca [10 x i8], align 1 + call void @alloca_test_use([10 x i8]* %x) + call void @alloca_test_use([10 x i8]* %y) + call void @alloca_test_use([10 x i8]* %z) + ret void +} + +; CHECK: define void @alloca_test() +; CHECK: = alloca +; CHECK-NOT: = alloca +; CHECK: ret void + diff --git a/test/Makefile b/test/Makefile index 3c6b5b68a4f..810fdded465 100644 --- a/test/Makefile +++ b/test/Makefile @@ -117,6 +117,16 @@ else ENABLE_ASSERTIONS=1 endif +# Derive whether or not LTO is enabled by checking the extra options. +LTO_IS_ENABLED := 0 +ifneq ($(findstring -flto,$(CompileCommonOpts)),) +LTO_IS_ENABLED := 1 +else +ifneq ($(findstring -O4,$(CompileCommonOpts)),) +LTO_IS_ENABLED := 1 +endif +endif + lit.site.cfg: FORCE @echo "Making LLVM 'lit.site.cfg' file..." 
@$(ECHOPATH) s=@TARGET_TRIPLE@=$(TARGET_TRIPLE)=g > lit.tmp @@ -129,6 +139,7 @@ lit.site.cfg: FORCE @$(ECHOPATH) s=@OCAMLOPT@=$(OCAMLOPT) -cc $(subst *,'\\\"',*$(subst =,"\\=",$(CXX_FOR_OCAMLOPT))*) -I $(LibDir)/ocaml=g >> lit.tmp @$(ECHOPATH) s=@ENABLE_SHARED@=$(ENABLE_SHARED)=g >> lit.tmp @$(ECHOPATH) s=@ENABLE_ASSERTIONS@=$(ENABLE_ASSERTIONS)=g >> lit.tmp + @$(ECHOPATH) s=@LTO_IS_ENABLED@=$(LTO_IS_ENABLED)=g >> lit.tmp @$(ECHOPATH) s=@TARGETS_TO_BUILD@=$(TARGETS_TO_BUILD)=g >> lit.tmp @$(ECHOPATH) s=@LLVM_BINDINGS@=$(BINDINGS_TO_BUILD)=g >> lit.tmp @$(ECHOPATH) s=@HOST_OS@=$(HOST_OS)=g >> lit.tmp diff --git a/test/Transforms/BBVectorize/simple-ldstr-ptrs.ll b/test/Transforms/BBVectorize/simple-ldstr-ptrs.ll index f992d415477..6294543cd81 100644 --- a/test/Transforms/BBVectorize/simple-ldstr-ptrs.ll +++ b/test/Transforms/BBVectorize/simple-ldstr-ptrs.ll @@ -79,3 +79,53 @@ entry: ; CHECK-AO-NOT: <2 x } +; Simple 3-pair chain with loads and stores (using ptrs and gep) +; using pointer vectors. +define void @test3(<2 x i64*>* %a, <2 x i64*>* %b, <2 x i64*>* %c) nounwind uwtable readonly { +entry: + %i0 = load <2 x i64*>* %a, align 8 + %i1 = load <2 x i64*>* %b, align 8 + %arrayidx3 = getelementptr inbounds <2 x i64*>* %a, i64 1 + %i3 = load <2 x i64*>* %arrayidx3, align 8 + %arrayidx4 = getelementptr inbounds <2 x i64*>* %b, i64 1 + %i4 = load <2 x i64*>* %arrayidx4, align 8 + %j1 = extractelement <2 x i64*> %i1, i32 0 + %j4 = extractelement <2 x i64*> %i4, i32 0 + %o1 = load i64* %j1, align 8 + %o4 = load i64* %j4, align 8 + %j0 = extractelement <2 x i64*> %i0, i32 0 + %j3 = extractelement <2 x i64*> %i3, i32 0 + %ptr0 = getelementptr inbounds i64* %j0, i64 %o1 + %ptr3 = getelementptr inbounds i64* %j3, i64 %o4 + %qtr0 = insertelement <2 x i64*> undef, i64* %ptr0, i32 0 + %rtr0 = insertelement <2 x i64*> %qtr0, i64* %ptr0, i32 1 + %qtr3 = insertelement <2 x i64*> undef, i64* %ptr3, i32 0 + %rtr3 = insertelement <2 x i64*> %qtr3, i64* %ptr3, i32 1 + store <2 x i64*> %rtr0, <2 x i64*>* %c, align 8 + %arrayidx5 = getelementptr inbounds <2 x i64*>* %c, i64 1 + store <2 x i64*> %rtr3, <2 x i64*>* %arrayidx5, align 8 + ret void +; CHECK: @test3 +; CHECK: %i0.v.i0 = bitcast <2 x i64*>* %a to <4 x i64*>* +; CHECK: %i1 = load <2 x i64*>* %b, align 8 +; CHECK: %i0 = load <4 x i64*>* %i0.v.i0, align 8 +; CHECK: %arrayidx4 = getelementptr inbounds <2 x i64*>* %b, i64 1 +; CHECK: %i4 = load <2 x i64*>* %arrayidx4, align 8 +; CHECK: %j1 = extractelement <2 x i64*> %i1, i32 0 +; CHECK: %j4 = extractelement <2 x i64*> %i4, i32 0 +; CHECK: %o1 = load i64* %j1, align 8 +; CHECK: %o4 = load i64* %j4, align 8 +; CHECK: %ptr0.v.i1.1 = insertelement <2 x i64> undef, i64 %o1, i32 0 +; CHECK: %ptr0.v.i1.2 = insertelement <2 x i64> %ptr0.v.i1.1, i64 %o4, i32 1 +; CHECK: %ptr0.v.i0 = shufflevector <4 x i64*> %i0, <4 x i64*> undef, <2 x i32> <i32 0, i32 2> +; CHECK: %ptr0 = getelementptr inbounds <2 x i64*> %ptr0.v.i0, <2 x i64> %ptr0.v.i1.2 +; CHECK: %rtr0 = shufflevector <2 x i64*> %ptr0, <2 x i64*> undef, <2 x i32> zeroinitializer +; CHECK: %rtr3 = shufflevector <2 x i64*> %ptr0, <2 x i64*> undef, <2 x i32> <i32 1, i32 1> +; CHECK: %0 = bitcast <2 x i64*>* %c to <4 x i64*>* +; CHECK: %1 = shufflevector <2 x i64*> %rtr0, <2 x i64*> %rtr3, <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; CHECK: store <4 x i64*> %1, <4 x i64*>* %0, align 8 +; CHECK: ret void +; CHECK-AO: @test3 +; CHECK-AO-NOT: <4 x +} + diff --git a/test/Transforms/IndVarSimplify/2012-10-19-congruent-constant.ll 
b/test/Transforms/IndVarSimplify/2012-10-19-congruent-constant.ll new file mode 100644 index 00000000000..5c478669d29 --- /dev/null +++ b/test/Transforms/IndVarSimplify/2012-10-19-congruent-constant.ll @@ -0,0 +1,27 @@ +; RUN: opt -S -indvars < %s | FileCheck %s + +; PR12627 +define void @test1(i32 %x) nounwind uwtable ssp { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %phi1 = phi i1 [ false, %entry ], [ %cmpa, %for.body ] + %phi2 = phi i1 [ false, %entry ], [ %cmpb, %for.body ] + %i.07 = phi i32 [ 0, %entry ], [ %inc, %for.body ] + tail call void @aux(i1 %phi1, i1 %phi2) nounwind + %cmpa = icmp sgt i32 %i.07, 200 + %cmpb = icmp sgt i32 %i.07, 100 + %inc = add nsw i32 %i.07, 1 + %exitcond = icmp eq i32 %inc, 100 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret void + +; CHECK: @test1 +; CHECK-NOT: phi i1 +; CHECK: call void @aux(i1 false, i1 false) +} + +declare void @aux(i1, i1) diff --git a/test/Transforms/IndVarSimplify/crash.ll b/test/Transforms/IndVarSimplify/crash.ll index 62af42b9d68..1b702a3b1a3 100644 --- a/test/Transforms/IndVarSimplify/crash.ll +++ b/test/Transforms/IndVarSimplify/crash.ll @@ -113,3 +113,21 @@ bb9: ret void } +; PR12536 +define void @fn1() noreturn nounwind { +entry: + br label %for.cond + +for.cond: ; preds = %for.end, %entry + %b.0 = phi i32 [ undef, %entry ], [ %conv, %for.end ] + br label %for.cond1 + +for.cond1: ; preds = %for.cond1, %for.cond + %c.0 = phi i32 [ %b.0, %for.cond1 ], [ 0, %for.cond ] + br i1 undef, label %for.cond1, label %for.end + +for.end: ; preds = %for.cond1 + %cmp2 = icmp slt i32 %c.0, 1 + %conv = zext i1 %cmp2 to i32 + br label %for.cond +} diff --git a/test/Transforms/IndVarSimplify/no-iv-rewrite.ll b/test/Transforms/IndVarSimplify/no-iv-rewrite.ll index bfdd000e38e..507f695e67c 100644 --- a/test/Transforms/IndVarSimplify/no-iv-rewrite.ll +++ b/test/Transforms/IndVarSimplify/no-iv-rewrite.ll @@ -199,7 +199,6 @@ entry: ; back to the loop iv. 
; ; CHECK: loop: -; CHECK: phi i32 ; CHECK-NOT: phi ; CHECK: exit: loop: diff --git a/test/Transforms/InstCombine/obfuscated_splat.ll b/test/Transforms/InstCombine/obfuscated_splat.ll new file mode 100644 index 00000000000..c25dade168a --- /dev/null +++ b/test/Transforms/InstCombine/obfuscated_splat.ll @@ -0,0 +1,11 @@ +; RUN: opt -instcombine -S %s | FileCheck %s + +define void @test(<4 x float> *%in_ptr, <4 x float> *%out_ptr) { + %A = load <4 x float>* %in_ptr, align 16 + %B = shufflevector <4 x float> %A, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 undef, i32 undef> + %C = shufflevector <4 x float> %B, <4 x float> %A, <4 x i32> <i32 0, i32 1, i32 4, i32 undef> + %D = shufflevector <4 x float> %C, <4 x float> %A, <4 x i32> <i32 0, i32 1, i32 2, i32 4> +; CHECK: %D = shufflevector <4 x float> %A, <4 x float> undef, <4 x i32> zeroinitializer + store <4 x float> %D, <4 x float> *%out_ptr + ret void +} diff --git a/test/Transforms/InstCombine/select.ll b/test/Transforms/InstCombine/select.ll index 4baae2618dd..cc3aacdce3c 100644 --- a/test/Transforms/InstCombine/select.ll +++ b/test/Transforms/InstCombine/select.ll @@ -829,3 +829,37 @@ define i1 @test63(i1 %A, i1 %B) { ; CHECK: %C = or i1 %B, %not ; CHECK: ret i1 %C } + +; PR14131 +define void @test64(i32 %p, i16 %b) noreturn nounwind { +entry: + %p.addr.0.insert.mask = and i32 %p, -65536 + %conv2 = and i32 %p, 65535 + br i1 undef, label %lor.rhs, label %lor.end + +lor.rhs: + %p.addr.0.extract.trunc = trunc i32 %p.addr.0.insert.mask to i16 + %phitmp = zext i16 %p.addr.0.extract.trunc to i32 + br label %lor.end + +lor.end: + %t.1 = phi i32 [ 0, %entry ], [ %phitmp, %lor.rhs ] + %conv6 = zext i16 %b to i32 + %div = udiv i32 %conv6, %t.1 + %tobool8 = icmp eq i32 %div, 0 + %cmp = icmp eq i32 %t.1, 0 + %cmp12 = icmp ult i32 %conv2, 2 + %cmp.sink = select i1 %tobool8, i1 %cmp12, i1 %cmp + br i1 %cmp.sink, label %cond.end17, label %cond.false16 + +cond.false16: + br label %cond.end17 + +cond.end17: + br label %while.body + +while.body: + br label %while.body +; CHECK: @test64 +; CHECK-NOT: select +} diff --git a/test/Transforms/InstCombine/strcpy-1.ll b/test/Transforms/InstCombine/strcpy-1.ll new file mode 100644 index 00000000000..b6cf048b2a8 --- /dev/null +++ b/test/Transforms/InstCombine/strcpy-1.ll @@ -0,0 +1,45 @@ +; Test that the strcpy library call simplifier works correctly. +; rdar://6839935 +; RUN: opt < %s -instcombine -S | FileCheck %s +; +; This transformation requires the pointer size, as it assumes that size_t is +; the size of a pointer. 
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32" + +@hello = constant [6 x i8] c"hello\00" +@a = common global [32 x i8] zeroinitializer, align 1 +@b = common global [32 x i8] zeroinitializer, align 1 + +declare i8* @strcpy(i8*, i8*) + +define void @test_simplify1() { +; CHECK: @test_simplify1 + + %dst = getelementptr [32 x i8]* @a, i32 0, i32 0 + %src = getelementptr [6 x i8]* @hello, i32 0, i32 0 + + call i8* @strcpy(i8* %dst, i8* %src) +; CHECK: @llvm.memcpy.p0i8.p0i8.i32 + ret void +} + +define i8* @test_simplify2() { +; CHECK: @test_simplify2 + + %dst = getelementptr [32 x i8]* @a, i32 0, i32 0 + + %ret = call i8* @strcpy(i8* %dst, i8* %dst) +; CHECK: ret i8* getelementptr inbounds ([32 x i8]* @a, i32 0, i32 0) + ret i8* %ret +} + +define i8* @test_no_simplify1() { +; CHECK: @test_no_simplify1 + + %dst = getelementptr [32 x i8]* @a, i32 0, i32 0 + %src = getelementptr [32 x i8]* @b, i32 0, i32 0 + + %ret = call i8* @strcpy(i8* %dst, i8* %src) +; CHECK: call i8* @strcpy + ret i8* %ret +} diff --git a/test/Transforms/InstCombine/strcpy-2.ll b/test/Transforms/InstCombine/strcpy-2.ll new file mode 100644 index 00000000000..779e9fdd959 --- /dev/null +++ b/test/Transforms/InstCombine/strcpy-2.ll @@ -0,0 +1,22 @@ +; Test that the strcpy library call simplifier works correctly. +; RUN: opt < %s -instcombine -S | FileCheck %s +; +; This transformation requires the pointer size, as it assumes that size_t is +; the size of a pointer. +target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32" + +@hello = constant [6 x i8] c"hello\00" +@a = common global [32 x i8] zeroinitializer, align 1 + +declare i16* @strcpy(i8*, i8*) + +define void @test_no_simplify1() { +; CHECK: @test_no_simplify1 + + %dst = getelementptr [32 x i8]* @a, i32 0, i32 0 + %src = getelementptr [6 x i8]* @hello, i32 0, i32 0 + + call i16* @strcpy(i8* %dst, i8* %src) +; CHECK: call i16* @strcpy + ret void +} diff --git a/test/Transforms/InstCombine/strcpy_chk-1.ll b/test/Transforms/InstCombine/strcpy_chk-1.ll index c03e8a348b8..3e48f4fd305 100644 --- a/test/Transforms/InstCombine/strcpy_chk-1.ll +++ b/test/Transforms/InstCombine/strcpy_chk-1.ll @@ -7,16 +7,16 @@ target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f3 @a = common global [60 x i8] zeroinitializer, align 1 @b = common global [60 x i8] zeroinitializer, align 1 -@.str = private constant [8 x i8] c"abcdefg\00" +@.str = private constant [12 x i8] c"abcdefghijk\00" ; Check cases where slen >= strlen (src). 
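; (Here "slen" is the object-size argument of __strcpy_chk. @.str holds
; "abcdefghijk\00", so strlen(src) + 1 == 12; whenever slen is known to be at
; least 12 -- 60 and 12 below, with -1 serving as "unknown/unlimited" -- the
; overflow check is provably redundant and the call can be lowered to a plain
; memcpy of the constant length, which is what the updated CHECK lines expect.)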
define void @test_simplify1() { ; CHECK: @test_simplify1 %dst = getelementptr inbounds [60 x i8]* @a, i32 0, i32 0 - %src = getelementptr inbounds [8 x i8]* @.str, i32 0, i32 0 + %src = getelementptr inbounds [12 x i8]* @.str, i32 0, i32 0 -; CHECK-NEXT: call i8* @strcpy +; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32 call i8* @__strcpy_chk(i8* %dst, i8* %src, i32 60) ret void } @@ -24,19 +24,19 @@ define void @test_simplify1() { define void @test_simplify2() { ; CHECK: @test_simplify2 %dst = getelementptr inbounds [60 x i8]* @a, i32 0, i32 0 - %src = getelementptr inbounds [8 x i8]* @.str, i32 0, i32 0 + %src = getelementptr inbounds [12 x i8]* @.str, i32 0, i32 0 -; CHECK-NEXT: call i8* @strcpy - call i8* @__strcpy_chk(i8* %dst, i8* %src, i32 8) +; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32 + call i8* @__strcpy_chk(i8* %dst, i8* %src, i32 12) ret void } define void @test_simplify3() { ; CHECK: @test_simplify3 %dst = getelementptr inbounds [60 x i8]* @a, i32 0, i32 0 - %src = getelementptr inbounds [8 x i8]* @.str, i32 0, i32 0 + %src = getelementptr inbounds [12 x i8]* @.str, i32 0, i32 0 -; CHECK-NEXT: call i8* @strcpy +; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32 call i8* @__strcpy_chk(i8* %dst, i8* %src, i32 -1) ret void } @@ -53,36 +53,42 @@ define void @test_simplify4() { ret void } -define void @test_no_simplify1() { -; CHECK: @test_no_simplify1 +; Check case where the string length is not constant. + +define void @test_simplify5() { +; CHECK: @test_simplify5 %dst = getelementptr inbounds [60 x i8]* @a, i32 0, i32 0 - %src = getelementptr inbounds [60 x i8]* @b, i32 0, i32 0 + %src = getelementptr inbounds [12 x i8]* @.str, i32 0, i32 0 -; CHECK-NEXT: call i8* @__strcpy_chk - call i8* @__strcpy_chk(i8* %dst, i8* %src, i32 8) +; CHECK: @__memcpy_chk + %len = call i32 @llvm.objectsize.i32(i8* %dst, i1 false) + call i8* @__strcpy_chk(i8* %dst, i8* %src, i32 %len) ret void } -; Check case were slen < strlen (src). +; Check case where the source and destination are the same. -define void @test_no_simplify2() { -; CHECK: @test_no_simplify2 +define i8* @test_simplify6() { +; CHECK: @test_simplify6 %dst = getelementptr inbounds [60 x i8]* @a, i32 0, i32 0 - %src = getelementptr inbounds [8 x i8]* @.str, i32 0, i32 0 -; CHECK-NEXT: call i8* @__strcpy_chk - call i8* @__strcpy_chk(i8* %dst, i8* %src, i32 3) - ret void +; CHECK: getelementptr inbounds ([60 x i8]* @a, i32 0, i32 0) + %len = call i32 @llvm.objectsize.i32(i8* %dst, i1 false) + %ret = call i8* @__strcpy_chk(i8* %dst, i8* %dst, i32 %len) + ret i8* %ret } -define void @test_no_simplify3() { -; CHECK: @test_no_simplify3 +; Check case where slen < strlen (src). + +define void @test_no_simplify1() { +; CHECK: @test_no_simplify1 %dst = getelementptr inbounds [60 x i8]* @a, i32 0, i32 0 - %src = getelementptr inbounds [8 x i8]* @.str, i32 0, i32 0 + %src = getelementptr inbounds [60 x i8]* @b, i32 0, i32 0 ; CHECK-NEXT: call i8* @__strcpy_chk - call i8* @__strcpy_chk(i8* %dst, i8* %src, i32 0) + call i8* @__strcpy_chk(i8* %dst, i8* %src, i32 8) ret void } declare i8* @__strcpy_chk(i8*, i8*, i32) nounwind +declare i32 @llvm.objectsize.i32(i8*, i1) nounwind readonly diff --git a/test/Transforms/LoopVectorize/2012-10-20-infloop.ll b/test/Transforms/LoopVectorize/2012-10-20-infloop.ll new file mode 100644 index 00000000000..5caaffc8dde --- /dev/null +++ b/test/Transforms/LoopVectorize/2012-10-20-infloop.ll @@ -0,0 +1,27 @@ +; RUN: opt < %s -loop-vectorize -dce + +; Check that we don't fall into an infinite loop. 
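+;
+; (The first loop below never leaves %for.body and the second exits only on an
+; undef condition; the test merely requires that -loop-vectorize terminates on
+; such CFGs, which is why the RUN line carries no FileCheck patterns.)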
+define void @test() nounwind { +entry: + br label %for.body + +for.body: + %0 = phi i32 [ 1, %entry ], [ 0, %for.body ] + br label %for.body +} + + + +define void @test2() nounwind { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv47 = phi i64 [ 0, %entry ], [ %indvars.iv.next48, %for.body ] + %0 = phi i32 [ 1, %entry ], [ 0, %for.body ] + %indvars.iv.next48 = add i64 %indvars.iv47, 1 + br i1 undef, label %for.end, label %for.body + +for.end: ; preds = %for.body + unreachable +} diff --git a/test/Transforms/LoopVectorize/gcc-examples.ll b/test/Transforms/LoopVectorize/gcc-examples.ll new file mode 100644 index 00000000000..6fb1792b2c8 --- /dev/null +++ b/test/Transforms/LoopVectorize/gcc-examples.ll @@ -0,0 +1,648 @@ +; RUN: opt < %s -loop-vectorize -dce -instcombine -licm -S | FileCheck %s + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.8.0" + +@b = common global [2048 x i32] zeroinitializer, align 16 +@c = common global [2048 x i32] zeroinitializer, align 16 +@a = common global [2048 x i32] zeroinitializer, align 16 +@G = common global [32 x [1024 x i32]] zeroinitializer, align 16 +@ub = common global [1024 x i32] zeroinitializer, align 16 +@uc = common global [1024 x i32] zeroinitializer, align 16 +@d = common global [2048 x i32] zeroinitializer, align 16 +@fa = common global [1024 x float] zeroinitializer, align 16 +@fb = common global [1024 x float] zeroinitializer, align 16 +@ic = common global [1024 x i32] zeroinitializer, align 16 +@da = common global [1024 x float] zeroinitializer, align 16 +@db = common global [1024 x float] zeroinitializer, align 16 +@dc = common global [1024 x float] zeroinitializer, align 16 +@dd = common global [1024 x float] zeroinitializer, align 16 +@dj = common global [1024 x i32] zeroinitializer, align 16 + +;CHECK: @example1 +;CHECK: load <4 x i32> +;CHECK: add <4 x i32> +;CHECK: store <4 x i32> +;CHECK: ret void +define void @example1() nounwind uwtable ssp { + br label %1 + +; <label>:1 ; preds = %1, %0 + %indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %1 ] + %2 = getelementptr inbounds [2048 x i32]* @b, i64 0, i64 %indvars.iv + %3 = load i32* %2, align 4 + %4 = getelementptr inbounds [2048 x i32]* @c, i64 0, i64 %indvars.iv + %5 = load i32* %4, align 4 + %6 = add nsw i32 %5, %3 + %7 = getelementptr inbounds [2048 x i32]* @a, i64 0, i64 %indvars.iv + store i32 %6, i32* %7, align 4 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, 256 + br i1 %exitcond, label %8, label %1 + +; <label>:8 ; preds = %1 + ret void +} + +;CHECK: @example2 +;CHECK: store <4 x i32> +;CHECK: ret void +define void @example2(i32 %n, i32 %x) nounwind uwtable ssp { + %1 = icmp sgt i32 %n, 0 + br i1 %1, label %.lr.ph5, label %.preheader + +..preheader_crit_edge: ; preds = %.lr.ph5 + %phitmp = sext i32 %n to i64 + br label %.preheader + +.preheader: ; preds = %..preheader_crit_edge, %0 + %i.0.lcssa = phi i64 [ %phitmp, %..preheader_crit_edge ], [ 0, %0 ] + %2 = icmp eq i32 %n, 0 + br i1 %2, label %._crit_edge, label %.lr.ph + +.lr.ph5: ; preds = %0, %.lr.ph5 + %indvars.iv6 = phi i64 [ %indvars.iv.next7, %.lr.ph5 ], [ 0, %0 ] + %3 = getelementptr inbounds [2048 x i32]* @b, i64 0, i64 %indvars.iv6 + store i32 %x, i32* %3, align 4 + %indvars.iv.next7 = add i64 %indvars.iv6, 1 + %lftr.wideiv = trunc i64 
%indvars.iv.next7 to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %n + br i1 %exitcond, label %..preheader_crit_edge, label %.lr.ph5 + +.lr.ph: ; preds = %.preheader, %.lr.ph + %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ %i.0.lcssa, %.preheader ] + %.02 = phi i32 [ %4, %.lr.ph ], [ %n, %.preheader ] + %4 = add nsw i32 %.02, -1 + %5 = getelementptr inbounds [2048 x i32]* @b, i64 0, i64 %indvars.iv + %6 = load i32* %5, align 4 + %7 = getelementptr inbounds [2048 x i32]* @c, i64 0, i64 %indvars.iv + %8 = load i32* %7, align 4 + %9 = and i32 %8, %6 + %10 = getelementptr inbounds [2048 x i32]* @a, i64 0, i64 %indvars.iv + store i32 %9, i32* %10, align 4 + %indvars.iv.next = add i64 %indvars.iv, 1 + %11 = icmp eq i32 %4, 0 + br i1 %11, label %._crit_edge, label %.lr.ph + +._crit_edge: ; preds = %.lr.ph, %.preheader + ret void +} + +; We can't vectorize this loop because it has non constant loop bounds. +;CHECK: @example3 +;CHECK-NOT: <4 x i32> +;CHECK: ret void +define void @example3(i32 %n, i32* noalias nocapture %p, i32* noalias nocapture %q) nounwind uwtable ssp { + %1 = icmp eq i32 %n, 0 + br i1 %1, label %._crit_edge, label %.lr.ph + +.lr.ph: ; preds = %0, %.lr.ph + %.05 = phi i32 [ %2, %.lr.ph ], [ %n, %0 ] + %.014 = phi i32* [ %5, %.lr.ph ], [ %p, %0 ] + %.023 = phi i32* [ %3, %.lr.ph ], [ %q, %0 ] + %2 = add nsw i32 %.05, -1 + %3 = getelementptr inbounds i32* %.023, i64 1 + %4 = load i32* %.023, align 16 + %5 = getelementptr inbounds i32* %.014, i64 1 + store i32 %4, i32* %.014, align 16 + %6 = icmp eq i32 %2, 0 + br i1 %6, label %._crit_edge, label %.lr.ph + +._crit_edge: ; preds = %.lr.ph, %0 + ret void +} + +;CHECK: @example4 +;CHECK: load <4 x i32> +;CHECK: ret void +define void @example4(i32 %n, i32* noalias nocapture %p, i32* noalias nocapture %q) nounwind uwtable ssp { + %1 = add nsw i32 %n, -1 + %2 = icmp eq i32 %n, 0 + br i1 %2, label %.preheader4, label %.lr.ph10 + +.preheader4: ; preds = %0 + %3 = icmp sgt i32 %1, 0 + br i1 %3, label %.lr.ph6, label %._crit_edge + +.lr.ph10: ; preds = %0, %.lr.ph10 + %4 = phi i32 [ %9, %.lr.ph10 ], [ %1, %0 ] + %.018 = phi i32* [ %8, %.lr.ph10 ], [ %p, %0 ] + %.027 = phi i32* [ %5, %.lr.ph10 ], [ %q, %0 ] + %5 = getelementptr inbounds i32* %.027, i64 1 + %6 = load i32* %.027, align 16 + %7 = add nsw i32 %6, 5 + %8 = getelementptr inbounds i32* %.018, i64 1 + store i32 %7, i32* %.018, align 16 + %9 = add nsw i32 %4, -1 + %10 = icmp eq i32 %4, 0 + br i1 %10, label %._crit_edge, label %.lr.ph10 + +.preheader: ; preds = %.lr.ph6 + br i1 %3, label %.lr.ph, label %._crit_edge + +.lr.ph6: ; preds = %.preheader4, %.lr.ph6 + %indvars.iv11 = phi i64 [ %indvars.iv.next12, %.lr.ph6 ], [ 0, %.preheader4 ] + %indvars.iv.next12 = add i64 %indvars.iv11, 1 + %11 = getelementptr inbounds [2048 x i32]* @b, i64 0, i64 %indvars.iv.next12 + %12 = load i32* %11, align 4 + %13 = add nsw i64 %indvars.iv11, 3 + %14 = getelementptr inbounds [2048 x i32]* @c, i64 0, i64 %13 + %15 = load i32* %14, align 4 + %16 = add nsw i32 %15, %12 + %17 = getelementptr inbounds [2048 x i32]* @a, i64 0, i64 %indvars.iv11 + store i32 %16, i32* %17, align 4 + %lftr.wideiv13 = trunc i64 %indvars.iv.next12 to i32 + %exitcond14 = icmp eq i32 %lftr.wideiv13, %1 + br i1 %exitcond14, label %.preheader, label %.lr.ph6 + +.lr.ph: ; preds = %.preheader, %.lr.ph + %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %.preheader ] + %18 = getelementptr inbounds [2048 x i32]* @a, i64 0, i64 %indvars.iv + %19 = load i32* %18, align 4 + %20 = icmp sgt i32 %19, 4 + %21 = select i1 %20, 
i32 4, i32 0 + %22 = getelementptr inbounds [2048 x i32]* @b, i64 0, i64 %indvars.iv + store i32 %21, i32* %22, align 4 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %1 + br i1 %exitcond, label %._crit_edge, label %.lr.ph + +._crit_edge: ; preds = %.lr.ph10, %.preheader4, %.lr.ph, %.preheader + ret void +} + +;CHECK: @example8 +;CHECK: store <4 x i32> +;CHECK: ret void +define void @example8(i32 %x) nounwind uwtable ssp { + br label %.preheader + +.preheader: ; preds = %3, %0 + %indvars.iv3 = phi i64 [ 0, %0 ], [ %indvars.iv.next4, %3 ] + br label %1 + +; <label>:1 ; preds = %1, %.preheader + %indvars.iv = phi i64 [ 0, %.preheader ], [ %indvars.iv.next, %1 ] + %2 = getelementptr inbounds [32 x [1024 x i32]]* @G, i64 0, i64 %indvars.iv3, i64 %indvars.iv + store i32 %x, i32* %2, align 4 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, 1024 + br i1 %exitcond, label %3, label %1 + +; <label>:3 ; preds = %1 + %indvars.iv.next4 = add i64 %indvars.iv3, 1 + %lftr.wideiv5 = trunc i64 %indvars.iv.next4 to i32 + %exitcond6 = icmp eq i32 %lftr.wideiv5, 32 + br i1 %exitcond6, label %4, label %.preheader + +; <label>:4 ; preds = %3 + ret void +} + +;CHECK: @example9 +;CHECK: phi <4 x i32> +;CHECK: ret i32 +define i32 @example9() nounwind uwtable readonly ssp { + br label %1 + +; <label>:1 ; preds = %1, %0 + %indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %1 ] + %diff.01 = phi i32 [ 0, %0 ], [ %7, %1 ] + %2 = getelementptr inbounds [1024 x i32]* @ub, i64 0, i64 %indvars.iv + %3 = load i32* %2, align 4 + %4 = getelementptr inbounds [1024 x i32]* @uc, i64 0, i64 %indvars.iv + %5 = load i32* %4, align 4 + %6 = add i32 %3, %diff.01 + %7 = sub i32 %6, %5 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, 1024 + br i1 %exitcond, label %8, label %1 + +; <label>:8 ; preds = %1 + ret i32 %7 +} + +;CHECK: @example10a +;CHECK: load <4 x i16> +;CHECK: add <4 x i16> +;CHECK: store <4 x i16> +;CHECK: ret void +define void @example10a(i16* noalias nocapture %sa, i16* noalias nocapture %sb, i16* noalias nocapture %sc, i32* noalias nocapture %ia, i32* noalias nocapture %ib, i32* noalias nocapture %ic) nounwind uwtable ssp { + br label %1 + +; <label>:1 ; preds = %1, %0 + %indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %1 ] + %2 = getelementptr inbounds i32* %ib, i64 %indvars.iv + %3 = load i32* %2, align 4 + %4 = getelementptr inbounds i32* %ic, i64 %indvars.iv + %5 = load i32* %4, align 4 + %6 = add nsw i32 %5, %3 + %7 = getelementptr inbounds i32* %ia, i64 %indvars.iv + store i32 %6, i32* %7, align 4 + %8 = getelementptr inbounds i16* %sb, i64 %indvars.iv + %9 = load i16* %8, align 2 + %10 = getelementptr inbounds i16* %sc, i64 %indvars.iv + %11 = load i16* %10, align 2 + %12 = add i16 %11, %9 + %13 = getelementptr inbounds i16* %sa, i64 %indvars.iv + store i16 %12, i16* %13, align 2 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, 1024 + br i1 %exitcond, label %14, label %1 + +; <label>:14 ; preds = %1 + ret void +} + +;CHECK: @example10b +;CHECK: load <4 x i16> +;CHECK: sext <4 x i16> +;CHECK: store <4 x i32> +;CHECK: ret void +define void @example10b(i16* noalias nocapture %sa, i16* noalias nocapture %sb, i16* noalias nocapture %sc, i32* noalias nocapture %ia, i32* 
noalias nocapture %ib, i32* noalias nocapture %ic) nounwind uwtable ssp { + br label %1 + +; <label>:1 ; preds = %1, %0 + %indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %1 ] + %2 = getelementptr inbounds i16* %sb, i64 %indvars.iv + %3 = load i16* %2, align 2 + %4 = sext i16 %3 to i32 + %5 = getelementptr inbounds i32* %ia, i64 %indvars.iv + store i32 %4, i32* %5, align 4 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, 1024 + br i1 %exitcond, label %6, label %1 + +; <label>:6 ; preds = %1 + ret void +} + +;CHECK: @example11 +;CHECK: load i32 +;CHECK: load i32 +;CHECK: load i32 +;CHECK: load i32 +;CHECK: insertelement +;CHECK: insertelement +;CHECK: insertelement +;CHECK: insertelement +;CHECK: ret void +define void @example11() nounwind uwtable ssp { + br label %1 + +; <label>:1 ; preds = %1, %0 + %indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %1 ] + %2 = shl nsw i64 %indvars.iv, 1 + %3 = or i64 %2, 1 + %4 = getelementptr inbounds [2048 x i32]* @b, i64 0, i64 %3 + %5 = load i32* %4, align 4 + %6 = getelementptr inbounds [2048 x i32]* @c, i64 0, i64 %3 + %7 = load i32* %6, align 4 + %8 = mul nsw i32 %7, %5 + %9 = getelementptr inbounds [2048 x i32]* @b, i64 0, i64 %2 + %10 = load i32* %9, align 8 + %11 = getelementptr inbounds [2048 x i32]* @c, i64 0, i64 %2 + %12 = load i32* %11, align 8 + %13 = mul nsw i32 %12, %10 + %14 = sub nsw i32 %8, %13 + %15 = getelementptr inbounds [2048 x i32]* @a, i64 0, i64 %indvars.iv + store i32 %14, i32* %15, align 4 + %16 = mul nsw i32 %7, %10 + %17 = mul nsw i32 %12, %5 + %18 = add nsw i32 %17, %16 + %19 = getelementptr inbounds [2048 x i32]* @d, i64 0, i64 %indvars.iv + store i32 %18, i32* %19, align 4 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, 512 + br i1 %exitcond, label %20, label %1 + +; <label>:20 ; preds = %1 + ret void +} + +;CHECK: @example12 +;CHECK: trunc <4 x i64> +;CHECK: store <4 x i32> +;CHECK: ret void +define void @example12() nounwind uwtable ssp { + br label %1 + +; <label>:1 ; preds = %1, %0 + %indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %1 ] + %2 = getelementptr inbounds [2048 x i32]* @a, i64 0, i64 %indvars.iv + %3 = trunc i64 %indvars.iv to i32 + store i32 %3, i32* %2, align 4 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, 1024 + br i1 %exitcond, label %4, label %1 + +; <label>:4 ; preds = %1 + ret void +} + +; Can't vectorize because of reductions. 
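+; (Roughly: for each of 32 rows, the inner loop accumulates A[i][j] - B[i][j]
+; with stride 8 into a scalar that is stored to out[i] once the inner loop
+; finishes; per the comment above, that scalar reduction is what blocks
+; vectorization at this point.)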
+;CHECK: @example13 +;CHECK-NOT: <4 x i32> +;CHECK: ret void +define void @example13(i32** nocapture %A, i32** nocapture %B, i32* nocapture %out) nounwind uwtable ssp { + br label %.preheader + +.preheader: ; preds = %14, %0 + %indvars.iv4 = phi i64 [ 0, %0 ], [ %indvars.iv.next5, %14 ] + %1 = getelementptr inbounds i32** %A, i64 %indvars.iv4 + %2 = load i32** %1, align 8 + %3 = getelementptr inbounds i32** %B, i64 %indvars.iv4 + %4 = load i32** %3, align 8 + br label %5 + +; <label>:5 ; preds = %.preheader, %5 + %indvars.iv = phi i64 [ 0, %.preheader ], [ %indvars.iv.next, %5 ] + %diff.02 = phi i32 [ 0, %.preheader ], [ %11, %5 ] + %6 = getelementptr inbounds i32* %2, i64 %indvars.iv + %7 = load i32* %6, align 4 + %8 = getelementptr inbounds i32* %4, i64 %indvars.iv + %9 = load i32* %8, align 4 + %10 = add i32 %7, %diff.02 + %11 = sub i32 %10, %9 + %indvars.iv.next = add i64 %indvars.iv, 8 + %12 = trunc i64 %indvars.iv.next to i32 + %13 = icmp slt i32 %12, 1024 + br i1 %13, label %5, label %14 + +; <label>:14 ; preds = %5 + %15 = getelementptr inbounds i32* %out, i64 %indvars.iv4 + store i32 %11, i32* %15, align 4 + %indvars.iv.next5 = add i64 %indvars.iv4, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next5 to i32 + %exitcond = icmp eq i32 %lftr.wideiv, 32 + br i1 %exitcond, label %16, label %.preheader + +; <label>:16 ; preds = %14 + ret void +} + +; Can't vectorize because of reductions. +;CHECK: @example14 +;CHECK-NOT: <4 x i32> +;CHECK: ret void +define void @example14(i32** nocapture %in, i32** nocapture %coeff, i32* nocapture %out) nounwind uwtable ssp { +.preheader3: + br label %.preheader + +.preheader: ; preds = %11, %.preheader3 + %indvars.iv7 = phi i64 [ 0, %.preheader3 ], [ %indvars.iv.next8, %11 ] + %sum.05 = phi i32 [ 0, %.preheader3 ], [ %10, %11 ] + br label %0 + +; <label>:0 ; preds = %0, %.preheader + %indvars.iv = phi i64 [ 0, %.preheader ], [ %indvars.iv.next, %0 ] + %sum.12 = phi i32 [ %sum.05, %.preheader ], [ %10, %0 ] + %1 = getelementptr inbounds i32** %in, i64 %indvars.iv + %2 = load i32** %1, align 8 + %3 = getelementptr inbounds i32* %2, i64 %indvars.iv7 + %4 = load i32* %3, align 4 + %5 = getelementptr inbounds i32** %coeff, i64 %indvars.iv + %6 = load i32** %5, align 8 + %7 = getelementptr inbounds i32* %6, i64 %indvars.iv7 + %8 = load i32* %7, align 4 + %9 = mul nsw i32 %8, %4 + %10 = add nsw i32 %9, %sum.12 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, 1024 + br i1 %exitcond, label %11, label %0 + +; <label>:11 ; preds = %0 + %indvars.iv.next8 = add i64 %indvars.iv7, 1 + %lftr.wideiv9 = trunc i64 %indvars.iv.next8 to i32 + %exitcond10 = icmp eq i32 %lftr.wideiv9, 32 + br i1 %exitcond10, label %.preheader3.1, label %.preheader + +.preheader3.1: ; preds = %11 + store i32 %10, i32* %out, align 4 + br label %.preheader.1 + +.preheader.1: ; preds = %24, %.preheader3.1 + %indvars.iv7.1 = phi i64 [ 0, %.preheader3.1 ], [ %indvars.iv.next8.1, %24 ] + %sum.05.1 = phi i32 [ 0, %.preheader3.1 ], [ %23, %24 ] + br label %12 + +; <label>:12 ; preds = %12, %.preheader.1 + %indvars.iv.1 = phi i64 [ 0, %.preheader.1 ], [ %13, %12 ] + %sum.12.1 = phi i32 [ %sum.05.1, %.preheader.1 ], [ %23, %12 ] + %13 = add nsw i64 %indvars.iv.1, 1 + %14 = getelementptr inbounds i32** %in, i64 %13 + %15 = load i32** %14, align 8 + %16 = getelementptr inbounds i32* %15, i64 %indvars.iv7.1 + %17 = load i32* %16, align 4 + %18 = getelementptr inbounds i32** %coeff, i64 %indvars.iv.1 + %19 = load i32** %18, 
align 8 + %20 = getelementptr inbounds i32* %19, i64 %indvars.iv7.1 + %21 = load i32* %20, align 4 + %22 = mul nsw i32 %21, %17 + %23 = add nsw i32 %22, %sum.12.1 + %lftr.wideiv.1 = trunc i64 %13 to i32 + %exitcond.1 = icmp eq i32 %lftr.wideiv.1, 1024 + br i1 %exitcond.1, label %24, label %12 + +; <label>:24 ; preds = %12 + %indvars.iv.next8.1 = add i64 %indvars.iv7.1, 1 + %lftr.wideiv9.1 = trunc i64 %indvars.iv.next8.1 to i32 + %exitcond10.1 = icmp eq i32 %lftr.wideiv9.1, 32 + br i1 %exitcond10.1, label %.preheader3.2, label %.preheader.1 + +.preheader3.2: ; preds = %24 + %25 = getelementptr inbounds i32* %out, i64 1 + store i32 %23, i32* %25, align 4 + br label %.preheader.2 + +.preheader.2: ; preds = %38, %.preheader3.2 + %indvars.iv7.2 = phi i64 [ 0, %.preheader3.2 ], [ %indvars.iv.next8.2, %38 ] + %sum.05.2 = phi i32 [ 0, %.preheader3.2 ], [ %37, %38 ] + br label %26 + +; <label>:26 ; preds = %26, %.preheader.2 + %indvars.iv.2 = phi i64 [ 0, %.preheader.2 ], [ %indvars.iv.next.2, %26 ] + %sum.12.2 = phi i32 [ %sum.05.2, %.preheader.2 ], [ %37, %26 ] + %27 = add nsw i64 %indvars.iv.2, 2 + %28 = getelementptr inbounds i32** %in, i64 %27 + %29 = load i32** %28, align 8 + %30 = getelementptr inbounds i32* %29, i64 %indvars.iv7.2 + %31 = load i32* %30, align 4 + %32 = getelementptr inbounds i32** %coeff, i64 %indvars.iv.2 + %33 = load i32** %32, align 8 + %34 = getelementptr inbounds i32* %33, i64 %indvars.iv7.2 + %35 = load i32* %34, align 4 + %36 = mul nsw i32 %35, %31 + %37 = add nsw i32 %36, %sum.12.2 + %indvars.iv.next.2 = add i64 %indvars.iv.2, 1 + %lftr.wideiv.2 = trunc i64 %indvars.iv.next.2 to i32 + %exitcond.2 = icmp eq i32 %lftr.wideiv.2, 1024 + br i1 %exitcond.2, label %38, label %26 + +; <label>:38 ; preds = %26 + %indvars.iv.next8.2 = add i64 %indvars.iv7.2, 1 + %lftr.wideiv9.2 = trunc i64 %indvars.iv.next8.2 to i32 + %exitcond10.2 = icmp eq i32 %lftr.wideiv9.2, 32 + br i1 %exitcond10.2, label %.preheader3.3, label %.preheader.2 + +.preheader3.3: ; preds = %38 + %39 = getelementptr inbounds i32* %out, i64 2 + store i32 %37, i32* %39, align 4 + br label %.preheader.3 + +.preheader.3: ; preds = %52, %.preheader3.3 + %indvars.iv7.3 = phi i64 [ 0, %.preheader3.3 ], [ %indvars.iv.next8.3, %52 ] + %sum.05.3 = phi i32 [ 0, %.preheader3.3 ], [ %51, %52 ] + br label %40 + +; <label>:40 ; preds = %40, %.preheader.3 + %indvars.iv.3 = phi i64 [ 0, %.preheader.3 ], [ %indvars.iv.next.3, %40 ] + %sum.12.3 = phi i32 [ %sum.05.3, %.preheader.3 ], [ %51, %40 ] + %41 = add nsw i64 %indvars.iv.3, 3 + %42 = getelementptr inbounds i32** %in, i64 %41 + %43 = load i32** %42, align 8 + %44 = getelementptr inbounds i32* %43, i64 %indvars.iv7.3 + %45 = load i32* %44, align 4 + %46 = getelementptr inbounds i32** %coeff, i64 %indvars.iv.3 + %47 = load i32** %46, align 8 + %48 = getelementptr inbounds i32* %47, i64 %indvars.iv7.3 + %49 = load i32* %48, align 4 + %50 = mul nsw i32 %49, %45 + %51 = add nsw i32 %50, %sum.12.3 + %indvars.iv.next.3 = add i64 %indvars.iv.3, 1 + %lftr.wideiv.3 = trunc i64 %indvars.iv.next.3 to i32 + %exitcond.3 = icmp eq i32 %lftr.wideiv.3, 1024 + br i1 %exitcond.3, label %52, label %40 + +; <label>:52 ; preds = %40 + %indvars.iv.next8.3 = add i64 %indvars.iv7.3, 1 + %lftr.wideiv9.3 = trunc i64 %indvars.iv.next8.3 to i32 + %exitcond10.3 = icmp eq i32 %lftr.wideiv9.3, 32 + br i1 %exitcond10.3, label %53, label %.preheader.3 + +; <label>:53 ; preds = %52 + %54 = getelementptr inbounds i32* %out, i64 3 + store i32 %51, i32* %54, align 4 + ret void +} + +; Can't vectorize because 
the src and dst pointers are not disjoint. +;CHECK: @example21 +;CHECK-NOT: <4 x i32> +;CHECK: ret i32 +define i32 @example21(i32* nocapture %b, i32 %n) nounwind uwtable readonly ssp { + %1 = icmp sgt i32 %n, 0 + br i1 %1, label %.lr.ph, label %._crit_edge + +.lr.ph: ; preds = %0 + %2 = sext i32 %n to i64 + br label %3 + +; <label>:3 ; preds = %.lr.ph, %3 + %indvars.iv = phi i64 [ %2, %.lr.ph ], [ %indvars.iv.next, %3 ] + %a.02 = phi i32 [ 0, %.lr.ph ], [ %6, %3 ] + %indvars.iv.next = add i64 %indvars.iv, -1 + %4 = getelementptr inbounds i32* %b, i64 %indvars.iv.next + %5 = load i32* %4, align 4 + %6 = add nsw i32 %5, %a.02 + %7 = trunc i64 %indvars.iv.next to i32 + %8 = icmp sgt i32 %7, 0 + br i1 %8, label %3, label %._crit_edge + +._crit_edge: ; preds = %3, %0 + %a.0.lcssa = phi i32 [ 0, %0 ], [ %6, %3 ] + ret i32 %a.0.lcssa +} + +; Can't vectorize because there are multiple PHIs. +;CHECK: @example23 +;CHECK-NOT: <4 x i32> +;CHECK: ret void +define void @example23(i16* nocapture %src, i32* nocapture %dst) nounwind uwtable ssp { + br label %1 + +; <label>:1 ; preds = %1, %0 + %.04 = phi i16* [ %src, %0 ], [ %2, %1 ] + %.013 = phi i32* [ %dst, %0 ], [ %6, %1 ] + %i.02 = phi i32 [ 0, %0 ], [ %7, %1 ] + %2 = getelementptr inbounds i16* %.04, i64 1 + %3 = load i16* %.04, align 2 + %4 = zext i16 %3 to i32 + %5 = shl nuw nsw i32 %4, 7 + %6 = getelementptr inbounds i32* %.013, i64 1 + store i32 %5, i32* %.013, align 4 + %7 = add nsw i32 %i.02, 1 + %exitcond = icmp eq i32 %7, 256 + br i1 %exitcond, label %8, label %1 + +; <label>:8 ; preds = %1 + ret void +} + +;CHECK: @example24 +;CHECK: shufflevector <4 x i16> +;CHECK: ret void +define void @example24(i16 signext %x, i16 signext %y) nounwind uwtable ssp { + br label %1 + +; <label>:1 ; preds = %1, %0 + %indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %1 ] + %2 = getelementptr inbounds [1024 x float]* @fa, i64 0, i64 %indvars.iv + %3 = load float* %2, align 4 + %4 = getelementptr inbounds [1024 x float]* @fb, i64 0, i64 %indvars.iv + %5 = load float* %4, align 4 + %6 = fcmp olt float %3, %5 + %x.y = select i1 %6, i16 %x, i16 %y + %7 = sext i16 %x.y to i32 + %8 = getelementptr inbounds [1024 x i32]* @ic, i64 0, i64 %indvars.iv + store i32 %7, i32* %8, align 4 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, 1024 + br i1 %exitcond, label %9, label %1 + +; <label>:9 ; preds = %1 + ret void +} + +;CHECK: @example25 +;CHECK: and <4 x i1> +;CHECK: zext <4 x i1> +;CHECK: ret void +define void @example25() nounwind uwtable ssp { + br label %1 + +; <label>:1 ; preds = %1, %0 + %indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %1 ] + %2 = getelementptr inbounds [1024 x float]* @da, i64 0, i64 %indvars.iv + %3 = load float* %2, align 4 + %4 = getelementptr inbounds [1024 x float]* @db, i64 0, i64 %indvars.iv + %5 = load float* %4, align 4 + %6 = fcmp olt float %3, %5 + %7 = getelementptr inbounds [1024 x float]* @dc, i64 0, i64 %indvars.iv + %8 = load float* %7, align 4 + %9 = getelementptr inbounds [1024 x float]* @dd, i64 0, i64 %indvars.iv + %10 = load float* %9, align 4 + %11 = fcmp olt float %8, %10 + %12 = and i1 %6, %11 + %13 = zext i1 %12 to i32 + %14 = getelementptr inbounds [1024 x i32]* @dj, i64 0, i64 %indvars.iv + store i32 %13, i32* %14, align 4 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, 1024 + br i1 %exitcond, label %15, label %1 + +; <label>:15 ; preds = %1 + 
ret void +} + diff --git a/test/Transforms/LoopVectorize/increment.ll b/test/Transforms/LoopVectorize/increment.ll new file mode 100644 index 00000000000..e818d685626 --- /dev/null +++ b/test/Transforms/LoopVectorize/increment.ll @@ -0,0 +1,66 @@ +; RUN: opt < %s -loop-vectorize -dce -instcombine -licm -S | FileCheck %s + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.8.0" + +@a = common global [2048 x i32] zeroinitializer, align 16 + +; This is the loop. +; for (i=0; i<n; i++){ +; a[i] += i; +; } +;CHECK: @inc +;CHECK: load <4 x i32> +;CHECK: add <4 x i32> +;CHECK: store <4 x i32> +;CHECK: ret void +define void @inc(i32 %n) nounwind uwtable noinline ssp { + %1 = icmp sgt i32 %n, 0 + br i1 %1, label %.lr.ph, label %._crit_edge + +.lr.ph: ; preds = %0, %.lr.ph + %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ] + %2 = getelementptr inbounds [2048 x i32]* @a, i64 0, i64 %indvars.iv + %3 = load i32* %2, align 4 + %4 = trunc i64 %indvars.iv to i32 + %5 = add nsw i32 %3, %4 + store i32 %5, i32* %2, align 4 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %n + br i1 %exitcond, label %._crit_edge, label %.lr.ph + +._crit_edge: ; preds = %.lr.ph, %0 + ret void +} + +; Can't vectorize this loop because the access to A[X] is non linear. +; +; for (i = 0; i < n; ++i) { +; A[B[i]]++; +; +;CHECK: @histogram +;CHECK-NOT: <4 x i32> +;CHECK: ret i32 +define i32 @histogram(i32* nocapture noalias %A, i32* nocapture noalias %B, i32 %n) nounwind uwtable ssp { +entry: + %cmp6 = icmp sgt i32 %n, 0 + br i1 %cmp6, label %for.body, label %for.end + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds i32* %B, i64 %indvars.iv + %0 = load i32* %arrayidx, align 4 + %idxprom1 = sext i32 %0 to i64 + %arrayidx2 = getelementptr inbounds i32* %A, i64 %idxprom1 + %1 = load i32* %arrayidx2, align 4 + %inc = add nsw i32 %1, 1 + store i32 %inc, i32* %arrayidx2, align 4 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %n + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret i32 0 +} diff --git a/test/Transforms/LoopVectorize/induction_plus.ll b/test/Transforms/LoopVectorize/induction_plus.ll new file mode 100644 index 00000000000..bd90113e523 --- /dev/null +++ b/test/Transforms/LoopVectorize/induction_plus.ll @@ -0,0 +1,30 @@ +; RUN: opt < %s -loop-vectorize -instcombine -S | FileCheck %s + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.8.0" + +@array = common global [1024 x i32] zeroinitializer, align 16 + +;CHECK: @array_at_plus_one +;CHECK: add <4 x i64> +;CHECK: trunc <4 x i64> +;CHECK: add i64 %index, 12 +;CHECK: ret i32 +define i32 @array_at_plus_one(i32 %n) nounwind uwtable ssp { + %1 = icmp sgt i32 %n, 0 + br i1 %1, label %.lr.ph, label %._crit_edge + +.lr.ph: ; preds = %0, %.lr.ph + %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ] + %2 = add nsw i64 %indvars.iv, 12 + %3 = getelementptr inbounds [1024 x i32]* @array, i64 0, i64 %2 + %4 = trunc i64 %indvars.iv to i32 + 
store i32 %4, i32* %3, align 4 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %n + br i1 %exitcond, label %._crit_edge, label %.lr.ph + +._crit_edge: ; preds = %.lr.ph, %0 + ret i32 undef +} diff --git a/test/Transforms/LoopVectorize/lit.local.cfg b/test/Transforms/LoopVectorize/lit.local.cfg new file mode 100644 index 00000000000..19eebc0ac7a --- /dev/null +++ b/test/Transforms/LoopVectorize/lit.local.cfg @@ -0,0 +1 @@ +config.suffixes = ['.ll', '.c', '.cpp'] diff --git a/test/Transforms/LoopVectorize/non-const-n.ll b/test/Transforms/LoopVectorize/non-const-n.ll new file mode 100644 index 00000000000..04c5c84a4f4 --- /dev/null +++ b/test/Transforms/LoopVectorize/non-const-n.ll @@ -0,0 +1,38 @@ +; RUN: opt < %s -loop-vectorize -dce -instcombine -licm -S | FileCheck %s + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.8.0" + +@b = common global [2048 x i32] zeroinitializer, align 16 +@c = common global [2048 x i32] zeroinitializer, align 16 +@a = common global [2048 x i32] zeroinitializer, align 16 + +;CHECK: @example1 +;CHECK: shl i32 +;CHECK: zext i32 +;CHECK: load <4 x i32> +;CHECK: add <4 x i32> +;CHECK: store <4 x i32> +;CHECK: ret void +define void @example1(i32 %n) nounwind uwtable ssp { + %n4 = shl i32 %n, 2 + br label %1 + +; <label>:1 ; preds = %1, %0 + %indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %1 ] + %2 = getelementptr inbounds [2048 x i32]* @b, i64 0, i64 %indvars.iv + %3 = load i32* %2, align 4 + %4 = getelementptr inbounds [2048 x i32]* @c, i64 0, i64 %indvars.iv + %5 = load i32* %4, align 4 + %6 = add nsw i32 %5, %3 + %7 = getelementptr inbounds [2048 x i32]* @a, i64 0, i64 %indvars.iv + store i32 %6, i32* %7, align 4 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %n4 + br i1 %exitcond, label %8, label %1 + +; <label>:8 ; preds = %1 + ret void +} + diff --git a/test/Transforms/LoopVectorize/read-only.ll b/test/Transforms/LoopVectorize/read-only.ll new file mode 100644 index 00000000000..4095ea68ef7 --- /dev/null +++ b/test/Transforms/LoopVectorize/read-only.ll @@ -0,0 +1,32 @@ +; RUN: opt < %s -loop-vectorize -dce -instcombine -licm -S | FileCheck %s + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.8.0" + +;CHECK: @read_only_func +;CHECK: load <4 x i32> +;CHECK: ret i32 +define i32 @read_only_func(i32* nocapture %A, i32* nocapture %B, i32 %n) nounwind uwtable readonly ssp { + %1 = icmp sgt i32 %n, 0 + br i1 %1, label %.lr.ph, label %._crit_edge + +.lr.ph: ; preds = %0, %.lr.ph + %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ] + %sum.02 = phi i32 [ %9, %.lr.ph ], [ 0, %0 ] + %2 = getelementptr inbounds i32* %A, i64 %indvars.iv + %3 = load i32* %2, align 4 + %4 = add nsw i64 %indvars.iv, 13 + %5 = getelementptr inbounds i32* %B, i64 %4 + %6 = load i32* %5, align 4 + %7 = shl i32 %6, 1 + %8 = add i32 %3, %sum.02 + %9 = add i32 %8, %7 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %n + br i1 %exitcond, label %._crit_edge, label %.lr.ph + +._crit_edge: ; preds = %.lr.ph, %0 + 
%sum.0.lcssa = phi i32 [ 0, %0 ], [ %9, %.lr.ph ] + ret i32 %sum.0.lcssa +} diff --git a/test/Transforms/LoopVectorize/reduction.ll b/test/Transforms/LoopVectorize/reduction.ll new file mode 100644 index 00000000000..3e871b229b5 --- /dev/null +++ b/test/Transforms/LoopVectorize/reduction.ll @@ -0,0 +1,152 @@ +; RUN: opt < %s -loop-vectorize -dce -instcombine -licm -S | FileCheck %s + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.8.0" + +;CHECK: @reduction_sum +;CHECK: phi <4 x i32> +;CHECK: load <4 x i32> +;CHECK: add <4 x i32> +;CHECK: ret i32 +define i32 @reduction_sum(i32 %n, i32* noalias nocapture %A, i32* noalias nocapture %B) nounwind uwtable readonly noinline ssp { + %1 = icmp sgt i32 %n, 0 + br i1 %1, label %.lr.ph, label %._crit_edge + +.lr.ph: ; preds = %0, %.lr.ph + %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ] + %sum.02 = phi i32 [ %9, %.lr.ph ], [ 0, %0 ] + %2 = getelementptr inbounds i32* %A, i64 %indvars.iv + %3 = load i32* %2, align 4 + %4 = getelementptr inbounds i32* %B, i64 %indvars.iv + %5 = load i32* %4, align 4 + %6 = trunc i64 %indvars.iv to i32 + %7 = add i32 %sum.02, %6 + %8 = add i32 %7, %3 + %9 = add i32 %8, %5 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %n + br i1 %exitcond, label %._crit_edge, label %.lr.ph + +._crit_edge: ; preds = %.lr.ph, %0 + %sum.0.lcssa = phi i32 [ 0, %0 ], [ %9, %.lr.ph ] + ret i32 %sum.0.lcssa +} + +;CHECK: @reduction_prod +;CHECK: phi <4 x i32> +;CHECK: load <4 x i32> +;CHECK: mul <4 x i32> +;CHECK: ret i32 +define i32 @reduction_prod(i32 %n, i32* noalias nocapture %A, i32* noalias nocapture %B) nounwind uwtable readonly noinline ssp { + %1 = icmp sgt i32 %n, 0 + br i1 %1, label %.lr.ph, label %._crit_edge + +.lr.ph: ; preds = %0, %.lr.ph + %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ] + %prod.02 = phi i32 [ %9, %.lr.ph ], [ 1, %0 ] + %2 = getelementptr inbounds i32* %A, i64 %indvars.iv + %3 = load i32* %2, align 4 + %4 = getelementptr inbounds i32* %B, i64 %indvars.iv + %5 = load i32* %4, align 4 + %6 = trunc i64 %indvars.iv to i32 + %7 = mul i32 %prod.02, %6 + %8 = mul i32 %7, %3 + %9 = mul i32 %8, %5 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %n + br i1 %exitcond, label %._crit_edge, label %.lr.ph + +._crit_edge: ; preds = %.lr.ph, %0 + %prod.0.lcssa = phi i32 [ 1, %0 ], [ %9, %.lr.ph ] + ret i32 %prod.0.lcssa +} + +;CHECK: @reduction_mix +;CHECK: phi <4 x i32> +;CHECK: load <4 x i32> +;CHECK: mul <4 x i32> +;CHECK: ret i32 +define i32 @reduction_mix(i32 %n, i32* noalias nocapture %A, i32* noalias nocapture %B) nounwind uwtable readonly noinline ssp { + %1 = icmp sgt i32 %n, 0 + br i1 %1, label %.lr.ph, label %._crit_edge + +.lr.ph: ; preds = %0, %.lr.ph + %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ] + %sum.02 = phi i32 [ %9, %.lr.ph ], [ 0, %0 ] + %2 = getelementptr inbounds i32* %A, i64 %indvars.iv + %3 = load i32* %2, align 4 + %4 = getelementptr inbounds i32* %B, i64 %indvars.iv + %5 = load i32* %4, align 4 + %6 = mul nsw i32 %5, %3 + %7 = trunc i64 %indvars.iv to i32 + %8 = add i32 %sum.02, %7 + %9 = add i32 %8, %6 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 
%lftr.wideiv, %n + br i1 %exitcond, label %._crit_edge, label %.lr.ph + +._crit_edge: ; preds = %.lr.ph, %0 + %sum.0.lcssa = phi i32 [ 0, %0 ], [ %9, %.lr.ph ] + ret i32 %sum.0.lcssa +} + +;CHECK: @reduction_mul +;CHECK: mul <4 x i32> +;CHECK: ret i32 +define i32 @reduction_mul(i32 %n, i32* noalias nocapture %A, i32* noalias nocapture %B) nounwind uwtable readonly noinline ssp { + %1 = icmp sgt i32 %n, 0 + br i1 %1, label %.lr.ph, label %._crit_edge + +.lr.ph: ; preds = %0, %.lr.ph + %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ] + %sum.02 = phi i32 [ %9, %.lr.ph ], [ 19, %0 ] + %2 = getelementptr inbounds i32* %A, i64 %indvars.iv + %3 = load i32* %2, align 4 + %4 = getelementptr inbounds i32* %B, i64 %indvars.iv + %5 = load i32* %4, align 4 + %6 = trunc i64 %indvars.iv to i32 + %7 = add i32 %3, %6 + %8 = add i32 %7, %5 + %9 = mul i32 %8, %sum.02 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %n + br i1 %exitcond, label %._crit_edge, label %.lr.ph + +._crit_edge: ; preds = %.lr.ph, %0 + %sum.0.lcssa = phi i32 [ 0, %0 ], [ %9, %.lr.ph ] + ret i32 %sum.0.lcssa +} + +;CHECK: @start_at_non_zero +;CHECK: phi <4 x i32> +;CHECK: <i32 120, i32 0, i32 0, i32 0> +;CHECK: ret i32 +define i32 @start_at_non_zero(i32* nocapture %in, i32* nocapture %coeff, i32* nocapture %out, i32 %n) nounwind uwtable readonly ssp { +entry: + %cmp7 = icmp sgt i32 %n, 0 + br i1 %cmp7, label %for.body, label %for.end + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ] + %sum.09 = phi i32 [ %add, %for.body ], [ 120, %entry ] + %arrayidx = getelementptr inbounds i32* %in, i64 %indvars.iv + %0 = load i32* %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds i32* %coeff, i64 %indvars.iv + %1 = load i32* %arrayidx2, align 4 + %mul = mul nsw i32 %1, %0 + %add = add nsw i32 %mul, %sum.09 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %n + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + %sum.0.lcssa = phi i32 [ 120, %entry ], [ %add, %for.body ] + ret i32 %sum.0.lcssa +} + + diff --git a/test/Transforms/LoopVectorize/scalar-select.ll b/test/Transforms/LoopVectorize/scalar-select.ll new file mode 100644 index 00000000000..8d5b6fd8aff --- /dev/null +++ b/test/Transforms/LoopVectorize/scalar-select.ll @@ -0,0 +1,37 @@ +; RUN: opt < %s -loop-vectorize -dce -instcombine -licm -S | FileCheck %s + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.8.0" + +@a = common global [2048 x i32] zeroinitializer, align 16 +@b = common global [2048 x i32] zeroinitializer, align 16 +@c = common global [2048 x i32] zeroinitializer, align 16 + +;CHECK: @example1 +;CHECK: load <4 x i32> +; make sure that we have a scalar condition and a vector operand. 
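+; (Because %cond is loop-invariant, only the value operands of the select are
+; widened to <4 x i32>; the i1 condition itself stays scalar, which the first
+; CHECK line below pins down.)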
+;CHECK: select i1 %cond, <4 x i32>
+;CHECK: store <4 x i32>
+;CHECK: ret void
+define void @example1(i1 %cond) nounwind uwtable ssp {
+ br label %1
+
+; <label>:1 ; preds = %1, %0
+ %indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %1 ]
+ %2 = getelementptr inbounds [2048 x i32]* @b, i64 0, i64 %indvars.iv
+ %3 = load i32* %2, align 4
+ %4 = getelementptr inbounds [2048 x i32]* @c, i64 0, i64 %indvars.iv
+ %5 = load i32* %4, align 4
+ %6 = add nsw i32 %5, %3
+ %7 = getelementptr inbounds [2048 x i32]* @a, i64 0, i64 %indvars.iv
+ %sel = select i1 %cond, i32 %6, i32 zeroinitializer
+ store i32 %sel, i32* %7, align 4
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, 256
+ br i1 %exitcond, label %8, label %1
+
+; <label>:8 ; preds = %1
+ ret void
+}
+
diff --git a/test/Transforms/SROA/basictest.ll b/test/Transforms/SROA/basictest.ll index 644fda167d4..03120f7a327 100644
--- a/test/Transforms/SROA/basictest.ll
+++ b/test/Transforms/SROA/basictest.ll
@@ -1063,3 +1063,23 @@ entry:
 call void @llvm.lifetime.end(i64 -1, i8* %0)
 ret void
}
+
+define void @PR14105({ [16 x i8] }* %ptr) {
+; Ensure that when rewriting the GEP index '-1' for this alloca we preserve its
+; sign as negative. We use a volatile memcpy to ensure promotion never actually
+; occurs.
+; CHECK: @PR14105
+
+entry:
+ %a = alloca { [16 x i8] }, align 8
+; CHECK: alloca [16 x i8], align 8
+
+ %gep = getelementptr inbounds { [16 x i8] }* %ptr, i64 -1
+; CHECK-NEXT: getelementptr inbounds { [16 x i8] }* %ptr, i64 -1, i32 0, i64 0
+
+ %cast1 = bitcast { [16 x i8 ] }* %gep to i8*
+ %cast2 = bitcast { [16 x i8 ] }* %a to i8*
+ call void @llvm.memcpy.p0i8.p0i8.i32(i8* %cast1, i8* %cast2, i32 16, i32 8, i1 true)
+ ret void
+; CHECK: ret
+}
diff --git a/test/Transforms/SimplifyLibCalls/FFS.ll b/test/Transforms/SimplifyLibCalls/FFS.ll index e38d78349d4..6aecbeacd7e 100644
--- a/test/Transforms/SimplifyLibCalls/FFS.ll
+++ b/test/Transforms/SimplifyLibCalls/FFS.ll
@@ -1,6 +1,7 @@
-; Test that the ToAsciiOptimizer works correctly
-; RUN: opt < %s -simplify-libcalls -S | \
-; RUN: not grep "call.*@ffs"
+; Test that FFSOpt works correctly
+; RUN: opt < %s -simplify-libcalls -S | FileCheck %s
+
+; CHECK-NOT: call{{.*}}@ffs
 @non_const = external global i32 ; <i32*> [#uses=1]
@@ -34,3 +35,11 @@ define i32 @a(i64) nounwind {
 %2 = call i32 @ffsll(i64 %0) ; <i32> [#uses=1]
 ret i32 %2
}
+
+; PR13028
+define i32 @b() nounwind {
+ %ffs = call i32 @ffsll(i64 0)
+ ret i32 %ffs
+; CHECK: @b
+; CHECK-NEXT: ret i32 0
+}
diff --git a/test/Transforms/SimplifyLibCalls/StrCpy.ll b/test/Transforms/SimplifyLibCalls/StrCpy.ll deleted file mode 100644 index 83406ff8f86..00000000000
--- a/test/Transforms/SimplifyLibCalls/StrCpy.ll
+++ /dev/null
@@ -1,37 +0,0 @@
-; Test that the StrCpyOptimizer works correctly
-; RUN: opt < %s -simplify-libcalls -S | FileCheck %s
-
-; This transformation requires the pointer size, as it assumes that size_t is
-; the size of a pointer.
-target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32"
-
-@hello = constant [6 x i8] c"hello\00"
-
-declare i8* @strcpy(i8*, i8*)
-
-declare i8* @__strcpy_chk(i8*, i8*, i32) nounwind
-
-declare i32 @llvm.objectsize.i32(i8*, i1) nounwind readonly
-
-; rdar://6839935
-
-define i32 @t1() {
-; CHECK: @t1
- %target = alloca [1024 x i8]
- %arg1 = getelementptr [1024 x i8]* %target, i32 0, i32 0
- %arg2 = getelementptr [6 x i8]* @hello, i32 0, i32 0
- %rslt1 = call i8* @strcpy( i8* %arg1, i8* %arg2 )
-; CHECK: @llvm.memcpy.p0i8.p0i8.i32
- ret i32 0
-}
-
-define i32 @t2() {
-; CHECK: @t2
- %target = alloca [1024 x i8]
- %arg1 = getelementptr [1024 x i8]* %target, i32 0, i32 0
- %arg2 = getelementptr [6 x i8]* @hello, i32 0, i32 0
- %tmp1 = call i32 @llvm.objectsize.i32(i8* %arg1, i1 false)
- %rslt1 = call i8* @__strcpy_chk(i8* %arg1, i8* %arg2, i32 %tmp1)
-; CHECK: @__memcpy_chk
- ret i32 0
-}
diff --git a/test/Transforms/TailCallElim/nocapture.ll b/test/Transforms/TailCallElim/nocapture.ll index 87cb9dd427b..5a1a9a6e7ce 100644
--- a/test/Transforms/TailCallElim/nocapture.ll
+++ b/test/Transforms/TailCallElim/nocapture.ll
@@ -1,9 +1,9 @@
 ; RUN: opt %s -tailcallelim -S | FileCheck %s
-; XFAIL: *
 declare void @use(i8* nocapture, i8* nocapture)
+declare void @boring()
-define i8* @foo(i8* nocapture %A, i1 %cond) {
+define i8* @test1(i8* nocapture %A, i1 %cond) {
 ; CHECK: tailrecurse:
 ; CHECK: %A.tr = phi i8* [ %A, %0 ], [ %B, %cond_true ]
 ; CHECK: %cond.tr = phi i1 [ %cond, %0 ], [ false, %cond_true ]
@@ -14,12 +14,27 @@ define i8* @foo(i8* nocapture %A, i1 %cond) {
cond_true:
 ; CHECK: cond_true:
 ; CHECK: br label %tailrecurse
- call i8* @foo(i8* %B, i1 false)
+ call i8* @test1(i8* %B, i1 false)
 ret i8* null
cond_false:
 ; CHECK: cond_false
 call void @use(i8* %A, i8* %B)
-; CHECK: tail call void @use(i8* %A.tr, i8* %B)
+; CHECK: call void @use(i8* %A.tr, i8* %B)
+ call void @boring()
+; CHECK: tail call void @boring()
 ret i8* null
; CHECK: ret i8* null
}
+
+; PR14143
+define void @test2(i8* %a, i8* %b) {
+; CHECK: @test2
+; CHECK-NOT: tail call
+; CHECK: ret void
+ %c = alloca [100 x i8], align 16
+ %tmp = bitcast [100 x i8]* %c to i8*
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* %b, i8* %tmp, i64 100, i32 1, i1 false)
+ ret void
+}
+
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8*, i8*, i64, i32, i1)
diff --git a/test/lit.cfg b/test/lit.cfg index dc37317ba95..7e6760e95ae 100644
--- a/test/lit.cfg
+++ b/test/lit.cfg
@@ -146,13 +146,12 @@ if re.search(r'cygwin|mingw32|win32', mcjit_triple):
 mcjit_triple += "-elf"
 config.substitutions.append( ('%mcjit_triple', mcjit_triple) )
-# When running under valgrind, we mangle '-vg' or '-vg_leak' onto the end of the
-# triple so we can check it with XFAIL and XTARGET.
-config.target_triple += lit.valgrindTriple
-
 # Provide a substitution for those tests that need to run the jit to obtain data
 # but simply want to use the currently considered most reliable jit for platform
-defaultIsMCJIT='true' if 'arm' in config.target_triple else 'false'
+if 'arm' in config.target_triple:
+ defaultIsMCJIT = 'true'
+else:
+ defaultIsMCJIT = 'false'
 config.substitutions.append( ('%defaultjit', '-use-mcjit='+defaultIsMCJIT) )
 # Process jit implementation option
@@ -242,6 +241,10 @@ else:
 if loadable_module:
 config.available_features.add('loadable_module')
+# LTO
+if config.lto_is_enabled == "1":
+ config.available_features.add('lto')
+
 # llc knows whether it is compiled with -DNDEBUG.
import subprocess try: diff --git a/test/lit.site.cfg.in b/test/lit.site.cfg.in index 178b22f10f3..2bbe63e6348 100644 --- a/test/lit.site.cfg.in +++ b/test/lit.site.cfg.in @@ -11,6 +11,7 @@ config.python_executable = "@PYTHON_EXECUTABLE@" config.ocamlopt_executable = "@OCAMLOPT@" config.enable_shared = @ENABLE_SHARED@ config.enable_assertions = @ENABLE_ASSERTIONS@ +config.lto_is_enabled = "@LTO_IS_ENABLED@" config.targets_to_build = "@TARGETS_TO_BUILD@" config.llvm_bindings = "@LLVM_BINDINGS@" config.host_os = "@HOST_OS@" diff --git a/tools/bugpoint-passes/bugpoint.exports b/tools/bugpoint-passes/bugpoint.exports index d8fdd6a5767..e69de29bb2d 100644 --- a/tools/bugpoint-passes/bugpoint.exports +++ b/tools/bugpoint-passes/bugpoint.exports @@ -1 +0,0 @@ -_ZN4llvm14BasicBlockPass14doFinalizationERNS_6ModuleE diff --git a/tools/llvm-dwarfdump/llvm-dwarfdump.cpp b/tools/llvm-dwarfdump/llvm-dwarfdump.cpp index 38c3a1e76f5..309bc4ecd46 100644 --- a/tools/llvm-dwarfdump/llvm-dwarfdump.cpp +++ b/tools/llvm-dwarfdump/llvm-dwarfdump.cpp @@ -1,4 +1,4 @@ -//===-- llvm-dwarfdump.cpp - Debug info dumping utility for llvm -----------===// +//===-- llvm-dwarfdump.cpp - Debug info dumping utility for llvm ----------===// // // The LLVM Compiler Infrastructure // @@ -118,8 +118,8 @@ static void DumpInput(const StringRef &Filename) { if (PrintFunctions) SpecFlags |= DILineInfoSpecifier::FunctionName; if (PrintInlining) { - DIInliningInfo InliningInfo = dictx->getInliningInfoForAddress( - Address, SpecFlags); + DIInliningInfo InliningInfo = + dictx->getInliningInfoForAddress(Address, SpecFlags); uint32_t n = InliningInfo.getNumberOfFrames(); if (n == 0) { // Print one empty debug line info in any case. diff --git a/unittests/ExecutionEngine/JIT/JITTest.cpp b/unittests/ExecutionEngine/JIT/JITTest.cpp index ae6855e68bf..6933091949b 100644 --- a/unittests/ExecutionEngine/JIT/JITTest.cpp +++ b/unittests/ExecutionEngine/JIT/JITTest.cpp @@ -606,7 +606,7 @@ TEST_F(JITTest, FunctionIsRecompiledAndRelinked) { // program from the IR input to the JIT to assert that the JIT doesn't use its // definition. extern "C" int32_t JITTest_AvailableExternallyGlobal; -int32_t JITTest_AvailableExternallyGlobal = 42; +int32_t JITTest_AvailableExternallyGlobal LLVM_ATTRIBUTE_USED = 42; namespace { // Tests on ARM disabled as we're running the old jit diff --git a/unittests/ExecutionEngine/JIT/Makefile b/unittests/ExecutionEngine/JIT/Makefile index b535a6b2960..9e0bb9ea593 100644 --- a/unittests/ExecutionEngine/JIT/Makefile +++ b/unittests/ExecutionEngine/JIT/Makefile @@ -35,8 +35,15 @@ ifeq ($(USE_OPROFILE), 1) LINK_COMPONENTS += oprofilejit endif +EXPORTED_SYMBOL_FILE = $(PROJ_OBJ_DIR)/JITTests.exports include $(LLVM_SRC_ROOT)/unittests/Makefile.unittest # Permit these tests to use the JIT's symbolic lookup. LD.Flags += $(RDYNAMIC) + +# Symbol exports are necessary (at least for now) when building with LTO. 
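+# (The rule below derives JITTests.exports from JITTests.def by dropping the
+# first line with tail -n +2; the assumption is that this first line is the
+# def-file header rather than an exported symbol name.)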
+$(LLVMUnitTestExe): $(NativeExportsFile) +$(PROJ_OBJ_DIR)/JITTests.exports: $(PROJ_SRC_DIR)/JITTests.def $(PROJ_OBJ_DIR)/.dir + tail -n +2 $< > $@ + diff --git a/utils/TableGen/CodeGenInstruction.cpp b/utils/TableGen/CodeGenInstruction.cpp index 99d2f173a87..fd38672bfcf 100644 --- a/utils/TableGen/CodeGenInstruction.cpp +++ b/utils/TableGen/CodeGenInstruction.cpp @@ -233,11 +233,12 @@ static void ParseConstraint(const std::string &CStr, CGIOperandList &Ops) { if (wpos == std::string::npos) throw "Illegal format for tied-to constraint: '" + CStr + "'"; - std::pair<unsigned,unsigned> SrcOp = - Ops.ParseOperandName(Name.substr(wpos), false); - if (SrcOp > DestOp) - throw "Illegal tied-to operand constraint '" + CStr + "'"; - + std::string SrcOpName = Name.substr(wpos); + std::pair<unsigned,unsigned> SrcOp = Ops.ParseOperandName(SrcOpName, false); + if (SrcOp > DestOp) { + std::swap(SrcOp, DestOp); + std::swap(SrcOpName, DestOpName); + } unsigned FlatOpNo = Ops.getFlattenedOperandNumber(SrcOp); diff --git a/utils/lit/lit/ExampleTests/LLVM.InTree/test/Bar/bar-test.ll b/utils/lit/lit/ExampleTests/LLVM.InTree/test/Bar/bar-test.ll deleted file mode 100644 index 3017b13e48c..00000000000 --- a/utils/lit/lit/ExampleTests/LLVM.InTree/test/Bar/bar-test.ll +++ /dev/null @@ -1,3 +0,0 @@ -; RUN: true -; XFAIL: * -; XTARGET: darwin diff --git a/utils/lit/lit/ExampleTests/LLVM.InTree/test/lit.cfg b/utils/lit/lit/ExampleTests/LLVM.InTree/test/lit.cfg index e9df1e5b53b..3fdd63c2245 100644 --- a/utils/lit/lit/ExampleTests/LLVM.InTree/test/lit.cfg +++ b/utils/lit/lit/ExampleTests/LLVM.InTree/test/lit.cfg @@ -77,7 +77,7 @@ for line in open(os.path.join(config.llvm_obj_root, 'test', 'site.exp')): excludes = [] -# Provide target_triple for use in XFAIL and XTARGET. +# Provide target_triple for use in XFAIL. config.target_triple = site_exp['target_triplet'] # Provide llvm_supports_target for use in local configs. diff --git a/utils/lit/lit/ExampleTests/LLVM.OutOfTree/src/test/lit.cfg b/utils/lit/lit/ExampleTests/LLVM.OutOfTree/src/test/lit.cfg index e9df1e5b53b..3fdd63c2245 100644 --- a/utils/lit/lit/ExampleTests/LLVM.OutOfTree/src/test/lit.cfg +++ b/utils/lit/lit/ExampleTests/LLVM.OutOfTree/src/test/lit.cfg @@ -77,7 +77,7 @@ for line in open(os.path.join(config.llvm_obj_root, 'test', 'site.exp')): excludes = [] -# Provide target_triple for use in XFAIL and XTARGET. +# Provide target_triple for use in XFAIL. config.target_triple = site_exp['target_triplet'] # Provide llvm_supports_target for use in local configs. diff --git a/utils/lit/lit/ExampleTests/lit.cfg b/utils/lit/lit/ExampleTests/lit.cfg index 20ee37dcef2..2629918d9f6 100644 --- a/utils/lit/lit/ExampleTests/lit.cfg +++ b/utils/lit/lit/ExampleTests/lit.cfg @@ -23,4 +23,4 @@ config.test_exec_root = None config.target_triple = 'foo' # available_features: Used by ShTest and TclTest formats for REQUIRES checks. -config.available_features = ['some-feature-name'] +config.available_features.add('some-feature-name') diff --git a/utils/lit/lit/ExampleTests/vg-fail.c b/utils/lit/lit/ExampleTests/vg-fail.c new file mode 100644 index 00000000000..e3339ff91aa --- /dev/null +++ b/utils/lit/lit/ExampleTests/vg-fail.c @@ -0,0 +1,4 @@ +// This test should XPASS, when run without valgrind. 
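+// (The "valgrind" feature named in the XFAIL line below is only added to
+// lit's available_features when lit runs under --vg; see the TestingConfig
+// change later in this commit.)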
+ +// RUN: true +// XFAIL: valgrind diff --git a/utils/lit/lit/ExampleTests/xfail-feature.c b/utils/lit/lit/ExampleTests/xfail-feature.c new file mode 100644 index 00000000000..3444bf87008 --- /dev/null +++ b/utils/lit/lit/ExampleTests/xfail-feature.c @@ -0,0 +1,4 @@ +// This test should XPASS. + +// RUN: true +// XFAIL: some-feature-name diff --git a/utils/lit/lit/LitConfig.py b/utils/lit/lit/LitConfig.py index c71c0ccdea9..0a359a3db8e 100644 --- a/utils/lit/lit/LitConfig.py +++ b/utils/lit/lit/LitConfig.py @@ -42,14 +42,11 @@ class LitConfig: self.numWarnings = 0 self.valgrindArgs = [] - self.valgrindTriple = "" if self.useValgrind: - self.valgrindTriple = "-vg" self.valgrindArgs = ['valgrind', '-q', '--run-libc-freeres=no', '--tool=memcheck', '--trace-children=yes', '--error-exitcode=123'] if self.valgrindLeakCheck: - self.valgrindTriple += "_leak" self.valgrindArgs.append('--leak-check=full') else: # The default is 'summary'. diff --git a/utils/lit/lit/TestRunner.py b/utils/lit/lit/TestRunner.py index 71882b76f8b..0c1911ed356 100644 --- a/utils/lit/lit/TestRunner.py +++ b/utils/lit/lit/TestRunner.py @@ -370,27 +370,27 @@ def executeScript(test, litConfig, tmpBase, commands, cwd): return executeCommand(command, cwd=cwd, env=test.config.environment) -def isExpectedFail(xfails, xtargets, target_triple): - # Check if any xfail matches this target. +def isExpectedFail(test, xfails): + # Check if any of the xfails match an available feature or the target. for item in xfails: - if item == '*' or item in target_triple: - break - else: - return False + # If this is the wildcard, it always fails. + if item == '*': + return True - # If so, see if it is expected to pass on this target. - # - # FIXME: Rename XTARGET to something that makes sense, like XPASS. - for item in xtargets: - if item == '*' or item in target_triple: - return False + # If this is an exact match for one of the features, it fails. + if item in test.config.available_features: + return True + + # If this is a part of the target triple, it fails. + if item in test.suite.config.target_triple: + return True - return True + return False def parseIntegratedTestScript(test, normalize_slashes=False, extra_substitutions=[]): """parseIntegratedTestScript - Scan an LLVM/Clang style integrated test - script and extract the lines to 'RUN' as well as 'XFAIL' and 'XTARGET' + script and extract the lines to 'RUN' as well as 'XFAIL' and 'REQUIRES' information. The RUN lines also will have variable substitution performed. """ @@ -431,7 +431,6 @@ def parseIntegratedTestScript(test, normalize_slashes=False, # Collect the test lines from the script. 
script = [] xfails = [] - xtargets = [] requires = [] for ln in open(sourcepath): if 'RUN:' in ln: @@ -450,9 +449,6 @@ def parseIntegratedTestScript(test, normalize_slashes=False, elif 'XFAIL:' in ln: items = ln[ln.index('XFAIL:') + 6:].split(',') xfails.extend([s.strip() for s in items]) - elif 'XTARGET:' in ln: - items = ln[ln.index('XTARGET:') + 8:].split(',') - xtargets.extend([s.strip() for s in items]) elif 'REQUIRES:' in ln: items = ln[ln.index('REQUIRES:') + 9:].split(',') requires.extend([s.strip() for s in items]) @@ -491,7 +487,7 @@ def parseIntegratedTestScript(test, normalize_slashes=False, return (Test.UNSUPPORTED, "Test requires the following features: %s" % msg) - isXFail = isExpectedFail(xfails, xtargets, test.suite.config.target_triple) + isXFail = isExpectedFail(test, xfails) return script,isXFail,tmpBase,execdir def formatTestOutput(status, out, err, exitCode, failDueToStderr, script): diff --git a/utils/lit/lit/TestingConfig.py b/utils/lit/lit/TestingConfig.py index 223120c4fe2..a1f79a3bfc4 100644 --- a/utils/lit/lit/TestingConfig.py +++ b/utils/lit/lit/TestingConfig.py @@ -16,6 +16,7 @@ class TestingConfig: 'PATH' : os.pathsep.join(litConfig.path + [os.environ.get('PATH','')]), 'SYSTEMROOT' : os.environ.get('SYSTEMROOT',''), + 'TERM' : os.environ.get('TERM',''), 'LLVM_DISABLE_CRASH_REPORT' : '1', } @@ -28,6 +29,13 @@ class TestingConfig: 'TMP' : os.environ.get('TMP',''), }) + # Set the default available features based on the LitConfig. + available_features = [] + if litConfig.useValgrind: + available_features.append('valgrind') + if litConfig.valgrindLeakCheck: + available_features.append('vg_leak') + config = TestingConfig(parent, name = '<unnamed>', suffixes = set(), @@ -39,7 +47,7 @@ class TestingConfig: test_exec_root = None, test_source_root = None, excludes = [], - available_features = []) + available_features = available_features) if os.path.exists(path): # FIXME: Improve detection and error reporting of errors in the diff --git a/utils/vim/llvm.vim b/utils/vim/llvm.vim index 9cd483d1479..f80de83da8c 100644 --- a/utils/vim/llvm.vim +++ b/utils/vim/llvm.vim @@ -79,7 +79,6 @@ syn match llvmSpecialComment /;\s*RUN:.*$/ syn match llvmSpecialComment /;\s*PR\d*\s*$/ syn match llvmSpecialComment /;\s*END\.\s*$/ syn match llvmSpecialComment /;\s*XFAIL:.*$/ -syn match llvmSpecialComment /;\s*XTARGET:.*$/ if version >= 508 || !exists("did_c_syn_inits") if version < 508 |